Example #1
        states_and_rewards = play_game(policy, grid)
        for t in range(len(states_and_rewards) - 1):
            s, _ = states_and_rewards[t]
            s2, r = states_and_rewards[t + 1]
            old_theta = model.theta.copy()
            # TD(0) target: just the reward if s2 is terminal,
            # otherwise bootstrap from the current estimate of V(s2)
            if grid.is_terminal(s2):
                target = r
            else:
                target = r + GAMMA * model.predict(s2)
            # semi-gradient TD(0) update of the linear parameters
            model.theta += alpha * (target - model.predict(s)) * model.grad(s)
            biggest_change = max(biggest_change,
                                 np.abs(old_theta - model.theta).sum())
        deltas.append(biggest_change)

    plt.plot(deltas)
    plt.show()

    # predict V for every non-terminal state with the learned model; terminal states stay at 0
    V = {}
    for s in states:
        if s in grid.actions:
            V[s] = model.predict(s)
        else:
            V[s] = 0

    print("Values: ")
    print_values(V, grid)
    print("Policy: ")
    print_policy(policy, grid)
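
Example #1 assumes a linear value approximator exposing theta, s2x (a state-to-feature mapping), predict, and grad. A minimal sketch of such a model is below; the particular feature encoding is an assumption for illustration, not the original s2x.

import numpy as np

class Model:
    """Linear approximation V(s) ~ theta . x(s) (sketch; the feature map is assumed)."""
    def __init__(self):
        self.theta = np.random.randn(4) / 2  # one weight per feature

    def s2x(self, s):
        # hypothetical features for a gridworld state s = (row, col)
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    def predict(self, s):
        return self.theta.dot(self.s2x(s))

    def grad(self, s):
        # for a linear model, the gradient w.r.t. theta is the feature vector
        return self.s2x(s)
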
Example #2
    for s in States:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            # terminal states are fixed at 0
            V[s] = 0

    #Initial policy
    Policy = {}
    for s in grid.actions.keys():
        Policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('Initial policy:')
    print_policy(Policy, grid)

    while True:
        while True:  # policy evaluation: iterate until V converges
            # delta tracks the largest change in V this sweep; it tells us
            # whether V has converged to the current policy's true value
            delta = 0
            for s in States:
                old_v = V[s]
                new_v = 0
                if s in Policy:
                    # check all possible actions
                    for a in ALL_POSSIBLE_ACTIONS:
                        # the action the agent intends to take occurs with
                        # probability 0.5; each of the other three actions
                        # occurs with probability 0.5/3
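
The comment above describes a stochastic policy: the intended action is executed with probability 0.5, and each of the other three actions with probability 0.5/3, so the four probabilities sum to 1. The original loop is cut off here; a minimal, self-contained sketch of the expected Bellman backup this implies is below. The step helper, GAMMA value, and action tuple are assumptions for illustration, not part of the snippet.

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')
GAMMA = 0.9  # assumed discount factor

def expected_value(s, chosen_action, V, step):
    """Expected one-step backup for state s under the stochastic policy:
    P(a) = 0.5 if a is the chosen action, else 0.5 / 3.
    step(s, a) is a hypothetical helper returning (reward, next_state)."""
    new_v = 0.0
    for a in ALL_POSSIBLE_ACTIONS:
        p = 0.5 if a == chosen_action else 0.5 / 3
        r, s2 = step(s, a)
        new_v += p * (r + GAMMA * V[s2])
    return new_v

In the evaluation loop above, V[s] would be set to this expected value and delta updated with the absolute change, repeating until delta falls below a small convergence threshold.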