    #throw out the last element because it is a terminal state
    return list(reversed(state_returns))[:-1]



if __name__ == '__main__':
    #demonstrates how to evaluate a fixed policy by playing many episodes (Monte Carlo prediction)
    grid = standard_grid()
    V = {}
    #policy to check
    POLICY = {
        (2, 0): 'U', (2, 1): 'R', (2, 2): 'R', (2, 3): 'U',
        (1, 0): 'U',              (1, 2): 'U',  #(1, 3) is the losing terminal
        (0, 0): 'R', (0, 1): 'R', (0, 2): 'R',  #(0, 3) is the winning terminal
    }
    print_policy(POLICY, grid)


    state_to_return_list = {}
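    #one list of sampled returns per non-terminal start state; V(s) will be estimated
    #as the average of the returns observed from s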
    for s in grid.location_to_action.keys():
        state_to_return_list[s] = []


    start_state_list = list(grid.location_to_action.keys())
    for i in range(1000):
        #run game
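        #start each episode from a randomly chosen non-terminal state so every state gets visited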
        test_state_index = np.random.choice(len(start_state_list))
        state_returns = play_game_and_get_returns(start_state_list[test_state_index], grid, POLICY)
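        #state_returns holds (state, return G) pairs for the visited states in time order,
        #with the terminal state already dropped by the helper above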

        eval_states = set()
        for s, G in state_returns:
            #first-visit Monte Carlo: record only the first return seen for each state in this episode
            if s not in eval_states:
                state_to_return_list[s].append(G)
                eval_states.add(s)

    #estimate V(s) as the average of the sampled returns for s
    for s, returns in state_to_return_list.items():
        if returns:
            V[s] = np.mean(returns)

    print("values:")
    print_values(V, grid)
                Qs2 = getQs(model, s2)
                a2, MaxQs2a2 = max_dict(Qs2)
                a2 = random_action(a2, eps=0.5 / t)
                #semi-gradient update toward the target r + gamma * max_a Q(s2, a)
                model.theta += alpha * (r + gamma * MaxQs2a2 - model.predict(s, a)) * model.grad(s, a)
                s = s2
                a = a2

            delta = max(delta, np.abs(old_theta - model.theta).sum())  #track the largest weight change this episode

        deltas.append(delta)

    plt.plot(deltas)
    plt.show()

    #extract the greedy policy and state values from the learned Q
    Policy = {}
    V = {}
    Q = {}
    for s in g.actions.keys():
        Q[s] = getQs(model, s)
        a, max_q = max_dict(Q[s])
        Policy[s] = a
        V[s] = max_q

    print("Values")
    print_values(V, g)
    print("Policy")
    print_policy(Policy, g)
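
#The updates above assume a linear Q-value model exposing theta, predict, and grad,
#plus getQs, max_dict, and random_action helpers. A minimal sketch follows; the
#one-hot (state, action) feature encoding and the class name are assumptions, not
#the listing's actual implementation.
import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')


class LinearQModel:
    def __init__(self, states):
        #one weight per (state, action) pair, written in the linear form
        #theta . x(s, a) so the semi-gradient update applies directly
        pairs = [(s, a) for s in states for a in ALL_POSSIBLE_ACTIONS]
        self.sa_index = {sa: i for i, sa in enumerate(pairs)}
        self.theta = np.zeros(len(pairs))

    def x(self, s, a):
        #one-hot feature vector for the (state, action) pair
        vec = np.zeros(len(self.theta))
        vec[self.sa_index[(s, a)]] = 1.0
        return vec

    def predict(self, s, a):
        return self.theta @ self.x(s, a)

    def grad(self, s, a):
        #gradient of the linear prediction with respect to theta is the feature vector
        return self.x(s, a)


def getQs(model, s):
    #Q(s, a) for every action in state s, used to pick the greedy action
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}


def max_dict(d):
    #return the (key, value) pair with the largest value
    best_key = max(d, key=d.get)
    return best_key, d[best_key]


def random_action(a, eps=0.1):
    #epsilon-greedy: keep a with probability 1 - eps, otherwise act randomly
    if np.random.random() < eps:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)
    return a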
Example #3


if __name__ == '__main__':
    grid = negative_grid(-0.3)
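    #every non-terminal step costs 0.3, which pushes the agent toward short paths to the goal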

    print("rewards:")
    print_values(grid.location_to_rewards, grid)

    # initialize a random policy, then improve it iteratively
    policy = {}
    for s in grid.location_to_action.keys():
        policy[s] = np.random.choice(grid.location_to_action[s])

    print("initial policy:")
    print_policy(policy, grid)

    V = initalize_V(grid)
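    #policy iteration: alternate policy evaluation (compute V for the current policy)
    #with greedy policy improvement until the policy stops changing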
    while True:
        #evaluate policy to find V
        evalulate_v_for_policy(policy, grid, V)
        """
        Summary: change policy for biggest V
        Look at the policy. Now that we know V for the policy, see if we can update for more V 
        """
        if check_if_policy_converges(grid, V, policy):
            break

    print("finished")
    print_values(V, grid)
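
#check_if_policy_converges is not shown in this listing; the sketch below is one
#plausible shape for it, assuming deterministic one-cell moves per action letter and
#a default discount gamma. It greedily improves the policy against the current V and
#reports whether the policy stopped changing.
def check_if_policy_converges(grid, V, policy, gamma=0.9):
    moves = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}
    converged = True
    for s, actions in grid.location_to_action.items():
        old_action = policy[s]
        best_action, best_value = None, float('-inf')
        for a in actions:
            #deterministic transition: step one cell in the direction of the action
            s2 = (s[0] + moves[a][0], s[1] + moves[a][1])
            value = grid.location_to_rewards.get(s2, 0) + gamma * V.get(s2, 0)
            if value > best_value:
                best_action, best_value = a, value
        policy[s] = best_action
        if best_action != old_action:
            converged = False
    return converged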


if __name__ == '__main__':
    #TD(0) prediction: evaluate a fixed policy by bootstrapping V(s) from V of the successor state
    grid = standard_grid()

    POLICY_TO_EVAL = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    V = initalize_V(grid)


    for i in range(1000):
        #play one episode under the fixed policy
        state_to_returns = play_game_return_state_rewards(grid, POLICY_TO_EVAL)
        #update V with TD(0) from consecutive (state, reward) pairs
        for j in range(len(state_to_returns) - 1):
            s, r0 = state_to_returns[j]
            s2, r = state_to_returns[j+1]
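            #TD(0) update: move V(s) toward the bootstrapped target r + GAMMA * V(s2),
            #where r is the reward received on the transition from s to s2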
            V[s] = V[s] + ALPHA * (r + GAMMA * V[s2] - V[s])

    print("rewards")
    print_values(grid.location_to_rewards, grid)
    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(POLICY_TO_EVAL, grid)