Code Example #1
import matplotlib.pyplot as plt

def visualize_step_by_step(mdp, gamma, max_iter_number, min_difference):
    # Run value iteration one sweep at a time, drawing the greedy policy after each sweep.
    fig = plt.figure(figsize=(5, 5))
    state_values = {state: 0 for state in mdp.get_all_states()}
    for i in range(max_iter_number):
        # A single value-iteration sweep starting from the current estimates.
        new_state_values, done = rl_value_iteration(mdp, gamma, 1, min_difference, state_values)
        if done:  # The update fell below min_difference, so the values have converged.
            break
        draw_policy(mdp, new_state_values, fig)
        state_values = new_state_values
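
A minimal usage sketch (not part of the original listing): it assumes FrozenLakeEnv, rl_value_iteration, and draw_policy from the surrounding examples live in the same module, and the map and slip settings here are purely illustrative.

# Step through value iteration on a small deterministic lake and watch the greedy policy evolve.
mdp = FrozenLakeEnv(map_name='4x4', slip_chance=0)  # illustrative settings
visualize_step_by_step(mdp, gamma=0.9, max_iter_number=30, min_difference=1e-5)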
Code Example #2
import numpy as np

def mass_gaming(mdp, gamma, num_iter, games_number, steps_number, min_difference=1e-5):
    # Compute state values with value iteration, then evaluate the greedy policy
    # by playing several full games and averaging the total rewards.
    state_values = {state: 0 for state in mdp.get_all_states()}
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values)

    total_rewards = []
    for game_i in range(games_number):
        s = mdp.reset()
        rewards = []
        for t in range(steps_number):
            s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))

    mean_reward = np.mean(total_rewards)
    print('Average reward: ', mean_reward)
    if mdp.slip_chance == 0:
        # A deterministic lake should be solved perfectly.
        assert mean_reward == 1.0
    else:
        assert 0.8 <= mean_reward <= 0.95
    print('Well done!')
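
A usage sketch along the same lines (again hypothetical): the environment mirrors the one in Code Example #3, while the games_number and steps_number values are illustrative.

# Evaluate the greedy policy over many games on the slippery 8x8 lake.
mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
mass_gaming(mdp, gamma=0.9, num_iter=100, games_number=1000, steps_number=100)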
Code Example #3
if __name__ == '__main__':
    visualize = True
    # 8x8 FrozenLake with a 10% chance of slipping on each move.
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()

    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in Frozen Lake Env
    state_values = {state: 0
                    for state in mdp.get_all_states()
                    }  # Initialize state_values

    # Run the value iteration algorithm.
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         state_values)

    # See how our agent performs, i.e. render what happens when the agent follows the `optimal` policy.
    s = mdp.reset()
    mdp.render()
    rewards = []  # Keep every step reward so we can report the mean afterwards.

    for _ in range(num_iter):  # Roll out at most num_iter greedy steps.
        action = get_optimal_action(mdp, state_values, s, gamma)
        new_state, reward, done, _ = mdp.step(action)
        rewards.append(reward)
        s = new_state
        mdp.render()

        if done:
            break

    print('Mean reward: ', np.mean(rewards))