Beispiel #1
0
def draw_policy(mdp, state_values, fig=None):
    h, w = mdp.desc.shape
    states = sorted(mdp.get_all_states())
    V = np.array([state_values[s] for s in states])
    Pi = {s: get_optimal_action(mdp, state_values, s, gamma) for s in states}
    plt.imshow(V.reshape(w, h), cmap='gray', interpolation='none', clim=(0, 1))
    ax = plt.gca()
    ax.set_xticks(np.arange(h)-.5)
    ax.set_yticks(np.arange(w)-.5)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    Y, X = np.mgrid[0:4, 0:4]
    a2uv = {'left': (-1, 0), 'down': (0, -1), 'right': (1,0), 'up': (-1, 0)}
    for y in range(h):
        for x in range(w):
            plt.text(x, y, str(mdp.desc[y,x].item()),
                     color='g', size=12,  verticalalignment='center',
                     horizontalalignment='center', fontweight='bold')
            a = Pi[y, x]
            if a is None:
                continue
            u, v = a2uv[a]
            plt.arrow(x, y,u*.3, -v*.3, color='m', head_width=0.1, head_length=0.1)
    plt.grid(color='b', lw=2, ls='-')
    plt.draw()
    plt.pause(2)
    if fig is not None:
        plt.cla()
Beispiel #2
0
def mass_gaming(mdp, gamma, num_iter, games_number, steps_number):
    state_values = {state: 0 for state in mdp.get_all_states()}
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values)

    total_rewards = []
    for game_i in range(games_number):
        s = mdp.reset()
        rewards = []
        for t in range(steps_number):
            s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert (1.0 <= np.mean(total_rewards) <= 1.0)
    else:
        assert (0.8 <= np.mean(total_rewards) <= 0.95)
    print('Well done!')
Beispiel #3
0
    # Play in Frozen Lake Env
    state_values = {state: 0
                    for state in mdp.get_all_states()
                    }  # Initialize state_values

    # Run value iteration algo!
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         state_values)

    # See how our agent performs - e.g. render what is going on when agent choose `optimal` value
    s = mdp.reset()
    mdp.render()
    rewards = []  # Save all rewards to see mean reward.

    for _ in range(num_iter):
        action = get_optimal_action(mdp, state_values, s, gamma)
        new_state, reward, done, _ = mdp.step(action)
        rewards += [reward]
        s = new_state
        mdp.render()

        if done:
            break
            print('Done!')

        print(reward)

    print('Average reward: ', np.mean(rewards))

    # if visualize:
    #     draw_policy(mdp, state_values)
Beispiel #4
0
    states = mdp.get_all_states()

    # Play in Frozen Lake Env
    state_values = {s: 0 for s in mdp.get_all_states()}  # Initialize state_values

    # Run value iteration algo!
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values)

    # See how our agent performs - e.g. render what is going on when agent choose `optimal` value
    s = mdp.reset()
    mdp.render()
    rewards = []  # Save all rewards to see mean reward.

    # Your code here!
    for step in range(1000):
        s, r, is_done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
        rewards.append(r)
        if is_done:
            break

    print('Average reward: ', np.mean(rewards))

    if visualize:
        draw_policy(mdp, state_values)

    # Let's see how it is improving in time.
    visualize_step_by_step(mdp, gamma, num_iter, min_difference)

    # Express test!
    mass_gaming(mdp, gamma, num_iter, 1000, 100)