Example #1
import matplotlib.pyplot as plt
import numpy as np


def plotter(ax, v, vmax=0, vmin=-20, env=None):
    """Draw the value function v as a heat map on the current axes."""
    plt.cla()
    # ax.axis('off')
    ax.set_autoscaley_on(True)

    if env is not None:
        N = env.N
        all_states = dstack_product(np.arange(N), np.arange(N))
        for state in all_states:
            i, j = state
            # mask states the environment forbids by pinning them to vmin
            # (note: this mutates the caller's array v in place)
            if not env.is_the_new_state_allowed(state):
                v[i, j] = vmin

    # fignum=0 draws into the current axes instead of opening a new figure
    plt.matshow(v, fignum=0, vmax=vmax, vmin=vmin)
    plt.draw()
    plt.show()
    plt.pause(0.1)
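
All three examples rely on a dstack_product helper that is not shown on this page. A minimal sketch, assuming it returns the Cartesian product of two 1-D index arrays as rows of a (len(a) * len(b), 2) array:

import numpy as np

def dstack_product(a, b):
    # Cartesian product of two 1-D arrays, one (i, j) pair per row
    # (assumed implementation; the original helper is not shown)
    return np.dstack(np.meshgrid(a, b, indexing='ij')).reshape(-1, 2)

For example, dstack_product(np.arange(2), np.arange(2)) yields [[0, 0], [0, 1], [1, 0], [1, 1]], i.e. every grid coordinate exactly once.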
Example #2
def plot_the_policy(plt, pi, env):
    """Overlay the policy pi as arrows, one per action and state."""
    N = env.N
    all_states = dstack_product(np.arange(N), np.arange(N))
    scale = 0.5
    for state in all_states:
        x, y = state
        for action in range(env.action_space.n):
            # each action points along one grid direction; the arrow length
            # is proportional to that action's weight in pi[x, y]
            if action == 0:
                vx = 0
                vy = scale * pi[x, y][0]
            elif action == 1:
                vx = 0
                vy = -1. * scale * pi[x, y][1]
            elif action == 2:
                vx = scale * pi[x, y][2]
                vy = 0
            elif action == 3:
                vx = -1. * scale * pi[x, y][3]
                vy = 0

            # x and y are swapped because matshow puts the row index on the vertical axis
            plt.arrow(y, x, vy, vx, head_width=0.1, color='black', alpha=0.5)
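
A usage sketch, under the assumption that pi is an (N, N, n_actions) array of per-state action probabilities (which the indexing pi[x, y][action] suggests); the stub env below is hypothetical and only provides the two attributes the function reads:

import types
import numpy as np
import matplotlib.pyplot as plt

N, n_actions = 5, 4
# uniform policy: every action equally likely in every state (assumed shape)
pi = np.full((N, N, n_actions), 1.0 / n_actions)
# hypothetical stand-in exposing only env.N and env.action_space.n
env = types.SimpleNamespace(N=N, action_space=types.SimpleNamespace(n=n_actions))

plt.matshow(np.zeros((N, N)), fignum=0)
plot_the_policy(plt, pi, env)
plt.show()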
Example #3
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import interactive
from tqdm import tqdm

# a random starting policy; the very large epsilon presumably makes it
# close to uniformly random
pi = return_a_random_policy(N, env.action_space.n, epsilon=1000000)

# initializing the accumulated value estimates to zero
V_accumulate = np.zeros((N, N))

# 1.6 setting up the plot
ax = create_plot(N)
plt.ion()
interactive(True)
plt.cla()
ax.axis('off')

nr_episodes = 1_000
gamma = 0.98

all_states = dstack_product(np.arange(N), np.arange(N))

for episode_id in tqdm(range(nr_episodes)):
    # a sweep over all the states in the system.
    for counter, init_state in enumerate(all_states):
        terminated = False
        env.reset(init_state)
        tmp_V = 0.0
        step_counter = 0
        # roll out one episode under pi and accumulate the discounted return
        while not terminated:
            action_id = choose_an_action_based_on_pi(env.state, pi)
            new_state, reward, terminated, info = env.step(action_id)
            tmp_V += np.power(gamma, step_counter) * reward
            step_counter += 1
        i, j = init_state
        V_accumulate[i, j] += tmp_V  # add this episode's return to the running sum
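
Two helpers this sweep calls are not shown on the page. A minimal sketch of a sampler matching the call choose_an_action_based_on_pi(env.state, pi), followed by the averaging step that would turn V_accumulate into a value estimate; both are assumptions rather than the page's own code:

import numpy as np

def choose_an_action_based_on_pi(state, pi):
    # sample an action index from the probability vector stored at pi[x, y]
    # (assumed signature and behavior; the original helper is not shown)
    x, y = state
    return np.random.choice(len(pi[x, y]), p=pi[x, y])

# average the summed returns over episodes for the Monte Carlo estimate of V,
# then reuse the plotter from Example #1
V = V_accumulate / nr_episodes
plotter(ax, V, env=env)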