Example #1
def render(self):
    """Renders the Q-function and policy learned"""
    if self.Q is None:
        self.Q = self.V + np.swapaxes(self.C[0, :, :], 0, 1).reshape(
            (self.actions.n_prim + self.actions.n_opt,
             self.GridWorld.n_states))
    gui.render_q(self.GridWorld, self.Q)  # Need a way to include options
    gui.render_policy(self.GridWorld,
                      self.policy)  # Need a way to include options
Example #2
# `env` is assumed to be the gridworld environment and `gui` its rendering
# module from the course material; the imports below are needed by this snippet.
import time

import numpy as np

################################################################################
# Policy definition
# To represent a deterministic action you can just use the index of the action.
# Recall that in the terminal states only action 0 (right) is defined.
# In this case, you can use gui.render_policy to visualize the policy.
################################################################################
pol = [1, 2, 0, 0, 1, 1, 0, 0, 0, 0, 3]
gui.render_policy(env, pol)

################################################################################
# Try to simulate a trajectory.
# You can use env.step(s, a, render=True) to visualize the transition.
################################################################################
env.render = True
state = 0
fps = 1
for i in range(5):
    action = np.random.choice(env.state_actions[state])
    nexts, reward, term = env.step(state, action)
    state = nexts
    time.sleep(1. / fps)

################################################################################
# You can also visualize the q-function using render_q
################################################################################
# first get the maximum number of actions available
max_act = max(map(len, env.state_actions))
q = np.random.rand(env.n_states, max_act)
gui.render_q(env, q)
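
################################################################################
# Sketch: a greedy policy can also be read off the same Q-table and rendered
# with gui.render_policy. This assumes that column i of row s holds the value
# of action env.state_actions[s][i]; it is an illustration, not part of the
# original assignment code.
################################################################################
greedy_pol = [
    env.state_actions[s][int(np.argmax(q[s, :len(env.state_actions[s])]))]
    for s in range(env.n_states)
]
gui.render_policy(env, greedy_pol)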

# Questions are answered in the Jupyter notebook.
Example #3
def render(self):
    """Renders the Q-function learned"""
    gui.render_q(self.GridWorld, self.SAV[:, :, self.max_iter - 1])
Example #4
env.render = True
state = 0
fps = 1
for i in range(5):
    action = np.random.choice(env.state_actions[state])
    nexts, reward, term = env.step(state, action)
    state = nexts
    time.sleep(1. / fps)

################################################################################
# You can also visualize the q-function using render_q
################################################################################
# first get the maximum number of actions available
max_act = max(map(len, env.state_actions))
q = np.random.rand(env.n_states, max_act)
gui.render_q(env, q)

################################################################################
# Work to do: Q4
################################################################################
# Here are the V-function and Q-function to be used for Question 4.
v_q4 = [
    0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.67106071, -0.99447514,
    0.00000000, -0.82847001, -0.87691855, -0.93358351, -0.99447514
]
q_q4 = [[0.87691855, 0.65706417], [0.92820033, 0.84364237],
        [0.98817903, -0.75639924, 0.89361129], [0.00000000],
        [-0.62503460, 0.67106071], [-0.99447514, -0.70433689, 0.75620264],
        [0.00000000], [-0.82847001, 0.49505225], [-0.87691855, -0.79703229],
        [-0.93358351, -0.84424050, -0.93896668], [-0.89268904, -0.99447514]]
gui.render_q(env, q_q4)
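
################################################################################
# Sketch: gui.render_q is called here with a ragged list (one value per
# available action in each state), whereas the random q above is a rectangular
# array. Assuming that column i of row s corresponds to env.state_actions[s][i],
# the rectangular form can be converted to the ragged one as follows (this is
# an illustration, not part of the original assignment code).
################################################################################
q_ragged = [q[s, :len(env.state_actions[s])] for s in range(env.n_states)]
gui.render_q(env, q_ragged)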
Example #5
                                           label=f'epsilon = {eps} / k = {k}',
                                           ax=ax2[i])
        model.plot_reward(discounted=True,
                          yline=model.avg_value_infty,
                          label=f'epsilon = {eps} / k = {k}',
                          ax=ax3[i])

for i in range(len(epss)):
    ax1[i].set_title('')
    ax2[i].set_title('')
    ax3[i].set_title('')

f1.suptitle(r'$||v^* - v^{\pi_n}||_{\infty}$ in terms of number of iterations')
f2.suptitle(r'$J_n - J^\pi$ in terms of number of iterations')
f3.suptitle('Empirical average of discounted reward of each episode')
plt.show()

# Optimal policy render: learn Q with a polynomial learning-rate schedule
# alpha(k) = 1 / k**0.7, used for every (state, action) pair.
alpha = lambda x: 1 / (x**0.7)
learning_rates = np.full((n_states, n_actions), alpha)
model.learn_q(10000, tmax, 0.2, learning_rates)

pol = model.policy_n[:, -1]
gui.render_policy(env, pol)

q_formatted = [
    model.Q_n[state, :, -1][model.Q_n[state, :, -1] != -np.inf]
    for state in range(n_states)
]
gui.render_q(env, q_formatted)
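
################################################################################
# The comprehension above drops the -inf entries, which appear to mark actions
# that are unavailable in a state, leaving one ragged row per state as
# gui.render_q expects. Toy illustration of the filtering (made-up values;
# numpy is assumed to be imported as np, as in the code above):
################################################################################
toy_row = np.array([0.5, -np.inf, 0.2])
print(toy_row[toy_row != -np.inf])  # -> [0.5 0.2]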
Example #6
    plt.scatter(range(N), cumulatedReward)
    plt.xlabel("Episodes", fontsize=16)
    plt.ylabel("Cumulative reward per episode", fontsize=16)
    plt.show()

    # Plot the cumulative reward accumulated over all past episodes
    plt.figure()
    plt.plot(range(N), np.cumsum(cumulatedReward))
    plt.xlabel("Episodes", fontsize=16)
    plt.ylabel("Cumulative reward over episodes", fontsize=16)
    plt.show()

    return Q


def Vpn(env, n):
    """Estimate V(s) = max_a Q(s, a) from a Q-learning run of n episodes."""
    Q = QLearning(env, N=n, gamma=0.95, Tmax=20)
    Vpi = np.zeros(env.n_states)
    for s in range(env.n_states):
        Vpi[s] = np.max(Q[s])
    return Vpi
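
# Usage sketch for Vpn above (assumption: `env` is the same gridworld
# environment used throughout this script; n is the number of episodes).
Vpi_hat = Vpn(env, n=5000)
print(Vpi_hat)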


Qdict = QLearning(env, N=20000, Tmax=20)

# Rendering Q-Value in Grid World
Q = np.zeros((env.n_states, 4))
for i in range(env.n_states):
    Q[i, :] = Qdict[i]
gui.render_q(env, Q)