def render(self): """Renders the Q-function and policy learned""" if self.Q is None: self.Q = self.V + np.swapaxes(self.C[0, :, :], 0, 1).reshape( (self.actions.n_prim + self.actions.n_opt, self.GridWorld.n_states)) gui.render_q(self.GridWorld, self.Q) # Need a way to include options gui.render_policy(self.GridWorld, self.policy) # Need a way to include options
# Policy definition
# If you want to represent a deterministic action, you can just use the number
# of the action. Recall that in the terminal states only action 0 (right) is
# defined.
# In this case, you can use gui.render_policy to visualize the policy
################################################################################
pol = [1, 2, 0, 0, 1, 1, 0, 0, 0, 0, 3]
gui.render_policy(env, pol)

################################################################################
# Try to simulate a trajectory
# You can use env.step(s, a, render=True) to visualize the transition
################################################################################
env.render = True
state = 0
fps = 1
for i in range(5):
    action = np.random.choice(env.state_actions[state])
    nexts, reward, term = env.step(state, action)
    state = nexts
    time.sleep(1. / fps)

################################################################################
# You can also visualize the q-function using render_q
################################################################################
# First, get the maximum number of actions available
max_act = max(map(len, env.state_actions))
q = np.random.rand(env.n_states, max_act)
gui.render_q(env, q)

# Questions are answered in the Jupyter notebook
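################################################################################
# Hedged sketch (not part of the original lab): the trajectory loop above always
# runs for five steps and ignores the `term` flag returned by env.step. Assuming
# `term` becomes True when a terminal state is reached (an assumption, not
# confirmed above), a rollout can instead stop at termination and accumulate the
# discounted return. The discount factor below is an arbitrary illustrative choice.
################################################################################
gamma = 0.95  # illustrative discount factor (assumption)
state, total, discount = 0, 0.0, 1.0
for t in range(100):  # hard cap in case the episode never terminates
    action = np.random.choice(env.state_actions[state])
    state, reward, term = env.step(state, action)
    total += discount * reward
    discount *= gamma
    if term:
        break
print("Discounted return of the random rollout:", total)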
def render(self): """Renders the Q-function learned""" gui.render_q(self.GridWorld, self.SAV[:, :, self.max_iter - 1])
env.render = True
state = 0
fps = 1
for i in range(5):
    action = np.random.choice(env.state_actions[state])
    nexts, reward, term = env.step(state, action)
    state = nexts
    time.sleep(1. / fps)

################################################################################
# You can also visualize the q-function using render_q
################################################################################
# First, get the maximum number of actions available
max_act = max(map(len, env.state_actions))
q = np.random.rand(env.n_states, max_act)
gui.render_q(env, q)

################################################################################
# Work to do: Q4
################################################################################
# Here are the v-function and q-function to be used for question 4
v_q4 = [
    0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.67106071, -0.99447514,
    0.00000000, -0.82847001, -0.87691855, -0.93358351, -0.99447514
]
q_q4 = [[0.87691855, 0.65706417],
        [0.92820033, 0.84364237],
        [0.98817903, -0.75639924, 0.89361129],
        [0.00000000],
        [-0.62503460, 0.67106071],
        [-0.99447514, -0.70433689, 0.75620264],
        [0.00000000],
        [-0.82847001, 0.49505225],
        [-0.87691855, -0.79703229],
        [-0.93358351, -0.84424050, -0.93896668],
        [-0.89268904, -0.99447514]]
gui.render_q(env, q_q4)
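################################################################################
# Hedged sketch (not part of the original lab): q_q4 stores one value per
# available action in each state, so a greedy policy can be read off with an
# argmax over each inner list. This assumes the entries of q_q4[s] follow the
# ordering of env.state_actions[s], so that the argmax index can be mapped back
# to a global action number for gui.render_policy; that ordering is an
# assumption, not something guaranteed by the code above.
################################################################################
pol_q4 = [env.state_actions[s][int(np.argmax(q_q4[s]))]
          for s in range(env.n_states)]
gui.render_policy(env, pol_q4)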
                          label=f'epsilon = {eps} / k = {k}', ax=ax2[i])
        model.plot_reward(discounted=True, yline=model.avg_value_infty,
                          label=f'epsilon = {eps} / k = {k}', ax=ax3[i])

for i in range(len(epss)):
    ax1[i].set_title('')
    ax2[i].set_title('')
    ax3[i].set_title('')

f1.suptitle(r'$||v^* - v^{\pi_n}||_{\infty}$ in terms of number of iterations')
f2.suptitle(r'$J_n - J^\pi$ in terms of number of iterations')
f3.suptitle('Empirical average of discounted reward of each episode')
plt.show()

# Optimal policy render
alpha = lambda x: 1 / (x**0.7)
learning_rates = np.full((n_states, n_actions), alpha)
model.learn_q(10000, tmax, 0.2, learning_rates)
pol = model.policy_n[:, -1]
gui.render_policy(env, pol)
q_formatted = [
    model.Q_n[state, :, -1][model.Q_n[state, :, -1] != -np.inf]
    for state in range(n_states)
]
gui.render_q(env, q_formatted)
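################################################################################
# Hedged note (not part of the original experiment): the step-size schedule
# alpha(t) = 1 / t**0.7 satisfies the usual stochastic-approximation conditions:
# the sum of alpha(t) diverges (exponent <= 1) while the sum of alpha(t)**2
# converges (squared exponent 1.4 > 1). The partial sums below only illustrate
# this trend numerically.
################################################################################
t = np.arange(1, 100001)
print("sum of alpha(t)   up to 1e5:", np.sum(1 / t ** 0.7))  # keeps growing with t
print("sum of alpha(t)^2 up to 1e5:", np.sum(1 / t ** 1.4))  # approaches a finite limit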
    plt.scatter(range(N), cumulatedReward)
    plt.xlabel("Episodes", fontsize=16)
    plt.ylabel("Cumulated reward for each episode", fontsize=16)
    plt.show()

    # Plotting the cumulated reward over all past episodes
    plt.figure()
    plt.plot(range(N), np.cumsum(cumulatedReward))
    plt.xlabel("Episodes", fontsize=16)
    plt.ylabel("Cumulated reward over episodes", fontsize=16)
    plt.show()

    return Q


def Vpn(env, n):
    Q = QLearning(env, N=n, gamma=0.95, Tmax=20)
    Vpi = np.zeros(env.n_states)
    for s in range(env.n_states):
        Vpi[s] = np.max(Q[s])
    return Vpi


Qdict = QLearning(env, N=20000, Tmax=20)

# Rendering the Q-values in the GridWorld
Q = np.zeros((env.n_states, 4))
for i in range(env.n_states):
    Q[i, :] = Qdict[i]
gui.render_q(env, Q)
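################################################################################
# Hedged sketch (not part of the original code): derive and render the greedy
# policy from the dense Q matrix built above. This assumes the 4 columns of Q
# follow the global action numbering expected by gui.render_policy and that
# Qdict provides a value for all four actions in every state (neither is
# guaranteed by the code above).
################################################################################
pol_greedy = list(np.argmax(Q, axis=1))
gui.render_policy(env, pol_greedy)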