    def render(self):
        """Render the learned Q-function and policy."""
        if self.Q is None:
            # Reconstruct Q from V plus the transposed C[0] slice,
            # shaped (n_prim + n_opt, n_states).
            self.Q = self.V + np.swapaxes(self.C[0, :, :], 0, 1).reshape(
                (self.actions.n_prim + self.actions.n_opt, self.GridWorld.n_states))
        gui.render_q(self.GridWorld, self.Q)  # Need a way to include options
        gui.render_policy(self.GridWorld, self.policy)  # Need a way to include options
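# Standalone illustration (not part of the agent above): the swapaxes/reshape
# in render() is a pure shape manipulation. Assuming C[0] is stored as an
# (n_states, n_actions) array and V holds one value per state, the resulting
# Q has one row per action and one column per state. Sizes are hypothetical.
import numpy as np

n_states, n_actions = 11, 4
C0 = np.random.rand(n_states, n_actions)   # assumed layout: (states, actions)
V = np.random.rand(n_states)               # one value per state

# Transpose to (actions, states); broadcasting then adds V across each row.
Q_demo = np.swapaxes(C0, 0, 1).reshape((n_actions, n_states)) + V
print(Q_demo.shape)  # (4, 11)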
################################################################################
# Inspect the environment: state/coordinate mappings and available actions
################################################################################
print(env.state2coord)
print(env.coord2state)
print(env.state_actions)
for i, el in enumerate(env.state_actions):
    print("s{}: {}".format(i, env.action_names[el]))

################################################################################
# Policy definition
# To represent a deterministic action you can just use the number of the
# action. Recall that in the terminal states only action 0 (right) is defined.
# In this case, you can use gui.render_policy to visualize the policy.
################################################################################
pol = [1, 2, 0, 0, 1, 1, 0, 0, 0, 0, 3]
gui.render_policy(env, pol)

################################################################################
# Try to simulate a trajectory
# You can use env.step(s, a, render=True) to visualize the transition.
################################################################################
env.render = True
state = 0
fps = 1
for i in range(5):
    action = np.random.choice(env.state_actions[state])
    nexts, reward, term = env.step(state, action)
    state = nexts
    time.sleep(1. / fps)

################################################################################
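################################################################################
# The loop above always takes 5 steps and ignores the termination flag. Below
# is a minimal sketch (assuming env.step returns (next_state, reward, term)
# as used above, with term a boolean end-of-episode signal) of rolling out
# until the episode terminates, under a hypothetical step cap.
################################################################################
state = 0
cumulative_reward = 0.0
for t in range(100):  # hypothetical cap on episode length
    action = np.random.choice(env.state_actions[state])
    nexts, reward, term = env.step(state, action)
    cumulative_reward += reward
    state = nexts
    time.sleep(1. / fps)
    if term:
        break
print("episode ended after {} steps, cumulative reward = {}".format(t + 1, cumulative_reward))
################################################################################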