# Driver fragment: visualize the final MDP solution, then solve the same
# gridworld with Q-learning and visualize its result.
# NOTE(review): `gw`, `shape`, `start`, `utility_grids`, `policy_grids`,
# `plt`, `plot_convergence`, and `QLearner` are all defined elsewhere in
# the file — presumably `gw` is a GridWorldMDP; confirm against context.

# Final policy/utility from the MDP solver (last iteration slice).
gw.plot_policy(utility_grids[:, :, -1])
plot_convergence(utility_grids, policy_grids)
plt.show()

# Tabular Q-learner over the flattened grid: one state per cell, 4 actions.
ql = QLearner(num_states=(shape[0] * shape[1]),
              num_actions=4,
              learning_rate=0.8,
              discount_rate=0.9,
              random_action_prob=0.5,          # initial epsilon
              random_action_decay_rate=0.99,   # epsilon decay per step
              dyna_iterations=0)               # plain Q-learning, no Dyna planning

# Q-learning works on flat state indices, so convert the (row, col) start.
start_state = gw.grid_coordinates_to_indices(start)

iterations = 1000
flat_policies, flat_utilities = ql.learn(start_state,
                                         gw.generate_experience,
                                         iterations=iterations)

# Reshape the flat per-iteration results back into (rows, cols, iteration)
# grids so they can be plotted like the MDP solver output above.
new_shape = (gw.shape[0], gw.shape[1], iterations)
ql_utility_grids = flat_utilities.reshape(new_shape)
ql_policy_grids = flat_policies.reshape(new_shape)

print('Final result of QLearning:')
print(ql_policy_grids[:, :, -1])
print(ql_utility_grids[:, :, -1])

plt.figure()
gw.plot_policy(ql_utility_grids[:, :, -1], ql_policy_grids[:, :, -1])
plot_convergence(ql_utility_grids, ql_policy_grids)
plt.show()
def solve(self):
    """Build the gridworld, solve it with Value/Policy Iteration and
    Q-learning, and plot policies plus time/steps/reward convergence.

    Fixes over the previous version:
      * ``zip(*coords)`` was subscripted (``coords[0]``) — a TypeError on
        Python 3, where ``zip`` returns an iterator; also crashed when the
        trap/obstacle list was empty.
      * ``dtype=np.bool`` — removed in NumPy 1.24; plain ``bool`` is correct.
    """

    def mask_from_cells(cells):
        # Boolean grid with True at every (row, col) in `cells`.
        # Direct fancy indexing replaces the old sparse.coo_matrix round
        # trip and is safe for an empty coordinate list.
        mask = np.zeros(self.shape, dtype=bool)
        if cells:
            rows, cols = zip(*cells)
            mask[list(rows), list(cols)] = True
        return mask

    def pad_to_full_length(series):
        # Extend a per-iteration metric out to self.iterations[1] samples
        # by repeating its final value, so the MDP-solver curves share an
        # x-axis with the (longer) Q-learning curves.
        pad = np.empty(self.iterations[1] - self.iterations[0])
        pad.fill(series[-1])
        return np.concatenate((series, pad))

    # Reward layout: default everywhere, then goal/trap overrides;
    # obstacle cells carry zero reward.
    reward_grid = np.zeros(self.shape) + self.default_reward
    reward_grid[self.goal] = self.goal_reward
    trap_mask = mask_from_cells(self.traps)
    reward_grid[trap_mask] = self.trap_reward
    obstacle_mask = mask_from_cells(self.obstacles)
    reward_grid[obstacle_mask] = 0

    # Episodes terminate at the goal or in any trap.
    terminal_mask = np.zeros_like(reward_grid, dtype=bool)
    terminal_mask[self.goal] = True
    terminal_mask[trap_mask] = True

    gw = GridWorldMDP(start=self.start,
                      reward_grid=reward_grid,
                      obstacle_mask=obstacle_mask,
                      terminal_mask=terminal_mask,
                      action_probabilities=[
                          (-1, 0.1),  # veer one step counter-clockwise
                          (0, 0.8),   # intended direction
                          (1, 0.1),   # veer one step clockwise
                      ],
                      no_action_probability=0.0)

    size_label = str(self.shape[0]) + 'x' + str(self.shape[1])

    # Show the untouched grid (zero utilities, no policy) before solving.
    utility_grid = np.zeros(self.shape)
    gw.plot_policy(utility_grid, None, size_label + ' Gridworld')

    mdp_solvers = {
        'Value Iteration': gw.run_value_iterations,
        'Policy Iteration': gw.run_policy_iterations,
    }

    time_results = []
    steps_results = []
    reward_results = []

    for solver_name, solver_fn in mdp_solvers.items():
        print('Solving {}:'.format(solver_name))
        title = size_label + ' Gridworld - ' + solver_name
        (policy_grids, utility_grids, time_stamps,
         num_steps, total_reward) = solver_fn(iterations=self.iterations[0],
                                              discount=0.5,
                                              title=title)

        # MDP solvers run iterations[0] steps; pad every metric out to
        # iterations[1] for comparison with Q-learning.
        time_results.append(pad_to_full_length(time_stamps))
        steps_results.append(pad_to_full_length(num_steps))
        reward_results.append(pad_to_full_length(total_reward))

        gw.plot_policy(utility_grids[:, :, -1], None, title)
        plot_convergence(utility_grids, policy_grids, title)

    # Q-learning over the same grid. Hyperparameters were hand-tuned;
    # an earlier grid search over lr/epsilon/decay was removed.
    ql = QLearner(num_states=(self.shape[0] * self.shape[1]),
                  num_actions=4,
                  obstacle_mask=obstacle_mask,
                  terminal_mask=terminal_mask,
                  learning_rate=0.8,
                  discount_rate=0.975,
                  random_action_prob=0.5,
                  random_action_decay_rate=0.89,
                  dyna_iterations=0)

    print('Solving QLearning:')
    # Q-learning works on flat state indices; convert the (row, col) start.
    start_state = gw.grid_coordinates_to_indices(self.start)
    title = size_label + ' Gridworld - Q Learning'
    iterations = self.iterations[1]
    (flat_policies, flat_utilities, time_stamps,
     num_steps, total_reward) = ql.learn(
        start_state,
        gw,
        iterations=iterations,
        title=size_label + '/QL/' + title)

    # Reshape flat per-iteration results into (rows, cols, iteration) grids.
    new_shape = (gw.shape[0], gw.shape[1], iterations)
    ql_utility_grids = flat_utilities.reshape(new_shape)
    ql_policy_grids = flat_policies.reshape(new_shape)

    # Q-learning already ran the full iterations[1] steps — no padding.
    time_results.append(time_stamps)
    steps_results.append(num_steps)
    reward_results.append(total_reward)

    gw.plot_policy(ql_utility_grids[:, :, -1],
                   ql_policy_grids[:, :, -1], title)
    # The last two iteration slices are excluded from the convergence plot;
    # presumably they are partial/unstable — TODO confirm.
    plot_convergence(ql_utility_grids[:, :, 0:-2],
                     ql_policy_grids[:, :, 0:-2], title)

    plot_time(np.array(time_results), size_label + ' Gridworld - Time')
    plot_num_steps(np.array(steps_results),
                   size_label + ' Gridworld - # Steps')
    plot_reward(np.array(reward_results), size_label + ' Gridworld - Reward')