def improve_policy(self, **kwargs):
    """Policy iteration: alternate greedy improvement and policy evaluation.

    Repeatedly sweeps every non-terminal state, backs each available action
    up one step against the current state-value function, and makes the
    policy greedy; the policy is then re-evaluated before the next sweep.
    Terminates when a full sweep changes no action.

    Extra keyword arguments are forwarded to
    ``update_state_value_function``.  Prints the final policy and values.
    """
    policy_changed = True
    i = 0  # number of improvement sweeps performed
    while policy_changed:
        policy_changed = False
        for s in self.action_states:
            self.env.set_state(s)
            # BUG FIX: the original sentinel was -1.  On grids with
            # negative rewards every action value can be below -1, in
            # which case `best_action` silently kept the value left over
            # from the previous state's loop (or was unbound on the
            # first state).  Use -inf and a safe fallback instead.
            best_score = float('-inf')
            best_action = self.policy[s]
            for a in self.env.actions[s]:
                reward = self.env.move(a)
                next_score = reward + self.discount * self.state_values[
                    self.env.current_state()]
                if next_score > best_score:
                    best_action = a
                    best_score = next_score
                self.env.undo_move(a)  # restore state before trying the next action
            if best_action != self.policy[s]:
                self.policy[s] = best_action
                policy_changed = True
        # Re-evaluate the (possibly updated) policy before the next sweep.
        self.update_state_value_function(policy='self', **kwargs)
        i += 1
        # The original guard `if i % 1 == 0` was always true; print directly.
        print("Number of iterations: ", i)
    grid_world.print_policy(self.policy, self.env)
    grid_world.print_values(self.state_values, self.env)
def run(self, nb_iter=10000, **kwargs):
    """Play learning episodes until ``nb_iter`` episodes have been run,
    then print the learned policy and the state values derived from the
    action values.

    Extra keyword arguments are forwarded to ``episode_function``.
    """
    # `episode_function` is expected to advance `self.cur_episode`;
    # loop until the episode budget is exhausted.
    while True:
        if self.cur_episode >= nb_iter:
            break
        self.episode_function(**kwargs)
        self.update_explore_threshold()
    grid_world.print_policy(self.policy, self.env)
    derived_values = self.compute_state_values_from_action_values()
    grid_world.print_values(derived_values, self.env)
def improve_policy(self, nb_iter=1000, **kwargs):
    """Episode-based control loop: play ``nb_iter`` episodes, updating the
    value function and policy after each one, then print the result.

    Extra keyword arguments are forwarded to ``perform_episode``.
    """
    for episode_idx in range(1, nb_iter + 1):
        # One full episode gives aligned state/action/reward sequences.
        trajectory = self.perform_episode(**kwargs)
        states, actions, rewards = trajectory
        self.update_state_value_function(states, actions, rewards)
        self.update_policy()
        # Decay exploration with the (1-based) episode count.
        self.update_explore_threshold(episode_idx)
    grid_world.print_policy(self.policy, self.env)
    grid_world.print_values(self.state_values, self.env)
def perform_value_iteration(self, wind, wind_force=0.5):
    """Value iteration on the gridworld, optionally under a 'windy' dynamic.

    Parameters
    ----------
    wind : 'random' | 'right' | None
        'random' — the chosen action succeeds with probability
        ``wind_force``; otherwise another action is performed.
        'right'  — the wind pushes right with probability ``wind_force``.
        None     — deterministic transitions.
    wind_force : float
        Strength of the wind effect (see above).

    Sweeps states until the largest value change falls below
    ``self.epsilon``; the policy is made greedy during the sweeps.
    Prints the iteration count, final policy and values.
    """
    deltas = [1]
    t = 0  # number of full sweeps
    while max(deltas) > self.epsilon:
        deltas = [None] * len(self.action_states)
        for i, s in enumerate(self.action_states):
            # BUG FIX: sentinel was -1; with negative rewards every
            # action value can be below -1, leaving `best_action` stale
            # (or unbound on the first state).
            best_action_value = float('-inf')
            best_action = self.policy[s]
            self.env.set_state(s)
            old_value = self.state_values[s]
            self.state_values[s] = 0
            for a in self.env.actions[s]:
                if wind == 'random':
                    if self.policy[s] == a:
                        p_a = wind_force
                    else:
                        # BUG FIX: original read
                        #   wind_force * (1 / (len(actions)) - 1)
                        # which is *negative* for any action count > 1.
                        # The misplaced parenthesis is corrected so the
                        # failure probability is split evenly among the
                        # other actions.  NOTE(review): probabilities sum
                        # to 1 only when wind_force == 0.5 — confirm the
                        # intended normalization with the author.
                        p_a = wind_force / (len(self.env.actions[s]) - 1)
                elif wind == 'right':
                    # `a` iterates env.actions[s], so the original extra
                    # membership test `a in self.env.actions[s]` was
                    # always true and has been dropped.
                    if a == 'R':
                        p_a = wind_force
                    else:
                        p_a = 0
                    if self.policy[s] == a:
                        p_a += (1 - wind_force)
                else:
                    # Deterministic: all mass on the policy's action.
                    p_a = int(self.policy[s] == a)
                reward = self.env.move(a)
                next_state = self.env.current_state()
                action_value = reward + self.discount * self.state_values[
                    next_state]
                if action_value > best_action_value:
                    best_action_value = action_value
                    best_action = a
                # Expected value under the wind-perturbed policy.
                self.state_values[s] += p_a * action_value
                self.env.undo_move(a)  # restore state for the next action
            self.policy[s] = best_action
            deltas[i] = np.abs(self.state_values[s] - old_value)
        t += 1
    print("Number of iterations: ", t)
    grid_world.print_policy(self.policy, self.env)
    grid_world.print_values(self.state_values, self.env)


if __name__ == "__main__":
    # Deterministic gridworld.
    optimizer = PolicyOptimizer(environment=grid_world.negative_grid())
    grid_world.print_policy(optimizer.policy, optimizer.env)
    grid_world.print_values(optimizer.state_values, optimizer.env)
    optimizer.perform_value_iteration(wind=None)

    # Windy Gridworld: each action has a 50% chance to fail, another action
    # (chosen at random) is performed instead.
    optimizer = PolicyOptimizer(environment=grid_world.negative_grid())
    grid_world.print_policy(optimizer.policy, optimizer.env)
    grid_world.print_values(optimizer.state_values, optimizer.env)
    optimizer.perform_value_iteration(
        wind='right', wind_force=0.26)  # .25 is the threshold to switch optimal agency
# NOTE(review): this chunk begins mid-method — the enclosing update method's
# header (presumably `for i, (s, s_prime, r) in enumerate(zip(`) lies outside
# this view; confirm against the full file before editing the fragment.
            reversed(states[:-1]), reversed(states[1:]), reversed(rewards)):
        # Backward sweep over (s, s', r) transitions; the step size decays
        # as 1 / log(t + 2) so early episodes move values more than later ones.
        new_value = self.state_values[s] + 1/(np.log(t+2)) * \
            (r + self.discount_factor * self.state_values[s_prime]
             - self.state_values[s])
        # Track the magnitude of the update (convergence diagnostics).
        deltas[i] = np.abs(new_value - self.state_values[s])
        self.state_values[s] = new_value

    def solve_prediction_problem(self, max_iter=10000):
        """Estimate state values for the fixed policy by repeated play.

        Plays ``max_iter`` episodes, updating the value function after each,
        and snapshots the value table every 1000 episodes.

        Returns:
            dict mapping episode index (0, 1000, 2000, ...) to a deep copy
            of ``self.state_values`` at that point.
        """
        state_values = {}
        for t in tqdm(range(max_iter)):
            # `actions` is unpacked but unused here; only states and rewards
            # feed the prediction update.
            states, actions, rewards = self.play_game()
            self.update_state_value_function(states, rewards, t)
            if t % 1000 == 0:
                # Deep copy: the live table keeps mutating afterwards.
                state_values[t] = copy.deepcopy(self.state_values)
        return state_values


if __name__ == "__main__":
    # Prediction under a uniformly random policy, undiscounted.
    a = Agent(grid_world.standard_grid(), policy='random',
              discount_factor=1.0)
    state_values = a.solve_prediction_problem()
    for k, v in state_values.items():
        print(k)
        grid_world.print_values(v, a.env)

    # Same experiment with the 'win-from-start' policy (default discount).
    a = Agent(grid_world.standard_grid(), policy='win-from-start')
    state_values = a.solve_prediction_problem()
    for k, v in state_values.items():
        print(k)
        grid_world.print_values(v, a.env)