import random
import sys

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Base_Agent, Neural_Network, Replay_Buffer, Parallel_Experience_Generator, create_actor_distribution
# and normalise_rewards are project-internal helpers; their exact import paths depend on the
# repository layout, so they are assumed to be importable here.


class DQN_Agent(Base_Agent):
    agent_name = "DQN"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"], config.seed)
        self.q_network_local = Neural_Network(self.state_size, self.action_size, config.seed,
                                              self.hyperparameters, "VANILLA_NN").to(self.device)
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                              lr=self.hyperparameters["learning_rate"])

    def step(self):
        """Runs a step within a game, including a learning step if required"""
        while not self.done:
            self.pick_and_conduct_action()
            self.update_next_state_reward_done_and_score()
            if self.time_for_q_network_to_learn():
                self.q_network_learn()
            self.save_experience()
            self.state = self.next_state  # set the state for the next iteration
            self.episode_step_number += 1
        self.episode_number += 1

    def pick_action(self):
        """Uses the local Q network and an epsilon-greedy policy to pick an action"""
        # PyTorch only accepts mini-batches and not single observations, so we use unsqueeze to add
        # a "fake" batch dimension to the observation
        state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
        self.q_network_local.eval()  # puts the network in evaluation mode
        with torch.no_grad():
            action_values = self.q_network_local(state)
        self.q_network_local.train()  # puts the network back in training mode
        action = self.make_epsilon_greedy_choice(action_values)
        return action

    def make_epsilon_greedy_choice(self, action_values):
        """Picks the greedy action with probability 1 - epsilon, otherwise a random action"""
        epsilon = self.hyperparameters["epsilon"] / (
            1.0 + (self.episode_number / self.hyperparameters["epsilon_decay_rate_denominator"]))
        if random.random() > epsilon:
            return np.argmax(action_values.data.cpu().numpy())
        return random.choice(np.arange(self.action_size))

    def q_network_learn(self, experiences_given=False, experiences=None):
        if not experiences_given:
            states, actions, rewards, next_states, dones = self.sample_experiences()
        else:
            states, actions, rewards, next_states, dones = experiences
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        if self.done:  # we only update the learning rate at the end of each episode
            self.update_learning_rate(self.hyperparameters["learning_rate"], self.q_network_optimizer)
        self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss,
                                    self.hyperparameters["gradient_clipping_norm"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        with torch.no_grad():
            Q_targets = self.compute_q_targets(next_states, rewards, dones)
        Q_expected = self.compute_expected_q_values(states, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def compute_q_targets(self, next_states, rewards, dones):
        Q_targets_next = self.compute_q_values_for_next_states(next_states)
        Q_targets = self.compute_q_values_for_current_states(rewards, Q_targets_next, dones)
        return Q_targets

    def compute_q_values_for_next_states(self, next_states):
        Q_targets_next = self.q_network_local(next_states).detach().max(1)[0].unsqueeze(1)
        return Q_targets_next

    def compute_q_values_for_current_states(self, rewards, Q_targets_next, dones):
        Q_targets_current = rewards + (self.hyperparameters["discount_rate"] * Q_targets_next * (1 - dones))
        return Q_targets_current

    def compute_expected_q_values(self, states, actions):
        Q_expected = self.q_network_local(states).gather(1, actions.long())  # actions must be long to be used as an index
        return Q_expected

    def locally_save_policy(self):
        torch.save(self.q_network_local.state_dict(), "Models/{}_local_network.pt".format(self.agent_name))

    def time_for_q_network_to_learn(self):
        return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from()

    def right_amount_of_steps_taken(self):
        return self.episode_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def sample_experiences(self):
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        return states, actions, rewards, next_states, dones
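# --- Illustration (not part of the agent above): a minimal sketch of the temporal-difference target that
# compute_loss builds. Everything here is hypothetical (the toy linear network, batch size and discount rate)
# and exists only to show the shapes and the r + gamma * max_a' Q(s', a') * (1 - done) calculation.
def _demo_dqn_td_target():
    q_net = torch.nn.Linear(4, 2)              # stands in for q_network_local (state_size=4, action_size=2)
    states = torch.randn(3, 4)                 # batch of 3 states
    actions = torch.randint(0, 2, (3, 1))      # actions actually taken, shape (batch, 1)
    rewards = torch.randn(3, 1)
    next_states = torch.randn(3, 4)
    dones = torch.zeros(3, 1)                  # 1.0 wherever the episode ended
    discount_rate = 0.99

    with torch.no_grad():                      # targets are treated as constants, as in compute_loss
        q_next = q_net(next_states).max(1)[0].unsqueeze(1)          # max_a' Q(s', a')
        q_targets = rewards + discount_rate * q_next * (1 - dones)  # r + gamma * max_a' Q(s', a') * (1 - done)
    q_expected = q_net(states).gather(1, actions)                   # Q(s, a) for the chosen actions
    return F.mse_loss(q_expected, q_targets)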
class REINFORCE_Agent(Base_Agent):
    agent_name = "REINFORCE"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.policy = Neural_Network(self.state_size, self.action_size, config.seed,
                                     self.hyperparameters, "VANILLA_NN").to(self.device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=self.hyperparameters["learning_rate"])
        self.episode_rewards = []
        self.episode_log_probabilities = []

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        self.environment.reset_environment()
        self.state = self.environment.get_state()
        self.next_state = None
        self.action = None
        self.reward = None
        self.done = False
        self.total_episode_score_so_far = 0
        self.episode_rewards = []
        self.episode_log_probabilities = []
        self.episode_step_number = 0

    def step(self):
        """Runs a step within a game, including a learning step if required"""
        while not self.done:
            self.pick_and_conduct_action_and_save_log_probabilities()
            self.update_next_state_reward_done_and_score()
            self.store_reward()
            if self.time_to_learn():
                self.actor_learn()
            self.state = self.next_state  # set the state for the next iteration
            self.episode_step_number += 1
        self.episode_number += 1

    def pick_and_conduct_action_and_save_log_probabilities(self):
        action, log_probabilities = self.pick_action_and_get_log_probabilities()
        self.store_log_probabilities(log_probabilities)
        self.store_action(action)
        self.conduct_action()

    def pick_action_and_get_log_probabilities(self):
        # PyTorch only accepts mini-batches and not individual observations, so we add
        # a "fake" batch dimension to our observation using unsqueeze
        state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
        action_probabilities = self.policy.forward(state).cpu()
        action_distribution = Categorical(action_probabilities)  # creates a distribution to sample from
        action = action_distribution.sample()
        return action.item(), action_distribution.log_prob(action)

    def store_log_probabilities(self, log_probabilities):
        self.episode_log_probabilities.append(log_probabilities)

    def store_action(self, action):
        self.action = action

    def store_reward(self):
        self.episode_rewards.append(self.reward)

    def actor_learn(self):
        total_discounted_reward = self.calculate_episode_discounted_reward()
        policy_loss = self.calculate_policy_loss_on_episode(total_discounted_reward)
        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

    def calculate_episode_discounted_reward(self):
        discounts = self.hyperparameters["discount_rate"] ** np.arange(len(self.episode_rewards))
        total_discounted_reward = np.dot(discounts, self.episode_rewards)
        return total_discounted_reward

    def calculate_policy_loss_on_episode(self, total_discounted_reward):
        policy_loss = []
        for log_prob in self.episode_log_probabilities:
            policy_loss.append(-log_prob * total_discounted_reward)
        policy_loss = torch.cat(policy_loss).sum()  # add up the losses across the episode to get one overall loss
        return policy_loss

    def time_to_learn(self):
        """Tells us whether it is time for the algorithm to learn. With REINFORCE we only learn at the end of
        every episode, so this just returns whether the episode is over"""
        return self.done
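# --- Illustration (not part of the agent above): a minimal sketch of the REINFORCE loss that actor_learn
# minimises. The toy policy, the fixed reward of 1.0 per step and the discount rate are hypothetical and
# exist only to show how Categorical.log_prob feeds into -log_prob * total_discounted_reward.
def _demo_reinforce_loss():
    policy = torch.nn.Sequential(torch.nn.Linear(4, 2), torch.nn.Softmax(dim=-1))
    episode_rewards, episode_log_probs = [], []
    for _ in range(5):                          # pretend we played 5 steps of one episode
        state = torch.randn(1, 4)               # fake observation with a "fake" batch dimension
        distribution = Categorical(policy(state))
        action = distribution.sample()
        episode_log_probs.append(distribution.log_prob(action))
        episode_rewards.append(1.0)             # fake reward for this step
    discounts = 0.99 ** np.arange(len(episode_rewards))
    total_discounted_reward = float(np.dot(discounts, episode_rewards))
    # every step's log probability is weighted by the same total discounted return, as in
    # calculate_policy_loss_on_episode
    return torch.cat([-log_prob * total_discounted_reward for log_prob in episode_log_probs]).sum()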
class PPO_Agent(Base_Agent):
    agent_name = "PPO"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.policy_output_size = self.calculate_policy_output_size()
        self.policy_new = Neural_Network(self.state_size, self.policy_output_size, self.random_seed,
                                         self.hyperparameters, "VANILLA_NN").to(self.device)
        self.policy_old = Neural_Network(self.state_size, self.policy_output_size, self.random_seed,
                                         self.hyperparameters, "VANILLA_NN").to(self.device)
        self.max_steps_per_episode = config.environment.get_max_steps_per_episode()
        self.policy_new_optimizer = optim.Adam(self.policy_new.parameters(),
                                               lr=self.hyperparameters["learning_rate"])
        self.episode_number = 0
        self.many_episode_states = []
        self.many_episode_actions = []
        self.many_episode_rewards = []
        self.experience_generator = Parallel_Experience_Generator(self.environment, self.policy_new,
                                                                  self.random_seed, self.hyperparameters)

    def calculate_policy_output_size(self):
        """Calculates the size of the policy network's output layer"""
        if self.action_types == "DISCRETE":
            return self.action_size
        elif self.action_types == "CONTINUOUS":
            return self.action_size * 2  # one parameter for the mean and one for the std of each action dimension

    def step(self):
        self.many_episode_states, self.many_episode_actions, self.many_episode_rewards = \
            self.experience_generator.play_n_episodes(self.hyperparameters["episodes_per_learning_round"])
        self.episode_number += self.hyperparameters["episodes_per_learning_round"]
        self.policy_learn()
        self.update_learning_rate(self.hyperparameters["learning_rate"], self.policy_new_optimizer)
        self.equalise_policies()

    def policy_learn(self):
        """A learning round for the policy"""
        all_discounted_returns = self.calculate_all_discounted_returns()
        if self.hyperparameters["normalise_rewards"]:
            all_discounted_returns = normalise_rewards(all_discounted_returns)
        for _ in range(self.hyperparameters["learning_iterations_per_round"]):
            all_ratio_of_policy_probabilities = self.calculate_all_ratio_of_policy_probabilities()
            loss = self.calculate_loss([all_ratio_of_policy_probabilities], all_discounted_returns)
            self.take_policy_new_optimisation_step(loss)

    def calculate_all_discounted_returns(self):
        all_discounted_returns = []
        for episode in range(len(self.many_episode_states)):
            discounted_returns = [0]
            for ix in range(len(self.many_episode_states[episode])):
                return_value = (self.many_episode_rewards[episode][-(ix + 1)] +
                                self.hyperparameters["discount_rate"] * discounted_returns[-1])
                discounted_returns.append(return_value)
            discounted_returns = discounted_returns[1:]
            all_discounted_returns.extend(discounted_returns[::-1])
        return all_discounted_returns

    def calculate_all_ratio_of_policy_probabilities(self):
        all_states = [state for states in self.many_episode_states for state in states]
        all_actions = [action for actions in self.many_episode_actions for action in actions]
        all_states = torch.stack([torch.Tensor(states).float().to(self.device) for states in all_states])
        all_actions = torch.stack([torch.Tensor(actions).float().to(self.device) for actions in all_actions])
        all_actions = all_actions.view(-1, len(all_states))
        new_policy_distribution_log_prob = self.calculate_log_probability_of_actions(self.policy_new, all_states, all_actions)
        old_policy_distribution_log_prob = self.calculate_log_probability_of_actions(self.policy_old, all_states, all_actions)
        ratio_of_policy_probabilities = torch.exp(new_policy_distribution_log_prob) / (
            torch.exp(old_policy_distribution_log_prob) + 1e-8)
        return ratio_of_policy_probabilities

    def calculate_log_probability_of_actions(self, policy, states, actions):
        """Calculates the log probability of an action occurring given a policy and starting state"""
        policy_output = policy.forward(states).to(self.device)
        policy_distribution = create_actor_distribution(self.action_types, policy_output, self.action_size)
        policy_distribution_log_prob = policy_distribution.log_prob(actions)
        return policy_distribution_log_prob

    def calculate_loss(self, all_ratio_of_policy_probabilities, all_discounted_returns):
        """Calculates the PPO loss"""
        all_ratio_of_policy_probabilities = torch.squeeze(torch.stack(all_ratio_of_policy_probabilities))
        all_ratio_of_policy_probabilities = torch.clamp(input=all_ratio_of_policy_probabilities,
                                                        min=-sys.maxsize, max=sys.maxsize)
        all_discounted_returns = torch.tensor(all_discounted_returns).to(all_ratio_of_policy_probabilities)
        potential_loss_value_1 = all_discounted_returns * all_ratio_of_policy_probabilities
        potential_loss_value_2 = all_discounted_returns * self.clamp_probability_ratio(all_ratio_of_policy_probabilities)
        loss = torch.min(potential_loss_value_1, potential_loss_value_2)
        loss = -torch.mean(loss)
        return loss

    def clamp_probability_ratio(self, value):
        """Clamps the probability ratio to the range [1 - clip_epsilon, 1 + clip_epsilon]"""
        return torch.clamp(input=value, min=1.0 - self.hyperparameters["clip_epsilon"],
                           max=1.0 + self.hyperparameters["clip_epsilon"])

    def take_policy_new_optimisation_step(self, loss):
        self.policy_new_optimizer.zero_grad()  # reset gradients to 0
        loss.backward()  # calculate the gradients
        torch.nn.utils.clip_grad_norm_(self.policy_new.parameters(),
                                       self.hyperparameters["gradient_clipping_norm"])  # clip gradients to help stabilise training
        self.policy_new_optimizer.step()  # apply the gradients

    def equalise_policies(self):
        """Sets the old policy's parameters equal to the new policy's parameters"""
        for old_param, new_param in zip(self.policy_old.parameters(), self.policy_new.parameters()):
            old_param.data.copy_(new_param.data)

    def save_result(self):
        """Appends each finished episode's total reward and the rolling average score to the results lists"""
        for ep in range(len(self.many_episode_rewards)):
            total_reward = np.sum(self.many_episode_rewards[ep])
            self.game_full_episode_scores.append(total_reward)
            self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
        self.save_max_result_seen()
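# --- Illustration (not part of the agent above): a minimal sketch of the clipped surrogate objective that
# calculate_loss and clamp_probability_ratio implement. The ratio values, returns and clip_epsilon below
# are hypothetical and chosen only to show the effect of the clamp.
def _demo_ppo_clipped_objective():
    clip_epsilon = 0.2
    ratios = torch.tensor([0.5, 1.0, 1.5])     # pi_new(a|s) / pi_old(a|s) for three transitions
    returns = torch.tensor([1.0, -1.0, 2.0])   # discounted returns playing the role of the advantage signal
    clipped_ratios = torch.clamp(ratios, min=1.0 - clip_epsilon, max=1.0 + clip_epsilon)
    surrogate = torch.min(returns * ratios, returns * clipped_ratios)  # element-wise minimum, as in calculate_loss
    return -torch.mean(surrogate)              # negated because the optimiser minimises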