def __init__(self, config):
    """Builds the replay buffer, Q network, optimizer and exploration strategy.

    config is handed to Base_Agent.__init__ and to Epsilon_Greedy_Exploration;
    config.seed is passed to the replay buffer.
    """
    Base_Agent.__init__(self, config)
    settings = self.hyperparameters
    self.memory = Replay_Buffer(
        settings["buffer_size"],
        settings["batch_size"],
        config.seed,
    )
    q_network = self.create_NN(input_dim=self.state_size,
                               output_dim=self.action_size)
    self.q_network_local = q_network
    # The optimizer only ever sees the parameters of the local Q network
    self.q_network_optimizer = optim.Adam(q_network.parameters(),
                                          lr=settings["learning_rate"])
    self.exploration_strategy = Epsilon_Greedy_Exploration(config)
def __init__(self, config):
    """Builds the twin critics, their targets, the actor, the replay buffer and the entropy temperature.

    Asserts that the action type is discrete, that the actor's final layer is a
    softmax and that the add_extra_noise hyperparameter is switched off.
    """
    Base_Agent.__init__(self, config)
    assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
    assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
    self.hyperparameters = config.hyperparameters
    settings = self.hyperparameters

    # Every critic maps a state to one value per action
    critic_shape = dict(input_dim=self.state_size,
                        output_dim=self.action_size,
                        key_to_use="Critic")
    self.critic_local = self.create_NN(**critic_shape)
    # The second critic is created with a different seed from the first
    self.critic_local_2 = self.create_NN(override_seed=self.config.seed + 1,
                                         **critic_shape)
    self.critic_optimizer = torch.optim.Adam(
        self.critic_local.parameters(),
        lr=settings["Critic"]["learning_rate"],
        eps=1e-4)
    self.critic_optimizer_2 = torch.optim.Adam(
        self.critic_local_2.parameters(),
        lr=settings["Critic"]["learning_rate"],
        eps=1e-4)
    self.critic_target = self.create_NN(**critic_shape)
    self.critic_target_2 = self.create_NN(**critic_shape)
    # Each target network starts as a copy of its local counterpart
    for local_net, target_net in ((self.critic_local, self.critic_target),
                                  (self.critic_local_2, self.critic_target_2)):
        Base_Agent.copy_model_over(local_net, target_net)

    self.memory = Replay_Buffer(settings["Critic"]["buffer_size"],
                                settings["batch_size"],
                                self.config.seed)

    self.actor_local = self.create_NN(input_dim=self.state_size,
                                      output_dim=self.action_size,
                                      key_to_use="Actor")
    self.actor_optimizer = torch.optim.Adam(
        self.actor_local.parameters(),
        lr=settings["Actor"]["learning_rate"],
        eps=1e-4)

    self.automatic_entropy_tuning = settings["automatically_tune_entropy_hyperparameter"]
    if not self.automatic_entropy_tuning:
        # Fixed temperature taken straight from the hyperparameters
        self.alpha = settings["entropy_term_weight"]
    else:
        # we set the max possible entropy as the target entropy
        self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha],
                                lr=settings["Actor"]["learning_rate"],
                                eps=1e-4)

    assert not settings["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment"
    self.add_extra_noise = False
    self.do_evaluation_iterations = settings["do_evaluation_iterations"]
class HER_Base(object):
    """Methods that let an agent store and sample hindsight experience replay (HER) data.

    The methods read attributes this class never sets itself (for example
    self.config, self.environment, self.hyperparameters and self.memory), so
    they have to be supplied by whatever this class is combined with.
    """

    def __init__(self, buffer_size, batch_size, HER_sample_proportion):
        """Creates the HER buffer and splits batch_size between the ordinary and HER buffers"""
        self.HER_memory = Replay_Buffer(buffer_size, batch_size,
                                        self.config.seed)
        ordinary_share = 1.0 - HER_sample_proportion
        self.ordinary_buffer_batch_size = int(batch_size * ordinary_share)
        # Whatever the ordinary buffer does not provide is drawn from the HER buffer
        self.HER_buffer_batch_size = batch_size - self.ordinary_buffer_batch_size

    def reset_game(self):
        """Resets the environment and clears all per-episode bookkeeping"""
        first_state_dict = self.environment.reset()
        self.state_dict = first_state_dict
        self.observation = first_state_dict["observation"]
        self.desired_goal = first_state_dict["desired_goal"]
        self.achieved_goal = first_state_dict["achieved_goal"]
        self.state = self.create_state_from_observation_and_desired_goal(
            self.observation, self.desired_goal)

        self.next_state = None
        self.action = None
        self.reward = None
        self.done = False

        # One independent, empty list per quantity tracked over the episode
        for history_name in ("episode_states",
                             "episode_rewards",
                             "episode_actions",
                             "episode_next_states",
                             "episode_dones",
                             "episode_desired_goals",
                             "episode_achieved_goals",
                             "episode_observations",
                             "episode_next_desired_goals",
                             "episode_next_achieved_goals",
                             "episode_next_observations"):
            setattr(self, history_name, [])

        self.total_episode_score_so_far = 0

    def track_changeable_goal_episodes_data(self):
        """Appends the latest transition, including its goal information, to the episode histories"""
        self.episode_rewards.append(self.reward)
        self.episode_actions.append(self.action)
        self.episode_dones.append(self.done)
        self.episode_states.append(self.state)
        self.episode_next_states.append(self.next_state)

        current = self.state_dict
        self.episode_desired_goals.append(current["desired_goal"])
        self.episode_achieved_goals.append(current["achieved_goal"])
        self.episode_observations.append(current["observation"])

        upcoming = self.next_state_dict
        self.episode_next_desired_goals.append(upcoming["desired_goal"])
        self.episode_next_achieved_goals.append(upcoming["achieved_goal"])
        self.episode_next_observations.append(upcoming["observation"])

    def conduct_action_in_changeable_goal_envs(self, action):
        """Steps the environment with action and stores the resulting goal-based transition"""
        step_result = self.environment.step(action)
        self.next_state_dict, self.reward, self.done, _ = step_result
        # The score is accumulated before any clipping is applied to the reward
        self.total_episode_score_so_far += self.reward
        if self.hyperparameters["clip_rewards"]:
            self.reward = max(min(self.reward, 1.0), -1.0)

        latest = self.next_state_dict
        self.observation = latest["observation"]
        self.desired_goal = latest["desired_goal"]
        self.achieved_goal = latest["achieved_goal"]
        self.next_state = self.create_state_from_observation_and_desired_goal(
            self.observation, self.desired_goal)

    def create_state_from_observation_and_desired_goal(self, observation,
                                                       desired_goal):
        """Returns the observation and the desired goal concatenated into one array"""
        pieces = (observation, desired_goal)
        return np.concatenate(pieces)

    def save_alternative_experience(self):
        """Stores the episode again with the most recently achieved goal treated as the goal"""
        new_goal = self.achieved_goal
        build_state = self.create_state_from_observation_and_desired_goal

        new_states = []
        for observation in self.episode_observations:
            new_states.append(build_state(observation, new_goal))

        new_next_states = []
        for observation in self.episode_next_observations:
            new_next_states.append(build_state(observation, new_goal))

        # Rewards are recomputed by the environment against the substituted goal
        new_rewards = []
        for next_achieved_goal in self.episode_next_achieved_goals:
            new_rewards.append(
                self.environment.compute_reward(next_achieved_goal, new_goal,
                                                None))

        if self.hyperparameters["clip_rewards"]:
            clipped_rewards = []
            for reward in new_rewards:
                clipped_rewards.append(max(min(reward, 1.0), -1.0))
            new_rewards = clipped_rewards

        self.HER_memory.add_experience(new_states, self.episode_actions,
                                       new_rewards, new_next_states,
                                       self.episode_dones)

    def sample_from_HER_and_Ordinary_Buffer(self):
        """Draws from the ordinary and the HER buffer and returns the two samples concatenated"""
        ordinary_sample = self.memory.sample(self.ordinary_buffer_batch_size)
        states, actions, rewards, next_states, dones = ordinary_sample

        hindsight_sample = self.HER_memory.sample(self.HER_buffer_batch_size)
        HER_states, HER_actions, HER_rewards, HER_next_states, HER_dones = hindsight_sample

        # Ordinary samples come first in every concatenated tensor
        states = torch.cat((states, HER_states))
        actions = torch.cat((actions, HER_actions))
        rewards = torch.cat((rewards, HER_rewards))
        next_states = torch.cat((next_states, HER_next_states))
        dones = torch.cat((dones, HER_dones))
        return states, actions, rewards, next_states, dones
def __init__(self, buffer_size, batch_size, HER_sample_proportion):
    """Creates the HER replay buffer and works out how many samples each buffer contributes per batch.

    The HER buffer is seeded with self.config.seed, so self.config must already
    be set on the instance when this runs.
    """
    self.HER_memory = Replay_Buffer(buffer_size, batch_size, self.config.seed)
    ordinary_share = 1.0 - HER_sample_proportion
    ordinary_count = int(batch_size * ordinary_share)
    self.ordinary_buffer_batch_size = ordinary_count
    # The HER buffer supplies whatever is left of the batch
    self.HER_buffer_batch_size = batch_size - ordinary_count
class DQN(Base_Agent):
    """A deep Q learning agent.

    Both the predicted Q values and the bootstrap targets come from the single
    network self.q_network_local; this class keeps no separate target network.
    """
    agent_name = "DQN"

    def __init__(self, config):
        """Builds the replay buffer, Q network, optimizer and exploration strategy from config"""
        Base_Agent.__init__(self, config)
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.q_network_local = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

    def reset_game(self):
        """Resets the game via the parent class, then refreshes the optimizer's learning rate"""
        super(DQN, self).reset_game()
        self.update_learning_rate(self.hyperparameters["learning_rate"],
                                  self.q_network_optimizer)

    def step(self):
        """Plays until self.done is set, running learning iterations whenever they are due"""
        while not self.done:
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_q_network_to_learn():
                for _ in range(self.hyperparameters["learning_iterations"]):
                    self.learn()
            self.save_experience()
            self.state = self.next_state  # this is to set the state for the next iteration
            self.global_step_number += 1
        self.episode_number += 1

    def pick_action(self, state=None):
        """Uses the local Q network and an epsilon greedy policy to pick an action.

        state defaults to self.state. A scalar integer state (a Python int or
        any NumPy integer type) is wrapped into a one-element array first.
        Returns whatever the exploration strategy returns.
        """
        # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
        # a "fake" dimension to make it a mini-batch rather than a single observation
        if state is None:
            state = self.state
        # np.integer covers every NumPy integer width (np.int32, np.int64, ...);
        # torch.from_numpy below only accepts ndarrays, so scalars must be wrapped
        if isinstance(state, (int, np.integer)):
            state = np.array([state])
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        if len(state.shape) < 2:
            state = state.unsqueeze(0)
        self.q_network_local.eval()  # puts network in evaluation mode
        with torch.no_grad():
            action_values = self.q_network_local(state)
        self.q_network_local.train()  # puts network back in training mode
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {
                "action_values": action_values,
                "turn_off_exploration": self.turn_off_exploration,
                "episode_number": self.episode_number
            })
        self.logger.info("Q values {} -- Action chosen {}".format(
            action_values, action))
        return action

    def learn(self, experiences=None):
        """Runs a learning iteration for the Q network.

        experiences is an optional (states, actions, rewards, next_states,
        dones) tuple; when it is None a batch is drawn with sample_experiences.
        """
        if experiences is None:
            states, actions, rewards, next_states, dones = self.sample_experiences()  # Sample experiences
        else:
            states, actions, rewards, next_states, dones = experiences
        loss = self.compute_loss(states, next_states, rewards, actions, dones)

        actions_list = [action_X.item() for action_X in actions]
        self.logger.info("Action counts {}".format(Counter(actions_list)))
        self.take_optimisation_step(
            self.q_network_optimizer, self.q_network_local, loss,
            self.hyperparameters["gradient_clipping_norm"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the mean squared error between the predicted Q values and the Q targets"""
        # Targets are built without gradient tracking so only Q_expected is differentiated
        with torch.no_grad():
            Q_targets = self.compute_q_targets(next_states, rewards, dones)
        Q_expected = self.compute_expected_q_values(states, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def compute_q_targets(self, next_states, rewards, dones):
        """Computes the q_targets we will compare to predicted q values to create the loss to train the Q network"""
        Q_targets_next = self.compute_q_values_for_next_states(next_states)
        Q_targets = self.compute_q_values_for_current_states(
            rewards, Q_targets_next, dones)
        return Q_targets

    def compute_q_values_for_next_states(self, next_states):
        """Returns the largest local-network Q value of each next state, as a column"""
        Q_targets_next = self.q_network_local(next_states).detach().max(
            1)[0].unsqueeze(1)
        return Q_targets_next

    def compute_q_values_for_current_states(self, rewards, Q_targets_next,
                                            dones):
        """Returns rewards plus the discounted next-state values, with the latter zeroed where dones is 1"""
        Q_targets_current = rewards + (self.hyperparameters["discount_rate"] *
                                       Q_targets_next * (1 - dones))
        return Q_targets_current

    def compute_expected_q_values(self, states, actions):
        """Returns the local network's Q value for the action taken in each state"""
        Q_expected = self.q_network_local(states).gather(
            1, actions.long())  # must convert actions to long so can be used as index
        return Q_expected

    def locally_save_policy(self):
        """Saves the local Q network's state dict under Models/, named after agent_name"""
        torch.save(self.q_network_local.state_dict(),
                   "Models/{}_local_network.pt".format(self.agent_name))

    def time_for_q_network_to_learn(self):
        """Returns boolean indicating whether enough steps have been taken for learning to begin and there are
        enough experiences in the replay buffer to learn from"""
        return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from()

    def right_amount_of_steps_taken(self):
        """Returns True when global_step_number is a multiple of update_every_n_steps"""
        return self.global_step_number % self.hyperparameters[
            "update_every_n_steps"] == 0

    def sample_experiences(self):
        """Draws a random sample of experience from the memory buffer"""
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        return states, actions, rewards, next_states, dones
def __init__(self, config):
    """Builds the twin critics, their targets, the actor, the replay buffer, the entropy temperature and optional noise.

    Asserts that the action type is continuous and that the actor's final
    layer is not a softmax.
    """
    Base_Agent.__init__(self, config)
    assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
    assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
    self.hyperparameters = config.hyperparameters
    settings = self.hyperparameters

    # Every critic takes a state concatenated with an action and outputs one value
    critic_shape = dict(input_dim=self.state_size + self.action_size,
                        output_dim=1,
                        key_to_use="Critic")
    self.critic_local = self.create_NN(**critic_shape)
    # The second critic is created with a different seed from the first
    self.critic_local_2 = self.create_NN(override_seed=self.config.seed + 1,
                                         **critic_shape)
    self.critic_optimizer = torch.optim.Adam(
        self.critic_local.parameters(),
        lr=settings["Critic"]["learning_rate"],
        eps=1e-4)
    self.critic_optimizer_2 = torch.optim.Adam(
        self.critic_local_2.parameters(),
        lr=settings["Critic"]["learning_rate"],
        eps=1e-4)
    self.critic_target = self.create_NN(**critic_shape)
    self.critic_target_2 = self.create_NN(**critic_shape)
    # Each target network starts as a copy of its local counterpart
    for local_net, target_net in ((self.critic_local, self.critic_target),
                                  (self.critic_local_2, self.critic_target_2)):
        Base_Agent.copy_model_over(local_net, target_net)

    self.memory = Replay_Buffer(settings["Critic"]["buffer_size"],
                                settings["batch_size"],
                                self.config.seed)

    # The actor emits two numbers per action dimension
    self.actor_local = self.create_NN(input_dim=self.state_size,
                                      output_dim=self.action_size * 2,
                                      key_to_use="Actor")
    self.actor_optimizer = torch.optim.Adam(
        self.actor_local.parameters(),
        lr=settings["Actor"]["learning_rate"],
        eps=1e-4)

    self.automatic_entropy_tuning = settings["automatically_tune_entropy_hyperparameter"]
    if not self.automatic_entropy_tuning:
        # Fixed temperature taken straight from the hyperparameters
        self.alpha = settings["entropy_term_weight"]
    else:
        action_shape = torch.Tensor(self.environment.action_space.shape)
        self.target_entropy = -torch.prod(action_shape.to(self.device)).item()  # heuristic value from the paper
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha],
                                lr=settings["Actor"]["learning_rate"],
                                eps=1e-4)

    self.add_extra_noise = settings["add_extra_noise"]
    if self.add_extra_noise:
        self.noise = OU_Noise(self.action_size, self.config.seed,
                              settings["mu"], settings["theta"],
                              settings["sigma"])

    self.do_evaluation_iterations = settings["do_evaluation_iterations"]
class SAC(Base_Agent):
    """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on this github implementation
    https://github.com/pranz24/pytorch-soft-actor-critic. It is an actor-critic algorithm where the agent is also trained
    to maximise the entropy of their actions as well as their cumulative reward"""
    agent_name = "SAC"

    def __init__(self, config):
        """Builds the twin critics, their targets, the actor, the replay buffer, the entropy temperature and
        optional noise. Asserts that actions are continuous and that the actor does not end in a softmax"""
        Base_Agent.__init__(self, config)
        assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
        assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
        self.hyperparameters = config.hyperparameters
        # Critics take a state concatenated with an action and output a single value
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size,
                                             output_dim=1,
                                             key_to_use="Critic",
                                             override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size,
                                              output_dim=1,
                                              key_to_use="Critic")
        # Each target network starts as a copy of its local counterpart
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        # The actor emits a mean and a log standard deviation per action dimension
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size * 2,
                                          key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.automatic_entropy_tuning = self.hyperparameters[
            "automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.environment.action_space.shape).to(
                    self.device)).item()  # heuristic value from the paper
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam(
                [self.log_alpha],
                lr=self.hyperparameters["Actor"]["learning_rate"],
                eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]

        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            self.noise = OU_Noise(self.action_size, self.config.seed,
                                  self.hyperparameters["mu"],
                                  self.hyperparameters["theta"],
                                  self.hyperparameters["sigma"])

        self.do_evaluation_iterations = self.hyperparameters[
            "do_evaluation_iterations"]

    def save_result(self):
        """Saves the result of an episode of the game. Overriding the method in Base Agent that does this because we only
        want to keep track of the results during the evaluation episodes"""
        if self.episode_number == 1 or not self.do_evaluation_iterations:
            self.game_full_episode_scores.extend(
                [self.total_episode_score_so_far])
            self.rolling_results.append(
                np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
            self.save_max_result_seen()

        elif (self.episode_number - 1) % TRAINING_EPISODES_PER_EVAL_EPISODE == 0:
            # The evaluation score is repeated once for every episode in the evaluation cycle
            self.game_full_episode_scores.extend([
                self.total_episode_score_so_far
                for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)
            ])
            self.rolling_results.extend([
                np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:])
                for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)
            ])
            self.save_max_result_seen()

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        Base_Agent.reset_game(self)
        if self.add_extra_noise:
            self.noise.reset()

    def step(self):
        """Runs an episode on the game, saving the experience and running a learning step if appropriate"""
        eval_ep = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0 and self.do_evaluation_iterations
        self.episode_step_number_val = 0
        while not self.done:
            self.episode_step_number_val += 1
            self.action = self.pick_action(eval_ep)
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    self.learn()
            # When the episode reached the environment's step limit the stored done flag is False
            mask = False if self.episode_step_number_val >= self.environment._max_episode_steps else self.done
            # Experiences from evaluation episodes are not stored
            if not eval_ep:
                self.save_experience(experience=(self.state, self.action,
                                                 self.reward, self.next_state,
                                                 mask))
            self.state = self.next_state
            self.global_step_number += 1
        print(self.total_episode_score_so_far)
        if eval_ep:
            self.print_summary_of_latest_evaluation_episode()
        self.episode_number += 1

    def pick_action(self, eval_ep, state=None):
        """Picks an action using one of three methods: 1) Randomly if we haven't passed a certain number of steps,
        2) Using the actor in evaluation mode if eval_ep is True 3) Using the actor in training mode if eval_ep is False.
        The difference between evaluation and training mode is that training mode does more exploration"""
        if state is None:
            state = self.state
        if eval_ep:
            action = self.actor_pick_action(state=state, eval=True)
        elif self.global_step_number < self.hyperparameters["min_steps_before_learning"]:
            action = self.environment.action_space.sample()
            print("Picking random action ", action)
        else:
            action = self.actor_pick_action(state=state)
        if self.add_extra_noise:
            action += self.noise.sample()
        return action

    def actor_pick_action(self, state=None, eval=False):
        """Uses actor to pick an action in one of two ways: 1) If eval is falsy it picks an action that has
        partly been randomly sampled 2) If eval is truthy then we pick the action that comes directly from
        the network (the tanh of the mean) and so did not involve any random sampling.

        state defaults to self.state. Returns the first row of the chosen action batch as a numpy array."""
        if state is None:
            state = self.state
        # Build the one-element batch in numpy first: handing torch.FloatTensor a list of ndarrays
        # takes a slow element-by-element conversion path. Shape and dtype are the same either way.
        state = torch.from_numpy(np.asarray([state], dtype=np.float32)).to(self.device)
        if len(state.shape) == 1:
            state = state.unsqueeze(0)
        # The result is turned into numpy straight away, so no autograd graph is needed in either mode
        with torch.no_grad():
            sampled_action, _, mean_action = self.produce_action_and_action_info(state)
        action = mean_action if eval else sampled_action
        action = action.detach().cpu().numpy()
        return action[0]

    def produce_action_and_action_info(self, state):
        """Given the state, produces an action, the log probability of the action, and the tanh of the mean action"""
        actor_output = self.actor_local(state)
        # First action_size columns are the mean, the remaining ones the log standard deviation
        mean, log_std = actor_output[:, :self.action_size], actor_output[:, self.action_size:]
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample()  # rsample means it is sampled using reparameterisation trick
        action = torch.tanh(x_t)
        log_prob = normal.log_prob(x_t)
        # Correction for the tanh squashing; EPSILON is defined outside this class
        # (presumably a small constant keeping the log argument above zero - confirm)
        log_prob -= torch.log(1 - action.pow(2) + EPSILON)
        log_prob = log_prob.sum(1, keepdim=True)
        return action, log_prob, torch.tanh(mean)

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        return self.global_step_number > self.hyperparameters["min_steps_before_learning"] and \
               self.enough_experiences_to_learn_from() and \
               self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def learn(self):
        """Runs a learning iteration for the actor, both critics and (if specified) the temperature parameter"""
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.sample_experiences()
        qf1_loss, qf2_loss = self.calculate_critic_losses(
            state_batch, action_batch, reward_batch, next_state_batch,
            mask_batch)
        policy_loss, log_pi = self.calculate_actor_loss(state_batch)
        if self.automatic_entropy_tuning:
            alpha_loss = self.calculate_entropy_tuning_loss(log_pi)
        else:
            alpha_loss = None
        self.update_all_parameters(qf1_loss, qf2_loss, policy_loss, alpha_loss)

    def sample_experiences(self):
        """Returns a batch sampled from the replay buffer"""
        return self.memory.sample()

    def calculate_critic_losses(self, state_batch, action_batch, reward_batch,
                                next_state_batch, mask_batch):
        """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
        term is taken into account"""
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.produce_action_and_action_info(next_state_batch)
            qf1_next_target = self.critic_target(
                torch.cat((next_state_batch, next_state_action), 1))
            qf2_next_target = self.critic_target_2(
                torch.cat((next_state_batch, next_state_action), 1))
            # Smaller of the two target estimates, minus the entropy term
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + (
                1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (
                    min_qf_next_target)
        qf1 = self.critic_local(torch.cat((state_batch, action_batch), 1))
        qf2 = self.critic_local_2(torch.cat((state_batch, action_batch), 1))
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        return qf1_loss, qf2_loss

    def calculate_actor_loss(self, state_batch):
        """Calculates the loss for the actor. This loss includes the additional entropy term"""
        action, log_pi, _ = self.produce_action_and_action_info(state_batch)
        qf1_pi = self.critic_local(torch.cat((state_batch, action), 1))
        qf2_pi = self.critic_local_2(torch.cat((state_batch, action), 1))
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
        return policy_loss, log_pi

    def calculate_entropy_tuning_loss(self, log_pi):
        """Calculates the loss for the entropy temperature parameter. This is only relevant if
        self.automatic_entropy_tuning is True."""
        # The detach means only log_alpha receives gradients from this loss
        alpha_loss = -(self.log_alpha *
                       (log_pi + self.target_entropy).detach()).mean()
        return alpha_loss

    def update_all_parameters(self, critic_loss_1, critic_loss_2, actor_loss,
                              alpha_loss):
        """Updates the parameters for the actor, both critics and (if specified) the temperature parameter"""
        self.take_optimisation_step(
            self.critic_optimizer, self.critic_local, critic_loss_1,
            self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.take_optimisation_step(
            self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
            self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.take_optimisation_step(
            self.actor_optimizer, self.actor_local, actor_loss,
            self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(
            self.critic_local, self.critic_target,
            self.hyperparameters["Critic"]["tau"])
        self.soft_update_of_target_network(
            self.critic_local_2, self.critic_target_2,
            self.hyperparameters["Critic"]["tau"])
        if alpha_loss is not None:
            self.take_optimisation_step(self.alpha_optim, None, alpha_loss,
                                        None)
            # Refresh alpha so later losses use the updated temperature
            self.alpha = self.log_alpha.exp()

    def print_summary_of_latest_evaluation_episode(self):
        """Prints a summary of the latest episode"""
        print(" ")
        print("----------------------------")
        print("Episode score {} ".format(self.total_episode_score_so_far))
        print("----------------------------")