def get_episode_loss(self, env_index: int, done: bool) -> Optional[Loss]:
    """Calculate a loss to train with, given the last (up to
    `max_episode_window_length`) observations/actions/rewards of the current
    episode in the environment at the given index in the batch.

    If `done` is True, then this is for the end of an episode. If `done` is
    False, the episode is still underway.

    NOTE: While the Observations/Actions/Rewards objects usually contain the
    "batches" of data coming from the N different environments, here they are
    actually a sequence of items coming from this single environment.
    """
    inputs: Tensor
    actions: PolicyHeadOutput
    rewards: ContinualRLSetting.Rewards

    if not done:
        # This particular algorithm (REINFORCE) can't give a loss until the
        # end of the episode is reached.
        return None

    if len(self.actions[env_index]) == 0:
        logger.error("Weird: asked to get an episode loss, but there is "
                     "nothing in the buffer for this environment.")
        return None

    inputs, actions, rewards = self.stack_buffers(env_index)

    episode_length = actions.batch_size
    assert len(inputs) == len(actions.y_pred) == len(rewards.y)

    if episode_length <= 1:
        # TODO: If the episode has a length of 1, we can't really get a loss!
        logger.error("Episode is too short!")
        return None

    log_probabilities = actions.y_pred_log_prob
    rewards = rewards.y

    loss_tensor = self.policy_gradient(
        rewards=rewards,
        log_probs=log_probabilities,
        gamma=self.hparams.gamma,
    )
    loss = Loss(self.name, loss_tensor)
    loss.metric = EpisodeMetrics(
        n_samples=1,
        mean_episode_reward=float(rewards.sum()),
        mean_episode_length=len(rewards),
    )
    # TODO: Add something like `add_metric(self, metric: Metrics, name: str = None)`
    # to `Loss`.
    loss.metrics["gradient_usage"] = self.get_gradient_usage_metrics(env_index)
    return loss
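
# NOTE (illustrative sketch, not part of the method above): `self.policy_gradient(...)`
# is assumed here to compute the standard REINFORCE objective from the per-step
# rewards and the log-probabilities of the chosen actions. A minimal standalone
# version with the same keyword arguments as the call site could look like the
# following; the body is a generic formulation, not necessarily the actual
# implementation.
import torch
from torch import Tensor


def policy_gradient(rewards: Tensor, log_probs: Tensor, gamma: float) -> Tensor:
    """REINFORCE loss: -sum_t log pi(a_t|s_t) * G_t, where G_t is the
    discounted return from step t to the end of the episode.
    """
    returns = torch.zeros_like(rewards)
    running_return = 0.0
    # Accumulate the discounted return backwards through the episode.
    for t in reversed(range(len(rewards))):
        running_return = rewards[t] + gamma * running_return
        returns[t] = running_return
    # Negative sign because the optimizer minimizes this loss (i.e. performs
    # gradient ascent on the expected return).
    return -(log_probs * returns).sum()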
def get_episode_loss(self, env_index: int, done: bool) -> Optional[Loss]:
    # IDEA: Actually, now that I think about it, instead of detaching the
    # tensors, we could instead use the critic's 'value' estimate and get a
    # loss for that incomplete episode using the tensors in the buffer,
    # rather than detaching them!
    if not done:
        return None

    # TODO: Add something like a 'num_steps_since_update' for each env? (It
    # would actually be a 'num_steps_since_backward'.)
    # if self.num_steps_since_update?
    n_stored_steps = self.num_stored_steps(env_index)
    if n_stored_steps < 5:
        # For now, we only give back a loss at the end of the episode.
        # TODO: Test if giving back a loss at each step or every few steps
        # would work better!
        logger.warning(
            RuntimeWarning(
                f"Returning None as the episode loss, because we only have "
                f"{n_stored_steps} steps stored for that environment."))
        return None

    inputs: Tensor
    actions: A2CHeadOutput
    rewards: Rewards
    inputs, actions, rewards = self.stack_buffers(env_index)

    logits: Tensor = actions.logits
    action_log_probs: Tensor = actions.action_log_prob
    values: Tensor = actions.value
    assert rewards.y is not None
    episode_rewards: Tensor = rewards.y

    # Target values are calculated backwards. It's important to handle done
    # states correctly: in those cases, we want the target to be equal to the
    # reward only.
    episode_length = len(episode_rewards)
    dones = torch.zeros(episode_length, dtype=torch.bool)
    dones[-1] = bool(done)

    returns = self.get_returns(episode_rewards, gamma=self.hparams.gamma).type_as(values)
    advantages = returns - values

    # Normalize the advantages (not present in the original implementation).
    if self.hparams.normalize_advantages:
        advantages = normalize(advantages)

    # Create the Loss to be returned.
    loss = Loss(self.name)

    # Policy gradient loss (actor loss).
    policy_gradient_loss = -(advantages.detach() * action_log_probs).mean()
    actor_loss = Loss("actor", policy_gradient_loss)
    loss += self.hparams.actor_loss_coef * actor_loss

    # Value loss: Try to get the critic's values close to the actual returns,
    # which means the advantages should be close to zero.
    value_loss_tensor = F.mse_loss(values, returns.reshape(values.shape))
    critic_loss = Loss("critic", value_loss_tensor)
    loss += self.hparams.critic_loss_coef * critic_loss

    # Entropy loss, to "favor exploration".
    entropy_loss_tensor = -actions.action_dist.entropy().mean()
    entropy_loss = Loss("entropy", entropy_loss_tensor)
    loss += self.hparams.entropy_loss_coef * entropy_loss

    if done:
        episode_rewards_array = episode_rewards.reshape([-1])
        loss.metric = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=float(episode_rewards_array.sum()),
            mean_episode_length=len(episode_rewards_array),
        )
    loss.metrics["gradient_usage"] = self.get_gradient_usage_metrics(env_index)
    return loss
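
# NOTE (illustrative sketch, not part of the method above): the `get_returns`
# and `normalize` helpers used in the A2C loss are assumed to compute the
# discounted returns and to standardize the advantages, respectively. The
# names and keyword arguments mirror the call sites (written here as
# standalone functions rather than methods); the bodies are a standard
# formulation and may not match the actual implementations.
import torch
from torch import Tensor


def get_returns(rewards: Tensor, gamma: float) -> Tensor:
    """Discounted returns G_t = r_t + gamma * G_{t+1}, computed backwards,
    bootstrapping with 0 after the final step of the episode.
    """
    returns = torch.zeros_like(rewards)
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        running_return = rewards[t] + gamma * running_return
        returns[t] = running_return
    return returns


def normalize(advantages: Tensor, eps: float = 1e-8) -> Tensor:
    """Standardize the advantages to zero mean and unit standard deviation."""
    return (advantages - advantages.mean()) / (advantages.std() + eps)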