Example #1
    def get_episode_loss(self, env_index: int, done: bool) -> Optional[Loss]:
        """Calculate a loss to train with, given the last (up to
        max_episode_window_length) observations/actions/rewards of the current
        episode in the environment at the given index in the batch.

        If `done` is True, then this is for the end of an episode. If `done` is
        False, the episode is still underway.

        NOTE: While the Batch Observations/Actions/Rewards objects usually
        contain the "batches" of data coming from the N different environments,
        now they are actually a sequence of items coming from this single
        environment. For more info on how this is done, see the  
        """
        inputs: Tensor
        actions: PolicyHeadOutput
        rewards: ContinualRLSetting.Rewards
        if not done:
            # This particular algorithm (REINFORCE) can't give a loss until the
            # end of the episode is reached.
            return None

        if len(self.actions[env_index]) == 0:
            logger.error(f"Weird, asked to get episode loss, but there is "
                         f"nothing in the buffer?")
            return None

        inputs, actions, rewards = self.stack_buffers(env_index)

        episode_length = actions.batch_size
        assert len(inputs) == len(actions.y_pred) == len(rewards.y)

        if episode_length <= 1:
            # TODO: If the episode has len of 1, we can't really get a loss!
            logger.error("Episode is too short!")
            return None

        log_probabilities = actions.y_pred_log_prob
        rewards = rewards.y

        loss_tensor = self.policy_gradient(
            rewards=rewards,
            log_probs=log_probabilities,
            gamma=self.hparams.gamma,
        )
        loss = Loss(self.name, loss_tensor)
        loss.metric = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=float(rewards.sum()),
            mean_episode_length=len(rewards),
        )
        # TODO: add something like `add_metric(self, metric: Metrics, name: str=None)`
        # to `Loss`.
        loss.metrics["gradient_usage"] = self.get_gradient_usage_metrics(
            env_index)
        return loss
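
The `policy_gradient` helper called above is not shown on this page. As a rough, self-contained sketch of what a REINFORCE loss with that signature typically computes (discounted returns accumulated backward through the episode, then used to weight the action log-probabilities), assuming 1-D `rewards` and `log_probs` tensors of the same length:

import torch
from torch import Tensor


def reinforce_loss(rewards: Tensor, log_probs: Tensor, gamma: float = 0.99) -> Tensor:
    # Hypothetical stand-in for `self.policy_gradient(...)` above; shapes assumed [T].
    returns = torch.zeros_like(rewards)
    running = 0.0
    # Walk the episode backward: G_t = r_t + gamma * G_{t+1}.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    # Minimizing the negative return-weighted log-probabilities maximizes expected return.
    return -(log_probs * returns).sum()
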
Example #2
    def get_episode_loss(self, env_index: int, done: bool) -> Optional[Loss]:
        # IDEA: Actually, now that I think about it, instead of detaching the
        # tensors, we could instead use the critic's 'value' estimate and get a
        # loss for that incomplete episode using the tensors in the buffer,
        # rather than detaching them!

        if not done:
            return None

        # TODO: Add something like a 'num_steps_since_update' for each env? (it
        # would actually be a num_steps_since_backward)
        # if self.num_steps_since_update?
        n_stored_steps = self.num_stored_steps(env_index)
        if n_stored_steps < 5:
            # For now, we only give back a loss at the end of the episode.
            # TODO: Test if giving back a loss at each step or every few steps
            # would work better!
            logger.warning(
                RuntimeWarning(
                    f"Returning None as the episode loss, because only have "
                    f"{n_stored_steps} steps stored for that environment."))
            return None

        inputs: Tensor
        actions: A2CHeadOutput
        rewards: Rewards
        inputs, actions, rewards = self.stack_buffers(env_index)
        logits: Tensor = actions.logits
        action_log_probs: Tensor = actions.action_log_prob
        values: Tensor = actions.value
        assert rewards.y is not None
        episode_rewards: Tensor = rewards.y

        # Target values are calculated backward through the episode.
        # It's important to handle 'done' states correctly: for those steps,
        # the target should be equal to the reward only.
        episode_length = len(episode_rewards)
        dones = torch.zeros(episode_length, dtype=torch.bool)
        dones[-1] = bool(done)

        returns = self.get_returns(episode_rewards,
                                   gamma=self.hparams.gamma).type_as(values)
        advantages = returns - values

        # Normalize advantage (not present in the original implementation)
        if self.hparams.normalize_advantages:
            advantages = normalize(advantages)
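        # NOTE: The `normalize` helper isn't shown on this page; advantage
        # normalization usually means something along the lines of
        # `(advantages - advantages.mean()) / (advantages.std() + 1e-8)`.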

        # Create the Loss to be returned.
        loss = Loss(self.name)

        # Policy gradient loss (actor loss)
        policy_gradient_loss = -(advantages.detach() * action_log_probs).mean()
        actor_loss = Loss("actor", policy_gradient_loss)
        loss += self.hparams.actor_loss_coef * actor_loss

        # Value loss: Try to get the critic's values close to the actual return,
        # which means the advantages should be close to zero.
        value_loss_tensor = F.mse_loss(values, returns.reshape(values.shape))
        critic_loss = Loss("critic", value_loss_tensor)
        loss += self.hparams.critic_loss_coef * critic_loss

        # Entropy loss, to "favor exploration".
        entropy_loss_tensor = -actions.action_dist.entropy().mean()
        entropy_loss = Loss("entropy", entropy_loss_tensor)
        loss += self.hparams.entropy_loss_coef * entropy_loss
        if done:
            episode_rewards_array = episode_rewards.reshape([-1])
            loss.metric = EpisodeMetrics(
                n_samples=1,
                mean_episode_reward=float(episode_rewards_array.sum()),
                mean_episode_length=len(episode_rewards_array),
            )
        loss.metrics["gradient_usage"] = self.get_gradient_usage_metrics(
            env_index)
        return loss
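
For reference, here is a rough standalone sketch of the A2C loss terms that Example #2 assembles: discounted returns via a `get_returns`-style helper, a detached-advantage actor loss, an MSE critic loss, and an entropy bonus. The function names, default coefficients, and tensor shapes below are illustrative assumptions, not the actual `hparams` or helpers used by this code:

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.distributions import Categorical


def get_returns_sketch(rewards: Tensor, gamma: float) -> Tensor:
    # Discounted returns G_t = r_t + gamma * G_{t+1}, computed backward (shape [T]).
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def a2c_episode_loss_sketch(
    logits: Tensor,    # [T, n_actions] policy logits for the episode
    actions: Tensor,   # [T] actions that were taken
    values: Tensor,    # [T] critic value estimates
    rewards: Tensor,   # [T] rewards received
    gamma: float = 0.99,
    actor_coef: float = 0.5,     # illustrative coefficient values
    critic_coef: float = 0.5,
    entropy_coef: float = 0.01,
) -> Tensor:
    dist = Categorical(logits=logits)
    log_probs = dist.log_prob(actions)
    returns = get_returns_sketch(rewards, gamma).type_as(values)
    # Advantages are detached so the policy-gradient term doesn't train the critic.
    advantages = returns - values
    actor_loss = -(advantages.detach() * log_probs).mean()
    critic_loss = F.mse_loss(values, returns)
    entropy_loss = -dist.entropy().mean()
    return actor_coef * actor_loss + critic_coef * critic_loss + entropy_coef * entropy_loss

Detaching the advantages keeps the actor term from back-propagating into the critic, which is trained only through the MSE term; this mirrors the `advantages.detach()` call in Example #2.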