def compute_advantage(
    self,
    batch: Batch,
    last_r: float,
    gamma: float = 0.9,
    lamda: float = 1.0,
    use_gae: bool = True,
    use_critic: bool = True,
) -> Batch:
    """Compute advantages and value targets for a single trajectory.

    The batch is modified in place: ``batch.advantages`` and
    ``batch.value_targets`` are (over)written, and the same object is
    returned.

    Args:
        batch (Batch): batch of a single trajectory. ``batch.rew`` is always
            read; ``batch.vf_preds`` is read whenever ``use_critic`` is True.
        last_r (float): value estimation for the last observation, used to
            bootstrap the return past the end of the trajectory.
        gamma (float): discount factor.
        lamda (float): lambda parameter for GAE (only used if ``use_gae``).
        use_gae (bool): use Generalized Advantage Estimation. Requires
            ``use_critic``.
        use_critic (bool): whether to use the critic (value estimation) as a
            baseline. Setting this to False uses 0 as the baseline and
            produces all-zero value targets.

    Returns:
        batch (Batch): the input batch with ``advantages`` (float32) and
        ``value_targets`` (float32) filled in.

    Raises:
        AssertionError: if ``use_critic`` is set but the batch has no
            ``vf_preds``, or if ``use_gae`` is set without ``use_critic``.
    """
    # Only look at vf_preds through getattr so that a critic-free batch that
    # simply lacks the field does not fail on attribute access.
    has_vf_preds = getattr(batch, "vf_preds", None) is not None
    assert has_vf_preds or not use_critic, \
        "use_critic=True requires batch.vf_preds"
    assert use_critic or not use_gae, \
        "use_gae=True requires use_critic=True"

    # NOTE(review): self.discount_cumsum(x, discount) is assumed to return
    # the discounted cumulative sum y[t] = sum_k discount**k * x[t + k];
    # confirm against its definition.
    if use_gae:
        # Append the bootstrap value so V(s_{t+1}) exists for the last step.
        vpred_t = np.concatenate([batch.vf_preds, np.array([last_r])])
        # TD residuals: r_t + gamma * V(s_{t+1}) - V(s_t)
        delta_t = batch.rew + gamma * vpred_t[1:] - vpred_t[:-1]
        # This formula for the advantage comes from
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        batch.advantages = self.discount_cumsum(delta_t, gamma * lamda)
        batch.value_targets = (
            batch.advantages + batch.vf_preds
        ).astype(np.float32)
    else:
        # Discounted returns bootstrapped with last_r; the trailing element
        # belongs to the bootstrap value itself and is dropped.
        rewards_plus_v = np.concatenate([batch.rew, np.array([last_r])])
        discounted_returns = self.discount_cumsum(
            rewards_plus_v, gamma
        )[:-1].astype(np.float32)

        if use_critic:
            batch.advantages = discounted_returns - batch.vf_preds
            batch.value_targets = discounted_returns
        else:
            batch.advantages = discounted_returns
            batch.value_targets = np.zeros_like(batch.advantages)

    batch.advantages = batch.advantages.astype(np.float32)
    return batch