def get_rollout(self, states, actions, rewards, dones):
    """Turn one sampled trajectory into the arrays used for training.

    The critic and the actor are evaluated on the trajectory; from their
    outputs the method derives, for every entry of ``self._gammas``,
    the advantages (``utils.geometric_cumsum`` over the TD residuals with
    factor ``gamma * self.gae_lambda``) and the returns
    (``utils.geometric_cumsum`` over the rewards with factor ``gamma``).

    If the trajectory does not end with a truthy ``dones[-1]``, the last
    step is dropped and the critic value of that last state is used only
    as the bootstrap value for the step before it.

    NOTE(review): the ``.numpy()`` conversions below need tensors that do
    not track gradients - presumably this method runs under
    ``torch.no_grad()`` (decorator or caller); confirm.

    Args:
        states: per-step observations; must expose ``.shape`` and be
            accepted by ``utils.any2device``.
        actions: per-step actions, same length as ``states``.
        rewards: per-step rewards; must expose ``.shape``.
        dones: per-step termination flags, same length as ``states``.

    Returns:
        dict with the keys ``"action_logprob"``, ``"advantage"``,
        ``"done"``, ``"return"`` and ``"value"``; every entry has
        ``trajectory_len`` elements along its first axis.

    Raises:
        AssertionError: if the inputs differ in length, or if the
            produced arrays do.
    """
    assert len(states) == len(actions) == len(rewards) == len(dones)

    num_steps = rewards.shape[0]
    # An unfinished trajectory loses its last step: that step has no
    # successor value to build a TD residual from.
    trajectory_len = num_steps if dones[-1] else num_steps - 1
    states_len = states.shape[0]

    states = utils.any2device(states, device=self._device)
    actions = utils.any2device(actions, device=self._device)
    rewards = np.array(rewards)[:trajectory_len]

    # One extra all-zero row is appended after the critic outputs; it acts
    # as the value of the state following a terminal step. The buffer is
    # allocated directly on the target device instead of being created on
    # the host and copied over.
    values = torch.zeros(
        (states_len + 1, self._num_heads, self._num_atoms),
        device=self._device,
    )
    values[:states_len, ...] = self.critic(states).squeeze_(dim=2)
    # Each column corresponds to a different gamma
    values = values.cpu().numpy()[:trajectory_len + 1, ...]

    _, logprobs = self.actor(states, logprob=actions)
    logprobs = logprobs.cpu().numpy().reshape(-1)[:trajectory_len]

    # TD residuals: r_t + gamma * V(s_{t+1}) - V(s_t)
    # len x num_heads x num_atoms
    # (assumes self._gammas is a 1-D array with num_heads entries - confirm)
    deltas = (
        rewards[:, None, None]
        + self._gammas[:, None] * values[1:]
        - values[:-1]
    )

    # For each gamma in the list of gammas compute the
    # advantage and returns
    # len x num_heads x num_atoms
    advantages = np.stack(
        [
            utils.geometric_cumsum(gamma * self.gae_lambda, deltas[:, i])
            for i, gamma in enumerate(self._gammas)
        ],
        axis=1,
    )

    # len x num_heads
    returns = np.stack(
        [
            utils.geometric_cumsum(gamma, rewards[:, None])[:, 0]
            for gamma in self._gammas
        ],
        axis=1,
    )

    # final rollout: drop the bootstrap row / the unfinished last step
    dones = dones[:trajectory_len]
    values = values[:trajectory_len]
    assert (
        len(logprobs)
        == len(advantages)
        == len(dones)
        == len(returns)
        == len(values)
    )

    rollout = {
        "action_logprob": logprobs,
        "advantage": advantages,
        "done": dones,
        "return": returns,
        "value": values,
    }
    return rollout
def get_rollout(self, states, actions, rewards, dones):
    """Compute returns and action log-probabilities for one trajectory.

    When the last entry of ``dones`` is falsy the trajectory is treated
    as unfinished and its final step is left out of the result.

    Args:
        states: per-step observations, moved to ``self._device`` before
            being passed to the actor.
        actions: per-step actions, moved to ``self._device`` and passed to
            the actor as ``logprob``.
        rewards: per-step rewards; must expose ``.shape``.
        dones: per-step termination flags; only the last one is read.

    Returns:
        dict with ``"return"`` (element ``0`` of the result of
        ``utils.geometric_cumsum(self.gamma, rewards)`` - presumably the
        single row of a 2-D result; confirm against the helper) and
        ``"action_logprob"`` (flattened actor log-probabilities cut to
        the kept number of steps).
    """
    step_count = rewards.shape[0]
    if not dones[-1]:
        # Unfinished trajectory: the final step is not used.
        step_count -= 1

    device_states = utils.any2device(states, device=self._device)
    device_actions = utils.any2device(actions, device=self._device)
    kept_rewards = np.array(rewards)[:step_count]

    # The actor returns a pair; only its second element is needed here.
    _, logprob_tensor = self.actor(device_states, logprob=device_actions)
    flat_logprobs = logprob_tensor.cpu().numpy().reshape(-1)

    discounted = utils.geometric_cumsum(self.gamma, kept_rewards)

    return {
        "return": discounted[0],
        "action_logprob": flat_logprobs[:step_count],
    }