Example #1
    def get_rollout(self, states, actions, rewards, dones):
        """Builds an on-policy rollout: action log-probabilities,
        GAE advantages, discounted returns and per-head values."""
        assert len(states) == len(actions) == len(rewards) == len(dones)

        # keep the last step only if the episode terminated there;
        # an unfinished rollout drops it
        trajectory_len = \
            rewards.shape[0] if dones[-1] else rewards.shape[0] - 1
        states_len = states.shape[0]

        states = utils.any2device(states, device=self._device)
        actions = utils.any2device(actions, device=self._device)
        rewards = np.array(rewards)[:trajectory_len]
        values = torch.zeros(
            (states_len + 1, self._num_heads, self._num_atoms),
            device=self._device)
        # detach the critic output so the rollout stays out of the
        # autograd graph and can be moved to numpy below
        values[:states_len, ...] = \
            self.critic(states).squeeze_(dim=2).detach()
        # head dimension h corresponds to discount factor self._gammas[h]
        values = values.cpu().numpy()[:trajectory_len + 1, ...]
        _, logprobs = self.actor(states, logprob=actions)
        logprobs = \
            logprobs.detach().cpu().numpy().reshape(-1)[:trajectory_len]
        # TD residuals: len x num_heads x num_atoms
        deltas = rewards[:, None, None] \
            + self._gammas[:, None] * values[1:] - values[:-1]

        # For each gamma in the list of gammas compute the
        # advantage and returns
        # len x num_heads x num_atoms
        advantages = np.stack(
            [
                utils.geometric_cumsum(gamma * self.gae_lambda, deltas[:, i])
                for i, gamma in enumerate(self._gammas)
            ],
            axis=1,
        )

        # len x num_heads
        returns = np.stack(
            [
                utils.geometric_cumsum(gamma, rewards[:, None])[:, 0]
                for gamma in self._gammas
            ],
            axis=1,
        )

        # truncate everything to the effective trajectory length
        dones = dones[:trajectory_len]
        values = values[:trajectory_len]
        assert len(logprobs) == len(advantages) \
            == len(dones) == len(returns) == len(values)
        rollout = {
            "action_logprob": logprobs,
            "advantage": advantages,
            "done": dones,
            "return": returns,
            "value": values,
        }

        return rollout
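
Both examples delegate their discounted sums to utils.geometric_cumsum (the GAE advantages and multi-gamma returns above; the Monte Carlo returns in Example #2 below). Here is a minimal numpy sketch of such a helper, assuming it computes the reverse discounted cumulative sum y[t] = x[t] + alpha * y[t + 1] along the leading time axis; the actual axis and shape conventions of the utils version may differ.

import numpy as np

def geometric_cumsum(alpha, x):
    # Hypothetical stand-in for utils.geometric_cumsum: reverse discounted
    # cumulative sum y[t] = x[t] + alpha * y[t + 1] along time axis 0.
    x = np.asarray(x, dtype=np.float64)
    out = np.zeros_like(x)
    running = np.zeros_like(x[0])
    for t in range(x.shape[0] - 1, -1, -1):
        running = x[t] + alpha * running
        out[t] = running
    return out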
Example #2
    def get_rollout(self, states, actions, rewards, dones):
        """Builds a rollout of action log-probabilities and
        discounted returns."""
        trajectory_len = \
            rewards.shape[0] if dones[-1] else rewards.shape[0] - 1

        states = utils.any2device(states, device=self._device)
        actions = utils.any2device(actions, device=self._device)
        rewards = np.array(rewards)[:trajectory_len]

        _, logprobs = self.actor(states, logprob=actions)
        logprobs = \
            logprobs.detach().cpu().numpy().reshape(-1)[:trajectory_len]

        # discounted Monte Carlo returns over the truncated reward sequence
        returns = utils.geometric_cumsum(self.gamma, rewards)[0]

        rollout = {"return": returns, "action_logprob": logprobs}
        return rollout
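
As a quick numeric check of the discounted-return recursion, using the geometric_cumsum sketch after Example #1 (note the [0] indexing above suggests the real helper returns a 2-D array for 1-D input, which the sketch does not replicate):

rewards = np.array([1.0, 0.0, 2.0])
print(geometric_cumsum(0.5, rewards))
# [1.5 1.  2. ]  i.e.  1 + 0.5*0 + 0.25*2,  0 + 0.5*2,  2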