Example 1
    def finish(self, last_value, adv_type, gamma, gae_discount):
        self['value'][:, self.idx] = last_value
        self['mask'][:, self.idx:] = 0
        valid_slice = np.s_[:, :self.idx]
        mask = self['mask'][valid_slice]

        if adv_type == 'nae':
            traj_ret = self['traj_ret'][valid_slice]
            next_return = last_value
            for i in reversed(range(self.idx)):
                traj_ret[:, i] = next_return = (
                    self['reward'][:, i]
                    + self['nonterminal'][:, i] * gamma * next_return)

            # standardize traj_ret and advantages
            traj_ret_mean, traj_ret_std = moments(traj_ret, mask=mask)
            value = standardize(self['value'][valid_slice], mask=mask)
            value = (value + traj_ret_mean) / (
                traj_ret_std + 1e-8
            )  # to have the same mean and std as trajectory return
            self['advantage'][valid_slice] = standardize(traj_ret - value,
                                                         mask=mask)
            self['traj_ret'][valid_slice] = standardize(traj_ret, mask=mask)
        elif adv_type == 'gae':
            advs = delta = (self['reward'][valid_slice] +
                            self['nonterminal'][valid_slice] * gamma *
                            self['value'][:, 1:self.idx + 1] -
                            self['value'][valid_slice])
            next_adv = 0
            for i in reversed(range(self.idx)):
                advs[:, i] = next_adv = (
                    delta[:, i]
                    + self['nonterminal'][:, i] * gae_discount * next_adv)
            self['traj_ret'][valid_slice] = advs + self['value'][valid_slice]
            self['advantage'][valid_slice] = standardize(advs, mask=mask)
        else:
            raise NotImplementedError(
                f'Advantage type should be either "nae" or "gae", but got "{adv_type}".'
            )

        for k, v in self.items():
            v[valid_slice] = (v[valid_slice].T * mask.T).T

        self.ready = True
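
The 'gae' branch above is the standard GAE backward recursion: delta_t = r_t + gamma * nonterminal_t * V_{t+1} - V_t, then A_t = delta_t + gae_discount * nonterminal_t * A_{t+1}. Below is a minimal standalone NumPy sketch of that pass, detached from the buffer class and run on hypothetical toy data; gamma and lambda are folded into a single gae_discount factor, matching the code above.

import numpy as np

def gae_backward(reward, nonterminal, value, last_value, gamma, gae_discount):
    # reward, nonterminal, value: (n_envs, n_steps); last_value: (n_envs,)
    next_value = np.concatenate([value[:, 1:], last_value[:, None]], axis=1)
    delta = reward + nonterminal * gamma * next_value - value
    advs = np.zeros_like(delta)
    next_adv = 0
    for t in reversed(range(delta.shape[1])):
        advs[:, t] = next_adv = delta[:, t] + nonterminal[:, t] * gae_discount * next_adv
    return advs

# toy check on a single 3-step trajectory whose episode ends at the last step
r = np.array([[1.0, 0.0, 1.0]])
nt = np.array([[1.0, 1.0, 0.0]])
v = np.array([[0.5, 0.4, 0.3]])
lv = np.array([0.2])
print(gae_backward(r, nt, v, lv, gamma=0.99, gae_discount=0.99 * 0.95))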
Example 2
def compute_nae(reward,
                discount,
                value,
                last_value,
                traj_ret,
                gamma,
                mask=None,
                epsilon=1e-8):
    next_return = last_value
    for i in reversed(range(reward.shape[1])):
        traj_ret[:, i] = next_return = (reward[:, i] +
                                        discount[:, i] * gamma * next_return)

    # Standardize traj_ret and advantages
    traj_ret_mean, traj_ret_var = moments(traj_ret)
    traj_ret_std = np.maximum(np.sqrt(traj_ret_var), 1e-8)
    value = standardize(value, mask=mask, epsilon=epsilon)
    # To have the same mean and std as trajectory return
    value = (value + traj_ret_mean) / traj_ret_std
    advantage = standardize(traj_ret - value, mask=mask, epsilon=epsilon)
    traj_ret = standardize(traj_ret, mask=mask, epsilon=epsilon)

    return advantage, traj_ret
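
A hypothetical usage sketch for compute_nae. It assumes the function above and its helpers (moments, standardize) are importable from the surrounding utility module, and that arrays are batch-major with shape (n_envs, n_steps).

import numpy as np

n_envs, n_steps = 4, 8
reward = np.random.rand(n_envs, n_steps)
discount = np.ones((n_envs, n_steps))      # 0 at terminal transitions
value = np.random.rand(n_envs, n_steps)
last_value = np.random.rand(n_envs)        # bootstrap value for the final state
traj_ret = np.zeros_like(reward)           # filled in place by compute_nae

advantage, traj_ret = compute_nae(reward, discount, value, last_value,
                                  traj_ret, gamma=0.99)
print(advantage.shape, traj_ret.shape)     # both (n_envs, n_steps), standardized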
Example 3
    def test_gae2(self):
        from algo.ppo2.buffer import Buffer
        for prec in [16, 32]:
            config['precision'] = prec
            buffer = Buffer(config, state_keys=['h', 'c'])
            n_envs = config['n_envs']
            n_steps = config['N_STEPS']
            d = np.zeros(n_envs)
            m = np.ones(n_envs)
            for i in range(n_steps):
                r = np.random.rand(n_envs)
                v = np.random.rand(n_envs)
                h = np.random.rand(n_envs, 32)
                c = np.random.rand(n_envs, 32)
                if np.random.randint(2):
                    d[np.random.randint(n_envs)] = 1
                buffer.add(reward=r, value=v, discount=1 - d, mask=m, c=c, h=h)
                m = 1 - d
            last_value = np.random.rand(n_envs)
            buffer.finish(last_value)

            memory = {
                k: v.copy().reshape((n_envs, -1))
                for k, v in buffer._memory.items()
            }
            mb_advs = np.zeros_like(memory['reward'])
            lastgaelam = 0
            for t in reversed(range(buffer._idx)):
                if t == buffer._idx - 1:
                    nextdiscount = memory['discount'][:, t]
                    nextvalues = last_value
                else:
                    nextdiscount = memory['discount'][:, t]
                    nextvalues = memory['value'][:, t + 1]
                delta = (memory['reward'][:, t]
                         + gamma * nextvalues * nextdiscount
                         - memory['value'][:, t])
                mb_advs[:, t] = lastgaelam = (
                    delta + gae_discount * nextdiscount * lastgaelam)
            # no mask is used here
            mb_advs = standardize(mb_advs)

            np.testing.assert_allclose(mb_advs, memory['advantage'], atol=1e-5)
Example 4
    def sample(self, sample_keys=None):
        assert self._ready
        if self._mb_idx == 0:
            np.random.shuffle(self._shuffled_idxes)

        sample_keys = sample_keys or self._sample_keys
        self._mb_idx, self._curr_idxes = compute_indices(
            self._shuffled_idxes, self._mb_idx, self._mb_size, self.N_MBS)

        sample = {
            k: self._memory[k][self._curr_idxes, 0]
            if k in self._state_keys else self._memory[k][self._curr_idxes]
            for k in sample_keys
        }

        if self._norm_adv == 'minibatch':
            sample['advantage'] = standardize(sample['advantage'],
                                              epsilon=self._epsilon)

        return sample
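
compute_indices is a helper from the surrounding codebase and is not shown here. A plausible reconstruction of its behaviour, inferred only from how it is called above (an assumption, not the actual implementation): it slices the current minibatch out of the shuffled index array and advances the minibatch cursor, wrapping around after N_MBS minibatches.

def compute_indices(idxes, mb_idx, mb_size, n_mbs):
    # hypothetical sketch: take mb_size indices starting at the current
    # minibatch position, then advance the cursor modulo n_mbs
    start = mb_idx * mb_size
    curr_idxes = idxes[start:start + mb_size]
    mb_idx = (mb_idx + 1) % n_mbs
    return mb_idx, curr_idxes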
Example 5
def compute_gae(reward,
                discount,
                value,
                last_value,
                gamma,
                gae_discount,
                norm_adv=False,
                mask=None,
                epsilon=1e-8):
    if last_value is not None:
        last_value = np.expand_dims(last_value, 1)
        next_value = np.concatenate([value[:, 1:], last_value], axis=1)
    else:
        # without a bootstrap value, drop the last value entry so that
        # value and next_value stay aligned and the assert below holds
        next_value = value[:, 1:]
        value = value[:, :-1]
    assert value.shape == next_value.shape, (value.shape, next_value.shape)
    advs = delta = (reward + discount * gamma * next_value - value)
    next_adv = 0
    for i in reversed(range(advs.shape[1])):
        advs[:, i] = next_adv = (delta[:, i] +
                                 discount[:, i] * gae_discount * next_adv)
    traj_ret = advs + value
    if norm_adv:
        advs = standardize(advs, mask=mask, epsilon=epsilon)
    return advs, traj_ret
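
A hypothetical usage sketch for compute_gae, assuming the function above is in scope; with the default norm_adv=False it only needs NumPy. gae_discount is taken to be the combined gamma * lambda factor, consistent with the recursions in the other examples.

import numpy as np

n_envs, n_steps = 4, 8
reward = np.random.rand(n_envs, n_steps)
discount = np.ones((n_envs, n_steps))      # set to 0 where an episode terminates
value = np.random.rand(n_envs, n_steps)
last_value = np.random.rand(n_envs)        # bootstrap value for the final state

advs, traj_ret = compute_gae(reward, discount, value, last_value,
                             gamma=0.99, gae_discount=0.99 * 0.95)
print(advs.shape, traj_ret.shape)          # both (n_envs, n_steps)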
Example 6
    def test_gae0(self):
        from algo.ppo.buffer import Buffer
        buffer = Buffer(config)
        n_envs = config['n_envs']
        n_steps = config['N_STEPS']
        d = np.zeros(n_envs)
        for i in range(n_steps):
            r = np.random.rand(n_envs)
            v = np.random.rand(n_envs)
            if np.random.randint(2):
                d[np.random.randint(n_envs)] = 1
            buffer.add(reward=r, value=v, discount=1 - d)
        last_value = np.random.rand(n_envs)
        buffer.finish(last_value)

        memory = {
            k: v.copy().reshape(n_envs, -1)
            for k, v in buffer._memory.items()
        }
        mb_advs = np.zeros_like(memory['reward'])
        lastgaelam = 0
        for t in reversed(range(buffer._idx)):
            if t == buffer._idx - 1:
                nextdiscount = memory['discount'][:, t]
                nextvalues = last_value
            else:
                nextdiscount = memory['discount'][:, t]
                nextvalues = memory['value'][:, t + 1]
            delta = (memory['reward'][:, t]
                     + gamma * nextvalues * nextdiscount
                     - memory['value'][:, t])
            mb_advs[:, t] = lastgaelam = (
                delta + gae_discount * nextdiscount * lastgaelam)
        mb_advs = standardize(mb_advs)

        np.testing.assert_allclose(mb_advs, memory['advantage'], atol=1e-5)