    def compute_advantage(self, vfn: BaseVFunction, samples: Dataset):
        n_steps = len(samples) // self.n_envs
        samples = samples.reshape((n_steps, self.n_envs))
        if not self.add_absorbing_state:
            # Bootstrap V(s_{t+1}) unless the episode truly terminated; cut the
            # GAE accumulation at both terminations and timeouts.
            use_next_vf = ~samples.done
            use_next_adv = ~(samples.done | samples.timeout)
        else:
            # With an explicit absorbing state the next value is always used;
            # the accumulation stops at absorbing transitions and timeouts.
            absorbing_mask = samples.mask == Mask.ABSORBING
            use_next_vf = np.ones_like(samples.done)
            use_next_adv = ~(absorbing_mask | samples.timeout)

        next_values = vfn.get_values(samples.reshape(-1).next_state).reshape(
            n_steps, self.n_envs)
        values = vfn.get_values(samples.reshape(-1).state).reshape(
            n_steps, self.n_envs)
        advantages = np.zeros((n_steps, self.n_envs), dtype=np.float32)
        last_gae_lambda = 0

        # Backward GAE(lambda) pass:
        #   delta_t = r_t + gamma * V(s_{t+1}) * use_next_vf_t - V(s_t)
        #   A_t     = delta_t + gamma * lambda * use_next_adv_t * A_{t+1}
        for t in reversed(range(n_steps)):
            delta = (samples[t].reward
                     + self.gamma * next_values[t] * use_next_vf[t] - values[t])
            last_gae_lambda = (delta + self.gamma * self.lambda_
                               * last_gae_lambda * use_next_adv[t])
            advantages[t] = last_gae_lambda
        return advantages.reshape(-1), values.reshape(-1)
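
The two masks above encode a common GAE subtlety: a true termination (done) means the next state has no value, so V(s_{t+1}) is dropped from the TD error, while a timeout only cuts the trajectory short, so the value is still bootstrapped but the advantage recursion is reset. A minimal standalone sketch of that behavior, with toy arrays and a single environment (the names below are illustrative and not part of the class above):

import numpy as np

gamma, lam = 0.99, 0.95
rewards  = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32)
values   = np.array([2.0, 2.0, 2.0, 2.0], dtype=np.float32)   # V(s_t)
next_val = np.array([2.0, 2.0, 2.0, 2.0], dtype=np.float32)   # V(s_{t+1})
done     = np.array([False, False, False, True])
timeout  = np.array([False, True,  False, False])

use_next_vf  = ~done               # bootstrap V(s_{t+1}) unless truly terminal
use_next_adv = ~(done | timeout)   # reset the GAE accumulation at any episode boundary

adv = np.zeros(4, dtype=np.float32)
last = 0.0
for t in reversed(range(4)):
    delta = rewards[t] + gamma * next_val[t] * use_next_vf[t] - values[t]
    last = delta + gamma * lam * last * use_next_adv[t]
    adv[t] = last

# Step 1 (the timeout) still bootstraps V(s_2) in its delta but does not fold
# step 2's delta into its advantage; step 3 (the true done) drops V(s_4) entirely.
print(adv)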
Example #2
    def compute_advantage(self,
                          vfn: BaseVFunction,
                          samples: Dataset,
                          task=None):
        n_steps = len(samples) // self.n_envs
        samples = samples.reshape((n_steps, self.n_envs))
        # Bootstrap V(s_{t+1}) unless the episode terminated; reset the GAE
        # accumulation at terminations and timeouts.
        use_next_vf = ~samples.done
        use_next_adv = ~(samples.done | samples.timeout)

        next_values = vfn.get_values(samples[-1].next_state)
        values = vfn.get_values(samples.reshape(-1).state).reshape(
            n_steps, self.n_envs)
        advantages = np.zeros((n_steps, self.n_envs), dtype=np.float32)
        advantages_shadow = np.zeros((n_steps, self.n_envs), dtype=np.float32)

        # V(s_{t+1}) for every step: shift the value estimates by one step
        # (masked by done) and bootstrap the final step from the last next_state.
        next_values_all = np.zeros_like(values, dtype=np.float32)
        next_values_all[:-1] = values[1:] * (1.0 - samples.done[1:])
        next_values_all[-1] = next_values
        # Reward-free part of the TD error: gamma * V(s_{t+1}) - V(s_t).
        td = self.gamma * next_values_all * use_next_vf - values

        # coef_mat[i, j]         = prod_{k=i}^{j-1} gamma * lambda * use_next_adv[k]
        # coef_mat_returns[i, j] = prod_{k=i}^{j-1} gamma * use_next_vf[k]
        # i.e. the discount products from which advantages and discounted returns
        # can be rebuilt as matrix products with the per-step TD errors and rewards.
        coef_mat = np.zeros([n_steps, n_steps, self.n_envs], np.float32)
        coef_mat_returns = np.zeros([n_steps, n_steps, self.n_envs],
                                    np.float32)
        for i in range(n_steps):
            coef = np.ones([self.n_envs], dtype=np.float32)
            coef_r = np.ones([self.n_envs], dtype=np.float32)
            coef_mat[i][i] = coef
            coef_mat_returns[i][i] = coef_r
            for j in range(i + 1, n_steps):
                coef *= self.gamma * self.lambda_ * use_next_adv[j - 1]
                coef_mat[i][j] = coef
                coef_r *= self.gamma * use_next_vf[j - 1]
                coef_mat_returns[i][j] = coef_r
        # Move the env axis first: [n_envs, n_steps, n_steps].
        coef_mat = np.transpose(coef_mat, (2, 0, 1))
        coef_mat_returns = np.transpose(coef_mat_returns, (2, 0, 1))

        reward_ctrl_list = np.array(self.reward_ctrl_list, dtype=np.float32)
        reward_state_list = np.array(self.reward_state_list, dtype=np.float32)

        # Backward GAE(lambda) pass, bootstrapping from the last next_state.
        last_gae_lambda = 0
        next_values = vfn.get_values(samples[-1].next_state)
        for t in reversed(range(n_steps)):
            delta = (samples[t].reward
                     + self.gamma * next_values * use_next_vf[t] - values[t])
            last_gae_lambda = (delta + self.gamma * self.lambda_
                               * last_gae_lambda * use_next_adv[t])
            advantages[t] = last_gae_lambda
            next_values = values[t]

        advantages_params = None
        return (advantages.reshape(-1), advantages_params, values.reshape(-1),
                td, coef_mat, coef_mat_returns, reward_ctrl_list,
                reward_state_list, self.begin_mark)
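
The coefficient matrices built above are the matrix form of the same recursion: row i of coef_mat holds the products gamma * lambda * use_next_adv accumulated from step i onward, so advantages can be recovered as a matrix product with the per-step TD errors. A small self-contained check of that identity on toy data (single environment, names invented here):

import numpy as np

gamma, lam = 0.99, 0.95
T = 6
rng = np.random.default_rng(0)
rewards = rng.normal(size=T).astype(np.float32)
values = rng.normal(size=T + 1).astype(np.float32)        # V(s_0..s_{T-1}) plus a bootstrap value
dones = np.array([0, 0, 1, 0, 0, 0], dtype=np.float32)    # an episode ends after step 2

# One-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
delta = rewards + gamma * values[1:] * (1.0 - dones) - values[:-1]

# Backward recursion, as in the loops above.
adv_rec = np.zeros(T, dtype=np.float32)
last = 0.0
for t in reversed(range(T)):
    last = delta[t] + gamma * lam * (1.0 - dones[t]) * last
    adv_rec[t] = last

# Matrix form: C[i, j] = prod_{k=i}^{j-1} gamma * lam * (1 - done_k) for j >= i.
C = np.zeros((T, T), dtype=np.float32)
for i in range(T):
    coef = 1.0
    C[i, i] = coef
    for j in range(i + 1, T):
        coef *= gamma * lam * (1.0 - dones[j - 1])
        C[i, j] = coef

adv_mat = C @ delta
assert np.allclose(adv_rec, adv_mat, atol=1e-4)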
Example #3
    def compute_advantage(self, vfn: BaseVFunction, samples: Dataset):
        n_steps = len(samples) // self.n_envs
        samples = samples.reshape((n_steps, self.n_envs))
        use_next_vf = ~samples.done
        use_next_adv = ~(samples.done | samples.timeout)

        next_values = vfn.get_values(samples[-1].next_state)
        values = vfn.get_values(samples.reshape(-1).state).reshape(
            n_steps, self.n_envs)
        advantages = np.zeros((n_steps, self.n_envs), dtype=np.float32)
        last_gae_lambda = 0

        for t in reversed(range(n_steps)):
            delta = (samples[t].reward
                     + self.gamma * next_values * use_next_vf[t] - values[t])
            last_gae_lambda = (delta + self.gamma * self.lambda_
                               * last_gae_lambda * use_next_adv[t])
            advantages[t] = last_gae_lambda
            next_values = values[t]
        return advantages.reshape(-1), values.reshape(-1)
    def store_episode(self, data: Dataset):
        data = data.reshape([self.n_steps, self.num_envs])

        # Lazily allocate the backing arrays on the first stored episode.
        if self.state_block is None:
            self.obs_shape, self.obs_dtype = list(
                data.state.shape[2:]), data.state.dtype
            self.state_block = np.empty([self._size], dtype=object)
            self.actions = np.empty([self._size] + list(data.action.shape),
                                    dtype=data.action.dtype)
            self.rewards = np.empty([self._size] + list(data.reward.shape),
                                    dtype=data.reward.dtype)
            self.mus = np.empty([self._size] + list(data.mu.shape),
                                dtype=data.mu.dtype)
            self.dones = np.empty([self._size] + list(data.done.shape),
                                  dtype=bool)
            self.timeouts = np.empty([self._size] + list(data.timeout.shape),
                                     dtype=bool)
            self.infos = np.empty([self._size] + list(data.info.shape),
                                  dtype=object)

        terminals = data.done | data.timeout
        if self.stacked_frame:
            self.state_block[self._next_idx] = StackedFrame(
                data.state, data.next_state, terminals)
        else:
            self.state_block[self._next_idx] = StateBlock(
                data.state, data.next_state, terminals)
        self.actions[self._next_idx] = data.action
        self.rewards[self._next_idx] = data.reward
        self.mus[self._next_idx] = data.mu
        self.dones[self._next_idx] = data.done
        self.timeouts[self._next_idx] = data.timeout
        self.infos[self._next_idx] = data.info

        # Advance the circular write pointer; once full, the oldest slot is
        # overwritten and the count of valid entries saturates at capacity.
        self._next_idx = (self._next_idx + 1) % self._size
        self._total_size += 1
        self._num_in_buffer = min(self._size, self._num_in_buffer + 1)
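
store_episode above implements a fixed-capacity circular buffer: the write index wraps modulo _size and the count of valid entries saturates at the capacity. A minimal, hypothetical sketch of just that bookkeeping (class and field names below are invented for illustration):

import numpy as np

class RingBuffer:
    def __init__(self, size: int):
        self._size = size
        self._next_idx = 0
        self._num_in_buffer = 0
        self._slots = np.empty([size], dtype=object)

    def store(self, episode):
        self._slots[self._next_idx] = episode
        self._next_idx = (self._next_idx + 1) % self._size      # wrap around
        self._num_in_buffer = min(self._size, self._num_in_buffer + 1)

buf = RingBuffer(size=3)
for i in range(5):
    buf.store(f"episode-{i}")
# After 5 stores into 3 slots the two oldest episodes have been overwritten:
print(list(buf._slots), buf._num_in_buffer)   # ['episode-3', 'episode-4', 'episode-2'] 3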