def compute_advantage(self, vfn: BaseVFunction, samples: Dataset):
    # Generalized Advantage Estimation over a batch of parallel-env rollouts,
    # with optional handling of absorbing states.
    n_steps = len(samples) // self.n_envs
    samples = samples.reshape((n_steps, self.n_envs))
    if not self.add_absorbing_state:
        # Bootstrap V(s_{t+1}) only when the episode did not terminate; cut the
        # lambda-accumulation at terminals and at time-limit truncations.
        use_next_vf = ~samples.done
        use_next_adv = ~(samples.done | samples.timeout)
    else:
        # With absorbing states, always keep the bootstrap term and stop the
        # accumulation at absorbing transitions and truncations.
        absorbing_mask = samples.mask == Mask.ABSORBING
        use_next_vf = np.ones_like(samples.done)
        use_next_adv = ~(absorbing_mask | samples.timeout)
    next_values = vfn.get_values(samples.reshape(-1).next_state).reshape(n_steps, self.n_envs)
    values = vfn.get_values(samples.reshape(-1).state).reshape(n_steps, self.n_envs)

    advantages = np.zeros((n_steps, self.n_envs), dtype=np.float32)
    last_gae_lambda = 0
    for t in reversed(range(n_steps)):
        # delta_t = r_t + gamma * V(s_{t+1}) * mask - V(s_t)
        delta = samples[t].reward + self.gamma * next_values[t] * use_next_vf[t] - values[t]
        # A_t = delta_t + gamma * lambda * mask * A_{t+1}
        advantages[t] = last_gae_lambda = delta + self.gamma * self.lambda_ * last_gae_lambda * use_next_adv[t]
    return advantages.reshape(-1), values.reshape(-1)
def compute_advantage(self, vfn: BaseVFunction, samples: Dataset, task=None):
    n_steps = len(samples) // self.n_envs
    samples = samples.reshape((n_steps, self.n_envs))
    use_next_vf = ~samples.done
    use_next_adv = ~(samples.done | samples.timeout)
    next_values = vfn.get_values(samples[-1].next_state)
    values = vfn.get_values(samples.reshape(-1).state).reshape(n_steps, self.n_envs)
    advantages = np.zeros((n_steps, self.n_envs), dtype=np.float32)

    # Per-step bootstrap values: V(s_{t+1}) is the next row of `values`, zeroed
    # at terminals; the last step uses the value of the final next_state.
    next_values_all = np.zeros_like(values, dtype=np.float32)
    next_values_all[:-1] = values[1:] * (1.0 - samples.done[1:])
    next_values_all[-1] = next_values
    # Value-only part of the TD residual; note the reward term is not added here.
    td = self.gamma * next_values_all * use_next_vf - values

    # coef_mat[i, j] holds prod_{k=i}^{j-1} gamma * lambda * use_next_adv[k],
    # i.e. the discount applied to the j-th TD residual in the GAE sum for step i.
    # coef_mat_returns holds the analogous gamma-only products used for
    # discounted returns. Both are built per environment.
    coef_mat = np.zeros([n_steps, n_steps, self.n_envs], np.float32)
    coef_mat_returns = np.zeros([n_steps, n_steps, self.n_envs], np.float32)
    for i in range(n_steps):
        coef = np.ones([self.n_envs], dtype=np.float32)
        coef_r = np.ones([self.n_envs], dtype=np.float32)
        coef_mat[i][i] = coef
        coef_mat_returns[i][i] = coef_r
        for j in range(i + 1, n_steps):
            coef *= self.gamma * self.lambda_ * use_next_adv[j - 1]
            coef_mat[i][j] = coef
            coef_r *= self.gamma * use_next_vf[j - 1]
            coef_mat_returns[i][j] = coef_r
    # Reorder to (n_envs, n_steps, n_steps) so each environment has its own
    # coefficient matrix.
    coef_mat = np.transpose(coef_mat, (2, 0, 1))
    coef_mat_returns = np.transpose(coef_mat_returns, (2, 0, 1))

    # Reward components accumulated elsewhere on this runner, returned for the caller.
    reward_ctrl_list = np.array(self.reward_ctrl_list, dtype=np.float32)
    reward_state_list = np.array(self.reward_state_list, dtype=np.float32)

    # Standard recursive GAE, kept alongside the matrix formulation above.
    last_gae_lambda = 0
    for t in reversed(range(n_steps)):
        delta = samples[t].reward + self.gamma * next_values * use_next_vf[t] - values[t]
        advantages[t] = last_gae_lambda = delta + self.gamma * self.lambda_ * last_gae_lambda * use_next_adv[t]
        next_values = values[t]

    advantages_params = None
    return (advantages.reshape(-1), advantages_params, values.reshape(-1), td,
            coef_mat, coef_mat_returns, reward_ctrl_list, reward_state_list,
            self.begin_mark)
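
# Sketch (not part of the original source): one way the returned `td` and
# `coef_mat` could be combined downstream into full-trajectory GAE. For each
# environment e, coef_mat[e] is upper-triangular with (i, j) entry
# prod_{k=i}^{j-1} gamma * lambda * use_next_adv[k], so a batched product with
# the per-step TD residuals reproduces the recursive estimator in closed form.
# The reward term, omitted from `td` above, is assumed here to be supplied by
# the caller; names and shapes below are illustrative, not from the source.
def full_trajectory_gae(coef_mat, td, rewards):
    # rewards, td: (n_steps, n_envs); coef_mat: (n_envs, n_steps, n_steps)
    delta = rewards + td  # delta_t = r_t + gamma * V(s_{t+1}) * mask - V(s_t)
    # A[i, e] = sum_j coef_mat[e, i, j] * delta[j, e]
    return np.einsum('eij,je->ie', coef_mat, delta)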
def compute_advantage(self, vfn: BaseVFunction, samples: Dataset):
    n_steps = len(samples) // self.n_envs
    samples = samples.reshape((n_steps, self.n_envs))
    # Bootstrap V(s_{t+1}) only when the episode did not terminate; cut the
    # lambda-accumulation at terminals and at time-limit truncations.
    use_next_vf = ~samples.done
    use_next_adv = ~(samples.done | samples.timeout)
    next_values = vfn.get_values(samples[-1].next_state)
    values = vfn.get_values(samples.reshape(-1).state).reshape(n_steps, self.n_envs)

    advantages = np.zeros((n_steps, self.n_envs), dtype=np.float32)
    last_gae_lambda = 0
    for t in reversed(range(n_steps)):
        delta = samples[t].reward + self.gamma * next_values * use_next_vf[t] - values[t]
        advantages[t] = last_gae_lambda = delta + self.gamma * self.lambda_ * last_gae_lambda * use_next_adv[t]
        next_values = values[t]
    return advantages.reshape(-1), values.reshape(-1)
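
# Reference for the recursion above (Generalized Advantage Estimation,
# Schulman et al., 2016). With masks m_vf[t] = 1 - done[t] and
# m_adv[t] = 1 - (done[t] | timeout[t]):
#     delta[t] = reward[t] + gamma * V(s[t+1]) * m_vf[t] - V(s[t])
#     A[t]     = delta[t] + gamma * lambda * m_adv[t] * A[t+1]
# Bootstrapping stops at true terminals, and the lambda-accumulation also stops
# at time-limit truncations, matching the two masks used in the code above.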
def store_episode(self, data: Dataset):
    data = data.reshape([self.n_steps, self.num_envs])
    # Lazily allocate the per-field buffers on the first stored episode.
    if self.state_block is None:
        self.obs_shape, self.obs_dtype = list(data.state.shape[2:]), data.state.dtype
        self.state_block = np.empty([self._size], dtype=object)
        self.actions = np.empty([self._size] + list(data.action.shape), dtype=data.action.dtype)
        self.rewards = np.empty([self._size] + list(data.reward.shape), dtype=data.reward.dtype)
        self.mus = np.empty([self._size] + list(data.mu.shape), dtype=data.mu.dtype)
        # np.bool was removed in recent NumPy releases; use the builtin bool.
        self.dones = np.empty([self._size] + list(data.done.shape), dtype=bool)
        self.timeouts = np.empty([self._size] + list(data.timeout.shape), dtype=bool)
        self.infos = np.empty([self._size] + list(data.info.shape), dtype=object)

    # Store observations as a state block (stacked frames if configured) and
    # write the remaining fields into the ring buffer at the current slot.
    terminals = data.done | data.timeout
    if self.stacked_frame:
        self.state_block[self._next_idx] = StackedFrame(data.state, data.next_state, terminals)
    else:
        self.state_block[self._next_idx] = StateBlock(data.state, data.next_state, terminals)
    self.actions[self._next_idx] = data.action
    self.rewards[self._next_idx] = data.reward
    self.mus[self._next_idx] = data.mu
    self.dones[self._next_idx] = data.done
    self.timeouts[self._next_idx] = data.timeout
    self.infos[self._next_idx] = data.info

    # Advance the ring-buffer cursor, wrapping around once the buffer is full.
    self._next_idx = (self._next_idx + 1) % self._size
    self._total_size += 1
    self._num_in_buffer = min(self._size, self._num_in_buffer + 1)
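
# Sketch (not part of the original class): the ring-buffer bookkeeping above
# implies that, once the buffer has wrapped, the oldest episode sits at
# `_next_idx`. A hypothetical helper mapping "k-th most recent episode" to its
# physical slot could therefore look like this (name and placement illustrative).
def _slot_of_recent(self, k):
    # k = 0 is the episode stored most recently.
    assert 0 <= k < self._num_in_buffer
    return (self._next_idx - 1 - k) % self._size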