def _fit_reward_baseline_compute_advantages(self, paths):
    """Fit the return baseline and compute GAE advantage estimates.

    Only to be called when ``self.return_baseline`` is provided.

    Args:
        paths: list of path dicts, each with at least "rewards" and
            "observations" entries.

    Returns:
        Tuple ``(paths, advantages)``: the paths (with "returns" and
        "advantages" added in place) and a padded, stacked advantage array.
    """
    assert self.return_baseline is not None

    # a) compute discounted returns per path
    for path in paths:
        path["returns"] = utils.discount_cumsum(path["rewards"], self.discount)

    # b) fit the return baseline on the path returns, then predict baselines
    self.return_baseline.fit(paths, target_key='returns')
    all_path_baselines = [
        self.return_baseline.predict(path) for path in paths
    ]

    # c) generalized advantage estimation — delegate to the shared helper
    #    rather than duplicating the GAE delta computation inline
    paths = self._compute_advantages(paths, all_path_baselines)

    # d) pad each path's advantages and stack them into one array
    advantages = []
    for path in paths:
        path_length = path["observations"].shape[0]
        advantages.append(self._pad(path["advantages"], path_length))
    advantages = np.stack(advantages, axis=0)

    # e) optionally normalize / shift advantages
    if self.normalize_adv:
        advantages = utils.normalize_advantages(advantages)
    if self.positive_adv:
        advantages = utils.shift_advantages_to_positive(advantages)

    return paths, advantages
def _compute_advantages(self, paths, all_path_baselines):
    """Attach GAE advantages to every path (in place) and return the list.

    Args:
        paths: list of path dicts with a "rewards" entry.
        all_path_baselines: per-path baseline value predictions, one
            array per path, aligned with ``paths``.

    Returns:
        The same list of paths, each now carrying an "advantages" entry.
    """
    assert len(paths) == len(all_path_baselines)
    gamma = self.discount
    for path, baselines in zip(paths, all_path_baselines):
        # Append a terminal value of 0 so the last TD error uses V(s_T) = 0.
        values = np.append(baselines, 0)
        td_errors = path["rewards"] + gamma * values[1:] - values[:-1]
        path["advantages"] = utils.discount_cumsum(
            td_errors, gamma * self.gae_lambda)
    return paths
def _compute_samples_data(self, paths):
    """Process sampled paths into a padded, stacked samples_data dict.

    Fits the meta-baseline and the per-step baseline, computes adjusted
    rewards (r - b), pads/stacks all path data, and optionally
    normalizes/shifts the adjusted rewards.

    Args:
        paths: list of path dicts.

    Returns:
        Tuple ``(samples_data, paths)`` where ``samples_data`` is a dict
        of stacked arrays (plus "advantages" when ``self.return_baseline``
        is set).
    """
    assert type(paths) == list

    # 1) compute discounted rewards and return paths
    paths = self._compute_discounted_rewards(paths)

    # 2) fit a meta baseline on the discounted returns, then predict
    for path in paths:
        path["returns"] = utils.discount_cumsum(path["rewards"], self.discount)
    self.metabaseline.fit(paths, target_key="returns")
    for path in paths:
        path["meta_baselines_nu"] = self.metabaseline.predict(path)

    # 3) fit baseline estimator using the path returns and predict the return baselines
    self.baseline.fit(paths, target_key='discounted_rewards')
    all_path_baselines = [self.baseline.predict(path) for path in paths]

    # 4) compute adjusted rewards (r - b)
    paths = self._compute_adjusted_rewards(paths, all_path_baselines)

    # 5) stack path data
    mask, observations, actions, rewards, adjusted_rewards, env_infos, agent_infos, meta_baselines = self._pad_and_stack_paths(
        paths)

    # 6) if desired normalize / shift adjusted_rewards
    if self.normalize_adv:
        adjusted_rewards = utils.normalize_advantages(adjusted_rewards)
        meta_baselines = utils.normalize_metabaselines(meta_baselines)
    if self.positive_adv:
        adjusted_rewards = utils.shift_advantages_to_positive(
            adjusted_rewards)

    # 7) create samples_data object
    samples_data = dict(
        mask=mask,
        observations=observations,
        actions=actions,
        rewards=rewards,
        env_infos=env_infos,
        agent_infos=agent_infos,
        adjusted_rewards=adjusted_rewards,
        meta_baselines=meta_baselines,
    )

    # if a return baseline is provided also compute GAE advantage estimates
    if self.return_baseline is not None:
        paths, advantages = self._fit_reward_baseline_compute_advantages(
            paths)
        samples_data['advantages'] = advantages

    return samples_data, paths
def testFit(self):
    """Fitting the linear baseline should reduce squared prediction error.

    For each task: measure squared error of the unfit predictor against
    the discounted returns, fit on that task, and check the error drops.
    """
    paths = self.sampler.obtain_samples()
    for task in paths.values():
        # Error of the predictor before fitting on this task.
        unfit_error = 0
        for path in task:
            path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
            unfit_pred = self.linear.predict(path)
            unfit_error += sum([
                np.square(pred - actual)
                for pred, actual in zip(unfit_pred, path['returns'])
            ])
        self.linear.fit(task)
        # Error after fitting on the same task.
        fit_error = 0
        for path in task:
            fit_pred = self.linear.predict(path)
            fit_error += sum([
                np.square(pred - actual)
                for pred, actual in zip(fit_pred, path['returns'])
            ])
        # assertLess reports both operand values on failure,
        # unlike assertTrue(a < b) which only prints "False is not true".
        self.assertLess(fit_error, unfit_error)
def _compute_samples_data(self, paths):
    """Turn raw paths into a stacked samples_data dict plus the paths.

    Computes discounted returns, fits the baseline on them, derives GAE
    advantages, stacks everything, and optionally normalizes/shifts the
    advantages.

    Args:
        paths: list of path dicts.

    Returns:
        Tuple ``(samples_data, paths)``.
    """
    assert type(paths) == list
    path_lengths = [len(path) for path in paths]

    # 1) discounted cumulative rewards (returns) per path
    for path in paths:
        path["returns"] = utils.discount_cumsum(path["rewards"], self.discount)

    # 2) fit the baseline on the returns, then predict per-path baselines
    self.baseline.fit(paths, target_key="returns")
    all_path_baselines = [self.baseline.predict(path) for path in paths]

    # 3) GAE advantages (added to each path in place)
    paths = self._compute_advantages(paths, all_path_baselines)

    # 4) stack all per-path arrays into batch tensors
    (observations, actions, rewards, returns,
     advantages, env_infos, agent_infos) = self._stack_path_data(paths)

    # 5) optional advantage normalization / positivity shift
    if self.normalize_adv:
        advantages = utils.normalize_advantages(advantages)
    if self.positive_adv:
        advantages = utils.shift_advantages_to_positive(advantages)

    # 6) assemble the samples_data dict
    samples_data = dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        returns=returns,
        advantages=advantages,
        env_infos=env_infos,
        agent_infos=agent_infos,
        path_lengths=path_lengths,
    )

    return samples_data, paths
def testSerialize(self):
    """A pickle round-trip must leave the fitted baseline's predictions unchanged."""
    paths = self.sampler.obtain_samples()
    for task in paths.values():
        # Fit the baseline on this task's discounted returns.
        for path in task:
            path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
        self.linear.fit(task)

        def total_error():
            # Sum of squared prediction errors over the whole task.
            err = 0
            for path in task:
                predictions = self.linear.predict(path)
                for pred, actual in zip(predictions, path['returns']):
                    err += np.square(pred - actual)
            return err

        fit_error_pre = total_error()
        # Round-trip the fitted model through pickle.
        self.linear = pickle.loads(pickle.dumps(self.linear))
        fit_error_post = total_error()
        # Serialization must be lossless: errors match exactly.
        self.assertEqual(fit_error_pre, fit_error_post)