Code example #1
    def _fit_reward_baseline_compute_advantages(self, paths):
        """
        Only to be called if a return_baseline is provided; computes GAE advantage estimates.
        """
        assert self.return_baseline is not None

        # a) compute returns
        for path in paths:
            path["returns"] = utils.discount_cumsum(path["rewards"],
                                                    self.discount)

        # b) fit return baseline estimator using the path returns and predict the return baselines
        self.return_baseline.fit(paths, target_key='returns')
        all_path_baselines = [
            self.return_baseline.predict(path) for path in paths
        ]

        # c) generalized advantage estimation
        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)

        # d) pad paths and stack them
        advantages = []
        for path in paths:
            path_length = path["observations"].shape[0]
            advantages.append(self._pad(path["advantages"], path_length))

        advantages = np.stack(advantages, axis=0)

        # e) if desired, normalize / shift advantages
        if self.normalize_adv:
            advantages = utils.normalize_advantages(advantages)
        if self.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        return paths, advantages
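
These snippets lean on utils helpers that are not shown here (discount_cumsum, normalize_advantages, shift_advantages_to_positive). A rough sketch of what they could look like, modeled on the common rllab-style implementations; this is an assumption, not necessarily this project's actual code:

import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # Reverse-time discounted cumulative sum: y[t] = sum_{k >= t} discount**(k - t) * x[k]
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

def normalize_advantages(advantages):
    # Standardize to zero mean / unit variance; the epsilon guards against division by zero
    return (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

def shift_advantages_to_positive(advantages, eps=1e-8):
    # Shift so the smallest advantage becomes a small positive value
    return (advantages - np.min(advantages)) + eps
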
Code example #2
    def _compute_advantages(self, paths, all_path_baselines):
        assert len(paths) == len(all_path_baselines)

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)

        return paths
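
A toy illustration of the delta / advantage computation above (made-up numbers; it assumes the discount_cumsum sketch from earlier, and the appended 0 plays the role of the terminal-state value):

rewards = np.array([1.0, 1.0, 1.0])
path_baselines = np.array([0.5, 0.4, 0.3, 0.0])  # V(s_0), V(s_1), V(s_2), plus the appended terminal 0
discount, gae_lambda = 0.99, 0.95
# TD residuals: delta_t = r_t + discount * V(s_{t+1}) - V(s_t)
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
# GAE: advantages[t] = sum over l of (discount * gae_lambda)**l * delta_{t+l}
advantages = discount_cumsum(deltas, discount * gae_lambda)
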
Code example #3
    def _compute_samples_data(self, paths):
        assert isinstance(paths, list)

        # 1) compute discounted rewards
        paths = self._compute_discounted_rewards(paths)

        # 2) compute returns, fit the meta baseline on them, and predict meta baselines
        for path in paths:
            path["returns"] = utils.discount_cumsum(path["rewards"],
                                                    self.discount)
        self.metabaseline.fit(paths, target_key="returns")
        for path in paths:
            path["meta_baselines_nu"] = self.metabaseline.predict(path)

        # 3) fit the baseline estimator on the discounted rewards and predict the baselines
        self.baseline.fit(paths, target_key='discounted_rewards')
        all_path_baselines = [self.baseline.predict(path) for path in paths]

        # 4) compute adjusted rewards (r - b)
        paths = self._compute_adjusted_rewards(paths, all_path_baselines)

        # 5) stack path data
        mask, observations, actions, rewards, adjusted_rewards, env_infos, agent_infos, meta_baselines = self._pad_and_stack_paths(
            paths)

        # 6) if desired normalize / shift adjusted_rewards
        if self.normalize_adv:
            adjusted_rewards = utils.normalize_advantages(adjusted_rewards)
            meta_baselines = utils.normalize_metabaselines(meta_baselines)
        if self.positive_adv:
            adjusted_rewards = utils.shift_advantages_to_positive(
                adjusted_rewards)

        # 7) create samples_data object
        samples_data = dict(
            mask=mask,
            observations=observations,
            actions=actions,
            rewards=rewards,
            env_infos=env_infos,
            agent_infos=agent_infos,
            adjusted_rewards=adjusted_rewards,
            meta_baselines=meta_baselines,
        )

        # if a return baseline is provided, also compute GAE advantage estimates
        if self.return_baseline is not None:
            paths, advantages = self._fit_reward_baseline_compute_advantages(
                paths)
            samples_data['advantages'] = advantages

        return samples_data, paths
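
Code examples #1 and #3 also call self._pad and self._pad_and_stack_paths, which are not shown. A minimal sketch of the per-array padding step, assuming a max_path_length attribute on the processor (hypothetical; the project's real helper may differ):

    def _pad(self, array, path_length):
        # Zero-pad the time dimension of a per-path array up to self.max_path_length (assumed attribute)
        assert array.shape[0] == path_length
        pad_width = [(0, self.max_path_length - path_length)] + [(0, 0)] * (array.ndim - 1)
        return np.pad(array, pad_width, mode='constant')
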
Code example #4
    def testFit(self):
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            unfit_error = 0
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
                unfit_pred = self.linear.predict(path)
                unfit_error += sum([
                    np.square(pred - actual)
                    for pred, actual in zip(unfit_pred, path['returns'])
                ])
            self.linear.fit(task)
            fit_error = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error += sum([
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                ])
            self.assertTrue(fit_error < unfit_error)
Code example #5
    def _compute_samples_data(self, paths):
        assert isinstance(paths, list)

        path_lengths = [len(path["rewards"]) for path in paths]

        # 1) compute discounted rewards (returns)
        for path in paths:
            path["returns"] = utils.discount_cumsum(path["rewards"], self.discount)

        # 2) fit baseline estimator using the path returns and predict the return baselines
        self.baseline.fit(paths, target_key="returns")
        all_path_baselines = [self.baseline.predict(path) for path in paths]

        # 3) compute advantages
        paths = self._compute_advantages(paths, all_path_baselines)

        # 4) stack path data
        observations, actions, rewards, returns, advantages, env_infos, agent_infos = self._stack_path_data(paths)

        # 5) if desired normalize / shift advantages
        if self.normalize_adv:
            advantages = utils.normalize_advantages(advantages)
        if self.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        # 6) create samples_data object
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            path_lengths=path_lengths,
        )

        return samples_data, paths
Code example #6
    def testSerialize(self):
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
            self.linear.fit(task)
            fit_error_pre = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_pre += sum([
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                ])
            pkl = pickle.dumps(self.linear)
            self.linear = pickle.loads(pkl)
            fit_error_post = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_post += sum([
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                ])
            self.assertEqual(fit_error_pre, fit_error_post)