Example #1
    def _compute_advantages(self, paths, all_path_baselines):
        assert len(paths) == len(all_path_baselines)

        for idx, path in enumerate(paths):
            # append 0 as the value estimate of the terminal state
            path_baselines = np.append(all_path_baselines[idx], 0)
            # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            deltas = path["rewards"] + \
                self.discount * path_baselines[1:] - \
                path_baselines[:-1]
            # GAE: discounted cumulative sum of deltas with factor gamma * lambda
            path["advantages"] = utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)

        return paths
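
Both methods lean on `utils.discount_cumsum`, which is not shown here. A minimal sketch of what it is assumed to compute, following the common rllab-style implementation (y[t] = x[t] + discount * y[t + 1]):

    import scipy.signal

    def discount_cumsum(x, discount):
        # discounted cumulative sum over the time axis:
        # y[t] = x[t] + discount * y[t + 1], computed with a linear filter
        # applied to the reversed sequence
        return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
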
Example #2

    def _fit_reward_baseline_compute_advantages(self, paths):
        """
        Only to be called if `return_baseline` is provided; computes GAE
        advantage estimates against a fitted return baseline.
        """
        assert self.return_baseline is not None

        # a) compute returns
        for path in paths:
            path["returns"] = utils.discount_cumsum(path["rewards"], self.discount)

        # b) fit return baseline estimator using the path returns and predict the return baselines
        self.return_baseline.fit(paths, target_key='returns')
        all_path_baselines = [self.return_baseline.predict(path) for path in paths]

        # c) generalized advantage estimation
        for idx, path in enumerate(paths):
            # as in _compute_advantages: TD residuals
            # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), then GAE
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)

        # d) pad paths and stack them
        advantages = []
        for path in paths:
            path_length = path["observations"].shape[0]
            # pad each path's advantages to a common length so they can be stacked
            advantages.append(self._pad(path["advantages"], path_length))

        advantages = np.stack(advantages, axis=0)

        # e) if desired, normalize / shift advantages
        if self.normalize_adv:
            advantages = utils.normalize_advantages(advantages)
        if self.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        return paths, advantages
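
The `_pad` helper used in step d) is not shown in this snippet. A minimal sketch, assuming it zero-pads each per-step array up to a `self.max_path_length` attribute (the attribute name is an assumption, not confirmed by the snippet):

    def _pad(self, array, path_length):
        # zero-pad a 1-D per-step array from its current length up to
        # self.max_path_length (assumed attribute) so paths can be stacked
        assert array.shape[0] == path_length
        return np.pad(array, (0, self.max_path_length - path_length),
                      mode='constant', constant_values=0)
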
Example #3
    def testFit(self):
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            # prediction error of the baseline before fitting
            unfit_error = 0
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
                unfit_pred = self.linear.predict(path)
                unfit_error += sum(
                    np.square(pred - actual)
                    for pred, actual in zip(unfit_pred, path['returns'])
                )
            # fitting on the task's paths should reduce the prediction error
            self.linear.fit(task)
            fit_error = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error += sum(
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                )
            self.assertLess(fit_error, unfit_error)
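
For context, `self.linear` is a linear feature baseline. A minimal sketch of the usual fit/predict scheme, assuming rllab-style observation/time features and a small ridge coefficient (both are assumptions, not taken from this test):

    import numpy as np

    class LinearFeatureBaseline:
        """Least-squares baseline on simple observation/time features."""

        def __init__(self, reg_coeff=1e-5):
            self._coeffs = None
            self._reg_coeff = reg_coeff

        def _features(self, path):
            obs = np.clip(path["observations"], -10, 10)
            t = np.arange(len(obs)).reshape(-1, 1) / 100.0
            return np.concatenate([obs, obs ** 2, t, t ** 2, t ** 3,
                                   np.ones_like(t)], axis=1)

        def fit(self, paths, target_key="returns"):
            feats = np.concatenate([self._features(p) for p in paths])
            targets = np.concatenate([p[target_key] for p in paths])
            # ridge-regularized normal equations
            a = feats.T @ feats + self._reg_coeff * np.eye(feats.shape[1])
            b = feats.T @ targets
            self._coeffs = np.linalg.solve(a, b)

        def predict(self, path):
            # an unfitted baseline predicts zero everywhere
            if self._coeffs is None:
                return np.zeros(len(path["rewards"]))
            return self._features(path) @ self._coeffs
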
Example #4
    def _compute_samples_data(self, paths):
        assert isinstance(paths, list)

        # 1) compute discounted rewards (returns)
        for path in paths:
            path["returns"] = utils.discount_cumsum(path["rewards"], self.discount)

        # 2) fit baseline estimator using the path returns and predict the return baselines
        self.baseline.fit(paths, target_key="returns")
        all_path_baselines = [self.baseline.predict(path) for path in paths]

        # 3) compute advantages and adjusted rewards
        paths = self._compute_advantages(paths, all_path_baselines)

        # 4) stack path data
        (observations, actions, rewards, dones, returns, advantages,
         env_infos, agent_infos) = self._concatenate_path_data(paths)

        # 5) if desired, normalize / shift advantages
        if self.normalize_adv:
            advantages = utils.normalize_advantages(advantages)
        if self.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        # 6) create samples_data object
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            dones=dones,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
        )

        return samples_data, paths
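
`utils.normalize_advantages` and `utils.shift_advantages_to_positive` are not defined in these snippets; minimal sketches, assuming the standard definitions (standardize to zero mean and unit variance; shift so the minimum is slightly above zero):

    import numpy as np

    def normalize_advantages(advantages):
        # standardize to zero mean / unit variance (epsilon avoids division by zero)
        return (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

    def shift_advantages_to_positive(advantages):
        # shift so the smallest advantage becomes a small positive value
        return (advantages - np.min(advantages)) + 1e-8
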
Example #5
    def testSerialize(self):
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
            self.linear.fit(task)
            # prediction error with the freshly fitted baseline
            fit_error_pre = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_pre += sum(
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                )
            # round-trip the baseline through pickle
            pkl = pickle.dumps(self.linear)
            self.linear = pickle.loads(pkl)
            fit_error_post = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_post += sum(
                    np.square(pred - actual)
                    for pred, actual in zip(fit_pred, path['returns'])
                )
            self.assertEqual(fit_error_pre, fit_error_post)
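
Exact equality (rather than an approximate float comparison) is the right check here: unpickling should restore the fitted coefficients exactly, so the predictions, and therefore the summed squared errors, must match bit-for-bit.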