Code example #1
File: npo.py  Project: seraliilhan/garage
    def _fit_baseline_with_data(self, samples_data):
        """Update baselines from samples.

        Args:
            samples_data (dict): Processed sample data.
                See garage.tf.paths_to_tensors() for details.

        """
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Augment reward from baselines
        rewards_tensor = self._f_rewards(*policy_opt_input_values)
        returns_tensor = self._f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor, -1)

        paths = samples_data['paths']
        valids = samples_data['valids']

        # Recompute parts of samples_data
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                       paths):
            # Boolean mask keeps only the valid (unpadded) timesteps.
            path['rewards'] = rew[val.astype(bool)]
            path['returns'] = ret[val.astype(bool)]
            aug_rewards.append(path['rewards'])
            aug_returns.append(path['returns'])
        samples_data['rewards'] = np_tensor_utils.pad_tensor_n(
            aug_rewards, self.max_path_length)
        samples_data['returns'] = np_tensor_utils.pad_tensor_n(
            aug_returns, self.max_path_length)

        # Fit baseline
        logger.log('Fitting baseline...')
        self._baseline.fit(paths)
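Here np_tensor_utils.pad_tensor_n turns the list of variable-length per-path arrays into a single (N, max_path_length) batch. A minimal NumPy sketch of that behavior, assuming zero padding (the pad entries are masked out downstream via the valids arrays):

import numpy as np

def pad_tensor_n(xs, max_len):
    # Zero-pad a list of per-path arrays along time and stack them.
    out = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype)
    for i, x in enumerate(xs):
        out[i, :len(x)] = x
    return out

rewards = [np.array([1.0, 1.0]), np.array([1.0, 1.0, 1.0])]
print(pad_tensor_n(rewards, 4))
# [[1. 1. 0. 0.]
#  [1. 1. 1. 0.]]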
Code example #2
File: _rl2npo.py  Project: jamartinh/garage
    def _get_baseline_prediction(self, samples_data):
        """Get baseline prediction.

        Args:
            samples_data (dict): Processed sample data.
                See garage.tf.paths_to_tensors() for details.

        Returns:
            np.ndarray: Baseline prediction, with shape
                :math:`(N, max_path_length * episode_per_task)`.

        """
        paths = samples_data['paths']
        baselines = [self._baseline.predict(path) for path in paths]
        return np_tensor_utils.pad_tensor_n(baselines, self.max_path_length)
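Any object exposing a predict(path) method can serve as the baseline here. A hypothetical stub, for illustration only (ZeroBaseline is not part of garage):

import numpy as np

class ZeroBaseline:
    # Illustrative stand-in: predicts zero value for every timestep.
    def predict(self, path):
        return np.zeros_like(path['rewards'])

paths = [{'rewards': np.ones(3)}, {'rewards': np.ones(5)}]
baselines = [ZeroBaseline().predict(path) for path in paths]
# Each prediction has the length of its path; padding then yields one
# batched array, matching the Returns shape documented above.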
Code example #3
File: meh_wat.py  Project: pything/agent
    def traj_list_to_tensors(paths, max_path_length, baseline_predictions,
                             discount):
        """Return processed sample data based on the collected paths.

        Args:
            paths (list[dict]): A list of collected paths.
            max_path_length (int): Maximum length of a single rollout.
            baseline_predictions (numpy.ndarray): Predicted value of the GAE
                (Generalized Advantage Estimation) baseline.
            discount (float): Environment reward discount.

        Returns:
            dict: Processed sample data, with keys
                * observations (numpy.ndarray): Padded array of the
                    observations of the environment
                * actions (numpy.ndarray): Padded array of the actions fed
                    to the environment
                * rewards (numpy.ndarray): Padded array of the acquired
                    rewards
                * agent_infos (dict): a dictionary of {stacked tensors or
                    dictionary of stacked tensors}
                * env_infos (dict): a dictionary of {stacked tensors or
                    dictionary of stacked tensors}
                * valids (numpy.ndarray): Padded array of the validity
                    information

        """
        baselines = []
        returns = []

        for idx, path in enumerate(paths):
            # baselines
            path["baselines"] = baseline_predictions[idx]
            baselines.append(path["baselines"])

            # returns
            path["returns"] = tensor_utils.discount_cumsum(
                path["rewards"], discount)
            returns.append(path["returns"])

        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            agent_infos=agent_infos,
            env_infos=env_infos,
            valids=valids,
        )

        return samples_data
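path['returns'] holds the discounted return-to-go, R_t = r_t + discount * R_{t+1}. A minimal reference implementation of discount_cumsum consistent with that definition (library versions are typically vectorized, e.g. with scipy.signal.lfilter):

import numpy as np

def discount_cumsum(x, discount):
    # Reverse-time recursion: out[t] = x[t] + discount * out[t + 1].
    out = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

print(discount_cumsum(np.array([1.0, 1.0, 1.0]), 0.9))
# [2.71 1.9  1.  ]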
Code example #4
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = special.discount_cumsum(path['rewards'],
                                                      self.discount)
            returns.append(path['returns'])

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self.episode_reward_mean.extend(undiscounted_returns)

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))
        tabular.record('NumTrajs', len(paths))
        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data
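The entropy statistic is a masked mean: padded timesteps contribute nothing because their valids entries are zero, and perplexity is exp(entropy). A small worked example of the same arithmetic:

import numpy as np

entropies = np.array([[0.5, 0.7, 0.0],    # third step is padding
                      [0.6, 0.0, 0.0]])   # second and third are padding
valids = np.array([[1., 1., 0.],
                   [1., 0., 0.]])
ent = np.sum(entropies * valids) / np.sum(valids)
print(ent, np.exp(ent))  # 0.6 1.822... (entropy and its perplexity)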
Code example #5
    def process_samples(self, itr, paths):
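        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data.

        """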
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, 'predict_n'):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path['rewards'] + \
                self.algo.discount * path_baselines[1:] - path_baselines[:-1]
            path['advantages'] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path['returns'] = special.discount_cumsum(path['rewards'],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path['returns'])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path['observations'] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path['actions'] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path['rewards'] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path['returns'] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path['advantages'] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path['env_infos'] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path['agent_infos'] for path in paths])

            if self.algo.center_adv:
                advantages = utils.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = utils.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path['returns'][0] for path in paths])

            undiscounted_returns = [sum(path['rewards']) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path['advantages']) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path['observations'] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path['advantages'] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path['advantages'] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path['advantages'] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path['actions'] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path['rewards'] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path['returns'] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path['agent_infos'] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path['env_infos'] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path['returns']) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path['returns'][0] for path in paths])

            undiscounted_returns = [sum(path['rewards']) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log('fitting baseline...')
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log('fitted')

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('ExplainedVariance', ev)
        tabular.record('NumTrajs', len(paths))
        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data
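The deltas above are the TD residuals delta_t = r_t + discount * V(s_{t+1}) - V(s_t), computed with a zero value appended for the state after the last step, and the advantages are their (discount * gae_lambda)-discounted cumulative sum; this is standard Generalized Advantage Estimation (GAE). A self-contained sketch of the same computation:

import numpy as np

def gae_advantages(rewards, baselines, discount, gae_lambda):
    # Append a zero bootstrap value for the state after the final step,
    # mirroring np.append(all_path_baselines[idx], 0) above.
    v = np.append(baselines, 0.0)
    # TD residuals: delta_t = r_t + discount * V(s_{t+1}) - V(s_t).
    deltas = rewards + discount * v[1:] - v[:-1]
    # Advantages: (discount * gae_lambda)-discounted cumsum of the deltas.
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running
        adv[t] = running
    return adv

print(gae_advantages(np.array([1.0, 1.0]), np.array([0.5, 0.5]),
                     discount=0.99, gae_lambda=0.95))
# [1.46525 0.5    ]

explained_variance_1d then reports how much of the variance of the empirical returns the baseline accounts for; 1.0 would be a perfect fit, 0.0 no better than a constant predictor.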
Code example #6
File: batch_polopt.py  Project: Jorsorokin/garage
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
            discount=self.discount)

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        return samples_data
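agent_infos and env_infos are (possibly nested) dicts of per-timestep arrays, so each path's dict is zero-padded to max_path_length and the padded dicts are then stacked key by key. A minimal sketch of what pad_tensor_dict and stack_tensor_dict_list are assumed to do here (a hedged reconstruction, not garage's exact implementation):

import numpy as np

def pad_tensor(x, max_len):
    # Zero-pad one array along the time axis (axis 0).
    pad = np.zeros((max_len - len(x),) + x.shape[1:], dtype=x.dtype)
    return np.concatenate([x, pad])

def pad_tensor_dict(d, max_len):
    # Recurse into nested dicts, padding every leaf array.
    return {k: pad_tensor_dict(v, max_len) if isinstance(v, dict)
            else pad_tensor(v, max_len)
            for k, v in d.items()}

def stack_tensor_dict_list(ds):
    # Stack a list of (possibly nested) dicts into one dict of batches.
    return {k: stack_tensor_dict_list([d[k] for d in ds])
            if isinstance(ds[0][k], dict)
            else np.stack([d[k] for d in ds])
            for k in ds[0]}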
Code example #7
File: batch_polopt.py  Project: wjssx/garage
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        terminals = [path['dones'] for path in paths]

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        lengths = np.asarray([v.sum() for v in valids])

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        undiscounted_returns = self.evaluate_performance(
            itr,
            dict(env_spec=None,
                 observations=obs,
                 actions=actions,
                 rewards=rewards,
                 terminals=terminals,
                 env_infos=env_infos,
                 agent_infos=agent_infos,
                 lengths=lengths,
                 discount=self.discount))

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        return samples_data
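The lengths handed to evaluate_performance are recovered from the validity masks: each row of valids holds a 1 per real timestep, so its row sum is the original path length. For example:

import numpy as np

valids = np.array([[1., 1., 1., 0.],   # a path of length 3, padded to 4
                   [1., 1., 0., 0.]])  # a path of length 2
lengths = np.asarray([v.sum() for v in valids])
print(lengths)  # [3. 2.]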