Code Example #1
    def process_samples(self, paths):
        r"""Process sample data based on the collected paths.

        Notes: N is the number of paths and P is the maximum path length
            (self.max_path_length).

        Args:
            paths (list[dict]): A list of collected paths

        Returns:
            torch.Tensor: The observations of the environment
                with shape :math:`(N, P, O*)`.
            torch.Tensor: The actions fed to the environment
                with shape :math:`(N, P, A*)`.
            torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
            torch.Tensor: The discounted returns with shape :math:`(N, P)`.
            torch.Tensor: Numbers of valid steps in each path, with shape
                :math:`(N)`.
            torch.Tensor: Value function estimation at each step
                with shape :math:`(N, P)`.

        """
        valids = torch.Tensor([len(path['actions']) for path in paths]).int()
        obs = torch.stack([
            pad_to_last(path['observations'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        actions = torch.stack([
            pad_to_last(path['actions'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        rewards = torch.stack([
            pad_to_last(path['rewards'], total_length=self.max_path_length)
            for path in paths
        ])
        returns = torch.stack([
            pad_to_last(tu.discount_cumsum(path['rewards'],
                                           self.discount).copy(),
                        total_length=self.max_path_length) for path in paths
        ])
        with torch.no_grad():
            baselines = self._value_function(obs)

        return obs, actions, rewards, returns, valids, baselines
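Example #1 relies on two helpers that are not shown: `pad_to_last` (zero-padding along an axis, returning a torch tensor) and `tu.discount_cumsum` (a discounted cumulative sum). Below is a minimal sketch of what they are assumed to do, inferred from the call sites above rather than taken from the library.

import numpy as np
import torch


def discount_cumsum(x, discount):
    # out[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    x = np.asarray(x, dtype=np.float64)
    out = np.zeros_like(x)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


def pad_to_last(nums, total_length, axis=0):
    # Zero-pad `nums` along `axis` up to `total_length` and return a tensor.
    arr = np.asarray(nums, dtype=np.float32)
    pad_width = [(0, 0)] * arr.ndim
    pad_width[axis] = (0, total_length - arr.shape[axis])
    return torch.from_numpy(np.pad(arr, pad_width, mode='constant'))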
Code Example #2
    def _process_samples(self, paths):
        """Process sample data based on the collected paths.

        Args:
            paths (list[dict]): A list of collected paths.

        Returns:
            MAMLTrajectoryBatch: Processed sample data.

        """
        for path in paths:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self._inner_algo.discount).copy()

        self._train_value_function(paths)
        obs, actions, rewards, _, valids, baselines \
            = self._inner_algo.process_samples(paths)
        return MAMLTrajectoryBatch(paths, obs, actions, rewards, valids,
                                   baselines)
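`MAMLTrajectoryBatch` is not defined in these snippets; from the way it is constructed in examples #2 and #4, it is assumed to be a plain container with one field per positional argument. A hedged sketch follows, with field names inferred from the constructor call rather than taken from the library.

import collections

# Assumed container; the field order mirrors the constructor call above.
MAMLTrajectoryBatch = collections.namedtuple(
    'MAMLTrajectoryBatch',
    ['paths', 'observations', 'actions', 'rewards', 'valids', 'baselines'])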
Code Example #3
File: vpg.py Project: seba-1511/metarl
    def process_samples(self, itr, paths):
        """Process sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            tuple:
                * obs (torch.Tensor): The observations of the environment.
                * actions (torch.Tensor): The actions fed to the environment.
                * rewards (torch.Tensor): The acquired rewards.
                * valids (numpy.ndarray): Numbers of valid steps in each path.
                * baselines (torch.Tensor): Value function estimation
                    at each step.

        """
        for path in paths:
            if 'returns' not in path:
                path['returns'] = tensor_utils.discount_cumsum(
                    path['rewards'], self.discount)

        obs = torch.stack([
            pad_to_last(path['observations'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        actions = torch.stack([
            pad_to_last(path['actions'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        rewards = torch.stack([
            pad_to_last(path['rewards'], total_length=self.max_path_length)
            for path in paths
        ])
        valids = np.array([len(path['actions']) for path in paths])
        baselines = torch.stack([
            pad_to_last(self._get_baselines(path),
                        total_length=self.max_path_length) for path in paths
        ])

        return Samples(obs, actions, rewards, valids, baselines)
Code Example #4
    def _process_samples(self, itr, paths):
        """Process sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            MAMLTrajectoryBatch: Processed sample data.

        """
        for path in paths:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self._inner_algo.discount)

        self._baseline.fit(paths)
        obs, actions, rewards, valids, baselines \
            = self._inner_algo.process_samples(itr, paths)
        return MAMLTrajectoryBatch(paths, obs, actions, rewards, valids,
                                   baselines)
Code Example #5
File: _functions.py Project: seba-1511/metarl
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of trajectories.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        list[float]: Undiscounted return of each trajectory.
        numpy.float64: Mean success rate across trajectories (NaN if no
            trajectory reports 'success' in its env_infos).

    """
    returns = []
    undiscounted_returns = []
    completion = []
    success = []
    for trajectory in batch.split():
        returns.append(discount_cumsum(trajectory.rewards, discount))
        undiscounted_returns.append(sum(trajectory.rewards))
        completion.append(float(trajectory.terminals.any()))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(returns))

        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('CompletionRate', np.mean(completion))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns, np.mean(success)
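The `AverageDiscountedReturn` logged above is the mean of `rtn[0]` over trajectories, because the first element of a discounted cumulative sum is the discounted return of the whole trajectory. A small worked example with hypothetical numbers:

import numpy as np

rewards = np.array([1.0, 1.0, 1.0])
discount = 0.9
# Discounted return of the whole trajectory, i.e. discount_cumsum(...)[0]:
discounted_return = np.sum(rewards * discount ** np.arange(len(rewards)))
# 1 + 0.9 + 0.81 == 2.71; log_performance averages this value across
# trajectories to report AverageDiscountedReturn.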
Code Example #6
def paths_to_tensors(paths, max_path_length, baseline_predictions, discount):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of the GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.

    Returns:
        dict: Processed sample data, with keys
            * observations (numpy.ndarray): Padded array of the observations
                of the environment
            * actions (numpy.ndarray): Padded array of the actions fed to
                the environment
            * rewards (numpy.ndarray): Padded array of the acquired rewards
            * agent_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * env_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * valids (numpy.ndarray): Padded array of the validity information


    """
    baselines = []
    returns = []

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = tensor_utils.discount_cumsum(path['rewards'],
                                                       discount)
        returns.append(path['returns'])

    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list(
        [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    samples_data = dict(observations=obs,
                        actions=actions,
                        rewards=rewards,
                        agent_infos=agent_infos,
                        env_infos=env_infos,
                        valids=valids)

    return samples_data
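Examples #6 and #7 lean on `tensor_utils.pad_tensor_n`, which is assumed to stack a list of variable-length arrays into one zero-padded array of shape (N, max_len, ...). A minimal sketch under that assumption (not the library's implementation):

import numpy as np


def pad_tensor_n(xs, max_len):
    # Stack variable-length arrays into one zero-padded array of shape
    # (len(xs), max_len, *feature_dims).
    first = np.asarray(xs[0])
    out = np.zeros((len(xs), max_len) + first.shape[1:], dtype=first.dtype)
    for i, x in enumerate(xs):
        out[i, :len(x)] = x
    return out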
Code Example #7
def paths_to_tensors(paths, max_path_length, baseline_predictions, discount,
                     gae_lambda):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of the GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.
        gae_lambda (float): Lambda used for generalized advantage
            estimation.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * lengths: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])

    """
    baselines = []
    returns = []
    total_steps = 0

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(baseline_predictions[idx], 0)
        deltas = (path['rewards'] + discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, discount * gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list(
        [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    lengths = np.asarray([v.sum() for v in valids])

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
    )

    return samples_data
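The `deltas`/`advantages` computation in example #7 is generalized advantage estimation: each delta is a one-step TD residual, delta_t = r_t + discount * V(s_{t+1}) - V(s_t) with the value set to zero past the last step, and the advantage is the (discount * gae_lambda)-discounted cumulative sum of the deltas. A hedged, self-contained restatement of those few lines:

import numpy as np


def gae_advantages(rewards, baselines, discount, gae_lambda):
    # One-step TD residuals with a zero bootstrap value after the last step.
    values = np.append(baselines, 0.0)
    deltas = rewards + discount * values[1:] - values[:-1]
    # Advantages: (discount * gae_lambda)-discounted cumulative sum of deltas.
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages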
Code Example #8
    def log_performance(self, indices, test, epoch):
        """Get average returns for specific tasks.

        Args:
            indices (list): List of tasks.
            test (bool): Whether to evaluate on the test tasks instead of
                the training tasks.
            epoch (int): Epoch number, used as the logging iteration.
        """
        discounted_returns = []
        undiscounted_returns = []
        completion = []
        success = []
        traj = []
        for idx in indices:
            eval_paths = []
            for _ in range(self._num_evals):
                paths = self.collect_paths(idx, test)
                paths[-1]['terminals'] = paths[-1]['terminals'].squeeze()
                paths[-1]['dones'] = paths[-1]['terminals']
                # HalfCheetahVel env
                if 'task' in paths[-1]['env_infos'].keys():
                    paths[-1]['env_infos']['task'] = paths[-1]['env_infos'][
                        'task']['velocity']
                eval_paths.append(paths[-1])
                discounted_returns.append(
                    discount_cumsum(paths[-1]['rewards'], self._discount))
                undiscounted_returns.append(sum(paths[-1]['rewards']))
                completion.append(float(paths[-1]['terminals'].any()))
                # calculate success rate for metaworld tasks
                if 'success' in paths[-1]['env_infos']:
                    success.append(paths[-1]['env_infos']['success'].any())

            if test:
                env = self.test_env[idx]()
                temp_traj = TrajectoryBatch.from_trajectory_list(
                    env, eval_paths)
            else:
                env = self.env[idx]()
                temp_traj = TrajectoryBatch.from_trajectory_list(
                    env, eval_paths)
            traj.append(temp_traj)

        if test:
            with tabular.prefix('Test/'):
                if self._test_task_names:
                    log_multitask_performance(
                        epoch,
                        TrajectoryBatch.concatenate(*traj),
                        self._discount,
                        task_names=self._test_task_names)
                log_performance(epoch,
                                TrajectoryBatch.concatenate(*traj),
                                self._discount,
                                prefix='Average')
        else:
            with tabular.prefix('Train/'):
                if self._train_task_names:
                    log_multitask_performance(
                        epoch,
                        TrajectoryBatch.concatenate(*traj),
                        self._discount,
                        task_names=self._train_task_names)
                log_performance(epoch,
                                TrajectoryBatch.concatenate(*traj),
                                self._discount,
                                prefix='Average')
Code Example #9
    def _process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (OrderedDict[dict]): A list of collected paths for each
                task. In RL^2, there are n environments/tasks and paths in
                each of them will be concatenated at some point and fed to
                the policy.

        Returns:
            dict: Processed sample data, with keys
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        Raises:
            ValueError: If 'batch_idx' is not found.

        """
        concatenated_paths = []

        paths_by_task = collections.defaultdict(list)
        for path in paths:
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self._discount)
            path['lengths'] = [len(path['rewards'])]
            if 'batch_idx' in path:
                paths_by_task[path['batch_idx']].append(path)
            elif 'batch_idx' in path['agent_infos']:
                paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
            else:
                raise ValueError(
                    'Batch idx is required for RL2 but not found, '
                    'Make sure to use metarl.tf.algos.rl2.RL2Worker '
                    'for sampling')

        # all paths in paths_by_task[i] are sampled from task[i]
        for _paths in paths_by_task.values():
            concatenated_path = self._concatenate_paths(_paths)
            concatenated_paths.append(concatenated_path)

        # stack and pad to max path length of the concatenated
        # path, which will be fed to inner algo
        # i.e. max_path_length * episode_per_task
        concatenated_paths_stacked = (
            np_tensor_utils.stack_and_pad_tensor_dict_list(
                concatenated_paths, self._inner_algo.max_path_length))

        name_map = None
        if hasattr(self._task_sampler, '_envs') and hasattr(
                self._task_sampler._envs[0].env, 'all_task_names'):
            names = [
                env.env.all_task_names[0] for env in self._task_sampler._envs
            ]
            name_map = dict(enumerate(names))

        undiscounted_returns = log_multitask_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
            self._inner_algo._discount,
            name_map=name_map)

        concatenated_paths_stacked['paths'] = concatenated_paths
        concatenated_paths_stacked['average_return'] = np.mean(
            undiscounted_returns)

        return concatenated_paths_stacked
Code Example #10
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
            discount=self.discount)

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        return samples_data
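The entropy logged in examples #10 and #12 is a masked mean: padded timesteps have a `valids` entry of zero, so `np.sum(entropy * valids) / np.sum(valids)` averages the policy entropy over real environment steps only. A small illustration with hypothetical numbers:

import numpy as np

entropy = np.array([[0.5, 0.7, 0.0],   # two paths padded to length 3
                    [0.6, 0.0, 0.0]])
valids = np.array([[1.0, 1.0, 0.0],    # 2 valid steps and 1 valid step
                   [1.0, 0.0, 0.0]])
ent = np.sum(entropy * valids) / np.sum(valids)  # (0.5 + 0.7 + 0.6) / 3 = 0.6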
Code Example #11
File: rl2.py Project: seba-1511/metarl
    def _process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (OrderedDict[dict]): A list of collected paths for each task.
                In RL^2, there are n environments/tasks and paths in each of them
                will be concatenated at some point and fed to the policy.

        Returns:
            dict: Processed sample data, with keys
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        concatenated_path_in_meta_batch = []
        lengths = []

        paths_by_task = collections.defaultdict(list)
        for path in paths:
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self._discount)
            path['lengths'] = [len(path['rewards'])]
            if 'batch_idx' in path:
                paths_by_task[path['batch_idx']].append(path)
            elif 'batch_idx' in path['agent_infos']:
                paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
            else:
                raise ValueError('Batch idx is required for RL2 but not found')

        # all paths in paths_by_task[i] are sampled from task[i]
        for path in paths_by_task.values():
            concatenated_path = self._concatenate_paths(path)
            concatenated_path_in_meta_batch.append(concatenated_path)

        # prepare paths for inner algorithm
        # pad the concatenated paths
        observations, actions, rewards, terminals, returns, valids, lengths, env_infos, agent_infos = \
            self._stack_paths(max_len=self._inner_algo.max_path_length, paths=concatenated_path_in_meta_batch)

        # prepare paths for performance evaluation
        # performance is evaluated across all paths, so each path
        # is padded with self._max_path_length
        _observations, _actions, _rewards, _terminals, _, _valids, _lengths, _env_infos, _agent_infos = \
            self._stack_paths(max_len=self._max_path_length, paths=paths)

        ent = np.sum(self._policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        undiscounted_returns = log_multitask_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            self._inner_algo.discount,
            task_names=self._task_names)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))

        # all paths in each meta batch are stacked together
        # shape: [meta_batch, max_path_length * episode_per_task, *dims]
        # per RL^2
        concatenated_path = dict(observations=observations,
                                 actions=actions,
                                 rewards=rewards,
                                 valids=valids,
                                 lengths=lengths,
                                 baselines=np.zeros_like(rewards),
                                 agent_infos=agent_infos,
                                 env_infos=env_infos,
                                 paths=concatenated_path_in_meta_batch,
                                 average_return=np.mean(undiscounted_returns))

        return concatenated_path
Code Example #12
    def process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with keys
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        baselines = []
        returns = []
        total_steps = 0

        max_path_length = self.max_path_length

        dic = defaultdict(list)
        for path in paths:
            assert all(path['env_infos']['task_name'][0] == name
                       for name in path['env_infos']['task_name'])
            dic[path['env_infos']['task_name'][0]].append(path)

        avg_success = 0
        for task_name, sep_paths in dic.items():
            undiscounted_returns, success = log_multitask_performance(
                itr,
                TrajectoryBatch.from_trajectory_list(self.env_spec, sep_paths),
                discount=self.discount,
                task_names=self.task_names)
            avg_success += success
        avg_success /= len(dic)
        tabular.record('AverageSuccessRate', avg_success)

        if self.flatten_input:
            paths = [
                dict(
                    observations=(self.env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos'],
                    dones=path['dones']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos'],
                    dones=path['dones']) for path in paths
            ]

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            total_steps += len(path['rewards'])
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        lengths = np.asarray([v.sum() for v in valids])

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            lengths=lengths,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        return samples_data
Code Example #13
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data.

        Note:
            The returned samples is a dictionary with keys
                - observations: (numpy.ndarray), shape [B * (T), *obs_dims]
                - actions: (numpy.ndarray), shape [B * (T), *act_dims]
                - rewards : (numpy.ndarray), shape [B * (T), ]
                - baselines: (numpy.ndarray), shape [B * (T), ]
                - returns: (numpy.ndarray), shape [B * (T), ]
                - lengths: (numpy.ndarray), shape [P, ], i-th entry represents
                  the length of the i-th path.
                - valids: (numpy.ndarray), shape [P, ], i-th entry is an array
                  of ones with the length of the i-th path (every collected
                  sample is valid).
                - agent_infos: (dict), see
                  OnPolicyVectorizedSampler.obtain_samples()
                - env_infos: (dict), see
                  OnPolicyVectorizedSampler.obtain_samples()
                - paths: (list[dict]) The original path with observation or
                  action flattened
                - average_return: (numpy.float64)

            where B = batch size, (T) = variable length of each trajectory,
            and P = number of paths. Notice that B * (T) equals the total
            number of environment steps in all trajectories.

        """
        baselines = []
        returns = []

        if self._flatten_input:
            paths = [
                dict(
                    observations=(self._env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self._env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self._env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]

        if hasattr(self._baseline, 'predict_n'):
            all_path_baselines = self._baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self._baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self._discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self._discount * self._gae_lambda)
            path['deltas'] = deltas
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])
            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self._discount)
            returns.append(path['returns'])

        obs = np.concatenate([path['observations'] for path in paths])
        actions = np.concatenate([path['actions'] for path in paths])
        rewards = np.concatenate([path['rewards'] for path in paths])
        returns = np.concatenate(returns)
        baselines = np.concatenate(baselines)

        agent_infos_path = [path['agent_infos'] for path in paths]
        agent_infos = dict()
        for key in self._policy.state_info_keys:
            agent_infos[key] = np.concatenate(
                [infos[key] for infos in agent_infos_path])

        env_infos_path = [path['env_infos'] for path in paths]
        env_infos = dict()
        for key in paths[0]['env_infos'].keys():
            env_infos[key] = np.concatenate(
                [infos[key] for infos in env_infos_path])

        valids = np.asarray([np.ones_like(path['returns']) for path in paths])
        lengths = np.asarray([v.sum() for v in valids])

        average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self._episode_reward_mean.extend(undiscounted_returns)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            lengths=lengths,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        tabular.record('NumTrajs', len(paths))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data