Example #1
    def evaluate(self, algo, test_rollouts_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (metarl.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
            test_rollouts_per_task (int or None): Number of rollouts per task.
                If None, defaults to self._n_test_rollouts.

        """
        if test_rollouts_per_task is None:
            test_rollouts_per_task = self._n_test_rollouts
        adapted_trajectories = []
        logger.log('Sampling for adaptation and meta-testing...')
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = TrajectoryBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr, test_rollouts_per_task * self._max_path_length,
                adapted_policy)
            adapted_trajectories.append(adapted_traj)
        logger.log('Finished meta-testing...')

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                TrajectoryBatch.concatenate(*adapted_trajectories),
                getattr(algo, 'discount', 1.0),
                task_names=self._test_task_names)
        self._eval_itr += 1
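The tabular.prefix context manager above namespaces every key recorded inside it (e.g. as 'MetaTest/AverageReturn'). Below is a minimal, self-contained dowel sketch of that behavior, mirroring the CsvOutput pattern used in the next example; the 'MetaTest/' prefix and the recorded value are assumptions, not taken from the source.

import csv
import tempfile

import dowel
from dowel import logger, tabular

log_file = tempfile.NamedTemporaryFile()
logger.add_output(dowel.CsvOutput(log_file.name))
with tabular.prefix('MetaTest/'):            # assumed prefix
    tabular.record('AverageReturn', 123.4)   # made-up value
logger.log(tabular)
logger.dump_output_type(dowel.CsvOutput)
with open(log_file.name, 'r') as f:
    print(list(csv.DictReader(f)))  # the key appears as 'MetaTest/AverageReturn'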
Example #2
def test_log_multitask_performance_task_id():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
                           dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool),
            'task_id':
            np.array([1] * 10 + [3] * 5 + [1] + [4])
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_multitask_performance(7, batch, 0.8, {
        1: 'env1',
        3: 'env2',
        4: 'env3',
        5: 'env4'
    })
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['env1/Iteration'] == 7
    assert res['env2/Iteration'] == 7
    assert res['env3/Iteration'] == 7
    assert res['env4/Iteration'] == 7
    assert res['env1/NumTrajs'] == 2
    assert res['env2/NumTrajs'] == 1
    assert res['env3/NumTrajs'] == 1
    assert res['env4/NumTrajs'] == 0
    assert math.isclose(res['env1/SuccessRate'], 0.5)
    assert math.isclose(res['env2/SuccessRate'], 1.0)
    assert math.isclose(res['env3/SuccessRate'], 1.0)
    assert math.isnan(res['env4/SuccessRate'])
    assert math.isnan(res['env4/AverageReturn'])
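The asserted numbers follow directly from how the flat arrays split into trajectories. Here is a small hand computation (not part of the test) that groups the success flags by trajectory and task_id; it reproduces the NumTrajs and SuccessRate values checked above.

import numpy as np

lengths = np.array([10, 5, 1, 1])
task_id = np.array([1] * 10 + [3] * 5 + [1] + [4])
success = np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                   dtype=bool)

start = 0
per_task = {}
for length in lengths:
    tid = int(task_id[start])
    # a trajectory counts as a success if any of its steps is a success
    per_task.setdefault(tid, []).append(bool(success[start:start + length].any()))
    start += length

for tid, flags in sorted(per_task.items()):
    print(tid, len(flags), np.mean(flags))
# 1 2 0.5  -> env1: NumTrajs == 2, SuccessRate == 0.5
# 3 1 1.0  -> env2: NumTrajs == 1, SuccessRate == 1.0
# 4 1 1.0  -> env3: NumTrajs == 1, SuccessRate == 1.0
# task id 5 ('env4') never appears, so its statistics come out as NaN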
Example #3
    def evaluate(self, algo, test_rollouts_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (metarl.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
            test_rollouts_per_task (int or None): Number of rollouts per task.
                If None, defaults to self._n_test_rollouts.

        """
        if test_rollouts_per_task is None:
            test_rollouts_per_task = self._n_test_rollouts
        adapted_trajectories = []
        logger.log('Sampling for adaptation and meta-testing...')
        if self._test_sampler is None:
            self._test_sampler = LocalSampler.from_worker_factory(
                WorkerFactory(seed=get_seed(),
                              max_path_length=self._max_path_length,
                              n_workers=1,
                              worker_class=self._worker_class,
                              worker_args=self._worker_args),
                agents=algo.get_exploration_policy(),
                envs=self._test_task_sampler.sample(1))
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = TrajectoryBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr, test_rollouts_per_task * self._max_path_length,
                adapted_policy)
            adapted_trajectories.append(adapted_traj)
        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                TrajectoryBatch.concatenate(*adapted_trajectories),
                getattr(algo, 'discount', 1.0),
                name_map=name_map)
        self._eval_itr += 1
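The name_map built above is simply an index-to-name dictionary over the ordered test task names, presumably keyed by the same integer task ids recorded in the sampled environments. A trivial sketch with hypothetical names:

test_task_names = ['push-v1', 'reach-v1', 'pick-place-v1']  # hypothetical names
name_map = dict(enumerate(test_task_names))
print(name_map)  # {0: 'push-v1', 1: 'reach-v1', 2: 'pick-place-v1'}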
Example #4
    def log_performance(self, itr, all_samples, loss_before, loss_after,
                        kl_before, kl, policy_entropy):
        """Evaluate performance of this batch.

        Args:
            itr (int): Iteration number.
            all_samples (list[list[MAMLTrajectoryBatch]]): Two-dimensional
                list of MAMLTrajectoryBatch with shape
                [meta_batch_size][num_grad_updates + 1].
            loss_before (float): Loss before optimization step.
            loss_after (float): Loss after optimization step.
            kl_before (float): KL divergence before optimization step.
            kl (float): KL divergence after optimization step.
            policy_entropy (float): Policy entropy.

        Returns:
            float: The average return in last epoch cycle.

        """
        tabular.record('Iteration', itr)

        name_map = None
        if hasattr(self._env, 'all_task_names'):
            names = self._env.all_task_names
            name_map = dict(zip(names, names))

        rtns = log_multitask_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(
                env_spec=self._env.spec,
                paths=[
                    path for task_paths in all_samples
                    for path in task_paths[self._num_grad_updates].paths
                ]),
            discount=self._inner_algo.discount,
            name_map=name_map)

        with tabular.prefix(self._policy.name + '/'):
            tabular.record('LossBefore', loss_before)
            tabular.record('LossAfter', loss_after)
            tabular.record('dLoss', loss_before - loss_after)
            tabular.record('KLBefore', kl_before)
            tabular.record('KLAfter', kl)
            tabular.record('Entropy', policy_entropy)

        return np.mean(rtns)
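The nested comprehension above keeps only the post-adaptation batches, i.e. index num_grad_updates of each task's list, and flattens their paths before logging. A self-contained sketch with stand-in data; Batch here is a hypothetical substitute for MAMLTrajectoryBatch.

from collections import namedtuple

Batch = namedtuple('Batch', ['paths'])  # stand-in for MAMLTrajectoryBatch
num_grad_updates = 1
all_samples = [[Batch(paths=['a0', 'a1']), Batch(paths=['a2'])],   # task A
               [Batch(paths=['b0']), Batch(paths=['b1', 'b2'])]]   # task B
flat = [path for task_paths in all_samples
        for path in task_paths[num_grad_updates].paths]
print(flat)  # ['a2', 'b1', 'b2'] -- only the adapted (last) batches are logged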
Example #5
    def _evaluate_policy(self, epoch):
        """Evaluate the performance of the policy via deterministic rollouts.

        Statistics such as (average) discounted return and success rate are
        recorded.

        Args:
            epoch (int): The current training epoch.

        Returns:
            float: The average return across self._num_evaluation_trajectories
                trajectories

        """
        eval_trajs = []
        for _ in range(self._num_tasks):
            eval_trajs.append(
                self._obtain_evaluation_samples(
                    self._eval_env,
                    num_trajs=self._num_evaluation_trajectories))
        eval_trajs = TrajectoryBatch.concatenate(*eval_trajs)
        last_return = log_multitask_performance(epoch, eval_trajs,
                                                self.discount)
        return last_return
Example #6
    def log_performance(self, indices, test, epoch):
        """Get average returns for specific tasks.

        Args:
            indices (list): List of task indices to evaluate on.
            test (bool): If True, evaluate on the test tasks; otherwise
                evaluate on the training tasks.
            epoch (int): Current training epoch.

        """
        discounted_returns = []
        undiscounted_returns = []
        completion = []
        success = []
        traj = []
        for idx in indices:
            eval_paths = []
            for _ in range(self._num_evals):
                paths = self.collect_paths(idx, test)
                paths[-1]['terminals'] = paths[-1]['terminals'].squeeze()
                paths[-1]['dones'] = paths[-1]['terminals']
                # HalfCheetahVel env
                if 'task' in paths[-1]['env_infos'].keys():
                    paths[-1]['env_infos']['task'] = paths[-1]['env_infos'][
                        'task']['velocity']
                eval_paths.append(paths[-1])
                discounted_returns.append(
                    discount_cumsum(paths[-1]['rewards'], self._discount))
                undiscounted_returns.append(sum(paths[-1]['rewards']))
                completion.append(float(paths[-1]['terminals'].any()))
                # calculate success rate for metaworld tasks
                if 'success' in paths[-1]['env_infos']:
                    success.append(paths[-1]['env_infos']['success'].any())

            if test:
                env = self.test_env[idx]()
                temp_traj = TrajectoryBatch.from_trajectory_list(
                    env, eval_paths)
            else:
                env = self.env[idx]()
                temp_traj = TrajectoryBatch.from_trajectory_list(
                    env, eval_paths)
            traj.append(temp_traj)

        if test:
            with tabular.prefix('Test/'):
                if self._test_task_names:
                    log_multitask_performance(
                        epoch,
                        TrajectoryBatch.concatenate(*traj),
                        self._discount,
                        task_names=self._test_task_names)
                log_performance(epoch,
                                TrajectoryBatch.concatenate(*traj),
                                self._discount,
                                prefix='Average')
        else:
            with tabular.prefix('Train/'):
                if self._train_task_names:
                    log_multitask_performance(
                        epoch,
                        TrajectoryBatch.concatenate(*traj),
                        self._discount,
                        task_names=self._train_task_names)
                log_performance(epoch,
                                TrajectoryBatch.concatenate(*traj),
                                self._discount,
                                prefix='Average')
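The HalfCheetahVel branch above replaces the per-step task dict with just its velocity entry so the env_infos can be stacked into a TrajectoryBatch. A tiny sketch of that substitution with made-up values:

import numpy as np

path = {'env_infos': {'task': {'velocity': np.array([1.5, 1.5, 1.5])}}}  # made up
if 'task' in path['env_infos']:
    path['env_infos']['task'] = path['env_infos']['task']['velocity']
print(path['env_infos']['task'])  # [1.5 1.5 1.5]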
Example #7
    def _process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (OrderedDict[dict]): A list of collected paths for each
                task. In RL^2, there are n environments/tasks and paths in
                each of them will be concatenated at some point and fed to
                the policy.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        Raises:
            ValueError: If 'batch_idx' is not found.

        """
        concatenated_paths = []

        paths_by_task = collections.defaultdict(list)
        for path in paths:
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self._discount)
            path['lengths'] = [len(path['rewards'])]
            if 'batch_idx' in path:
                paths_by_task[path['batch_idx']].append(path)
            elif 'batch_idx' in path['agent_infos']:
                paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
            else:
                raise ValueError(
                    'Batch idx is required for RL2 but not found, '
                    'Make sure to use metarl.tf.algos.rl2.RL2Worker '
                    'for sampling')

        # all paths in paths_by_task[i] are sampled from task[i]
        for _paths in paths_by_task.values():
            concatenated_path = self._concatenate_paths(_paths)
            concatenated_paths.append(concatenated_path)

        # stack and pad the concatenated paths to the inner algorithm's
        # max path length, i.e. max_path_length * episodes_per_task
        concatenated_paths_stacked = (
            np_tensor_utils.stack_and_pad_tensor_dict_list(
                concatenated_paths, self._inner_algo.max_path_length))

        name_map = None
        if hasattr(self._task_sampler, '_envs') and hasattr(
                self._task_sampler._envs[0].env, 'all_task_names'):
            names = [
                env.env.all_task_names[0] for env in self._task_sampler._envs
            ]
            name_map = dict(enumerate(names))

        undiscounted_returns = log_multitask_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
            self._inner_algo._discount,
            name_map=name_map)

        concatenated_paths_stacked['paths'] = concatenated_paths
        concatenated_paths_stacked['average_return'] = np.mean(
            undiscounted_returns)

        return concatenated_paths_stacked
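The grouping step above keys each path by its meta-batch index, taken either from the path itself or from its agent_infos. A minimal sketch of that logic with hypothetical paths:

import collections

paths = [{'batch_idx': 0, 'agent_infos': {}, 'rewards': [1.0, 1.0]},
         {'agent_infos': {'batch_idx': [1, 1]}, 'rewards': [0.5]},
         {'batch_idx': 0, 'agent_infos': {}, 'rewards': [2.0]}]
paths_by_task = collections.defaultdict(list)
for path in paths:
    if 'batch_idx' in path:
        paths_by_task[path['batch_idx']].append(path)
    elif 'batch_idx' in path['agent_infos']:
        paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
print({k: len(v) for k, v in paths_by_task.items()})  # {0: 2, 1: 1}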
Example #8
    def _process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (OrderedDict[dict]): A list of collected paths for each task.
                In RL^2, there are n environments/tasks and paths in each of them
                will be concatenated at some point and fed to the policy.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        concatenated_path_in_meta_batch = []
        lengths = []

        paths_by_task = collections.defaultdict(list)
        for path in paths:
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self._discount)
            path['lengths'] = [len(path['rewards'])]
            if 'batch_idx' in path:
                paths_by_task[path['batch_idx']].append(path)
            elif 'batch_idx' in path['agent_infos']:
                paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
            else:
                raise ValueError('Batch idx is required for RL2 but not found')

        # all paths in paths_by_task[i] are sampled from task[i]
        for path in paths_by_task.values():
            concatenated_path = self._concatenate_paths(path)
            concatenated_path_in_meta_batch.append(concatenated_path)

        # prepare paths for inner algorithm
        # pad the concatenated paths
        observations, actions, rewards, terminals, returns, valids, lengths, env_infos, agent_infos = \
            self._stack_paths(max_len=self._inner_algo.max_path_length, paths=concatenated_path_in_meta_batch)

        # prepare paths for performance evaluation
        # performance is evaluated across all paths, so each path
        # is padded with self._max_path_length
        _observations, _actions, _rewards, _terminals, _, _valids, _lengths, _env_infos, _agent_infos = \
            self._stack_paths(max_len=self._max_path_length, paths=paths)

        ent = np.sum(self._policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        undiscounted_returns = log_multitask_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            self._inner_algo.discount,
            task_names=self._task_names)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))

        # all paths in each meta batch are stacked together
        # shape: [meta_batch, max_path_length * episodes_per_task, *dims]
        # as required by RL^2
        concatenated_path = dict(observations=observations,
                                 actions=actions,
                                 rewards=rewards,
                                 valids=valids,
                                 lengths=lengths,
                                 baselines=np.zeros_like(rewards),
                                 agent_infos=agent_infos,
                                 env_infos=env_infos,
                                 paths=concatenated_path_in_meta_batch,
                                 average_return=np.mean(undiscounted_returns))

        return concatenated_path
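The entropy bookkeeping above is a valid-masked average: padded steps (valid == 0) are excluded from the mean, and perplexity is its exponential. A small numpy sketch with made-up per-step entropies:

import numpy as np

per_step_entropy = np.array([[0.7, 0.6, 0.0],
                             [0.9, 0.0, 0.0]])   # made-up values, padded to length 3
valids = np.array([[1., 1., 0.],
                   [1., 0., 0.]])                # 1 for real steps, 0 for padding
ent = np.sum(per_step_entropy * valids) / np.sum(valids)
print(ent, np.exp(ent))  # mean entropy over valid steps, and its perplexity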
Example #9
    def process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        baselines = []
        returns = []
        total_steps = 0

        max_path_length = self.max_path_length

        dic = defaultdict(lambda: list())
        for path in paths:
            assert all(path['env_infos']['task_name'][0] == name
                       for name in path['env_infos']['task_name'])
            dic[path['env_infos']['task_name'][0]].append(path)

        avg_success = 0
        for task_name, sep_paths in dic.items():
            undiscounted_returns, success = log_multitask_performance(
                itr,
                TrajectoryBatch.from_trajectory_list(self.env_spec, sep_paths),
                discount=self.discount,
                task_names=self.task_names)
            avg_success += success
        avg_success /= len(dic)
        tabular.record('AverageSuccessRate', avg_success)

        if self.flatten_input:
            paths = [
                dict(
                    observations=(self.env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos'],
                    dones=path['dones']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos'],
                    dones=path['dones']) for path in paths
            ]

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            total_steps += len(path['rewards'])
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        lengths = np.asarray([v.sum() for v in valids])

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            lengths=lengths,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        return samples_data
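The advantage computation above is generalized advantage estimation: TD residuals against the baseline, discount-cumsummed with discount * gae_lambda. Below is a self-contained sketch of the same arithmetic with made-up numbers; discount_cumsum here is a plain-Python stand-in for np_tensor_utils.discount_cumsum, not the framework's implementation.

import numpy as np

def discount_cumsum(x, discount):
    # out[t] = sum over l >= 0 of discount**l * x[t + l]
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

rewards = np.array([1.0, 0.0, 2.0])       # made-up path
baselines = np.array([0.5, 0.4, 0.3])     # made-up value predictions
discount, gae_lambda = 0.99, 0.95

path_baselines = np.append(baselines, 0)  # V(s_T) := 0, as in the code above
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum(deltas, discount * gae_lambda)
returns = discount_cumsum(rewards, discount)
print(deltas, advantages, returns, sep='\n')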