Example #1
    def __call__(self):
        results = OrderedDict()
        for name, indices in [
            ('train_tasks', self.train_task_indices),
            ('test_tasks', self.test_task_indices),
        ]:
            final_returns, online_returns, idx_to_final_context = self.algorithm._do_eval(
                indices, -1)
            results['eval/adaptation/{}/final_returns Mean'.format(
                name)] = np.mean(final_returns)
            results['eval/adaptation/{}/all_returns Mean'.format(
                name)] = np.mean(online_returns)

            if 'train' in name:
                z_dist_log = self.algorithm._get_z_distribution_log(
                    idx_to_final_context)
                append_log(results,
                           z_dist_log,
                           prefix='trainer/{}/'.format(name))

        paths = []
        for idx in self.train_task_indices:
            paths += self._get_init_from_buffer_path(idx)
        results[
            'eval/init_from_buffer/train_tasks/all_returns Mean'] = np.mean(
                eval_util.get_average_returns(paths))
        return results
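Every example on this page passes a list of rollout dictionaries to eval_util.get_average_returns. As a minimal, self-contained sketch of the data layout those calls assume (each path carries a 'rewards' array) and of what an average-undiscounted-return helper over such paths could look like; this is an illustration under that assumption, not the actual rlkit eval_util implementation:

import numpy as np


def get_average_returns_sketch(paths):
    """Mean undiscounted return over a list of rollout dicts.

    Each path is assumed to be a dict holding a 'rewards' array of shape
    (T, 1) or (T,); the return of a path is the sum of its rewards.
    """
    returns = [float(np.sum(path['rewards'])) for path in paths]
    return float(np.mean(returns))


# Hypothetical usage with two synthetic rollouts:
fake_paths = [
    {'rewards': np.ones((10, 1))},      # return = 10.0
    {'rewards': np.full((5, 1), 2.0)},  # return = 10.0
]
print(get_average_returns_sketch(fake_paths))  # -> 10.0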
Example #2
 def _do_eval(self, indices, epoch):
     final_returns = []
     online_returns = []
     for idx in indices:
         all_rets = []
         all_success = []
         for r in range(self.num_evals):
             paths = self.collect_paths(idx, epoch, r)
             all_rets.append(
                 [eval_util.get_average_returns([p]) for p in paths])
             all_success.append(eval_util.get_success_rate(paths))
         success_rate = np.mean(all_success)
         self.eval_statistics['Success_test_task{}'.format(
             self.task_idx)] = success_rate
         final_returns.append(np.mean([a[-1] for a in all_rets]))
         # record online returns for the first n trajectories
         n = min([len(a) for a in all_rets])
         all_rets = [a[:n] for a in all_rets]
         all_rets = np.mean(np.stack(all_rets),
                            axis=0)  # avg return per nth rollout
         online_returns.append(all_rets)
         self.eval_statistics['AverageReturn_test_task{}'.format(
             self.task_idx)] = all_rets
     n = min([len(t) for t in online_returns])
     online_returns = [t[:n] for t in online_returns]
     return final_returns, online_returns
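The truncation to the shortest run before np.stack in the snippet above is what makes the per-rollout averaging well defined when evaluation runs produce different numbers of trajectories. A small standalone sketch of that step, using made-up return lists:

import numpy as np

# Made-up per-evaluation return sequences of unequal length.
all_rets = [[1.0, 2.0, 3.0], [2.0, 4.0], [0.0, 1.0, 5.0]]
n = min(len(a) for a in all_rets)        # shortest run has 2 rollouts
all_rets = [a[:n] for a in all_rets]     # clip the ragged lists to a rectangle
per_rollout_avg = np.mean(np.stack(all_rets), axis=0)
print(per_rollout_avg)                   # [1.         2.33333333]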
Example #3
    def _get_returns_init_from_offline_buffer(self, indices):
        train_returns = []
        for idx in indices:
            self.env.reset_task(idx)
            paths = []
            for _ in range(self.num_steps_per_eval // self.max_path_length):
                init_context = self._reward_decoder_buffer.sample_context(
                    idx, self.embedding_batch_size)
                init_context = ptu.from_numpy(init_context)
                p, _ = self.sampler.obtain_samples(
                    deterministic=self.eval_deterministic,
                    max_samples=self.max_path_length,
                    accum_context=False,
                    max_trajs=1,
                    resample_latent_period=0,
                    update_posterior_period=0,
                    initial_context=init_context,
                    task_idx=idx,
                )
                paths += p

            if self.sparse_rewards:
                for p in paths:
                    sparse_rewards = np.stack(
                        [e['sparse_reward'] for e in p['env_infos']]
                    ).reshape(-1, 1)
                    p['rewards'] = sparse_rewards

            train_returns.append(eval_util.get_average_returns(paths))
        return train_returns
    def evaluate(self, epoch, eval_paths=None):
        statistics = OrderedDict()
        statistics.update(self.eval_statistics)

        logger.log("Collecting samples for evaluation")
        if eval_paths:
            test_paths = eval_paths
        else:
            test_paths = self.get_eval_paths()
        statistics.update(
            eval_util.get_generic_path_information(
                test_paths,
                stat_prefix="Test",
            ))
        if len(self._exploration_paths) > 0:
            statistics.update(
                eval_util.get_generic_path_information(
                    self._exploration_paths,
                    stat_prefix="Exploration",
                ))
        if hasattr(self.env, "log_diagnostics"):
            self.env.log_diagnostics(test_paths, logger=logger)
        if hasattr(self.env, "get_diagnostics"):
            statistics.update(self.env.get_diagnostics(test_paths))

        average_returns = eval_util.get_average_returns(test_paths)
        statistics['AverageReturn'] = average_returns
        for key, value in statistics.items():
            logger.record_tabular(key, value)
        self.need_to_update_eval_statistics = True
Example #5
 def _do_eval(self, indices, epoch):
     final_returns = []
     online_returns = []
     for idx in indices:
         runs, all_rets = [], []
         for r in range(self.num_evals):
             paths = self.collect_paths(idx, epoch, r)
             all_rets.append([eval_util.get_average_returns([p]) for p in paths])
             runs.append(paths)
         all_rets = np.mean(np.stack(all_rets), axis=0) # avg return per nth rollout
         final_returns.append(all_rets[-1])
         online_returns.append(all_rets)
     return final_returns, online_returns
Example #6
    def evaluate(self, epoch):
        """
        Evaluate the policy, e.g. save/print progress.
        :param epoch:
        :return:
        """
        statistics = OrderedDict()
        try:
            statistics.update(self.eval_statistics)
            self.eval_statistics = None
        except (TypeError, AttributeError):
            print('No Stats to Eval')

        logger.log("Collecting samples for evaluation")
        test_paths = self.eval_sampler.obtain_samples()

        statistics.update(
            eval_util.get_generic_path_information(
                test_paths,
                stat_prefix="Test",
            ))
        statistics.update(
            eval_util.get_generic_path_information(
                self._exploration_paths,
                stat_prefix="Exploration",
            ))

        if hasattr(self.env, "log_diagnostics"):
            self.env.log_diagnostics(test_paths)
        if hasattr(self.env, "log_statistics"):
            statistics.update(self.env.log_statistics(test_paths))
        if epoch % self.freq_log_visuals == 0:
            if hasattr(self.env, "log_visuals"):
                self.env.log_visuals(test_paths, epoch,
                                     logger.get_snapshot_dir())

        average_returns = eval_util.get_average_returns(test_paths)
        statistics['AverageReturn'] = average_returns
        for key, value in statistics.items():
            logger.record_tabular(key, value)

        best_statistic = statistics[self.best_key]
        if best_statistic > self.best_statistic_so_far:
            self.best_statistic_so_far = best_statistic
            if self.save_best and epoch >= self.save_best_starting_from_epoch:
                data_to_save = {'epoch': epoch, 'statistics': statistics}
                data_to_save.update(self.get_epoch_snapshot(epoch))
                logger.save_extra_data(data_to_save, 'best.pkl')
                print('\n\nSAVED BEST\n\n')
Example #7
    def collect_data_for_embedding_online_with_logging(self, idx, epoch):
        self.task_idx = idx
        dprint('Task:', idx)
        self.env.reset_task(idx)

        n_exploration_episodes = 10
        n_inference_episodes = 10
        all_init_paths = []
        all_inference_paths = []

        self.enc_replay_buffer.clear_buffer(idx)

        for i in range(n_exploration_episodes):
            initial_z = self.sample_z_from_prior()

            init_paths = self.obtain_eval_paths(idx,
                                                z=initial_z,
                                                eval_task=True)
            all_init_paths += init_paths
            self.enc_replay_buffer.add_paths(idx, init_paths)
        dprint('enc_replay_buffer.task_buffers[idx]._size',
               self.enc_replay_buffer.task_buffers[idx]._size)

        for i in range(n_inference_episodes):
            paths = self.obtain_eval_paths(idx, eval_task=True)
            all_inference_paths += paths
            self.enc_replay_buffer.add_paths(idx, paths)

        # save evaluation rollouts for vis
        # all paths
        with open(
                self.output_dir +
                "/proto-sac-point-mass-fb-16z-init-task{}-{}.pkl".format(
                    idx, epoch), 'wb+') as f:
            pickle.dump(all_init_paths, f, pickle.HIGHEST_PROTOCOL)
        with open(
                self.output_dir +
                "/proto-sac-point-mass-fb-16z-inference-task{}-{}.pkl".format(
                    idx, epoch), 'wb+') as f:
            pickle.dump(all_inference_paths, f, pickle.HIGHEST_PROTOCOL)

        average_inference_returns = [
            eval_util.get_average_returns(paths)
            for paths in all_inference_paths
        ]
        self.eval_statistics['AverageInferenceReturns_test_task{}'.format(
            idx)] = average_inference_returns
    def _do_eval(self, goal_set, epoch):

        final_returns = []
        final_achieved = []
        for goal in goal_set:
            all_rets = []
            all_achieved = []
            for r in range(self.num_evals):
                paths = self.collect_paths(goal, epoch, r)
                all_rets.append(
                    [eval_util.get_average_returns([p]) for p in paths])
                all_achieved.append(
                    [eval_util.get_average_achieved([p]) for p in paths])
            final_returns.append(np.mean([a[-1] for a in all_rets]))
            final_achieved.append(np.mean([a[-1] for a in all_achieved]))

        return final_returns, final_achieved
Example #9
 def _do_eval(self, indices, epoch):
     final_returns = []
     online_returns = []
     for idx in indices:
         all_rets = []
         for r in range(self.num_evals):
             paths = self.collect_paths(idx, epoch, r)
             all_rets.append([eval_util.get_average_returns([p]) for p in paths])
         final_returns.append(np.mean([a[-1] for a in all_rets]))
         # record online returns for the first n trajectories
         n = min([len(a) for a in all_rets])
         all_rets = [a[:n] for a in all_rets]
         all_rets = np.mean(np.stack(all_rets), axis=0) # avg return per nth rollout
         online_returns.append(all_rets)
     n = min([len(t) for t in online_returns])
     online_returns = [t[:n] for t in online_returns]
     return final_returns, online_returns
def eval_alg(policy,
             env,
             num_eval_rollouts,
             eval_deterministic=False,
             max_path_length=1000):
    if eval_deterministic:
        policy = MakeDeterministic(policy)

    eval_sampler = InPlacePathSampler(env=env,
                                      policy=policy,
                                      max_samples=max_path_length *
                                      (num_eval_rollouts + 1),
                                      max_path_length=max_path_length,
                                      policy_uses_pixels=False,
                                      policy_uses_task_params=False,
                                      concat_task_params_to_policy_obs=False)
    test_paths = eval_sampler.obtain_samples()
    average_returns = get_average_returns(test_paths)
    return average_returns
def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        with open('expert_demos_listing.yaml', 'r') as f:
            listings = yaml.safe_load(f.read())
        expert_demos_path = listings[variant['expert_name']]['file_paths'][
            variant['expert_idx']]
        buffer_save_dict = joblib.load(expert_demos_path)
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    policy = joblib.load(variant['policy_checkpoint'])['exploration_policy']
    if variant['eval_deterministic']:
        policy = MakeDeterministic(policy)
    policy.to(ptu.device)

    eval_sampler = PathSampler(env,
                               policy,
                               variant['num_eval_steps'],
                               variant['max_path_length'],
                               no_terminal=variant['no_terminal'],
                               render=variant['render'],
                               render_kwargs=variant['render_kwargs'])
    test_paths = eval_sampler.obtain_samples()
    average_returns = eval_util.get_average_returns(test_paths)
    print(average_returns)

    return 1
Example #12
 def log_statistics(self, paths, split=''):
     self.eval_statistics.update(
         eval_util.get_generic_path_information(
             paths,
             stat_prefix="{}_task{}".format(split, self.task_idx),
         ))
     # TODO(KR) what are these?
     self.eval_statistics.update(
         eval_util.get_generic_path_information(
             self._exploration_paths,
             stat_prefix="Exploration_task{}".format(self.task_idx),
         )
     )  # something is wrong with these exploration paths i'm pretty sure...
     average_returns = eval_util.get_average_returns(paths)
     self.eval_statistics['AverageReturn_{}_task{}'.format(
         split, self.task_idx)] = average_returns
     goal = self.env._goal
     dprint('GoalPosition_{}_task'.format(split))
     dprint(goal)
     self.eval_statistics['GoalPosition_{}_task{}'.format(
         split, self.task_idx)] = goal
Example #13
    def evaluate(self, epoch):
        statistics = OrderedDict()
        statistics.update(self.eval_statistics)
        self.eval_statistics = statistics

        # old_device = ptu.device
        # ptu.device = torch.device('cpu')
        # self.policy.cnn_enc.to(ptu.device)
        # self.policy.task_enc.to(ptu.device)
        # self.policy.qf1.to(ptu.device)

        # self.policy_dataset = PolicyDataset(self, eval_task=False)

        # self.policy_loader = iter(torch.utils.data.DataLoader(self.policy_dataset, batch_size=1,
        #         shuffle=False, pin_memory=True, sampler=None, batch_sampler=None, num_workers=10,
        #         worker_init_fn=None, collate_fn=lambda x: x))

        import time
        # for i in range(10):
        #     t0 = time.time()
        #     paths = self.policy_loader.next()
        #     print((time.time() - t0))

        # # import pdb; pdb.set_trace()

        ### train tasks
        dprint('evaluating on {} train tasks'.format(len(self.train_tasks)))
        train_avg_returns = []
        train_avg_succ = []
        train_avg_len = []
        for idx in self.train_tasks:
            dprint('task {} encoder RB size'.format(idx),
                   self.enc_replay_buffer.task_buffers[idx]._size)
            paths = self.collect_paths(idx, epoch, eval_task=False)

            t0 = time.time()
            # paths = self.policy_loader.next()[0]
            # import pdb; pdb.set_trace()
            train_avg_returns.append(eval_util.get_average_returns(paths))
            train_avg_succ.append(
                [sum([j['succ'] for j in i['env_infos']]) for i in paths])
            train_avg_len.append([len(i['env_infos']) for i in paths])
            print((time.time() - t0))

        # import pdb; pdb.set_trace()

        # ptu.device = old_device
        # self.policy.cnn_enc.to(ptu.device)
        # self.policy.task_enc.to(ptu.device)
        # self.policy.qf1.to(ptu.device)

        ## test tasks
        dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
        test_avg_returns = []
        test_avg_succ = []
        test_avg_len = []
        # This is calculating the embedding online, because every iteration
        # we clear the encoding buffer for the test tasks.

        for idx in np.random.choice(self.eval_tasks,
                                    self.num_evals,
                                    replace=False):
            print('eval task', idx)
            self.task_idx = idx
            self.env.reset_task(idx)

            # collect data for computing the embedding if needed
            if self.eval_embedding_source in ['online', 'initial_pool']:
                pass
            elif self.eval_embedding_source == 'online_exploration_trajectories':
                self.eval_enc_replay_buffer.task_buffers[idx].clear()
                # task embedding sampled from prior and held fixed
                self.collect_data_sampling_from_prior(
                    num_samples=self.num_steps_per_task,
                    resample_z_every_n=self.max_path_length,
                    eval_task=True)
            elif self.eval_embedding_source == 'online_on_policy_trajectories':
                self.eval_enc_replay_buffer.task_buffers[idx].clear()
                # half the data from z sampled from prior, the other half from z sampled from posterior
                self.collect_data_online(idx=idx,
                                         num_samples=self.num_steps_per_task,
                                         eval_task=True)
            else:
                raise Exception("Invalid option for computing eval embedding")

            dprint('task {} encoder RB size'.format(idx),
                   self.eval_enc_replay_buffer.task_buffers[idx]._size)
            test_paths = self.collect_paths(idx, epoch, eval_task=True)

            test_avg_returns.append(eval_util.get_average_returns(test_paths))
            test_avg_succ.append(
                [sum([j['succ'] for j in i['env_infos']]) for i in test_paths])
            test_avg_len.append([len(i['env_infos']) for i in test_paths])

            if self.use_information_bottleneck:
                z_mean = np.mean(
                    np.abs(ptu.get_numpy(self.policy.z_dists[0].mean)))
                z_sig = np.mean(ptu.get_numpy(self.policy.z_dists[0].variance))
                self.eval_statistics['Z mean eval'] = z_mean
                self.eval_statistics['Z variance eval'] = z_sig

            # TODO(KR) what does this do
            if hasattr(self.env, "log_diagnostics"):
                self.env.log_diagnostics(test_paths)

        avg_train_return = np.mean(train_avg_returns)
        avg_test_return = np.mean(test_avg_returns)
        avg_train_succ = np.mean(train_avg_succ, axis=0)
        avg_test_succ = np.mean(test_avg_succ, axis=0)
        avg_train_len = np.mean(train_avg_len, axis=0)
        avg_test_len = np.mean(test_avg_len, axis=0)

        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return

        for i, s in enumerate(avg_train_succ):
            self.eval_statistics['Succ_train_tasks_%s' % i] = s
        for i, s in enumerate(avg_test_succ):
            self.eval_statistics['Succ_test_tasks_%s' % i] = s

        for i, s in enumerate(avg_train_len):
            self.eval_statistics['Len_train_tasks_%s' % i] = s
        for i, s in enumerate(avg_test_len):
            self.eval_statistics['Len_test_tasks_%s' % i] = s

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(test_paths)

        if self.plotter:
            self.plotter.draw()
Example #14
    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### sample trajectories from prior for debugging / visualization
        if self.dump_eval_paths:
            # 100 arbitrarily chosen for visualizations of point_robot trajectories
            # just want stochasticity of z, not the policy
            self.agent.clear_z()
            prior_paths, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length * 20,
                accum_context=False,
                resample_latent_period=self.exploration_resample_latent_period,
                # following PEARL protocol
                update_posterior_period=self.exploration_update_posterior_period,
            )
            logger.save_extra_data(
                prior_paths,
                file_name='eval_trajectories/prior-epoch{}'.format(epoch))
        ### train tasks
        if self._num_tasks_to_eval_on >= len(self.train_task_indices):
            indices = self.train_task_indices
        else:
            # eval on a subset of train tasks in case num train tasks is huge
            indices = np.random.choice(self.offline_train_task_indices,
                                       self._num_tasks_to_eval_on)
        # logger.log('evaluating on {} train tasks'.format(len(indices)))
        ### eval train tasks with posterior sampled from the training replay buffer
        train_returns = []
        for idx in indices:
            self.env.reset_task(idx)
            paths = []
            for _ in range(self.num_steps_per_eval // self.max_path_length):
                # init_context = self.sample_context(idx)
                if self.use_meta_learning_buffer:
                    init_context = self.meta_replay_buffer._sample_contexts(
                        [idx], self.embedding_batch_size)
                else:
                    init_context = self.enc_replay_buffer.sample_context(
                        idx, self.embedding_batch_size)
                if self.eval_data_collector:
                    p = self.eval_data_collector.collect_new_paths(
                        # TODO: also cap num trajs
                        num_steps=self.max_path_length,
                        max_path_length=self.max_path_length,
                        discard_incomplete_paths=False,
                        accum_context=False,
                        resample_latent_period=0,
                        update_posterior_period=0,
                        initial_context=init_context,
                        task_idx=idx,
                    )
                else:
                    init_context = ptu.from_numpy(init_context)
                    # TODO: replace with sampler
                    # self.agent.infer_posterior(context)
                    p, _ = self.sampler.obtain_samples(
                        deterministic=self.eval_deterministic,
                        max_samples=self.max_path_length,
                        accum_context=False,
                        max_trajs=1,
                        resample_latent_period=0,
                        update_posterior_period=0,
                        initial_context=init_context,
                        task_idx=idx,
                    )
                paths += p

            if self.sparse_rewards:
                for p in paths:
                    sparse_rewards = np.stack(
                        [e['sparse_reward'] for e in p['env_infos']]
                    ).reshape(-1, 1)
                    p['rewards'] = sparse_rewards

            train_returns.append(eval_util.get_average_returns(paths))

        train_returns_offline_buffer = self._get_returns_init_from_offline_buffer(
            indices)
        # train_returns = np.mean(train_returns)
        ### eval train tasks with on-policy data to match eval of test tasks
        train_final_returns, train_online_returns, train_task_to_final_context = (
            self._do_eval(indices, epoch))
        # logger.log('train online returns')
        # logger.log(train_online_returns)

        ### test tasks
        # logger.log('evaluating on {} test tasks'.format(len(self.eval_task_indices)))
        test_final_returns, test_online_returns, test_task_to_final_context = (
            self._do_eval(self.eval_task_indices, epoch))
        # logger.log('test online returns')
        # logger.log(test_online_returns)
        # save the final posterior
        self.agent.log_diagnostics(self.eval_statistics)

        z_dist_log = self._get_z_distribution_log(train_task_to_final_context)
        append_log(self.eval_statistics,
                   z_dist_log,
                   prefix='trainer/train_tasks/')

        if hasattr(self.env, "log_diagnostics"):
            self.env.log_diagnostics(paths, prefix=None)

        avg_train_online_return = np.mean(np.stack(train_online_returns),
                                          axis=0)
        avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
        self.eval_statistics.update(
            eval_util.create_stats_ordered_dict(
                'eval/init_from_offline_buffer/train_tasks/all_returns',
                train_returns_offline_buffer,
            ))
        self.eval_statistics.update(
            eval_util.create_stats_ordered_dict(
                'eval/init_from_buffer/train_tasks/all_returns',
                train_returns,
            ))
        self.eval_statistics.update(
            eval_util.create_stats_ordered_dict(
                'eval/adaptation/train_tasks/final_returns',
                train_final_returns,
            ))
        self.eval_statistics.update(
            eval_util.create_stats_ordered_dict(
                'eval/adaptation/test_tasks/final_returns',
                test_final_returns,
            ))
        self.eval_statistics.update(
            eval_util.create_stats_ordered_dict(
                'eval/adaptation/train_tasks/all_returns',
                avg_train_online_return,
            ))
        self.eval_statistics.update(
            eval_util.create_stats_ordered_dict(
                'eval/adaptation/test_tasks/all_returns',
                avg_test_online_return,
            ))

        if len(self.fake_task_idx_to_z) > 0:
            self_generated_indices = np.random.choice(
                np.array(list(self.fake_task_idx_to_z.keys())),
                self._num_tasks_to_eval_on,
            )
            self_generated_final_returns, self_generated_online_returns, _ = self._do_eval(
                self_generated_indices, epoch)
            avg_self_generated_return = np.mean(
                np.stack(self_generated_online_returns))
            self.eval_statistics.update(
                eval_util.create_stats_ordered_dict(
                    'eval/adaptation/generated_tasks/final_returns',
                    self_generated_final_returns,
                ))
            self.eval_statistics.update(
                eval_util.create_stats_ordered_dict(
                    'eval/adaptation/generated_tasks/all_returns',
                    avg_self_generated_return,
                ))

        try:
            import os
            import psutil
            process = psutil.Process(os.getpid())
            self.eval_statistics['RAM Usage (Mb)'] = int(
                process.memory_info().rss / 1000000)
        except ImportError:
            pass
        logger.save_extra_data(avg_train_online_return,
                               file_name='online-train-epoch{}'.format(epoch))
        logger.save_extra_data(avg_test_online_return,
                               file_name='online-test-epoch{}'.format(epoch))

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(paths)

        if self.plotter:
            self.plotter.draw()
Example #15
def get_custom_generic_path_information(paths,
                                        path_length,
                                        reward_scale,
                                        stat_prefix=''):
    """
    Get an OrderedDict with a bunch of statistic names and values.

    Differs from the normal rlkit utility function in the following ways:
    - Grabs normalized reward / return values, where reward is normalized to 1.0.
    - Grabs the cumulative reward accumulated up to the @path_length timestep.
    """
    statistics = OrderedDict()
    returns = [sum(path["rewards"]) for path in paths]

    # Grab returns accumulated up to specified timestep
    expl_returns = [sum(path["rewards"][:path_length]) for path in paths]

    rewards = np.vstack([path["rewards"] for path in paths])
    # norm_rewards = [path["rewards"] / reward_scale for path in paths]
    statistics.update(
        eval_util.create_stats_ordered_dict('Rewards',
                                            rewards,
                                            stat_prefix=stat_prefix))
    statistics.update(
        eval_util.create_stats_ordered_dict('Returns',
                                            returns,
                                            stat_prefix=stat_prefix))

    # Add extra stats
    statistics.update(
        eval_util.create_stats_ordered_dict('ExplReturns',
                                            expl_returns,
                                            stat_prefix=stat_prefix))

    actions = [path["actions"] for path in paths]
    if len(actions[0].shape) == 1:
        actions = np.hstack([path["actions"] for path in paths])
    else:
        actions = np.vstack([path["actions"] for path in paths])
    statistics.update(
        eval_util.create_stats_ordered_dict('Actions',
                                            actions,
                                            stat_prefix=stat_prefix))
    statistics['Num Paths'] = len(paths)
    statistics[stat_prefix +
               'Average Returns'] = eval_util.get_average_returns(paths)

    for info_key in ['env_infos', 'agent_infos']:
        if info_key in paths[0]:
            all_env_infos = [
                ppp.list_of_dicts__to__dict_of_lists(p[info_key])
                for p in paths
            ]
            for k in all_env_infos[0].keys():
                final_ks = np.array([info[k][-1] for info in all_env_infos])
                first_ks = np.array([info[k][0] for info in all_env_infos])
                all_ks = np.concatenate([info[k] for info in all_env_infos])
                statistics.update(
                    eval_util.create_stats_ordered_dict(
                        stat_prefix + k,
                        final_ks,
                        stat_prefix='{}/final/'.format(info_key),
                    ))
                statistics.update(
                    eval_util.create_stats_ordered_dict(
                        stat_prefix + k,
                        first_ks,
                        stat_prefix='{}/initial/'.format(info_key),
                    ))
                statistics.update(
                    eval_util.create_stats_ordered_dict(
                        stat_prefix + k,
                        all_ks,
                        stat_prefix='{}/'.format(info_key),
                    ))

    return statistics
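A hypothetical call to get_custom_generic_path_information with synthetic rollouts, assuming the module's own eval_util / ppp imports are available; the env_infos and agent_infos field names used here ('dist_to_goal', 'entropy') are illustrative placeholders, not keys from any real environment:

import numpy as np

# Two synthetic 20-step rollouts in the dict layout the function expects.
fake_paths = [
    {
        'rewards': np.random.rand(20, 1),
        'actions': np.random.rand(20, 3),
        'env_infos': [{'dist_to_goal': float(d)} for d in np.linspace(1.0, 0.1, 20)],
        'agent_infos': [{'entropy': 0.5} for _ in range(20)],
    }
    for _ in range(2)
]

stats = get_custom_generic_path_information(
    fake_paths, path_length=10, reward_scale=1.0, stat_prefix='Test')
print(stats['Num Paths'])            # 2
print(stats['TestAverage Returns'])  # average return over the two rollouts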
Example #16
def get_traffic_path_information(paths, stat_prefix=''):
    """
    Get an OrderedDict with a bunch of statistic names and values.
    """
    statistics = OrderedDict()
    returns = [sum(path["rewards"]) for path in paths]

    rewards = np.vstack([path["rewards"] for path in paths])
    statistics.update(
        create_stats_ordered_dict('Rewards', rewards, stat_prefix=stat_prefix))
    statistics.update(
        create_stats_ordered_dict('Returns', returns, stat_prefix=stat_prefix))
    actions = [path["actions"] for path in paths]
    if len(actions[0].shape) == 1:
        actions = np.hstack([path["actions"] for path in paths])
    else:
        actions = np.vstack([path["actions"] for path in paths])
    statistics.update(
        create_stats_ordered_dict('Actions', actions, stat_prefix=stat_prefix))
    statistics['Num Paths'] = len(paths)
    statistics[stat_prefix + 'Average Returns'] = get_average_returns(paths)

    num_collision, num_block, num_outroad, num_success, num_timeout = 0, 0, 0, 0, 0
    log_path = logger.get_snapshot_dir()
    for pid, path in enumerate(paths):
        event = path["env_infos"][-1]['event']
        if event == 'collision':
            num_collision += 1
        elif event == 'block':
            num_block += 1
        elif event == 'outroad':
            num_outroad += 1
        elif event == 'goal':
            num_success += 1
        else:
            num_timeout += 1
    statistics['Num Collision'] = num_collision
    statistics['Num Block'] = num_block
    statistics['Num Outroad'] = num_outroad
    statistics['Num Success'] = num_success
    statistics['Num Timeout'] = num_timeout

    for info_key in ['agent_infos']:
        if info_key in paths[0]:
            all_env_infos = [
                ppp.list_of_dicts__to__dict_of_lists(p[info_key])
                for p in paths
            ]
            for k in all_env_infos[0].keys():
                final_ks = np.array([info[k][-1] for info in all_env_infos])
                first_ks = np.array([info[k][0] for info in all_env_infos])
                all_ks = np.concatenate([info[k] for info in all_env_infos])
                statistics.update(
                    create_stats_ordered_dict(
                        stat_prefix + k,
                        final_ks,
                        stat_prefix='{}/final/'.format(info_key),
                    ))
                statistics.update(
                    create_stats_ordered_dict(
                        stat_prefix + k,
                        first_ks,
                        stat_prefix='{}/initial/'.format(info_key),
                    ))
                statistics.update(
                    create_stats_ordered_dict(
                        stat_prefix + k,
                        all_ks,
                        stat_prefix='{}/'.format(info_key),
                    ))

    return statistics
Example #17
    def evaluate(self, epoch):
        statistics = OrderedDict()
        statistics.update(self.eval_statistics)
        self.eval_statistics = statistics

        ### train tasks
        dprint('evaluating on {} train tasks'.format(len(self.train_tasks)))
        train_avg_returns = []
        for idx in self.train_tasks:
            dprint('task {} encoder RB size'.format(idx),
                   self.enc_replay_buffer.task_buffers[idx]._size)
            paths = self.collect_paths(idx, epoch, eval_task=False)
            train_avg_returns.append(eval_util.get_average_returns(paths))

        ### test tasks
        dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
        test_avg_returns = []
        # This is calculating the embedding online, because every iteration
        # we clear the encoding buffer for the test tasks.
        for idx in self.eval_tasks:
            self.task_idx = idx
            self.env.reset_task(idx)

            # collect data for computing the embedding if needed
            if self.eval_embedding_source in ['online', 'initial_pool']:
                pass
            elif self.eval_embedding_source == 'online_exploration_trajectories':
                self.eval_enc_replay_buffer.task_buffers[idx].clear()
                # task embedding sampled from prior and held fixed
                self.collect_data_sampling_from_prior(
                    num_samples=self.num_steps_per_task,
                    resample_z_every_n=self.max_path_length,
                    eval_task=True)
            elif self.eval_embedding_source == 'online_on_policy_trajectories':
                self.eval_enc_replay_buffer.task_buffers[idx].clear()
                # half the data from z sampled from prior, the other half from z sampled from posterior
                self.collect_data_online(idx=idx,
                                         num_samples=self.num_steps_per_task,
                                         eval_task=True)
            else:
                raise Exception("Invalid option for computing eval embedding")

            dprint('task {} encoder RB size'.format(idx),
                   self.eval_enc_replay_buffer.task_buffers[idx]._size)
            test_paths = self.collect_paths(idx, epoch, eval_task=True)

            test_avg_returns.append(eval_util.get_average_returns(test_paths))

            if self.use_information_bottleneck:
                z_mean = np.mean(
                    np.abs(ptu.get_numpy(self.policy.z_dists[0].mean)))
                z_sig = np.mean(ptu.get_numpy(self.policy.z_dists[0].variance))
                self.eval_statistics['Z mean eval'] = z_mean
                self.eval_statistics['Z variance eval'] = z_sig

            # TODO(KR) what does this do
            if hasattr(self.env, "log_diagnostics"):
                self.env.log_diagnostics(test_paths)

        avg_train_return = np.mean(train_avg_returns)
        avg_test_return = np.mean(test_avg_returns)
        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(test_paths)

        if self.plotter:
            self.plotter.draw()
    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### sample trajectories from prior for debugging / visualization
        if self.dump_eval_paths:
            # 100 arbitrarily chosen for visualizations of point_robot trajectories
            # just want stochasticity of z, not the policy
            self.agent.clear_z()
            prior_paths, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length * 20,
                accum_context=False,
                resample=1,
                testing=True)
            logger.save_extra_data(
                prior_paths,
                path='eval_trajectories/prior-epoch{}'.format(epoch))

        ### train tasks
        # eval on a subset of train tasks for speed
        indices = np.random.choice(self.train_tasks, len(self.eval_tasks))
        eval_util.dprint('evaluating on {} train tasks'.format(len(indices)))
        ### eval train tasks with posterior sampled from the training replay buffer
        train_returns = []
        for idx in indices:
            self.task_idx = idx
            self.env.reset_task(idx)
            paths = []
            for _ in range(self.num_steps_per_eval // self.max_path_length):
                context = self.sample_context(idx)
                self.agent.infer_posterior(context)
                p, _ = self.sampler.obtain_samples(
                    deterministic=self.eval_deterministic,
                    max_samples=self.max_path_length,
                    accum_context=False,
                    max_trajs=1,
                    resample=np.inf,
                    testing=True)
                paths += p

            if self.sparse_rewards:
                for p in paths:
                    sparse_rewards = np.stack(
                        [e['sparse_reward'] for e in p['env_infos']]
                    ).reshape(-1, 1)
                    p['rewards'] = sparse_rewards

            train_returns.append(eval_util.get_average_returns(paths))
        train_returns = np.mean(train_returns)
        ### eval train tasks with on-policy data to match eval of test tasks
        train_final_returns, train_online_returns = self._do_eval(
            indices, epoch)
        eval_util.dprint('train online returns')
        eval_util.dprint(train_online_returns)

        ### test tasks
        eval_util.dprint('evaluating on {} test tasks'.format(
            len(self.eval_tasks)))
        test_final_returns, test_online_returns = self._do_eval(
            self.eval_tasks, epoch)
        eval_util.dprint('test online returns')
        eval_util.dprint(test_online_returns)

        # save the final posterior
        self.agent.log_diagnostics(self.eval_statistics)

        if hasattr(self.env, "log_diagnostics"):
            self.env.log_diagnostics(paths, prefix=None)

        avg_train_return = np.mean(train_final_returns)
        avg_test_return = np.mean(test_final_returns)
        avg_train_online_return = np.mean(np.stack(train_online_returns),
                                          axis=0)
        avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
        self.eval_statistics[
            'AverageTrainReturn_all_train_tasks'] = train_returns
        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
        logger.save_extra_data(avg_train_online_return,
                               path='online-train-epoch{}'.format(epoch))
        logger.save_extra_data(avg_test_online_return,
                               path='online-test-epoch{}'.format(epoch))

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(paths)

        if self.plotter:
            self.plotter.draw()
    def evaluate(self, epoch):
        """
        Evaluate the policy, e.g. save/print progress.
        :param epoch:
        :return:
        """
        statistics = OrderedDict()
        try:
            statistics.update(self.eval_statistics)
            self.eval_statistics = None
        except (TypeError, AttributeError):
            print('No Stats to Eval')

        logger.log("Collecting samples for evaluation")

        test_paths = []
        sampled_task_params = self.test_task_params_sampler.sample_unique(
            self.num_eval_tasks)
        for i in range(self.num_eval_tasks):
            env = self.env_factory(sampled_task_params[i])
            for _ in range(self.num_rollouts_per_task_per_eval):
                test_paths.append(
                    rollout(
                        env,
                        self.get_eval_policy(sampled_task_params[i]),
                        self.max_path_length,
                        no_terminal=self.no_terminal,
                        render=self.render,
                        render_kwargs=self.render_kwargs,
                    ))

        statistics.update(
            eval_util.get_generic_path_information(
                test_paths,
                stat_prefix="Test",
            ))
        statistics.update(
            eval_util.get_generic_path_information(
                self._exploration_paths,
                stat_prefix="Exploration",
            ))

        if hasattr(self.env, "log_diagnostics"):
            self.env.log_diagnostics(test_paths)
        if hasattr(self.env, "log_statistics"):
            statistics.update(self.env.log_statistics(test_paths))
        if epoch % self.freq_log_visuals == 0:
            if hasattr(self.env, "log_visuals"):
                self.env.log_visuals(test_paths, epoch,
                                     logger.get_snapshot_dir())

        average_returns = eval_util.get_average_returns(test_paths)
        statistics['AverageReturn'] = average_returns
        for key, value in statistics.items():
            logger.record_tabular(key, value)

        best_statistic = statistics[self.best_key]
        if best_statistic > self.best_statistic_so_far:
            self.best_statistic_so_far = best_statistic
            if self.save_best and epoch >= self.save_best_starting_from_epoch:
                data_to_save = {'epoch': epoch, 'statistics': statistics}
                data_to_save.update(self.get_epoch_snapshot(epoch))
                logger.save_extra_data(data_to_save, 'best.pkl')
                print('\n\nSAVED BEST\n\n')