Example #1
    def obtain_eval_paths(self, idx, eval_task=False, deterministic=False):
        '''
        collect paths with current policy
        if online, task encoding will be updated after each transition
        otherwise, sample a task encoding once and keep it fixed
        '''
        is_online = (self.eval_embedding_source == 'online')
        self.policy.clear_z()

        if not is_online:
            self.sample_z_from_posterior(idx, eval_task=eval_task)

        # import pdb; pdb.set_trace()

        dprint('task encoding ', self.policy.z)

        test_paths = self.eval_sampler.obtain_samples(
            deterministic=deterministic,
            is_online=is_online,
            num_rollouts=int(np.ceil(self.num_steps_per_task /
                                     self.max_path_length)))

        # import pdb; pdb.set_trace()
        if self.sparse_rewards:
            for p in test_paths:
                p['rewards'] = ptu.sparsify_rewards(p['rewards'])
        return test_paths
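
Example #1 post-processes dense rewards with ptu.sparsify_rewards. As a point of reference, here is a minimal stand-in sketch, assuming the common pattern of masking out dense negative-distance rewards outside a goal radius; the goal_radius default and the plain-NumPy implementation are assumptions, not the project's actual utility.

import numpy as np

def sparsify_rewards(rewards, goal_radius=0.2):
    # rewards are assumed to be negative distances to the goal;
    # keep them only when the agent is within goal_radius, zero otherwise
    rewards = np.asarray(rewards, dtype=np.float64)
    mask = (rewards >= -goal_radius).astype(np.float64)
    return rewards * mask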
Example #2
    def collect_paths(self, idx, epoch, eval_task=False):
        self.task_idx = idx
        dprint('Task:', idx)
        self.env.reset_task(idx)
        # if eval_task:
        #     num_evals = self.num_evals
        # else:
        num_evals = 1

        paths = []
        for _ in range(num_evals):
            paths += self.obtain_eval_paths(idx,
                                            eval_task=eval_task,
                                            deterministic=True)

        # goal = self.env._goal
        # for path in paths:
        #     path['goal'] = goal # goal

        # save the paths for visualization, only useful for point mass
        if self.dump_eval_paths:
            split = 'test' if eval_task else 'train'
            logger.save_extra_data(
                paths,
                path='eval_trajectories/{}-task{}-epoch{}'.format(
                    split, idx, epoch))
        return paths
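
Example #2 dumps evaluation rollouts through logger.save_extra_data so they can be visualized later. A short usage sketch for reading such a dump back and plotting point-mass trajectories; the .pkl extension, the exact file name, and the assumption that the first two observation dimensions are x/y position are guesses about the logger's layout, not guarantees.

import pickle
import matplotlib.pyplot as plt

def plot_eval_trajectories(pkl_path):
    # load the list of path dicts written by the evaluation code
    with open(pkl_path, 'rb') as f:
        paths = pickle.load(f)
    for path in paths:
        obs = path['observations']
        plt.plot(obs[:, 0], obs[:, 1], alpha=0.5)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()

# hypothetical file name, mirroring the format string used above:
# plot_eval_trajectories('eval_trajectories/test-task0-epoch10.pkl')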
Example #3
    def collect_data_for_embedding_online_with_logging(self, idx, epoch):
        self.task_idx = idx
        dprint('Task:', idx)
        self.env.reset_task(idx)

        n_exploration_episodes = 10
        n_inference_episodes = 10
        all_init_paths = []
        all_inference_paths = []

        self.enc_replay_buffer.clear_buffer(idx)

        for i in range(n_exploration_episodes):
            initial_z = self.sample_z_from_prior()

            init_paths = self.obtain_eval_paths(idx,
                                                z=initial_z,
                                                eval_task=True)
            all_init_paths += init_paths
            self.enc_replay_buffer.add_paths(idx, init_paths)
        dprint('enc_replay_buffer.task_buffers[idx]._size',
               self.enc_replay_buffer.task_buffers[idx]._size)

        for i in range(n_inference_episodes):
            paths = self.obtain_eval_paths(idx, eval_task=True)
            # keep each inference episode's paths together so that
            # per-episode returns can be computed below
            all_inference_paths += [paths]
            self.enc_replay_buffer.add_paths(idx, paths)

        # save evaluation rollouts for vis
        # all paths
        with open(
                self.output_dir +
                "/proto-sac-point-mass-fb-16z-init-task{}-{}.pkl".format(
                    idx, epoch), 'wb+') as f:
            pickle.dump(all_init_paths, f, pickle.HIGHEST_PROTOCOL)
        with open(
                self.output_dir +
                "/proto-sac-point-mass-fb-16z-inference-task{}-{}.pkl".format(
                    idx, epoch), 'wb+') as f:
            pickle.dump(all_inference_paths, f, pickle.HIGHEST_PROTOCOL)

        average_inference_returns = [
            eval_util.get_average_returns(paths)
            for paths in all_inference_paths
        ]
        self.eval_statistics['AverageInferenceReturns_test_task{}'.format(
            idx)] = average_inference_returns
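
The statistics in Example #3 (and in most examples below) go through eval_util.get_average_returns. A minimal sketch of what it is assumed to compute: the mean undiscounted return over a list of paths, each carrying a per-step 'rewards' array.

import numpy as np

def get_average_returns(paths):
    # sum rewards within each path, then average across paths
    returns = [float(np.sum(path['rewards'])) for path in paths]
    return float(np.mean(returns))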
Example #4
    def log_statistics(self, paths, split=''):
        self.eval_statistics.update(
            eval_util.get_generic_path_information(
                paths,
                stat_prefix="{}_task{}".format(split, self.task_idx),
            ))
        # TODO(KR) what are these?
        self.eval_statistics.update(
            eval_util.get_generic_path_information(
                self._exploration_paths,
                stat_prefix="Exploration_task{}".format(self.task_idx),
            )
        )  # something is wrong with these exploration paths i'm pretty sure...
        average_returns = eval_util.get_average_returns(paths)
        self.eval_statistics['AverageReturn_{}_task{}'.format(
            split, self.task_idx)] = average_returns
        goal = self.env._goal
        dprint('GoalPosition_{}_task'.format(split))
        dprint(goal)
        self.eval_statistics['GoalPosition_{}_task{}'.format(
            split, self.task_idx)] = goal
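
Example #4 also relies on eval_util.get_generic_path_information. A simplified stand-in showing the shape of the dictionary it is assumed to produce, namely prefixed summary statistics of the path returns; the real helper likely reports more quantities (reward, action and length statistics).

from collections import OrderedDict
import numpy as np

def get_generic_path_information(paths, stat_prefix=''):
    returns = np.array([np.sum(p['rewards']) for p in paths])
    stats = OrderedDict()
    stats[stat_prefix + ' Returns Mean'] = float(np.mean(returns))
    stats[stat_prefix + ' Returns Std'] = float(np.std(returns))
    stats[stat_prefix + ' Returns Max'] = float(np.max(returns))
    stats[stat_prefix + ' Returns Min'] = float(np.min(returns))
    stats[stat_prefix + ' Num Paths'] = len(paths)
    return stats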
Example #5
    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### train tasks
        # eval on a subset of train tasks for speed
        if self.eval_train_tasks:
            train_final_returns, train_online_returns = self._do_eval(
                self.train_tasks, epoch)
            eval_util.dprint('train online returns')
            eval_util.dprint(train_online_returns)

            avg_train_return = np.mean(train_final_returns)
            self.eval_statistics[
                'AverageReturn_all_train_tasks'] = avg_train_return
            for i, _ret in enumerate(train_final_returns):
                self.eval_statistics['train_task' + str(i) + '_return'] = _ret

        ### test tasks
        if len(self.eval_tasks) > 0:
            test_final_returns, test_online_returns = self._do_eval(
                self.eval_tasks, epoch)
            eval_util.dprint('test online returns')
            eval_util.dprint(test_online_returns)

            avg_test_return = np.mean(test_final_returns)
            self.eval_statistics[
                'AverageReturn_all_test_tasks'] = avg_test_return
            for i, _ret in enumerate(test_final_returns):
                self.eval_statistics['eval_task' + str(i) + '_return'] = _ret

        self.agent.log_diagnostics(self.eval_statistics)

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.plotter:
            self.plotter.draw()
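
Examples #5 and #6 delegate the per-task rollouts to self._do_eval(indices, epoch). A sketch of the assumed contract, written as a free function over the algorithm object: for each task, collect evaluation rollouts, keep every rollout's return as the online adaptation curve, and report the last rollout's return as the final score. The exact averaging in the real method may differ.

import numpy as np

def do_eval(algo, indices, epoch):
    final_returns, online_returns = [], []
    for idx in indices:
        # collect_paths is assumed to behave like Example #2:
        # a list of evaluation rollouts for this task
        paths = algo.collect_paths(idx, epoch, eval_task=True)
        returns = [float(np.sum(p['rewards'])) for p in paths]
        online_returns.append(returns)
        final_returns.append(returns[-1])
    return final_returns, online_returns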
Example #6
    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### test tasks
        eval_util.dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
        test_final_returns, test_online_returns = self._do_eval(self.eval_tasks, epoch)
        eval_util.dprint('test online returns')
        eval_util.dprint(test_online_returns)

        # save the final posterior
        self.agent.log_diagnostics(self.eval_statistics)

        avg_test_return = np.mean(test_final_returns)
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.plotter:
            self.plotter.draw()
Example #7
    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### sample trajectories from prior for debugging / visualization
        if self.dump_eval_paths:
            # 100 arbitrarily chosen for visualizations of point_robot trajectories
            # just want stochasticity of z, not the policy
            self.agent.clear_z()
            prior_paths, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length * 20,
                accum_context=False,
                resample=1,
                testing=True)
            logger.save_extra_data(
                prior_paths,
                path='eval_trajectories/prior-epoch{}'.format(epoch))

        ### train tasks
        # eval on a subset of train tasks for speed
        indices = np.random.choice(self.train_tasks, len(self.eval_tasks))
        eval_util.dprint('evaluating on {} train tasks'.format(len(indices)))
        ### eval train tasks with posterior sampled from the training replay buffer
        train_returns = []
        for idx in indices:
            self.task_idx = idx
            self.env.reset_task(idx)
            paths = []
            for _ in range(self.num_steps_per_eval // self.max_path_length):
                context = self.sample_context(idx)
                self.agent.infer_posterior(context)
                p, _ = self.sampler.obtain_samples(
                    deterministic=self.eval_deterministic,
                    max_samples=self.max_path_length,
                    accum_context=False,
                    max_trajs=1,
                    resample=np.inf,
                    testing=True)
                paths += p

            if self.sparse_rewards:
                for p in paths:
                    # np.stack needs a sequence, not a generator expression
                    sparse_rewards = np.stack(
                        [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                    p['rewards'] = sparse_rewards

            train_returns.append(eval_util.get_average_returns(paths))
        train_returns = np.mean(train_returns)
        ### eval train tasks with on-policy data to match eval of test tasks
        train_final_returns, train_online_returns = self._do_eval(
            indices, epoch)
        eval_util.dprint('train online returns')
        eval_util.dprint(train_online_returns)

        ### test tasks
        eval_util.dprint('evaluating on {} test tasks'.format(
            len(self.eval_tasks)))
        test_final_returns, test_online_returns = self._do_eval(
            self.eval_tasks, epoch)
        eval_util.dprint('test online returns')
        eval_util.dprint(test_online_returns)

        # save the final posterior
        self.agent.log_diagnostics(self.eval_statistics)

        if hasattr(self.env, "log_diagnostics"):
            self.env.log_diagnostics(paths, prefix=None)

        avg_train_return = np.mean(train_final_returns)
        avg_test_return = np.mean(test_final_returns)
        avg_train_online_return = np.mean(np.stack(train_online_returns),
                                          axis=0)
        avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
        self.eval_statistics[
            'AverageTrainReturn_all_train_tasks'] = train_returns
        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
        logger.save_extra_data(avg_train_online_return,
                               path='online-train-epoch{}'.format(epoch))
        logger.save_extra_data(avg_test_online_return,
                               path='online-test-epoch{}'.format(epoch))

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(paths)

        if self.plotter:
            self.plotter.draw()
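
Example #7 conditions the agent by calling self.sample_context(idx) before agent.infer_posterior(context). A sketch of what sample_context is assumed to do: draw a random batch of stored transitions for the task and pack observation, action and reward into a single context array. The random_batch buffer call and the embedding_batch_size attribute are assumptions about this codebase, not confirmed APIs.

import numpy as np

def sample_context(algo, idx):
    batch = algo.enc_replay_buffer.random_batch(
        idx, batch_size=algo.embedding_batch_size)
    # concatenate (o, a, r) per transition; rewards are assumed to be (N, 1)
    context = np.concatenate(
        [batch['observations'], batch['actions'], batch['rewards']], axis=-1)
    # add a leading task dimension: (1, batch_size, obs_dim + act_dim + 1)
    return context[None, ...]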
Example #8
    def evaluate(self, epoch):
        statistics = OrderedDict()
        statistics.update(self.eval_statistics)
        self.eval_statistics = statistics

        # old_device = ptu.device
        # ptu.device = torch.device('cpu')
        # self.policy.cnn_enc.to(ptu.device)
        # self.policy.task_enc.to(ptu.device)
        # self.policy.qf1.to(ptu.device)

        # self.policy_dataset = PolicyDataset(self, eval_task=False)

        # self.policy_loader = iter(torch.utils.data.DataLoader(self.policy_dataset, batch_size=1,
        #         shuffle=False, pin_memory=True, sampler=None, batch_sampler=None, num_workers=10,
        #         worker_init_fn=None, collate_fn=lambda x: x))

        import time
        # for i in range(10):
        #     t0 = time.time()
        #     paths = self.policy_loader.next()
        #     print((time.time() - t0))

        # # import pdb; pdb.set_trace()

        ### train tasks
        dprint('evaluating on {} train tasks'.format(len(self.train_tasks)))
        train_avg_returns = []
        train_avg_succ = []
        train_avg_len = []
        for idx in self.train_tasks:
            dprint('task {} encoder RB size'.format(idx),
                   self.enc_replay_buffer.task_buffers[idx]._size)
            paths = self.collect_paths(idx, epoch, eval_task=False)

            t0 = time.time()
            # paths = self.policy_loader.next()[0]
            # import pdb; pdb.set_trace()
            train_avg_returns.append(eval_util.get_average_returns(paths))
            train_avg_succ.append(
                [sum([j['succ'] for j in i['env_infos']]) for i in paths])
            train_avg_len.append([len(i['env_infos']) for i in paths])
            print((time.time() - t0))

        # import pdb; pdb.set_trace()

        # ptu.device = old_device
        # self.policy.cnn_enc.to(ptu.device)
        # self.policy.task_enc.to(ptu.device)
        # self.policy.qf1.to(ptu.device)

        ## test tasks
        dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
        test_avg_returns = []
        test_avg_succ = []
        test_avg_len = []
        # This is calculating the embedding online, because every iteration
        # we clear the encoding buffer for the test tasks.

        for idx in np.random.choice(self.eval_tasks,
                                    self.num_evals,
                                    replace=False):
            print('eval task', idx)
            self.task_idx = idx
            self.env.reset_task(idx)

            # collect data for computing the embedding, if needed
            if self.eval_embedding_source in ['online', 'initial_pool']:
                pass
            elif self.eval_embedding_source == 'online_exploration_trajectories':
                self.eval_enc_replay_buffer.task_buffers[idx].clear()
                # task embedding sampled from prior and held fixed
                self.collect_data_sampling_from_prior(
                    num_samples=self.num_steps_per_task,
                    resample_z_every_n=self.max_path_length,
                    eval_task=True)
            elif self.eval_embedding_source == 'online_on_policy_trajectories':
                self.eval_enc_replay_buffer.task_buffers[idx].clear()
                # half the data from z sampled from prior, the other half from z sampled from posterior
                self.collect_data_online(idx=idx,
                                         num_samples=self.num_steps_per_task,
                                         eval_task=True)
            else:
                raise Exception("Invalid option for computing eval embedding")

            dprint('task {} encoder RB size'.format(idx),
                   self.eval_enc_replay_buffer.task_buffers[idx]._size)
            test_paths = self.collect_paths(idx, epoch, eval_task=True)

            test_avg_returns.append(eval_util.get_average_returns(test_paths))
            test_avg_succ.append(
                [sum([j['succ'] for j in i['env_infos']]) for i in test_paths])
            test_avg_len.append([len(i['env_infos']) for i in test_paths])

            if self.use_information_bottleneck:
                z_mean = np.mean(
                    np.abs(ptu.get_numpy(self.policy.z_dists[0].mean)))
                z_sig = np.mean(ptu.get_numpy(self.policy.z_dists[0].variance))
                self.eval_statistics['Z mean eval'] = z_mean
                self.eval_statistics['Z variance eval'] = z_sig

            # TODO(KR) what does this do
            if hasattr(self.env, "log_diagnostics"):
                self.env.log_diagnostics(test_paths)

        avg_train_return = np.mean(train_avg_returns)
        avg_test_return = np.mean(test_avg_returns)
        avg_train_succ = np.mean(train_avg_succ, axis=0)
        avg_test_succ = np.mean(test_avg_succ, axis=0)
        avg_train_len = np.mean(train_avg_len, axis=0)
        avg_test_len = np.mean(test_avg_len, axis=0)

        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return

        for i, s in enumerate(avg_train_succ):
            self.eval_statistics['Succ_train_tasks_%s' % i] = s
        for i, s in enumerate(avg_test_succ):
            self.eval_statistics['Succ_test_tasks_%s' % i] = s

        for i, s in enumerate(avg_train_len):
            self.eval_statistics['Len_train_tasks_%s' % i] = s
        for i, s in enumerate(avg_test_len):
            self.eval_statistics['Len_test_tasks_%s' % i] = s

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(test_paths)

        if self.plotter:
            self.plotter.draw()
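
Example #8 logs the raw per-rollout sums of the 'succ' flags found in env_infos. Where a single scalar per task is preferable, a small helper along these lines could be used instead; it treats a rollout as successful if any step reports a truthy 'succ' flag and returns the fraction of successful rollouts, which is a simplification of the sums logged above.

import numpy as np

def success_rate(paths):
    succ_per_path = [any(info.get('succ', 0) for info in p['env_infos'])
                     for p in paths]
    return float(np.mean(succ_per_path))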
Example #9
    def evaluate(self, epoch):
        statistics = OrderedDict()
        statistics.update(self.eval_statistics)
        self.eval_statistics = statistics

        ### train tasks
        dprint('evaluating on {} train tasks'.format(len(self.train_tasks)))
        train_avg_returns = []
        for idx in self.train_tasks:
            dprint('task {} encoder RB size'.format(idx),
                   self.enc_replay_buffer.task_buffers[idx]._size)
            paths = self.collect_paths(idx, epoch, eval_task=False)
            train_avg_returns.append(eval_util.get_average_returns(paths))

        ### test tasks
        dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
        test_avg_returns = []
        # This is calculating the embedding online, because every iteration
        # we clear the encoding buffer for the test tasks.
        for idx in self.eval_tasks:
            self.task_idx = idx
            self.env.reset_task(idx)

            # collect data for computing the embedding, if needed
            if self.eval_embedding_source in ['online', 'initial_pool']:
                pass
            elif self.eval_embedding_source == 'online_exploration_trajectories':
                self.eval_enc_replay_buffer.task_buffers[idx].clear()
                # task embedding sampled from prior and held fixed
                self.collect_data_sampling_from_prior(
                    num_samples=self.num_steps_per_task,
                    resample_z_every_n=self.max_path_length,
                    eval_task=True)
            elif self.eval_embedding_source == 'online_on_policy_trajectories':
                self.eval_enc_replay_buffer.task_buffers[idx].clear()
                # half the data from z sampled from prior, the other half from z sampled from posterior
                self.collect_data_online(idx=idx,
                                         num_samples=self.num_steps_per_task,
                                         eval_task=True)
            else:
                raise Exception("Invalid option for computing eval embedding")

            dprint('task {} encoder RB size'.format(idx),
                   self.eval_enc_replay_buffer.task_buffers[idx]._size)
            test_paths = self.collect_paths(idx, epoch, eval_task=True)

            test_avg_returns.append(eval_util.get_average_returns(test_paths))

            if self.use_information_bottleneck:
                z_mean = np.mean(
                    np.abs(ptu.get_numpy(self.policy.z_dists[0].mean)))
                z_sig = np.mean(ptu.get_numpy(self.policy.z_dists[0].variance))
                self.eval_statistics['Z mean eval'] = z_mean
                self.eval_statistics['Z variance eval'] = z_sig

            # TODO(KR) what does this do
            if hasattr(self.env, "log_diagnostics"):
                self.env.log_diagnostics(test_paths)

        avg_train_return = np.mean(train_avg_returns)
        avg_test_return = np.mean(test_avg_returns)
        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(test_paths)

        if self.plotter:
            self.plotter.draw()
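
All of the evaluate() variants switch tasks through self.env.reset_task(idx). Below is a self-contained, purely illustrative sketch of the multi-task environment interface this implies, here for a 2-D point robot: one goal per task, reset_task selects the active goal, and the reward is the negative distance to it. None of this is the project's actual environment.

import numpy as np

class MultiTaskPointEnv:
    def __init__(self, n_tasks=10, seed=0):
        rng = np.random.RandomState(seed)
        # one 2-D goal per task
        self.goals = rng.uniform(-1.0, 1.0, size=(n_tasks, 2))
        self._goal = self.goals[0]
        self._state = np.zeros(2)

    def reset_task(self, idx):
        self._goal = self.goals[idx]
        return self.reset()

    def reset(self):
        self._state = np.zeros(2)
        return self._state.copy()

    def step(self, action):
        self._state = self._state + np.clip(action, -0.1, 0.1)
        reward = -float(np.linalg.norm(self._state - self._goal))
        return self._state.copy(), reward, False, {}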
Example #10
    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### sample trajectories from prior for debugging / visualization
        if self.dump_eval_paths:
            # 100 arbitrarily chosen for visualizations of point_robot trajectories
            # just want stochasticity of z, not the policy
            self.agent.clear_z()
            prior_paths, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length * 20,
                accum_context=False,
                resample=1)
            logger.save_extra_data(
                prior_paths,
                path='eval_trajectories/prior-epoch{}'.format(epoch))

        ### train tasks
        # eval on a subset of train tasks for speed
        eval_util.dprint('evaluating on {} train tasks'.format(
            len(self.train_goals)))

        ### eval train tasks with on-policy data to match eval of test tasks
        train_final_returns, train_final_achieved = self._do_eval(
            self.train_goals, epoch)

        # Comment this line for walker-param
        # train_final_achieved_pair = [(train_final_achieved[i], goal) for i, goal in enumerate(self.train_goals)]
        train_final_achieved_pair = [(train_final_achieved[i], -1)
                                     for i, goal in enumerate(self.train_goals)
                                     ]

        eval_util.dprint('train final achieved')
        eval_util.dprint(train_final_achieved_pair)

        ### WD tasks

        eval_util.dprint('evaluating on {} wd tasks'.format(len(
            self.wd_goals)))
        wd_final_returns, wd_final_achieved = self._do_eval(
            self.wd_goals, epoch)

        # Comment this line for walker-param
        # wd_final_achieved_pair = [(wd_final_achieved[i], goal) for i, goal in enumerate(self.wd_goals)]
        wd_final_achieved_pair = [(wd_final_achieved[i], -1)
                                  for i, goal in enumerate(self.wd_goals)]

        eval_util.dprint('WD test final achieved')
        eval_util.dprint(wd_final_achieved_pair)

        # ### OOD tasks

        # eval_util.dprint('evaluating on {} wd tasks'.format(len(self.ood_goals)))
        # ood_final_returns, ood_final_achieved = self._do_eval(self.ood_goals, epoch)

        # # Comment this line for walker-param
        # # ood_final_achieved_pair = [(ood_final_achieved[i], goal) for i, goal in enumerate(self.ood_goals)]
        # ood_final_achieved_pair = [(ood_final_achieved[i], -1) for i, goal in enumerate(self.ood_goals)]

        # eval_util.dprint('OOD test final achieved')
        # eval_util.dprint(ood_final_achieved_pair)

        # # save the final posterior
        # self.agent.log_diagnostics(self.eval_statistics)

        avg_train_return = np.mean(train_final_returns)
        avg_wd_return = np.mean(wd_final_returns)
        # avg_ood_return = np.mean(ood_final_returns)

        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_wd_tasks'] = avg_wd_return
        # self.eval_statistics['AverageReturn_all_ood_tasks'] = avg_ood_return

        self.eval_statistics['Return_all_train_tasks'] = train_final_returns
        self.eval_statistics['Return_all_wd_tasks'] = wd_final_returns
        # self.eval_statistics['Return_all_ood_tasks'] = ood_final_returns

        self.eval_statistics[
            'Achieved_all_train_tasks'] = train_final_achieved_pair
        self.eval_statistics['Achieved_all_wd_tasks'] = wd_final_achieved_pair
        # self.eval_statistics['Achieved_all_ood_tasks'] = ood_final_achieved_pair

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.plotter:
            self.plotter.draw()
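
After record_tabular has been called for every key, the logger is assumed to flush one row per epoch into a progress.csv under the experiment directory. A short usage sketch for pulling the aggregate evaluation curves back out; the file name and the column prefix mirror the keys set above but are assumptions about the logging backend.

import pandas as pd

def load_eval_curves(progress_csv):
    df = pd.read_csv(progress_csv)
    cols = [c for c in df.columns if c.startswith('AverageReturn_all_')]
    return df[cols]

# hypothetical location:
# load_eval_curves('output/experiment/progress.csv')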