def obtain_eval_paths(self, idx, eval_task=False, deterministic=False):
    '''
    collect paths with current policy
    if online, task encoding will be updated after each transition
    otherwise, sample a task encoding once and keep it fixed
    '''
    is_online = (self.eval_embedding_source == 'online')
    self.policy.clear_z()
    if not is_online:
        self.sample_z_from_posterior(idx, eval_task=eval_task)
    dprint('task encoding ', self.policy.z)

    test_paths = self.eval_sampler.obtain_samples(
        deterministic=deterministic,
        is_online=is_online,
        num_rollouts=np.ceil(self.num_steps_per_task / self.max_path_length))
    if self.sparse_rewards:
        for p in test_paths:
            p['rewards'] = ptu.sparsify_rewards(p['rewards'])
    return test_paths
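# Illustrative sketch, not part of this codebase: one plausible implementation of a reward
# sparsification helper in the spirit of the `ptu.sparsify_rewards` call above, assuming the
# dense reward is a negative goal distance and that rewards within a success radius `thresh`
# count as hits. The real helper in this repo may compute something different.
import numpy as np

def sparsify_rewards_sketch(dense_rewards, thresh=0.2):
    # 1.0 for steps whose (negative-distance) reward is within `thresh` of the goal, else 0.0
    dense_rewards = np.asarray(dense_rewards)
    return (dense_rewards > -thresh).astype(np.float64)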
def collect_paths(self, idx, epoch, eval_task=False):
    self.task_idx = idx
    dprint('Task:', idx)
    self.env.reset_task(idx)

    # if eval_task:
    #     num_evals = self.num_evals
    # else:
    num_evals = 1
    paths = []
    for _ in range(num_evals):
        paths += self.obtain_eval_paths(idx, eval_task=eval_task, deterministic=True)
    # goal = self.env._goal
    # for path in paths:
    #     path['goal'] = goal  # goal

    # save the paths for visualization, only useful for point mass
    if self.dump_eval_paths:
        split = 'test' if eval_task else 'train'
        logger.save_extra_data(
            paths,
            path='eval_trajectories/{}-task{}-epoch{}'.format(split, idx, epoch))
    return paths
def collect_data_for_embedding_online_with_logging(self, idx, epoch):
    self.task_idx = idx
    dprint('Task:', idx)
    self.env.reset_task(idx)

    n_exploration_episodes = 10
    n_inference_episodes = 10
    all_init_paths = []
    all_inference_paths = []

    self.enc_replay_buffer.clear_buffer(idx)

    # exploration episodes: z sampled from the prior and held fixed per episode
    for i in range(n_exploration_episodes):
        initial_z = self.sample_z_from_prior()
        init_paths = self.obtain_eval_paths(idx, z=initial_z, eval_task=True)
        all_init_paths += init_paths
        self.enc_replay_buffer.add_paths(idx, init_paths)
    dprint('enc_replay_buffer.task_buffers[idx]._size',
           self.enc_replay_buffer.task_buffers[idx]._size)

    # inference episodes: z sampled from the posterior over data collected so far,
    # and each episode's data is added back to the encoder buffer
    for i in range(n_inference_episodes):
        paths = self.obtain_eval_paths(idx, eval_task=True)
        all_inference_paths.append(paths)
        self.enc_replay_buffer.add_paths(idx, paths)

    # save evaluation rollouts for vis
    # all paths
    with open(
            self.output_dir +
            "/proto-sac-point-mass-fb-16z-init-task{}-{}.pkl".format(idx, epoch),
            'wb+') as f:
        pickle.dump(all_init_paths, f, pickle.HIGHEST_PROTOCOL)
    with open(
            self.output_dir +
            "/proto-sac-point-mass-fb-16z-inference-task{}-{}.pkl".format(idx, epoch),
            'wb+') as f:
        pickle.dump(all_inference_paths, f, pickle.HIGHEST_PROTOCOL)

    average_inference_returns = [
        eval_util.get_average_returns(paths) for paths in all_inference_paths
    ]
    self.eval_statistics['AverageInferenceReturns_test_task{}'.format(idx)] = \
        average_inference_returns
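# Illustrative sketch, not part of this codebase: loading the rollouts pickled above for
# offline inspection. The file name mirrors the dump in
# collect_data_for_embedding_online_with_logging; `output_dir`, `idx`, and `epoch` are
# placeholders supplied by the caller.
import pickle

def load_inference_rollouts_sketch(output_dir, idx, epoch):
    fname = output_dir + "/proto-sac-point-mass-fb-16z-inference-task{}-{}.pkl".format(idx, epoch)
    with open(fname, 'rb') as f:
        return pickle.load(f)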
def log_statistics(self, paths, split=''):
    self.eval_statistics.update(
        eval_util.get_generic_path_information(
            paths,
            stat_prefix="{}_task{}".format(split, self.task_idx),
        ))
    # TODO(KR) what are these?
    self.eval_statistics.update(
        eval_util.get_generic_path_information(
            self._exploration_paths,
            stat_prefix="Exploration_task{}".format(self.task_idx),
        ))  # something is wrong with these exploration paths i'm pretty sure...
    average_returns = eval_util.get_average_returns(paths)
    self.eval_statistics['AverageReturn_{}_task{}'.format(
        split, self.task_idx)] = average_returns

    goal = self.env._goal
    dprint('GoalPosition_{}_task'.format(split))
    dprint(goal)
    self.eval_statistics['GoalPosition_{}_task{}'.format(
        split, self.task_idx)] = goal
def evaluate(self, epoch):
    if self.eval_statistics is None:
        self.eval_statistics = OrderedDict()

    ### train tasks
    # eval on a subset of train tasks for speed
    if self.eval_train_tasks:
        train_final_returns, train_online_returns = self._do_eval(self.train_tasks, epoch)
        eval_util.dprint('train online returns')
        eval_util.dprint(train_online_returns)
        avg_train_return = np.mean(train_final_returns)
        self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
        for i, _ret in enumerate(train_final_returns):
            self.eval_statistics['train_task' + str(i) + '_return'] = _ret

    ### test tasks
    if len(self.eval_tasks) > 0:
        test_final_returns, test_online_returns = self._do_eval(self.eval_tasks, epoch)
        eval_util.dprint('test online returns')
        eval_util.dprint(test_online_returns)
        avg_test_return = np.mean(test_final_returns)
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
        for i, _ret in enumerate(test_final_returns):
            self.eval_statistics['eval_task' + str(i) + '_return'] = _ret

    self.agent.log_diagnostics(self.eval_statistics)

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.plotter:
        self.plotter.draw()
def evaluate(self, epoch):
    if self.eval_statistics is None:
        self.eval_statistics = OrderedDict()

    ### test tasks
    eval_util.dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
    test_final_returns, test_online_returns = self._do_eval(self.eval_tasks, epoch)
    eval_util.dprint('test online returns')
    eval_util.dprint(test_online_returns)

    # save the final posterior
    self.agent.log_diagnostics(self.eval_statistics)

    avg_test_return = np.mean(test_final_returns)
    self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.plotter:
        self.plotter.draw()
def evaluate(self, epoch):
    if self.eval_statistics is None:
        self.eval_statistics = OrderedDict()

    ### sample trajectories from prior for debugging / visualization
    if self.dump_eval_paths:
        # 100 arbitrarily chosen for visualizations of point_robot trajectories
        # just want stochasticity of z, not the policy
        self.agent.clear_z()
        prior_paths, _ = self.sampler.obtain_samples(
            deterministic=self.eval_deterministic,
            max_samples=self.max_path_length * 20,
            accum_context=False,
            resample=1,
            testing=True)
        logger.save_extra_data(
            prior_paths,
            path='eval_trajectories/prior-epoch{}'.format(epoch))

    ### train tasks
    # eval on a subset of train tasks for speed
    indices = np.random.choice(self.train_tasks, len(self.eval_tasks))
    eval_util.dprint('evaluating on {} train tasks'.format(len(indices)))

    ### eval train tasks with posterior sampled from the training replay buffer
    train_returns = []
    for idx in indices:
        self.task_idx = idx
        self.env.reset_task(idx)
        paths = []
        for _ in range(self.num_steps_per_eval // self.max_path_length):
            context = self.sample_context(idx)
            self.agent.infer_posterior(context)
            p, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length,
                accum_context=False,
                max_trajs=1,
                resample=np.inf,
                testing=True)
            paths += p

        if self.sparse_rewards:
            for p in paths:
                sparse_rewards = np.stack(
                    [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                p['rewards'] = sparse_rewards

        train_returns.append(eval_util.get_average_returns(paths))
    train_returns = np.mean(train_returns)

    ### eval train tasks with on-policy data to match eval of test tasks
    train_final_returns, train_online_returns = self._do_eval(indices, epoch)
    eval_util.dprint('train online returns')
    eval_util.dprint(train_online_returns)

    ### test tasks
    eval_util.dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
    test_final_returns, test_online_returns = self._do_eval(self.eval_tasks, epoch)
    eval_util.dprint('test online returns')
    eval_util.dprint(test_online_returns)

    # save the final posterior
    self.agent.log_diagnostics(self.eval_statistics)

    if hasattr(self.env, "log_diagnostics"):
        self.env.log_diagnostics(paths, prefix=None)

    avg_train_return = np.mean(train_final_returns)
    avg_test_return = np.mean(test_final_returns)
    avg_train_online_return = np.mean(np.stack(train_online_returns), axis=0)
    avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
    self.eval_statistics['AverageTrainReturn_all_train_tasks'] = train_returns
    self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
    self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
    logger.save_extra_data(avg_train_online_return,
                           path='online-train-epoch{}'.format(epoch))
    logger.save_extra_data(avg_test_online_return,
                           path='online-test-epoch{}'.format(epoch))

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.render_eval_paths:
        self.env.render_paths(paths)
    if self.plotter:
        self.plotter.draw()
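# Illustrative sketch, not part of this codebase: the shape convention assumed by the
# `np.mean(np.stack(...), axis=0)` aggregation above. Each element of `*_online_returns` is
# taken to be one per-rollout return curve per task; stacking and averaging over axis 0 gives
# the mean return at each rollout index. Toy numbers only.
import numpy as np

task_a_online = np.array([1.0, 2.0, 3.0])   # returns after rollouts 1..3 on task A
task_b_online = np.array([3.0, 4.0, 5.0])   # returns after rollouts 1..3 on task B
avg_online_return = np.mean(np.stack([task_a_online, task_b_online]), axis=0)
# -> array([2., 3., 4.]): average online return per rollout position across tasks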
def evaluate(self, epoch):
    statistics = OrderedDict()
    statistics.update(self.eval_statistics)
    self.eval_statistics = statistics

    ### train tasks
    dprint('evaluating on {} train tasks'.format(len(self.train_tasks)))
    train_avg_returns = []
    train_avg_succ = []
    train_avg_len = []
    for idx in self.train_tasks:
        dprint('task {} encoder RB size'.format(idx),
               self.enc_replay_buffer.task_buffers[idx]._size)
        paths = self.collect_paths(idx, epoch, eval_task=False)
        train_avg_returns.append(eval_util.get_average_returns(paths))
        train_avg_succ.append(
            [sum([j['succ'] for j in i['env_infos']]) for i in paths])
        train_avg_len.append([len(i['env_infos']) for i in paths])

    ### test tasks
    dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
    test_avg_returns = []
    test_avg_succ = []
    test_avg_len = []
    # This is calculating the embedding online, because every iteration
    # we clear the encoding buffer for the test tasks.
    for idx in np.random.choice(self.eval_tasks, self.num_evals, replace=False):
        print('eval task', idx)
        self.task_idx = idx
        self.env.reset_task(idx)

        # collect data for computing embedding if needed
        if self.eval_embedding_source in ['online', 'initial_pool']:
            pass
        elif self.eval_embedding_source == 'online_exploration_trajectories':
            self.eval_enc_replay_buffer.task_buffers[idx].clear()
            # task embedding sampled from prior and held fixed
            self.collect_data_sampling_from_prior(
                num_samples=self.num_steps_per_task,
                resample_z_every_n=self.max_path_length,
                eval_task=True)
        elif self.eval_embedding_source == 'online_on_policy_trajectories':
            self.eval_enc_replay_buffer.task_buffers[idx].clear()
            # half the data from z sampled from prior, the other half from z sampled from posterior
            self.collect_data_online(idx=idx,
                                     num_samples=self.num_steps_per_task,
                                     eval_task=True)
        else:
            raise Exception("Invalid option for computing eval embedding")

        dprint('task {} encoder RB size'.format(idx),
               self.eval_enc_replay_buffer.task_buffers[idx]._size)

        test_paths = self.collect_paths(idx, epoch, eval_task=True)
        test_avg_returns.append(eval_util.get_average_returns(test_paths))
        test_avg_succ.append(
            [sum([j['succ'] for j in i['env_infos']]) for i in test_paths])
        test_avg_len.append([len(i['env_infos']) for i in test_paths])

    if self.use_information_bottleneck:
        z_mean = np.mean(np.abs(ptu.get_numpy(self.policy.z_dists[0].mean)))
        z_sig = np.mean(ptu.get_numpy(self.policy.z_dists[0].variance))
        self.eval_statistics['Z mean eval'] = z_mean
        self.eval_statistics['Z variance eval'] = z_sig

    # TODO(KR) what does this do
    if hasattr(self.env, "log_diagnostics"):
        self.env.log_diagnostics(test_paths)

    avg_train_return = np.mean(train_avg_returns)
    avg_test_return = np.mean(test_avg_returns)
    avg_train_succ = np.mean(train_avg_succ, axis=0)
    avg_test_succ = np.mean(test_avg_succ, axis=0)
    avg_train_len = np.mean(train_avg_len, axis=0)
    avg_test_len = np.mean(test_avg_len, axis=0)
    self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
    self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
    for i, s in enumerate(avg_train_succ):
        self.eval_statistics['Succ_train_tasks_%s' % i] = s
    for i, s in enumerate(avg_test_succ):
        self.eval_statistics['Succ_test_tasks_%s' % i] = s
    for i, s in enumerate(avg_train_len):
        self.eval_statistics['Len_train_tasks_%s' % i] = s
    for i, s in enumerate(avg_test_len):
        self.eval_statistics['Len_test_tasks_%s' % i] = s

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.render_eval_paths:
        self.env.render_paths(test_paths)
    if self.plotter:
        self.plotter.draw()
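# Illustrative sketch, not part of this codebase: the aggregation assumed by the success and
# length statistics above. Each entry of `train_avg_succ` / `test_avg_succ` is one per-task
# list with a success count per evaluation rollout, so averaging over axis 0 yields the mean
# success count at each rollout position. Toy numbers only.
import numpy as np

per_task_succ = [[0, 1, 1],   # task A: success count in rollouts 1..3
                 [1, 1, 0]]   # task B
avg_succ = np.mean(per_task_succ, axis=0)  # -> array([0.5, 1. , 0.5])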
def evaluate(self, epoch):
    statistics = OrderedDict()
    statistics.update(self.eval_statistics)
    self.eval_statistics = statistics

    ### train tasks
    dprint('evaluating on {} train tasks'.format(len(self.train_tasks)))
    train_avg_returns = []
    for idx in self.train_tasks:
        dprint('task {} encoder RB size'.format(idx),
               self.enc_replay_buffer.task_buffers[idx]._size)
        paths = self.collect_paths(idx, epoch, eval_task=False)
        train_avg_returns.append(eval_util.get_average_returns(paths))

    ### test tasks
    dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
    test_avg_returns = []
    # This is calculating the embedding online, because every iteration
    # we clear the encoding buffer for the test tasks.
    for idx in self.eval_tasks:
        self.task_idx = idx
        self.env.reset_task(idx)

        # collect data for computing embedding if needed
        if self.eval_embedding_source in ['online', 'initial_pool']:
            pass
        elif self.eval_embedding_source == 'online_exploration_trajectories':
            self.eval_enc_replay_buffer.task_buffers[idx].clear()
            # task embedding sampled from prior and held fixed
            self.collect_data_sampling_from_prior(
                num_samples=self.num_steps_per_task,
                resample_z_every_n=self.max_path_length,
                eval_task=True)
        elif self.eval_embedding_source == 'online_on_policy_trajectories':
            self.eval_enc_replay_buffer.task_buffers[idx].clear()
            # half the data from z sampled from prior, the other half from z sampled from posterior
            self.collect_data_online(idx=idx,
                                     num_samples=self.num_steps_per_task,
                                     eval_task=True)
        else:
            raise Exception("Invalid option for computing eval embedding")

        dprint('task {} encoder RB size'.format(idx),
               self.eval_enc_replay_buffer.task_buffers[idx]._size)

        test_paths = self.collect_paths(idx, epoch, eval_task=True)
        test_avg_returns.append(eval_util.get_average_returns(test_paths))

    if self.use_information_bottleneck:
        z_mean = np.mean(np.abs(ptu.get_numpy(self.policy.z_dists[0].mean)))
        z_sig = np.mean(ptu.get_numpy(self.policy.z_dists[0].variance))
        self.eval_statistics['Z mean eval'] = z_mean
        self.eval_statistics['Z variance eval'] = z_sig

    # TODO(KR) what does this do
    if hasattr(self.env, "log_diagnostics"):
        self.env.log_diagnostics(test_paths)

    avg_train_return = np.mean(train_avg_returns)
    avg_test_return = np.mean(test_avg_returns)
    self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
    self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.render_eval_paths:
        self.env.render_paths(test_paths)
    if self.plotter:
        self.plotter.draw()
def evaluate(self, epoch):
    if self.eval_statistics is None:
        self.eval_statistics = OrderedDict()

    ### sample trajectories from prior for debugging / visualization
    if self.dump_eval_paths:
        # 100 arbitrarily chosen for visualizations of point_robot trajectories
        # just want stochasticity of z, not the policy
        self.agent.clear_z()
        prior_paths, _ = self.sampler.obtain_samples(
            deterministic=self.eval_deterministic,
            max_samples=self.max_path_length * 20,
            accum_context=False,
            resample=1)
        logger.save_extra_data(
            prior_paths,
            path='eval_trajectories/prior-epoch{}'.format(epoch))

    ### train tasks
    # eval on a subset of train tasks for speed
    eval_util.dprint('evaluating on {} train tasks'.format(len(self.train_goals)))

    ### eval train tasks with on-policy data to match eval of test tasks
    train_final_returns, train_final_achieved = self._do_eval(self.train_goals, epoch)
    # Comment this line for walker-param
    # train_final_achieved_pair = [(train_final_achieved[i], goal) for i, goal in enumerate(self.train_goals)]
    train_final_achieved_pair = [(train_final_achieved[i], -1)
                                 for i, goal in enumerate(self.train_goals)]
    eval_util.dprint('train final achieved')
    eval_util.dprint(train_final_achieved_pair)

    ### WD tasks
    eval_util.dprint('evaluating on {} wd tasks'.format(len(self.wd_goals)))
    wd_final_returns, wd_final_achieved = self._do_eval(self.wd_goals, epoch)
    # Comment this line for walker-param
    # wd_final_achieved_pair = [(wd_final_achieved[i], goal) for i, goal in enumerate(self.wd_goals)]
    wd_final_achieved_pair = [(wd_final_achieved[i], -1)
                              for i, goal in enumerate(self.wd_goals)]
    eval_util.dprint('WD test final achieved')
    eval_util.dprint(wd_final_achieved_pair)

    # ### OOD tasks
    # eval_util.dprint('evaluating on {} ood tasks'.format(len(self.ood_goals)))
    # ood_final_returns, ood_final_achieved = self._do_eval(self.ood_goals, epoch)
    # # Comment this line for walker-param
    # # ood_final_achieved_pair = [(ood_final_achieved[i], goal) for i, goal in enumerate(self.ood_goals)]
    # ood_final_achieved_pair = [(ood_final_achieved[i], -1) for i, goal in enumerate(self.ood_goals)]
    # eval_util.dprint('OOD test final achieved')
    # eval_util.dprint(ood_final_achieved_pair)

    # # save the final posterior
    # self.agent.log_diagnostics(self.eval_statistics)

    avg_train_return = np.mean(train_final_returns)
    avg_wd_return = np.mean(wd_final_returns)
    # avg_ood_return = np.mean(ood_final_returns)
    self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
    self.eval_statistics['AverageReturn_all_wd_tasks'] = avg_wd_return
    # self.eval_statistics['AverageReturn_all_ood_tasks'] = avg_ood_return
    self.eval_statistics['Return_all_train_tasks'] = train_final_returns
    self.eval_statistics['Return_all_wd_tasks'] = wd_final_returns
    # self.eval_statistics['Return_all_ood_tasks'] = ood_final_returns
    self.eval_statistics['Achieved_all_train_tasks'] = train_final_achieved_pair
    self.eval_statistics['Achieved_all_wd_tasks'] = wd_final_achieved_pair
    # self.eval_statistics['Achieved_all_ood_tasks'] = ood_final_achieved_pair

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.plotter:
        self.plotter.draw()