def enrich_expert_trajectories(self, origin_folder, goal_number,
                               destination_folder, fake=False):
    """Append the shuffled 9-dim goal+distractor vector to every observation
    of each expert trajectory for `goal_number`, then save the result as
    <goal_number>dist.pkl (or <goal_number>fakedist.pkl if fake=True)."""
    trajs_for_goal = joblib.load(origin_folder + str(goal_number) + ".pkl")
    goal_distractions_and_shuffle_order = joblib.load(
        destination_folder + "goals_pool.pkl")['goals_pool'][goal_number]
    # last row stores the shuffle order; the first three rows are the goal
    # and the two distractors (3 dims each)
    shuffle_order = goal_distractions_and_shuffle_order[-1].reshape((3,))
    goal_and_distractions = np.concatenate(
        (goal_distractions_and_shuffle_order[int(shuffle_order[0])],
         goal_distractions_and_shuffle_order[int(shuffle_order[1])],
         goal_distractions_and_shuffle_order[int(shuffle_order[2])]))
    print("goals for index", goal_number, ":\n", goal_and_distractions)
    print("shuffle order for index", goal_number, ":\n", shuffle_order)
    new_trajs_for_goal = []
    for traj in trajs_for_goal:
        obs = traj['observations']
        new_obs = [
            np.concatenate((obs_step, goal_and_distractions.reshape((9,))))
            for obs_step in obs
        ]
        new_traj = copy.deepcopy(traj)
        new_traj['observations'] = new_obs
        new_trajs_for_goal.append(new_traj)
    joblib_dump_safe(
        new_trajs_for_goal,
        destination_folder + str(goal_number) +
        ("" if not fake else "fake") + "dist.pkl")
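# Example usage (a minimal sketch; `algo` stands for an instance of the class
# defining this method, and the folder paths are hypothetical):
#
#   algo.enrich_expert_trajectories(
#       origin_folder="/tmp/expert_traj/raw/",
#       goal_number=0,
#       destination_folder="/tmp/expert_traj/enriched/")
#
# This reads raw/0.pkl and enriched/goals_pool.pkl, appends the shuffled
# 9-dim goal+distractor vector to every observation, and writes
# enriched/0dist.pkl.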
def _setup_goals(self, goals_pool_to_load, goals_pickle_to):
    if goals_pool_to_load is not None:
        # load goals
        logger.log("Loading goals pool from %s ..." % goals_pool_to_load)
        loaded = joblib.load(goals_pool_to_load)
        self.goals_pool = loaded['goals_pool']
        self.goals_idxs_for_itr_dict = loaded['idxs_dict']
    else:
        # build goals pool and idxs_dict
        goals_pool_size = (self.n_itr - self.start_itr) * self.meta_batch_size
        logger.log("Sampling a pool of tasks/goals for this meta-batch...")
        env = self.env
        while 'sample_goals' not in dir(env):
            env = env.wrapped_env
        self.goals_pool = env.sample_goals(goals_pool_size)
        self.goals_idxs_for_itr_dict = {}
        for itr in range(self.start_itr, self.n_itr):
            self.goals_idxs_for_itr_dict[itr] = rd.sample(
                range(goals_pool_size), self.meta_batch_size)
        # save goals pool
        if goals_pickle_to is not None:
            logger.log("Saving goals pool to %s..." % goals_pickle_to)
            joblib_dump_safe(
                dict(goals_pool=self.goals_pool,
                     idxs_dict=self.goals_idxs_for_itr_dict),
                goals_pickle_to)

    # inspect goals pool
    env = self.env
    while 'sample_goals' not in dir(env):
        env = env.wrapped_env
    reset_dimensions = env.sample_goals(1).shape[1:]
    # index by start_itr (not 0): iteration 0 may not exist when start_itr > 0
    dimensions = np.shape(
        self.goals_pool[self.goals_idxs_for_itr_dict[self.start_itr][0]])
    assert reset_dimensions == dimensions, \
        "loaded dimensions are %s, do not match with environment's %s" % (
            dimensions, reset_dimensions)

    # inspect goals_idxs_for_itr_dict
    assert set(range(self.start_itr, self.n_itr)).issubset(
        set(self.goals_idxs_for_itr_dict.keys())), \
        "Not all meta-iteration numbers have idx_dict in %s" % goals_pool_to_load
    for itr in range(self.start_itr, self.n_itr):
        num_goals = len(self.goals_idxs_for_itr_dict[itr])
        assert num_goals >= self.meta_batch_size, \
            "iteration %s contained %s goals when at least %s are needed" % (
                itr, num_goals, self.meta_batch_size)
        self.goals_idxs_for_itr_dict[itr] = \
            self.goals_idxs_for_itr_dict[itr][:self.meta_batch_size]

    # build goals_to_use_dict
    self.goals_to_use_dict = {}
    for itr in range(self.start_itr, self.n_itr):
        if itr not in self.testing_itrs or self.test_on_training_goals:
            self.goals_to_use_dict[itr] = np.array([
                self.goals_pool[idx]
                for idx in self.goals_idxs_for_itr_dict[itr]
            ])
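# The goals-pool pickle written above has this layout (a sketch; goal shapes
# depend on the environment's sample_goals):
#
#   {'goals_pool': array of shape (pool_size, goal_dim),
#    'idxs_dict': {itr: list of meta_batch_size indices into goals_pool}}
#
# e.g. a toy pool built by hand (hypothetical values and path):
#
#   example_pool = dict(
#       goals_pool=np.random.uniform(-0.2, 0.2, size=(4, 2)),  # 4 2-D goals
#       idxs_dict={itr: [0, 1] for itr in range(2)})  # 2 goals per iteration
#   joblib_dump_safe(example_pool, "/tmp/example_goals_pool.pkl")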
def attach_zeros_expert_trajectories(self, origin_folder, goal_number,
                                     destination_folder, extra_dim=0,
                                     suffix=""):
    trajs_for_goal = joblib.load(
        origin_folder + str(goal_number) + suffix + ".pkl")
    extra_input = np.array([0.] * extra_dim)
    new_trajs_for_goal = []
    for traj in trajs_for_goal:
        obs = traj['observations']
        new_obs = [
            np.concatenate((obs_step, extra_input)) for obs_step in obs
        ]
        new_traj = copy.deepcopy(traj)
        new_traj['observations'] = new_obs
        new_trajs_for_goal.append(new_traj)
    out_path = (destination_folder + str(goal_number) + suffix + "_" +
                str(extra_dim) + ".pkl")
    print("doing", out_path)
    joblib_dump_safe(new_trajs_for_goal, out_path)
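# Example usage (a sketch; `algo` stands for an instance of the defining
# class, paths are hypothetical). With extra_dim=9 this pads every
# observation with nine zeros, matching the 9-dim goal+distractor block that
# enrich_expert_trajectories appends:
#
#   algo.attach_zeros_expert_trajectories(
#       origin_folder="/tmp/expert_traj/raw/",
#       goal_number=0,
#       destination_folder="/tmp/expert_traj/padded/",
#       extra_dim=9)
#
# writes /tmp/expert_traj/padded/0_9.pkl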
# moving the env_infos/img to observations
import numpy as np
import joblib
from rllab.sampler.utils import joblib_dump_safe

for goal in ["0"]:  # , "1", "2"]:
    a = joblib.load(
        "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/raw/%s.pkl"
        % goal)
    for path in a:
        path['observations'] = path['env_infos']['img']
        path['env_infos'] = {}
    joblib_dump_safe(
        a,
        "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/%s.pkl"
        % goal)

# creating a dummy goals pool
import numpy as np
import joblib
from rllab.sampler.utils import joblib_dump_safe

gp = joblib.load(  # loaded for reference; unused below
    "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/goals_pool.pkl"
)
gp_dummy = {}
gp_dummy['goals_pool'] = [0]
gp_dummy['idxs_dict'] = {}
# the original snippet stops here without saving gp_dummy; presumably it
# overwrites goals_pool.pkl in the same directory (this destination is an
# assumption, not from the original):
joblib_dump_safe(
    gp_dummy,
    "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/goals_pool.pkl"
)
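# Quick sanity check for the conversion above (a sketch; reuses the same
# joblib/np imports and the path written by the first script):
converted = joblib.load(
    "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/0.pkl")
print(len(converted), np.shape(converted[0]['observations']))
# expect env_infos to be empty dicts and observations to hold the raw images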
def train(self):
    with tf.Session() as sess:
        if self.load_policy is not None:
            self.policy = joblib.load(self.load_policy)['policy']
        self.init_opt()
        # initialize only the uninitialized vars (I know, it's ugly)
        uninit_vars = []
        for var in tf.all_variables():
            try:
                sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        sess.run(tf.initialize_variables(uninit_vars))
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            if itr == self.n_itr - 1:
                # collapse the policy's std on the final iteration
                self.policy.std_modifier = 0.0001
                self.policy.recompute_dist_for_adjusted_std()
            if itr in self.goals_for_ET_dict:
                # expert-trajectory iteration: test noise and ET batch size
                goals = self.goals_for_ET_dict[itr]
                noise = self.action_noise_test
                self.batch_size = self.batch_size_expert_traj
            else:
                goals = [None] if self.reset_arg is None else [self.reset_arg]
                noise = self.action_noise_train
                self.batch_size = self.batch_size_train
            paths_to_save = {}
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = []
                for goalnum, goal in enumerate(goals):
                    preupdate = itr < self.n_itr - 1
                    # When using oracle environments with changing noise, use
                    # this variant instead (no preupdate flag):
                    # paths_for_goal = self.obtain_samples(
                    #     itr=itr, reset_args=[{'goal': goal, 'noise': noise}])
                    paths_for_goal = self.obtain_samples(
                        itr=itr,
                        reset_args=[{'goal': goal, 'noise': noise}],
                        preupdate=preupdate)
                    # keep paths flat: all goals are processed together below
                    paths.extend(paths_for_goal)
                    # TODO: there's a bunch of sample processing happening
                    # below that we should abstract away
                    if itr in self.expert_traj_itrs_to_pickle:
                        logger.log("Saving trajectories...")
                        paths_no_goalobs = self.clip_goal_from_obs(
                            paths_for_goal)
                        for path in paths_no_goalobs:
                            path.pop('agent_infos')
                        paths_to_save[goalnum] = paths_no_goalobs
                if itr in self.expert_traj_itrs_to_pickle:
                    logger.log("Pickling trajectories...")
                    assert len(paths_to_save.keys()) == 1, \
                        "we're going through ET goals one at a time now 10/24/17"
                    joblib_dump_safe(
                        paths_to_save[0],
                        self.save_expert_traj_dir + str(itr) + ".pkl")
                    logger.log("Fast-processing returns...")
                    undiscounted_returns = [
                        sum(path['rewards']) for path in paths
                    ]
                    logger.record_tabular('AverageReturn',
                                          np.mean(undiscounted_returns))
                else:
                    logger.log("Processing samples...")
                    samples_data = self.process_samples(itr, paths)
                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(paths)
                    logger.log("Optimizing policy...")
                    self.optimize_policy(itr, samples_data)
                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(itr, samples_data)
                    if self.store_paths:
                        params["paths"] = samples_data["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)
                    # periodically plot goals and end-effector paths; the
                    # observation-dim check targets ReacherEnvOracleNoise
                    if (itr % 16 == 0
                            and 7 < self.env.observation_space.shape[0] < 12):
                        logger.log("Saving visualization of paths")
                        plt.clf()
                        for i, style in enumerate(['-r', '--r', '-.r']):
                            goal_xy = paths[i]['observations'][0][-2:]
                            plt.plot(goal_xy[0], goal_xy[1], 'k*',
                                     markersize=10)
                            points = np.array(
                                [obs[6:8]
                                 for obs in paths[i]['observations']])
                            plt.plot(points[:, 0], points[:, 1], style,
                                     linewidth=2)
                        plt.plot(0, 0, 'k.', markersize=5)
                        plt.xlim([-0.25, 0.25])
                        plt.ylim([-0.25, 0.25])
                        plt.legend(['path'])
                        fig_path = osp.join(
                            logger.get_snapshot_dir(),
                            'path' + str(0) + '_' + str(itr) + '.png')
                        plt.savefig(fig_path)
                        print(fig_path)
                    # the original range guard (0 <= itr <= n_itr - 1) always
                    # holds inside this loop, so only make_video matters
                    if self.make_video:
                        logger.log("Saving videos...")
                        self.env.reset(reset_args=goals[0])
                        video_filename = osp.join(
                            logger.get_snapshot_dir(),
                            'post_path_%s_0_%s.gif'
                            % (itr, time.strftime("%H%M%S")))
                        rollout(
                            env=self.env,
                            agent=self.policy,
                            max_path_length=self.max_path_length,
                            animated=True,
                            speedup=2,
                            save_video=True,
                            video_filename=video_filename,
                            reset_arg=goals[0],
                            use_maml=False)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
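# Summary of the loop above (descriptive only, no new behavior): each
# iteration either (a) lands in goals_for_ET_dict, in which case we roll out
# with test-time action noise and the expert-trajectory batch size and
# optionally pickle the resulting paths, or (b) samples with training noise,
# processes the samples, runs optimize_policy, and saves a snapshot, with
# optional path plots and videos along the way.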
def __init__(
        self,
        env,
        policy,
        baseline,
        metalearn_baseline=False,
        scope=None,
        n_itr=500,
        start_itr=0,
        # batch_size is the number of trajectories per fast gradient update;
        # it is rescaled into a transition count in the body below
        batch_size=100,
        max_path_length=500,
        meta_batch_size=100,
        num_grad_updates=1,
        discount=0.99,
        gae_lambda=1,
        beta_steps=1,
        beta_curve=None,
        plot=False,
        pause_for_plot=False,
        make_video=False,
        center_adv=True,
        positive_adv=False,
        store_paths=False,
        whole_paths=True,
        fixed_horizon=False,
        sampler_cls=None,
        sampler_args=None,
        force_batch_sampler=False,
        use_maml=True,
        use_maml_il=False,
        test_on_training_goals=False,
        limit_demos_num=None,
        test_goals_mult=1,
        load_policy=None,
        pre_std_modifier=1.0,
        post_std_modifier_train=1.0,
        post_std_modifier_test=1.0,
        goals_to_load=None,
        goals_pool_to_load=None,
        expert_trajs_dir=None,
        expert_trajs_suffix="",
        goals_pickle_to=None,
        goals_pool_size=None,
        use_pooled_goals=True,
        extra_input=None,
        extra_input_dim=0,
        seed=1,
        **kwargs):
    """
    :param env: Environment
    :param policy: Policy
    :type policy: Policy
    :param baseline: Baseline
    :param scope: Scope for identifying the algorithm. Must be specified if
        running multiple algorithms simultaneously, each using different
        environments and policies.
    :param n_itr: Number of iterations.
    :param start_itr: Starting iteration.
    :param batch_size: Number of trajectories per fast gradient update;
        rescaled below into the total number of transitions per iteration.
    :param max_path_length: Maximum length of a single rollout.
    :param meta_batch_size: Number of tasks sampled per meta-update.
    :param num_grad_updates: Number of fast gradient updates.
    :param discount: Discount.
    :param gae_lambda: Lambda used for generalized advantage estimation.
    :param plot: Plot evaluation run after each iteration.
    :param pause_for_plot: Whether to pause before continuing when plotting.
    :param center_adv: Whether to rescale the advantages so that they have
        mean 0 and standard deviation 1.
    :param positive_adv: Whether to shift the advantages so that they are
        always positive. When used in conjunction with center_adv the
        advantages will be standardized before shifting.
    :param store_paths: Whether to save all paths data to the snapshot.
    :return:
    """
    self.seed = seed
    self.env = env
    self.policy = policy
    self.load_policy = load_policy
    self.baseline = baseline
    self.metalearn_baseline = metalearn_baseline
    self.scope = scope
    self.n_itr = n_itr
    self.start_itr = start_itr
    # batch_size arrives as trajectories per fast grad update;
    # self.batch_size is the total number of transitions to collect per
    # iteration (with the defaults: 100 * 500 * 100 = 5,000,000).
    self.batch_size = batch_size * max_path_length * meta_batch_size
    self.max_path_length = max_path_length
    self.discount = discount
    self.gae_lambda = gae_lambda
    self.beta_steps = beta_steps
    self.beta_curve = beta_curve if beta_curve is not None else [
        self.beta_steps
    ]
    self.old_il_loss = None
    self.plot = plot
    self.pause_for_plot = pause_for_plot
    self.make_video = make_video
    self.center_adv = center_adv
    self.positive_adv = positive_adv
    self.store_paths = store_paths
    self.whole_paths = whole_paths
    self.fixed_horizon = fixed_horizon
    self.meta_batch_size = meta_batch_size  # number of tasks
    self.num_grad_updates = num_grad_updates  # gradient steps during training
    self.use_maml_il = use_maml_il
    self.test_on_training_goals = test_on_training_goals
    self.testing_itrs = TESTING_ITRS
    if self.metalearn_baseline:
        self.testing_itrs.insert(0, 0)
    print("test_on_training_goals", self.test_on_training_goals)
    self.limit_demos_num = limit_demos_num
    self.test_goals_mult = test_goals_mult
    self.pre_std_modifier = pre_std_modifier
    self.post_std_modifier_train = post_std_modifier_train
    self.post_std_modifier_test = post_std_modifier_test
    self.expert_trajs_dir = expert_trajs_dir
    self.expert_trajs_suffix = expert_trajs_suffix
    self.use_pooled_goals = use_pooled_goals
    self.extra_input = extra_input
    self.extra_input_dim = extra_input_dim

    # Next, set up the goals and (potentially) the expert trajectories we
    # plan to use. If we use expert trajectories, they come bundled with
    # their own goals pool.
    assert goals_to_load is None, "deprecated"
    if self.use_pooled_goals:
        if expert_trajs_dir is not None:
            assert goals_pool_to_load is None, \
                "expert_trajs already comes with its own goals, " \
                "please disable goals_pool_to_load"
            goals_pool = joblib.load(self.expert_trajs_dir + "goals_pool.pkl")
            self.goals_pool = goals_pool['goals_pool']
            self.goals_idxs_for_itr_dict = goals_pool['idxs_dict']
            if "demos_path" in goals_pool:
                self.demos_path = goals_pool["demos_path"]
            else:
                self.demos_path = expert_trajs_dir
            print("successfully extracted goals pool",
                  self.goals_idxs_for_itr_dict.keys())
        elif goals_pool_to_load is not None:
            logger.log("Loading goals pool from %s ..." % goals_pool_to_load)
            loaded = joblib.load(goals_pool_to_load)
            self.goals_pool = loaded['goals_pool']
            self.goals_idxs_for_itr_dict = loaded['idxs_dict']
        else:
            # we build our own goals pool and idxs_dict
            if goals_pool_size is None:
                self.goals_pool_size = (
                    self.n_itr - self.start_itr) * self.meta_batch_size
            else:
                self.goals_pool_size = goals_pool_size
            logger.log(
                "Sampling a pool of tasks/goals for this meta-batch...")
            env = self.env
            while 'sample_goals' not in dir(env):
                env = env.wrapped_env
            self.goals_pool = env.sample_goals(self.goals_pool_size)
            self.goals_idxs_for_itr_dict = {}
            for itr in range(self.start_itr, self.n_itr):
                self.goals_idxs_for_itr_dict[itr] = rd.sample(
                    range(self.goals_pool_size), self.meta_batch_size)

        # inspecting the goals pool
        env = self.env
        while 'sample_goals' not in dir(env):
            env = env.wrapped_env
        reset_dimensions = env.sample_goals(1).shape[1:]
        # index by start_itr (not 0): iteration 0 may not exist when
        # start_itr > 0
        dimensions = np.shape(
            self.goals_pool[self.goals_idxs_for_itr_dict[self.start_itr][0]])
        assert reset_dimensions == dimensions, \
            "loaded dimensions are %s, do not match with environment's %s" % (
                dimensions, reset_dimensions)

        # inspecting goals_idxs_for_itr_dict
        assert set(range(self.start_itr, self.n_itr)).issubset(
            set(self.goals_idxs_for_itr_dict.keys())), \
            "Not all meta-iteration numbers have idx_dict in %s" % goals_pool_to_load
        for itr in range(self.start_itr, self.n_itr):
            num_goals = len(self.goals_idxs_for_itr_dict[itr])
            assert num_goals >= self.meta_batch_size, \
                "iteration %s contained %s goals when at least %s are needed" % (
                    itr, num_goals, self.meta_batch_size)
            self.goals_idxs_for_itr_dict[itr] = \
                self.goals_idxs_for_itr_dict[itr][:self.meta_batch_size]

        # we build goals_to_use_dict regardless of how we obtained
        # goals_pool and goals_idxs_for_itr_dict
        self.goals_to_use_dict = {}
        for itr in range(self.start_itr, self.n_itr):
            if itr not in self.testing_itrs:
                self.goals_to_use_dict[itr] = np.array([
                    self.goals_pool[idx]
                    for idx in self.goals_idxs_for_itr_dict[itr]
                ])
    else:
        # backwards compatibility code for old-format ETs
        self.goals_to_use_dict = joblib.load(
            self.expert_trajs_dir + "goals.pkl")
        assert set(range(self.start_itr, self.n_itr)).issubset(
            set(self.goals_to_use_dict.keys())), \
            "Not all meta-iteration numbers have saved goals in %s" % expert_trajs_dir
        # chopping off unnecessary meta-iterations and goals
        self.goals_to_use_dict = {
            itr: self.goals_to_use_dict[itr][:self.meta_batch_size]
            for itr in range(self.start_itr, self.n_itr)
        }

    # saving goals pool
    if goals_pickle_to is not None:
        logger.log("Saving goals pool to %s..." % goals_pickle_to)
        joblib_dump_safe(
            dict(goals_pool=self.goals_pool,
                 idxs_dict=self.goals_idxs_for_itr_dict),
            goals_pickle_to)

    if sampler_cls is None:
        if singleton_pool.n_parallel > 1:
            sampler_cls = BatchSampler
            print("Using Batch Sampler")
        else:
            sampler_cls = VectorizedSampler
            print("Using Vectorized Sampler")
    if sampler_args is None:
        sampler_args = dict()
    if 'n_envs' not in sampler_args.keys():
        sampler_args['n_envs'] = self.meta_batch_size
    self.sampler = sampler_cls(self, **sampler_args)
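# Construction sketch (hypothetical: `BatchMAMLPolopt` stands in for whatever
# class in this repo defines the __init__ above, and the argument values are
# illustrative only):
#
#   algo = BatchMAMLPolopt(
#       env=env, policy=policy, baseline=baseline,
#       n_itr=800, meta_batch_size=40, max_path_length=100,
#       expert_trajs_dir="/tmp/expert_traj/enriched/",
#       use_pooled_goals=True)
#   algo.train()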