def log_diagnostics(self, paths, prefix=''): """ Log extra information per iteration based on the collected paths """ log_stds = np.vstack( [path["agent_infos"]["log_std"] for path in paths]) logger.logkv(prefix + 'AveragePolicyStd', np.mean(np.exp(log_stds)))
def optimize_policy(self, all_samples_data, log=True): """ Performs MAML outer step Args: all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task log (bool) : whether to log statistics Returns: None """ meta_op_input_dict = self._extract_input_dict_meta_op( all_samples_data, self._optimization_keys) logger.log("Computing KL before") mean_kl_before = self.optimizer.constraint_val(meta_op_input_dict) logger.log("Computing loss before") loss_before = self.optimizer.loss(meta_op_input_dict) logger.log("Optimizing") self.optimizer.optimize(meta_op_input_dict) logger.log("Computing loss after") loss_after = self.optimizer.loss(meta_op_input_dict) logger.log("Computing KL after") mean_kl = self.optimizer.constraint_val(meta_op_input_dict) if log: logger.logkv('MeanKLBefore', mean_kl_before) logger.logkv('MeanKL', mean_kl) logger.logkv('LossBefore', loss_before) logger.logkv('LossAfter', loss_after) logger.logkv('dLoss', loss_before - loss_after)
def optimize_policy(self, samples_data, log=True):
    """
    Performs MAML outer step

    Args:
        samples_data (list) : list of samples (each is a dict) split by meta task
        log (bool) : whether to log statistics

    Returns:
        None
    """
    input_dict = self._extract_input_dict(samples_data, self._optimization_keys, prefix='train')

    if log:
        logger.log("Optimizing")
    loss_before = self.optimizer.optimize(input_val_dict=input_dict)

    if log:
        logger.log("Computing statistics")
    loss_after = self.optimizer.loss(input_val_dict=input_dict)

    if log:
        logger.logkv('LossBefore', loss_before)
        logger.logkv('LossAfter', loss_after)
def log_diagnostics(self, paths, prefix=''):
    progs = [np.mean(path["env_infos"]["reward_forward"]) for path in paths]
    ctrl_cost = [-np.mean(path["env_infos"]["reward_ctrl"]) for path in paths]

    logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs))
    logger.logkv(prefix + 'MaxForwardReturn', np.max(progs))
    logger.logkv(prefix + 'MinForwardReturn', np.min(progs))
    logger.logkv(prefix + 'StdForwardReturn', np.std(progs))
    logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost))
def log_diagnostics(self, paths, prefix=''):
    reach_rew = [path["env_infos"]['reachRew'] for path in paths]
    pick_rew = [path["env_infos"]['pickRew'][-1] for path in paths]
    place_rew = [path["env_infos"]['placeRew'] for path in paths]
    reach_dist = [path["env_infos"]['reachDist'] for path in paths]
    placing_dist = [path["env_infos"]['placingDist'] for path in paths]

    logger.logkv(prefix + 'AverageReachReward', np.mean(reach_rew))
    logger.logkv(prefix + 'AveragePickReward', np.mean(pick_rew))
    logger.logkv(prefix + 'AveragePlaceReward', np.mean(place_rew))
    logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist))
    logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist))
def train(self):
    for i in range(1, self.eff + 1):
        with self.sess.as_default() as sess:
            logger.log("----------- Adaptation rollouts per meta-task = ", i, " -----------")
            # self.sampler.rollouts_per_meta_task = 10000
            self.sampler.update_batch_size(i)

            # initialize uninitialized vars (only initialize vars that were not loaded)
            uninit_vars = [var for var in tf.global_variables()
                           if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))

            self.task = self.env.sample_tasks(self.sampler.meta_batch_size, is_eval=True)
            self.sampler.set_tasks(self.task)

            # logger.log("\n ---------------- Iteration %d ----------------" % itr)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            """ -------------------- Sampling --------------------------"""
            logger.log("Obtaining samples...")
            paths = self.sampler.obtain_samples(log=True, log_prefix='train-')

            """ ----------------- Processing Samples ---------------------"""
            logger.log("Processing samples...")
            samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='train-')
            self.log_diagnostics(sum(paths.values(), []), prefix='train-')

            # """ ------------------ Policy Update ---------------------"""
            # logger.log("Optimizing policy...")
            # # This needs to take all samples_data so that it can construct graph for meta-optimization.
            # time_optimization_step_start = time.time()
            # self.algo.optimize_policy(samples_data)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)

            # logger.log("Saving snapshot...")
            # params = self.get_itr_snapshot(itr)
            # logger.save_itr_params(itr, params)
            # logger.log("Saved")

            logger.dumpkvs()
            # if itr == 0:
            #     sess.graph.finalize()

    logger.log("Training finished")
    self.sess.close()
def optimize_policy(self, all_samples_data, log=True): """ Performs MAML outer step Args: all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task log (bool) : whether to log statistics Returns: None """ meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys) # add kl_coeffs / clip_eps to meta_op_input_dict meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff meta_op_input_dict['clip_eps'] = self.clip_eps if log: logger.log("Optimizing") loss_before = self.optimizer.optimize(input_val_dict=meta_op_input_dict) if log: logger.log("Computing statistics") loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(input_val_dict=meta_op_input_dict) if self.adaptive_inner_kl_penalty: if log: logger.log("Updating inner KL loss coefficients") self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff, inner_kls, self.target_inner_step) if log: logger.logkv('LossBefore', loss_before) logger.logkv('LossAfter', loss_after) logger.logkv('KLInner', np.mean(inner_kls)) logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
def optimize_policy(self, all_samples_data, mod_samples_data, num_paths_per_rollout, log=True):
    """
    Performs MAML outer step

    Args:
        all_samples_data (list) : list of lists of lists of samples (each is a dict)
            split by gradient update and meta task
        log (bool) : whether to log statistics

    Returns:
        None
    """
    meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)
    extra_feed_dict = {
        self.policy.mod_input_var: mod_samples_data,
        self.policy.num_paths_var: num_paths_per_rollout,
    }

    # add kl_coeffs / clip_eps to meta_op_input_dict
    meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff
    meta_op_input_dict['clip_eps'] = self.clip_eps

    if log:
        logger.log("Optimizing")
    loss_before, grad_norms = self.optimizer.optimize(
        input_val_dict=meta_op_input_dict, extra_feed_dict=extra_feed_dict)

    if self.summary_writer is not None:
        for name, norm in grad_norms.items():
            tensorboard_util.log_scalar(self.summary_writer, 'grads/' + name, norm, self.log_step)
        self.log_step += 1

    if log:
        logger.log("Computing statistics")
    loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(
        input_val_dict=meta_op_input_dict, extra_feed_dict=extra_feed_dict)

    if self.adaptive_inner_kl_penalty:
        if log:
            logger.log("Updating inner KL loss coefficients")
        self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff, inner_kls, self.target_inner_step)

    if log:
        logger.logkv('LossBefore', loss_before)
        logger.logkv('LossAfter', loss_after)
        logger.logkv('KLInner', np.mean(inner_kls))
        logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
def log_diagnostics(self, paths, prefix=''):
    fwrd_vel = [path["env_infos"]['forward_vel'] for path in paths]
    final_fwrd_vel = [path["env_infos"]['forward_vel'][-1] for path in paths]
    ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths]

    logger.logkv(prefix + 'AvgForwardVel', np.mean(fwrd_vel))
    logger.logkv(prefix + 'AvgFinalForwardVel', np.mean(final_fwrd_vel))
    logger.logkv(prefix + 'AvgCtrlCost', np.mean(ctrl_cost))  # mean (not std) to match the key name
def log_diagnostics(self, paths, prefix=''):
    reach_dist = [path["env_infos"]['reachDist'] for path in paths]
    placing_dist = [path["env_infos"]['placeDist'] for path in paths]
    cos_dist = [path["env_infos"]['cosDist'] for path in paths]

    logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist))
    logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist))
    logger.logkv(prefix + 'AverageCosDistance', np.mean(cos_dist))
def _log_path_stats(self, paths, log=False, log_prefix=''):
    # compute log stats
    average_discounted_return = [sum(path["discounted_rewards"]) for path in paths]
    undiscounted_returns = [sum(path["rewards"]) for path in paths]

    if log == 'reward':
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))

    elif log == 'all' or log is True:
        logger.logkv(log_prefix + 'AverageDiscountedReturn', np.mean(average_discounted_return))
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
        logger.logkv(log_prefix + 'NumTrajs', len(paths))
        logger.logkv(log_prefix + 'StdReturn', np.std(undiscounted_returns))
        logger.logkv(log_prefix + 'MaxReturn', np.max(undiscounted_returns))
        logger.logkv(log_prefix + 'MinReturn', np.min(undiscounted_returns))
def train(self): """ Trains policy on env using algo Pseudocode: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [ var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var)) ] sess.run(tf.variables_initializer(uninit_vars)) start_time = time.time() for itr in range(self.start_itr, self.n_itr): self.task = self.env.sample_tasks(self.sampler.meta_batch_size) self.sampler.set_tasks(self.task) itr_start_time = time.time() logger.log( "\n ---------------- Iteration %d ----------------" % itr) logger.log( "Sampling set of tasks/goals for this meta-batch...") """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples...") time_env_sampling_start = time.time() paths = self.sampler.obtain_samples(log=True, log_prefix='train-') sampling_time = time.time() - time_env_sampling_start """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples...") time_proc_samples_start = time.time() samples_data = self.sample_processor.process_samples( paths, log='all', log_prefix='train-') proc_samples_time = time.time() - time_proc_samples_start self.log_diagnostics(sum(paths.values(), []), prefix='train-') """ ------------------ Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_optimization_step_start = time.time() self.algo.optimize_policy(samples_data) """ ------------------ Test-split Performance for logging ---------------------""" logger.log("Testing on test-tasks split for logging...") sampler_batch_size = self.sampler.batch_size self.sampler.update_batch_size(3) ####################2 undiscounted_returns = [] for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size): # Caution: Here actually i in [0] since self.meta_batch_size=100(when running on linux) self.sampler.update_tasks( test=True, start_from=i) # sample from test split! #self.policy.switch_to_pre_update() # Switch to pre-update policy logger.log("On Test: Obtaining samples...") paths = self.sampler.obtain_samples( log=False, test=True) # log_prefix='test-Step_%d-' % step logger.log("On Test: Processing Samples...") self.log_diagnostics(sum(list(paths.values()), []), prefix='test-') """ ------------------- Logging Returns --------------------""" paths = self.sample_processor.gao_paths(paths) undiscounted_returns.extend( [sum(path["rewards"]) for path in paths]) test_average_return = np.mean(undiscounted_returns) self.sampler.update_batch_size(sampler_batch_size) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) logger.logkv('test-AverageReturn', test_average_return) logger.logkv('Time-Optimization', time.time() - time_optimization_step_start) logger.logkv('Time-SampleProc', np.sum(proc_samples_time)) logger.logkv('Time-Sampling', sampling_time) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() if itr == 0: sess.graph.finalize() logger.log("Training finished") self.sess.close()
def _log_path_stats(self, paths, log=False, log_prefix='', experiment=None):
    # compute log stats
    average_discounted_return = np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [sum(path["rewards"]) for path in paths]
    # average_vel = np.mean([path["env_infos"]["forward_vel"] for path in paths])

    if log == 'reward':
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))

    elif log == 'all' or log is True:
        logger.logkv(log_prefix + 'AverageDiscountedReturn', average_discounted_return)
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
        logger.logkv(log_prefix + 'NumTrajs', len(paths))
        logger.logkv(log_prefix + 'StdReturn', np.std(undiscounted_returns))
        logger.logkv(log_prefix + 'MaxReturn', np.max(undiscounted_returns))
        logger.logkv(log_prefix + 'MinReturn', np.min(undiscounted_returns))

    if experiment:
        # experiment.log_metric("Average velocity", average_vel)
        experiment.log_metric("MaxReturn", np.max(undiscounted_returns))
        experiment.log_metric("MinReturn", np.min(undiscounted_returns))
        experiment.log_metric('AverageReturn', np.mean(undiscounted_returns))
        experiment.log_metric('StdReturn', np.std(undiscounted_returns))
        experiment.log_metric('AverageDiscountedReturn', average_discounted_return)
def train(self): """ Trains policy on env using algo Pseudocode:: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] sess.run(tf.variables_initializer(uninit_vars)) start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log("\n ---------------- Iteration %d ----------------" % itr) logger.log("Sampling set of tasks/goals for this meta-batch...") #self.sampler.update_tasks() self.policy.switch_to_pre_update() # Switch to pre-update policy all_samples_data, all_paths = [], [] list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], [] start_total_inner_time = time.time() for step in range(self.num_inner_grad_steps+1): logger.log('** Step ' + str(step) + ' **') """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples...") time_env_sampling_start = time.time() paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) list_sampling_time.append(time.time() - time_env_sampling_start) all_paths.append(paths) """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples...") time_proc_samples_start = time.time() samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step) all_samples_data.append(samples_data) list_proc_samples_time.append(time.time() - time_proc_samples_start) self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) """ ------------------- Inner Policy Update --------------------""" time_inner_step_start = time.time() if step < self.num_inner_grad_steps: logger.log("Computing inner policy updates...") self.algo._adapt(samples_data) # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph', # sess.graph) list_inner_step_time.append(time.time() - time_inner_step_start) total_inner_time = time.time() - start_total_inner_time time_maml_opt_start = time.time() """ ------------------ Outer Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_outer_step_start = time.time() self.algo.optimize_policy(all_samples_data) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) #writer.add_scalar(self.algo.name, self.sample_processor.AR, self.sampler.total_timesteps_sampled) logger.logkv('Time-OuterStep', time.time() - time_outer_step_start) logger.logkv('Time-TotalInner', total_inner_time) logger.logkv('Time-InnerStep', np.sum(list_inner_step_time)) logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time)) logger.logkv('Time-Sampling', np.sum(list_sampling_time)) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() logger.log("Training finished") self.sess.close()
def train(self):
    policy_0 = self.policy
    for i in [4, 3, 2, 1]:  # range(1, self.eff+1):
        print("On", i, "self.policy == policy_0: ", self.policy == policy_0)
        with self.sess.as_default() as sess:
            logger.log("----------- Adaptation rollouts per meta-task = ", i, " -----------")
            undiscounted_returns = []
            for j in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size):
                logger.log("---------Testing on task", j, "~", j + self.sampler.meta_batch_size - 1, "---------")

                # initialize uninitialized vars (only initialize vars that were not loaded)
                # uninit_vars = [var for var in tf.global_variables() if
                #                not sess.run(tf.is_variable_initialized(var))]
                # sess.run(tf.variables_initializer(uninit_vars))
                uninit_vars = [var for var in tf.global_variables()]
                sess.run(tf.variables_initializer(uninit_vars))

                logger.log("Sampling set of tasks/goals for this meta-batch...")
                self.sampler.update_tasks(test=True, start_from=j)  # sample from test split!
                self.policy.switch_to_pre_update()  # Switch to pre-update policy

                for step in range(self.num_inner_grad_steps + 1):
                    if step < self.num_inner_grad_steps:
                        self.sampler.update_batch_size_v2(i)
                        logger.log("On step-0: Obtaining samples...")
                    else:
                        self.sampler.update_batch_size(2)
                        logger.log("On step-1: Obtaining samples...")
                    paths = self.sampler.obtain_samples(log=False, test=True)  # log_prefix='test-Step_%d-' % step

                    logger.log("On Test: Processing Samples...")
                    samples_data = self.sample_processor.process_samples(paths, log=False)  # log='all', log_prefix='test-Step_%d-' % step
                    self.log_diagnostics(sum(list(paths.values()), []), prefix='test-Step_%d-' % step)

                    """ ------------------- Inner Policy Update / logging returns --------------------"""
                    if step < self.num_inner_grad_steps:
                        logger.log("On Test: Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    else:
                        paths = self.sample_processor.gao_paths(paths)
                        undiscounted_returns.extend([sum(path["rewards"]) for path in paths])

            test_average_return = np.mean(undiscounted_returns)
            logger.logkv('x', i)
            logger.logkv('return', test_average_return)
            logger.dumpkvs()
            logger.log("------Testing rollouts per meta-task = ", i, "finished------")
def obtain_samples(self, log=False, log_prefix='', test=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str) : prefix for logger

    Returns:
        (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    print("--------------obtaining", self.total_samples // self.meta_batch_size // self.max_path_length,
          "rollouts_per_task, for", self.meta_batch_size, "tasks..--------------")

    # initial setup / preparation
    paths = OrderedDict()
    for i in range(self.meta_batch_size):
        paths[i] = []

    n_samples = 0
    running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)]
    print(" running_paths length:", len(running_paths))

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True] * self.meta_batch_size)

    # initial reset of envs
    obses = self.vec_env.reset()

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
        actions, agent_infos = policy.get_actions(obs_per_task)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        actions = np.concatenate(actions)  # stack meta batch
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths[idx // self.envs_per_task].append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        pbar.update(new_samples)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    if not test:
        self.total_timesteps_sampled += self.total_samples
        print("------------self.total_timesteps_sampled:", self.total_timesteps_sampled, "-----------------")
    else:
        print("------------tested on:", self.total_samples // self.max_path_length, " rollouts-----------------")

    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
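# Note: obtain_samples above relies on the module-level helper _get_empty_running_paths_dict,
# which is not shown in this section. A minimal sketch consistent with how the running paths
# are filled (one growing list per field); the actual helper in the repository may differ.
def _get_empty_running_paths_dict():
    return dict(observations=[], actions=[], rewards=[], dones=[],
                env_infos=[], agent_infos=[])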
def train(self): """ Trains policy on env using algo Pseudocode:: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [ var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var)) ] sess.run(tf.variables_initializer(uninit_vars)) start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log( "\n ---------------- Iteration %d ----------------" % itr) logger.log( "Sampling set of tasks/goals for this meta-batch...") self.sampler.update_tasks() # sample tasks! self.policy.switch_to_pre_update( ) # Switch to pre-update policy all_samples_data, all_paths = [], [] list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], [] start_total_inner_time = time.time() for step in range(self.num_inner_grad_steps + 1): logger.log('** Step ' + str(step) + ' **') """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples...") time_env_sampling_start = time.time() ''' if step == self.num_inner_grad_steps: temp = self.sampler.batch_size self.sampler.update_batch_size(2) paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) self.sampler.update_batch_size(temp) else: paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) ''' paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) list_sampling_time.append(time.time() - time_env_sampling_start) all_paths.append(paths) """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples...") time_proc_samples_start = time.time() samples_data = self.sample_processor.process_samples( paths, log='all', log_prefix='Step_%d-' % step) all_samples_data.append(samples_data) list_proc_samples_time.append(time.time() - time_proc_samples_start) self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) """ ------------------- Inner Policy Update --------------------""" time_inner_step_start = time.time() if step < self.num_inner_grad_steps: logger.log("Computing inner policy updates...") self.algo._adapt(samples_data) # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph', # sess.graph) list_inner_step_time.append(time.time() - time_inner_step_start) total_inner_time = time.time() - start_total_inner_time time_maml_opt_start = time.time() """ ------------------ Outer Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_outer_step_start = time.time() self.algo.optimize_policy(all_samples_data) """ ------------------ Test-split Performance for logging ---------------------""" logger.log( "Testing on test-tasks split for logging, rollout_per_task = 20..." ) undiscounted_returns = [] for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size): self.sampler.update_tasks( test=True, start_from=i) # sample from test split! 
self.policy.switch_to_pre_update( ) # Switch to pre-update policy for step in range(self.num_inner_grad_steps + 1): logger.log("On Test: Obtaining samples...") paths = self.sampler.obtain_samples( log=False, test=True) # log_prefix='test-Step_%d-' % step logger.log("On Test: Processing Samples...") samples_data = self.sample_processor.process_samples( paths, log=False ) # log='all', log_prefix='test-Step_%d-' % step self.log_diagnostics(sum(list(paths.values()), []), prefix='test20-Step_%d-' % step) """ ------------------- Inner Policy Update / logging returns --------------------""" if step < self.num_inner_grad_steps: logger.log( "On Test: Computing inner policy updates...") self.algo._adapt(samples_data) else: paths = self.sample_processor.gao_paths(paths) undiscounted_returns.extend( [sum(path["rewards"]) for path in paths]) test_average_return = np.mean(undiscounted_returns) logger.logkv('test20-AverageReturn', test_average_return) logger.log( "Testing on test-tasks split for logging, rollout_per_task = 2..." ) sampler_batch_size = self.sampler.batch_size self.sampler.update_batch_size(2) ############## undiscounted_returns = [] for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size): self.sampler.update_tasks( test=True, start_from=i) # sample from test split! self.policy.switch_to_pre_update( ) # Switch to pre-update policy for step in range(self.num_inner_grad_steps + 1): logger.log("On Test: Obtaining samples...") paths = self.sampler.obtain_samples( log=False, test=True) # log_prefix='test-Step_%d-' % step logger.log("On Test: Processing Samples...") samples_data = self.sample_processor.process_samples( paths, log=False ) # log='all', log_prefix='test-Step_%d-' % step self.log_diagnostics(sum(list(paths.values()), []), prefix='test-Step_%d-' % step) """ ------------------- Inner Policy Update / logging returns --------------------""" if step < self.num_inner_grad_steps: logger.log( "On Test: Computing inner policy updates...") self.algo._adapt(samples_data) else: paths = self.sample_processor.gao_paths(paths) undiscounted_returns.extend( [sum(path["rewards"]) for path in paths]) test_average_return = np.mean(undiscounted_returns) self.sampler.update_batch_size(sampler_batch_size) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) logger.logkv('test-AverageReturn', test_average_return) logger.logkv('Time-OuterStep', time.time() - time_outer_step_start) logger.logkv('Time-TotalInner', total_inner_time) logger.logkv('Time-InnerStep', np.sum(list_inner_step_time)) logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time)) logger.logkv('Time-Sampling', np.sum(list_sampling_time)) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() logger.log("Training finished") self.sess.close()
def _log_path_stats(self, paths, log=False, log_prefix=''): # compute log stats average_discounted_return = np.mean( [path["returns"][0] for path in paths]) undiscounted_returns = [sum(path["rewards"]) for path in paths] if log == 'reward': logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns)) if 'Test' in log_prefix: logger.logkv('AverageReturn_all_test_tasks_last', np.mean(undiscounted_returns)) elif log == 'all' or log is True: logger.logkv(log_prefix + 'AverageDiscountedReturn', average_discounted_return) logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns)) logger.logkv(log_prefix + 'NumTrajs', len(paths)) logger.logkv(log_prefix + 'StdReturn', np.std(undiscounted_returns)) logger.logkv(log_prefix + 'MaxReturn', np.max(undiscounted_returns)) logger.logkv(log_prefix + 'MinReturn', np.min(undiscounted_returns))
def _log_path_stats(self, paths, log=False, log_prefix='', meta_batch_size=0):
    # compute log stats
    average_discounted_return = np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [sum(path["rewards"]) for path in paths]

    if log == 'reward':
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))

    elif log == 'all' or log is True:
        logger.logkv(log_prefix + 'AverageDiscountedReturn', average_discounted_return)
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
        '''
        logger.logkv(log_prefix + 'AverageReturn-2', np.mean(undiscounted_returns[-2 * meta_batch_size:]))
        # will take undiscounted[-meta_batch_size:] when rollouts_per_meta_task < 2
        logger.log("AverageReturn-2 is estimated by the last 2 trajectories...")
        '''
        logger.logkv(log_prefix + 'NumTrajs', len(paths))
        logger.logkv(log_prefix + 'StdReturn', np.std(undiscounted_returns))
        logger.logkv(log_prefix + 'MaxReturn', np.max(undiscounted_returns))
        logger.logkv(log_prefix + 'MinReturn', np.min(undiscounted_returns))
def train(self): """ Trains policy on env using algo Pseudocode:: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [ var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var)) ] sess.run(tf.variables_initializer(uninit_vars)) n_timesteps = 0 start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log( "\n ---------------- Iteration %d ----------------" % itr) gradients = [] for i in range(self.num_sapling_rounds): logger.log("\n ----- Sampling Round %d ---" % i) dry = i < self.num_sapling_rounds - 1 if not dry: self.sampler.update_tasks() self.policy.switch_to_pre_update( ) # Switch to pre-update policy all_samples_data, all_paths = [], [] for step in range(self.num_inner_grad_steps + 1): logger.log('** Step ' + str(step) + ' **') logger.log("Obtaining samples...") paths = self.sampler.obtain_samples( log=True, log_prefix='Step_%d-' % step) all_paths.append(paths) logger.log("Processing samples...") samples_data = self.sample_processor.process_samples( paths, log='all', log_prefix='Step_%d-' % step) all_samples_data.append(samples_data) if not dry: self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) if step < self.num_inner_grad_steps: logger.log("Computing inner policy updates...") self.algo._adapt(samples_data) """ compute gradients """ gradients.append( self.algo.compute_gradients(all_samples_data)) if not dry: """ ------------ Compute and log gradient variance ------------""" # compute variance of adaptation gradients for step_id in range(self.num_inner_grad_steps): meta_batch_size = len(gradients[0][0]) grad_std, grad_rstd = [], [] for task_id in range(meta_batch_size): stacked_grads = np.stack([ gradients[round_id][step_id][task_id] for round_id in range(self.num_sapling_rounds) ], axis=1) std = np.std(stacked_grads, axis=1) mean = np.abs(np.mean(stacked_grads, axis=1)) grad_std.append(np.mean(std)) grad_rstd.append(np.mean(std / mean)) logger.logkv('Step_%i-GradientMean', np.mean(mean)) logger.logkv('Step_%i-GradientStd' % step_id, np.mean(grad_std)) logger.logkv('Step_%i-GradientRStd' % step_id, np.mean(grad_rstd)) # compute variance of meta gradients stacked_grads = np.stack([ gradients[round_id][self.num_inner_grad_steps] for round_id in range(self.num_sapling_rounds) ], axis=1) std = np.std(stacked_grads, axis=1) mean = np.abs(np.mean(stacked_grads, axis=1)) meta_grad_std = np.mean(std) meta_grad_rstd = np.mean(std / (mean + 1e-8)) meta_grad_rvar = np.mean(std**2 / (mean + 1e-8)) logger.logkv('Meta-GradientMean', np.mean(mean)) logger.logkv('Meta-GradientStd', meta_grad_std) logger.logkv('Meta-GradientRStd', meta_grad_rstd) logger.logkv('Meta-GradientRVariance', meta_grad_rvar) # compute cosine dists cosine_dists = cdist(np.transpose(stacked_grads), np.transpose( np.mean(stacked_grads, axis=1).reshape( (-1, 1))), metric='cosine') mean_abs_cos_dist = np.mean(np.abs(cosine_dists)) mean_squared_cosine_dists = np.mean(cosine_dists**2) mean_squared_cosine_dists_sqrt = np.sqrt( mean_squared_cosine_dists) logger.logkv('Meta-GradientCosAbs', mean_abs_cos_dist) logger.logkv('Meta-GradientCosVar', mean_squared_cosine_dists) logger.logkv('Meta-GradientCosStd', mean_squared_cosine_dists_sqrt) """ ------------------ Outer Policy Update ---------------------""" 
logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. self.algo.optimize_policy(all_samples_data) """ ------------------- Logging Stuff --------------------------""" n_timesteps += (self.num_inner_grad_steps + 1) * self.sampler.total_samples logger.logkv('n_timesteps', n_timesteps) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) # , **kwargs) logger.save_itr_params(itr, params) logger.log("Saved") logger.logkv('Itr', itr) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.dumpkvs() logger.log("Training finished") self.sess.close()
def _log_path_stats(self, paths, log=False, log_prefix=''):
    # compute log stats
    average_discounted_return = np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [sum(path["rewards"]) for path in paths]

    if log == 'reward':
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))

    elif log == 'all' or log is True:
        logger.logkv(log_prefix + 'AverageDiscountedReturn', average_discounted_return)
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
        logger.logkv(log_prefix + 'NumTrajs', len(paths))
        logger.logkv(log_prefix + 'StdReturn', np.std(undiscounted_returns))
        logger.logkv(log_prefix + 'MaxReturn', np.max(undiscounted_returns))
        logger.logkv(log_prefix + 'MinReturn', np.min(undiscounted_returns))

    if 'success' in paths[0]['env_infos']:
        successes = [path['env_infos']['success'].any() for path in paths]
        logger.logkv(log_prefix + 'SuccessRate', np.mean(successes))