def train(self):
    for i in range(1, self.eff + 1):
        with self.sess.as_default() as sess:
            logger.log("----------- Adaptation rollouts per meta-task = %d -----------" % i)
            # self.sampler.rollouts_per_meta_task = 10000
            self.sampler.update_batch_size(i)

            # initialize uninitialized vars (only initialize vars that were not loaded)
            uninit_vars = [var for var in tf.global_variables()
                           if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))

            self.task = self.env.sample_tasks(self.sampler.meta_batch_size, is_eval=True)
            self.sampler.set_tasks(self.task)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            """ -------------------- Sampling --------------------------"""
            logger.log("Obtaining samples...")
            paths = self.sampler.obtain_samples(log=True, log_prefix='train-')

            """ ----------------- Processing Samples ---------------------"""
            logger.log("Processing samples...")
            samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='train-')
            self.log_diagnostics(sum(paths.values(), []), prefix='train-')

            # (policy optimization and snapshotting are intentionally skipped in
            # this evaluation-only variant)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
            logger.dumpkvs()

    logger.log("Training finished")
    self.sess.close()
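# The loop above drives evaluation through sampler.update_batch_size(i). A
# minimal sketch of what such a setter could look like; the attribute names
# (rollouts_per_meta_task, batch_size) are assumptions for illustration, not
# taken from this file:
def update_batch_size(self, rollouts_per_meta_task):
    """Set how many adaptation rollouts are collected per meta-task."""
    self.rollouts_per_meta_task = rollouts_per_meta_task
    self.batch_size = rollouts_per_meta_task  # rollouts per task per sampling call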
def train(self): """ Trains policy on env using algo Pseudocode: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [ var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var)) ] sess.run(tf.variables_initializer(uninit_vars)) start_time = time.time() for itr in range(self.start_itr, self.n_itr): self.task = self.env.sample_tasks(self.sampler.meta_batch_size) self.sampler.set_tasks(self.task) itr_start_time = time.time() logger.log( "\n ---------------- Iteration %d ----------------" % itr) logger.log( "Sampling set of tasks/goals for this meta-batch...") """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples...") time_env_sampling_start = time.time() paths = self.sampler.obtain_samples(log=True, log_prefix='train-') sampling_time = time.time() - time_env_sampling_start """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples...") time_proc_samples_start = time.time() samples_data = self.sample_processor.process_samples( paths, log='all', log_prefix='train-') proc_samples_time = time.time() - time_proc_samples_start self.log_diagnostics(sum(paths.values(), []), prefix='train-') """ ------------------ Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_optimization_step_start = time.time() self.algo.optimize_policy(samples_data) """ ------------------ Test-split Performance for logging ---------------------""" logger.log("Testing on test-tasks split for logging...") sampler_batch_size = self.sampler.batch_size self.sampler.update_batch_size(3) ####################2 undiscounted_returns = [] for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size): # Caution: Here actually i in [0] since self.meta_batch_size=100(when running on linux) self.sampler.update_tasks( test=True, start_from=i) # sample from test split! #self.policy.switch_to_pre_update() # Switch to pre-update policy logger.log("On Test: Obtaining samples...") paths = self.sampler.obtain_samples( log=False, test=True) # log_prefix='test-Step_%d-' % step logger.log("On Test: Processing Samples...") self.log_diagnostics(sum(list(paths.values()), []), prefix='test-') """ ------------------- Logging Returns --------------------""" paths = self.sample_processor.gao_paths(paths) undiscounted_returns.extend( [sum(path["rewards"]) for path in paths]) test_average_return = np.mean(undiscounted_returns) self.sampler.update_batch_size(sampler_batch_size) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) logger.logkv('test-AverageReturn', test_average_return) logger.logkv('Time-Optimization', time.time() - time_optimization_step_start) logger.logkv('Time-SampleProc', np.sum(proc_samples_time)) logger.logkv('Time-Sampling', sampling_time) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() if itr == 0: sess.graph.finalize() logger.log("Training finished") self.sess.close()
def train(self): """ Trains policy on env using algo Pseudocode:: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] sess.run(tf.variables_initializer(uninit_vars)) start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log("\n ---------------- Iteration %d ----------------" % itr) logger.log("Sampling set of tasks/goals for this meta-batch...") #self.sampler.update_tasks() self.policy.switch_to_pre_update() # Switch to pre-update policy all_samples_data, all_paths = [], [] list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], [] start_total_inner_time = time.time() for step in range(self.num_inner_grad_steps+1): logger.log('** Step ' + str(step) + ' **') """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples...") time_env_sampling_start = time.time() paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) list_sampling_time.append(time.time() - time_env_sampling_start) all_paths.append(paths) """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples...") time_proc_samples_start = time.time() samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step) all_samples_data.append(samples_data) list_proc_samples_time.append(time.time() - time_proc_samples_start) self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) """ ------------------- Inner Policy Update --------------------""" time_inner_step_start = time.time() if step < self.num_inner_grad_steps: logger.log("Computing inner policy updates...") self.algo._adapt(samples_data) # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph', # sess.graph) list_inner_step_time.append(time.time() - time_inner_step_start) total_inner_time = time.time() - start_total_inner_time time_maml_opt_start = time.time() """ ------------------ Outer Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_outer_step_start = time.time() self.algo.optimize_policy(all_samples_data) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) #writer.add_scalar(self.algo.name, self.sample_processor.AR, self.sampler.total_timesteps_sampled) logger.logkv('Time-OuterStep', time.time() - time_outer_step_start) logger.logkv('Time-TotalInner', total_inner_time) logger.logkv('Time-InnerStep', np.sum(list_inner_step_time)) logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time)) logger.logkv('Time-Sampling', np.sum(list_sampling_time)) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() logger.log("Training finished") self.sess.close()
def train(self):
    policy_0 = self.policy
    for i in [4, 3, 2, 1]:  # range(1, self.eff + 1)
        print("On", i, "self.policy == policy_0:", self.policy == policy_0)
        with self.sess.as_default() as sess:
            logger.log("----------- Adaptation rollouts per meta-task = %d -----------" % i)
            undiscounted_returns = []
            for j in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size):
                logger.log("--------- Testing on tasks %d ~ %d ---------"
                           % (j, j + self.sampler.meta_batch_size - 1))

                # Re-initialize *all* variables so each setting of i starts from
                # the freshly loaded pre-update parameters rather than adapted
                # ones. (The commented variant below would only initialize
                # variables that were not loaded.)
                # uninit_vars = [var for var in tf.global_variables()
                #                if not sess.run(tf.is_variable_initialized(var))]
                uninit_vars = [var for var in tf.global_variables()]
                sess.run(tf.variables_initializer(uninit_vars))

                logger.log("Sampling set of tasks/goals for this meta-batch...")
                self.sampler.update_tasks(test=True, start_from=j)  # sample from the test split
                self.policy.switch_to_pre_update()  # switch to the pre-update policy

                for step in range(self.num_inner_grad_steps + 1):
                    if step < self.num_inner_grad_steps:
                        self.sampler.update_batch_size_v2(i)
                    else:
                        self.sampler.update_batch_size(2)
                    logger.log("On step-%d: Obtaining samples..." % step)
                    paths = self.sampler.obtain_samples(log=False, test=True)

                    logger.log("On Test: Processing Samples...")
                    samples_data = self.sample_processor.process_samples(paths, log=False)
                    self.log_diagnostics(sum(list(paths.values()), []), prefix='test-Step_%d-' % step)

                    """ ------------------- Inner Policy Update / logging returns --------------------"""
                    if step < self.num_inner_grad_steps:
                        logger.log("On Test: Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    else:
                        paths = self.sample_processor.gao_paths(paths)
                        undiscounted_returns.extend([sum(path["rewards"]) for path in paths])

            test_average_return = np.mean(undiscounted_returns)
            logger.logkv('x', i)  # adaptation rollouts per meta-task
            logger.logkv('return', test_average_return)
            logger.dumpkvs()
            logger.log("------ Testing rollouts per meta-task = %d finished ------" % i)
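# The full re-initialization above (initializing *all* globals, not only the
# uninitialized ones) is what wipes adapted weights between settings of i. A
# tiny self-contained TF1 illustration of that reset behavior:
import numpy as np
import tensorflow as tf

with tf.Graph().as_default(), tf.Session() as s:
    v = tf.get_variable('v', initializer=0.0)
    s.run(tf.variables_initializer([v]))
    s.run(v.assign(3.0))  # stand-in for an adaptation step mutating the weights
    s.run(tf.variables_initializer(tf.global_variables()))  # full reset
    assert np.isclose(s.run(v), 0.0)  # back to the initial value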
def train(self): """ Trains policy on env using algo Pseudocode:: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [ var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var)) ] sess.run(tf.variables_initializer(uninit_vars)) n_timesteps = 0 start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log( "\n ---------------- Iteration %d ----------------" % itr) gradients = [] for i in range(self.num_sapling_rounds): logger.log("\n ----- Sampling Round %d ---" % i) dry = i < self.num_sapling_rounds - 1 if not dry: self.sampler.update_tasks() self.policy.switch_to_pre_update( ) # Switch to pre-update policy all_samples_data, all_paths = [], [] for step in range(self.num_inner_grad_steps + 1): logger.log('** Step ' + str(step) + ' **') logger.log("Obtaining samples...") paths = self.sampler.obtain_samples( log=True, log_prefix='Step_%d-' % step) all_paths.append(paths) logger.log("Processing samples...") samples_data = self.sample_processor.process_samples( paths, log='all', log_prefix='Step_%d-' % step) all_samples_data.append(samples_data) if not dry: self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) if step < self.num_inner_grad_steps: logger.log("Computing inner policy updates...") self.algo._adapt(samples_data) """ compute gradients """ gradients.append( self.algo.compute_gradients(all_samples_data)) if not dry: """ ------------ Compute and log gradient variance ------------""" # compute variance of adaptation gradients for step_id in range(self.num_inner_grad_steps): meta_batch_size = len(gradients[0][0]) grad_std, grad_rstd = [], [] for task_id in range(meta_batch_size): stacked_grads = np.stack([ gradients[round_id][step_id][task_id] for round_id in range(self.num_sapling_rounds) ], axis=1) std = np.std(stacked_grads, axis=1) mean = np.abs(np.mean(stacked_grads, axis=1)) grad_std.append(np.mean(std)) grad_rstd.append(np.mean(std / mean)) logger.logkv('Step_%i-GradientMean', np.mean(mean)) logger.logkv('Step_%i-GradientStd' % step_id, np.mean(grad_std)) logger.logkv('Step_%i-GradientRStd' % step_id, np.mean(grad_rstd)) # compute variance of meta gradients stacked_grads = np.stack([ gradients[round_id][self.num_inner_grad_steps] for round_id in range(self.num_sapling_rounds) ], axis=1) std = np.std(stacked_grads, axis=1) mean = np.abs(np.mean(stacked_grads, axis=1)) meta_grad_std = np.mean(std) meta_grad_rstd = np.mean(std / (mean + 1e-8)) meta_grad_rvar = np.mean(std**2 / (mean + 1e-8)) logger.logkv('Meta-GradientMean', np.mean(mean)) logger.logkv('Meta-GradientStd', meta_grad_std) logger.logkv('Meta-GradientRStd', meta_grad_rstd) logger.logkv('Meta-GradientRVariance', meta_grad_rvar) # compute cosine dists cosine_dists = cdist(np.transpose(stacked_grads), np.transpose( np.mean(stacked_grads, axis=1).reshape( (-1, 1))), metric='cosine') mean_abs_cos_dist = np.mean(np.abs(cosine_dists)) mean_squared_cosine_dists = np.mean(cosine_dists**2) mean_squared_cosine_dists_sqrt = np.sqrt( mean_squared_cosine_dists) logger.logkv('Meta-GradientCosAbs', mean_abs_cos_dist) logger.logkv('Meta-GradientCosVar', mean_squared_cosine_dists) logger.logkv('Meta-GradientCosStd', mean_squared_cosine_dists_sqrt) """ ------------------ Outer Policy Update ---------------------""" 
logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. self.algo.optimize_policy(all_samples_data) """ ------------------- Logging Stuff --------------------------""" n_timesteps += (self.num_inner_grad_steps + 1) * self.sampler.total_samples logger.logkv('n_timesteps', n_timesteps) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) # , **kwargs) logger.save_itr_params(itr, params) logger.log("Saved") logger.logkv('Itr', itr) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.dumpkvs() logger.log("Training finished") self.sess.close()
def train(self): """ Trains policy on env using algo Pseudocode:: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [ var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var)) ] sess.run(tf.variables_initializer(uninit_vars)) start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log( "\n ---------------- Iteration %d ----------------" % itr) logger.log( "Sampling set of tasks/goals for this meta-batch...") self.sampler.update_tasks() # sample tasks! self.policy.switch_to_pre_update( ) # Switch to pre-update policy all_samples_data, all_paths = [], [] list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], [] start_total_inner_time = time.time() for step in range(self.num_inner_grad_steps + 1): logger.log('** Step ' + str(step) + ' **') """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples...") time_env_sampling_start = time.time() ''' if step == self.num_inner_grad_steps: temp = self.sampler.batch_size self.sampler.update_batch_size(2) paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) self.sampler.update_batch_size(temp) else: paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) ''' paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) list_sampling_time.append(time.time() - time_env_sampling_start) all_paths.append(paths) """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples...") time_proc_samples_start = time.time() samples_data = self.sample_processor.process_samples( paths, log='all', log_prefix='Step_%d-' % step) all_samples_data.append(samples_data) list_proc_samples_time.append(time.time() - time_proc_samples_start) self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) """ ------------------- Inner Policy Update --------------------""" time_inner_step_start = time.time() if step < self.num_inner_grad_steps: logger.log("Computing inner policy updates...") self.algo._adapt(samples_data) # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph', # sess.graph) list_inner_step_time.append(time.time() - time_inner_step_start) total_inner_time = time.time() - start_total_inner_time time_maml_opt_start = time.time() """ ------------------ Outer Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_outer_step_start = time.time() self.algo.optimize_policy(all_samples_data) """ ------------------ Test-split Performance for logging ---------------------""" logger.log( "Testing on test-tasks split for logging, rollout_per_task = 20..." ) undiscounted_returns = [] for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size): self.sampler.update_tasks( test=True, start_from=i) # sample from test split! 
self.policy.switch_to_pre_update( ) # Switch to pre-update policy for step in range(self.num_inner_grad_steps + 1): logger.log("On Test: Obtaining samples...") paths = self.sampler.obtain_samples( log=False, test=True) # log_prefix='test-Step_%d-' % step logger.log("On Test: Processing Samples...") samples_data = self.sample_processor.process_samples( paths, log=False ) # log='all', log_prefix='test-Step_%d-' % step self.log_diagnostics(sum(list(paths.values()), []), prefix='test20-Step_%d-' % step) """ ------------------- Inner Policy Update / logging returns --------------------""" if step < self.num_inner_grad_steps: logger.log( "On Test: Computing inner policy updates...") self.algo._adapt(samples_data) else: paths = self.sample_processor.gao_paths(paths) undiscounted_returns.extend( [sum(path["rewards"]) for path in paths]) test_average_return = np.mean(undiscounted_returns) logger.logkv('test20-AverageReturn', test_average_return) logger.log( "Testing on test-tasks split for logging, rollout_per_task = 2..." ) sampler_batch_size = self.sampler.batch_size self.sampler.update_batch_size(2) ############## undiscounted_returns = [] for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size): self.sampler.update_tasks( test=True, start_from=i) # sample from test split! self.policy.switch_to_pre_update( ) # Switch to pre-update policy for step in range(self.num_inner_grad_steps + 1): logger.log("On Test: Obtaining samples...") paths = self.sampler.obtain_samples( log=False, test=True) # log_prefix='test-Step_%d-' % step logger.log("On Test: Processing Samples...") samples_data = self.sample_processor.process_samples( paths, log=False ) # log='all', log_prefix='test-Step_%d-' % step self.log_diagnostics(sum(list(paths.values()), []), prefix='test-Step_%d-' % step) """ ------------------- Inner Policy Update / logging returns --------------------""" if step < self.num_inner_grad_steps: logger.log( "On Test: Computing inner policy updates...") self.algo._adapt(samples_data) else: paths = self.sample_processor.gao_paths(paths) undiscounted_returns.extend( [sum(path["rewards"]) for path in paths]) test_average_return = np.mean(undiscounted_returns) self.sampler.update_batch_size(sampler_batch_size) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) logger.logkv('test-AverageReturn', test_average_return) logger.logkv('Time-OuterStep', time.time() - time_outer_step_start) logger.logkv('Time-TotalInner', total_inner_time) logger.logkv('Time-InnerStep', np.sum(list_inner_step_time)) logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time)) logger.logkv('Time-Sampling', np.sum(list_sampling_time)) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() logger.log("Training finished") self.sess.close()
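# The 2-rollout test block above saves and restores sampler.batch_size by
# hand. A small context manager expressing the same save/restore pattern (a
# sketch, not part of the original codebase; it only assumes the batch_size
# attribute and update_batch_size method visible above):
from contextlib import contextmanager

@contextmanager
def eval_batch_size(sampler, size):
    """Temporarily change the sampler's rollouts-per-task, then restore it."""
    saved = sampler.batch_size
    sampler.update_batch_size(size)
    try:
        yield
    finally:
        sampler.update_batch_size(saved)

# usage sketch:
#     with eval_batch_size(self.sampler, 2):
#         ...run the test-split rollouts...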
for i in range(params['num_inner_grad_steps']):
    paths = sampler.obtain_samples(log=False)
    samples_data = sample_processor.process_samples(paths, log=True, log_prefix='%i_' % i)
    env.log_diagnostics(sum(list(paths.values()), []), prefix='%i_' % i)
    algo._adapt(samples_data)

paths = sampler.obtain_samples(log=False)
samples_data = sample_processor.process_samples(
    paths, log=True, log_prefix='%i_' % params['num_inner_grad_steps'])
env.log_diagnostics(sum(list(paths.values()), []),
                    prefix='%i_' % params['num_inner_grad_steps'])
logger.dumpkvs()

# Post-update: roll out the adapted policy on randomly chosen tasks.
images = []  # collected frames (currently left empty)
for _ in range(args.num_trajs):
    task_i = np.random.choice(range(params['meta_batch_size']))
    env.set_task(tasks[task_i])
    print(tasks[task_i])
    obs = env.reset()
    for _ in range(params['max_path_length']):
        action, _ = policy.get_action(obs, task_i)
        obs, reward, done, _ = env.step(action)
        time.sleep(0.001)
        if done:
            break
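# The demo above declares `images` but never fills it. If frames are wanted
# for a video, a Gym-style render call inside the rollout loop could collect
# them; the 'rgb_array' render mode is an assumption about this env's API:
#     images.append(env.render(mode='rgb_array'))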