def step(self, random=False): time_step = time.time() '''------------- Obtaining samples from the environment -----------''' if self.verbose: logger.log("Data is obtaining samples...") env_paths = self.env_sampler.obtain_samples( log=True, random=random, log_prefix='Data-EnvSampler-', ) '''-------------- Processing environment samples -------------------''' if self.verbose: logger.log("Data is processing environment samples...") samples_data = self.dynamics_sample_processor.process_samples( env_paths, log=True, log_prefix='Data-EnvTrajs-', ) self.samples_data_arr.append(samples_data) time_step = time.time() - time_step time_sleep = max(self.simulation_sleep - time_step, 0) time.sleep(time_sleep) logger.logkv('Data-TimeStep', time_step) logger.logkv('Data-TimeSleep', time_sleep) # save snapshot params = self.get_itr_snapshot() logger.save_itr_params(self.itr_counter, params)
def train(self): """ Trains policy on env using algo Pseudocode: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) # uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] # sess.run(tf.variables_initializer(uninit_vars)) sess.run(tf.global_variables_initializer()) start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log("\n ---------------- Iteration %d ----------------" % itr) time_env_sampling_start = time.time() if self.initial_random_samples and itr == 0: logger.log("Obtaining random samples from the environment...") env_paths = self.env_sampler.obtain_samples(log=True, random=True, log_prefix='Data-EnvSampler-') else: logger.log("Obtaining samples from the environment using the policy...") env_paths = self.env_sampler.obtain_samples(log=True, log_prefix='Data-EnvSampler-') # Add sleeping time to match parallel experiment # time.sleep(10) logger.record_tabular('Data-TimeEnvSampling', time.time() - time_env_sampling_start) logger.log("Processing environment samples...") # first processing just for logging purposes time_env_samp_proc = time.time() samples_data = self.dynamics_sample_processor.process_samples(env_paths, log=True, log_prefix='Data-EnvTrajs-') self.env.log_diagnostics(env_paths, prefix='Data-EnvTrajs-') logger.record_tabular('Data-TimeEnvSampleProc', time.time() - time_env_samp_proc) ''' --------------- fit dynamics model --------------- ''' time_fit_start = time.time() self.dynamics_model.update_buffer(samples_data['observations'], samples_data['actions'], samples_data['next_observations'], check_init=True) buffer = None if not self.sample_from_buffer else samples_data logger.record_tabular('Model-TimeModelFit', time.time() - time_fit_start) ''' --------------- MAML steps --------------- ''' times_dyn_sampling = [] times_dyn_sample_processing = [] times_optimization = [] times_step = [] remaining_model_idx = list(range(self.dynamics_model.num_models)) valid_loss_rolling_average_prev = None with_new_data = True for id_step in range(self.repeat_steps): for epoch in range(self.num_epochs_per_step): logger.log("Training dynamics model for %i epochs ..." % 1) remaining_model_idx, valid_loss_rolling_average = self.dynamics_model.fit_one_epoch( remaining_model_idx, valid_loss_rolling_average_prev, with_new_data, log_tabular=True, prefix='Model-') with_new_data = False for step in range(self.num_grad_policy_per_step): logger.log("\n ---------------- Grad-Step %d ----------------" % int(itr * self.repeat_steps * self.num_grad_policy_per_step + id_step * self.num_grad_policy_per_step + step)) step_start_time = time.time() """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples from the model...") time_env_sampling_start = time.time() paths = self.model_sampler.obtain_samples(log=True, log_prefix='Policy-', buffer=buffer) sampling_time = time.time() - time_env_sampling_start """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples from the model...") time_proc_samples_start = time.time() samples_data = self.model_sample_processor.process_samples(paths, log='all', log_prefix='Policy-') proc_samples_time = time.time() - time_proc_samples_start if type(paths) is list: self.log_diagnostics(paths, prefix='Policy-') else: self.log_diagnostics(sum(paths.values(), []), prefix='Policy-') """ ------------------ Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_optimization_step_start = time.time() self.algo.optimize_policy(samples_data) optimization_time = time.time() - time_optimization_step_start times_dyn_sampling.append(sampling_time) times_dyn_sample_processing.append(proc_samples_time) times_optimization.append(optimization_time) times_step.append(time.time() - step_start_time) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Iteration', itr) logger.logkv('n_timesteps', self.env_sampler.total_timesteps_sampled) logger.logkv('Policy-TimeSampleProc', np.sum(times_dyn_sample_processing)) logger.logkv('Policy-TimeSampling', np.sum(times_dyn_sampling)) logger.logkv('Policy-TimeAlgoOpt', np.sum(times_optimization)) logger.logkv('Policy-TimeStep', np.sum(times_step)) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() if itr == 0: sess.graph.finalize() logger.logkv('Trainer-TimeTotal', time.time() - start_time) logger.log("Training finished") self.sess.close()
def train(self): """ Trains policy on env using algo Pseudocode: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) # uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] # sess.run(tf.variables_initializer(uninit_vars)) sess.run(tf.global_variables_initializer()) if type(self.meta_steps_per_iter) is tuple: meta_steps_per_iter = np.linspace(self.meta_steps_per_iter[0] , self.meta_steps_per_iter[1], self.n_itr).astype(np.int) else: meta_steps_per_iter = [self.meta_steps_per_iter] * self.n_itr start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log("\n ---------------- Iteration %d ----------------" % itr) time_env_sampling_start = time.time() if self.initial_random_samples and itr == 0: logger.log("Obtaining random samples from the environment...") env_paths = self.env_sampler.obtain_samples(log=True, random=True, log_prefix='EnvSampler-') else: logger.log("Obtaining samples from the environment using the policy...") env_paths = self.env_sampler.obtain_samples(log=True, log_prefix='EnvSampler-') logger.record_tabular('Time-EnvSampling', time.time() - time_env_sampling_start) logger.log("Processing environment samples...") # first processing just for logging purposes time_env_samp_proc = time.time() if type(env_paths) is dict or type(env_paths) is collections.OrderedDict: env_paths = list(env_paths.values()) idxs = np.random.choice(range(len(env_paths)), size=self.num_rollouts_per_iter, replace=False) env_paths = sum([env_paths[idx] for idx in idxs], []) elif type(env_paths) is list: idxs = np.random.choice(range(len(env_paths)), size=self.num_rollouts_per_iter, replace=False) env_paths = [env_paths[idx] for idx in idxs] else: raise TypeError samples_data = self.dynamics_sample_processor.process_samples(env_paths, log=True, log_prefix='EnvTrajs-') self.env.log_diagnostics(env_paths, prefix='EnvTrajs-') logger.record_tabular('Time-EnvSampleProc', time.time() - time_env_samp_proc) ''' --------------- fit dynamics model --------------- ''' time_fit_start = time.time() logger.log("Training dynamics model for %i epochs ..." % (self.dynamics_model_max_epochs)) self.dynamics_model.fit(samples_data['observations'], samples_data['actions'], samples_data['next_observations'], epochs=self.dynamics_model_max_epochs, verbose=True, log_tabular=True) buffer = None if not self.sample_from_buffer else samples_data logger.record_tabular('Time-ModelFit', time.time() - time_fit_start) ''' ------------ log real performance --------------- ''' if self.log_real_performance: logger.log("Evaluating the performance of the real policy") self.policy.switch_to_pre_update() env_paths = self.env_sampler.obtain_samples(log=True, log_prefix='PrePolicy-') samples_data = self.model_sample_processor.process_samples(env_paths, log='all', log_prefix='PrePolicy-') self.algo._adapt(samples_data) env_paths = self.env_sampler.obtain_samples(log=True, log_prefix='PostPolicy-') self.model_sample_processor.process_samples(env_paths, log='all', log_prefix='PostPolicy-') ''' --------------- MAML steps --------------- ''' times_dyn_sampling = [] times_dyn_sample_processing = [] times_meta_sampling = [] times_inner_step = [] times_total_inner_step = [] times_outer_step = [] times_maml_steps = [] for meta_itr in range(meta_steps_per_iter[itr]): logger.log("\n ---------------- Meta-Step %d ----------------" % int(sum(meta_steps_per_iter[:itr]) + meta_itr)) self.policy.switch_to_pre_update() # Switch to pre-update policy all_samples_data, all_paths = [], [] list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], [] time_maml_steps_start = time.time() start_total_inner_time = time.time() for step in range(self.num_inner_grad_steps+1): logger.log("\n ** Adaptation-Step %d **" % step) """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples...") time_env_sampling_start = time.time() paths = self.model_sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step, buffer=buffer) list_sampling_time.append(time.time() - time_env_sampling_start) all_paths.append(paths) """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples...") time_proc_samples_start = time.time() samples_data = self.model_sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step) all_samples_data.append(samples_data) list_proc_samples_time.append(time.time() - time_proc_samples_start) self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) """ ------------------- Inner Policy Update --------------------""" time_inner_step_start = time.time() if step < self.num_inner_grad_steps: logger.log("Computing inner policy updates...") self.algo._adapt(samples_data) list_inner_step_time.append(time.time() - time_inner_step_start) total_inner_time = time.time() - start_total_inner_time time_maml_opt_start = time.time() """ ------------------ Outer Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_outer_step_start = time.time() self.algo.optimize_policy(all_samples_data) times_inner_step.append(list_inner_step_time) times_total_inner_step.append(total_inner_time) times_outer_step.append(time.time() - time_outer_step_start) times_meta_sampling.append(np.sum(list_sampling_time)) times_dyn_sampling.append(list_sampling_time) times_dyn_sample_processing.append(list_proc_samples_time) times_maml_steps.append(time.time() - time_maml_steps_start) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) if self.log_real_performance: logger.logkv('n_timesteps', self.env_sampler.total_timesteps_sampled/(3 * self.policy.meta_batch_size) * self.num_rollouts_per_iter) else: logger.logkv('n_timesteps', self.env_sampler.total_timesteps_sampled/self.policy.meta_batch_size * self.num_rollouts_per_iter) logger.logkv('AvgTime-OuterStep', np.mean(times_outer_step)) logger.logkv('AvgTime-InnerStep', np.mean(times_inner_step)) logger.logkv('AvgTime-TotalInner', np.mean(times_total_inner_step)) logger.logkv('AvgTime-InnerStep', np.mean(times_inner_step)) logger.logkv('AvgTime-SampleProc', np.mean(times_dyn_sample_processing)) logger.logkv('AvgTime-Sampling', np.mean(times_dyn_sampling)) logger.logkv('AvgTime-MAMLSteps', np.mean(times_maml_steps)) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() if itr == 0: sess.graph.finalize() logger.log("Training finished") self.sess.close()
def train(self): """ Trains policy on env using algo Pseudocode: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) # uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] # sess.run(tf.variables_initializer(uninit_vars)) sess.run(tf.global_variables_initializer()) start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log( "\n ---------------- Iteration %d ----------------" % itr) time_env_sampling_start = time.time() if itr == 0: logger.log( "Obtaining random samples from the environment...") self.env_sampler.total_samples *= self.num_rollouts_per_iter env_paths = self.env_sampler.obtain_samples( log=True, random=self.initial_random_samples, log_prefix='Data-EnvSampler-', verbose=True) self.env_sampler.total_samples /= self.num_rollouts_per_iter time_env_samp_proc = time.time() samples_data = self.dynamics_sample_processor.process_samples( env_paths, log=True, log_prefix='Data-EnvTrajs-') self.env.log_diagnostics(env_paths, prefix='Data-EnvTrajs-') logger.record_tabular('Data-TimeEnvSampleProc', time.time() - time_env_samp_proc) buffer = samples_data if self.sample_from_buffer else None ''' --------------- fit dynamics model --------------- ''' logger.log("Training dynamics model for %i epochs ..." % self.dynamics_model_max_epochs) time_fit_start = time.time() self.dynamics_model.fit(samples_data['observations'], samples_data['actions'], samples_data['next_observations'], epochs=self.dynamics_model_max_epochs, verbose=False, log_tabular=True, prefix='Model-') logger.record_tabular('Model-TimeModelFit', time.time() - time_fit_start) env_paths = [] for id_rollout in range(self.num_rollouts_per_iter): times_dyn_sampling = [] times_dyn_sample_processing = [] times_optimization = [] times_step = [] grad_steps_per_rollout = self.grad_steps_per_rollout for step in range(grad_steps_per_rollout): # logger.log("\n ---------------- Grad-Step %d ----------------" % int(grad_steps_per_rollout*itr*self.num_rollouts_per_iter, # + id_rollout * grad_steps_per_rollout + step)) step_start_time = time.time() """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples from the model...") time_env_sampling_start = time.time() paths = self.model_sampler.obtain_samples( log=True, log_prefix='Policy-', buffer=buffer) sampling_time = time.time() - time_env_sampling_start """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples from the model...") time_proc_samples_start = time.time() samples_data = self.model_sample_processor.process_samples( paths, log='all', log_prefix='Policy-') proc_samples_time = time.time( ) - time_proc_samples_start if type(paths) is list: self.log_diagnostics(paths, prefix='Policy-') else: self.log_diagnostics(sum(paths.values(), []), prefix='Policy-') """ ------------------ Policy Update ---------------------""" logger.log("Optimizing policy...") time_optimization_step_start = time.time() self.algo.optimize_policy(samples_data) optimization_time = time.time( ) - time_optimization_step_start times_dyn_sampling.append(sampling_time) times_dyn_sample_processing.append(proc_samples_time) times_optimization.append(optimization_time) times_step.append(time.time() - step_start_time) logger.log( "Obtaining random samples from the environment...") env_paths.extend( self.env_sampler.obtain_samples( log=True, log_prefix='Data-EnvSampler-', verbose=True)) logger.record_tabular('Data-TimeEnvSampling', time.time() - time_env_sampling_start) logger.log("Processing environment samples...") """ ------------------- Logging Stuff --------------------------""" logger.logkv('Iteration', itr) logger.logkv('n_timesteps', self.env_sampler.total_timesteps_sampled) logger.logkv('Policy-TimeSampleProc', np.sum(times_dyn_sample_processing)) logger.logkv('Policy-TimeSampling', np.sum(times_dyn_sampling)) logger.logkv('Policy-TimeAlgoOpt', np.sum(times_optimization)) logger.logkv('Policy-TimeStep', np.sum(times_step)) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() if itr == 0: sess.graph.finalize() logger.logkv('Trainer-TimeTotal', time.time() - start_time) logger.log("Training finished") self.sess.close()
def train(self): """ Trains policy on env using algo Pseudocode: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] sess.run(tf.variables_initializer(uninit_vars)) start_time = time.time() for itr in range(self.start_itr, self.n_itr): self.sampler.update_tasks() itr_start_time = time.time() logger.log("\n ---------------- Iteration %d ----------------" % itr) logger.log("Sampling set of tasks/goals for this meta-batch...") """ -------------------- Sampling --------------------------""" logger.log("Obtaining samples...") time_env_sampling_start = time.time() paths = self.sampler.obtain_samples(log=True, log_prefix='train-') sampling_time = time.time() - time_env_sampling_start """ ----------------- Processing Samples ---------------------""" logger.log("Processing samples...") time_proc_samples_start = time.time() samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='train-') proc_samples_time = time.time() - time_proc_samples_start if type(paths) is list: self.log_diagnostics(paths, prefix='train-') else: self.log_diagnostics(sum(paths.values(), []), prefix='train-') """ ------------------ Policy Update ---------------------""" logger.log("Optimizing policy...") # This needs to take all samples_data so that it can construct graph for meta-optimization. time_optimization_step_start = time.time() self.algo.optimize_policy(samples_data) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) logger.logkv('Time-Optimization', time.time() - time_optimization_step_start) logger.logkv('Time-SampleProc', np.sum(proc_samples_time)) logger.logkv('Time-Sampling', sampling_time) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() if itr == 0: sess.graph.finalize() logger.log("Training finished") self.sess.close()
def train(self): """ Trains policy on env using algo Pseudocode: for itr in n_itr: for step in num_inner_grad_steps: sampler.sample() algo.compute_updated_dists() algo.optimize_policy() sampler.update_goals() """ with self.sess.as_default() as sess: # initialize uninitialized vars (only initialize vars that were not loaded) uninit_vars = [ var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var)) ] sess.run(tf.variables_initializer(uninit_vars)) start_time = time.time() for itr in range(self.start_itr, self.n_itr): itr_start_time = time.time() logger.log( "\n ---------------- Iteration %d ----------------" % itr) time_env_sampling_start = time.time() if self.initial_random_samples and itr == 0: logger.log( "Obtaining random samples from the environment...") env_paths = self.sampler.obtain_samples(log=True, random=True, log_prefix='') elif self.initial_sinusoid_samples and itr == 0: logger.log( "Obtaining sinusoidal samples from the environment using the policy..." ) env_paths = self.sampler.obtain_samples(log=True, log_prefix='', sinusoid=True) else: logger.log( "Obtaining samples from the environment using the policy..." ) env_paths = self.sampler.obtain_samples(log=True, log_prefix='') logger.record_tabular('Time-EnvSampling', time.time() - time_env_sampling_start) logger.log("Processing environment samples...") # first processing just for logging purposes time_env_samp_proc = time.time() samples_data = self.dynamics_sample_processor.process_samples( env_paths, log=True, log_prefix='EnvTrajs-') logger.record_tabular('Time-EnvSampleProc', time.time() - time_env_samp_proc) ''' --------------- fit dynamics model --------------- ''' time_fit_start = time.time() logger.log("Training dynamics model for %i epochs ..." % (self.dynamics_model_max_epochs)) self.dynamics_model.fit(samples_data['observations'], samples_data['actions'], samples_data['next_observations'], epochs=self.dynamics_model_max_epochs, verbose=False, log_tabular=True) logger.record_tabular('Time-ModelFit', time.time() - time_fit_start) """ ------------------- Logging Stuff --------------------------""" logger.logkv('Itr', itr) logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) logger.logkv('Time', time.time() - start_time) logger.logkv('ItrTime', time.time() - itr_start_time) logger.log("Saving snapshot...") params = self.get_itr_snapshot(itr) self.log_diagnostics(env_paths, '') logger.save_itr_params(itr, params) logger.log("Saved") logger.dumpkvs() if itr == 0: sess.graph.finalize() logger.log("Training finished") self.sess.close()