def _try_to_eval(self, epoch):
    """Evaluate for `epoch` if possible, and dump tabular stats via `logger`.

    Also periodically (every `self.freq_saving` epochs) persists extra data
    and an epoch snapshot. When `_can_evaluate()` is False, only logs a
    skip message.
    """
    # NOTE(review): original indentation was lost in this file; the nesting
    # below (snapshot saving grouped under the freq_saving check, evaluation
    # outside it) is a reconstruction -- confirm against version control.
    if self._can_evaluate():
        # save if it's time to save
        if epoch % self.freq_saving == 0:
            logger.save_extra_data(self.get_extra_data_to_save(epoch))
            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
        self.evaluate(epoch)
        logger.record_tabular(
            "Number of train calls total",
            self._n_train_steps_total,
        )
        # Latest per-stamp durations from gtimer; the 'eval' stamp has no
        # entry yet on epoch 0, hence the guard.
        times_itrs = gt.get_times().stamps.itrs
        train_time = times_itrs['train'][-1]
        sample_time = times_itrs['sample'][-1]
        eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
        epoch_time = train_time + sample_time + eval_time
        total_time = gt.get_times().total
        logger.record_tabular('Train Time (s)', train_time)
        logger.record_tabular('(Previous) Eval Time (s)', eval_time)
        logger.record_tabular('Sample Time (s)', sample_time)
        logger.record_tabular('Epoch Time (s)', epoch_time)
        logger.record_tabular('Total Train Time (s)', total_time)
        logger.record_tabular("Epoch", epoch)
        logger.dump_tabular(with_prefix=False, with_timestamp=False)
    else:
        logger.log("Skipping eval for now.")
def _try_to_eval(self, epoch):
    """Evaluate for `epoch` if possible and dump tabular stats.

    Asserts that the set of tabular keys stays constant across iterations
    so the CSV written by `dump_tabular` keeps a stable schema.
    """
    #logger.save_extra_data(self.get_extra_data_to_save(epoch))
    if self._can_evaluate():
        self.evaluate(epoch)
        #params = self.get_epoch_snapshot(epoch)
        #logger.save_itr_params(epoch, params)
        # Guard against the tabular schema changing between iterations.
        table_keys = logger.get_table_key_set()
        if self._old_table_keys is not None:
            assert table_keys == self._old_table_keys, (
                "Table keys cannot change from iteration to iteration.")
        self._old_table_keys = table_keys
        logger.record_tabular("Number of train steps total",
                              self._n_train_steps_total)
        logger.record_tabular("Number of env steps total",
                              self._n_env_steps_total)
        logger.record_tabular("Number of rollouts total",
                              self._n_rollouts_total)
        # Latest per-stamp gtimer durations; 'eval' has no entry yet on
        # epoch 0, hence the guard.
        times_itrs = gt.get_times().stamps.itrs
        train_time = times_itrs['train'][-1]
        sample_time = times_itrs['sample'][-1]
        eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
        epoch_time = train_time + sample_time + eval_time
        total_time = gt.get_times().total
        logger.record_tabular('Train Time (s)', train_time)
        logger.record_tabular('(Previous) Eval Time (s)', eval_time)
        logger.record_tabular('Sample Time (s)', sample_time)
        logger.record_tabular('Epoch Time (s)', epoch_time)
        logger.record_tabular('Total Train Time (s)', total_time)
        logger.record_tabular("Epoch", epoch)
        logger.dump_tabular(with_prefix=False, with_timestamp=False)
    else:
        logger.log("Skipping eval for now.")
def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)

    # Evaluation uses a deep-cloned env so it does not disturb training state.
    evaluation_env = deep_clone(env) if self._eval_n_episodes else None
    # TODO: use Ezpickle to deep_clone???
    # evaluation_env = env

    with tf_utils.get_default_session().as_default():
        # Set up the gtimer root so per-phase timings can be reported below.
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                # Skip training until the sampler has a full batch.
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            # .get() with [0] default: 'train'/'sample' stamps may be absent
            # if batch_ready() never fired during this epoch.
            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

        self.sampler.terminate()
def _try_to_eval(self, epoch):
    """Evaluate for `epoch` if possible, then snapshot and dump tabular stats.

    When `environment_farming` is enabled, a fresh environment is acquired
    from the farmer for evaluation and returned to the free pool afterwards.
    """
    logger.save_extra_data(self.get_extra_data_to_save(epoch))
    if self._can_evaluate():
        # NOTE(review): original indentation was lost; evaluate() and
        # add_free_env() are nested under the farming branch here because
        # add_free_env uses the locally acquired env -- confirm whether
        # evaluate() should also run when environment_farming is False.
        if self.environment_farming:
            # Create a new eval_sampler each evaluation time in order to
            # avoid the released-environment problem.
            env_for_eval_sampler = self.farmer.force_acq_env()
            print(env_for_eval_sampler)
            self.eval_sampler = InPlacePathSampler(
                env=env_for_eval_sampler,
                policy=self.eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length,
            )
            self.evaluate(epoch)
            # Adding env back to free_env list
            self.farmer.add_free_env(env_for_eval_sampler)

        params = self.get_epoch_snapshot(epoch)
        logger.save_itr_params(epoch, params)

        # Guard against the tabular schema changing between iterations.
        table_keys = logger.get_table_key_set()
        if self._old_table_keys is not None:
            assert table_keys == self._old_table_keys, (
                "Table keys cannot change from iteration to iteration.")
        self._old_table_keys = table_keys

        logger.record_tabular(
            "Number of train steps total",
            self._n_train_steps_total,
        )
        logger.record_tabular(
            "Number of env steps total",
            self._n_env_steps_total,
        )
        logger.record_tabular(
            "Number of rollouts total",
            self._n_rollouts_total,
        )

        # Latest per-stamp gtimer durations; 'eval' has no entry on epoch 0.
        times_itrs = gt.get_times().stamps.itrs
        train_time = times_itrs['train'][-1]
        sample_time = times_itrs['sample'][-1]
        eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
        epoch_time = train_time + sample_time + eval_time
        total_time = gt.get_times().total

        logger.record_tabular('Train Time (s)', train_time)
        logger.record_tabular('(Previous) Eval Time (s)', eval_time)
        logger.record_tabular('Sample Time (s)', sample_time)
        logger.record_tabular('Epoch Time (s)', epoch_time)
        logger.record_tabular('Total Train Time (s)', total_time)
        logger.record_tabular("Epoch", epoch)
        logger.dump_tabular(with_prefix=False, with_timestamp=False)
    else:
        logger.log("Skipping eval for now.")
def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)
    self.sampler.initialize(env, policy, pool)

    with self._sess.as_default():
        # gtimer setup so per-phase timings can be reported below.
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                self.sampler.sample()
                # Skip training until the sampler has a full batch.
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            # The 'eval' stamp is only recorded at the end of the loop body,
            # so no entry exists before epoch 2 -- hence the guard.
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)

    # Evaluation uses a deep-cloned env so it does not disturb training state.
    evaluation_env = deep_clone(env) if self._eval_n_episodes else None

    with tf_utils.get_default_session().as_default():
        # gtimer setup so per-phase timings can be reported below.
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                # Skip training until the sampler has a full batch.
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            # .get() with [0] default: 'train'/'sample' stamps may be absent
            # if batch_ready() never fired during this epoch.
            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
def _get_epoch_timings():
    """Collect the latest per-stamp gtimer durations into an OrderedDict.

    Returns an OrderedDict mapping 'time/<stamp> (s)' to the most recent
    duration of each stamp (keys sorted alphabetically), followed by
    'time/epoch (s)' (sum of those durations) and 'time/total (s)'
    (total elapsed time reported by gtimer).
    """
    stamp_itrs = gt.get_times().stamps.itrs
    timings = OrderedDict()
    elapsed_this_epoch = 0
    for stamp_name in sorted(stamp_itrs):
        latest = stamp_itrs[stamp_name][-1]
        elapsed_this_epoch += latest
        timings[f'time/{stamp_name} (s)'] = latest
    timings['time/epoch (s)'] = elapsed_this_epoch
    timings['time/total (s)'] = gt.get_times().total
    return timings
def log(self, write_table_header=False):
    """Record the current training/evaluation statistics and dump them.

    Args:
        write_table_header (bool): Forwarded to `dump_tabular`; when True
            the tabular header row is (re)written.

    Also saves the training state (pytorch models) at the end.
    """
    logger.log("Logging data in directory: %s" % logger.get_snapshot_dir())

    logger.record_tabular("Episode", self.num_episodes)
    logger.record_tabular("Accumulated Training Steps",
                          self.num_train_interactions)
    logger.record_tabular("Policy Error", self.logging_policies_error)
    logger.record_tabular("Q-Value Error", self.logging_qvalues_error)
    logger.record_tabular("V-Value Error", self.logging_vvalues_error)
    # log_alpha is stored in log-space; exponentiate before reporting.
    logger.record_tabular("Alpha", np_ify(self.log_alpha.exp()).item())
    logger.record_tabular("Entropy",
                          np_ify(self.logging_entropy.mean(dim=(0, ))))

    # Per-dimension action statistics, averaged over the batch dimension.
    act_mean = np_ify(self.logging_mean.mean(dim=(0, )))
    act_std = np_ify(self.logging_std.mean(dim=(0, )))
    for aa in range(self.action_dim):
        logger.record_tabular("Mean Action %02d" % aa, act_mean[aa])
        logger.record_tabular("Std Action %02d" % aa, act_std[aa])

    # Evaluation Stats to plot
    logger.record_tabular("Test Rewards Mean",
                          np_ify(self.logging_eval_rewards.mean()))
    logger.record_tabular("Test Rewards Std",
                          np_ify(self.logging_eval_rewards.std()))
    logger.record_tabular("Test Returns Mean",
                          np_ify(self.logging_eval_returns.mean()))
    logger.record_tabular("Test Returns Std",
                          np_ify(self.logging_eval_returns.std()))

    # Add the previous times to the logger.  .get() with [0] default:
    # a stamp may not have fired yet during the first iterations.
    times_itrs = gt.get_times().stamps.itrs
    train_time = times_itrs.get('train', [0])[-1]
    sample_time = times_itrs.get('sample', [0])[-1]
    eval_time = times_itrs.get('eval', [0])[-1]
    epoch_time = train_time + sample_time + eval_time
    total_time = gt.get_times().total
    logger.record_tabular('Train Time (s)', train_time)
    logger.record_tabular('(Previous) Eval Time (s)', eval_time)
    logger.record_tabular('Sample Time (s)', sample_time)
    logger.record_tabular('Epoch Time (s)', epoch_time)
    logger.record_tabular('Total Train Time (s)', total_time)

    # Dump the logger data
    logger.dump_tabular(with_prefix=False, with_timestamp=False,
                        write_header=write_table_header)
    # Save pytorch models
    self.save_training_state()
    logger.log("----")
def _train(self, env, policy, uniform_policy, pool):
    """Perform RL training, starting with a uniform exploration policy.

    Args:
        env: Environment used for training.
        policy: Policy trained and used once enough random steps were taken.
        uniform_policy: Policy used for the initial random exploration phase.
        pool: Sample pool to add samples to.
    """
    self._init_training(env, policy, pool)
    self.sampler.initialize(env, uniform_policy,
                            pool)  # use uniform sampler initially

    with self._sess.as_default():
        # gtimer setup so per-phase timings can be reported below.
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            # Switch from uniform exploration to the learned policy once
            # enough environment steps have been taken.
            if self._epoch_length * epoch >= self._n_random_steps:
                self.sampler.set_policy(policy)

            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                # Skip training until the sampler has a full batch.
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        itr=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            # The 'eval' stamp is only recorded at the end of the loop body,
            # so no entry exists before epoch 2 -- hence the guard.
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
def _record_stats(self):
    """Log per-iteration timing, loss, reward, and counter statistics.

    Pulls the latest 'sample'/'adaptation'/'meta'/'eval' durations from
    gtimer, accumulates the iteration time (sample + adaptation + meta,
    eval excluded) into the running total, records everything through
    `self.logger`, and dumps the tabular output.
    """
    stamp_itrs = gt.get_times().stamps.itrs
    t_sample = stamp_itrs['sample'][-1]
    t_adaptation = stamp_itrs['adaptation'][-1]
    t_meta = stamp_itrs['meta'][-1]
    t_eval = stamp_itrs['eval'][-1]
    # Iteration time deliberately excludes evaluation time.
    t_iteration = t_sample + t_adaptation + t_meta
    self._time_total += t_iteration

    record = self.logger.record_tabular
    record('Model Loss', np.float32(self.theta_loss.data.cpu()))
    for task, reward in zip(self.tasks, self.eval_rewards):
        record('Reward: ' + task.env.spec.id, reward)
    record('Dataset Size', len(self.dataset))
    record('Total Model Steps', self._n_model_steps_total)
    record('Total Task Steps', self._n_task_steps_total)
    record('Total Rollouts', self._n_rollouts_total)
    record('Sample Time (s)', t_sample)
    record('Adaptation Time (s)', t_adaptation)
    record('Meta Time (s)', t_meta)
    record('Evaluation Time (s)', t_eval)
    record('Iteration Time (s)', t_iteration)
    record('Total Time (s)', self._time_total)

    self.logger.dump_tabular(with_prefix=False, with_timestamp=False)
def _try_to_eval(self, epoch):
    """Evaluate for `epoch` if possible; log to tabular output and TensorBoard.

    Each tabular key is routed to a TensorBoard scalar tag based on its
    name (Loss/Max/Min/Std/Mean suffixes, Time/Num/Exploration/Test
    prefixes), then the tabular data is dumped as usual.
    """
    logger.save_extra_data(self.get_extra_data_to_save(epoch))
    if self._can_evaluate():
        self.evaluate(epoch)

        params = self.get_epoch_snapshot(epoch)
        logger.save_itr_params(epoch, params)
        table_keys = logger.get_table_key_set()
        #print("TABLE KEYS")
        #print(table_keys)
        #if self._old_table_keys is not None:
        #    assert table_keys == self._old_table_keys, (
        #        "Table keys cannot change from iteration to iteration."
        #    )
        self._old_table_keys = table_keys

        logger.record_tabular(
            "Number of train steps total",
            self._n_train_steps_total,
        )
        logger.record_tabular(
            "Number of env steps total",
            self._n_env_steps_total,
        )
        logger.record_tabular(
            "Number of rollouts total",
            self._n_rollouts_total,
        )

        # Latest per-stamp gtimer durations; 'eval' has no entry on epoch 0.
        times_itrs = gt.get_times().stamps.itrs
        train_time = times_itrs['train'][-1]
        sample_time = times_itrs['sample'][-1]
        eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
        epoch_time = train_time + sample_time + eval_time
        total_time = gt.get_times().total

        logger.record_tabular('Train Time (s)', train_time)
        logger.record_tabular('(Previous) Eval Time (s)', eval_time)
        logger.record_tabular('Sample Time (s)', sample_time)
        logger.record_tabular('Epoch Time (s)', epoch_time)
        logger.record_tabular('Total Train Time (s)', total_time)
        logger.record_tabular("Epoch", epoch)

        # tensorboard stuff: mirror every tabular value (values are stored
        # as strings, hence the float() conversion) into scalar tags
        # grouped by naming convention.
        _writer = self._writer
        for k, v_str in logger._tabular:
            if k == 'Epoch':
                continue
            v = float(v_str)
            if k.endswith('Loss'):
                _writer.add_scalar('Loss/{}'.format(k), v, epoch)
            elif k.endswith('Max'):
                prefix = k[:-4]
                _writer.add_scalar('{}/{}'.format(prefix, k), v, epoch)
            elif k.endswith('Min'):
                prefix = k[:-4]
                _writer.add_scalar('{}/{}'.format(prefix, k), v, epoch)
            elif k.endswith('Std'):
                prefix = k[:-4]
                _writer.add_scalar('{}/{}'.format(prefix, k), v, epoch)
            elif k.endswith('Mean'):
                prefix = k[:-5]
                _writer.add_scalar('{}/{}'.format(prefix, k), v, epoch)
            elif 'Time' in k:
                _writer.add_scalar('Time/{}'.format(k), v, epoch)
            elif k.startswith('Num'):
                _writer.add_scalar('Number/{}'.format(k), v, epoch)
            elif k.startswith('Exploration'):
                _writer.add_scalar('Exploration/{}'.format(k), v, epoch)
            elif k.startswith('Test'):
                _writer.add_scalar('Test/{}'.format(k), v, epoch)
            else:
                _writer.add_scalar(k, v, epoch)
        _writer.file_writer.flush()

        logger.dump_tabular(with_prefix=False, with_timestamp=False)
    else:
        logger.log("Skipping eval for now.")
def _train(self):
    """Return a generator that performs RL training.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    #### pool is e.g. simple_replay_pool
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy
    pool = self._pool

    if not self._training_started:
        #### perform some initial steps (gather samples) using initial policy
        ###### fills pool with _n_initial_exploration_steps samples
        self._initial_exploration_hook(
            training_environment, self._policy, pool)

    #### set up sampler with train env and actual policy (may be different
    #### from initial exploration policy)
    ######## note: sampler is set up with the pool that may be already filled
    ######## from initial exploration hook or previous epochs
    self.sampler.initialize(training_environment, policy, pool)
    self.model_sampler.initialize(self.fake_env, policy, self.model_pool)

    # Calibrate the model sampler's dynamics-KL limit on a batch of
    # archived observations.
    rollout_dkl_lim = self.model_sampler.compute_dynamics_dkl(
        obs_batch=self._pool.rand_batch_from_archive(
            5000, fields=['observations'])['observations'],
        depth=self._rollout_schedule[2])
    self.model_sampler.set_rollout_dkl(rollout_dkl_lim)
    self.initial_model_dkl = self.model_sampler.dyn_dkl

    #### reset gtimer (for coverage of project development)
    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    self.policy_epoch = 0  ### count policy updates
    self.new_real_samples = 0
    self.last_eval_step = 0
    self.diag_counter = 0
    running_diag = {}
    ### some size to start off
    self.approx_model_batch = (
        self.batch_size_policy - self.min_real_samples_per_epoch)

    #### not implemented, could train policy before hook
    self._training_before_hook()

    #### iterate over epochs, gt.timed_for to create loop with gt timestamps
    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        #### do something at beginning of epoch
        #### (in this case reset self._train_steps_this_epoch=0)
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')

        #### util class Progress, e.g. for plotting a progress bar
        self._training_progress = Progress(
            self._epoch_length * self._n_train_repeat
            / self._train_every_n_steps)
        samples_added = 0

        #=====================================================================#
        #                           Rollout model                             #
        #=====================================================================#
        model_samples = None
        keep_rolling = True
        model_metrics = {}
        #### start model rollout
        if self._real_ratio < 1.0:
            #if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
            #=====================================================================#
            #                           Model Rollouts                           #
            #=====================================================================#
            if self.rollout_mode == 'schedule':
                self._set_rollout_length()

            while keep_rolling:
                # Draw start states from the archive, weighted by a
                # Boltzmann distribution over policy-KL values.
                ep_b = self._pool.epoch_batch(
                    batch_size=self._rollout_batch_size,
                    epochs=self._pool.epochs_list,
                    fields=['observations', 'pi_infos'])
                kls = np.clip(
                    self._policy.compute_DKL(
                        ep_b['observations'], ep_b['mu'], ep_b['log_std']),
                    a_min=0, a_max=None)
                btz_dist = self._pool.boltz_dist(
                    kls, alpha=self.policy_alpha)
                btz_b = self._pool.distributed_batch_from_archive(
                    self._rollout_batch_size, btz_dist,
                    fields=['observations', 'pi_infos'])
                start_states, mus, logstds = (
                    btz_b['observations'], btz_b['mu'], btz_b['log_std'])
                # NOTE(review): btz_kl is computed but never used below.
                btz_kl = np.clip(
                    self._policy.compute_DKL(start_states, mus, logstds),
                    a_min=0, a_max=None)

                self.model_sampler.reset(start_states)
                if self.rollout_mode == 'uncertainty':
                    self.model_sampler.set_max_uncertainty(
                        self.max_tddyn_err)

                for i in count():
                    # print(f'Model Sampling step Nr. {i+1}')
                    _, _, _, info = self.model_sampler.sample(
                        max_samples=int(
                            self.approx_model_batch - samples_added))

                    if (self.model_sampler._total_samples + samples_added
                            >= .99 * self.approx_model_batch):
                        keep_rolling = False
                        break

                    if info['alive_ratio'] <= 0.1:
                        break

                ### diagnostics for rollout ###
                rollout_diagnostics = self.model_sampler.finish_all_paths()
                if self.rollout_mode == 'iv_gae':
                    keep_rolling = (
                        self.model_pool.size + samples_added
                        <= .99 * self.approx_model_batch)

                ######################################################################
                ### get model_samples, get() invokes the inverse variance rollouts ###
                model_samples_new, buffer_diagnostics_new = (
                    self.model_pool.get())
                model_samples = [
                    np.concatenate((o, n), axis=0)
                    for o, n in zip(model_samples, model_samples_new)
                ] if model_samples else model_samples_new

                ######################################################################
                ### diagnostics -- running weighted averages over rollout rounds
                new_n_samples = len(model_samples_new[0]) + EPS
                diag_weight_old = (
                    samples_added / (new_n_samples + samples_added))
                diag_weight_new = (
                    new_n_samples / (new_n_samples + samples_added))
                model_metrics = update_dict(model_metrics,
                                            rollout_diagnostics,
                                            weight_a=diag_weight_old,
                                            weight_b=diag_weight_new)
                model_metrics = update_dict(model_metrics,
                                            buffer_diagnostics_new,
                                            weight_a=diag_weight_old,
                                            weight_b=diag_weight_new)

                ### run diagnostics on model data
                if buffer_diagnostics_new['poolm_batch_size'] > 0:
                    model_data_diag = self._policy.run_diagnostics(
                        model_samples_new)
                    model_data_diag = {
                        k + '_m': v for k, v in model_data_diag.items()
                    }
                    model_metrics = update_dict(model_metrics,
                                                model_data_diag,
                                                weight_a=diag_weight_old,
                                                weight_b=diag_weight_new)

                samples_added += new_n_samples
                model_metrics.update({'samples_added': samples_added})
                ######################################################################

            ## for debugging
            model_metrics.update({
                'cached_var':
                np.mean(self.fake_env._model.scaler_out.cached_var)
            })
            model_metrics.update({
                'cached_mu':
                np.mean(self.fake_env._model.scaler_out.cached_mu)
            })

            print(f'Rollouts finished')
            gt.stamp('epoch_rollout_model')

        #=====================================================================#
        #                           Sample                                    #
        #=====================================================================#
        # Scale the real-sample budget by the current model uncertainty
        # relative to its initial value, with a floor of 1000 samples.
        n_real_samples = (self.model_sampler.dyn_dkl
                          / self.initial_model_dkl
                          * self.min_real_samples_per_epoch)
        n_real_samples = max(n_real_samples, 1000)
        # n_real_samples = self.min_real_samples_per_epoch ### for ablation

        model_metrics.update({'n_real_samples': n_real_samples})
        start_samples = self.sampler._total_samples
        ### train for epoch_length ###
        for i in count():
            #### _timestep is within an epoch
            samples_now = self.sampler._total_samples
            self._timestep = samples_now - start_samples

            #### not implemented atm
            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            ##### sampling from the real world !
            _, _, _, _ = self._do_sampling(timestep=self.policy_epoch)
            gt.stamp('sample')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

            if self.ready_to_train or self._timestep > n_real_samples:
                self.sampler.finish_all_paths(append_val=True,
                                              append_cval=True,
                                              reset_path=False)
                self.new_real_samples += self._timestep
                break

        #=====================================================================#
        #                           Train model                               #
        #=====================================================================#
        # Retrain the dynamics model once enough fresh real samples exist.
        if self.new_real_samples > 2048 and self._real_ratio < 1.0:
            model_diag = self.train_model(min_epochs=1, max_epochs=10)
            self.new_real_samples = 0
            model_metrics.update(model_diag)

        #=====================================================================#
        #                           Get Buffer Data                           #
        #=====================================================================#
        real_samples, buf_diag = self._pool.get()

        ### run diagnostics on real data
        policy_diag = self._policy.run_diagnostics(real_samples)
        policy_diag = {k + '_r': v for k, v in policy_diag.items()}
        model_metrics.update(policy_diag)
        model_metrics.update(buf_diag)

        #=====================================================================#
        #                           Update Policy                             #
        #=====================================================================#
        train_samples = [
            np.concatenate((r, m), axis=0)
            for r, m in zip(real_samples, model_samples)
        ] if model_samples else real_samples
        self._policy.update_real_c(real_samples)
        self._policy.update_policy(train_samples)
        self._policy.update_critic(
            train_samples,
            train_vc=(train_samples[-3] > 0).any()
        )  ### only train vc if there are any costs

        if self._real_ratio < 1.0:
            self.approx_model_batch = (
                self.batch_size_policy - n_real_samples)
            #self.model_sampler.dyn_dkl/self.initial_model_dkl * self.min_real_samples_per_epoch

        self.policy_epoch += 1
        self.max_tddyn_err *= self.max_tddyn_err_decay
        #### log policy diagnostics
        self._policy.log()

        gt.stamp('train')
        #=====================================================================#
        #                      Log performance and stats                      #
        #=====================================================================#
        self.sampler.log()
        # write results to file, ray prints for us, so no need to print from
        # logger
        logger_diagnostics = self.logger.dump_tabular(
            output_dir=self._log_dir, print_out=False)
        #=====================================================================#

        # Evaluate only every eval_every_n_steps environment steps.
        if (self._total_timestep // self.eval_every_n_steps
                > self.last_eval_step):
            evaluation_paths = self._evaluation_paths(
                policy, evaluation_environment)
            gt.stamp('evaluation_paths')
            self.last_eval_step = (
                self._total_timestep // self.eval_every_n_steps)
        else:
            evaluation_paths = []

        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_environment)
            gt.stamp('evaluation_metrics')
            # NOTE(review): diag_obs_batch is built but not used below.
            diag_obs_batch = np.concatenate(([
                evaluation_paths[i]['observations']
                for i in range(len(evaluation_paths))
            ]), axis=0)
        else:
            evaluation_metrics = {}
            diag_obs_batch = []

        gt.stamp('epoch_after_hook')

        new_diagnostics = {}
        time_diagnostics = gt.get_times().stamps.itrs

        # add diagnostics from logger
        new_diagnostics.update(logger_diagnostics)

        new_diagnostics.update(
            OrderedDict((
                *((f'evaluation/{key}', evaluation_metrics[key])
                  for key in sorted(evaluation_metrics.keys())),
                *((f'times/{key}', time_diagnostics[key][-1])
                  for key in sorted(time_diagnostics.keys())),
                *((f'model/{key}', model_metrics[key])
                  for key in sorted(model_metrics.keys())),
            )))

        if self._eval_render_mode is not None and hasattr(
                evaluation_environment, 'render_rollouts'):
            training_environment.render_rollouts(evaluation_paths)

        #### updating and averaging: fold this epoch's diagnostics into the
        #### running average, weighted by elapsed timesteps.
        old_ts_diag = running_diag.get('timestep', 0)
        new_ts_diag = (
            self._total_timestep - self.diag_counter - old_ts_diag)
        w_olddiag = old_ts_diag / (new_ts_diag + old_ts_diag)
        w_newdiag = new_ts_diag / (new_ts_diag + old_ts_diag)
        running_diag = update_dict(running_diag, new_diagnostics,
                                   weight_a=w_olddiag,
                                   weight_b=w_newdiag)
        running_diag.update({'timestep': new_ts_diag + old_ts_diag})
        ####

        # Yield the averaged diagnostics once enough timesteps accumulated.
        if new_ts_diag + old_ts_diag > self.eval_every_n_steps:
            running_diag.update({
                'epoch': self._epoch,
                'timesteps_total': self._total_timestep,
                'train-steps': self._num_train_steps,
            })
            self.diag_counter = self._total_timestep
            diag = running_diag.copy()
            running_diag = {}
            yield diag

        # Terminate once the environment-interaction budget is spent.
        if self._total_timestep >= self.n_env_interacts:
            self.sampler.terminate()
            self._training_after_hook()
            self._training_progress.close()
            print("###### DONE ######")
            yield {'done': True, **running_diag}
            break
def _train(self, env, policy, pool, qf=None, vf=None, saver=None, _ec=None,
           dynamic_ec=False):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
        qf: Q-function (unused here directly; kept for caller compatibility)
        vf: V-function (unused here directly; kept for caller compatibility)
        saver: tf.train.Saver used to checkpoint the session
        _ec: TF variable holding the (entropy?) coefficient decayed when
            `dynamic_ec` is set -- TODO confirm semantics
        dynamic_ec (bool): If True, linearly decrease `_ec` each epoch.

    Returns:
        The last `eval_average_return`, or None if pruned by optuna.
    """
    # NOTE(review): original indentation was lost in this file; the nesting
    # below is a reconstruction -- confirm against version control.
    self._init_training(env, policy, pool)
    self.sampler.initialize(env, policy, pool)

    if dynamic_ec:
        # Linear decay schedule so _ec reaches ~0 at the final epoch.
        dicrese_rate = _ec / self._n_epochs

    logger2 = mylogger2.get_logger()
    os.makedirs(os.path.join(logger2.log_dir, 'model'),
                exist_ok=logger2.exist_ok)
    # Set when the optuna trial asks to prune; remaining epochs are skipped
    # via `continue` (we cannot `break` inside gt.timed_for).
    optuna_break = False

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs), save_itrs=True):
            if optuna_break:
                continue

            # logger.push_prefix('Epoch #%d | ' % epoch)
            epoch_states = []
            kurtosis = []
            signed_variance = []
            for t in range(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                done, _n_episodes, obs, next_obs, info = (
                    self.sampler.sample())
                epoch_states.append(obs)
                # Per-state "knack" importance statistics for this obs.
                state_importances = self.policy.calc_knack([obs])
                kurtosis.append(state_importances["kurtosis"][0])
                signed_variance.append(
                    state_importances["signed_variance"][0])

                # be careful of batch_ready < epoch_length
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            # evaluation
            if epoch % self._eval_n_frequency == 0:
                eval_average_return = self._evaluate(epoch)
                logger.record_tabular('eval_average_return',
                                      eval_average_return)
                if hasattr(self.policy, "optuna_trial"):
                    if self.policy.optuna_trial is not None:
                        # report intermediate_value
                        self.policy.optuna_trial.report(
                            eval_average_return, epoch)
                        if self.policy.optuna_trial.should_prune():
                            optuna_break = True
                            continue
                            # raise optuna.structs.TrialPruned()
            else:
                logger.record_tabular('eval_average_return', np.nan)
            gt.stamp('eval')

            # logging about time and step
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('total_step',
                                  self.sampler._total_samples)
            logger.record_tabular('total_episode',
                                  self.sampler._n_episodes)

            # logging about array
            if hasattr(self.policy, "current_knack_thresh"):
                current_knack_thresh = self.policy.current_knack_thresh
                _ = self.policy.calc_and_update_knack(epoch_states)
            if logger2.save_array_flag:
                kwargs1 = {
                    'epoch': epoch,
                    'states': np.array(epoch_states),
                    'knack_kurtosis': np.array(kurtosis),
                    'signed_variance': np.array(signed_variance)
                }
                if hasattr(self.policy, "current_knack_thresh"):
                    kwargs1.update(
                        {'current_knack_thresh': current_knack_thresh})
                kwargs1.update(self.policy.get_q_params())
                logger2.add_array_data(kwargs1)

                if epoch % 10 == 0:
                    # TODO save only parameters
                    saver.save(self._sess,
                               os.path.join(logger2.log_dir, 'model'))
                    gt.stamp("tf save")
                gt.stamp("calc knacks")

            if dynamic_ec:
                self._sess.run(tf.assign(_ec, _ec - dicrese_rate))

            logger.dump_tabular()
            logger2.write()
            # print(gt.report())

        # finalize processing
        if optuna_break:
            return None
        if logger2.save_array_flag:
            saver.save(self._sess, os.path.join(logger2.log_dir, 'model'))
        self.sampler.terminate()
        return eval_average_return
def _train(self, env, policy, pool, initial_exploration_policy=None):
    """Return a generator that performs RL training.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to

    Yields:
        An OrderedDict of diagnostics once per epoch.
    """
    if not self._training_started:
        self._init_training()
        # Pre-fill the pool before the main loop starts.
        self._initial_exploration_hook(env, initial_exploration_policy, pool)
    self.sampler.initialize(env, policy, pool)
    # Evaluation runs on a copy of the env so training state is untouched.
    evaluation_env = env.copy() if self._eval_n_episodes else None
    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)
    self._training_before_hook()
    # Loop bound starts at self._epoch so training can resume mid-run.
    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')
        start_samples = self.sampler._total_samples
        for i in count():
            samples_now = self.sampler._total_samples
            self._timestep = samples_now - start_samples
            # One epoch == _epoch_length new environment samples.
            if samples_now >= start_samples + self._epoch_length:
                break
            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')
            self._do_sampling(timestep=self._total_timestep)
            gt.stamp('sample')
            if self.ready_to_train:
                self._do_training_repeats(timestep=self._total_timestep)
                gt.stamp('train')
            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')
        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths = self._evaluation_paths(policy, evaluation_env)
        gt.stamp('evaluation_paths')
        training_metrics = self._evaluate_rollouts(training_paths, env)
        gt.stamp('training_metrics')
        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_env)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}
        self._epoch_after_hook(training_paths)
        gt.stamp('epoch_after_hook')
        sampler_diagnostics = self.sampler.get_diagnostics()
        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)
        time_diagnostics = gt.get_times().stamps.itrs
        # Flatten all metric groups into namespaced keys, sorted for a
        # stable column order across epochs.
        diagnostics.update(OrderedDict((
            *((f'evaluation/{key}', evaluation_metrics[key])
              for key in sorted(evaluation_metrics.keys())),
            *((f'training/{key}', training_metrics[key])
              for key in sorted(training_metrics.keys())),
            *((f'times/{key}', time_diagnostics[key][-1])
              for key in sorted(time_diagnostics.keys())),
            *((f'sampler/{key}', sampler_diagnostics[key])
              for key in sorted(sampler_diagnostics.keys())),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('timesteps_total', self._total_timestep),
            ('train-steps', self._num_train_steps),
        )))
        if self._eval_render_mode is not None and hasattr(
                evaluation_env, 'render_rollouts'):
            # TODO(hartikainen): Make this consistent such that there's no
            # need for the hasattr check.
            env.render_rollouts(evaluation_paths)
        yield diagnostics
    self.sampler.terminate()
    self._training_after_hook()
def _train(self):
    """Return a generator that performs RL training.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to

    Yields:
        Per-epoch diagnostics dicts, then a final ``{'done': True, ...}``.
    """
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy
    pool = self._pool
    if not self._training_started:
        self._init_training()
        self._initial_exploration_hook(
            training_environment, self._initial_exploration_policy, pool)
    self.sampler.initialize(training_environment, policy, pool)
    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)
    self._training_before_hook()
    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        # NOTE(review): raw print debugging left in; consider a logger.
        print('starting epoch', self._epoch)
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')
        start_samples = self.sampler._total_samples
        print('start samples', start_samples)
        for i in count():
            samples_now = self.sampler._total_samples
            # print('samples now', samples_now)
            self._timestep = samples_now - start_samples
            # Progress print every 100 remaining samples.
            if (-samples_now + (start_samples + self._epoch_length)) % 100 == 0:
                print('samples needed',
                      -samples_now + (start_samples + self._epoch_length))
            if (samples_now >= start_samples + self._epoch_length
                    and self.ready_to_train):
                break
            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')
            self._do_sampling(timestep=self._total_timestep)
            gt.stamp('sample')
            if self.ready_to_train:
                self._do_training_repeats(timestep=self._total_timestep)
                gt.stamp('train')
            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')
        print('after hook', self._epoch)
        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths = self._evaluation_paths(policy, evaluation_environment)
        gt.stamp('evaluation_paths')
        training_metrics = self._evaluate_rollouts(training_paths,
                                                   training_environment)
        gt.stamp('training_metrics')
        # Optionally dump raw training paths to ./paths every
        # _path_save_frequency epochs.
        should_save_path = (
            self._path_save_frequency > 0
            and self._epoch % self._path_save_frequency == 0)
        if should_save_path:
            import pickle
            for i, path in enumerate(training_paths):
                #path.pop('images')
                path_file_name = f'training_path_{self._epoch}_{i}.pkl'
                path_file_path = os.path.join(
                    os.getcwd(), 'paths', path_file_name)
                if not os.path.exists(os.path.dirname(path_file_path)):
                    os.makedirs(os.path.dirname(path_file_path))
                with open(path_file_path, 'wb') as f:
                    pickle.dump(path, f)
        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_environment)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}
        self._epoch_after_hook(training_paths)
        gt.stamp('epoch_after_hook')
        sampler_diagnostics = self.sampler.get_diagnostics()
        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)
        time_diagnostics = gt.get_times().stamps.itrs
        # Namespaced, sorted keys keep the diagnostics schema stable.
        diagnostics.update(OrderedDict((
            *((f'evaluation/{key}', evaluation_metrics[key])
              for key in sorted(evaluation_metrics.keys())),
            *((f'training/{key}', training_metrics[key])
              for key in sorted(training_metrics.keys())),
            *((f'times/{key}', time_diagnostics[key][-1])
              for key in sorted(time_diagnostics.keys())),
            *((f'sampler/{key}', sampler_diagnostics[key])
              for key in sorted(sampler_diagnostics.keys())),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('timesteps_total', self._total_timestep),
            ('train-steps', self._num_train_steps),
        )))
        if self._eval_render_mode is not None and hasattr(
                evaluation_environment, 'render_rollouts'):
            # TODO(hartikainen): Make this consistent such that there's no
            # need for the hasattr check.
            training_environment.render_rollouts(evaluation_paths)
        yield diagnostics
    self.sampler.terminate()
    self._training_after_hook()
    yield {'done': True, **diagnostics}
def _train(self, env, policy):
    """Run the online training loop: sample, store, train, evaluate, log.

    Args:
        env: Environment providing reset()/step()/render()/terminate().
        policy: Policy providing get_action() and reset().
    """
    self._init_training(env, policy)
    with self._sess.as_default():
        observation = env.reset()
        policy.reset()
        itr = 0           # total gradient-update iterations so far
        path_length = 0   # steps in the current episode
        path_return = 0   # accumulated (scaled) reward of current episode
        gt.rename_root('online algo')
        gt.reset()
        gt.set_def_unique(False)
        for epoch in gt.timed_for(range(self._n_epochs), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)
            for t in range(self._epoch_length):
                # Sample next action and state.
                action, _ = policy.get_action(observation)
                gt.stamp('train: get actions')
                # NOTE(review): the squeezed result is discarded —
                # presumably env.step tolerates the extra dim; confirm
                # whether this was meant to be `action = action.squeeze()`.
                action.squeeze()
                if self._render:
                    env.render()
                next_ob, raw_reward, terminal, info = env.step(action)
                reward = raw_reward * self._scale_reward
                path_length += 1
                path_return += reward
                gt.stamp('train: simulation')
                # Add experience to replay pool.
                self._pool.add_sample(observation, action, reward, terminal,
                                      False)
                should_reset = (terminal
                                or path_length >= self._max_path_length)
                if should_reset:
                    # Terminal bookkeeping sample with zeroed fields.
                    # noinspection PyTypeChecker
                    self._pool.add_sample(next_ob,
                                          np.zeros_like(action),
                                          np.zeros_like(reward),
                                          np.zeros_like(terminal),
                                          True)
                    observation = env.reset()
                    policy.reset()
                    path_length = 0
                    path_return = 0
                else:
                    observation = next_ob
                gt.stamp('train: fill replay pool')
                # Train only once the pool holds enough samples.
                if self._pool.size >= self._min_pool_size:
                    self._do_training(itr)
                    itr += 1
                gt.stamp('train: updates')
            # Evaluate.
            self._evaluate(epoch)
            gt.stamp("test")
            # Log.
            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs
            # Sum all stamps prefixed 'train: ' for the epoch train time.
            train_time = np.sum([
                times_itrs[key][-1] for key in times_itrs.keys()
                if 'train: ' in key
            ])
            eval_time = times_itrs["test"][-1]
            total_time = gt.get_times().total
            logger.record_tabular("time: train", train_time)
            logger.record_tabular("time: eval", eval_time)
            logger.record_tabular("time: total", total_time)
            logger.record_tabular("scale_reward", self._scale_reward)
            logger.record_tabular("epochs", epoch)
            logger.record_tabular("steps: all", itr)
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            gt.stamp("logging")
        print(
            gt.report(
                include_itrs=False,
                format_options={'itr_name_width': 30},
            ))
        env.terminate()
# NOTE(review): headless fragment — this chunk begins mid-call (the opening of
# the coreset/acquisition call is outside this view) and ends mid-script, so it
# cannot be reformatted into valid standalone Python. Preserved verbatim below.
feat_data, posterior_tb, save_dir=save_dir, **cs_kwargs) # cs = coreset(acq, feat_data, posterior, save_dir=save_dir, **cs_kwargs) # replace imputed labels by true labels data.y[data.index['train'][-batch_size:]] = np.vstack( true_labels) else: batch = cs.build(batch_size) data.move_from_unlabeled_to_train(batch) gt.stamp('batch_selection', unique=False) print() t = gt.get_times().stamps.cum test_performances['num_samples'].append(num_samples) test_performances['wt'].append(t['model_training'] + t['batch_selection']) test_performances['wt_batch'].append(t['batch_selection']) test_performances['LL'].append(-test_nll.mean()) test_performances['RMSE'].append(test_performance.mean()) optim_params = { 'num_epochs': args.training_epochs, 'batch_size': get_batch_size(args.dataset, data), 'weight_decay': args.weight_decay, 'initial_lr': args.initial_lr } nl = NeuralLinearTB(data, out_features=out_features, **kwargs) # nl = NeuralLinear(data, out_features=out_features, **kwargs)
def _train(self, env, policy, pool):
    """When training our policy expects an augmented observation.

    The observation is concatenated with a skill latent ``z`` (one z sampled
    per epoch); optionally fits the skill prior p(z) from discriminator
    log-probabilities when ``self._learn_p_z`` is set.
    """
    self._init_training(env, policy, pool)
    with self._sess.as_default():
        observation = env.reset()
        policy.reset()
        log_p_z_episode = []  # Store log_p_z for this episode
        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0
        if self._learn_p_z:
            # One bounded history of log p(z) per skill.
            log_p_z_list = [
                deque(maxlen=self._max_path_length)
                for _ in range(self._num_skills)
            ]
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)
        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)
            path_length_list = []
            # One skill latent per epoch.
            z = self._sample_z()
            aug_obs = utils.concat_obs_z(observation, z, self._num_skills,
                                         concat_type=self.concat_type)
            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length
                action, _ = policy.get_action(aug_obs)
                if self._learn_p_z:
                    # Discriminator's log-prob of the active skill for this
                    # (obs, action) pair.
                    (obs, _) = utils.split_aug_obs(aug_obs, self._num_skills)
                    feed_dict = {
                        self._discriminator._obs_pl: obs[None],
                        self._discriminator._action_pl: action[None]
                    }
                    logits = tf_utils.get_default_session().run(
                        self._discriminator._output_t, feed_dict)[0]
                    log_p_z = np.log(utils._softmax(logits)[z])
                    if self._learn_p_z:
                        log_p_z_list[z].append(log_p_z)
                next_ob, reward, terminal, info = env.step(action)
                aug_next_ob = utils.concat_obs_z(
                    next_ob, z, self._num_skills,
                    concat_type=self.concat_type)
                path_length += 1
                path_return += reward
                self._pool.add_sample(
                    aug_obs,
                    action,
                    reward,
                    terminal,
                    aug_next_ob,
                )
                if terminal or path_length >= self._max_path_length:
                    path_length_list.append(path_length)
                    observation = env.reset()
                    policy.reset()
                    log_p_z_episode = []
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return
                    path_return = 0
                    n_episodes += 1
                    # NOTE(review): aug_obs is not rebuilt from the reset
                    # observation here, so the next step reuses the stale
                    # augmented obs until the epoch rolls over — confirm
                    # whether this is intended.
                else:
                    aug_obs = aug_next_ob
                gt.stamp('sample')
                if self._pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self._pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)
                gt.stamp('train')
            if self._learn_p_z:
                print('learning p(z)')
                for z in range(self._num_skills):
                    if log_p_z_list[z]:
                        print(
                            '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d'
                            % (z, np.min(log_p_z_list[z]),
                               np.max(log_p_z_list[z]),
                               np.mean(log_p_z_list[z]),
                               len(log_p_z_list[z])))
                # Skills with no data fall back to the uniform prior.
                log_p_z = [
                    np.mean(log_p_z)
                    if log_p_z else np.log(1.0 / self._num_skills)
                    for log_p_z in log_p_z_list
                ]
                print('log_p_z: %s' % log_p_z)
                self._p_z = utils._softmax(log_p_z)
            self._evaluate(epoch)
            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self._pool.size)
            logger.record_tabular('path-length', np.mean(path_length_list))
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            gt.stamp('eval')
        env.terminate()
def _log_data(self, epoch, write_header=False):
    """Snapshot parameters and dump one row of tabular diagnostics.

    Args:
        epoch: Current epoch index, recorded under "Epoch".
        write_header: Forwarded to ``logger.dump_tabular`` so the first
            dump can emit the CSV header.

    Raises:
        AttributeError: If the set of table keys differs from the
            previous call (the tabular schema must stay fixed).
    """
    # Persist the algorithm-specific snapshot for this epoch.
    logger.save_itr_params(epoch, self.get_epoch_snapshot(epoch))

    # The tabular logger requires an identical key set every iteration;
    # a changed schema would silently corrupt the CSV columns.
    current_keys = logger.get_table_key_set()
    previous_keys = self._old_table_keys
    if previous_keys is not None and current_keys != previous_keys:
        raise AttributeError(
            "Table keys cannot change from iteration to iteration.\n"
            f"table_keys: {current_keys}\n"
            f"old_table_keys: {previous_keys}"
            f"not in new: {np.setdiff1d(list(current_keys), list(previous_keys))}"
            f"not in old:{np.setdiff1d(list(previous_keys), list(current_keys))}")
    self._old_table_keys = current_keys

    # Step counters.
    for label, value in (
            ("Number of train steps total", self._n_total_train_steps),
            ("Number of env steps total", self._n_env_steps_total),
            ("Number of rollouts total", self._n_rollouts_total),
    ):
        logger.record_tabular(label, value)

    # Timing: last iteration of each gtimer stamp. 'eval' may be absent
    # before the first evaluation, hence the membership guard.
    stamp_itrs = gt.get_times().stamps.itrs
    t_train = stamp_itrs['train'][-1]
    t_sample = stamp_itrs['sample'][-1]
    # eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
    t_eval = stamp_itrs['eval'][-1] if 'eval' in stamp_itrs else 0
    t_epoch = t_train + t_sample + t_eval
    t_total = gt.get_times().total
    logger.record_tabular('Train Time (s)', t_train)
    logger.record_tabular('(Previous) Eval Time (s)', t_eval)
    logger.record_tabular('Sample Time (s)', t_sample)
    logger.record_tabular('Epoch Time (s)', t_epoch)
    logger.record_tabular('Total Train Time (s)', t_total)
    logger.record_tabular("Epoch", epoch)

    # Emit the row.
    logger.dump_tabular(with_prefix=False,
                        with_timestamp=False,
                        write_header=write_header)
def train(self):
    """
    CG: the function that conducts ensemble training.

    Maintains a list of actor-critic instances (``self._alg_instances``,
    each entry: [algorithm, running_return, episode_count, ucb_score]),
    picks the exploring instance per episode (UCB or uniform), trains all
    instances on shared batches, and evaluates per epoch.
    :return:
    """
    # Set up parameters for the training process.
    self._n_epochs = self._base_ac_params['n_epochs']
    self._epoch_length = self._base_ac_params['epoch_length']
    self._n_train_repeat = self._base_ac_params['n_train_repeat']
    self._n_initial_exploration_steps = self._base_ac_params[
        'n_initial_exploration_steps']
    self._eval_render = self._base_ac_params['eval_render']
    self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
    self._eval_deterministic = self._base_ac_params['eval_deterministic']
    # Set up the evaluation environment.
    if self._eval_n_episodes > 0:
        with tf.variable_scope("low_level_policy", reuse=True):
            self._eval_env = deep_clone(self._env)
    # Set up the tensor flow session.
    self._sess = tf_utils.get_default_session()
    # Import required libraries for training.
    import random
    import math
    import operator
    import numpy as np
    # Initialize the sampler.
    alg_ins = random.choice(self._alg_instances)
    self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)
    # Perform the training/evaluation process.
    num_episode = 0.
    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)
        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)
            for t in range(self._epoch_length):
                isEpisodeEnd = self._sampler.sample()
                # If an episode is ended, we need to update performance statistics for each AC instance and
                # pick randomly another AC instance for next episode of exploration.
                if isEpisodeEnd:
                    num_episode = num_episode + 1.
                    # Exponential moving average of the return; episode count.
                    alg_ins[1] = 0.9 * alg_ins[
                        1] + 0.1 * self._sampler._last_path_return
                    alg_ins[2] = alg_ins[2] + 1.
                    if self._use_ucb:
                        # Select an algorithm instance based on UCB.
                        selected = False
                        for ains in self._alg_instances:
                            # Any instance never tried gets priority.
                            if ains[2] < 1.:
                                alg_ins = ains
                                selected = True
                                break
                            else:
                                # UCB1 exploration bonus.
                                ains[3] = ains[1] + math.sqrt(
                                    2.0 * math.log(num_episode) / ains[2])
                        if not selected:
                            alg_ins = max(self._alg_instances,
                                          key=operator.itemgetter(3))
                    else:
                        # Select an algorithm instance uniformly at random.
                        alg_ins = random.choice(self._alg_instances)
                    # NOTE(review): indentation reconstructed — set_policy is
                    # assumed to run for both UCB and random selection;
                    # confirm against the original file.
                    self._sampler.set_policy(alg_ins[0].policy)
                if not self._sampler.batch_ready():
                    continue
                gt.stamp('sample')
                # Perform training over all AC instances.
                for i in range(self._n_train_repeat):
                    batch = self._sampler.random_batch()
                    for ains in self._alg_instances:
                        ains[0]._do_training(iteration=t +
                                             epoch * self._epoch_length,
                                             batch=batch)
                gt.stamp('train')
            # Perform evaluation after one full epoch of training is completed.
            if self._eval_n_episodes < 1:
                continue
            if self._evaluation_strategy == 'ensemble':
                # Use a whole ensemble of AC instances for evaluation.
                paths = rollouts(self._eval_env, self,
                                 self._sampler._max_path_length,
                                 self._eval_n_episodes)
            elif self._evaluation_strategy == 'best-policy':
                # Choose the AC instance with the highest observed performance so far for evaluation.
                eval_alg_ins = max(self._alg_instances,
                                   key=operator.itemgetter(1))
                with eval_alg_ins[0].policy.deterministic(
                        self._eval_deterministic):
                    paths = rollouts(self._eval_env,
                                     eval_alg_ins[0].policy,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)
            else:
                paths = None
            if paths is not None:
                total_returns = [path['rewards'].sum() for path in paths]
                episode_lengths = [len(p['rewards']) for p in paths]
                logger.record_tabular('return-average',
                                      np.mean(total_returns))
                logger.record_tabular('return-min', np.min(total_returns))
                logger.record_tabular('return-max', np.max(total_returns))
                logger.record_tabular('return-std', np.std(total_returns))
                logger.record_tabular('episode-length-avg',
                                      np.mean(episode_lengths))
                logger.record_tabular('episode-length-min',
                                      np.min(episode_lengths))
                logger.record_tabular('episode-length-max',
                                      np.max(episode_lengths))
                logger.record_tabular('episode-length-std',
                                      np.std(episode_lengths))
                self._eval_env.log_diagnostics(paths)
                if self._eval_render:
                    self._eval_env.render(paths)
            # Produce log info after each episode of training and evaluation.
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            self._sampler.log_diagnostics()
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            gt.stamp('eval')
    # Terminate the sampler after the training process is completed.
    self._sampler.terminate()
def _train(self):
    """Return a generator that performs RL training.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to

    Yields:
        Per-epoch diagnostics dicts, then a final ``{'done': True, ...}``.
    """
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy
    pool = self._pool
    model_metrics = {}
    if not self._training_started:
        self._init_training()
    self.sampler.initialize(training_environment, policy, pool)
    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)
    self._training_before_hook()
    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        # Retrain the dynamics model every 200 epochs.
        if self._epoch % 200 == 0:
            #### model training
            print('[ MOPO ] log_dir: {} | ratio: {}'.format(
                self._log_dir, self._real_ratio))
            print(
                '[ MOPO ] Training model at epoch {} | freq {} | timestep {} (total: {})'
                .format(self._epoch, self._model_train_freq, self._timestep,
                        self._total_timestep))
            # A pre-loaded model only gets one fine-tuning epoch.
            max_epochs = 1 if self._model.model_loaded else None
            model_train_metrics = self._train_model(
                batch_size=256,
                max_epochs=max_epochs,
                holdout_ratio=0.2,
                max_t=self._max_model_t)
            model_metrics.update(model_train_metrics)
            self._log_model()
            gt.stamp('epoch_train_model')
        ####
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')
        self._training_progress = Progress(self._epoch_length *
                                           self._n_train_repeat)
        start_samples = self.sampler._total_samples
        for timestep in count():
            self._timestep = timestep
            if (timestep >= self._epoch_length and self.ready_to_train):
                break
            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')
            ## model rollouts
            if timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
                self._training_progress.pause()
                self._set_rollout_length()
                self._reallocate_model_pool()
                model_rollout_metrics = self._rollout_model(
                    rollout_batch_size=self._rollout_batch_size,
                    deterministic=self._deterministic)
                model_metrics.update(model_rollout_metrics)
                gt.stamp('epoch_rollout_model')
                self._training_progress.resume()
            ## train actor and critic
            if self.ready_to_train:
                self._do_training_repeats(timestep=timestep)
                gt.stamp('train')
            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')
        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))
        evaluation_paths = self._evaluation_paths(policy,
                                                  evaluation_environment)
        gt.stamp('evaluation_paths')
        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_environment)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}
        gt.stamp('epoch_after_hook')
        sampler_diagnostics = self.sampler.get_diagnostics()
        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)
        time_diagnostics = gt.get_times().stamps.itrs
        # Namespaced, sorted keys keep the diagnostics schema stable.
        diagnostics.update(OrderedDict((
            *((f'evaluation/{key}', evaluation_metrics[key])
              for key in sorted(evaluation_metrics.keys())),
            *((f'times/{key}', time_diagnostics[key][-1])
              for key in sorted(time_diagnostics.keys())),
            *((f'sampler/{key}', sampler_diagnostics[key])
              for key in sorted(sampler_diagnostics.keys())),
            *((f'model/{key}', model_metrics[key])
              for key in sorted(model_metrics.keys())),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('timesteps_total', self._total_timestep),
            ('train-steps', self._num_train_steps),
        )))
        if self._eval_render_mode is not None and hasattr(
                evaluation_environment, 'render_rollouts'):
            training_environment.render_rollouts(evaluation_paths)
        ## ensure we did not collect any more data
        assert self._pool.size == self._init_pool_size
        yield diagnostics
    # Final model-based evaluation rollout, dumped to CSV.
    epi_ret = self._rollout_model_for_eval(self._training_environment.reset)
    np.savetxt("EEepi_ret__fin.csv", epi_ret, delimiter=',')
    self.sampler.terminate()
    self._training_after_hook()
    self._training_progress.close()
    yield {'done': True, **diagnostics}
"""Small demo of the gtimer API: stamps, timed_for, and a manual timed_loop."""
import time
import gtimer as gt

time.sleep(0.1)
gt.stamp('first')

# Timed for-loop: per-iteration stamp times accumulate in stamps.itrs.
for i in gt.timed_for([1, 2, 3]):
    time.sleep(0.1)
    gt.stamp('loop_1')
    if i > 1:
        time.sleep(0.1)
        gt.stamp('loop_2')
    times_itrs = gt.get_times().stamps.itrs
    print(times_itrs)

time.sleep(0.1)
gt.stamp('second')
time.sleep(0.1)

# Manual timed loop: advance with next(), close with exit().
loop = gt.timed_loop('named_loop', save_itrs=True)
x = 0
while x < 3:
    loop.next()
    time.sleep(0.1)
    x += 1
    gt.stamp('loop')
# Fix: the original had a bare `gt.attach` here — an attribute access whose
# result was discarded, i.e. a statement with no effect — so it is removed.
loop.exit()
time.sleep(0.1)
times_itrs = gt.get_times().stamps.itrs
print(times_itrs)
def _try_to_eval(self, epoch):
    """Save snapshots and dump tabular diagnostics for ``epoch``.

    Runs only every ``self.logging_period`` epochs. Saves extra data
    (always for manually flagged epochs, optionally every epoch), then —
    if evaluation is possible — evaluates, snapshots parameters, and dumps
    one tabular row. Table keys must not change between iterations.
    """
    if epoch % self.logging_period != 0:
        return
    if epoch in self.save_extra_manual_epoch_set:
        # Manually requested full snapshot, kept under a per-epoch name.
        logger.save_extra_data(
            self.get_extra_data_to_save(epoch),
            file_name='extra_snapshot_itr{}'.format(epoch),
            mode='cloudpickle',
        )
    if self._save_extra_every_epoch:
        logger.save_extra_data(self.get_extra_data_to_save(epoch))
    gt.stamp('save-extra')
    if self._can_evaluate():
        self.evaluate(epoch)
        gt.stamp('eval')
        params = self.get_epoch_snapshot(epoch)
        logger.save_itr_params(epoch, params)
        gt.stamp('save-snapshot')
        # The tabular schema must stay fixed across iterations.
        table_keys = logger.get_table_key_set()
        if self._old_table_keys is not None:
            assert table_keys == self._old_table_keys, (
                "Table keys cannot change from iteration to iteration.")
        self._old_table_keys = table_keys
        logger.record_dict(
            self.trainer.get_diagnostics(),
            prefix='trainer/',
        )
        logger.record_tabular(
            "Number of train steps total",
            self._n_train_steps_total,
        )
        logger.record_tabular(
            "Number of env steps total",
            self._n_env_steps_total,
        )
        logger.record_tabular(
            "Number of rollouts total",
            self._n_rollouts_total,
        )
        # Last-iteration gtimer stamp durations.
        times_itrs = gt.get_times().stamps.itrs
        train_time = times_itrs['train'][-1]
        sample_time = times_itrs['sample'][-1]
        save_extra_time = times_itrs['save-extra'][-1]
        save_snapshot_time = times_itrs['save-snapshot'][-1]
        # Epoch 0 reports 0 eval time (no previous eval iteration).
        eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
        epoch_time = train_time + sample_time + save_extra_time + eval_time
        total_time = gt.get_times().total
        logger.record_tabular('in_unsupervised_model',
                              float(self.in_unsupervised_phase))
        logger.record_tabular('Train Time (s)', train_time)
        logger.record_tabular('(Previous) Eval Time (s)', eval_time)
        logger.record_tabular('Sample Time (s)', sample_time)
        logger.record_tabular('Save Extra Time (s)', save_extra_time)
        logger.record_tabular('Save Snapshot Time (s)', save_snapshot_time)
        logger.record_tabular('Epoch Time (s)', epoch_time)
        logger.record_tabular('Total Train Time (s)', total_time)
        logger.record_tabular("Epoch", epoch)
        logger.dump_tabular(with_prefix=False, with_timestamp=False)
    else:
        logger.log("Skipping eval for now.")
def _train(self, env, policy, pool):
    """When training our policy expects an augmented observation.

    Observations are augmented with a latent ``z ~ p(z|goal)``; episodes
    are stored via hindsight relabeling. The goal is resampled after
    ``self._n_latents`` episodes with the current goal.
    """
    self._init_training(env, policy, pool)
    with self._sess.as_default():
        # reset with goal
        goal = env.sample_goal()
        observation = env.reset(goal=goal)
        policy.reset()
        # sample z ~ p(z|g)
        z = self._embedding.get_z(goal=goal)
        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0
        trajectory = []
        z_indx = 0  # episodes completed with the current goal
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)
        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)
            path_length_list = []
            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length
                # flatten observation with given latent z
                aug_obs = np.concatenate((observation['observation'], z))
                action, _ = policy.get_action(aug_obs)
                next_ob, reward, terminal, info = env.step(action)
                # assert all(next_ob['desired_goal'] == goal)
                # Sanity check: env reward must match the recomputed one.
                assert reward == env.compute_reward(
                    next_ob['achieved_goal'], next_ob['desired_goal'], info)
                path_length += 1
                path_return += reward
                trajectory.append(
                    (observation, action, reward, next_ob, terminal))
                if terminal or path_length >= self._max_path_length:
                    path_length_list.append(path_length)
                    # add hindsight samples
                    self._pool.add_hindsight_episode(
                        episode=trajectory,
                        embedding=self._embedding,
                        latent=z,
                        goal=goal,
                    )
                    z_indx += 1
                    # Resample the goal after _n_latents episodes with it.
                    if z_indx >= self._n_latents:
                        goal = env.sample_goal()
                        z_indx = 0
                    z = self._embedding.get_z(goal=goal)
                    observation = env.reset(goal=goal)
                    policy.reset()
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return
                    path_return = 0
                    n_episodes += 1
                    trajectory = []
                else:
                    observation = next_ob
                gt.stamp('sample')
                if self._pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self._pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)
                gt.stamp('train')
            self._evaluate(epoch)
            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('steps', iteration)  # also record total steps
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self._pool.size)
            logger.record_tabular('path-length', np.mean(path_length_list))
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            gt.stamp('eval')
        env.terminate()
def _train(self):
    """Return a generator that performs RL training.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to

    Yields:
        Per-epoch diagnostics dicts (with per-goal metric namespaces),
        then a final ``{'done': True, ...}``.
    """
    import gtimer as gt
    from itertools import count
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    # One metrics slot per goal/policy.
    training_metrics = [0 for _ in range(self._num_goals)]
    if not self._training_started:
        self._init_training()
        for i in range(self._num_goals):
            self._initial_exploration_hook(
                training_environment, self._initial_exploration_policy, i)
    self._initialize_samplers()
    self._sample_count = 0
    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)
    print("starting_training")
    self._training_before_hook()
    import time
    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')
        # Epoch progress is measured across ALL goal samplers combined.
        start_samples = sum([
            self._samplers[i]._total_samples for i in range(self._num_goals)
        ])
        sample_times = []
        for i in count():
            samples_now = sum([
                self._samplers[i]._total_samples
                for i in range(self._num_goals)
            ])
            self._timestep = samples_now - start_samples
            if samples_now >= start_samples + self._epoch_length and self.ready_to_train:
                break
            t0 = time.time()
            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')
            self._do_sampling(timestep=self._total_timestep)
            gt.stamp('sample')
            sample_times.append(time.time() - t0)
            t0 = time.time()
            if self.ready_to_train:
                self._do_training_repeats(timestep=self._total_timestep)
                gt.stamp('train')
                # print("Train time: ", time.time() - t0)
            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')
        # TODO diagnostics per goal
        print("Average Sample Time: ", np.mean(np.array(sample_times)))
        print("Step count", self._sample_count)
        training_paths_per_policy = self._training_paths()
        # self.sampler.get_last_n_paths(
        #     math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths_per_policy = self._evaluation_paths()
        gt.stamp('evaluation_paths')
        training_metrics_per_policy = self._evaluate_rollouts(
            training_paths_per_policy, training_environment)
        gt.stamp('training_metrics')
        if evaluation_paths_per_policy:
            evaluation_metrics_per_policy = self._evaluate_rollouts(
                evaluation_paths_per_policy, evaluation_environment)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics_per_policy = [
                {} for _ in range(self._num_goals)
            ]
        self._epoch_after_hook(training_paths_per_policy)
        gt.stamp('epoch_after_hook')
        t0 = time.time()
        sampler_diagnostics_per_policy = [
            self._samplers[i].get_diagnostics()
            for i in range(self._num_goals)
        ]
        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batches=self._evaluation_batches(),
            training_paths_per_policy=training_paths_per_policy,
            evaluation_paths_per_policy=evaluation_paths_per_policy)
        time_diagnostics = gt.get_times().stamps.itrs
        print("Basic diagnostics: ", time.time() - t0)
        print("Sample count: ", self._sample_count)
        diagnostics.update(OrderedDict((
            *((f'times/{key}', time_diagnostics[key][-1])
              for key in sorted(time_diagnostics.keys())),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('timesteps_total', self._total_timestep),
            ('train-steps', self._num_train_steps),
        )))
        print("Other basic diagnostics: ", time.time() - t0)
        # Merge each goal's metrics under goal-indexed namespaces.
        for i, (evaluation_metrics, training_metrics, sampler_diagnostics) in (
                enumerate(zip(evaluation_metrics_per_policy,
                              training_metrics_per_policy,
                              sampler_diagnostics_per_policy))):
            diagnostics.update(OrderedDict((
                *((f'evaluation_{i}/{key}', evaluation_metrics[key])
                  for key in sorted(evaluation_metrics.keys())),
                *((f'training_{i}/{key}', training_metrics[key])
                  for key in sorted(training_metrics.keys())),
                *((f'sampler_{i}/{key}', sampler_diagnostics[key])
                  for key in sorted(sampler_diagnostics.keys())),
            )))
        # if self._eval_render_kwargs and hasattr(
        #         evaluation_environment, 'render_rollouts'):
        #     # TODO(hartikainen): Make this consistent such that there's no
        #     # need for the hasattr check.
        #     training_environment.render_rollouts(evaluation_paths)
        yield diagnostics
        print("Diagnostic time: ", time.time() - t0)
    for i in range(self._num_goals):
        self._samplers[i].terminate()
    self._training_after_hook()
    del evaluation_paths_per_policy
    yield {'done': True, **diagnostics}
def _train(self):
    """Return a generator that performs one epoch of RL training per step.

    Yields a diagnostics dict after every epoch and, once all epochs are
    finished, a final ``{'done': True, **diagnostics}`` sentinel.
    """
    # Cache frequently used collaborators in locals.
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy

    # Reset gtimer so per-iteration stamps start from a clean root.
    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    self._training_before_hook()

    # Starting from self._epoch (not 0) allows resuming a checkpointed run.
    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')

        # Accumulates the diagnostics dict returned by each
        # _do_training_repeats() call; averaged element-wise below.
        update_diagnostics = []

        start_samples = self.sampler._total_samples
        for i in count():
            samples_now = self.sampler._total_samples
            self._timestep = samples_now - start_samples

            # End the epoch once enough samples were collected AND the
            # algorithm reports it is ready to train.
            if (samples_now >= start_samples + self._epoch_length
                    and self.ready_to_train):
                break

            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            self._do_sampling(timestep=self._total_timestep)
            gt.stamp('sample')

            if self.ready_to_train:
                update_diagnostics.append(self._do_training_repeats(
                    timestep=self._total_timestep))
            gt.stamp('train')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

        # Element-wise mean across the per-timestep update diagnostics.
        # NOTE(review): if no training happened this epoch the list is empty
        # and tree.map_structure gets no structures — presumably
        # ready_to_train guarantees at least one entry; confirm.
        update_diagnostics = tree.map_structure(
            lambda *d: np.mean(d), *update_diagnostics)

        # Grab enough recent paths to cover the whole epoch.
        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths = self._evaluation_paths(
            policy, evaluation_environment)
        gt.stamp('evaluation_paths')

        training_metrics = self._evaluate_rollouts(
            training_paths, training_environment,
            self._total_timestep, evaluation_type='train')
        gt.stamp('training_metrics')
        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_environment,
                self._total_timestep, evaluation_type='evaluation')
            gt.stamp('evaluation_metrics')
        else:
            # Evaluation may be disabled; keep downstream code uniform.
            evaluation_metrics = {}

        self._epoch_after_hook(training_paths)
        gt.stamp('epoch_after_hook')

        sampler_diagnostics = self.sampler.get_diagnostics()

        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)

        # Keep only each stamp's most recent per-iteration duration.
        time_diagnostics = {
            key: times[-1]
            for key, times in gt.get_times().stamps.itrs.items()
        }

        # TODO(hartikainen/tf2): Fix the naming of training/update
        # diagnostics/metric
        diagnostics.update((
            ('evaluation', evaluation_metrics),
            ('training', training_metrics),
            ('update', update_diagnostics),
            ('times', time_diagnostics),
            ('sampler', sampler_diagnostics),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('total_timestep', self._total_timestep),
            ('num_train_steps', self._num_train_steps),
        ))

        if self._eval_render_kwargs and hasattr(
                evaluation_environment, 'render_rollouts'):
            # TODO(hartikainen): Make this consistent such that there's no
            # need for the hasattr check.
            training_environment.render_rollouts(evaluation_paths)

        yield diagnostics

    self.sampler.terminate()

    self._training_after_hook()

    # Final sentinel so consumers know training has completed.
    yield {'done': True, **diagnostics}
def _train(self):
    """Return a generator that performs RL training.

    Runs sampling/training epochs, yields a diagnostics dict per epoch,
    and additionally writes debug artifacts (trajectory scatter plots and,
    when RND intrinsic rewards are enabled, prediction-error heatmaps) to
    the current working directory.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy
    pool = self._pool

    if not self._training_started:
        self._init_training()

        # Warm up the pool with the dedicated exploration policy before
        # on-policy sampling starts.
        self._initial_exploration_hook(
            training_environment, self._initial_exploration_policy, pool)

    self.sampler.initialize(training_environment, policy, pool)

    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    self._training_before_hook()

    # Local import used only for the wall-clock sampling-time debug print.
    import time

    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')

        start_samples = self.sampler._total_samples
        # Wall-clock duration of each _do_sampling call (debug only).
        sample_times = []
        for i in count():
            samples_now = self.sampler._total_samples
            self._timestep = samples_now - start_samples

            # End the epoch once enough samples were collected AND the
            # algorithm is ready to train.
            if (samples_now >= start_samples + self._epoch_length
                    and self.ready_to_train):
                break

            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            t0 = time.time()
            self._do_sampling(timestep=self._total_timestep)
            gt.stamp('sample')
            sample_times.append(time.time() - t0)

            if self.ready_to_train:
                self._do_training_repeats(timestep=self._total_timestep)
            gt.stamp('train')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

        print("Average Sample Time: ", np.mean(np.array(sample_times)))

        training_paths = self._training_paths()
        # self.sampler.get_last_n_paths(
        #     math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths = self._evaluation_paths(
            policy, evaluation_environment)
        gt.stamp('evaluation_paths')

        training_metrics = self._evaluate_rollouts(
            training_paths, training_environment)
        gt.stamp('training_metrics')

        #should_save_path = (
        #    self._path_save_frequency > 0
        #    and self._epoch % self._path_save_frequency == 0)
        #if should_save_path:
        #    import pickle
        #    for i, path in enumerate(training_paths):
        #        #path.pop('images')
        #        path_file_name = f'training_path_{self._epoch}_{i}.pkl'
        #        path_file_path = os.path.join(
        #            os.getcwd(), 'paths', path_file_name)
        #        if not os.path.exists(os.path.dirname(path_file_path)):
        #            os.makedirs(os.path.dirname(path_file_path))
        #        with open(path_file_path, 'wb' ) as f:
        #            pickle.dump(path, f)

        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_environment)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}

        self._epoch_after_hook(training_paths)
        gt.stamp('epoch_after_hook')

        sampler_diagnostics = self.sampler.get_diagnostics()

        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)

        time_diagnostics = gt.get_times().stamps.itrs

        # Flatten all metric groups into namespaced keys for logging.
        diagnostics.update(OrderedDict((
            *((f'evaluation/{key}', evaluation_metrics[key])
              for key in sorted(evaluation_metrics.keys())),
            *((f'training/{key}', training_metrics[key])
              for key in sorted(training_metrics.keys())),
            *((f'times/{key}', time_diagnostics[key][-1])
              for key in sorted(time_diagnostics.keys())),
            *((f'sampler/{key}', sampler_diagnostics[key])
              for key in sorted(sampler_diagnostics.keys())),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('timesteps_total', self._total_timestep),
            ('train-steps', self._num_train_steps),
        )))

        # Debug visualization: scatter-plot every stored state observation.
        # NOTE(review): the [-20, 20] bounds are hard-coded — presumably a
        # 2-D point-mass/maze environment; confirm before reusing elsewhere.
        obs = self._pool.last_n_batch(
            self._pool.size)['observations']['state_observation']
        plt.cla()
        plt.clf()
        plt.xlim(-20, 20)
        plt.ylim(-20, 20)
        plt.plot(obs[:, 0], obs[:, 1])
        plt.savefig('traj_plot_%d.png' % (self._epoch))

        if self._rnd_int_rew_coeff:
            # Evaluate RND prediction error on a 2-D grid and save a
            # heatmap (uses a TF1-style session/placeholder feed).
            errors = []
            for i in np.arange(-20, 20, 0.5):
                error = []
                for j in np.arange(-20, 20, 0.5):
                    curr_pos = np.array([i, j])
                    err = self._session.run(
                        self._rnd_errors, {
                            self._placeholders['observations']['state_observation']: [curr_pos]
                        })[0]
                    error.append(err)
                errors.append(error)
            plt.cla()
            plt.clf()
            plt.imshow(np.asarray(errors)[:, :, 0])
            plt.savefig('errors_%d.png' % (self._epoch))

        if self._eval_render_kwargs and hasattr(
                evaluation_environment, 'render_rollouts'):
            # TODO(hartikainen): Make this consistent such that there's no
            # need for the hasattr check.
            training_environment.render_rollouts(evaluation_paths)

        yield diagnostics

    self.sampler.terminate()

    self._training_after_hook()

    # Drop the reference to the (potentially large) rollout data.
    del evaluation_paths

    yield {'done': True, **diagnostics}
def _try_to_eval(self, epoch, eval_paths=None):
    """Save snapshots and, when possible, evaluate and log one table row.

    On MPI rank 0 (or when MPI is unavailable), periodically saves extra
    data and parameter snapshots. Then, if evaluation is currently
    possible, runs evaluation and records step counters plus gtimer-based
    timing statistics before dumping one tabular log row.

    Args:
        epoch: Current epoch index.
        eval_paths: Optional pre-collected evaluation rollouts forwarded
            to ``self.evaluate``.
    """
    # Only the MPI rank-0 process writes snapshots to disk.
    if MPI and MPI.COMM_WORLD.Get_rank() == 0:
        if epoch % self.save_extra_data_interval == 0:
            logger.save_extra_data(self.get_extra_data_to_save(epoch))
        if epoch % self.num_epochs_per_param_save == 0:
            print("Attemping itr param save...")
            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            print(F"Itr{epoch} param saved!")

    if self._can_evaluate():
        self.evaluate(epoch, eval_paths=eval_paths)

        logger.record_tabular(
            "Number of train steps total",
            self._n_train_steps_total,
        )
        logger.record_tabular(
            "Number of env steps total",
            self._n_env_steps_total,
        )
        logger.record_tabular(
            "Number of rollouts total",
            self._n_rollouts_total,
        )

        times_itrs = gt.get_times().stamps.itrs
        # The training phase is split across several gtimer stamps rather
        # than a single 'train' stamp.
        training_loops = ['get_batch', 'update_normalizer', 'forward',
                          'compute_losses', 'qf1_loop', "policy_loss_forward",
                          'policy_loop', 'vf_loop']
        # BUG FIX: this previously summed *every* stamp in times_itrs
        # (including 'sample' and 'eval'), so "Train Time" double-counted
        # sampling/evaluation and inflated "Epoch Time". Sum only the
        # training sub-loops; skip stamps that have not fired yet.
        train_time = sum(times_itrs[loop][-1]
                         for loop in training_loops
                         if loop in times_itrs)
        sample_time = times_itrs['sample'][-1]
        if epoch > 0:
            eval_time = times_itrs['eval'][-1]
        else:
            # The 'eval' stamp does not exist before the first evaluation;
            # insert a zero entry so the per-key logging loop below can
            # iterate over all keys uniformly.
            times_itrs['eval'] = [0]
            eval_time = 0
        epoch_time = train_time + sample_time + eval_time
        total_time = gt.get_times().total

        # Log each stamp's latest duration. (Was a side-effect-only list
        # comprehension; a plain loop avoids building a throwaway list.)
        for key in times_itrs.keys():
            logger.record_tabular(key.title(), times_itrs[key][-1])
        logger.record_tabular('Train Time (s) ---', train_time)
        logger.record_tabular('(Previous) Eval Time (s) ---', eval_time)
        logger.record_tabular('Sample Time (s) ---', sample_time)
        logger.record_tabular('Epoch Time (s)', epoch_time)
        logger.record_tabular('Total Train Time (s)', total_time)

        logger.record_tabular("Epoch", epoch)

        table_keys = logger.get_table_key_set()
        # The CSV header is fixed after the first row; a changed key set
        # is not supported yet, so fail loudly after rewriting the header.
        if self._old_table_keys is not None and table_keys != self._old_table_keys:
            print("Table keys have changed. Rewriting header and filling with 0s")
            logger.update_header()
            raise NotImplementedError
        self._old_table_keys = table_keys

        logger.dump_tabular(with_prefix=False, with_timestamp=False)
    else:
        logger.log("Skipping eval for now.")
def _train(self):
    """Return a generator that performs model-based (MBPO-style) RL training.

    Interleaves dynamics-model training and model rollouts with
    environment sampling and policy updates; yields a diagnostics dict
    per epoch and a final ``{'done': True, ...}`` sentinel.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy
    pool = self._pool
    # Accumulates model-training and model-rollout metrics across epochs.
    model_metrics = {}

    if not self._training_started:
        self._init_training()

        self._initial_exploration_hook(
            training_environment, self._initial_exploration_policy, pool)

    self.sampler.initialize(training_environment, policy, pool)

    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    self._training_before_hook()

    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')

        self._training_progress = Progress(
            self._epoch_length * self._n_train_repeat)
        start_samples = self.sampler._total_samples
        for i in count():
            samples_now = self.sampler._total_samples
            self._timestep = samples_now - start_samples

            # End the epoch once enough samples were collected AND the
            # algorithm is ready to train.
            if (samples_now >= start_samples + self._epoch_length
                    and self.ready_to_train):
                break

            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            # Periodically retrain the dynamics model and refill the model
            # pool, but only when some model-generated data is used
            # (_real_ratio < 1.0).
            if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
                self._training_progress.pause()
                print('[ MBPO ] log_dir: {} | ratio: {}'.format(
                    self._log_dir, self._real_ratio))
                print(
                    '[ MBPO ] Training model at epoch {} | freq {} | timestep {} (total: {}) | epoch train steps: {} (total: {})'
                    .format(self._epoch, self._model_train_freq,
                            self._timestep, self._total_timestep,
                            self._train_steps_this_epoch,
                            self._num_train_steps))

                model_train_metrics = self._train_model(
                    batch_size=256, max_epochs=None, holdout_ratio=0.2,
                    max_t=self._max_model_t)
                model_metrics.update(model_train_metrics)
                gt.stamp('epoch_train_model')

                self._set_rollout_length()
                # Split very large rollout batches into mini-batches,
                # presumably to bound peak memory usage — TODO confirm.
                # NOTE(review): the inner `i` shadows the outer count()
                # loop variable; harmless since the outer `i` is unused,
                # but worth renaming.
                if self._rollout_batch_size > 30000:
                    factor = self._rollout_batch_size // 30000 + 1
                    mini_batch = self._rollout_batch_size // factor
                    for i in range(factor):
                        model_rollout_metrics = self._rollout_model(
                            rollout_batch_size=mini_batch,
                            deterministic=self._deterministic)
                else:
                    model_rollout_metrics = self._rollout_model(
                        rollout_batch_size=self._rollout_batch_size,
                        deterministic=self._deterministic)

                model_metrics.update(model_rollout_metrics)
                gt.stamp('epoch_rollout_model')
                # self._visualize_model(self._evaluation_environment, self._total_timestep)
                self._training_progress.resume()

            self._do_sampling(timestep=self._total_timestep)
            gt.stamp('sample')

            if self.ready_to_train:
                self._do_training_repeats(timestep=self._total_timestep)
            gt.stamp('train')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

        # Grab enough recent paths to cover the whole epoch.
        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths = self._evaluation_paths(
            policy, evaluation_environment)
        gt.stamp('evaluation_paths')

        training_metrics = self._evaluate_rollouts(
            training_paths, training_environment)
        gt.stamp('training_metrics')
        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_environment)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}

        self._epoch_after_hook(training_paths)
        gt.stamp('epoch_after_hook')

        sampler_diagnostics = self.sampler.get_diagnostics()

        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)

        time_diagnostics = gt.get_times().stamps.itrs

        # Flatten all metric groups into namespaced keys for logging.
        diagnostics.update(OrderedDict((
            *((f'evaluation/{key}', evaluation_metrics[key])
              for key in sorted(evaluation_metrics.keys())),
            *((f'training/{key}', training_metrics[key])
              for key in sorted(training_metrics.keys())),
            *((f'times/{key}', time_diagnostics[key][-1])
              for key in sorted(time_diagnostics.keys())),
            *((f'sampler/{key}', sampler_diagnostics[key])
              for key in sorted(sampler_diagnostics.keys())),
            *((f'model/{key}', model_metrics[key])
              for key in sorted(model_metrics.keys())),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('timesteps_total', self._total_timestep),
            ('train-steps', self._num_train_steps),
        )))

        if self._eval_render_mode is not None and hasattr(
                evaluation_environment, 'render_rollouts'):
            training_environment.render_rollouts(evaluation_paths)

        yield diagnostics

    self.sampler.terminate()

    self._training_after_hook()

    self._training_progress.close()

    # Final sentinel so consumers know training has completed.
    yield {'done': True, **diagnostics}
def _train(self, env, policy, pool):
    """Perform RL training.

    Classic single-loop trainer: steps the environment one transition at a
    time, stores samples in `pool`, trains once the pool is warm, and logs
    per-epoch statistics.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)

    # Runs inside the TF1 session so policy/training ops can execute.
    with self._sess.as_default():
        observation = env.reset()
        policy.reset()

        # Per-path bookkeeping for return statistics.
        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0

        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length

                # One environment transition.
                action, _ = policy.get_action(observation)
                next_ob, reward, terminal, info = env.step(action)
                path_length += 1
                path_return += reward

                self.pool.add_sample(
                    observation,
                    action,
                    reward,
                    terminal,
                    next_ob,
                )

                if terminal or path_length >= self._max_path_length:
                    # Episode ended: reset env/policy and roll the
                    # per-path statistics into the epoch aggregates.
                    observation = env.reset()
                    policy.reset()
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return

                    path_return = 0
                    n_episodes += 1

                else:
                    observation = next_ob
                gt.stamp('sample')

                # Train only after the pool has enough samples.
                if self.pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self.pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)

                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            # Snapshots are only persisted every 20 epochs.
            if(epoch%20==0):
                logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            # The 'eval' stamp only exists after the first epoch's
            # eval stamp below has fired.
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self.pool.size)

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        env.terminate()
        # logger.
        # NOTE(review): dumps self.reward to the snapshot dir — presumably
        # a custom experiment artifact; confirm self.reward is set.
        np.save(logger._snapshot_dir+'/reward_data.npy', self.reward)
def _train(self):
    """Return a generator that performs offline model-based (MOPO-style) RL training.

    Trains the dynamics model once up front (offline), then each epoch
    rolls out the model, trains actor/critic on model data, evaluates the
    policy, and yields a diagnostics dict. Ends with a
    ``{'done': True, ...}`` sentinel.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    # policy = self._policy
    pool = self._pool
    model_metrics = {}

    # if not self._training_started:
    self._init_training()

    # TODO: change policy to placeholder or a function
    def get_action(state, hidden, deterministic=False):
        # Thin wrapper so the sampler can call the (recurrent) policy.
        return self.get_action_meta(state, hidden, deterministic)

    def make_init_hidden(batch_size=1):
        # Initial hidden state factory for the recurrent policy.
        return self.make_init_hidden(batch_size)

    self.sampler.initialize(
        training_environment, (get_action, make_init_hidden), pool)

    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    # self._training_before_hook()

    #### model training
    print('[ MOPO ] log_dir: {} | ratio: {}'.format(
        self._log_dir, self._real_ratio))
    print(
        '[ MOPO ] Training model at epoch {} | freq {} | timestep {} (total: {})'
        .format(self._epoch, self._model_train_freq, self._timestep,
                self._total_timestep))
    # train dynamics model offline
    # If a pretrained model was loaded, only fine-tune for one epoch.
    max_epochs = 1 if self._model.model_loaded else None
    model_train_metrics = self._train_model(
        batch_size=256, max_epochs=max_epochs, holdout_ratio=0.2,
        max_t=self._max_model_t)
    model_metrics.update(model_train_metrics)
    self._log_model()
    gt.stamp('epoch_train_model')
    ####

    tester.time_step_holder.set_time(0)
    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')
        self._training_progress = Progress(
            self._epoch_length * self._n_train_repeat)

        start_samples = self.sampler._total_samples
        training_logs = {}
        for timestep in count():
            self._timestep = timestep

            # Fixed-length epochs: no environment sampling happens here,
            # so the timestep counter itself bounds the epoch.
            if (timestep >= self._epoch_length and self.ready_to_train):
                break

            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            ## model rollouts
            if timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
                self._training_progress.pause()
                self._set_rollout_length()
                self._reallocate_model_pool()
                model_rollout_metrics = self._rollout_model(
                    rollout_batch_size=self._rollout_batch_size,
                    deterministic=self._deterministic)
                model_metrics.update(model_rollout_metrics)
                gt.stamp('epoch_rollout_model')
                self._training_progress.resume()

            ## train actor and critic
            if self.ready_to_train:
                # print('[ DEBUG ]: ready to train at timestep: {}'.format(timestep))
                training_logs = self._do_training_repeats(
                    timestep=timestep)
            gt.stamp('train')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))

        # evaluate the polices
        # Evaluation uses the deterministic variant of the policy.
        evaluation_paths = self._evaluation_paths(
            (lambda _state, _hidden: get_action(_state, _hidden, True),
             make_init_hidden),
            evaluation_environment)
        gt.stamp('evaluation_paths')

        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_environment)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}

        gt.stamp('epoch_after_hook')

        sampler_diagnostics = self.sampler.get_diagnostics()

        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)

        time_diagnostics = gt.get_times().stamps.itrs

        # Flatten all metric groups into namespaced keys for logging.
        diagnostics.update(OrderedDict(
            (*(('evaluation/{}'.format(key), evaluation_metrics[key])
               for key in sorted(evaluation_metrics.keys())),
             *(('times/{}'.format(key), time_diagnostics[key][-1])
               for key in sorted(time_diagnostics.keys())),
             *(('sampler/{}'.format(key), sampler_diagnostics[key])
               for key in sorted(sampler_diagnostics.keys())),
             *(('model/{}'.format(key), model_metrics[key])
               for key in sorted(model_metrics.keys())),
             ('epoch', self._epoch),
             ('timestep', self._timestep),
             ('timesteps_total', self._total_timestep),
             ('train-steps', self._num_train_steps),
             *(('training/{}'.format(key), training_logs[key])
               for key in sorted(training_logs.keys())))))

        # Convenience aliases for headline performance metrics.
        diagnostics['perf/AverageReturn'] = diagnostics[
            'evaluation/return-average']
        diagnostics['perf/AverageLength'] = diagnostics[
            'evaluation/episode-length-avg']
        # Normalized score only when the reference range is non-degenerate.
        if not self.min_ret == self.max_ret:
            diagnostics['perf/NormalizedReturn'] = \
                (diagnostics['perf/AverageReturn'] - self.min_ret) \
                / (self.max_ret - self.min_ret)
        # diagnostics['keys/logp_pi'] = diagnostics['training/sac_pi/logp_pi']

        if self._eval_render_mode is not None and hasattr(
                evaluation_environment, 'render_rollouts'):
            training_environment.render_rollouts(evaluation_paths)

        ## ensure we did not collect any more data
        assert self._pool.size == self._init_pool_size

        # Mirror every scalar diagnostic to TensorBoard.
        for k, v in diagnostics.items():
            # print('[ DEBUG ] epoch: {} diagnostics k: {}, v: {}'.format(self._epoch, k, v))
            self._writer.add_scalar(k, v, self._epoch)

        yield diagnostics

    self.sampler.terminate()

    self._training_after_hook()

    self._training_progress.close()

    # Final sentinel so consumers know training has completed.
    yield {'done': True, **diagnostics}
def _train(self, env, policy, pool):
    """Perform RL training.

    Near-duplicate of the other `_train(self, env, policy, pool)` variant
    in this file, with an optional per-epoch `iter_callback` hook,
    unconditional snapshot saving, and `n_epochs + 1` iterations.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)

    # Runs inside the TF1 session so policy/training ops can execute.
    with self._sess.as_default():
        observation = env.reset()
        policy.reset()

        # Per-path bookkeeping for return statistics.
        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0

        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        # n_epochs + 1 so a final evaluation runs after the last
        # full training epoch.
        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            # Optional user hook with access to the local training state.
            if self.iter_callback is not None:
                self.iter_callback(locals(), globals())

            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length

                # One environment transition.
                action, _ = policy.get_action(observation)
                next_ob, reward, terminal, info = env.step(action)
                path_length += 1
                path_return += reward

                self.pool.add_sample(
                    observation,
                    action,
                    reward,
                    terminal,
                    next_ob,
                )

                if terminal or path_length >= self._max_path_length:
                    # Episode ended: reset env/policy and roll the
                    # per-path statistics into the epoch aggregates.
                    observation = env.reset()
                    policy.reset()
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return

                    path_return = 0
                    n_episodes += 1

                else:
                    observation = next_ob
                gt.stamp('sample')

                # Train only after the pool has enough samples.
                if self.pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self.pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)

                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            # The 'eval' stamp only exists after the first epoch's
            # eval stamp below has fired.
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self.pool.size)

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        env.terminate()