def _update(self, env_ro, gen_env_ro):
    # gen_env_ro is just used for computing gradient std.
    assert gen_env_ro is not None

    # Set _ro of self._or.
    with timed('Update Oracle'):
        # self.set_ro(env_ro)
        self._or.update(env_ro, update_nor=True, to_log=True, itr=self._itr)

    with timed('Compute Grad'):
        grads = self._or.compute_grad(ret_comps=True)
        grad = grads[0]
        # Record gradients across iterations (used for gradient-std logging).
        if self.gradients is None:
            self.gradients = grad
        else:
            self.gradients = np.concatenate([self.gradients, grad], axis=0)
        names = ['g', 'mc_g', 'ac_os', 'tau_os', 'dr_grad_os']
        for g, name in zip(grads, names):
            logz.log_tabular('norm_{}'.format(name), la.norm(g))
        self.accum_ac += grads[2]
        self.accum_tau += grads[3]
        self.accum_func += grads[4]
        logz.log_tabular('norm_accum_ac_os', la.norm(self.accum_ac / self.gradients.shape[0]))
        logz.log_tabular('norm_accum_tau_os', la.norm(self.accum_tau / self.gradients.shape[0]))
        logz.log_tabular('norm_accum_func_os', la.norm(self.accum_func / self.gradients.shape[0]))

    self._itr += 1
    logz.log_tabular('std', np.mean(self._policy.std))
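# Hedged sketch (an assumption, not the repo's API): if the gradients recorded above are
# stacked one flattened gradient per row, the gradient std that the gen_env_ro comment
# refers to could be summarized like this.
def _gradient_std_sketch(gradients):
    """gradients: array of shape (n_itrs, dim); returns the norm of the per-coordinate std."""
    import numpy as np
    import numpy.linalg as la
    return la.norm(np.std(gradients, axis=0))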
def run_alg(self, n_itrs, save_policy=True, save_policy_fun=None, save_freq=3,
            save_value_fun=None, save_sim_fun=None,
            pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
    start_time = time.time()
    if pretrain:  # algorithm-specific
        if rollout_kwargs is None:
            gr = self._gen_ro_raw
        elif (rollout_kwargs['max_n_rollouts'] is None and
              rollout_kwargs['min_n_samples'] is None):
            gr = self._gen_ro_raw
        else:
            gr = functools.partial(generate_rollout, env=self._env, **rollout_kwargs)
        self._alg.pretrain(gr, **other_pretrain_kwargs)

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        with timed('Generate env rollouts'):
            ro = self.gen_ro(to_log=True)
        # algorithm-specific
        if save_policy and isinstance(save_freq, int) and itr % save_freq == 0:
            mean_val = logz.get_val_from_LOG('MeanSumOfRewards')
            prefix = 'iter_{}_eval_'.format(itr) + '%.0f' % mean_val
            save_policy_fun(prefix + '_pi')
            save_value_fun(prefix + '_vfn')
            save_sim_fun(prefix + '_sim')
        self._alg.update(ro, gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log

    # Save the final policy.
    if save_policy:
        save_policy_fun('final')
        cprint('Final policy has been saved.')
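# Hedged usage sketch (not from the repo): `exp` stands for whatever object exposes the
# run_alg method above, and the save callables are hypothetical placeholders; only the
# keyword arguments mirror run_alg's signature.
def _run_alg_usage_sketch(exp):
    def save_policy_fun(name):
        print('would save policy checkpoint:', name)

    def save_value_fun(name):
        print('would save value-function checkpoint:', name)

    def save_sim_fun(name):
        print('would save simulator checkpoint:', name)

    exp.run_alg(n_itrs=100,
                save_policy=True,
                save_policy_fun=save_policy_fun,
                save_freq=3,
                save_value_fun=save_value_fun,
                save_sim_fun=save_sim_fun,
                pretrain=True,
                rollout_kwargs=None)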
def pretrain(self, gen_ro):
    with timed('Pretraining'):
        # Implement necessary pretraining procedures here.
        if isinstance(self._or, Or.tfPolicyGradient):
            self._ro = gen_ro(self.pi, logp=self.logp)
            self._or.update_ae(self._ro)
        # take a prediction step first
        if self._take_first_pred and self._w_pred:
            self._prediction()
def _correction(self):
    # single first-order update
    with timed('Update oracle'):
        self._or.update(self._ro, update_nor=True)
        if callable(getattr(self._or, 'update_ae', None)):
            self._or.update_ae(self._ro, to_log=True)

    with timed('Compute policy gradient'):
        g = self._or.compute_grad()
        self._g = g

    if self._w_corr:
        if self._update_rule in ['dyna', 'model_free']:
            self._pcl.clear_g_hat()  # make sure hat_g is None
        with timed('Take piccolo correction step'):
            kwargs = {}
            if isinstance(self._pcl, rlPiccoloFisher):
                kwargs['ro'] = self._ro
            self._pcl.update(g, 'correct', **kwargs)
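# Hedged, simplified sketch of the two-phase interface used by self._pcl above
# (update(g, 'correct') / update(g_hat, 'predict')). This is an illustrative plain
# gradient-descent stand-in for a predictor-corrector learner, NOT the repo's
# rlPiccolo implementation; the class name and attributes are hypothetical.
class _PredictorCorrectorSketch:

    def __init__(self, x, stepsize):
        self.x = x                  # parameters (float or numpy array)
        self.stepsize = stepsize
        self._g_hat = None          # last predicted gradient

    def clear_g_hat(self):
        self._g_hat = None

    def update(self, g, mode, **kwargs):
        if mode == 'predict':
            # Step with the model-predicted gradient and remember it.
            self._g_hat = g
            self.x = self.x - self.stepsize * g
        elif mode == 'correct':
            # Once the true gradient arrives, correct by the prediction error.
            g_hat = 0.0 if self._g_hat is None else self._g_hat
            self.x = self.x - self.stepsize * (g - g_hat)
            self._g_hat = None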
def pretrain(self, gen_ro, n_vf_updates=1, n_dyn_updates=1, n_rw_updates=1,
             update_pol_nor=True, **kwargs):
    with timed('Pretraining'):
        ro = gen_ro(self.pi, logp=self.logp)
        if update_pol_nor:
            self._policy.prepare_for_update(ro.obs)  # update nor of policy
            ro = gen_ro(self.pi, logp=self.logp)
        for _ in range(n_vf_updates):
            self._or.update_ae(ro)
        for _ in range(n_dyn_updates):
            self._or.update_dyn(ro)
        for _ in range(n_rw_updates):
            self._or.update_rw(ro)
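# Hedged sketch of the gen_ro interface this pretrain assumes: a callable taking
# (pi, logp=...) and returning a rollout batch exposing .obs. The names and shapes below
# are placeholders, not the repo's rollout classes; a real gen_ro would roll out `pi`
# in the environment instead of returning zeros.
def _gen_ro_interface_sketch(pi, logp=None):
    import numpy as np
    from collections import namedtuple
    RolloutBatch = namedtuple('RolloutBatch', ['obs', 'acs', 'rws'])
    obs = np.zeros((10, 4))   # fake observations
    acs = np.zeros((10, 2))   # fake actions
    rws = np.zeros(10)        # fake rewards
    return RolloutBatch(obs=obs, acs=acs, rws=rws)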
def _update(self, env_ro, gen_env_ro):
    # gen_env_ro is just used for computing gradient std.
    assert gen_env_ro is not None

    # XXX If using simulation to train vf, vf should be updated after policy nor is updated.
    if self.gen_sim_ro is not None:
        with timed('Generate sim data'):
            sim_ro = self.gen_sim_ro()
        with timed('Update ae'):
            self._or.update_ae(sim_ro, to_log=True)  # update value function

    if self.log_sigmas_freq is not None and self._itr % self.log_sigmas_freq == 0:
        with timed('Compute Sigmas'):
            self._or.log_sigmas(**self.log_sigmas_kwargs)

    with timed('Update Oracle'):
        self._or.update(env_ro, update_nor=True, to_log=True, itr=self._itr)

    with timed('Compute Grad'):
        grads = self._or.compute_grad(ret_comps=True)
        grad = grads[0]
        names = ['g', 'mc_g', 'ac_os', 'tau_os']
        for g, name in zip(grads, names):
            logz.log_tabular('norm_{}'.format(name), la.norm(g))

    with timed('Take Gradient Step'):
        self._learner.update(grad, self._or.ro)  # take the grad with the env_ro

    if self.gen_sim_ro is None:
        with timed('Update ae'):
            self._or.update_ae(env_ro, to_log=True)  # update value function

    # Always update dynamics using true data.
    with timed('Update dyn'):
        self._or.update_dyn(env_ro, to_log=True)  # update dynamics
    with timed('Update rw'):
        self._or.update_rw(env_ro, to_log=True)

    self._itr += 1
    logz.log_tabular('online_learner_stepsize', self._learner.stepsize)
    logz.log_tabular('std', np.mean(self._policy.std))
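# Hedged sketch of the oracle interface the update routine above relies on. This abstract
# outline only collects the method names actually called there (update, compute_grad,
# update_ae, update_dyn, update_rw) and makes no claim about their real signatures beyond
# the keywords used above; the class name is hypothetical.
import abc

class _OracleInterfaceSketch(abc.ABC):

    @abc.abstractmethod
    def update(self, ro, update_nor=True, to_log=False, itr=None):
        """Refresh the first-order oracle with environment rollouts."""

    @abc.abstractmethod
    def compute_grad(self, ret_comps=False):
        """Return the policy gradient (optionally with its components)."""

    @abc.abstractmethod
    def update_ae(self, ro, to_log=False):
        """Update the value function (advantage estimator)."""

    @abc.abstractmethod
    def update_dyn(self, ro, to_log=False):
        """Update the learned dynamics model."""

    @abc.abstractmethod
    def update_rw(self, ro, to_log=False):
        """Update the learned reward model."""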
def run_alg(self, n_itrs, pretrain=True, save_policy=False, save_freq=100, final_eval=False):
    start_time = time.time()
    if pretrain:  # algorithm-specific
        self._alg.pretrain(functools.partial(self.gen_ro, to_log=False))

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        with timed('Generate env rollouts'):
            ro = self.gen_ro(self._alg.pi_ro, logp=self._alg.logp, to_log=True)
        self._alg.update(ro)  # algorithm-specific
        logz.dump_tabular()  # dump log
def run_alg(self, n_itrs, save_policy=None, save_policy_fun=None, save_freq=None,
            pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
    start_time = time.time()
    if pretrain:  # algorithm-specific
        if rollout_kwargs is None:
            gr = self._gen_ro_raw
        elif (rollout_kwargs['max_n_rollouts'] is None and
              rollout_kwargs['min_n_samples'] is None):
            gr = self._gen_ro_raw
        else:
            gr = functools.partial(generate_rollout, env=self._env, **rollout_kwargs)
        self._alg.pretrain(gr, **other_pretrain_kwargs)

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        with timed('Generate env rollouts'):
            ro = self.gen_ro(to_log=True)
        self._alg.update(ro, gen_env_ro=self._gen_ro)  # algorithm-specific
        logz.dump_tabular()  # dump log
        if save_policy and isinstance(save_freq, int) and itr % save_freq == 0:
            save_policy_fun('{}'.format(itr))

    # Save the final policy.
    if save_policy:
        save_policy_fun('final')
        cprint('Final policy has been saved.')
def time_batch_env(envid, n_envs):
    seed = 0
    n_ro = 5000
    e = envs.create_env(envid, seed)

    def pi(obs):
        ac = e.action_space.sample()
        ac = [ac for _ in range(len(obs))]
        return ac

    # env = envs.create_batch_env(envid, seed, 1, use_ext_proc=False)
    # roller = Roller(env, min_n_samples=None, max_n_rollouts=n_ro, max_rollout_len=None)
    # with timed('1 env generate {} ros'.format(n_ro)):
    #     roller.gen_ro(pi=pi, logp=None)

    # env = envs.create_batch_env(envid, seed, n_envs, use_ext_proc=True)
    # roller = Roller(env, min_n_samples=None, max_n_rollouts=n_ro, max_rollout_len=None)
    # with timed('{} envs parallel generating {} ros'.format(n_envs, n_ro)):
    #     roller.gen_ro(pi=pi, logp=None)

    e = envs.create_batch_env(envid, seed, 1, use_ext_proc=False)
    with timed(''):
        generate_rollout(lambda ob: e.action_space.sample(), None, e,
                         min_n_samples=None, max_n_rollouts=n_ro, max_rollout_len=None)
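# Hedged usage sketch: 'Hopper-v2' is only an example environment id, not a claim about
# which environments this repo registers; any id accepted by envs.create_env should work.
if __name__ == '__main__':
    for n in (1, 4, 8):
        time_batch_env('Hopper-v2', n_envs=n)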
def _prediction(self):
    # (multi-step) update using model information
    with timed('Update model oracle'):
        # flags
        shift_adv = self._shift_adv and isinstance(self._pcl, PiccoloOpt)
        # whether to update pol_nor and ae in the model update
        update_ae_and_nor = self._pre_w_adap or self._update_in_pred
        # mimic the oracle update
        kwargs = {'update_nor': True, 'to_log': True}
        if isinstance(self._mor, Or.SimulationOracle):
            kwargs['update_ae'] = update_ae_and_nor
            kwargs['update_pol_nor'] = update_ae_and_nor
        elif (isinstance(self._mor, Or.LazyOracle) or
              isinstance(self._mor, Or.AggregatedOracle) or
              isinstance(self._mor, Or.AdversarialOracle)):
            kwargs['shift_adv'] = shift_adv
        elif isinstance(self._mor, Or.DummyOracle):
            kwargs['g'] = self._g
        else:
            raise NotImplementedError('Model oracle update is not implemented.')
        self._mor.update(ro=self._ro, **kwargs)

    with timed('Compute model gradient'):
        g_hat = self._mor.compute_grad()

    with timed('Take piccolo prediction step'):
        kwargs = {}
        if isinstance(self._pcl, rlPiccoloFisher):
            kwargs['ro'] = self._mor.ro
        if isinstance(self._pcl, PiccoloOpt):
            # need to define the optimization problem
            kwargs['grad_hat'] = self._mor.compute_grad
            kwargs['loss_hat'] = self._mor.compute_loss
            kwargs['warm_start'] = self._warm_start
            kwargs['stop_std_grad'] = self._stop_std_grad

            if isinstance(self._mor, Or.SimulationOracle):
                def callback():
                    with timed('Update model oracle (callback)'):
                        self._mor.update(update_nor=True,
                                         update_ae=update_ae_and_nor,
                                         update_pol_nor=update_ae_and_nor)
                        method = getattr(self._pcl, 'method', None)
                        if isinstance(method, rlPiccoloFisher):
                            method.assign(self._policy)  # sync the normalizer
                            method.ro = self._mor.ro
                        if isinstance(self._pcl, rlPiccoloFisher):
                            self._pcl._reg_swp.update(self._mor.ro.obs)
                kwargs['callback'] = callback

        # adapt for 'dyna' and 'model-based'
        self._pcl.update(g_hat, 'predict', adapt=self._pre_w_adap, **kwargs)