def update(self, ros, agents):
    # Aggregate data
    ro = self.merge(ros)

    # Update input normalizer for whitening
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    with timed('Update oracle'):
        _, ev0, ev1 = self.oracle.update(ro, self.policy)

    with timed('Compute policy gradient'):
        g = self.oracle.grad(self.policy.variable)

    with timed('Policy update'):
        if isinstance(self.learner, ol.FisherOnlineOptimizer):
            if self._optimizer == 'trpo_wl':  # use also the loss function
                self.learner.update(g, ro=ro, policy=self.policy,
                                    loss_fun=self.oracle.fun)
            else:
                self.learner.update(g, ro=ro, policy=self.policy)
        else:
            self.learner.update(g)
        self.policy.variable = self.learner.x

    # log
    logz.log_tabular('stepsize', self.learner.stepsize)
    if hasattr(self.policy, 'lstd'):
        logz.log_tabular('std', np.mean(np.exp(self.policy.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
    logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
    self._itr += 1
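# Sketch (not part of the original source): `timed(...)` is used throughout as
# a context manager that reports how long a named stage took. A minimal
# stand-in, assuming it only needs to measure and print wall-clock time (the
# project's real utility may also write to its logger), could look like this:
from contextlib import contextmanager
import time

@contextmanager
def timed(name):
    start = time.perf_counter()
    try:
        yield
    finally:
        print('%s took %.3f s' % (name, time.perf_counter() - start))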
def update(self, ro):
    # Update input normalizer for whitening
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    # Mirror descent
    with timed('Update oracle'):
        if self._use_cv:
            # Split ro into two phases
            rollouts = ro.to_list()[:int(len(ro) / 2) * 2]  # even length
            ro_mix = rollouts[0:][::2]  # ro with random switch
            assert len(ro_mix) == len(self._t_switch) or \
                   len(ro_mix) == len(self._t_switch) - 1
            # If a rollout is too short, it is treated as zero.
            ro_exp = []
            for r, t, s in zip(ro_mix, self._t_switch, self._scale):
                if len(r) >= t:
                    r = r[t-1:]
                    r.scale = s
                    ro_exp.append(r)
            ro_exp = Dataset(ro_exp)
            ro_pol = Dataset(rollouts[1:][::2])
            _, ev0, ev1 = self.oracle.update(ro_exp=ro_exp, ro_pol=ro_pol,
                                             policy=self.policy)
            # for adaptive sampling
            self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))
        else:
            for r, s in zip(ro, self._scale):
                r.scale = s
            _, ev0, ev1 = self.oracle.update(ro_exp=ro, policy=self.policy)

    with timed('Compute policy gradient'):
        g = self.oracle.grad(self.policy)

    with timed('Policy update'):
        self.learner.update(g)
        self.policy.variable = self.learner.x

    # log
    logz.log_tabular('stepsize', self.learner.stepsize)
    logz.log_tabular('std', np.mean(np.exp(2. * self.policy.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
    logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
    if self._use_cv:
        logz.log_tabular('NumberOfExpertRollouts', len(ro_exp))
        logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))
    else:
        logz.log_tabular('NumberOfExpertRollouts', len(ro))

    # reset
    self._reset_pi_ro()
    self._itr += 1
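# Illustration (hypothetical values, not from the source): how the
# control-variate branch above splits rollouts into interleaved expert-switch
# and learner batches. With plain integers standing in for rollouts:
rollouts = [0, 1, 2, 3, 4, 5, 6]
rollouts = rollouts[:int(len(rollouts) / 2) * 2]  # truncate to even length
ro_mix = rollouts[0:][::2]  # [0, 2, 4]: rollouts with a random expert switch
ro_pol = rollouts[1:][::2]  # [1, 3, 5]: learner-only rollouts
assert len(ro_mix) == len(ro_pol)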
def pretrain(self, gen_ro):
    with timed('Pretraining'):
        for _ in range(self._n_pretrain_itrs):
            ros, _ = gen_ro(self.agent('behavior'))
            ro = self.merge(ros)
            self.oracle.update(ro, self.distribution)  # dist.
            self.policy.update(xs=ro['obs_short'])
def _eval_policy(self):
    with timed('Evaluate policy performance'):
        self.gen_ro(self.alg.agent('target'),
                    mdp=self.mdp_test,
                    ro_kwargs=self.ro_kwargs_test,
                    initialize=True, to_log=True, eval_mode=True)
def pretrain(self, gen_ro):
    with timed('Pretraining'):
        for _ in range(self._n_pretrain_itrs):
            for k, expert in enumerate(self.experts):
                ros, _ = gen_ro(PolicyAgent(expert))
                ro = self.merge(ros)
                self.aes[k].update(ro)
                self.policy.update(ro['obs_short'])
def pretrain(self, gen_ro):
    pi_exp = lambda ob, t, done: self.expert(ob)
    with timed('Pretraining'):
        for _ in range(self._n_pretrain_itrs):
            ro = gen_ro(pi_exp, logp=self.expert.logp)
            self.oracle.update(ro_pol=ro, policy=self.policy, update_nor=False)
            self.policy.update(ro['obs_short'])
    self._reset_pi_ro()
def pretrain(self, gen_ro):
    with timed('Pretraining'):
        # Implement necessary pretraining procedures here.
        if isinstance(self._or, Or.tfPolicyGradient):
            self._ro = gen_ro(self.pi, logp=self.logp)
            self._or.update_ae(self._ro)
        # take a prediction step first
        if self._take_first_pred and self._w_pred:
            self._prediction()
def _correction(self):
    # single first-order update
    with timed('Update oracle'):
        self._or.update(self._ro, update_nor=True)
        # guard with a default: the oracle may not define an AE updater
        if callable(getattr(self._or, 'update_ae', None)):
            self._or.update_ae(self._ro, to_log=True)

    with timed('Compute policy gradient'):
        g = self._or.compute_grad()
        self._g = g

    if self._w_corr:
        if self._update_rule in ['dyna', 'model_free']:
            self._pcl.clear_g_hat()  # make sure hat_g is None
        with timed('Take piccolo correction step'):
            kwargs = {}
            if isinstance(self._pcl, rlPiccoloFisher):
                kwargs['ro'] = self._ro
            self._pcl.update(g, 'correct', **kwargs)
def update(self, ros, agents):
    # Aggregate data
    ro = self.merge(ros)

    # Update input normalizer for whitening
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    # Below we update `distribution`, where the variables are hosted.
    with timed('Update oracle'):
        _, err0, err1 = self.oracle.update(ro, self.distribution)  # dist.

    with timed('Compute policy gradient'):
        g = self.oracle.grad(self.distribution.variable)  # dist.

    with timed('Policy update'):
        if isinstance(self.learner, ol.FisherOnlineOptimizer):
            if self._optimizer == 'trpo_wl':  # use also the loss function
                self.learner.update(g, ro=ro, policy=self.distribution,
                                    loss_fun=self.oracle.fun)  # dist.
            else:
                self.learner.update(g, ro=ro, policy=self.distribution)  # dist.
        else:
            self.learner.update(g)
        self.distribution.variable = self.learner.x  # dist.

    # log
    logz.log_tabular('stepsize', self.learner.stepsize)
    if hasattr(self.distribution, 'lstd'):
        logz.log_tabular('std', np.mean(np.exp(self.distribution.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    logz.log_tabular('NrmseBefore(AE)', err0)
    logz.log_tabular('NrmseAfter(AE)', err1)
    self._itr += 1
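# Sketch (assumed definitions, for reading the logged metrics above): explained
# variance and NRMSE are standard regression diagnostics for the advantage /
# value-function estimator (AE); the oracle is assumed to report them before
# and after its fit.
import numpy as np

def explained_variance(y_true, y_pred):
    # 1 - Var(residual) / Var(target); 1.0 is a perfect fit,
    # 0.0 is no better than predicting the mean.
    return 1.0 - np.var(y_true - y_pred) / (np.var(y_true) + 1e-12)

def nrmse(y_true, y_pred):
    # Root-mean-square error normalized by the target's standard deviation.
    return np.sqrt(np.mean((y_true - y_pred) ** 2)) / (np.std(y_true) + 1e-12)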
def run(self, n_itrs, pretrain=True, seed=None,
        save_freq=None, eval_freq=None,
        final_eval=False, final_save=True):
    eval_policy = eval_freq is not None
    save_policy = save_freq is not None

    if seed is not None:
        set_randomseed(seed)
        self.mdp.env.seed(seed)

    start_time = time.time()

    if pretrain:
        self.alg.pretrain(functools.partial(self.gen_ro, to_log=False))

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)

        if eval_policy and itr % eval_freq == 0:
            self._eval_policy()

        with timed('Generate env rollouts'):
            ros, agents = self.gen_ro(self.alg.agent('behavior'),
                                      to_log=not eval_policy)
        self.alg.update(ros, agents)

        if save_policy and itr % save_freq == 0:
            self._save_policy(self.alg.policy, itr)

        # dump log
        logz.dump_tabular()

    # Save the final policy.
    if final_eval:
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr + 1)
        self._eval_policy()
        logz.dump_tabular()
    if final_save:
        self._save_policy(self.alg.policy, n_itrs)
        self._save_policy(self.best_policy, 'best')
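# Hypothetical usage sketch (the constructor name and arguments below are
# placeholders, not the project's actual API): once an experimenter object
# holding `alg`, `mdp`, and `gen_ro` exists, the loop above would be driven by
# something like
#
#   exp = Experimenter(alg, mdp, ro_kwargs)   # hypothetical constructor
#   exp.run(n_itrs=100, seed=0, save_freq=10, eval_freq=5, final_eval=True)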
def update(self, ros, agents):  # agents are behavior policies
    # Aggregate data
    data = [a.split(ro, self.policy_as_expert) for ro, a in zip(ros, agents)]
    ro_exps = [d[0] for d in data]
    # transpose, s.t. len(ro_exps)==len(self.experts)
    ro_exps = list(map(list, zip(*ro_exps)))
    ro_exps = [self.merge(ros) for ros in ro_exps]
    ro_pol = [d[1] for d in data]
    ro_pol = self.merge(ro_pol)
    # merged here so `ro` is also available to the policy update below
    ro = self.merge(ros)

    # Update input normalizer for whitening
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    with timed('Update oracle'):
        # Update the value functions of the experts
        EV0, EV1 = [], []
        for k, ro_exp in enumerate(ro_exps):
            if len(ro_exp) > 0:
                _, ev0, ev1 = self.aes[k].update(ro_exp)
                EV0.append(ev0)
                EV1.append(ev1)
        # Update oracle
        self.oracle.update(ro_pol, update_vfn=False, policy=self.policy)
        # Update the value function of the learner
        # (after the oracle update, so the oracle stays unbiased)
        if self.policy_as_expert:
            _, ev0, ev1 = self.aes[-1].update(ro_pol)
        # For adaptive sampling
        self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))

    with timed('Compute gradient'):
        g = self.oracle.grad(self.policy.variable)

    with timed('Policy update'):
        if isinstance(self.learner, ol.FisherOnlineOptimizer):
            if self._optimizer == 'trpo_wl':  # use also the loss function
                self.learner.update(g, ro=ro, policy=self.policy,
                                    loss_fun=self.oracle.fun)
            else:
                self.learner.update(g, ro=ro, policy=self.policy)
        else:
            self.learner.update(g)
        self.policy.variable = self.learner.x

    # Log
    logz.log_tabular('stepsize', self.learner.stepsize)
    logz.log_tabular('std', np.mean(np.exp(2. * self.policy.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    if self.policy_as_expert:
        logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
        logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
    logz.log_tabular('MeanExplainVarianceBefore(AE)', np.mean(EV0))
    logz.log_tabular('MeanExplainVarianceAfter(AE)', np.mean(EV1))
    logz.log_tabular('NumberOfExpertRollouts', np.sum([len(r) for r in ro_exps]))
    logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))

    # Reset
    self._itr += 1
def _prediction(self):
    # (multi-step) update using model information
    with timed('Update model oracle'):
        # flags
        shift_adv = self._shift_adv and isinstance(self._pcl, PiccoloOpt)
        # whether to update pol_nor and ae in the model update
        update_ae_and_nor = self._pre_w_adap or self._update_in_pred

        # mimic the oracle update
        kwargs = {'update_nor': True, 'to_log': True}
        if isinstance(self._mor, Or.SimulationOracle):
            kwargs['update_ae'] = update_ae_and_nor
            kwargs['update_pol_nor'] = update_ae_and_nor
        elif isinstance(self._mor, (Or.LazyOracle, Or.AggregatedOracle,
                                    Or.AdversarialOracle)):
            kwargs['shift_adv'] = shift_adv
        elif isinstance(self._mor, Or.DummyOracle):
            kwargs['g'] = self._g
        else:
            raise NotImplementedError('Model oracle update is not implemented.')
        self._mor.update(ro=self._ro, **kwargs)

    with timed('Compute model gradient'):
        g_hat = self._mor.compute_grad()

    with timed('Take piccolo prediction step'):
        kwargs = {}
        if isinstance(self._pcl, rlPiccoloFisher):
            kwargs['ro'] = self._mor.ro
        if isinstance(self._pcl, PiccoloOpt):
            # need to define the optimization problem
            kwargs['grad_hat'] = self._mor.compute_grad
            kwargs['loss_hat'] = self._mor.compute_loss
            kwargs['warm_start'] = self._warm_start
            kwargs['stop_std_grad'] = self._stop_std_grad
            if isinstance(self._mor, Or.SimulationOracle):
                def callback():
                    with timed('Update model oracle (callback)'):
                        self._mor.update(update_nor=True,
                                         update_ae=update_ae_and_nor,
                                         update_pol_nor=update_ae_and_nor)
                        method = getattr(self._pcl, 'method', None)
                        if isinstance(method, rlPiccoloFisher):
                            method.assign(self._policy)  # sync the normalizer
                            method.ro = self._mor.ro
                        if isinstance(self._pcl, rlPiccoloFisher):
                            self._pcl._reg_swp.update(self._mor.ro.obs)
                kwargs['callback'] = callback
        # adapt for 'dyna' and 'model-based'
        self._pcl.update(g_hat, 'predict', adapt=self._pre_w_adap, **kwargs)
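# Conceptual sketch (not the library's optimizer): `_prediction` and
# `_correction` above are the two halves of a predictor-corrector
# (PICCOLO-style) round. With a plain gradient-descent base learner, one round
# would look roughly like this, where g_hat comes from the model oracle and g
# from real rollouts collected at the predicted point:
def piccolo_round(x, g_hat_fn, g_fn, eta=0.01):
    g_hat = g_hat_fn(x)
    x_hat = x - eta * g_hat           # prediction step (model gradient)
    g = g_fn(x_hat)                   # real gradient observed at x_hat
    return x_hat - eta * (g - g_hat)  # correction step (gradient error)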