def _update_func_approx(self, x, y, w, to_log=False, log_prefix=''):
    """ Update the function approximator based on the current data (x, y, w)
        or through self._agg_data, which is kept up-to-date with (x, y, w).
    """
    # Loss and explained variance before the update, evaluated on the
    # current sample (x, y, w) only.
    loss_before = self._compute_loss(x, y, w)
    explained_variance_before = math_utils.compute_explained_variance(
        self.predict(x), y)
    # Optimization: the update itself uses the aggregated data.
    self.prepare_for_update(x)
    x_agg, y_agg, w_agg = (self._agg_data['x'], self._agg_data['y'],
                           self._agg_data['w'])
    lr = self._update_with_lr_search(x_agg, y_agg, w_agg)
    # Loss and explained variance after the update.
    loss_after = self._compute_loss(x, y, w)
    explained_variance_after = math_utils.compute_explained_variance(
        self.predict(x), y)
    if to_log:
        logz.log_tabular(
            'LossBefore({}){}'.format(self.name, log_prefix), loss_before)
        logz.log_tabular(
            'LossAfter({}){}'.format(self.name, log_prefix), loss_after)
        logz.log_tabular(
            'ExplainedVarianceBefore({}){}'.format(self.name, log_prefix),
            explained_variance_before)
        logz.log_tabular(
            'ExplainedVarianceAfter({}){}'.format(self.name, log_prefix),
            explained_variance_after)
        logz.log_tabular(
            'UsedLearningRate({}){}'.format(self.name, log_prefix), lr)

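# The explained-variance diagnostics above follow the standard definition
# 1 - Var(y - y_hat) / Var(y). The helper below is only an illustrative
# sketch of that formula; whether math_utils.compute_explained_variance is
# implemented exactly this way is an assumption, not confirmed by this code.
import numpy as np

def _explained_variance_sketch(y_hat, y):
    """Return 1 - Var(y - y_hat) / Var(y); 1.0 is a perfect fit, and values
    <= 0 mean the predictor is no better than predicting the mean of y."""
    var_y = np.var(y)
    if var_y == 0:
        return np.nan  # undefined when the targets are constant
    return 1.0 - np.var(y - y_hat) / var_y
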
def update(self, ros, agents):
    # Aggregate data
    ro = self.merge(ros)
    # Update input normalizer for whitening
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    with timed('Update oracle'):
        _, ev0, ev1 = self.oracle.update(ro, self.policy)

    with timed('Compute policy gradient'):
        g = self.oracle.grad(self.policy.variable)

    with timed('Policy update'):
        if isinstance(self.learner, ol.FisherOnlineOptimizer):
            if self._optimizer == 'trpo_wl':  # use also the loss function
                self.learner.update(g, ro=ro, policy=self.policy,
                                    loss_fun=self.oracle.fun)
            else:
                self.learner.update(g, ro=ro, policy=self.policy)
        else:
            self.learner.update(g)
        self.policy.variable = self.learner.x

    # log
    logz.log_tabular('stepsize', self.learner.stepsize)
    if hasattr(self.policy, 'lstd'):
        logz.log_tabular('std', np.mean(np.exp(self.policy.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
    logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
    self._itr += 1

def update(self, ro, update_nor=False, shift_adv=False, to_log=False,
           log_prefix=''):
    """
    Args:
        ro: RO object representing the new information.
        update_nor: whether to update the control variate of
            tfLikelihoodRatioOracle.
        shift_adv: whether to force the adv values to be positive. If a
            float, it specifies the amount to shift.
    """
    self._ro = ro  # save the ref to rollouts
    # Compute adv.
    advs, vfns = self._ae.advs(ro)  # the adv. estimator has its own ref_policy
    adv = np.concatenate(advs)
    if shift_adv:  # make adv non-negative
        assert self._use_log_loss
        if shift_adv is True:
            adv = adv - np.min(adv)
        else:
            adv = adv - np.mean(adv) + shift_adv
        self._nor.reset()  # defined in tfLikelihoodRatioOracle
        update_nor = False

    if not self._normalize_weighting:
        if self._avg_type == 'sum':  # rescale the problem if needed
            adv *= len(adv) / len(ro)

    # Update the loss function.
    if self._use_log_loss is True:
        # - E_{ob} E_{ac ~ q | ob} [ w * log p(ac|ob) * adv(ob, ac) ]
        if self._onestep_weighting:  # consider importance weight
            w_or_logq = np.concatenate(
                self._ae.weights(ro, policy=self.policy))  # helper function
        else:
            w_or_logq = np.ones_like(adv)
    else:  # False or None
        # - E_{ob} E_{ac ~ q | ob} [ p(ac|ob)/q(ac|ob) * adv(ob, ac) ]
        assert self._onestep_weighting
        w_or_logq = ro.lps

    if to_log:
        vfn = np.concatenate(vfns)
        logz.log_tabular('max_adv', np.amax(np.abs(adv)))
        logz.log_tabular('max_vfn', np.amax(np.abs(vfn)))

    # Update the tfLikelihoodRatioOracle.
    super().update(-adv, w_or_logq, [ro.obs, ro.acs],
                   update_nor)  # loss is negative reward

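# Illustrative sketch only (not part of the oracle's API): the two comments
# above correspond to two standard surrogate objectives. With per-sample
# weights w, behavior log-probabilities logq, target log-probabilities logp,
# and advantages adv, they can be written roughly as below; the exact tensor
# implementation inside tfLikelihoodRatioOracle is assumed, not shown here.
import numpy as np

def _surrogate_loss_sketch(logp, logq, adv, w=None, use_log_loss=True):
    if use_log_loss:
        # - E[ w * log p(ac|ob) * adv(ob, ac) ]
        w = np.ones_like(adv) if w is None else w
        return -np.mean(w * logp * adv)
    # - E[ p(ac|ob)/q(ac|ob) * adv(ob, ac) ]  (importance-ratio form)
    return -np.mean(np.exp(logp - logq) * adv)
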
def run(self, n_itrs, pretrain=True, seed=None, save_freq=None,
        eval_freq=None, final_eval=False, final_save=True):
    eval_policy = eval_freq is not None
    save_policy = save_freq is not None

    if seed is not None:
        set_randomseed(seed)
        self.mdp.env.seed(seed)

    start_time = time.time()
    if pretrain:
        self.alg.pretrain(functools.partial(self.gen_ro, to_log=False))

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        if eval_policy:
            if itr % eval_freq == 0:
                self._eval_policy()
        with timed('Generate env rollouts'):
            ros, agents = self.gen_ro(self.alg.agent('behavior'),
                                      to_log=not eval_policy)
        self.alg.update(ros, agents)
        if save_policy:
            if itr % save_freq == 0:
                self._save_policy(self.alg.policy, itr)
        # dump log
        logz.dump_tabular()

    # Final evaluation and saving of the policy.
    if final_eval:
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr + 1)
        self._eval_policy()
        logz.dump_tabular()
    if final_save:
        self._save_policy(self.alg.policy, n_itrs)
        self._save_policy(self.best_policy, 'best')

def update(self, ros, agents):
    # Aggregate data
    ro = self.merge(ros)
    # Update input normalizer for whitening
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    # Below we update `distribution`, where the variables are hosted.
    with timed('Update oracle'):
        _, err0, err1 = self.oracle.update(ro, self.distribution)  # dist.

    with timed('Compute policy gradient'):
        g = self.oracle.grad(self.distribution.variable)  # dist.

    with timed('Policy update'):
        if isinstance(self.learner, ol.FisherOnlineOptimizer):
            if self._optimizer == 'trpo_wl':  # use also the loss function
                self.learner.update(g, ro=ro, policy=self.distribution,
                                    loss_fun=self.oracle.fun)  # dist.
            else:
                self.learner.update(g, ro=ro,
                                    policy=self.distribution)  # dist.
        else:
            self.learner.update(g)
        self.distribution.variable = self.learner.x  # dist.

    # log
    logz.log_tabular('stepsize', self.learner.stepsize)
    if hasattr(self.distribution, 'lstd'):
        logz.log_tabular('std', np.mean(np.exp(self.distribution.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    logz.log_tabular('NrmseBefore(AE)', err0)
    logz.log_tabular('NrmseAfter(AE)', err1)
    self._itr += 1

def update(self, ros, agents):  # agents are behavior policies
    # Aggregate data
    data = [a.split(ro, self.policy_as_expert) for ro, a in zip(ros, agents)]
    ro_exps = [d[0] for d in data]
    # transpose, s.t. len(ro_exps) == len(self.experts)
    ro_exps = list(map(list, zip(*ro_exps)))
    ro_exps = [self.merge(rs) for rs in ro_exps]
    ro_pol = [d[1] for d in data]
    ro_pol = self.merge(ro_pol)
    ro = self.merge(ros)  # also needed below when the learner requires rollouts
    # Update input normalizer for whitening
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    with timed('Update oracle'):
        # Update the value functions of the experts
        EV0, EV1 = [], []
        for k, ro_exp in enumerate(ro_exps):
            if len(ro_exp) > 0:
                _, ev0, ev1 = self.aes[k].update(ro_exp)
                EV0.append(ev0)
                EV1.append(ev1)
        # Update oracle
        self.oracle.update(ro_pol, update_vfn=False, policy=self.policy)
        # Update the value function of the learner (after the oracle update,
        # so the oracle stays unbiased)
        if self.policy_as_expert:
            _, ev0, ev1 = self.aes[-1].update(ro_pol)
        # For adaptive sampling
        self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))

    with timed('Compute gradient'):
        g = self.oracle.grad(self.policy.variable)

    with timed('Policy update'):
        if isinstance(self.learner, ol.FisherOnlineOptimizer):
            if self._optimizer == 'trpo_wl':  # use also the loss function
                self.learner.update(g, ro=ro, policy=self.policy,
                                    loss_fun=self.oracle.fun)
            else:
                self.learner.update(g, ro=ro, policy=self.policy)
        else:
            self.learner.update(g)
        self.policy.variable = self.learner.x

    # Log
    logz.log_tabular('stepsize', self.learner.stepsize)
    logz.log_tabular('std', np.mean(np.exp(2. * self.policy.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    if self.policy_as_expert:
        logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
        logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
    logz.log_tabular('MeanExplainVarianceBefore(AE)', np.mean(EV0))
    logz.log_tabular('MeanExplainVarianceAfter(AE)', np.mean(EV1))
    logz.log_tabular('NumberOfExpertRollouts',
                     np.sum([len(r) for r in ro_exps]))
    logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))

    # Reset
    self._itr += 1

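# Illustrative sketch of the regrouping above (plain lists, not the Dataset
# objects used in this codebase): each agent's split yields one list of
# expert rollouts per expert, and zip(*) transposes "per-agent lists of
# per-expert data" into "per-expert lists". The strings are made-up labels.
per_agent = [['a0_e0', 'a0_e1'], ['a1_e0', 'a1_e1']]   # 2 agents, 2 experts
per_expert = list(map(list, zip(*per_agent)))
# -> [['a0_e0', 'a1_e0'], ['a0_e1', 'a1_e1']]  (len == number of experts)
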
def gen_ro(self, agent, mdp=None, ro_kwargs=None, initialize=False,
           prefix='', to_log=False, eval_mode=False):
    """ Run the agent in the mdp and return rollout statistics as a Dataset
        and the agent that collects it.

        mdp and ro_kwargs can each be either a single instance or a list.
    """
    ro_kwargs = ro_kwargs or self.ro_kwargs
    mdp = mdp or self.mdp

    # Make mdp, ro_kwargs into lists
    if not isinstance(mdp, list):
        mdp = [mdp]
    if not isinstance(ro_kwargs, list):
        ro_kwargs = [ro_kwargs]
    if len(mdp) > 1 and len(ro_kwargs) == 1:
        ro_kwargs *= len(mdp)
    assert len(mdp) == len(ro_kwargs)

    # Run the agent and log statistics
    ros_all, agents_all = [], []
    avg_performance = 0.
    for i, (m, kw) in enumerate(zip(mdp, ro_kwargs)):
        if initialize:  # so that deterministic behaviors can be realized
            m.initialize()
        ros, agents = m.run(agent, **kw)
        ros_all.extend(ros)
        agents_all.extend(agents)

        # Log
        ro = functools.reduce(lambda x, y: x + y, ros)
        if not eval_mode:
            self._n_rollouts += len(ro)
            self._n_samples += ro.n_samples
        if to_log:
            if len(mdp) > 1:
                prefix = 'MDP' + str(i) + '_'
            # statistics of the current ro
            gamma = m.gamma
            sum_of_rewards = [
                ((gamma**np.arange(len(r.rws))) * r.rws).sum() for r in ro]
            performance = np.mean(sum_of_rewards)
            if gamma < 1.:
                avg_of_rewards = [(1 - gamma) * sr for sr in sum_of_rewards]
            else:
                avg_of_rewards = [
                    sr / len(r) for sr, r in zip(sum_of_rewards, ro)]
            performance_avg = np.mean(avg_of_rewards)
            rollout_lens = [len(rollout) for rollout in ro]
            n_samples = sum(rollout_lens)
            logz.log_tabular(prefix + "NumSamples", n_samples)
            logz.log_tabular(prefix + "NumberOfRollouts", len(ro))
            logz.log_tabular(prefix + "MeanAvgOfRewards", performance_avg)
            logz.log_tabular(prefix + "MeanSumOfRewards", performance)
            logz.log_tabular(prefix + "StdSumOfRewards", np.std(sum_of_rewards))
            logz.log_tabular(prefix + "MaxSumOfRewards", np.max(sum_of_rewards))
            logz.log_tabular(prefix + "MinSumOfRewards", np.min(sum_of_rewards))
            logz.log_tabular(prefix + "MeanRolloutLens", np.mean(rollout_lens))
            logz.log_tabular(prefix + "StdRolloutLens", np.std(rollout_lens))
            avg_performance += performance / len(mdp)

    if to_log:  # total statistics across all mdps
        if avg_performance >= self.best_performance:
            self.best_policy = copy.deepcopy(self.alg.policy)
            self.best_performance = avg_performance
        logz.log_tabular(prefix + 'TotalNumberOfSamples', self._n_samples)
        logz.log_tabular(prefix + 'TotalNumberOfRollouts', self._n_rollouts)
        logz.log_tabular(prefix + 'BestSumOfRewards', self.best_performance)

    return ros_all, agents_all

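# Illustrative sketch (not part of the runner's API): for a single rollout
# with rewards r_0, ..., r_{T-1} and discount gamma, the two performance
# numbers logged above are the discounted return sum_t gamma^t r_t
# ("SumOfRewards") and its normalization, (1 - gamma) * return for gamma < 1
# or return / T otherwise ("AvgOfRewards"). The array below is made-up data.
import numpy as np

rws = np.array([1.0, 0.5, 0.25])  # hypothetical per-step rewards
gamma = 0.9
sum_of_rewards = ((gamma**np.arange(len(rws))) * rws).sum()
avg_of_rewards = ((1 - gamma) * sum_of_rewards if gamma < 1.
                  else sum_of_rewards / len(rws))
print(sum_of_rewards, avg_of_rewards)  # 1.6525 0.16525
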
def update(self, ro):
    self._ro = ro
    if not self._ignore_samples:
        # update input normalizer for whitening
        self._policy.prepare_for_update(self._ro.obs)
        # Correction Step (Model-free)
        self._correction()

    # end of round
    self._itr += 1

    # log
    logz.log_tabular('pcl_stepsize', self._pcl.stepsize)
    logz.log_tabular('std', np.mean(self._policy.std))
    if not self._ignore_samples:
        logz.log_tabular('true_grads_size', np.linalg.norm(self._g))
        logz.log_tabular('pred_grads_size', np.linalg.norm(self._pcl.g_hat))
        pred_error_size = np.linalg.norm(self._g - self._pcl.g_hat)
        ratio = pred_error_size / np.linalg.norm(self._g)
        logz.log_tabular('pred_error_size', pred_error_size)
        logz.log_tabular('pred_error_true_ratio', ratio)

    # Prediction Step (Model-based)
    if self._w_pred:
        self._prediction()

    # log
    logz.log_tabular('std_after', np.mean(self._policy.std))

def update(self, ro):
    # Update input normalizer for whitening
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    # Mirror descent
    with timed('Update oracle'):
        if self._use_cv:
            # Split ro into two phases
            rollouts = ro.to_list()[:int(len(ro) / 2) * 2]  # even number of rollouts
            ro_mix = rollouts[0:][::2]  # rollouts with a random switch
            assert (len(ro_mix) == len(self._t_switch)
                    or len(ro_mix) == len(self._t_switch) - 1)
            # If a rollout is too short, it is treated as zero.
            ro_exp = []
            for r, t, s in zip(ro_mix, self._t_switch, self._scale):
                if len(r) >= t:
                    r = r[t - 1:]
                    r.scale = s
                    ro_exp.append(r)
            ro_exp = Dataset(ro_exp)
            ro_pol = Dataset(rollouts[1:][::2])
            _, ev0, ev1 = self.oracle.update(ro_exp=ro_exp, ro_pol=ro_pol,
                                             policy=self.policy)
            # for adaptive sampling
            self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))
        else:
            for r, s in zip(ro, self._scale):
                r.scale = s
            _, ev0, ev1 = self.oracle.update(ro_exp=ro, policy=self.policy)

    with timed('Compute policy gradient'):
        g = self.oracle.grad(self.policy)

    with timed('Policy update'):
        self.learner.update(g)
        self.policy.variable = self.learner.x

    # log
    logz.log_tabular('stepsize', self.learner.stepsize)
    logz.log_tabular('std', np.mean(np.exp(2. * self.policy.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
    logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
    if self._use_cv:
        logz.log_tabular('NumberOfExpertRollouts', len(ro_exp))
        logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))
    else:
        logz.log_tabular('NumberOfExpertRollouts', len(ro))

    # reset
    self._reset_pi_ro()
    self._itr += 1

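# Illustrative sketch of the even/odd split above (indices only, not the
# Dataset/rollout types used in this codebase). Even-indexed rollouts were
# generated with a random switch time t and keep only the tail r[t-1:]
# (scaled by s); odd-indexed rollouts are used whole for the policy term.
rollouts = list(range(7))[:int(7 / 2) * 2]   # -> [0, 1, 2, 3, 4, 5]
ro_mix = rollouts[0:][::2]                   # -> [0, 2, 4]  (switched rollouts)
ro_pol = rollouts[1:][::2]                   # -> [1, 3, 5]  (learner rollouts)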