def update(self, ro):
    # Update input normalizer for whitening.
    if self._itr < self._n_warm_up_itrs:
        self.policy.update(xs=ro['obs_short'])

    # Mirror descent
    with timed('Update oracle'):
        if self._use_cv:
            # Split ro into two phases.
            rollouts = ro.to_list()[:int(len(ro)/2)*2]  # even length
            ro_mix = rollouts[0:][::2]  # rollouts with a random switch
            assert len(ro_mix) == len(self._t_switch) or len(ro_mix) == len(self._t_switch)-1
            # If a rollout is too short, it is treated as zero.
            ro_exp = []
            for r, t, s in zip(ro_mix, self._t_switch, self._scale):
                if len(r) >= t:
                    r = r[t-1:]
                    r.scale = s
                    ro_exp.append(r)
            ro_exp = Dataset(ro_exp)
            ro_pol = Dataset(rollouts[1:][::2])
            _, ev0, ev1 = self.oracle.update(ro_exp=ro_exp,
                                             ro_pol=ro_pol,
                                             policy=self.policy)
            # For adaptive sampling.
            self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))
        else:
            for r, s in zip(ro, self._scale):
                r.scale = s
            _, ev0, ev1 = self.oracle.update(ro_exp=ro, policy=self.policy)

    with timed('Compute policy gradient'):
        g = self.oracle.grad(self.policy)

    with timed('Policy update'):
        self.learner.update(g)
        self.policy.variable = self.learner.x

    # Log
    logz.log_tabular('stepsize', self.learner.stepsize)
    logz.log_tabular('std', np.mean(np.exp(2.*self.policy.lstd)))
    logz.log_tabular('g_norm', np.linalg.norm(g))
    logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
    logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
    if self._use_cv:
        logz.log_tabular('NumberOfExpertRollouts', len(ro_exp))
        logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))
    else:
        logz.log_tabular('NumberOfExpertRollouts', len(ro))

    # Reset
    self._reset_pi_ro()
    self._itr += 1
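# Added note (not part of the original source): the `_use_cv` branch above assumes
# the incoming `ro` interleaves two kinds of rollouts, so that `rollouts[0:][::2]`
# (even indices) are the rollouts that switch to the expert at the sampled times in
# `self._t_switch`, and `rollouts[1:][::2]` (odd indices) are plain learner rollouts
# used for the control variate. Each switched rollout is truncated to its
# post-switch segment `r[t-1:]` and reweighted by the matching entry of
# `self._scale` before being passed to the oracle as `ro_exp`.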
def __init__(self, ref_policy, name='advantage_func_app',
             max_n_rollouts=float('Inf'),  # number of samples (i.e. rollouts) to keep
             max_n_batches=0,  # number of batches (i.e. iterations) to keep
             **kwargs):
    # Replay buffer (the user should append ro).
    self.buffer = Dataset(max_n_batches=max_n_batches, max_n_samples=max_n_rollouts)
    assert isinstance(ref_policy, Policy)
    self.ref_policy = ref_policy  # reference policy
    self._ob_shape = ref_policy.x_shape
    self._ac_shape = ref_policy.y_shape
    super().__init__([self._ob_shape, self._ac_shape], (1,), name=name, **kwargs)
class SupervisedLearner(FunctionApproximator):
    """ FunctionApproximator trained on aggregated data. """

    def __init__(self, x_shape, y_shape, name='supervised_learner',
                 max_n_samples=0,  # number of samples to keep
                 max_n_batches=0,  # number of batches to keep
                 **kwargs):
        super().__init__(x_shape, y_shape, name=name, **kwargs)
        self._dataset = Dataset(max_n_samples=max_n_samples,
                                max_n_batches=max_n_batches)

    def as_funcapp(self):
        """ Return a new copy but without the dataset and update rules. """
        new = copy.copy(self)
        new._dataset = None
        new.update = None
        new.update_funcapp = None
        return new

    def update(self, xs, ys, ws=1.0, **kwargs):
        """ Update the function approximator through supervised learning.

            xs, ys, and ws are inputs, outputs, and weights.
        """
        assert len(xs.shape) > 1 and len(ys.shape) > 1
        super().update(xs, ys, ws, **kwargs)
        # Update dataset.
        ws = np.ones(xs.shape[0])*ws if type(ws) is not np.ndarray else ws
        assert xs.shape[0] == ys.shape[0] == ws.shape[0]
        self._dataset.append(Data(xs=xs, ys=ys, ws=ws))
        # Update function approximator.
        ev0 = compute_explained_variance(self(xs), ys)
        results = self.update_funcapp(**kwargs)  # return logs, if any
        ev1 = compute_explained_variance(self(xs), ys)
        return results, ev0, ev1

    @abstractmethod
    def update_funcapp(self, **kwargs):
        """ Update the function approximator based on the aggregated dataset. """
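# A minimal sketch (not part of the original code) of a concrete subclass, showing
# how `update_funcapp` is meant to consume the aggregated dataset. The linear
# least-squares model and every name introduced below are illustrative assumptions;
# the only thing borrowed from this file is the key-based Dataset indexing used
# elsewhere (e.g. `ro_pol['lps']`), which is assumed to concatenate stored fields.
class LinearLeastSquaresLearner(SupervisedLearner):

    def __init__(self, x_shape, y_shape, **kwargs):
        super().__init__(x_shape, y_shape, **kwargs)
        # Start from a zero model so the approximator can be evaluated before the
        # first fit (the parent `update` measures explained variance beforehand).
        self._coef = np.zeros((x_shape[0] + 1, y_shape[0]))

    def __call__(self, xs):
        X = np.hstack([xs, np.ones((xs.shape[0], 1))])  # append a bias column
        return X @ self._coef

    def update_funcapp(self, **kwargs):
        # Weighted least-squares fit on all aggregated (xs, ys, ws) data.
        xs, ys, ws = self._dataset['xs'], self._dataset['ys'], self._dataset['ws']
        X = np.hstack([xs, np.ones((xs.shape[0], 1))])
        w = np.sqrt(ws)[:, None]
        self._coef, *_ = np.linalg.lstsq(X * w, ys * w, rcond=None)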
def update(self, ro_exp=None, ro_pol=None, policy=None, update_nor=True, **kwargs):
    """ Need to provide either `ro_exp` or `ro_pol`, and `policy`.

        `ro_exp` is used to compute an unbiased but noisy estimate of

            E_{pi}[\nabla \pi(s,a) \hat{A}_{\pi^*}(s,a)]

        when \hat{A}_{\pi^*} given by `self._or` is unbiased.

        `ro_pol` provides a biased gradient which can be used as a control
        variate (when `ro_exp` is provided) or just to define a biased oracle.
    """
    assert (ro_exp is not None) or (ro_pol is not None)
    assert policy is not None

    # Sync policies' parameters.
    self._policy.assign(policy)  # NOTE sync BOTH variables and parameters

    # Update the oracles.
    n_rollouts = len(ro_exp) if ro_pol is None else len(ro_pol)

    self._ro_or = None
    if ro_exp is not None:
        # Compute adv.
        if len(ro_exp) > 0:
            advs, _ = self._ae.advs(ro_exp, use_is=self._use_is)
            advs = [a[0:1]*r.scale for a, r in zip(advs, ro_exp)]
            adv = np.concatenate(advs)
            if ro_pol is not None:
                # Compute the control variate.
                advs_cv, _ = self._ae.advs(ro_exp, use_is=self._use_is, lambd=0.)
                advs_cv = [a[0:1]*r.scale for a, r in zip(advs_cv, ro_exp)]
                adv -= np.concatenate(advs_cv)
            logq = np.concatenate([r.lps[0:1] for r in ro_exp])
            # Update noisy oracle.
            self._scale_or = len(adv)/n_rollouts
            self._or.update(-adv, logq, update_nor=update_nor)  # loss is negative reward
            self._ro_or = Dataset([r[0:1] for r in ro_exp])  # for defining logp

    self._ro_cv = None
    if ro_pol is not None:
        # Update biased oracle.
        advs, _ = self._ae.advs(ro_pol, use_is=self._use_is, lambd=0.)
        adv = np.concatenate(advs)
        self._scale_cv = len(adv)/n_rollouts
        logq = ro_pol['lps']
        self._cv.update(-adv, logq, update_nor=update_nor)  # loss is negative reward
        self._ro_cv = ro_pol  # for defining logp

    # Update the value function at the end, so it's unbiased.
    if ro_exp is not None:
        return self._ae.update(ro_exp, **kwargs)
    else:
        # When the biased gradient is used.
        return self._ae.update(ro_pol, **kwargs)
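# Added note (not part of the original source): when both `ro_exp` and `ro_pol`
# are supplied, the two oracles updated above form a control-variate estimator of
# the policy gradient of roughly the form
#
#     g  ~  E_exp[ grad log pi(a|s) (A_hat(s,a) - A_cv(s,a)) ]
#         + E_pol[ grad log pi(a|s) A_cv(s,a) ],
#
# where A_hat is the unbiased-but-noisy advantage estimate computed on the first
# steps of the expert-phase rollouts and A_cv is the critic-only estimate
# (lambd=0.) used as the control variate. How the two terms are scaled and
# combined is decided by `self._or`, `self._cv`, and the surrounding gradient code.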
def split(self, ro, policy_as_expert):
    # Split ro into two phases.
    rollouts = ro.to_list()
    ro_mix = [rollouts[i] for i in self._ind_ro_mix]
    ro_pol = [rollouts[i] for i in self._ind_ro_pol]
    assert (len(ro_mix) + len(ro_pol)) == len(rollouts)
    ro_exps = [[] for _ in range(len(self.experts))]
    for r, t, s, k in zipsame(ro_mix, self._t_switch, self._scale, self._k_star):
        assert len(r) >= t  # because t >= 1
        if not policy_as_expert or k < len(self.experts) - 1:
            # We assume the last expert is the learner.
            r = r[t:]
            r.weight = 1.0
            ro_exps[k].append(r)
    if policy_as_expert:
        ro_pol += ro_exps[-1]
        del ro_exps[-1]
    ro_exps = [Dataset(ro_exp) for ro_exp in ro_exps]
    ro_pol = Dataset(ro_pol)
    return ro_exps, ro_pol
def generate_rollout(pi, logp, env,
                     callback=None,
                     v_end=None,
                     t_state=None,
                     rw_shaping=None,
                     min_n_samples=None,
                     max_n_rollouts=None,
                     min_n_rollouts=0,
                     max_rollout_len=None,
                     with_animation=False):
    """ Collect rollouts until we have enough samples or rollouts.

        Each rollout is generated by repeatedly calling the behavior policy
        `pi`. At the end of the rollout, the statistics (e.g. observations,
        actions) are packaged as a Rollout object and then `logp` is called
        **once** to save the log probability of the behavior policy `pi`.

        All rollouts are COMPLETE in that they never end prematurely, even
        when `min_n_samples` is reached. They end either when `done` is true,
        or `max_rollout_len` is reached, or `pi` returns None.

        Args:
            `pi`: the behavior policy, which takes (observation, time, done)
                  and returns the action or None. If None is returned, the
                  rollout terminates. `done` here is treated as a special
                  symbol of the state.
            `logp`: either None or a function that maps (obs, acs) to log
                  probabilities (called at the end of each rollout)
            `env`: a gym-like environment
            `v_end`: the terminal value when the episode ends (a callable
                  function of observation and done)
            `t_state`: a function that maps time to desired features
            `rw_shaping`: a function that maps a reward to the new reward
            `max_rollout_len`: the maximal length of a rollout (i.e. the
                  problem's horizon)
            `min_n_samples`: the minimal number of samples to collect
            `max_n_rollouts`: the maximal number of rollouts
            `min_n_rollouts`: the minimal number of rollouts
            `with_animation`: display an animation of the first rollout
    """
    # Configs
    assert (min_n_samples is not None) or (max_n_rollouts is not None)  # so we can stop
    min_n_samples = min_n_samples or float('Inf')
    max_n_rollouts = max_n_rollouts or float('Inf')
    min_n_rollouts = min(min_n_rollouts, max_n_rollouts)
    max_rollout_len = max_rollout_len or float('Inf')
    max_episode_steps = getattr(env, '_max_episode_steps', float('Inf'))
    max_rollout_len = min(max_episode_steps, max_rollout_len)

    if v_end is None:
        def v_end(ob, dn):
            return 0.

    if rw_shaping is None:
        def rw_shaping(rw, ob, ac):
            return rw

    def post_process(x, t):
        # Augment the observation with time information, if needed.
        return x if t_state is None else np.concatenate([x.flatten(), (t_state(t), )])

    def step(ac, tm):
        ob, rw, dn, info = env.step(ac)  # current reward, next ob and dn
        return post_process(ob, tm), rw, dn, info

    def reset(tm):
        ob = env.reset()
        return post_process(ob, tm)

    # Start trajectory-wise rollouts.
    n_samples = 0
    rollouts = []
    while True:
        animate_this_rollout = len(rollouts) == 0 and with_animation
        obs, acs, rws = [], [], []
        tm = 0  # time step
        dn = False
        ob = reset(tm)
        # Each trajectory.
        while True:
            if animate_this_rollout:
                env.render()
                time.sleep(0.05)
            ac = pi(ob, tm, dn)
            if ac is None:
                # The learner decides to stop collecting data.
                dn = False
                break
            # Apply the action and get to the next state; ob, ac, rw are at tm.
            obs.append(ob)
            acs.append(ac)
            ob, rw, dn, _ = step(ac, tm)
            rw = rw_shaping(rw, ob, ac)
            rws.append(rw)
            tm += 1
            if dn or tm >= max_rollout_len:
                # Due to the step limit or entering an absorbing state.
                break
        # Save the terminal observation/reward.
        obs.append(ob)
        rws.append(v_end(ob, dn))  # terminal reward
        # End of one rollout (`logp` is called once).
        rollout = Rollout(obs=obs, acs=acs, rws=rws, done=dn, logp=logp)
        if callback is not None:
            callback(rollout)
        rollouts.append(rollout)
        n_samples += len(rollout)
        if (n_samples >= min_n_samples) or (len(rollouts) >= max_n_rollouts):
            if len(rollouts) >= min_n_rollouts:
                break
    ro = Dataset(rollouts)
    return ro
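# A minimal usage sketch (not part of the original source), assuming a classic
# gym-style environment whose `step` returns a 4-tuple, as the helper above
# expects. The environment id and sample sizes are arbitrary choices; `logp` is
# left as None since the random behavior policy below has no log-probability model.
import gym

env = gym.make('Pendulum-v1')
ro = generate_rollout(pi=lambda ob, t, done: env.action_space.sample(),
                      logp=None,
                      env=env,
                      min_n_samples=2000,    # collect at least 2000 transitions
                      max_rollout_len=200)   # cap each trajectory at 200 steps
print('collected', len(ro), 'rollouts')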