Example #1
    def update(self, ros, agents):
        # Aggregate data
        ro = self.merge(ros)

        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            self.policy.update(xs=ro['obs_short'])

        with timed('Update oracle'):
            _, ev0, ev1 = self.oracle.update(ro, self.policy)

        with timed('Compute policy gradient'):
            g = self.oracle.grad(self.policy.variable)

        with timed('Policy update'):
            if isinstance(self.learner, ol.FisherOnlineOptimizer):
                if self._optimizer == 'trpo_wl':  # use also the loss function
                    self.learner.update(g, ro=ro, policy=self.policy, loss_fun=self.oracle.fun)
                else:
                    self.learner.update(g, ro=ro, policy=self.policy)
            else:
                self.learner.update(g)
            self.policy.variable = self.learner.x

        # log
        logz.log_tabular('stepsize', self.learner.stepsize)
        if hasattr(self.policy, 'lstd'):
            logz.log_tabular('std', np.mean(np.exp(self.policy.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
        logz.log_tabular('ExplainVarianceAfter(AE)', ev1)

        self._itr += 1
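
All of the examples on this page wrap expensive stages in a timed(...) context manager imported from the project's utilities. Its definition is not shown here; the following is only a minimal sketch, assuming timed is a simple elapsed-time logger (not the project's actual implementation):

import time
from contextlib import contextmanager

@contextmanager
def timed(message):
    # Hypothetical stand-in for the timed() helper used above: announce the
    # stage, run the wrapped block, then report the elapsed wall-clock time.
    print(message + '...')
    start = time.time()
    try:
        yield
    finally:
        print('{} took {:.3f}s'.format(message, time.time() - start))

Used as with timed('Update oracle'): ..., exactly as in the snippets on this page.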
Example #2
    def update(self, ro):
        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            self.policy.update(xs=ro['obs_short'])

        # Mirror descent
        with timed('Update oracle'):

            if self._use_cv:
                # Split ro into two phases
                rollouts = ro.to_list()[:(len(ro) // 2) * 2]  # even length
                ro_mix = rollouts[0:][::2]  # ro with random switch
                assert len(ro_mix) == len(self._t_switch) or len(ro_mix) == len(self._t_switch) - 1
                # if a rollout is too short, it is treated as zero
                ro_exp = []
                for r, t, s in zip(ro_mix, self._t_switch, self._scale):
                    if len(r) >= t:
                        r = r[t - 1:]
                        r.scale = s
                        ro_exp.append(r)
                ro_exp = Dataset(ro_exp)
                ro_pol = Dataset(rollouts[1:][::2])
                _, ev0, ev1 = self.oracle.update(ro_exp=ro_exp,
                                                 ro_pol=ro_pol,
                                                 policy=self.policy)
                # for adaptive sampling
                self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))
            else:
                for r, s in zip(ro, self._scale):
                    r.scale = s
                _, ev0, ev1 = self.oracle.update(ro_exp=ro,
                                                 policy=self.policy)

        with timed('Compute policy gradient'):
            g = self.oracle.grad(self.policy)

        with timed('Policy update'):
            self.learner.update(g)
            self.policy.variable = self.learner.x

        # log
        logz.log_tabular('stepsize', self.learner.stepsize)
        logz.log_tabular('std', np.mean(np.exp(2. * self.policy.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
        logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
        if self._use_cv:
            logz.log_tabular('NumberOfExpertRollouts', len(ro_exp))
            logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))
        else:
            logz.log_tabular('NumberOfExpertRollouts', len(ro))


        # reset
        self._reset_pi_ro()
        self._itr += 1
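
The control-variate branch above splits the collected rollouts into two interleaved halves with rollouts[0:][::2] and rollouts[1:][::2]. A small self-contained illustration of that slicing pattern, using a plain Python list in place of the rollout Dataset (names here are made up for the illustration):

# Even/odd interleaved split, as used for ro_mix and ro_pol above.
rollouts = ['r0', 'r1', 'r2', 'r3', 'r4']
rollouts = rollouts[:(len(rollouts) // 2) * 2]  # keep an even number of rollouts
ro_mix = rollouts[0:][::2]  # even positions -> ['r0', 'r2']
ro_pol = rollouts[1:][::2]  # odd positions  -> ['r1', 'r3']
assert len(ro_mix) == len(ro_pol)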
Example #3
    def pretrain(self, gen_ro):
        with timed('Pretraining'):
            for _ in range(self._n_pretrain_itrs):
                ros, _ = gen_ro(self.agent('behavior'))
                ro = self.merge(ros)
                self.oracle.update(ro, self.distribution)  # dist.
                self.policy.update(xs=ro['obs_short'])
Example #4
    def _eval_policy(self):
        with timed('Evaluate policy performance'):
            self.gen_ro(self.alg.agent('target'),
                        mdp=self.mdp_test,
                        ro_kwargs=self.ro_kwargs_test,
                        initialize=True,
                        to_log=True,
                        eval_mode=True)
Example #5
File: opg.py Project: gtrll/librl
    def pretrain(self, gen_ro):
        with timed('Pretraining'):
            for _ in range(self._n_pretrain_itrs):
                for k, expert in enumerate(self.experts):
                    ros, _ = gen_ro(PolicyAgent(expert))
                    ro = self.merge(ros)
                    self.aes[k].update(ro)
                    self.policy.update(ro['obs_short'])
Example #6
    def pretrain(self, gen_ro):
        pi_exp = lambda ob, t, done: self.expert(ob)
        with timed('Pretraining'):
            for _ in range(self._n_pretrain_itrs):
                ro = gen_ro(pi_exp, logp=self.expert.logp)
                self.oracle.update(ro_pol=ro, policy=self.policy, update_nor=False)
                self.policy.update(ro['obs_short'])

        self._reset_pi_ro()
Example #7
    def pretrain(self, gen_ro):
        with timed('Pretraining'):
            # Implement necessary pretraining procedures here.
            if isinstance(self._or, Or.tfPolicyGradient):
                self._ro = gen_ro(self.pi, logp=self.logp)
                self._or.update_ae(self._ro)

            # take a prediction step first
            if self._take_first_pred and self._w_pred:
                self._prediction()
Example #8
    def _correction(self):
        # single first-order update
        with timed('Update oracle'):
            self._or.update(self._ro, update_nor=True)
            if callable(getattr(self._or, 'update_ae', None)):
                self._or.update_ae(self._ro, to_log=True)

        with timed('Compute policy gradient'):
            g = self._or.compute_grad()
            self._g = g

        if self._w_corr:
            if self._update_rule in ['dyna', 'model_free']:
                self._pcl.clear_g_hat()  # make sure hat_g is None
            with timed('Take piccolo correction step'):
                kwargs = {}
                if isinstance(self._pcl, rlPiccoloFisher):
                    kwargs['ro'] = self._ro
                self._pcl.update(g, 'correct', **kwargs)
Example #9
                    def callback():
                        with timed('Update model oracle (callback)'):
                            self._mor.update(update_nor=True,
                                             update_ae=update_ae_and_nor,
                                             update_pol_nor=update_ae_and_nor)

                            method = getattr(self._pcl, 'method', None)
                            if isinstance(method, rlPiccoloFisher):
                                method.assign(self._policy)  # sync the normalizer
                                method.ro = self._mor.ro
                            if isinstance(self._pcl, rlPiccoloFisher):
                                self._pcl._reg_swp.update(self._mor.ro.obs)
Example #10
    def update(self, ros, agents):
        # Aggregate data
        ro = self.merge(ros)

        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            self.policy.update(xs=ro['obs_short'])

        # Below we update `distribution` where the variables are hosted.
        with timed('Update oracle'):
            _, err0, err1 = self.oracle.update(ro, self.distribution)  # dist.

        with timed('Compute policy gradient'):
            g = self.oracle.grad(self.distribution.variable)  # dist.

        with timed('Policy update'):
            if isinstance(self.learner, ol.FisherOnlineOptimizer):
                if self._optimizer == 'trpo_wl':  # use also the loss function
                    self.learner.update(g,
                                        ro=ro,
                                        policy=self.distribution,
                                        loss_fun=self.oracle.fun)  # dist.
                else:
                    self.learner.update(g, ro=ro,
                                        policy=self.distribution)  # dist.
            else:
                self.learner.update(g)
            self.distribution.variable = self.learner.x  # dist.

        # log
        logz.log_tabular('stepsize', self.learner.stepsize)
        if hasattr(self.distribution, 'lstd'):
            logz.log_tabular('std', np.mean(np.exp(self.distribution.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        logz.log_tabular('NrmseBefore(AE)', err0)
        logz.log_tabular('NrmseAfter(AE)', err1)

        self._itr += 1
Example #11
    def run(self,
            n_itrs,
            pretrain=True,
            seed=None,
            save_freq=None,
            eval_freq=None,
            final_eval=False,
            final_save=True):

        eval_policy = eval_freq is not None
        save_policy = save_freq is not None

        if seed is not None:
            set_randomseed(seed)
            self.mdp.env.seed(seed)

        start_time = time.time()
        if pretrain:
            self.alg.pretrain(functools.partial(self.gen_ro, to_log=False))

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)

            if eval_policy and itr % eval_freq == 0:
                self._eval_policy()

            with timed('Generate env rollouts'):
                ros, agents = self.gen_ro(self.alg.agent('behavior'),
                                          to_log=not eval_policy)
            self.alg.update(ros, agents)

            if save_policy and itr % save_freq == 0:
                self._save_policy(self.alg.policy, itr)
            # dump log
            logz.dump_tabular()

        # Final evaluation and save of the policy.
        if final_eval:
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr + 1)
            self._eval_policy()
            logz.dump_tabular()

        if final_save:
            self._save_policy(self.alg.policy, n_itrs)
            self._save_policy(self.best_policy, 'best')
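
Note how run hands the rollout generator to pretrain via functools.partial(self.gen_ro, to_log=False), so the algorithm can later call it without knowing about the logging flag. A minimal standalone illustration of that binding pattern (gen_ro here is a made-up stand-in, not the project's method):

import functools

def gen_ro(agent, to_log=True):
    # Stand-in for a rollout generator: pretend to collect data for an agent.
    print('generating rollouts for', agent, 'with to_log =', to_log)
    return ['rollout'], [agent]

# Bind the keyword argument once; callers then only pass the agent.
quiet_gen_ro = functools.partial(gen_ro, to_log=False)
ros, agents = quiet_gen_ro('behavior')  # same as gen_ro('behavior', to_log=False)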
Example #12
    def update(self, ros, agents):  # agents are behavior policies
        # Aggregate data
        data = [
            a.split(ro, self.policy_as_expert) for ro, a in zip(ros, agents)
        ]
        ro_exps = [d[0] for d in data]
        ro_exps = list(map(list, zip(*ro_exps)))  # transpose, so that len(ro_exps) == len(self.experts)
        ro_exps = [self.merge(ros) for ros in ro_exps]
        ro_pol = [d[1] for d in data]
        ro_pol = self.merge(ro_pol)

        # Aggregate all rollouts (also used later by the Fisher-based learner update)
        ro = self.merge(ros)

        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            self.policy.update(xs=ro['obs_short'])

        with timed('Update oracle'):
            # Update the value function of the experts
            EV0, EV1 = [], []
            for k, ro_exp in enumerate(ro_exps):
                if len(ro_exp) > 0:
                    _, ev0, ev1 = self.aes[k].update(ro_exp)
                    EV0.append(ev0)
                    EV1.append(ev1)
            # Update oracle
            self.oracle.update(ro_pol, update_vfn=False, policy=self.policy)

            # Update the value function of the learner (after the oracle update so it is unbiased)
            if self.policy_as_expert:
                _, ev0, ev1 = self.aes[-1].update(ro_pol)

            # For adaptive sampling
            self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))

        with timed('Compute gradient'):
            g = self.oracle.grad(self.policy.variable)

        with timed('Policy update'):
            if isinstance(self.learner, ol.FisherOnlineOptimizer):
                if self._optimizer == 'trpo_wl':  # use also the loss function
                    self.learner.update(g,
                                        ro=ro,
                                        policy=self.policy,
                                        loss_fun=self.oracle.fun)
                else:
                    self.learner.update(g, ro=ro, policy=self.policy)
            else:
                self.learner.update(g)
            self.policy.variable = self.learner.x

        # Log
        logz.log_tabular('stepsize', self.learner.stepsize)
        logz.log_tabular('std', np.mean(np.exp(2. * self.policy.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        if self.policy_as_expert:
            logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
            logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
        logz.log_tabular('MeanExplainVarianceBefore(AE)', np.mean(EV0))
        logz.log_tabular('MeanExplainVarianceAfter(AE)', np.mean(EV1))
        logz.log_tabular('NumberOfExpertRollouts',
                         np.sum([len(ro) for ro in ro_exps]))
        logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))

        # Reset
        self._itr += 1
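
The list(map(list, zip(*ro_exps))) line above transposes a per-rollout list of per-expert segments into a per-expert list of segments. The same idiom on plain nested lists:

# Transpose a list of lists with zip(*...), as done for ro_exps above.
per_rollout = [['e0_a', 'e1_a'],   # expert segments from rollout a
               ['e0_b', 'e1_b']]   # expert segments from rollout b
per_expert = list(map(list, zip(*per_rollout)))
assert per_expert == [['e0_a', 'e0_b'], ['e1_a', 'e1_b']]  # one entry per expert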
Example #13
    def _prediction(self):
        # (multi-step) update using model information

        with timed('Update model oracle'):
            # flags
            shift_adv = self._shift_adv and isinstance(self._pcl, PiccoloOpt)
            # whether to update pol_nor and ae in the model update
            update_ae_and_nor = self._pre_w_adap or self._update_in_pred

            # mimic the oracle update
            kwargs = {'update_nor': True, 'to_log': True}
            if isinstance(self._mor, Or.SimulationOracle):
                kwargs['update_ae'] = update_ae_and_nor
                kwargs['update_pol_nor'] = update_ae_and_nor
            elif isinstance(self._mor, (Or.LazyOracle,
                                        Or.AggregatedOracle,
                                        Or.AdversarialOracle)):
                kwargs['shift_adv'] = shift_adv
            elif isinstance(self._mor, Or.DummyOracle):
                kwargs['g'] = self._g
            else:
                raise NotImplementedError(
                    'Model oracle update is not implemented.')
            self._mor.update(ro=self._ro, **kwargs)

        with timed('Compute model gradient'):
            g_hat = self._mor.compute_grad()

        with timed('Take piccolo prediction step'):
            kwargs = {}
            if isinstance(self._pcl, rlPiccoloFisher):
                kwargs['ro'] = self._mor.ro

            if isinstance(self._pcl, PiccoloOpt):
                # need to define the optimization problem
                kwargs['grad_hat'] = self._mor.compute_grad
                kwargs['loss_hat'] = self._mor.compute_loss
                kwargs['warm_start'] = self._warm_start
                kwargs['stop_std_grad'] = self._stop_std_grad
                if isinstance(self._mor, Or.SimulationOracle):

                    def callback():
                        with timed('Update model oracle (callback)'):
                            self._mor.update(update_nor=True,
                                             update_ae=update_ae_and_nor,
                                             update_pol_nor=update_ae_and_nor)

                            method = getattr(self._pcl, 'method', None)
                            if isinstance(method, rlPiccoloFisher):
                                method.assign(self._policy)  # sync the normalizer
                                method.ro = self._mor.ro
                            if isinstance(self._pcl, rlPiccoloFisher):
                                self._pcl._reg_swp.update(self._mor.ro.obs)

                    kwargs['callback'] = callback

            # adapt for 'dyna' and 'model-based'
            self._pcl.update(g_hat,
                             'predict',
                             adapt=self._pre_w_adap,
                             **kwargs)