Example #1
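The first constructor builds, symbolically, the pieces a TRPO-style natural-gradient step needs: a surrogate loss that weights advantages by the importance ratio exp(logp_n - oldlogp_n), KL and entropy diagnostics, and a Fisher-vector product, each compiled into its own Theano function.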
    def __init__(self, stochpol, usercfg):
        EzPickle.__init__(self, stochpol, usercfg)
        cfg = update_default_config(self.options, usercfg)

        self.stochpol = stochpol
        self.cfg = cfg

        probtype = stochpol.probtype
        params = stochpol.trainable_variables
        EzFlat.__init__(self, params)

        ob_no = stochpol.input
        act_na = probtype.sampled_variable()
        adv_n = T.vector("adv_n")

        # Probability distribution:
        prob_np = stochpol.get_output()
        oldprob_np = probtype.prob_variable()

        logp_n = probtype.loglikelihood(act_na, prob_np)
        oldlogp_n = probtype.loglikelihood(act_na, oldprob_np)
        N = ob_no.shape[0]

        # Policy gradient:
        surr = (-1.0 / N) * T.exp(logp_n - oldlogp_n).dot(adv_n)
        pg = flatgrad(surr, params)

        # KL divergence from a stop-gradient copy of the current distribution to the
        # current distribution; differentiating it twice w.r.t. the parameters at the
        # current point gives the Fisher information matrix.
        prob_np_fixed = theano.gradient.disconnected_grad(prob_np)
        kl_firstfixed = probtype.kl(prob_np_fixed, prob_np).sum() / N
        grads = T.grad(kl_firstfixed, params)
        # Split the flat tangent vector back into tensors matching each parameter's shape.
        flat_tangent = T.fvector(name="flat_tan")
        shapes = [var.get_value(borrow=True).shape for var in params]
        start = 0
        tangents = []
        for shape in shapes:
            size = np.prod(shape)
            tangents.append(T.reshape(flat_tangent[start:start + size], shape))
            start += size
        # Inner product of the KL gradient with the tangent; its gradient w.r.t.
        # the parameters is the Fisher-vector product below.
        gvp = T.add(
            *[T.sum(g * tangent) for (g, tangent) in zipsame(grads, tangents)])  # pylint: disable=E1111
        # Fisher-vector product
        fvp = flatgrad(gvp, params)

        ent = probtype.entropy(prob_np).mean()
        kl = probtype.kl(oldprob_np, prob_np).mean()

        losses = [surr, kl, ent]
        self.loss_names = ["surr", "kl", "ent"]

        args = [ob_no, act_na, adv_n, oldprob_np]

        self.compute_policy_gradient = theano.function(args, pg, **FNOPTS)
        self.compute_losses = theano.function(args, losses, **FNOPTS)
        self.compute_fisher_vector_product = theano.function(
            [flat_tangent] + args, fvp, **FNOPTS)
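
The flat-tangent bookkeeping above (splitting one flat vector back into per-parameter tensors before forming the gradient-vector product) can be checked in isolation. Below is a minimal NumPy sketch of the same reshaping logic; the parameter shapes are invented for illustration.

import numpy as np

# Hypothetical parameter shapes, e.g. a tiny two-layer MLP.
shapes = [(4, 8), (8,), (8, 2), (2,)]
sizes = [int(np.prod(s)) for s in shapes]

# One flat tangent vector with an entry per parameter element.
flat_tangent = np.arange(sum(sizes), dtype=np.float32)

# Split it back into per-parameter tensors, mirroring the loop above.
tangents, start = [], 0
for shape, size in zip(shapes, sizes):
    tangents.append(flat_tangent[start:start + size].reshape(shape))
    start += size

assert [t.shape for t in tangents] == shapes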
Example #2
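This variant (it prints "PPOUpdater") trains with a penalized surrogate instead of a trust-region step: the KL coefficient kl_coeff is passed in at call time (it is initialized to 1.0 on the object, presumably so the surrounding training loop can adapt it toward kl_target), and cfg["reverse_kl"] flips the direction of the KL term.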
    def __init__(self, stochpol, usercfg):
        EzPickle.__init__(self, stochpol, usercfg)
        cfg = update_default_config(self.options, usercfg)
        print("PPOUpdater", cfg)

        self.stochpol = stochpol
        self.cfg = cfg
        self.kl_coeff = 1.0
        kl_cutoff = cfg["kl_target"]*2.0

        probtype = stochpol.probtype
        params = stochpol.trainable_variables
        EzFlat.__init__(self, params)

        ob_no = stochpol.input
        act_na = probtype.sampled_variable()
        adv_n = T.vector("adv_n")
        kl_coeff = T.scalar("kl_coeff")

        # Probability distribution:
        prob_np = stochpol.get_output()
        oldprob_np = probtype.prob_variable()

        p_n = probtype.likelihood(act_na, prob_np)
        oldp_n = probtype.likelihood(act_na, oldprob_np)
        N = ob_no.shape[0]

        ent = probtype.entropy(prob_np).mean()
        if cfg["reverse_kl"]:
            kl = probtype.kl(prob_np, oldprob_np).mean()
        else:
            kl = probtype.kl(oldprob_np, prob_np).mean()


        # Policy gradient:
        surr = (-1.0 / N) * (p_n / oldp_n).dot(adv_n)
        # Penalized surrogate: KL penalty scaled by kl_coeff, plus a heavily
        # weighted quadratic term once the KL exceeds kl_cutoff (2 * kl_target).
        pensurr = surr + kl_coeff*kl + 1000*(kl>kl_cutoff)*T.square(kl-kl_cutoff)
        g = flatgrad(pensurr, params)

        losses = [surr, kl, ent]
        self.loss_names = ["surr", "kl", "ent"]

        args = [ob_no, act_na, adv_n, oldprob_np]

        self.compute_lossgrad = theano.function([kl_coeff] + args, [pensurr, g], **FNOPTS)
        self.compute_losses = theano.function(args, losses, **FNOPTS)
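
For reference, the penalized objective handed to compute_lossgrad is the surrogate plus a KL penalty scaled by kl_coeff, plus a heavily weighted quadratic term that switches on once the KL exceeds kl_cutoff. A minimal NumPy sketch of that scalar combination follows; the helper name and the numbers are invented for illustration.

import numpy as np

def penalized_surrogate(surr, kl, kl_coeff, kl_target):
    # Quadratic hinge with the fixed weight of 1000 used in the code above.
    kl_cutoff = 2.0 * kl_target
    hinge = 1000.0 * float(kl > kl_cutoff) * np.square(kl - kl_cutoff)
    return surr + kl_coeff * kl + hinge

print(penalized_surrogate(surr=-0.05, kl=0.005, kl_coeff=1.0, kl_target=0.01))  # below the cutoff
print(penalized_surrogate(surr=-0.05, kl=0.050, kl_coeff=1.0, kl_target=0.01))  # hinge active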
Example #3
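Another PPO-style updater. Rather than taking the old action probabilities as an input, it keeps frozen copies of the policy parameters (old_params), rebuilds the old-policy output with theano.clone, and optimizes the penalized surrogate with Adam (adam_updates); update_old_net re-syncs the frozen copies with the current parameters.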
    def __init__(self, stochpol, usercfg):
        EzPickle.__init__(self, stochpol, usercfg)
        cfg = update_default_config(self.options, usercfg)
        print("PPOUpdater", cfg)

        self.stochpol = stochpol
        self.cfg = cfg
        self.kl_coeff = 1.0
        kl_cutoff = cfg["kl_target"]*2.0

        probtype = stochpol.probtype
        params = stochpol.trainable_variables
        # Frozen copies of the policy parameters, acting as the "old" policy.
        old_params = [theano.shared(v.get_value()) for v in stochpol.trainable_variables]

        ob_no = stochpol.input
        act_na = probtype.sampled_variable()
        adv_n = T.vector("adv_n")
        kl_coeff = T.scalar("kl_coeff")

        self.loss_names = ["surr", "kl", "ent"]

        # Probability distribution:
        prob_np = stochpol.get_output()
        # Old-policy output, obtained by substituting the frozen parameter copies into the graph.
        oldprob_np = theano.clone(stochpol.get_output(), replace=dict(zipsame(params, old_params)))
        p_n = probtype.likelihood(act_na, prob_np)
        oldp_n = probtype.likelihood(act_na, oldprob_np)
        N = ob_no.shape[0]
        ent = probtype.entropy(prob_np).mean()
        kl = probtype.kl(oldprob_np, prob_np).mean()
        # Policy gradient:
        surr = (-1.0 / N) * (p_n / oldp_n).dot(adv_n)
        train_losses = [surr, kl, ent]

        # Training
        args = [ob_no, act_na, adv_n]
        surr, kl = train_losses[:2]
        # Penalized surrogate, as in the previous example, with the cutoff weight taken from the config.
        pensurr = surr + kl_coeff*kl + cfg["kl_cutoff_coeff"]*(kl>kl_cutoff)*T.square(kl-kl_cutoff)
        self.train = theano.function([kl_coeff] + args, train_losses,
            updates=stochpol.get_updates()
            + list(adam_updates(pensurr, params, learning_rate=cfg["stepsize"]).items()), **FNOPTS)

        self.test = theano.function(args, train_losses, **FNOPTS)  # XXX
        # Copy the current parameters into the frozen old-policy copies.
        self.update_old_net = theano.function([], [], updates=list(zip(old_params, params)))
Example #4
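An agent constructor: it builds an MLP policy and baseline, observation and reward filters, and a PpoSgdUpdater for the policy.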
    def __init__(self, ob_space, ac_space, usercfg):
        cfg = update_default_config(self.options, usercfg)
        policy, self.baseline = make_mlps(ob_space, ac_space, cfg)
        obfilter, rewfilter = make_filters(cfg, ob_space)
        self.updater = PpoSgdUpdater(policy, cfg)
        AgentWithPolicy.__init__(self, policy, obfilter, rewfilter)
Example #5
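An agent constructor for a deterministic MLP policy: it builds the same filters and switches stochastic action selection off.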
    def __init__(self, ob_space, ac_space, usercfg):
        cfg = update_default_config(self.options, usercfg)
        policy = make_deterministic_mlp(ob_space, ac_space, cfg)
        obfilter, rewfilter = make_filters(cfg, ob_space)
        AgentWithPolicy.__init__(self, policy, obfilter, rewfilter)
        self.set_stochastic(False)