def __init__(self, stochpol, usercfg):
    # Natural-gradient (TRPO-style) updater: compiles the policy gradient,
    # the surrogate/KL/entropy losses, and a Fisher-vector product.
    EzPickle.__init__(self, stochpol, usercfg)
    cfg = update_default_config(self.options, usercfg)
    self.stochpol = stochpol
    self.cfg = cfg
    probtype = stochpol.probtype
    params = stochpol.trainable_variables
    EzFlat.__init__(self, params)

    ob_no = stochpol.input
    act_na = probtype.sampled_variable()
    adv_n = T.vector("adv_n")

    # Probability distributions of the current and the data-collecting policy:
    prob_np = stochpol.get_output()
    oldprob_np = probtype.prob_variable()

    logp_n = probtype.loglikelihood(act_na, prob_np)
    oldlogp_n = probtype.loglikelihood(act_na, oldprob_np)
    N = ob_no.shape[0]

    # Policy gradient of the importance-sampled surrogate loss
    # -E[(pi/pi_old) * advantage]:
    surr = (-1.0 / N) * T.exp(logp_n - oldlogp_n).dot(adv_n)
    pg = flatgrad(surr, params)

    # KL with the first argument held constant, so differentiating twice
    # yields the Fisher-vector product via double backprop:
    prob_np_fixed = theano.gradient.disconnected_grad(prob_np)
    kl_firstfixed = probtype.kl(prob_np_fixed, prob_np).sum() / N
    grads = T.grad(kl_firstfixed, params)

    # Unflatten the tangent vector into per-parameter tensors:
    flat_tangent = T.fvector(name="flat_tan")
    shapes = [var.get_value(borrow=True).shape for var in params]
    start = 0
    tangents = []
    for shape in shapes:
        size = np.prod(shape)
        tangents.append(T.reshape(flat_tangent[start:start + size], shape))
        start += size
    gvp = T.add(*[T.sum(g * tangent) for (g, tangent) in zipsame(grads, tangents)])  # pylint: disable=E1111
    # Fisher-vector product: gradient of (grad KL . tangent)
    fvp = flatgrad(gvp, params)

    ent = probtype.entropy(prob_np).mean()
    kl = probtype.kl(oldprob_np, prob_np).mean()

    losses = [surr, kl, ent]
    self.loss_names = ["surr", "kl", "ent"]

    args = [ob_no, act_na, adv_n, oldprob_np]
    self.compute_policy_gradient = theano.function(args, pg, **FNOPTS)
    self.compute_losses = theano.function(args, losses, **FNOPTS)
    self.compute_fisher_vector_product = theano.function([flat_tangent] + args, fvp, **FNOPTS)
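# --- Hedged usage sketch (not part of the original file) --------------------
# The compiled `compute_fisher_vector_product` is typically consumed by a
# conjugate-gradient solver, which solves F x = -g using only matrix-vector
# products, so the Fisher matrix is never materialized. Below is a standard
# CG routine; the damping term, iteration count, and call pattern are
# illustrative assumptions, not values taken from this constructor.
def conjugate_gradient(fvp, b, cg_iters=10, residual_tol=1e-10):
    """Solve F x = b, where fvp(p) returns the product F.dot(p)."""
    p = b.copy()            # search direction
    r = b.copy()            # residual b - F.dot(x), with x starting at zero
    x = np.zeros_like(b)
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = fvp(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        newrdotr = r.dot(r)
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

# Assumed call pattern (`updater`, `args`, and `damping` are hypothetical):
#   g = updater.compute_policy_gradient(*args)
#   stepdir = conjugate_gradient(
#       lambda p: updater.compute_fisher_vector_product(p, *args) + damping * p,
#       -g)
# -----------------------------------------------------------------------------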
def __init__(self, stochpol, usercfg):
    # PPO updater with an adaptive KL penalty; the penalized surrogate and its
    # gradient are exposed via compute_lossgrad for an external optimizer.
    EzPickle.__init__(self, stochpol, usercfg)
    cfg = update_default_config(self.options, usercfg)
    print("PPOUpdater", cfg)
    self.stochpol = stochpol
    self.cfg = cfg
    self.kl_coeff = 1.0  # penalty coefficient, adapted between batches
    kl_cutoff = cfg["kl_target"] * 2.0

    probtype = stochpol.probtype
    params = stochpol.trainable_variables
    EzFlat.__init__(self, params)

    ob_no = stochpol.input
    act_na = probtype.sampled_variable()
    adv_n = T.vector("adv_n")
    kl_coeff = T.scalar("kl_coeff")  # symbolic input; fed self.kl_coeff at call time

    # Probability distributions of the current and the data-collecting policy:
    prob_np = stochpol.get_output()
    oldprob_np = probtype.prob_variable()

    p_n = probtype.likelihood(act_na, prob_np)
    oldp_n = probtype.likelihood(act_na, oldprob_np)
    N = ob_no.shape[0]

    ent = probtype.entropy(prob_np).mean()
    if cfg["reverse_kl"]:
        kl = probtype.kl(prob_np, oldprob_np).mean()
    else:
        kl = probtype.kl(oldprob_np, prob_np).mean()

    # Policy gradient of the importance-sampled surrogate, plus the KL penalty
    # and a steep quadratic barrier once KL exceeds twice the target:
    surr = (-1.0 / N) * (p_n / oldp_n).dot(adv_n)
    pensurr = surr + kl_coeff * kl + 1000 * (kl > kl_cutoff) * T.square(kl - kl_cutoff)
    g = flatgrad(pensurr, params)

    losses = [surr, kl, ent]
    self.loss_names = ["surr", "kl", "ent"]

    args = [ob_no, act_na, adv_n, oldprob_np]
    self.compute_lossgrad = theano.function([kl_coeff] + args, [pensurr, g], **FNOPTS)
    self.compute_losses = theano.function(args, losses, **FNOPTS)
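# --- Hedged usage sketch (not part of the original file) --------------------
# The self.kl_coeff attribute initialized above implies the penalty
# coefficient is re-tuned between batches toward cfg["kl_target"]. A common
# adaptation rule is sketched below; the 1.3/0.7 thresholds and the factor of
# 2 are illustrative assumptions, not values read from this constructor.
def adapt_kl_coeff(self, kl):
    if kl > 1.3 * self.cfg["kl_target"]:
        self.kl_coeff *= 2.0   # policy moved too far: penalize harder
    elif kl < 0.7 * self.cfg["kl_target"]:
        self.kl_coeff /= 2.0   # policy barely moved: relax the penalty
# -----------------------------------------------------------------------------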
def __init__(self, stochpol, usercfg):
    # PPO updater that runs Adam inside a compiled Theano function and keeps
    # a snapshot of the pre-update policy parameters in shared variables.
    EzPickle.__init__(self, stochpol, usercfg)
    cfg = update_default_config(self.options, usercfg)
    print("PPOUpdater", cfg)
    self.stochpol = stochpol
    self.cfg = cfg
    self.kl_coeff = 1.0
    kl_cutoff = cfg["kl_target"] * 2.0

    probtype = stochpol.probtype
    params = stochpol.trainable_variables
    # Shared-variable copies of the parameters, holding the "old" policy:
    old_params = [theano.shared(v.get_value()) for v in stochpol.trainable_variables]

    ob_no = stochpol.input
    act_na = probtype.sampled_variable()
    adv_n = T.vector("adv_n")
    kl_coeff = T.scalar("kl_coeff")

    # Probability distributions: the old one is the same graph with the
    # parameters swapped for their snapshots.
    self.loss_names = ["surr", "kl", "ent"]
    prob_np = stochpol.get_output()
    oldprob_np = theano.clone(stochpol.get_output(), replace=dict(zipsame(params, old_params)))

    p_n = probtype.likelihood(act_na, prob_np)
    oldp_n = probtype.likelihood(act_na, oldprob_np)
    N = ob_no.shape[0]
    ent = probtype.entropy(prob_np).mean()
    kl = probtype.kl(oldprob_np, prob_np).mean()

    # Policy gradient of the importance-sampled surrogate:
    surr = (-1.0 / N) * (p_n / oldp_n).dot(adv_n)
    train_losses = [surr, kl, ent]

    # Training: penalized surrogate with KL penalty and quadratic barrier.
    args = [ob_no, act_na, adv_n]
    surr, kl = train_losses[:2]
    pensurr = surr + kl_coeff * kl + cfg["kl_cutoff_coeff"] * (kl > kl_cutoff) * T.square(kl - kl_cutoff)
    self.train = theano.function(
        [kl_coeff] + args, train_losses,
        updates=stochpol.get_updates()
        + list(adam_updates(pensurr, params, learning_rate=cfg["stepsize"]).items()),
        **FNOPTS)
    self.test = theano.function(args, train_losses, **FNOPTS)  # XXX
    self.update_old_net = theano.function([], [], updates=list(zip(old_params, params)))
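# --- Hedged usage sketch (not part of the original file) --------------------
# Intended call pattern for the functions compiled above: run several Adam
# epochs against the frozen snapshot, then refresh the snapshot. The method
# name and the epoch count are illustrative assumptions.
def fit_batch(self, ob_no, act_na, adv_n, n_epochs=10):
    for _ in range(n_epochs):
        surr, kl, ent = self.train(self.kl_coeff, ob_no, act_na, adv_n)
    self.update_old_net()  # copy current params into the "old" shared variables
    return surr, kl, ent
# -----------------------------------------------------------------------------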
def __init__(self, ob_space, ac_space, usercfg):
    # PPO agent: builds the stochastic policy MLP and baseline, the
    # observation/reward filters, and the SGD-based updater.
    cfg = update_default_config(self.options, usercfg)
    policy, self.baseline = make_mlps(ob_space, ac_space, cfg)
    obfilter, rewfilter = make_filters(cfg, ob_space)
    self.updater = PpoSgdUpdater(policy, cfg)
    AgentWithPolicy.__init__(self, policy, obfilter, rewfilter)
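# --- Hedged usage sketch (not part of the original file) --------------------
# One outer iteration of how such an agent is typically driven. The helpers
# `rollout` and `compute_advantages` are hypothetical names standing in for
# this repo's trajectory-collection and advantage-estimation code.
def run_iteration(agent, env, n_timesteps):
    paths = rollout(env, agent, n_timesteps)       # collect trajectories
    compute_advantages(paths, agent.baseline)      # advantage / value targets
    agent.baseline.fit(paths)                      # refit the value function
    agent.updater(paths)                           # penalized-surrogate update
# -----------------------------------------------------------------------------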
def __init__(self, ob_space, ac_space, usercfg):
    # Deterministic agent: builds an MLP policy and disables stochastic
    # action sampling.
    cfg = update_default_config(self.options, usercfg)
    policy = make_deterministic_mlp(ob_space, ac_space, cfg)
    obfilter, rewfilter = make_filters(cfg, ob_space)
    AgentWithPolicy.__init__(self, policy, obfilter, rewfilter)
    self.set_stochastic(False)