def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
    #X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
    X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+2]) # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
    U.initialize() # Initialize uninitialized TF variables
def policy_loss_ppo(self, pi, oldpi, ac, atarg, ret):
    kl_oldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    mean_kl = U.mean(kl_oldnew)
    mean_ent = U.mean(ent)
    pol_entpen = -self._entcoeff * mean_ent
    action_prob = pi.pd.logp(ac) - oldpi.pd.logp(ac)
    action_loss = tf.exp(action_prob) * atarg
    ratio = tf.exp(action_prob)
    surr1 = ratio * atarg
    surr2 = U.clip(ratio, 1.0 - self._clip_param, 1.0 + self._clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(surr1, surr2))
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = {
        'total_loss': total_loss,
        'action_loss': action_loss,
        'pol_surr': pol_surr,
        'pol_entpen': pol_entpen,
        'kl': mean_kl,
        'entropy': mean_ent,
        'vf_loss': vf_loss
    }
    return losses
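# A minimal, framework-free sketch of the clipped surrogate computed above, using
# NumPy instead of TF. The function and variable names here are illustrative only
# and are not part of the surrounding codebase.
import numpy as np

def ppo_clipped_surrogate(logp_new, logp_old, adv, clip_param=0.2):
    """Return the (negated) PPO clipped surrogate averaged over the batch."""
    ratio = np.exp(logp_new - logp_old)           # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * adv                           # unclipped surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))     # pessimistic bound, negated for minimization

# usage sketch
logp_new = np.array([-1.0, -0.5, -2.0])
logp_old = np.array([-1.2, -0.7, -1.5])
adv = np.array([0.5, -0.3, 1.2])
print(ppo_clipped_surrogate(logp_new, logp_old, adv))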
def __init__(self, ob_dim, ac_dim):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob")  # batch of observations
    oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")  # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist")  # batch of previous action distributions
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage function estimate
    oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob')  # log probability of previous actions
    wd_dict = {}
    h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict)  # Mean control output
    self.wd_dict = wd_dict
    self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer())  # Variance on outputs
    logstd_1a = tf.expand_dims(logstd_1a, 0)
    std_1a = tf.exp(logstd_1a)
    std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
    ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
    sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim]  # This is the sampled action we'll perform.
    logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1)  # Logprob of sampled action
    logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1)  # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
    kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
    #kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n))  # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
    surr = - U.mean(adv_n * logprob_n)  # Loss function that we'll differentiate to get the policy gradient
    surr_sampled = - U.mean(logprob_n)  # Sampled loss of the policy
    self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])  # Generate a new action and its logprob
    #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl)  # Compute (approximate) KL divergence between old policy and new policy
    self.compute_kl = U.function([ob_no, oldac_dist], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)  # Input and output variables needed for computing loss
    U.initialize()  # Initialize uninitialized TF variables
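# A NumPy sketch of the diagonal-Gaussian quantities used above: the action
# log-probability matches the closed form in logprob_n / logprobsampled_n, and the
# KL follows the standard diagonal-Gaussian formula. The exact argument order of the
# kl_div helper in this codebase is assumed here; names are illustrative.
import numpy as np

def gaussian_logprob(mean, std, x):
    """log N(x; mean, diag(std^2)), summed over action dimensions."""
    k = mean.shape[-1]
    return (-np.sum(np.log(std), axis=-1)
            - 0.5 * k * np.log(2.0 * np.pi)
            - 0.5 * np.sum(np.square(x - mean) / np.square(std), axis=-1))

def gaussian_kl(mean_p, std_p, mean_q, std_q):
    """KL( N(mean_p, std_p^2) || N(mean_q, std_q^2) ) for diagonal Gaussians."""
    return np.sum(np.log(std_q / std_p)
                  + (np.square(std_p) + np.square(mean_p - mean_q)) / (2.0 * np.square(std_q))
                  - 0.5, axis=-1)

# usage sketch on a batch of two actions
mean = np.array([[0.0, 0.0], [0.1, -0.2]])
std = np.exp(np.zeros_like(mean))          # logstd initialized to zero, as above
acs = np.array([[0.3, -0.1], [0.0, 0.0]])
print(gaussian_logprob(mean, std, acs))
print(gaussian_kl(mean, std, mean + 0.05, std * 1.1))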
def __init__(self, ob_dim, ac_dim, ac_space, bins):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob")  # batch of observations
    oldac_na = tf.placeholder(tf.int32, shape=[None, ac_dim], name="ac")  # batch of previous actions
    oldac_logits = tf.placeholder(tf.float32, shape=[None, ac_dim * bins], name="oldac_logit")  # batch of previous action distributions
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage function estimate
    self.pdtype = make_pdtype(ac_space)
    wd_dict = {}
    # forward pass
    h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    logits_na = dense(h2, self.pdtype.param_shape()[0], "logits", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict)  # logits of the action distribution
    self.wd_dict = wd_dict
    self.pd = self.pdtype.pdfromflat(logits_na)  # multi-categorical distributions
    # sample action for control
    sampled_ac_na = self.pd.sample()
    # log prob for sampled actions
    logprobsampled_n = -self.pd.neglogp(sampled_ac_na)
    logprob_n = -self.pd.neglogp(oldac_na)
    # kl div
    old_pd = self.pdtype.pdfromflat(oldac_logits)
    kl = U.mean(old_pd.kl(self.pd))
    # surr loss
    surr = -U.mean(adv_n * logprob_n)
    surr_sampled = -U.mean(logprob_n)
    # expressions
    self._act = U.function([ob_no], [sampled_ac_na, logits_na, logprobsampled_n])
    self.compute_kl = U.function([ob_no, oldac_logits], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)
    U.initialize()
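# The multi-categorical policy above samples one categorical action per action
# dimension from the flat logits. A hedged NumPy sketch of that sampling step
# (softmax then a draw per dimension); the (ac_dim, bins) reshape convention and
# all names here are assumptions for illustration, not the pdtype implementation.
import numpy as np

def sample_multicategorical(logits, ac_dim, bins, rng=None):
    """logits: [batch, ac_dim * bins] -> integer actions [batch, ac_dim]."""
    rng = rng or np.random.default_rng()
    logits = logits.reshape(-1, ac_dim, bins)
    z = logits - logits.max(axis=-1, keepdims=True)              # stabilize softmax
    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    batch = probs.shape[0]
    actions = np.empty((batch, ac_dim), dtype=np.int64)
    for b in range(batch):
        for d in range(ac_dim):
            actions[b, d] = rng.choice(bins, p=probs[b, d])
    return actions

# usage sketch: batch of 4, ac_dim=2, bins=5
logits = np.random.randn(4, 2 * 5)
print(sample_multicategorical(logits, ac_dim=2, bins=5))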
def __init__(
        self,
        ob_space,
        ac_space,
        model_func,
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        adam_epsilon=1e-5,
):
    with tf.variable_scope('pi'):
        self.pi = pi = model_func(ob_space, ac_space)
    with tf.variable_scope('pi_old'):
        self.pi_old = pi_old = model_func(ob_space, ac_space)
    self.adv = tf.placeholder(dtype=tf.float32, shape=[None], name='adv')  # Target advantage function (if applicable)
    self.ret = tf.placeholder(dtype=tf.float32, shape=[None], name='ret')  # Empirical return
    self.lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * self.lrmult  # Annealed clipping parameter epsilon
    self.ac = ac = pi.pdtype.sample_placeholder([None])
    kloldnew = pi_old.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent
    ratio = tf.exp(pi.pd.logp(ac) - pi_old.pd.logp(ac))  # pnew / pold
    surr1 = ratio * self.adv  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * self.adv
    pol_surr = -U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - self.ret))
    self.total_loss = pol_surr + pol_entpen + vf_loss
    # gradients
    self.grads = tf.gradients(self.total_loss, pi.train_vars)
    self.flat_grads = U.flatgrad(self.total_loss, pi.train_vars)
    # optimizer
    self.optimizer = MpiAdam(pi.train_vars, epsilon=adam_epsilon)
    # assign new pi to old pi
    self.op_assign_old_eq_new = tf.group(*[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(pi_old.global_vars, pi.global_vars)
    ])
    U.initialize()
    self.optimizer.sync()
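# U.flatgrad above concatenates the per-variable gradients into one flat vector so
# MpiAdam can average and apply it across workers. A small NumPy sketch of that
# flatten / unflatten round trip; names are illustrative and not the baselines API.
import numpy as np

def flatten(arrays):
    """Concatenate a list of arrays into a single 1-D vector."""
    return np.concatenate([a.ravel() for a in arrays])

def unflatten(flat, shapes):
    """Split a flat vector back into arrays with the given shapes."""
    out, start = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        out.append(flat[start:start + size].reshape(shape))
        start += size
    return out

# usage sketch: two "gradient" tensors of different shapes
grads = [np.ones((3, 2)), np.arange(4.0)]
flat = flatten(grads)
restored = unflatten(flat, [g.shape for g in grads])
assert all(np.array_equal(a, b) for a, b in zip(grads, restored))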
def _init(self, ob_dim, act_dim, num_units=3, num_layers=4, batch=None):
    assert batch is not None
    self.batch = batch
    ob_act = tf.placeholder(tf.float32, shape=[1, ob_dim * 2], name="ob_act")
    ob_train = tf.placeholder(tf.float32, shape=[batch, ob_dim * 2], name="ob_train")
    oldac_na = tf.placeholder(tf.float32, shape=[batch, act_dim], name="ac")
    action_act = tf.placeholder(tf.float32, shape=[1, act_dim], name="ac_act")
    oldac_dist = tf.placeholder(tf.float32, shape=[batch], name="oldac_dist")  # logprob for old actions
    adv_n = tf.placeholder(tf.float32, shape=[batch], name="adv")
    wd_dict = {}
    # module for execution and training
    policy_train = NormalizingFlowStateModel(ob_train, oldac_na, name='policy', reuse=False,
                                             num_units=num_units, num_layers=num_layers)
    policy_act = NormalizingFlowStateModel(ob_act, action_act, name='policy', reuse=True,
                                           num_units=num_units, num_layers=num_layers)
    # weight decay
    self.wd_dict = {}  # TODO
    # action for execution
    self.pi_act = policy_act.y_sample
    self.log_prob_act = policy_act.log_prob
    # kl divergence
    ac_dist = policy_train.log_prob  # logprob
    kl = U.mean(oldac_dist - ac_dist)  # sample based kl
    # surr loss
    surr = -U.mean(adv_n * ac_dist)
    surr_sampled = -U.mean(ac_dist)
    # functions
    self._act = U.function([ob_act], self.pi_act)
    self._act_logprob = U.function([ob_act, action_act], self.log_prob_act)
    self.compute_kl = U.function([ob_train, oldac_na, oldac_dist], kl)
    self.update_info = ((ob_train, oldac_na, adv_n), surr, surr_sampled)
    U.initialize()
def load_policy(env, policy_func, *,
                clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
                adam_epsilon=1e-5,
                model_path, checkpoint):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = - U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()
    U.load_state(os.path.join(model_path, 'model-{}'.format(checkpoint)))
    return pi
def policy_loss_ppo(self, pi, oldpi, ac, atarg, ret, term=None, entcoeff=None):
    kl_oldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    mean_kl = U.mean(kl_oldnew)
    mean_ent = U.mean(ent)
    entcoeff = self._entcoeff if entcoeff is None else entcoeff
    logger.info('Policy {} entropy {}'.format(pi.name, entcoeff))
    pol_entpen = -entcoeff * mean_ent
    action_prob = pi.pd.logp(ac) - oldpi.pd.logp(ac)
    action_prob = tf.check_numerics(action_prob, 'check action_prob')
    atarg = tf.check_numerics(atarg, 'check atarg')
    action_loss = tf.exp(action_prob) * atarg
    action_loss = tf.check_numerics(action_loss, 'check action_loss')
    term_loss = None
    if term is not None:
        # ignore prob of actions if term is True
        action_prob = (1 - tf.to_float(term)) * action_prob
        if pi.term_activation == 'sigmoid':
            term_prob = tf.log(pi.term_pred + 1e-5) - tf.clip_by_value(tf.log(oldpi.term_pred + 1e-5), -20, 20)
        else:
            term_prob = pi.term_pd.logp(term) - tf.clip_by_value(oldpi.term_pd.logp(term), -20, 20)
        action_prob += term_prob
        term_loss = tf.exp(term_prob) * atarg
    ratio = tf.exp(action_prob)
    surr1 = ratio * atarg
    surr2 = U.clip(ratio, 1.0 - self._clip_param, 1.0 + self._clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(surr1, surr2))
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    pol_surr = tf.check_numerics(pol_surr, 'check pol_surr')
    vf_loss = tf.check_numerics(vf_loss, 'check vf_loss')
    total_loss = pol_surr + pol_entpen + vf_loss
    total_loss = tf.check_numerics(total_loss, 'check total_loss')
    losses = {'total_loss': total_loss,
              'action_loss': action_loss,
              'pol_surr': pol_surr,
              'pol_entpen': pol_entpen,
              'kl': mean_kl,
              'entropy': mean_ent,
              'vf_loss': vf_loss}
    if term_loss is not None:
        losses.update({'term_loss': term_loss})
    return losses
def __init__(self, sess, ob_dim, ac_dim, vf_lr=0.001, cv_lr=0.001, reuse=False):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    self.relaxed = False
    self.X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2])  # batch of observations
    self.ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob")  # batch of observations
    self.oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")  # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist")  # batch of previous action distributions

    with tf.variable_scope("model", reuse=reuse):
        h1 = tf.nn.tanh(dense(self.ob_no, 64, "pi_h1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h2 = tf.nn.tanh(dense(h1, 64, "pi_h2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        mean_na = dense(h2, ac_dim, "pi", weight_init=U.normc_initializer(0.1), bias_init=0.0)  # Mean control output
        self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer())  # Variance on outputs
        logstd_1a = tf.expand_dims(logstd_1a, 0)
        self.std_1a = tf.exp(logstd_1a)
        self.std_na = tf.tile(self.std_1a, [tf.shape(mean_na)[0], 1])
        ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(self.std_na, [-1, ac_dim])], 1)
        sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim]  # This is the sampled action we'll perform.
        logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1)  # Logprob of sampled action
        self.logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - self.oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1)  # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
        kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))

        vh1 = tf.nn.elu(dense(self.X, 64, "vf_h1", weight_init=U.normc_initializer(1.0), bias_init=0))
        vh2 = tf.nn.elu(dense(vh1, 64, "vf_h2", weight_init=U.normc_initializer(1.0), bias_init=0))
        vpred_n = dense(vh2, 1, "vf", weight_init=None, bias_init=0)
        v0 = vpred_n[:, 0]
        self.vf_optim = tf.train.AdamOptimizer(vf_lr)

    def act(ob):
        ac, dist, logp = sess.run([sampled_ac_na, ac_dist, logprobsampled_n], {self.ob_no: ob[None]})  # Generate a new action and its logprob
        return ac[0], dist[0], logp[0]

    def value(obs, x):
        return sess.run(v0, {self.X: x, self.ob_no: obs})

    def preproc(path):
        l = pathlength(path)
        al = np.arange(l).reshape(-1, 1) / 10.0
        act = path["action_dist"].astype('float32')
        X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
        return X

    def predict(obs, path):
        return value(obs, preproc(path))

    def compute_kl(ob, dist):
        return sess.run(kl, {self.ob_no: ob, oldac_dist: dist})

    self.mean = mean_na
    self.vf = v0
    self.act = act
    self.value = value
    self.preproc = preproc
    self.predict = predict
    self.compute_kl = compute_kl
    self.a0 = sampled_ac_na
def policy_loss_trpo(self, pi, oldpi, ob, ac, atarg, ret):
    raise NotImplementedError()
    kl_oldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    mean_kl = U.mean(kl_oldnew)
    mean_ent = U.mean(ent)
    pol_entpen = -self._entcoeff * mean_ent
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
    pol_surr = U.mean(ratio * atarg)
    pol_loss = pol_surr + pol_entpen
    losses = {'pol_loss': pol_loss,
              'pol_surr': pol_surr,
              'pol_entpen': pol_entpen,
              'kl': mean_kl,
              'entropy': mean_ent,
              'vf_loss': vf_loss}
    return losses
def policy_loss_ppo_term(self, pi, oldpi, atarg, ret, term):
    if pi.term_type == 'sigmoid':
        term_prob = tf.log(pi.term_pred + 1e-5) - tf.clip_by_value(tf.log(oldpi.term_pred + 1e-5), -20, 20)
    else:
        term_prob = pi.term_pd.logp(term) - tf.clip_by_value(oldpi.term_pd.logp(term), -20, 20)
    term_loss = tf.exp(term_prob) * atarg
    ratio = tf.exp(term_prob)
    surr1 = ratio * atarg
    surr2 = U.clip(ratio, 1.0 - self._clip_param, 1.0 + self._clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(surr1, surr2))
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    pol_surr = tf.check_numerics(pol_surr, 'check pol_surr')
    vf_loss = tf.check_numerics(vf_loss, 'check vf_loss')
    total_loss = pol_surr + vf_loss
    total_loss = tf.check_numerics(total_loss, 'check total_loss')
    losses = {'total_loss': total_loss,
              'pol_surr': pol_surr,
              'vf_loss': vf_loss,
              'term_loss': term_loss}
    return losses
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
    X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
    U.initialize() # Initialize uninitialized TF variables
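# The value-function fitter above regresses vpred_n onto a target vtarg_n fed in by
# the training loop. A minimal NumPy sketch of one common choice of target, the
# discounted return of a finished trajectory; the callers in this codebase may use
# TD(lambda) returns instead, so this is only an illustration.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """R_t = r_t + gamma * R_{t+1}, computed backwards over one episode."""
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# usage sketch
print(discounted_returns(np.array([1.0, 0.0, 0.0, 1.0]), gamma=0.9))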
def learn(env, policy_func, *, timesteps_per_batch, # what to train on log_every=None, log_dir=None, episodes_so_far=0, timesteps_so_far=0, iters_so_far=0, max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, **kwargs ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) # Target advantage function (if applicable) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- # GRASPING saver = tf.train.Saver(var_list=U.ALREADY_INITIALIZED, max_to_keep=1) checkpoint = tf.train.latest_checkpoint(log_dir) if checkpoint: print("Restoring checkpoint: {}".format(checkpoint)) saver.restore(U.get_session(), checkpoint) if hasattr(env, "set_actor"): def actor(obs): return pi.act(False, obs)[0] env.set_actor(actor) if not checkpoint and hasattr(env, 
"warm_init_eps"): pretrain(pi, env) saver.save(U.get_session(), osp.join(log_dir, "model")) # /GRASPING seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) tstart = time.time() assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 while True: if callback: callback(locals(), globals()) should_break = False if max_timesteps and timesteps_so_far >= max_timesteps: should_break = True elif max_episodes and episodes_so_far >= max_episodes: should_break = True elif max_iters and iters_so_far >= max_iters: should_break = True if log_every and log_dir: if (iters_so_far + 1) % log_every == 0 or should_break: # To reduce space, don't specify global step. saver.save(U.get_session(), osp.join(log_dir, "model")) job_info = {'episodes_so_far': episodes_so_far, 'iters_so_far': iters_so_far, 'timesteps_so_far': timesteps_so_far} with open(osp.join(log_dir, "job_info_new.yaml"), 'w') as file: yaml.dump(job_info, file, default_flow_style=False) # Make sure write is instantaneous. file.flush() os.fsync(file) os.rename(osp.join(log_dir, "job_info_new.yaml"), osp.join(log_dir, "job_info.yaml")) if should_break: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-10) # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) meanlosses = None if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) if meanlosses is not None: for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) logger.record_tabular("EpLenMean", np.mean(lens)) logger.record_tabular("EpRewMean", np.mean(rews)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
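# The TRPO update above solves F x = g with conjugate gradient, using only
# Fisher-vector products. A self-contained NumPy sketch of that solver; the exact
# cg() used by this code may differ in damping and verbosity details.
import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Approximately solve A x = b where A is given implicitly by f_Ax."""
    x = np.zeros_like(b)
    r = b.copy()            # residual b - A x (x = 0 initially)
    p = b.copy()            # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# usage sketch with an explicit SPD matrix standing in for the Fisher
A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A.dot(v), b)
print(x, A.dot(x))   # A @ x should be close to b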
def learn(env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) num_options=1, app='', saves=False, wsaves=False, epoch=-1, seed=1, dc=0 ): optim_batchsize_ideal = optim_batchsize np.random.seed(seed) tf.set_random_seed(seed) env.seed(seed) ### Book-keeping gamename = env.spec.id[:-3].lower() gamename += 'seed' + str(seed) gamename += app # This variable: "version name, defines the name of the training" version_name = '25er_alternation_SEPARATE_optimization-ppo-ESCH-1-0-0-nI' dirname = '{}_{}_{}opts_saves/'.format(version_name,gamename,num_options) print (dirname) # retrieve everything using relative paths. Create a train_results folder where the repo has been cloned dirname_rel = os.path.dirname(__file__) splitted = dirname_rel.split("/") dirname_rel = ("/".join(dirname_rel.split("/")[:len(splitted)-3])+"/") dirname = dirname_rel + "train_results/" + dirname # if saving -> create the necessary directories if wsaves: first=True if not os.path.exists(dirname): os.makedirs(dirname) first = False # copy also the original files into the folder where the training results are stored files = ['pposgd_simple.py','mlp_policy.py','run_mujoco.py'] first = True for i in range(len(files)): src = os.path.join(dirname_rel,'baselines/baselines/ppo1/') + files[i] print (src) #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname dest = dirname + "src_code/" if (first): os.makedirs(dest) first = False print (dest) shutil.copy2(src,dest) # brute force copy normal env file at end of copying process: src = os.path.join(dirname_rel,'nfunk/envs_nf/pendulum_nf.py') shutil.copy2(src,dest) shutil.copy2(src,dest) os.makedirs(dest+"assets/") src = os.path.join(dirname_rel,'nfunk/envs_nf/assets/clockwise.png') shutil.copy2(src,dest+"assets/") ### # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space max_action = env.action_space.high # add the dimension in the observation space! 
ob_space.shape =((ob_space.shape[0] + ac_space.shape[0]),) print (ob_space.shape) print (ac_space.shape) pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return pol_ov_op_ent = tf.placeholder(dtype=tf.float32, shape=None) # Entropy coefficient for policy over options lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon for PPO # setup observation, option and terminal advantace ob = U.get_placeholder_cached(name="ob") option = U.get_placeholder_cached(name="option") term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None]) # create variable for action ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent # propability of choosing action under new policy vs old policy (PPO) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage of choosing the action atarg_clip = atarg # surrogate 1: surr1 = ratio * atarg_clip #atarg # surrogate from conservative policy iteration # surrogate 2: surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg_clip # PPO's pessimistic surrogate (L^CLIP) pol_surr = - U.mean(tf.minimum(surr1, surr2)) # Loss on the Q-function vf_loss = U.mean(tf.square(pi.vpred - ret)) # calculate the total loss total_loss = vf_loss intra_op = pol_surr losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] # calculate logarithm of propability of policy over options log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0)) # calculate logarithm of propability of policy over options old parameter old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0)) # calculate entropy of policy over options entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1) # calculate the ppo update for the policy over options: ratio_pol_ov_op = tf.exp(tf.transpose(log_pi)[option[0]] - tf.transpose(old_log_pi)[option[0]]) # pnew / pold term_adv_clip = term_adv surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip # surrogate from conservative policy iteration surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param, 1.0 + clip_param) * term_adv_clip # pol_surr_pol_ov_op = - U.mean(tf.minimum(surr1_pol_ov_op, surr2_pol_ov_op)) # PPO's pessimistic surrogate (L^CLIP) op_loss = pol_surr_pol_ov_op - pol_ov_op_ent*tf.reduce_sum(entropy) # add loss of policy over options to total loss #total_loss += op_loss total_loss1 = total_loss + intra_op total_loss2 = total_loss + op_loss var_list = pi.get_trainable_variables() term_list = var_list[6:8] # define function that we will later do gradient descent on lossandgrad1 = U.function([ob, ac, atarg, ret, lrmult,option, term_adv,pol_ov_op_ent], losses + [U.flatgrad(total_loss1, var_list)]) lossandgrad2 = U.function([ob, ac, atarg, ret, lrmult,option, term_adv,pol_ov_op_ent], losses + [U.flatgrad(total_loss2, var_list)]) # define adam optimizer adam = MpiAdam(var_list, epsilon=adam_epsilon) # define function that will assign the current parameters to the old policy assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in 
zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses) U.initialize() adam.sync() # NOW: everything for training was defined, from here on we start with the execution: # initialize "savers" which will store the results saver = tf.train.Saver(max_to_keep=10000) saver_best = tf.train.Saver(max_to_keep=1) ### Define the names of the .csv files that are going to be stored results=[] if saves: results = open(dirname + version_name + '_' + gamename +'_'+str(num_options)+'opts_'+'_results.csv','w') results_best_model = open(dirname + version_name + '_' + gamename +'_'+str(num_options)+'opts_'+'_bestmodel.csv','w') out = 'epoch,avg_reward' for opt in range(num_options): out += ',option {} dur'.format(opt) for opt in range(num_options): out += ',option {} std'.format(opt) for opt in range(num_options): out += ',option {} term'.format(opt) for opt in range(num_options): out += ',option {} adv'.format(opt) out+='\n' results.write(out) results.flush() # speciality: if running the training with epoch argument -> a model is loaded if epoch >= 0: dirname = '{}_{}opts_saves/'.format(gamename,num_options) print("Loading weights from iteration: " + str(epoch)) filename = dirname + '{}_epoch_{}.ckpt'.format(gamename,epoch) saver.restore(U.get_session(),filename) ### # start training episodes_so_far = 0 timesteps_so_far = 0 global iters_so_far iters_so_far = 0 alternating_frequency = 25 # defines after how many epochs we switch optimizing between control and communication des_pol_op_ent = 0.1 # define policy over options entropy scheduling max_val = -100000 # define max_val, this will be updated to always store the best model tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options,saves=saves,results=results,rewbuffer=rewbuffer,dc=dc) datas = [0 for _ in range(num_options)] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) # Sample (s,a)-Transitions seg = seg_gen.__next__() # Calculate A(s,a,o) using GAE add_vtarg_and_adv(seg, gamma, lam) # calculate information for logging opt_d = [] for i in range(num_options): dur = np.mean(seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0. opt_d.append(dur) std = [] for i in range(num_options): logstd = np.mean(seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0. 
std.append(np.exp(logstd)) print("mean opt dur:", opt_d) print("mean op pol:", np.mean(np.array(seg['optpol_p']),axis=0)) print("mean term p:", np.mean(np.array(seg['term_p']),axis=0)) print("mean value val:", np.mean(np.array(seg['value_val']),axis=0)) ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy if hasattr(pi, "ob_rms_only"): pi.ob_rms_only.update(ob[:,:-ac_space.shape[0]]) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values # if iterations modulo 1000 -> adapt entropy scheduling coefficient #if ((iters_so_far+1)%1000 and (iters_so_far+1)>=2000) == 0: if ((iters_so_far+1)%1000) == 0: des_pol_op_ent = des_pol_op_ent/10 # every 50 epochs save the best model if iters_so_far % 50 == 0 and wsaves: print("weights are saved...") filename = dirname + '{}_epoch_{}.ckpt'.format(gamename,iters_so_far) save_path = saver.save(U.get_session(),filename) # adaptively save best model -> if current reward is highest, save the model if (np.mean(rewbuffer)>max_val) and wsaves: max_val = np.mean(rewbuffer) results_best_model.write('epoch: '+str(iters_so_far) + 'rew: ' + str(np.mean(rewbuffer)) + '\n') results_best_model.flush() filename = dirname + 'best.ckpt'.format(gamename,iters_so_far) save_path = saver_best.save(U.get_session(),filename) # minimum batch size: min_batch=160 t_advs = [[] for _ in range(num_options)] # select all the samples concering one of the options # Note: so far the update is that we first use all samples from option 0 to update, then we use all samples from option 1 to update for opt in range(num_options): indices = np.where(opts==opt)[0] print("batch size:",indices.size) opt_d[opt] = indices.size if not indices.size: t_advs[opt].append(0.) continue ### This part is only necessasry when we use options. We proceed to these verifications in order not to discard any collected trajectories. if datas[opt] != 0: if (indices.size < min_batch and datas[opt].n > min_batch): datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue elif indices.size + datas[opt].n < min_batch: # pdb.set_trace() oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'],ob[indices])) cat_ac = np.concatenate((oldmap['ac'],ac[indices])) cat_atarg = np.concatenate((oldmap['atarg'],atarg[indices])) cat_vtarg = np.concatenate((oldmap['vtarg'],tdlamret[indices])) datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) t_advs[opt].append(0.) 
continue elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch) or (indices.size > min_batch and datas[opt].n < min_batch): oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'],ob[indices])) cat_ac = np.concatenate((oldmap['ac'],ac[indices])) cat_atarg = np.concatenate((oldmap['atarg'],atarg[indices])) cat_vtarg = np.concatenate((oldmap['vtarg'],tdlamret[indices])) datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) if (indices.size > min_batch and datas[opt].n > min_batch): datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) elif datas[opt] == 0: datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) ### # define the batchsize of the optimizer: optim_batchsize = optim_batchsize or ob.shape[0] print("optim epochs:", optim_epochs) logger.log("Optimizing...") # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): # Calculate advantage for using specific option here tadv,nodc_adv = pi.get_opt_adv(batch["ob"],[opt]) tadv = tadv if num_options > 1 else np.zeros_like(tadv) t_advs[opt].append(nodc_adv) # calculate the gradient #VAR 1: #if ((iters_so_far+1)>=2000): # *newlosses, grads = lossandgrad2(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent) #else: # *newlosses, grads = lossandgrad1(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent) if (int((iters_so_far)/alternating_frequency)%2==1): *newlosses, grads = lossandgrad2(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent) else: #print ("optim comm always") *newlosses, grads = lossandgrad1(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent) # perform gradient update adam.update(grads, optim_stepsize * cur_lrmult) losses.append(newlosses) # do logging: lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular() ### Book keeping if saves: out = "{},{}" for _ in range(num_options): out+=",{},{},{},{}" out+="\n" info = [iters_so_far, np.mean(rewbuffer)] for i in range(num_options): info.append(opt_d[i]) for i in range(num_options): info.append(std[i]) for i in range(num_options): info.append(np.mean(np.array(seg['term_p']),axis=0)[i]) for i in range(num_options): info.append(np.mean(t_advs[i])) results.write(out.format(*info)) results.flush()
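# Both training loops call add_vtarg_and_adv(seg, gamma, lam) to fill in GAE
# advantages and TD(lambda) return targets. A standalone NumPy sketch of that
# computation for a single trajectory with a bootstrap value; the segment
# dictionary layout and episode-boundary handling of the real helper are not
# reproduced here.
import numpy as np

def gae_advantages(rewards, values, last_value, gamma=0.99, lam=0.95):
    """Return (advantages, TD(lambda) returns) for one trajectory."""
    T = len(rewards)
    values_ext = np.append(values, last_value)     # bootstrap with V(s_T)
    adv = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values_ext[t + 1] - values_ext[t]
        lastgaelam = delta + gamma * lam * lastgaelam
        adv[t] = lastgaelam
    return adv, adv + values                       # tdlamret = advantage + V

# usage sketch
rew = np.array([1.0, 0.0, 1.0])
vpred = np.array([0.5, 0.4, 0.6])
adv, vtarg = gae_advantages(rew, vpred, last_value=0.0)
print(adv, vtarg)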
def cmaml_loss(neglogpacs, advantage):
    # add in correction term.
    mean_adv = U.mean(advantage)
    exploration_term = U.mean(neglogpacs) * mean_adv
    return exploration_term
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) num_options=1, app='', saves=False, wsaves=False, epoch=-1, seed=1, dc=0): optim_batchsize_ideal = optim_batchsize np.random.seed(seed) tf.set_random_seed(seed) env.seed(seed) ### Book-keeping gamename = env.spec.id[:-3].lower() gamename += 'seed' + str(seed) gamename += app version_name = 'FINAL_NORM-ACT-LOWER-LR-len-400-wNoise-update1-ppo-ESCH-1-2-5-nI' dirname = '{}_{}_{}opts_saves/'.format(version_name, gamename, num_options) print(dirname) #input ("wait here after dirname") if wsaves: first = True if not os.path.exists(dirname): os.makedirs(dirname) first = False # while os.path.exists(dirname) and first: # dirname += '0' files = ['pposgd_simple.py', 'mlp_policy.py', 'run_mujoco.py'] first = True for i in range(len(files)): src = os.path.join( '/home/nfunk/Code_MA/ppoc_off_tryout/baselines/baselines/ppo1/' ) + files[i] print(src) #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname dest = dirname + "src_code/" if (first): os.makedirs(dest) first = False print(dest) shutil.copy2(src, dest) # brute force copy normal env file at end of copying process: src = os.path.join( '/home/nfunk/Code_MA/ppoc_off_tryout/nfunk/envs_nf/pendulum_nf.py') shutil.copy2(src, dest) ### # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space max_action = env.action_space.high # add the dimension in the observation space! 
ob_space.shape = ((ob_space.shape[0] + ac_space.shape[0]), ) print(ob_space.shape) print(ac_space.shape) #input ("wait here where the spaces are printed!!!") pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return pol_ov_op_ent = tf.placeholder(dtype=tf.float32, shape=None) # Empirical return # option = tf.placeholder(dtype=tf.int32, shape=[None]) lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # pdb.set_trace() ob = U.get_placeholder_cached(name="ob") option = U.get_placeholder_cached(name="option") term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None]) ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold atarg_clip = atarg #tf.clip_by_value(atarg,-10,10) surr1 = ratio * atarg_clip #atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg_clip #atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) #vf_loss = U.mean(tf.square(tf.clip_by_value(pi.vpred - ret, -10.0, 10.0))) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] term_loss = pi.tpred * term_adv force_pi_loss = U.mean( tf.square( tf.clip_by_value(pi.op_pi, 1e-5, 1.0) - tf.constant([[0.05, 0.95]]))) log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0)) #log_pi = tf.Print(log_pi, [log_pi, tf.shape(tf.transpose(log_pi))]) old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0)) entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1) ratio_pol_ov_op = tf.exp( tf.transpose(log_pi)[option[0]] - tf.transpose(old_log_pi)[option[0]]) # pnew / pold term_adv_clip = term_adv #tf.clip_by_value(term_adv,-10,10) surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip # surrogate from conservative policy iteration surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param, 1.0 + clip_param) * term_adv_clip # pol_surr_pol_ov_op = -U.mean( tf.minimum(surr1_pol_ov_op, surr2_pol_ov_op)) # PPO's pessimistic surrogate (L^CLIP) op_loss = pol_surr_pol_ov_op - pol_ov_op_ent * tf.reduce_sum(entropy) #op_loss = pol_surr_pol_ov_op #total_loss += force_pi_loss total_loss += op_loss var_list = pi.get_trainable_variables() term_list = var_list[6:8] lossandgrad = U.function( [ob, ac, atarg, ret, lrmult, option, term_adv, pol_ov_op_ent], losses + [U.flatgrad(total_loss, var_list)]) termloss = U.function([ob, option, term_adv], [U.flatgrad(term_loss, var_list) ]) # Since we will use a different step size. 
adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses) U.initialize() adam.sync() saver = tf.train.Saver(max_to_keep=10000) saver_best = tf.train.Saver(max_to_keep=1) ### More book-kepping results = [] if saves: results = open( version_name + '_' + gamename + '_' + str(num_options) + 'opts_' + '_results.csv', 'w') results_best_model = open( dirname + version_name + '_' + gamename + '_' + str(num_options) + 'opts_' + '_bestmodel.csv', 'w') out = 'epoch,avg_reward' for opt in range(num_options): out += ',option {} dur'.format(opt) for opt in range(num_options): out += ',option {} std'.format(opt) for opt in range(num_options): out += ',option {} term'.format(opt) for opt in range(num_options): out += ',option {} adv'.format(opt) out += '\n' results.write(out) # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n') results.flush() if epoch >= 0: dirname = '{}_{}opts_saves/'.format(gamename, num_options) print("Loading weights from iteration: " + str(epoch)) filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch) saver.restore(U.get_session(), filename) ### episodes_so_far = 0 timesteps_so_far = 0 global iters_so_far iters_so_far = 0 des_pol_op_ent = 0.1 max_val = -100000 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options, saves=saves, results=results, rewbuffer=rewbuffer, dc=dc) datas = [0 for _ in range(num_options)] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) opt_d = [] for i in range(num_options): dur = np.mean( seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0. opt_d.append(dur) std = [] for i in range(num_options): logstd = np.mean( seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0. 
std.append(np.exp(logstd)) print("mean opt dur:", opt_d) print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0)) print("mean term p:", np.mean(np.array(seg['term_p']), axis=0)) print("mean value val:", np.mean(np.array(seg['value_val']), axis=0)) ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy if hasattr(pi, "ob_rms_only"): pi.ob_rms_only.update(ob[:, :-ac_space.shape[0]] ) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values if (iters_so_far + 1) % 1000 == 0: des_pol_op_ent = des_pol_op_ent / 10 if iters_so_far % 50 == 0 and wsaves: print("weights are saved...") filename = dirname + '{}_epoch_{}.ckpt'.format( gamename, iters_so_far) save_path = saver.save(U.get_session(), filename) # adaptively save best run: if (np.mean(rewbuffer) > max_val) and wsaves: max_val = np.mean(rewbuffer) results_best_model.write('epoch: ' + str(iters_so_far) + 'rew: ' + str(np.mean(rewbuffer)) + '\n') results_best_model.flush() filename = dirname + 'best.ckpt'.format(gamename, iters_so_far) save_path = saver_best.save(U.get_session(), filename) min_batch = 160 # Arbitrary t_advs = [[] for _ in range(num_options)] for opt in range(num_options): indices = np.where(opts == opt)[0] print("batch size:", indices.size) opt_d[opt] = indices.size if not indices.size: t_advs[opt].append(0.) continue ### This part is only necessasry when we use options. We proceed to these verifications in order not to discard any collected trajectories. if datas[opt] != 0: if (indices.size < min_batch and datas[opt].n > min_batch): datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue elif indices.size + datas[opt].n < min_batch: # pdb.set_trace() oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) t_advs[opt].append(0.) 
continue elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch) or (indices.size > min_batch and datas[opt].n < min_batch): oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) if (indices.size > min_batch and datas[opt].n > min_batch): datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) elif datas[opt] == 0: datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) ### optim_batchsize = optim_batchsize or ob.shape[0] optim_epochs = np.clip( np.int(10 * (indices.size / (timesteps_per_batch / num_options))), 10, 10) if num_options > 1 else optim_epochs print("optim epochs:", optim_epochs) logger.log("Optimizing...") # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): #tadv,nodc_adv = pi.get_term_adv(batch["ob"],[opt]) tadv, nodc_adv = pi.get_opt_adv(batch["ob"], [opt]) tadv = tadv if num_options > 1 else np.zeros_like(tadv) t_advs[opt].append(nodc_adv) #if (opt==1): # *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) #else: # *newlosses, grads = lossandgrad0(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv, des_pol_op_ent) #*newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) #termg = termloss(batch["ob"], [opt], tadv) #adam.update(termg[0], 5e-7 * cur_lrmult) adam.update(grads, optim_stepsize * cur_lrmult) losses.append(newlosses) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() ### Book keeping if saves: out = "{},{}" for _ in range(num_options): out += ",{},{},{},{}" out += "\n" info = [iters_so_far, np.mean(rewbuffer)] for i in range(num_options): info.append(opt_d[i]) for i in range(num_options): info.append(std[i]) for i in range(num_options): info.append(np.mean(np.array(seg['term_p']), axis=0)[i]) for i in range(num_options): info.append(np.mean(t_advs[i])) results.write(out.format(*info)) results.flush()
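# The loops above anneal two scalars over training: a linear learning-rate
# multiplier (schedule == 'linear') and a policy-over-options entropy coefficient
# that is divided by 10 every 1000 iterations. A tiny sketch of those schedules in
# isolation; the initial values are the ones used above, the helper names are
# illustrative.
def lr_multiplier(timesteps_so_far, max_timesteps, schedule='linear'):
    if schedule == 'constant':
        return 1.0
    if schedule == 'linear':
        return max(1.0 - float(timesteps_so_far) / max_timesteps, 0.0)
    raise NotImplementedError(schedule)

def entropy_coeff(iters_so_far, initial=0.1, every=1000, factor=10.0):
    return initial / (factor ** ((iters_so_far + 1) // every))

# usage sketch
print([round(lr_multiplier(t, 1000), 2) for t in (0, 250, 500, 1000)])
print([entropy_coeff(i) for i in (0, 999, 1999)])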
def learn( env, policy_func, discriminator, expert_dataset, pretrained, pretrained_weight, *, g_step, d_step, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, save_per_iter=100, ckpt_dir=None, log_dir=None, load_model_path=None, task_name=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] d_adam = MpiAdam(discriminator.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n( [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out writer = U.FileWriter(log_dir) U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, discriminator, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 
tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 g_loss_stats = stats(loss_names) d_loss_stats = stats(discriminator.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) # if provieded model path if load_model_path is not None: U.load_state(load_model_path) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break # Save model if iters_so_far % save_per_iter == 0 and ckpt_dir is not None: U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far) logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) g_losses = meanlosses for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, discriminator.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) batch_size = len(ob) // d_step d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # update running mean/std for discriminator if hasattr(discriminator, "obs_rms"): discriminator.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() g_loss_stats.add_all_summary(writer, g_losses, iters_so_far) d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0), iters_so_far) ep_stats.add_all_summary(writer, [ np.mean(true_rewbuffer), np.mean(rewbuffer), np.mean(lenbuffer) ], iters_so_far)
def __init__(self, a_name, env, policy_func, par): # Setup losses and stuff # ---------------------------------------- self.env = env self.timesteps_per_actorbatch = par.timesteps_per_actorbatch self.optim_epochs = par.optim_epochs self.optim_stepsize = par.optim_stepsize self.optim_batchsize = par.optim_batchsize # optimization hypers self.gamma = par.gamma self.lam = par.lam # advantage estimation self.max_timesteps = par.max_timesteps self.max_episodes = par.max_episodes self.max_iters = par.max_iters self.max_seconds = par.max_seconds # time constraint self.callback = par.callback, # you can do anything in the callback, since it takes locals(), globals() self.adam_epsilon = par.adam_epsilon self.schedule = par.schedule # annealing for stepsize parameters (epsilon and adam) self.ob_space = env.observation_space self.ac_space = env.action_space self.pi = policy_func( a_name, self.ob_space, self.ac_space) # Construct network for new policy self.oldpi = policy_func("old" + a_name, self.ob_space, self.ac_space) # Network for old policy self.atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) self.ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return self.lrmult = tf.placeholder( name='lrmult' + a_name, dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule self.clip_param = par.clip_param * self.lrmult # Annealed cliping parameter epislon obname = str('ob' + str(learning_agent.index2)) learning_agent.index2 += 1 self.ob = U.get_placeholder_cached(name=obname) self.ac = self.pi.pdtype.sample_placeholder([None]) self.kloldnew = self.oldpi.pd.kl(self.pi.pd) self.ent = self.pi.pd.entropy() self.meankl = U.mean(self.kloldnew) self.meanent = U.mean(self.ent) self.pol_entpen = (-par.entcoeff) * self.meanent self.ratio = tf.exp( self.pi.pd.logp(self.ac) - self.oldpi.pd.logp(self.ac)) # pnew / pold surr1 = self.ratio * self.atarg # surrogate from conservative policy iteration surr2 = U.clip(self.ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * self.atarg # self.pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) self.vf_loss = U.mean(tf.square(self.pi.vpred - self.ret)) self.total_loss = self.pol_surr + self.pol_entpen + self.vf_loss self.losses = [ self.pol_surr, self.pol_entpen, self.vf_loss, self.meankl, self.meanent ] self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] self.var_list = self.pi.get_trainable_variables() self.lossandgrad = U.function( [self.ob, self.ac, self.atarg, self.ret, self.lrmult], self.losses + [U.flatgrad(self.total_loss, self.var_list)]) self.adam = MpiAdam(self.var_list, epsilon=self.adam_epsilon) self.assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( self.oldpi.get_variables(), self.pi.get_variables()) ]) self.compute_losses = U.function( [self.ob, self.ac, self.atarg, self.ret, self.lrmult], self.losses) print(U.get_session()) U.initialize() self.adam.sync()
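# A small numpy illustration of the pessimistic clipped surrogate built in the class above: for
# positive advantages the probability ratio is capped at 1 + clip_param, for negative advantages
# at 1 - clip_param, and the elementwise minimum of the clipped and unclipped terms keeps the
# objective a lower bound on the unclipped surrogate. Standalone sketch, not the TF graph itself.
import numpy as np

def clipped_surrogate(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(logp_new - logp_old)                        # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))                  # loss to minimize

# e.g. clipped_surrogate(np.array([-1.0, -2.0]), np.array([-1.2, -1.5]), np.array([1.0, -0.5]))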
def learn( # =========== modified part begins =========== # env_id, seed, robot, # robot class with GMM params joint_optimization_iters, # total number of joint optimization iterations design_iters, # number of samples when updating physical design in each joint optimization iteration policy_iters, # number of samples when updating robot policy in each joint optimization iteration # ============ modified part ends ============ # policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # ================================== modification 1 ================================== # """ input: replace "env" (env class) with "env_id" (string) add "seed" (int) reason: to enable env.make() during training modification detail: add following lines into learn() env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) env.seed(seed) env.close() # added at the end of learn() """ import roboschool, gym from baselines import bench env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) env.seed(seed) # ================================== modification 1 ================================== # # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space # policy_func is the initialization of NN # NN structure: # state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value) # num_hid_layers, hid_size: set in the file calls "learn" pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # placeholder for "ob" # created in mlppolicy.py ob = U.get_placeholder_cached(name="ob") # placeholder for "ac" # in common/distribution.py ac = pi.pdtype.sample_placeholder([None]) # KL divergence and Entropy, defined in common/distribution.py kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) # pol_entpen: Entropy Bounus encourages exploration # entcoeff: entropy coefficient, defined in PPO page 5, Equ. (9) pol_entpen = (-entcoeff) * meanent # probability ration, defined in PPO page 3 ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold # Surrogate Goal # defined in PPO page 3, Equ (7) surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) # Value Function Loss: square error loss for ||v_pred - v_target|| vf_loss = U.mean(tf.square(pi.vpred - ret)) # Total_loss = L^CLIP - Value Function Loss + Entropy Bounus # defined in PPO page 5, Equ. 
(9) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) # adam optimizer? adam = MpiAdam(var_list, epsilon=adam_epsilon) # oldpi = pi assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) # Why we need this line? compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # ================================== modification 2 ================================== # for joint_optimization_iter in range(joint_optimization_iters): U.save_state('/home/yetong/Desktop/Project/models/model{}.ckpt'.format( joint_optimization_iter)) logger.log("joint optimization progree: {}/{}".format( joint_optimization_iter, joint_optimization_iters)) # ================================== update physical design ================================== # if joint_optimization_iter > 20: Rewards_plus = np.zeros(design_iters) Rewards_minum = np.zeros(design_iters) params = robot.sample(design_iters, to_update=True) for i, param in enumerate(params): robot.modify_file(param) env = gym.make(env_id) # myenv = env.env # pdb.set_trace() env = bench.Monitor(env, logger.get_dir()) R = episode_generator(pi, env, gamma, stochastic=True) logger.log("\t update physical design: %d/%d, rew: %f" % (i, 2 * design_iters, R)) if i % 2 == 0: Rewards_plus[int(i / 2)] = R else: Rewards_minum[int(i / 2)] = R logger.log("prev_mu: ", robot.params_mu) logger.log("prev_sig: ", robot.params_sig) robot.update(Rewards_plus, Rewards_minum) logger.log("mu: ", robot.params_mu) logger.log("sig: ", robot.params_sig) # ================================== update policy ================================== # # params = robot.sample(design_iters) params = [robot.params_mu] for param in params: # reinitialize env robot.modify_file(param) env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) # ================================== modification 2 ================================== # # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([ max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0 ]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break # annealing for stepsize parameters (epsilon and adam) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max( 1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg[ "vpred"] # 
predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy # oldpi = pi # set old parameter values to new parameter values assign_old_eq_new() logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular( "ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather( lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() # ================================== modification 1 ================================== # env.close()
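# The training loops in this file rely on add_vtarg_and_adv() to fill seg["adv"] and
# seg["tdlamret"]. A minimal numpy sketch of that computation, following the standard GAE(lambda)
# recursion delta_t = r_t + gamma*V(s_{t+1}) - V(s_t), A_t = delta_t + gamma*lambda*A_{t+1} on
# non-terminal steps; argument names here are illustrative, not the original signature.
import numpy as np

def gae_advantages(rew, vpred, new, nextvpred, gamma, lam):
    # rew, vpred, new are length-T arrays; new[t] marks the start of a new episode;
    # nextvpred is the value estimate for the state following the batch.
    T = len(rew)
    vpred_ext = np.append(vpred, nextvpred)
    new_ext = np.append(new, 0)
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new_ext[t + 1]
        delta = rew[t] + gamma * vpred_ext[t + 1] * nonterminal - vpred_ext[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    tdlamret = adv + vpred        # lambda-return targets for the value function
    return adv, tdlamret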
def learn(env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # 
predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular()
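# The "ev_tdlam_before" diagnostic logged above comes from explained_variance(vpredbefore,
# tdlamret), a helper not shown in this file. A minimal sketch of that statistic as commonly
# implemented: 1 means the value function explains the returns perfectly, 0 means it does no
# better than a constant, and negative values mean it does worse.
import numpy as np

def explained_variance_1d(ypred, y):
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary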
def learn(env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint noisy_nets=False, callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) desired_kl=0.02, logdir=".", agentName="PPO-Agent", resume = 0, num_parallel=1, num_cpu=1 ): # Setup losses and stuff # ---------------------------------------- rank = MPI.COMM_WORLD.Get_rank() ob_space = env.observation_space ac_space = env.action_space ob_size = ob_space.shape[0] ac_size = ac_space.shape[0] #print("rank = " + str(rank) + " ob_space = "+str(ob_space.shape) + " ac_space = "+str(ac_space.shape)) #exit(0) pi = policy_func("pi", ob_space, ac_space, noisy_nets) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space, noisy_nets) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vfloss1 = tf.square(pi.vpred - ret) vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred, -clip_param, clip_param) vfloss2 = tf.square(vpredclipped - ret) vf_loss = .5 * U.mean(tf.maximum(vfloss1, vfloss2)) # we do the same clipping-based trust region for the value function #vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- if noisy_nets: stochastic = False else: stochastic = True seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=stochastic, num_parallel=num_parallel, num_cpu=num_cpu, rank=rank, ob_size=ob_size, ac_size=ac_size,com=MPI.COMM_WORLD) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards saver = tf.train.Saver() if resume > 0: saver.restore(tf.get_default_session(), 
os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) iters_so_far = resume assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" logF = open(os.path.join(logdir, 'log.txt'), 'a') logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a') dump_training = 0 learn_from_training = 0 if dump_training: if os.path.exists(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl'): with open(logdir + "\\" +'ob_list_' + str(rank) + '.pkl', 'rb') as f: ob_list = pickle.load(f) else: ob_list = [] # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std saverRMS = tf.train.Saver({"_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count}) saverRMS.save(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf")) ob_np_a = np.asarray(ob_list) ob_np = np.reshape(ob_np_a, (-1,ob_size)) [vpred, pdparam] = pi._vpred_pdparam(ob_np) print("vpred = " + str(vpred)) print("pd_param = " + str(pdparam)) with open('training.pkl', 'wb') as f: pickle.dump(ob_np, f) pickle.dump(vpred, f) pickle.dump(pdparam, f) exit(0) if learn_from_training: # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std with open('training.pkl', 'rb') as f: ob_np = pickle.load(f) vpred = pickle.load(f) pdparam = pickle.load(f) num = ob_np.shape[0] for i in range(num): xp = ob_np[i][1] ob_np[i][1] = 0.0 ob_np[i][18] -= xp ob_np[i][22] -= xp ob_np[i][24] -= xp ob_np[i][26] -= xp ob_np[i][28] -= xp ob_np[i][30] -= xp ob_np[i][32] -= xp ob_np[i][34] -= xp print("ob_np = " + str(ob_np)) print("vpred = " + str(vpred)) print("pdparam = " + str(pdparam)) batch_size = 128 y_vpred = tf.placeholder(tf.float32, [batch_size, ]) y_pdparam = tf.placeholder(tf.float32, [batch_size, pdparam.shape[1]]) vpred_loss = U.mean(tf.square(pi.vpred - y_vpred)) vpdparam_loss = U.mean(tf.square(pi.pdparam - y_pdparam)) total_train_loss = vpred_loss + vpdparam_loss #total_train_loss = vpdparam_loss #total_train_loss = vpred_loss #coef = 0.01 #dense_all = U.dense_all #for a in dense_all: # total_train_loss += coef * tf.nn.l2_loss(a) #total_train_loss = vpdparam_loss optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(total_train_loss) d = Dataset(dict(ob=ob_np, vpred=vpred, pdparam=pdparam), shuffle=not pi.recurrent) sess = tf.get_default_session() sess.run(tf.global_variables_initializer()) saverRMS = tf.train.Saver({"_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count}) saverRMS.restore(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf")) if resume > 0: saver.restore(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) for q in range(100): sumLoss = 0 for batch in d.iterate_once(batch_size): tl, _ = sess.run([total_train_loss, optimizer], feed_dict={pi.ob: batch["ob"], y_vpred: batch["vpred"], y_pdparam:batch["pdparam"]}) sumLoss += tl print("Iteration " + str(q)+ " Loss = " + str(sumLoss)) assign_old_eq_new() # set old parameter values to new parameter values # Save as frame 1 try: saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=1) except: pass #exit(0) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'adaptive' or 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult 
= max(1.0 - float(timesteps_so_far) / max_timesteps, 0.0) elif schedule == 'linear_clipped': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0.2) elif schedule == 'cyclic': # cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) raise NotImplementedError else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam, timesteps_per_batch, num_parallel, num_cpu) #print(" ob= " + str(seg["ob"])+ " rew= " + str(seg["rew"])+ " vpred= " + str(seg["vpred"])+ " new= " + str(seg["new"])+ " ac= " + str(seg["ac"])+ " prevac= " + str(seg["prevac"])+ " nextvpred= " + str(seg["nextvpred"])+ " ep_rets= " + str(seg["ep_rets"])+ " ep_lens= " + str(seg["ep_lens"])) #exit(0) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] if dump_training: ob_list.append(ob.tolist()) vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) if desired_kl != None and schedule == 'adaptive': if newlosses[-2] > desired_kl * 2.0: optim_stepsize = max(1e-8, optim_stepsize / 1.5) print('kl divergence was too large = ', newlosses[-2]) print('New optim_stepsize = ', optim_stepsize) elif newlosses[-2] < desired_kl / 2.0: optim_stepsize = min(1e0, optim_stepsize * 1.5) print('kl divergence was too small = ', newlosses[-2]) print('New optim_stepsize = ', optim_stepsize) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) #print(str(losses)) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) rewmean = np.mean(rewbuffer) logger.record_tabular("EpRewMean", rewmean) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if dump_training: with open(logdir + "\\" + 
'ob_list_' + str(rank) + '.pkl', 'wb') as f: pickle.dump(ob_list, f) if MPI.COMM_WORLD.Get_rank()==0: logF.write(str(rewmean) + "\n") logStats.write(logger.get_str() + "\n") logF.flush() logStats.flush() logger.dump_tabular() try: os.remove(logdir + "/checkpoint") except OSError: pass try: saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=iters_so_far) except: pass
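# A minimal sketch of the adaptive step-size rule applied above when schedule == 'adaptive':
# shrink the Adam step when the measured KL exceeds twice the target and grow it when the KL
# falls below half the target. Note that the branch written as
# `if schedule == 'adaptive' or 'constant':` is always truthy in Python, so the linear branches
# are unreachable as written; a membership test such as `schedule in ('adaptive', 'constant')`
# expresses the intended check. Bounds and growth factor below mirror the values used above.
def adapt_stepsize(optim_stepsize, measured_kl, desired_kl, lo=1e-8, hi=1.0, factor=1.5):
    if desired_kl is None:
        return optim_stepsize
    if measured_kl > desired_kl * 2.0:
        return max(lo, optim_stepsize / factor)   # KL too large -> take smaller steps
    if measured_kl < desired_kl / 2.0:
        return min(hi, optim_stepsize * factor)   # KL too small -> take larger steps
    return optim_stepsize

# e.g. inside the minibatch loop, where the KL is the second-to-last entry of newlosses:
# optim_stepsize = adapt_stepsize(optim_stepsize, newlosses[-2], desired_kl)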
def learn( env, policy_func, disc, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) logdir=".", agentName="PPO-Agent", resume=0, num_parallel=0, num_cpu=1, num_extra=0, gan_batch_size=128, gan_num_epochs=5, gan_display_step=40, resume_disc=0, resume_non_disc=0, mocap_path="", gan_replay_buffer_size=1000000, gan_prob_to_put_in_replay=0.01, gan_reward_to_retrain_discriminator=5, use_distance=0, use_blend=0): # Deal with GAN if not use_distance: replay_buf = MyReplayBuffer(gan_replay_buffer_size) data = np.loadtxt( mocap_path + ".dat" ) #"D:/p4sw/devrel/libdev/flex/dev/rbd/data/bvh/motion_simple.dat"); label = np.concatenate((np.ones( (data.shape[0], 1)), np.zeros((data.shape[0], 1))), axis=1) print("Real data label = " + str(label)) mocap_set = Dataset(dict(data=data, label=label), shuffle=True) # Setup losses and stuff # ---------------------------------------- rank = MPI.COMM_WORLD.Get_rank() ob_space = env.observation_space ac_space = env.action_space ob_size = ob_space.shape[0] ac_size = ac_space.shape[0] #print("rank = " + str(rank) + " ob_space = "+str(ob_space.shape) + " ac_space = "+str(ac_space.shape)) #exit(0) pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vfloss1 = tf.square(pi.vpred - ret) vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred, -clip_param, clip_param) vfloss2 = tf.square(vpredclipped - ret) vf_loss = .5 * U.mean( tf.maximum(vfloss1, vfloss2) ) # we do the same clipping-based trust region for the value function #vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # 
---------------------------------------- sess = tf.get_default_session() avars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) non_disc_vars = [ a for a in avars if not a.name.split("/")[0].startswith("discriminator") ] disc_vars = [ a for a in avars if a.name.split("/")[0].startswith("discriminator") ] #print(str(non_disc_names)) #print(str(disc_names)) #exit(0) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards disc_saver = tf.train.Saver(disc_vars, max_to_keep=None) non_disc_saver = tf.train.Saver(non_disc_vars, max_to_keep=None) saver = tf.train.Saver(max_to_keep=None) if resume > 0: saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) if not use_distance: if os.path.exists(logdir + "\\" + 'replay_buf_' + str(int(resume / 100) * 100) + '.pkl'): print("Load replay buf") with open( logdir + "\\" + 'replay_buf_' + str(int(resume / 100) * 100) + '.pkl', 'rb') as f: replay_buf = pickle.load(f) else: print("Can't load replay buf " + logdir + "\\" + 'replay_buf_' + str(int(resume / 100) * 100) + '.pkl') iters_so_far = resume if resume_non_disc > 0: non_disc_saver.restore( tf.get_default_session(), os.path.join( os.path.abspath(logdir), "{}-{}".format(agentName + "_non_disc", resume_non_disc))) iters_so_far = resume_non_disc if use_distance: print("Use distance") nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(data) else: nn = None seg_gen = traj_segment_generator(pi, env, disc, timesteps_per_batch, stochastic=True, num_parallel=num_parallel, num_cpu=num_cpu, rank=rank, ob_size=ob_size, ac_size=ac_size, com=MPI.COMM_WORLD, num_extra=num_extra, iters_so_far=iters_so_far, use_distance=use_distance, nn=nn) if resume_disc > 0: disc_saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName + "_disc", resume_disc))) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" logF = open(logdir + "\\" + 'log.txt', 'a') logR = open(logdir + "\\" + 'log_rew.txt', 'a') logStats = open(logdir + "\\" + 'log_stats.txt', 'a') if os.path.exists(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl'): with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'rb') as f: ob_list = pickle.load(f) else: ob_list = [] dump_training = 0 learn_from_training = 0 if dump_training: # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std saverRMS = tf.train.Saver({ "_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count }) saverRMS.save(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf")) ob_np_a = np.asarray(ob_list) ob_np = np.reshape(ob_np_a, (-1, ob_size)) [vpred, pdparam] = pi._vpred_pdparam(ob_np) print("vpred = " + str(vpred)) print("pd_param = " + str(pdparam)) with open('training.pkl', 'wb') as f: pickle.dump(ob_np, f) pickle.dump(vpred, f) pickle.dump(pdparam, f) exit(0) if learn_from_training: # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std with open('training.pkl', 'rb') as f: ob_np = pickle.load(f) vpred = pickle.load(f) pdparam = pickle.load(f) num = ob_np.shape[0] for i in range(num): xp = ob_np[i][1] ob_np[i][1] = 0.0 ob_np[i][18] -= xp ob_np[i][22] -= xp ob_np[i][24] -= xp ob_np[i][26] -= xp ob_np[i][28] -= xp ob_np[i][30] -= xp ob_np[i][32] -= xp ob_np[i][34] -= xp print("ob_np = " + str(ob_np)) print("vpred = " + 
str(vpred)) print("pdparam = " + str(pdparam)) batch_size = 128 y_vpred = tf.placeholder(tf.float32, [ batch_size, ]) y_pdparam = tf.placeholder(tf.float32, [batch_size, pdparam.shape[1]]) vpred_loss = U.mean(tf.square(pi.vpred - y_vpred)) vpdparam_loss = U.mean(tf.square(pi.pdparam - y_pdparam)) total_train_loss = vpred_loss + vpdparam_loss #total_train_loss = vpdparam_loss #total_train_loss = vpred_loss #coef = 0.01 #dense_all = U.dense_all #for a in dense_all: # total_train_loss += coef * tf.nn.l2_loss(a) #total_train_loss = vpdparam_loss optimizer = tf.train.AdamOptimizer( learning_rate=0.001).minimize(total_train_loss) d = Dataset(dict(ob=ob_np, vpred=vpred, pdparam=pdparam), shuffle=not pi.recurrent) sess = tf.get_default_session() sess.run(tf.global_variables_initializer()) saverRMS = tf.train.Saver({ "_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count }) saverRMS.restore(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf")) if resume > 0: saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) for q in range(100): sumLoss = 0 for batch in d.iterate_once(batch_size): tl, _ = sess.run( [total_train_loss, optimizer], feed_dict={ pi.ob: batch["ob"], y_vpred: batch["vpred"], y_pdparam: batch["pdparam"] }) sumLoss += tl print("Iteration " + str(q) + " Loss = " + str(sumLoss)) assign_old_eq_new() # set old parameter values to new parameter values # Save as frame 1 try: saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=1) except: pass #exit(0) if resume > 0: firstTime = False else: firstTime = True # Check accuracy #amocap = sess.run([disc.accuracy], # feed_dict={disc.input: data, # disc.label: label}) #print("Mocap accuracy = " + str(amocap)) #print("Mocap label is " + str(label)) #adata = np.array(replay_buf._storage) #print("adata shape = " + str(adata.shape)) #alabel = np.concatenate((np.zeros((adata.shape[0], 1)), np.ones((adata.shape[0], 1))), axis=1) #areplay = sess.run([disc.accuracy], # feed_dict={disc.input: adata, # disc.label: alabel}) #print("Replay accuracy = " + str(areplay)) #print("Replay label is " + str(alabel)) #exit(0) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam, timesteps_per_batch, num_parallel, num_cpu) #print(" ob= " + str(seg["ob"])+ " rew= " + str(seg["rew"])+ " vpred= " + str(seg["vpred"])+ " new= " + str(seg["new"])+ " ac= " + str(seg["ac"])+ " prevac= " + str(seg["prevac"])+ " nextvpred= " + str(seg["nextvpred"])+ " ep_rets= " + str(seg["ep_rets"])+ " ep_lens= " + str(seg["ep_lens"])) #exit(0) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret, extra = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"], seg["extra"] #ob_list.append(ob.tolist()) vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, 
atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) #print(str(losses)) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) rewmean = np.mean(rewbuffer) logger.record_tabular("EpRewMean", rewmean) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) # Train discriminator if not use_distance: print("Put in replay buf " + str((int)(gan_prob_to_put_in_replay * extra.shape[0] + 1))) replay_buf.add(extra[np.random.choice( extra.shape[0], (int)(gan_prob_to_put_in_replay * extra.shape[0] + 1), replace=True)]) #if iters_so_far == 1: if not use_blend: if firstTime: firstTime = False # Train with everything we got lb = np.concatenate((np.zeros( (extra.shape[0], 1)), np.ones((extra.shape[0], 1))), axis=1) extra_set = Dataset(dict(data=extra, label=lb), shuffle=True) for e in range(10): i = 0 for mbatch in mocap_set.iterate_once(gan_batch_size): batch = extra_set.next_batch(gan_batch_size) _, l = sess.run( [disc.optimizer_first, disc.loss], feed_dict={ disc.input: np.concatenate( (mbatch['data'], batch['data'])), disc.label: np.concatenate( (mbatch['label'], batch['label'])) }) i = i + 1 # Display logs per step if i % gan_display_step == 0 or i == 1: print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) if seg['mean_ext_rew'] > gan_reward_to_retrain_discriminator: for e in range(gan_num_epochs): i = 0 for mbatch in mocap_set.iterate_once(gan_batch_size): data = replay_buf.sample(mbatch['data'].shape[0]) lb = np.concatenate((np.zeros( (data.shape[0], 1)), np.ones( (data.shape[0], 1))), axis=1) _, l = sess.run( [disc.optimizer, disc.loss], feed_dict={ disc.input: np.concatenate((mbatch['data'], data)), disc.label: np.concatenate((mbatch['label'], lb)) }) i = i + 1 # Display logs per step if i % gan_display_step == 0 or i == 1: print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, 
i, l)) print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) else: if firstTime: firstTime = False # Train with everything we got extra_set = Dataset(dict(data=extra), shuffle=True) for e in range(10): i = 0 for mbatch in mocap_set.iterate_once(gan_batch_size): batch = extra_set.next_batch(gan_batch_size) bf = np.random.uniform(0, 1, (gan_batch_size, 1)) onembf = 1 - bf my_label = np.concatenate((bf, onembf), axis=1) my_data = np.multiply(mbatch['data'], bf) + np.multiply( batch['data'], onembf) _, l = sess.run([disc.optimizer_first, disc.loss], feed_dict={ disc.input: my_data, disc.label: my_label }) i = i + 1 # Display logs per step if i % gan_display_step == 0 or i == 1: print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) if seg['mean_ext_rew'] > gan_reward_to_retrain_discriminator: for e in range(gan_num_epochs): i = 0 for mbatch in mocap_set.iterate_once(gan_batch_size): data = replay_buf.sample(mbatch['data'].shape[0]) bf = np.random.uniform(0, 1, (gan_batch_size, 1)) onembf = 1 - bf my_label = np.concatenate((bf, onembf), axis=1) my_data = np.multiply(mbatch['data'], bf) + np.multiply( data, onembf) _, l = sess.run([disc.optimizer_first, disc.loss], feed_dict={ disc.input: my_data, disc.label: my_label }) i = i + 1 # Display logs per step if i % gan_display_step == 0 or i == 1: print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) # if True: # lb = np.concatenate((np.zeros((extra.shape[0],1)),np.ones((extra.shape[0],1))),axis=1) # extra_set = Dataset(dict(data=extra,label=lb), shuffle=True) # num_r = 1 # if iters_so_far == 1: # num_r = gan_num_epochs # for e in range(num_r): # i = 0 # for batch in extra_set.iterate_once(gan_batch_size): # mbatch = mocap_set.next_batch(gan_batch_size) # _, l = sess.run([disc.optimizer, disc.loss], feed_dict={disc.input: np.concatenate((mbatch['data'],batch['data'])), disc.label: np.concatenate((mbatch['label'],batch['label']))}) # i = i + 1 # # Display logs per step # if i % gan_display_step == 0 or i == 1: # print('discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) # print('discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) if not use_distance: if iters_so_far % 100 == 0: with open( logdir + "\\" + 'replay_buf_' + str(iters_so_far) + '.pkl', 'wb') as f: pickle.dump(replay_buf, f) with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'wb') as f: pickle.dump(ob_list, f) if MPI.COMM_WORLD.Get_rank() == 0: logF.write(str(rewmean) + "\n") logR.write(str(seg['mean_ext_rew']) + "\n") logStats.write(logger.get_str() + "\n") logF.flush() logStats.flush() logR.flush() logger.dump_tabular() try: os.remove(logdir + "/checkpoint") except OSError: pass try: saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=iters_so_far) except: pass try: non_disc_saver.save(tf.get_default_session(), os.path.join(logdir, agentName + "_non_disc"), global_step=iters_so_far) except: pass try: disc_saver.save(tf.get_default_session(), os.path.join(logdir, agentName + "_disc"), global_step=iters_so_far) except: pass
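# A minimal TF1 sketch of the kind of discriminator object assumed by `disc` above: a two-class
# softmax classifier over feature vectors, trained with [1, 0] labels for real mocap frames and
# [0, 1] labels for policy-generated samples (or soft blend labels in the use_blend branch).
# Layer sizes, learning rate, and the single optimizer here are placeholders, not the original
# network; only the scope name matches the "discriminator" variable filter used above.
import tensorflow as tf

class SimpleDiscriminator:
    def __init__(self, feat_dim, hidden=64, lr=1e-4):
        with tf.variable_scope("discriminator"):
            self.input = tf.placeholder(tf.float32, [None, feat_dim])
            self.label = tf.placeholder(tf.float32, [None, 2])
            h = tf.layers.dense(self.input, hidden, activation=tf.nn.relu)
            logits = tf.layers.dense(h, 2)
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.label,
                                                           logits=logits))
            self.optimizer = tf.train.AdamOptimizer(lr).minimize(self.loss)

# e.g. _, l = sess.run([disc.optimizer, disc.loss],
#                      feed_dict={disc.input: batch_data, disc.label: batch_label})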
def learn(env, policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- #seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: data_path = '/Users/wjh720/Desktop/Tmp/para_%i/' % (timesteps_per_actorbatch / 100) U.load_state(data_path + 'para') test(pi, env, timesteps_per_actorbatch, stochastic=True)
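# The evaluation loop above loads a checkpoint and calls test(pi, env, ...), which is not shown
# in this file. A minimal sketch of what such an evaluation helper might look like, assuming
# pi.act() returns (action, value_prediction) as in the rollout generators used elsewhere here;
# the function name and return value are assumptions.
import numpy as np

def test(pi, env, max_steps, stochastic=True):
    ob = env.reset()
    ep_rew, ep_rews = 0.0, []
    for _ in range(max_steps):
        ac, _vpred = pi.act(stochastic, ob)
        ob, rew, done, _info = env.step(ac)
        ep_rew += rew
        if done:
            ep_rews.append(ep_rew)
            ep_rew = 0.0
            ob = env.reset()
    return np.mean(ep_rews) if ep_rews else ep_rew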
def render_evaluate(env, policy_func, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.compute_kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out # set up saver sess = tf.get_default_session() saver = tf.train.Saver() U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) print("loading pretrained model") saver.restore(sess, callback.model_dir) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters>0, 
max_timesteps>0, max_episodes>0])==1 import gym env = gym.make('Ant-v1') if True: obsall = [] for _ in range(50): obs = [] done = False ob = env.reset() #env.render() stochastic = 1 obs.append(env.unwrapped.get_body_com('torso')[:2].copy()) while not done: ac, vpred = pi.act(stochastic, ob) ob, rew, done, _ = env.step(ac) #env.render() obs.append(env.unwrapped.get_body_com('torso')[:2].copy()) obsall.append(obs) if rank==0: logger.dump_tabular() if callback is not None: callback(locals(), globals())
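A cleaned-up sketch of the evaluation rollout above, assuming the same old-style Gym API and a policy whose `act(stochastic, ob)` returns `(action, vpred)`:

import gym

def rollout_torso_positions(pi, env_id='Ant-v1', episodes=50, stochastic=True):
    env = gym.make(env_id)
    trajectories = []
    for _ in range(episodes):
        obs, done, ob = [], False, env.reset()
        obs.append(env.unwrapped.get_body_com('torso')[:2].copy())
        while not done:
            ac, _ = pi.act(stochastic, ob)
            ob, _, done, _ = env.step(ac)
            obs.append(env.unwrapped.get_body_com('torso')[:2].copy())
        trajectories.append(obs)
    return trajectories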
def learn( env, policy_func, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) load_model=None, action_bias=0.4, action_repeat=0, action_repeat_rand=False, warmup_frames=0, target_kl=0.01, vf_loss_mult=1, vfloss_optim_stepsize=0.003, vfloss_optim_batchsize=8, vfloss_optim_epochs=10): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule # Not sure why they anneal clip and learning rate with the same parameter #clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen losses = [pol_surr, pol_entpen, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) lossandgrad_vfloss = U.function([ob, ac, atarg, ret], [vf_loss] + [U.flatgrad(vf_loss, var_list)]) adam_vfloss = MpiAdam(var_list, epsilon=adam_epsilon) compute_vfloss = U.function([ob, ac, atarg, ret], [vf_loss]) U.initialize() adam.sync() adam_vfloss.sync() if load_model: logger.log('Loading model: %s' % load_model) pi.load(load_model) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, action_bias=action_bias, action_repeat=action_repeat, action_repeat_rand=action_repeat_rand, warmup_frames=warmup_frames) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" ep_rew_file = None if MPI.COMM_WORLD.Get_rank() == 0: import wandb ep_rew_file = 
open( os.path.join(wandb.run.dir, 'episode_rewards.jsonl'), 'w') checkpoint_dir = 'checkpoints-%s' % wandb.run.id os.mkdir(checkpoint_dir) cur_lrmult = 1.0 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) elif schedule == 'target_kl': pass else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.next() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): result = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) newlosses = result[:-1] g = result[-1] adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) # vfloss optimize logger.log("Optimizing value function...") logger.log(fmt_row(13, ['vf'])) for _ in range(vfloss_optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(vfloss_optim_batchsize): result = lossandgrad_vfloss(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"]) newlosses = result[:-1] g = result[-1] adam_vfloss.update(g, vfloss_optim_stepsize) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) newlosses += compute_vfloss(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"]) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names + ['vf']): logger.record_tabular("loss_" + name, lossval) # check kl if schedule == 'target_kl': if meanlosses[2] > target_kl * 1.1: cur_lrmult /= 1.5 elif meanlosses[2] < target_kl / 1.1: cur_lrmult *= 1.5 logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) if rewbuffer: logger.record_tabular('CurLrMult', cur_lrmult) logger.record_tabular('StepSize', optim_stepsize * cur_lrmult) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) 
logger.record_tabular("EpRewMax", np.max(rewbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpRewMin", np.min(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) time_elapsed = time.time() - tstart logger.record_tabular("TimeElapsed", time_elapsed) if MPI.COMM_WORLD.Get_rank() == 0: import wandb ep_rew_file.write('%s\n' % json.dumps({ 'TimeElapsed': time_elapsed, 'Rewards': rews })) ep_rew_file.flush() data = logger.Logger.CURRENT.name2val wandb.run.history.add(data) summary_data = {} for k, v in data.iteritems(): if 'Rew' in k: summary_data[k] = v wandb.run.summary.update(summary_data) pi.save( os.path.join(checkpoint_dir, 'model-%s.ckpt' % (iters_so_far - 1))) logger.dump_tabular() else: logger.log('No episodes complete yet')
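The `target_kl` schedule above adjusts the learning-rate multiplier after every iteration based on the measured KL (`meanlosses[2]`). As a standalone sketch of that rule:

def adapt_lrmult(cur_lrmult, measured_kl, target_kl, factor=1.5, tol=1.1):
    # Shrink the step when KL overshoots the target, grow it when KL undershoots.
    if measured_kl > target_kl * tol:
        return cur_lrmult / factor
    if measured_kl < target_kl / tol:
        return cur_lrmult * factor
    return cur_lrmult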
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 rollouts_time = 0 optimization_time = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) a = time.time() seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = 
seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values b = time.time() logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) grad_time = 0.0 allreduce_time = 0.0 # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): aa = time.time() *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) bb = time.time() adam.update(g, optim_stepsize * cur_lrmult) cc = time.time() grad_time += bb - aa allreduce_time += cc - bb losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("GradTime", grad_time) logger.record_tabular("AllReduceTime", allreduce_time) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) c = time.time() rollouts_time += (b - a) optimization_time += (c - b) logger.record_tabular("RolloutsTime", rollouts_time) logger.record_tabular("OptimizationTime", optimization_time) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular()
def __init__(self, env, policy, emb_network, emb_size, clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint adam_epsilon=1e-5, schedule='constant', joint_training=False ): # Setup variables self.optim_epochs = optim_epochs self.optim_stepsize = optim_stepsize self.optim_batchsize = optim_batchsize self.gamma = gamma self.lam = lam self.max_timesteps = max_timesteps self.adam_epsilon = adam_epsilon self.schedule = schedule # Setup losses and stuff # ---------------------------------------- with tf.name_scope('ppo'): ob_space = env.observation_space ac_space = env.action_space self.pi = policy # Construct network for new policy oldpi = Policy("old_policy", env.action_space, joint_training, emb_size, emb_network) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) if joint_training: ob = U.get_placeholder_cached(name="ob_f") else: ob = U.get_placeholder_cached(name="ob") ac = self.pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(self.pi.pd) ent = self.pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(self.pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(self.pi.vpred - ret)) self.total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = self.pi.get_trainable_variables() self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(self.total_loss, var_list)]) self.adam = MpiAdam(var_list, epsilon=adam_epsilon) self.assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), self.pi.get_variables())]) self.compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() self.adam.sync() # Prepare for rollouts # ---------------------------------------- self.episodes_so_far = 0 self.timesteps_so_far = 0 self.iters_so_far = 0 self.tstart = time.time() self.lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths self.rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards
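The `assign_old_eq_new` pattern built in this constructor (and in the `learn` functions above) snapshots the current policy into the `oldpi` network before each optimization phase, so the probability ratio is always measured against the pre-update policy. A minimal sketch with hypothetical variable lists and session handle:

import tensorflow as tf

def make_assign_old_eq_new(oldpi_vars, pi_vars, sess):
    # Build the copy ops once; call the returned function before each update phase.
    ops = [tf.assign(oldv, newv) for oldv, newv in zip(oldpi_vars, pi_vars)]
    return lambda: sess.run(ops)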
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) sym_loss_weight=0.0, return_threshold=None, # termiante learning if reaches return_threshold op_after_init=None, init_policy_params=None, policy_scope=None, max_threshold=None, positive_rew_enforce=False, reward_drop_bound=None, min_iters=0, ref_policy_params=None, rollout_length_thershold=None): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space if policy_scope is None: pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy else: pi = policy_func(policy_scope, ob_space, ac_space) # Construct network for new policy oldpi = policy_func("old" + policy_scope, ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent sym_loss = sym_loss_weight * U.mean( tf.square(pi.mean - pi.mirrored_mean)) # mirror symmetric loss ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) + sym_loss # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent, sym_loss] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent", "sym_loss"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() if init_policy_params is not None: cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name. 
find('/')] orig_scope = list(init_policy_params.keys() )[0][0:list(init_policy_params.keys())[0].find('/')] for i in range(len(pi.get_variables())): assign_op = pi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) assign_op = oldpi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) if ref_policy_params is not None: ref_pi = policy_func("ref_pi", ob_space, ac_space) cur_scope = ref_pi.get_variables()[0].name[0:ref_pi.get_variables()[0]. name.find('/')] orig_scope = list(ref_policy_params.keys() )[0][0:list(ref_policy_params.keys())[0].find('/')] for i in range(len(ref_pi.get_variables())): assign_op = ref_pi.get_variables()[i].assign( ref_policy_params[ref_pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) env.env.env.ref_policy = ref_pi adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" max_thres_satisfied = max_threshold is None adjust_ratio = 0.0 prev_avg_rew = -1000000 revert_parameters = {} variables = pi.get_variables() for i in range(len(variables)): cur_val = variables[i].eval() revert_parameters[variables[i].name] = cur_val revert_data = [0, 0, 0] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() if reward_drop_bound is not None: lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) revert_iteration = False if np.mean( rewbuffer ) < prev_avg_rew - reward_drop_bound: # detect significant drop in performance, revert to previous iteration print("Revert Iteration!!!!!") revert_iteration = True else: prev_avg_rew = np.mean(rewbuffer) logger.record_tabular("Revert Rew", prev_avg_rew) if revert_iteration: # revert iteration for i in range(len(pi.get_variables())): assign_op = pi.get_variables()[i].assign( revert_parameters[pi.get_variables()[i].name]) U.get_session().run(assign_op) episodes_so_far = revert_data[0] timesteps_so_far = revert_data[1] iters_so_far = revert_data[2] continue else: variables = pi.get_variables() for i in range(len(variables)): cur_val = variables[i].eval() revert_parameters[variables[i].name] = np.copy(cur_val) revert_data[0] = episodes_so_far revert_data[1] = timesteps_so_far revert_data[2] = iters_so_far if positive_rew_enforce: rewlocal = (seg["pos_rews"], seg["neg_pens"], seg["rew"] ) # local values listofrews = 
MPI.COMM_WORLD.allgather(rewlocal) # list of tuples pos_rews, neg_pens, rews = map(flatten_lists, zip(*listofrews)) if np.mean(rews) < 0.0: #min_id = np.argmin(rews) #adjust_ratio = pos_rews[min_id]/np.abs(neg_pens[min_id]) adjust_ratio = np.max([ adjust_ratio, np.mean(pos_rews) / np.abs(np.mean(neg_pens)) ]) for i in range(len(seg["rew"])): if np.abs(seg["rew"][i] - seg["pos_rews"][i] - seg["neg_pens"][i]) > 1e-5: print(seg["rew"][i], seg["pos_rews"][i], seg["neg_pens"][i]) print('Reward wrong!') abc seg["rew"][i] = seg["pos_rews"][ i] + seg["neg_pens"][i] * adjust_ratio add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) if reward_drop_bound is None: lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("Iter", iters_so_far) if positive_rew_enforce: if adjust_ratio is not None: logger.record_tabular("RewardAdjustRatio", adjust_ratio) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() if max_threshold is not None: print('Current max return: ', np.max(rewbuffer)) if np.max(rewbuffer) > max_threshold: max_thres_satisfied = True else: max_thres_satisfied = False return_threshold_satisfied = True if return_threshold is not None: if not (np.mean(rewbuffer) > return_threshold and iters_so_far > min_iters): return_threshold_satisfied = False rollout_length_thershold_satisfied = True if rollout_length_thershold is not None: rewlocal = (seg["avg_vels"], seg["rew"]) 
# local values listofrews = MPI.COMM_WORLD.allgather(rewlocal) # list of tuples avg_vels, rews = map(flatten_lists, zip(*listofrews)) if not (np.mean(lenbuffer) > rollout_length_thershold and np.mean(avg_vels) > 0.5 * env.env.env.final_tv): rollout_length_thershold_satisfied = False if rollout_length_thershold is not None or return_threshold is not None: if rollout_length_thershold_satisfied and return_threshold_satisfied: break return pi, np.mean(rewbuffer)
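The `reward_drop_bound` logic above snapshots the policy parameters every iteration and rolls them back when the rolling mean reward drops by more than the bound. A minimal sketch with hypothetical `pi`/`sess` handles (a long run would cache the assign ops instead of rebuilding them on each revert):

import numpy as np

def snapshot_params(pi, sess):
    return {v.name: sess.run(v) for v in pi.get_variables()}

def maybe_revert(pi, sess, snapshot, rewbuffer, prev_avg_rew, drop_bound):
    # Returns (reverted, updated prev_avg_rew).
    avg = float(np.mean(rewbuffer))
    if avg < prev_avg_rew - drop_bound:
        for v in pi.get_variables():
            sess.run(v.assign(snapshot[v.name]))        # restore previous parameters
        return True, prev_avg_rew
    return False, avg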
def learn(env, policy_func, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes 
and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular()
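The TRPO step above solves F x = g with conjugate gradient, where the matrix-vector product is supplied by `fisher_vector_product` (the allreduced Fisher-vector product plus `cg_damping * p`). A standard CG sketch of the kind the `cg(...)` call is assumed to implement:

import numpy as np

def conjugate_gradient(f_Ax, b, iters=10, residual_tol=1e-10):
    x = np.zeros_like(b)
    r = b.copy()                 # residual b - A x, with x = 0 initially
    p = r.copy()                 # search direction
    rdotr = r.dot(r)
    for _ in range(iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x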
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update log_every=None, log_dir=None, episodes_so_far=0, timesteps_so_far=0, iters_so_far=0, clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) **kwargs): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy # Target advantage function (if applicable) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return # learning rate multiplier, updated with schedule lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- # GRASPING saver = tf.train.Saver(var_list=U.ALREADY_INITIALIZED, max_to_keep=1) checkpoint = tf.train.latest_checkpoint(log_dir) if checkpoint: print("Restoring checkpoint: {}".format(checkpoint)) saver.restore(U.get_session(), checkpoint) if hasattr(env, "set_actor"): def actor(obs): return pi.act(False, obs)[0] env.set_actor(actor) if not checkpoint and hasattr(env, "warm_init_eps"): pretrain(pi, env) saver.save(U.get_session(), osp.join(log_dir, "model")) # /GRASPING seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) tstart = time.time() assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) should_break = False if max_timesteps and timesteps_so_far >= max_timesteps: should_break = True elif max_episodes and episodes_so_far >= max_episodes: should_break = True elif max_iters and iters_so_far >= max_iters: should_break = True elif max_seconds and time.time() - tstart >= max_seconds: should_break = True if log_every and 
log_dir: if (iters_so_far + 1) % log_every == 0 or should_break: # To reduce space, don't specify global step. saver.save(U.get_session(), osp.join(log_dir, "model")) job_info = { 'episodes_so_far': episodes_so_far, 'iters_so_far': iters_so_far, 'timesteps_so_far': timesteps_so_far } with open(osp.join(log_dir, "job_info_new.yaml"), 'w') as file: yaml.dump(job_info, file, default_flow_style=False) # Make sure write is instantaneous. file.flush() os.fsync(file) os.rename(osp.join(log_dir, "job_info_new.yaml"), osp.join(log_dir, "job_info.yaml")) if should_break: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / ( atarg.std() + 1e-10) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values # logger.log("Optimizing...") # logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) # logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) logger.record_tabular("EpLenMean", np.mean(lens)) logger.record_tabular("EpRewMean", np.mean(rews)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular()
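The checkpointing above writes resume metadata to `job_info_new.yaml`, flushes and fsyncs it, then renames it over `job_info.yaml`, so a crash never leaves a half-written file behind. A standalone sketch of that write path:

import os
import os.path as osp
import yaml

def write_job_info_atomically(log_dir, episodes_so_far, iters_so_far, timesteps_so_far):
    info = {'episodes_so_far': episodes_so_far,
            'iters_so_far': iters_so_far,
            'timesteps_so_far': timesteps_so_far}
    tmp_path = osp.join(log_dir, 'job_info_new.yaml')
    with open(tmp_path, 'w') as f:
        yaml.dump(info, f, default_flow_style=False)
        f.flush()
        os.fsync(f.fileno())                            # make sure the bytes hit disk before the rename
    os.rename(tmp_path, osp.join(log_dir, 'job_info.yaml'))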
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) num_options=1, app='', saves=False, wsaves=False, epoch=0, seed=1, dc=0, plots=False, w_intfc=True, switch=False, intlr=1e-4, piolr=1e-4, fewshot=False, ): optim_batchsize_ideal = optim_batchsize np.random.seed(seed) tf.set_random_seed(seed) ### Book-keeping if hasattr(env, 'NAME'): gamename = env.NAME.lower() else: gamename = env.spec.id[:-3].lower() gamename += 'seed' + str(seed) gamename += app dirname = '{}_{}opts_saves/'.format(gamename, num_options) if wsaves: first = True if not os.path.exists(dirname): os.makedirs(dirname) first = False # while os.path.exists(dirname) and first: # dirname += '0' files = ['pposgd_simple.py', 'cnn_policy.py', 'run_miniw.py'] for i in range(len(files)): src = os.path.expanduser( '~/baselines_intfc/baselines/ppoc_int/') + files[i] dest = os.path.expanduser( '~/baselines_intfc/baselines/ppoc_int/') + dirname shutil.copy2(src, dest) ### # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return # option = tf.placeholder(dtype=tf.int32, shape=[None]) lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") option = U.get_placeholder_cached(name="option") term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None]) op_adv = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) betas = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] term_loss = pi.tpred * term_adv # pi_w = tf.stop_gradient(pi.op_pi) pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options]) option_hot = tf.one_hot(option, depth=num_options) pi_I = pi.intfc * pi_w / tf.expand_dims( tf.reduce_sum(pi.intfc * pi_w, axis=1), 1) pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6) int_loss = -tf.reduce_sum( betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv) intfc = 
tf.placeholder(dtype=tf.float32, shape=[None, num_options]) pi_I = intfc * pi.op_pi / tf.expand_dims( tf.reduce_sum(intfc * pi.op_pi, axis=1), 1) pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6) op_loss = -tf.reduce_sum( betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv) log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0)) op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1) op_loss -= 0.01 * tf.reduce_sum(op_entropy) var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)]) lossandgrad_vf = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(vf_loss, var_list)]) termgrad = U.function([ob, option, term_adv], [U.flatgrad(term_loss, var_list) ]) # Since we will use a different step size. opgrad = U.function([ob, option, betas, op_adv, intfc], [U.flatgrad(op_loss, var_list) ]) # Since we will use a different step size. intgrad = U.function([ob, option, betas, op_adv, pi_w], [U.flatgrad(int_loss, var_list) ]) # Since we will use a different step size. adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses) U.initialize() adam.sync() saver = tf.train.Saver(max_to_keep=10000) ### More book-kepping results = [] if saves: directory_res = "res_switch150/learnpio/lr{}/".format( optim_stepsize) if not fewshot else "res_fewshot/lr{}/".format( optim_stepsize) if not os.path.exists(directory_res): os.makedirs(directory_res) if w_intfc: results = open( directory_res + gamename + '_intfc{}_intlr{}_piolr{}'.format(int(w_intfc), intlr, piolr) + '_' + str(num_options) + 'opts.csv', 'w') else: results = open( directory_res + gamename + '_intfc{}_piolr{}'.format(int(w_intfc), piolr) + '_' + str(num_options) + 'opts.csv', 'w') out = 'epoch,avg_reward' # for opt in range(num_options): out += ',option {} dur'.format(opt) # # for opt in range(num_options): out += ',option {} std'.format(opt) # for opt in range(num_options): out += ',option {} term'.format(opt) # for opt in range(num_options): out += ',option {} adv'.format(opt) out += '\n' results.write(out) # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n') results.flush() if epoch: dirname = '{}_{}opts_saves/'.format(gamename, num_options) print("Loading weights from iteration: " + str(epoch)) filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch) saver.restore(U.get_session(), filename) ### episodes_so_far = 0 timesteps_so_far = 0 global iters_so_far iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=10) # rolling buffer for episode lengths rewbuffer = deque(maxlen=10) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options, saves=saves, results=results, rewbuffer=rewbuffer, dc=dc, epoch=epoch, seed=seed, plots=plots, w_intfc=w_intfc, switch=switch) datas = [0 for _ in range(num_options)] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and 
iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam, num_options) opt_d = [] for i in range(num_options): dur = np.mean( seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0. opt_d.append(dur) print("mean opt dur:", opt_d) print("mean op probs:", np.mean(np.array(seg['op_probs']), axis=0)) print("mean term p:", np.mean(np.array(seg['term_p']), axis=0)) print("mean vpreds:", np.mean(np.array(seg['vpred']), axis=0)) ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values if iters_so_far % 5 == 0 and wsaves: print("weights are saved...") filename = dirname + '{}_epoch_{}.ckpt'.format( gamename, iters_so_far) save_path = saver.save(U.get_session(), filename) min_batch = 160 # Arbitrary and this is the main issue for multi option (or options in general) t_advs = [[] for _ in range(num_options)] for opt in range(num_options): indices = np.where(opts == opt)[0] print("batch size:", indices.size) opt_d[opt] = indices.size if not indices.size: t_advs[opt].append(0.) continue ########## This part is only necessary when we use options. We proceed to these verifications in order not to discard any collected trajectories. if datas[opt] != 0: if (indices.size < min_batch and datas[opt].n > min_batch): datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue # The preivous dataset has already been trained on (datas[opt].n > min_batch), so we replace it, # and continue without training, as indices.size is too small (indices.size < min_batch). # A too small dataset causes divergence. ################################################## elif indices.size + datas[opt].n < min_batch: oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue # The preivous dataset hasn't been trained on (datas[opt].n < min_batch), so we concatenante with new samples. # The combination of both (indices.size + datas[opt].n < min_batch) is still insufficient, so we skip training. # A too small dataset causes divergence. 
###################################################
elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch):
    oldmap = datas[opt].data_map
    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
    cat_atarg = np.concatenate((oldmap['atarg'], atarg[indices]))
    cat_vtarg = np.concatenate((oldmap['vtarg'], tdlamret[indices]))
    datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent)
    # The previous dataset hasn't been trained on yet (datas[opt].n < min_batch), so we concatenate it with the new samples.
    # The combination of both (indices.size + datas[opt].n > min_batch) is now large enough for training.
##################################################
if (indices.size > min_batch and datas[opt].n > min_batch):
    datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)
    # The previous dataset has already been trained on (datas[opt].n > min_batch), so we replace it.
    # The new samples are numerous enough (indices.size > min_batch), so we use them for training.
##################################################
elif datas[opt] == 0:
    datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)
    # Only useful for the very first iteration of the training process.
#########
optim_batchsize = optim_batchsize or ob.shape[0]
optim_epochs = np.clip(np.int(10 * (indices.size / (timesteps_per_batch / num_options))), 10, 10) if num_options > 1 else optim_epochs
print("optim epochs:", optim_epochs)
logger.log("Optimizing...")

# Here we do a bunch of optimization epochs over the data
for _ in range(optim_epochs):
    losses = []  # list of tuples, each of which gives the loss for a minibatch
    for batch in d.iterate_once(optim_batchsize):
        # tadv, nodc_adv = pi.get_term_adv(batch["ob"], [opt])
        # tadv = tadv if num_options > 1 else np.zeros_like(tadv)
        # t_advs[opt].append(nodc_adv)
        if iters_so_far < 150 or not fewshot:
            *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt])
            adam.update(grads, optim_stepsize * cur_lrmult)
            losses.append(newlosses)
        else:
            *newlosses, grads = lossandgrad_vf(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt])
            adam.update(grads, optim_stepsize * cur_lrmult)
            losses.append(newlosses)

if iters_so_far < 150 or not fewshot:
    termg = termgrad(seg["ob"], seg['opts'], seg["op_adv"])[0]
    adam.update(termg, 5e-7)
    if w_intfc:
        intgrads = intgrad(seg['ob'], seg['opts'], seg["last_betas"], seg["op_adv"], seg["op_probs"])[0]
        adam.update(intgrads, intlr)
    opgrad = intgrad(seg['ob'], seg['opts'], seg["last_betas"], seg["op_adv"], seg["intfc"])[0]
    adam.update(opgrad, piolr)

lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
lens, rews = map(flatten_lists, zip(*listoflrpairs))
lenbuffer.extend(lens)
rewbuffer.extend(rews)
logger.record_tabular("EpLenMean", np.mean(lenbuffer))
logger.record_tabular("EpRewMean", np.mean(rewbuffer))
logger.record_tabular("EpThisIter", len(lens))
episodes_so_far += len(lens)
timesteps_so_far += sum(lens)
iters_so_far += 1
logger.record_tabular("EpisodesSoFar", episodes_so_far)
logger.record_tabular("TimestepsSoFar", timesteps_so_far)
logger.record_tabular("TimeElapsed", time.time() - tstart)
if MPI.COMM_WORLD.Get_rank() == 0:
    logger.dump_tabular()

### Bookkeeping
if saves:
    out = "{},{}"
    # for _ in range(num_options): out += ",{},{},{}"
    out += "\n"
    # pdb.set_trace()
    info = [iters_so_far, np.mean(rewbuffer)]
    results.write(out.format(*info))
    results.flush()
def __init__(self, a_name, env, policy_func, par):
    self.env = env
    self.timesteps_per_batch = par.timesteps_per_batch
    self.max_kl = par.max_kl
    self.cg_iters = par.cg_iters
    self.gamma = par.gamma
    self.lam = par.lam  # advantage estimation
    self.entcoeff = par.entcoeff
    self.cg_damping = par.cg_damping
    self.vf_stepsize = par.vf_stepsize
    self.vf_iters = par.vf_iters
    self.max_timesteps = par.max_timesteps
    self.max_episodes = par.max_episodes
    self.max_iters = par.max_iters
    self.callback = par.callback  # you can do anything in the callback, since it takes locals(), globals()
    self.nworkers = MPI.COMM_WORLD.Get_size()
    self.rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    self.ob_space = self.env.observation_space
    self.ac_space = self.env.action_space
    self.pi = policy_func(a_name, self.ob_space, self.ac_space)
    self.oldpi = policy_func("oldpi" + a_name, self.ob_space, self.ac_space)
    self.atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    self.ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    self.ob = U.get_placeholder_cached(name="ob" + str(TRPO_agent_new.index2))
    self.ac = self.pi.pdtype.sample_placeholder([None])

    self.kloldnew = self.oldpi.pd.kl(self.pi.pd)
    self.ent = self.pi.pd.entropy()
    meankl = U.mean(self.kloldnew)
    meanent = U.mean(self.ent)
    entbonus = self.entcoeff * meanent
    self.vferr = U.mean(tf.square(self.pi.vpred - self.ret))
    ratio = tf.exp(self.pi.pd.logp(self.ac) - self.oldpi.pd.logp(self.ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * self.atarg)
    optimgain = surrgain + entbonus
    self.losses = [optimgain, meankl, entbonus, surrgain, meanent]
    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
    self.dist = meankl

    all_var_list = self.pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    self.vfadam = MpiAdam(vf_var_list)
    self.get_flat = U.GetFlat(var_list)
    self.set_from_flat = U.SetFromFlat(var_list)

    self.klgrads = tf.gradients(self.dist, var_list)
    self.flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan" + str(TRPO_agent_new.index2))
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    self.tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        self.tangents.append(tf.reshape(self.flat_tangent[start:start + sz], shape))
        start += sz
    self.gvp = tf.add_n([U.sum(g * tangent) for (g, tangent) in zipsame(self.klgrads, self.tangents)])  #pylint: disable=E1111
    self.fvp = U.flatgrad(self.gvp, var_list)

    self.assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(self.oldpi.get_variables(), self.pi.get_variables())])
    self.compute_losses = U.function([self.ob, self.ac, self.atarg], self.losses)
    self.compute_lossandgrad = U.function([self.ob, self.ac, self.atarg],
                                          self.losses + [U.flatgrad(optimgain, var_list)])
    self.compute_fvp = U.function([self.flat_tangent, self.ob, self.ac, self.atarg], self.fvp)
    self.compute_vflossandgrad = U.function([self.ob, self.ret], U.flatgrad(self.vferr, vf_var_list))
    TRPO_agent_new.index2 += 1

    U.initialize()
    self.th_init = self.get_flat()
    MPI.COMM_WORLD.Bcast(self.th_init, root=0)
    self.set_from_flat(self.th_init)
    self.vfadam.sync()
    print("Init param sum", self.th_init.sum(), flush=True)
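The constructor above only builds the symbolic pieces (the surrogate losses, `compute_fvp`, and the flat parameter getter/setter); the update that consumes them is outside this excerpt. As a hedged sketch of how these attributes are typically combined in a baselines-style TRPO step (conjugate gradient on the damped Fisher-vector product, then a KL-constrained backtracking line search), assuming the `cg` helper from `baselines.common.cg` and omitting MPI gradient averaging:

import numpy as np
from baselines.common.cg import cg  # conjugate-gradient solver; assumed available

def trpo_step_sketch(self, ob, ac, atarg):
    # Illustrative only: shows how the attributes built in __init__ are usually used;
    # this is not the update routine of this repository.
    def fisher_vector_product(p):
        return self.compute_fvp(p, ob, ac, atarg) + self.cg_damping * p

    self.assign_old_eq_new()                          # old policy <- current policy
    *lossbefore, g = self.compute_lossandgrad(ob, ac, atarg)
    if np.allclose(g, 0):
        return                                        # zero gradient: nothing to do
    stepdir = cg(fisher_vector_product, g, cg_iters=self.cg_iters)
    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))
    fullstep = stepdir * np.sqrt(self.max_kl / shs)   # largest step within the KL trust region
    thbefore = self.get_flat()
    stepsize = 1.0
    for _ in range(10):                               # backtracking line search on the surrogate
        self.set_from_flat(thbefore + fullstep * stepsize)
        losses = self.compute_losses(ob, ac, atarg)
        surr, kl = losses[0], losses[1]
        if np.isfinite(losses).all() and kl <= 1.5 * self.max_kl and surr > lossbefore[0]:
            break
        stepsize *= 0.5
    else:
        self.set_from_flat(thbefore)                  # no acceptable step found: revert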
def learn(
        env, policy_func, *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
        gamma, lam,  # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        load_model_path, test_only, stochastic,
        symmetric_training=False, obs_names=None, single_episode=False,
        horizon_hack=False, running_avg_len=100, init_three=False, actions=None,
        symmetric_training_trick=False, seeds_fn=None, bootstrap_seeds=False,
):
    global seeds
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Network for new policy
    old_pi = policy_func("old_pi", ob_space, ac_space)  # Network for old policy
    adv_targ = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    mask = tf.placeholder(dtype=tf.bool, shape=[None])  # Mask selecting the transitions that contribute to the loss (used by the symmetric-training trick)
    lr_mult = tf.placeholder(name='lr_mult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lr_mult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    st = U.get_placeholder_cached(name="st")
    ac = pi.pdtype.sample_placeholder([None])

    kl = old_pi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    mean_kl = U.mean(tf.boolean_mask(kl, mask))  # Mean over the batch
    mean_ent = U.mean(tf.boolean_mask(ent, mask))
    entropy_penalty = -entcoeff * mean_ent

    ratio = tf.exp(pi.pd.logp(ac) - old_pi.pd.logp(ac))  # pi_new / pi_old
    surr_1 = ratio * adv_targ  # surrogate from conservative policy iteration
    surr_2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
    surr_loss = -U.mean(tf.boolean_mask(tf.minimum(surr_1, surr_2), mask))  # PPO's pessimistic surrogate (L^CLIP), mean over the batch
    vf_loss = U.mean(tf.boolean_mask(tf.square(pi.vpred - ret), mask))
    total_loss = surr_loss + entropy_penalty + vf_loss
    losses = [surr_loss, entropy_penalty, vf_loss, mean_kl, mean_ent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    comp_loss_and_grad = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask],
                                    losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(old_v, new_v) for (old_v, new_v) in zipsame(old_pi.get_variables(), pi.get_variables())])
    comp_loss = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask], losses)

    if init_three:
        assign_init_three_1 = U.function(
            [], [],
            updates=[tf.assign(new_v, old_v) for (old_v, new_v) in zipsame(pi.get_orig_variables(), pi.get_part_variables(1))])
        assign_init_three_2 = U.function(
            [], [],
            updates=[tf.assign(new_v, old_v) for (old_v, new_v) in zipsame(pi.get_orig_variables(), pi.get_part_variables(2))])

    U.initialize()
    if load_model_path is not None:
        U.load_state(load_model_path)
    if init_three:
        assign_init_three_1()
        assign_init_three_2()
    adam.sync()

    if seeds_fn is not None:
        with open(seeds_fn) as f:
            seeds = [int(seed) for seed in f.readlines()]

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch,
                                     stochastic=stochastic,
                                     single_episode=test_only or single_episode,
                                     actions=actions,
                                     bootstrap_seeds=bootstrap_seeds)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    len_buffer = deque(maxlen=running_avg_len)  # rolling buffer for episode lengths
    rew_buffer = deque(maxlen=running_avg_len)  # rolling buffer for episode rewards
    origrew_buffer = deque(maxlen=running_avg_len)  # rolling buffer for original episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, \
        "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam, horizon_hack=horizon_hack)

        # ob, ac, adv_targ, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, st, ac, adv_targ, tdlamret = seg["ob"], seg["step"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update

        if symmetric_training_trick:
            first_75 = st < 75
            mask = ~np.concatenate((np.zeros_like(first_75), first_75))
        else:
            mask = np.concatenate((np.ones_like(st, dtype=np.bool), np.ones_like(st, dtype=np.bool)))

        if symmetric_training:
            sym_obss = []
            sym_acc = []
            for i in range(timesteps_per_batch):
                obs = OrderedDict(zip(obs_names, ob[i]))
                sym_obs = obs.copy()
                swap_legs(sym_obs)
                sym_ac = ac[i].copy()
                sym_ac = np.concatenate((sym_ac[9:], sym_ac[:9]))
                sym_obss.append(np.asarray(list(sym_obs.values())))
                sym_acc.append(sym_ac)
            sym_obss = np.asarray(sym_obss)
            sym_acc = np.asarray(sym_acc)
            ob = np.concatenate((ob, sym_obss))
            ac = np.concatenate((ac, sym_acc))
            adv_targ = np.concatenate((adv_targ, adv_targ))
            tdlamret = np.concatenate((tdlamret, tdlamret))
            vpredbefore = np.concatenate((vpredbefore, vpredbefore))
            st = np.concatenate((st, st))

        # Compute stats before updating
        if bootstrap_seeds:
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"], seg["easy_seeds"], seg["hard_seeds"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, orig_rews, easy_seeds, hard_seeds = map(flatten_lists, zip(*listoflrpairs))
            easy_seeds = [x for x in easy_seeds if x != 0]
            hard_seeds = [x for x in hard_seeds if x != 0]
            print('seeds set sizes:', len(seeds), len(easy_seeds), len(hard_seeds))
            seeds = list((set(seeds) - set(easy_seeds)) | set(hard_seeds))
        else:
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, orig_rews = map(flatten_lists, zip(*listoflrpairs))

        len_buffer.extend(lens)
        rew_buffer.extend(rews)
        origrew_buffer.extend(orig_rews)
        logger.record_tabular("Iter", iters_so_far)
        logger.record_tabular("EpLenMean", np.mean(len_buffer))
        logger.record_tabular("EpRewMean", np.mean(rew_buffer))
        logger.record_tabular("EpOrigRewMean", np.mean(origrew_buffer))
        logger.record_tabular("EpOrigRewStd", np.std(origrew_buffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        n_completed = 0
        sum_completed = 0
        for ep_len, orig_rew in zip(lens, orig_rews):
            if ep_len == 1000:
                n_completed += 1
                sum_completed += orig_rew
        avg_completed = sum_completed / n_completed if n_completed > 0 else 0
        logger.record_tabular("AvgCompleted", avg_completed)
        perc_completed = 100 * n_completed / len(lens) if len(lens) > 0 else 0
        logger.record_tabular("PercCompleted", perc_completed)

        if callback:
            callback(locals(), globals())

        adv_targ = (adv_targ - adv_targ.mean()) / adv_targ.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, st=st, ac=ac, atarg=adv_targ, vtarg=tdlamret, mask=mask), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        if not test_only:
            logger.log(fmt_row(13, loss_names))

        # Here we do a bunch of optimization epochs over the data. I log results only for the first worker (rank=0)
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *batch_losses, grads = comp_loss_and_grad(batch["ob"], batch["st"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, batch["mask"])
                if not test_only:
                    adam.update(grads, optim_stepsize * cur_lrmult)
                losses.append(batch_losses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            batch_losses = comp_loss(batch["ob"], batch["st"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, batch["mask"])
            losses.append(batch_losses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        iters_so_far += 1
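`add_vtarg_and_adv` is called at the start of every iteration of `learn` but is not defined in this excerpt. For reference, a minimal sketch of the standard GAE(lambda) computation it presumably performs, ignoring the `horizon_hack` flag and assuming the segment dict carries the usual baselines keys (`new`, `rew`, `vpred`, `nextvpred`):

import numpy as np

def add_vtarg_and_adv_sketch(seg, gamma, lam):
    """Generalized Advantage Estimation: adv_t = sum_l (gamma*lam)^l * delta_{t+l}."""
    new = np.append(seg["new"], 0)                     # episode-start flags, padded for t = T
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap value for the last state
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]        # lambda-return target for the value function

The `tdlamret` array it writes is exactly the `vtarg` target fed to the value-function loss in the optimization loop above, and `adv` is the advantage that gets standardized before each update.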