def build_policies(self, make_policy, target_pi):
    # Build the policies
    for i in range(self.capacity):
        name = 'behavioral_' + str(i + 1) + '_policy'
        self.policies.append(
            make_policy(name, self.ob_space, self.ac_space))
        all_var_list = self.policies[i].get_trainable_variables()
        var_list = [
            v for v in all_var_list
            if v.name.split('/')[1].startswith('pol')
        ]
        self.get_flats.append(U.GetFlat(var_list))

    # Build the swapping actions
    for i in range(self.capacity):
        if i == 0:
            previous_pi = target_pi
        else:
            previous_pi = self.policies[i - 1]
        op = U.function(
            [], [],
            updates=[
                tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(self.policies[i].get_variables(),
                                            previous_pi.get_variables())
            ])
        self.assigning_ops.append(op)
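# zipsame is used throughout these snippets to pair old/new variable lists.
# A minimal sketch of its contract (a zip that asserts all sequences have the
# same length instead of silently truncating); the real helper lives in the
# project's common utilities.
def zipsame_sketch(*seqs):
    length = len(seqs[0])
    assert all(len(seq) == length for seq in seqs), "sequences differ in length"
    return zip(*seqs)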
def log(self):
    logger.log("Evaluating losses...")
    losses = []
    for b in self.d.iterate_once(self.optim_batchsize):
        newlosses = self.compute_losses(b["ob"], b["ac"], b["atarg"],
                                        b["vtarg"], self.cur_lrmult)
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments(losses, axis=0)
    logger.log(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, self.loss_names):
        logger.record_tabular("loss_" + name, lossval)
    logger.record_tabular("ev_tdlam_before",
                          explained_variance(self.vpredbefore, self.tdlamret))
    lrlocal = (self.seg["ep_lens"], self.seg["ep_rets"])  # local values
    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
    lens, rews = map(self.flatten_lists, zip(*listoflrpairs))
    self.lenbuffer.extend(lens)
    self.rewbuffer.extend(rews)
    logger.record_tabular("EpLenMean", np.mean(self.lenbuffer))
    logger.record_tabular("EpRewMean", np.mean(self.rewbuffer))
    logger.record_tabular("EpThisIter", len(lens))
    self.episodes_so_far += len(lens)
    self.timesteps_so_far += sum(lens)
    self.iters_so_far += 1
    logger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    logger.record_tabular("TimestepsSoFar", self.timesteps_so_far)
    logger.record_tabular("TimeElapsed", time.time() - self.tstart)
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.dump_tabular()
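# The "ev_tdlam_before" diagnostic above relies on explained_variance.
# A minimal NumPy sketch of that statistic, 1 - Var[y - ypred] / Var[y],
# assuming 1-D arrays; the zero-variance case is guarded explicitly.
import numpy as np

def explained_variance_sketch(ypred, y):
    """Fraction of the variance of y explained by ypred (1.0 is perfect)."""
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary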
def update_dropout_masks(mask_vars, keep_prob):
    replacements = []
    # Lazily create one placeholder per mask variable, starting at the first
    # index that does not have a placeholder yet.
    i = len(static_placeholders)
    while len(static_placeholders) < len(mask_vars):
        static_placeholders.append(
            tf.placeholder(dtype=tf.float32,
                           shape=mask_vars[i].get_shape().as_list(),
                           name="mask%d" % i))
        i += 1
    print(i)
    for mask_var in mask_vars:
        np_random_mask = np.floor(keep_prob + np.random.uniform(
            size=tuple(mask_var.get_shape().as_list())))
        replacements.append(np_random_mask)
    global replacement_func
    if replacement_func is None:
        print("entering replacement")
        replacement_func = U.function(
            static_placeholders, [],
            updates=[
                tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(mask_vars, static_placeholders)
            ])
    return replacement_func(*replacements)
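# The mask construction above draws Bernoulli(keep_prob) samples via
# floor(keep_prob + U(0, 1)). A self-contained sketch of that trick with a
# hypothetical shape, just to make the expected statistics explicit.
import numpy as np

def bernoulli_mask_sketch(keep_prob, shape, rng=np.random):
    # floor(p + u) equals 1 exactly when u > 1 - p, which has probability p.
    return np.floor(keep_prob + rng.uniform(size=shape))

# mask = bernoulli_mask_sketch(0.8, (64, 128)); mask.mean() is close to 0.8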
def compute_fvp(self, flat_tangent, ob, ac, atarg):
    shapes = [var.get_shape().as_list() for var in self.pi_var_list]
    with tf.GradientTape() as outer_tape:
        with tf.GradientTape() as inner_tape:
            old_policy_latent = self.oldpi.policy_network(ob)
            old_pd, _ = self.oldpi.pdtype.pdfromlatent(old_policy_latent)
            policy_latent = self.pi.policy_network(ob)
            pd, _ = self.pi.pdtype.pdfromlatent(policy_latent)
            kloldnew = old_pd.kl(pd)
            meankl = tf.reduce_mean(kloldnew)
        klgrads = inner_tape.gradient(meankl, self.pi_var_list)
        start = 0
        tangents = []
        for shape in shapes:
            sz = U.intprod(shape)
            tangents.append(
                tf.reshape(flat_tangent[start:start + sz], shape))
            start += sz
        gvp = tf.add_n([
            tf.reduce_sum(g * tangent)
            for (g, tangent) in zipsame(klgrads, tangents)
        ])
    hessians_products = outer_tape.gradient(gvp, self.pi_var_list)
    fvp = U.flatgrad(hessians_products, self.pi_var_list)
    return fvp
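# The Fisher-vector product above is typically consumed by a conjugate-gradient
# solver to compute F^{-1} g without materializing F. A minimal NumPy CG sketch
# under that assumption; `fvp` stands for any callable that returns F @ p.
import numpy as np

def conjugate_gradient_sketch(fvp, g, iters=10, residual_tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()   # residual g - F x (x starts at zero)
    p = g.copy()   # search direction
    rdotr = r.dot(r)
    for _ in range(iters):
        z = fvp(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x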
def __init__(
        self,
        ob_space,
        ac_space,
        model_func,
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        adam_epsilon=1e-5,
):
    with tf.variable_scope('pi'):
        self.pi = pi = model_func(ob_space, ac_space)
    with tf.variable_scope('pi_old'):
        self.pi_old = pi_old = model_func(ob_space, ac_space)

    self.adv = tf.placeholder(
        dtype=tf.float32, shape=[None],
        name='adv')  # Target advantage function (if applicable)
    self.ret = tf.placeholder(dtype=tf.float32, shape=[None],
                              name='ret')  # Empirical return
    self.lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * self.lrmult  # Annealed clipping parameter epsilon

    self.ac = ac = pi.pdtype.sample_placeholder([None])

    kloldnew = pi_old.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - pi_old.pd.logp(ac))  # pnew / pold
    surr1 = ratio * self.adv  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * self.adv
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - self.ret))
    self.total_loss = pol_surr + pol_entpen + vf_loss

    # gradients
    self.grads = tf.gradients(self.total_loss, pi.train_vars)
    self.flat_grads = U.flatgrad(self.total_loss, pi.train_vars)

    # optimizer
    self.optimizer = MpiAdam(pi.train_vars, epsilon=adam_epsilon)

    # assign new pi to old pi
    self.op_assign_old_eq_new = tf.group(*[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(pi_old.global_vars, pi.global_vars)
    ])

    U.initialize()
    self.optimizer.sync()
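# A self-contained NumPy sketch of the clipped surrogate built above, useful
# for sanity-checking the sign conventions (the returned loss is the negative
# of the pessimistic objective). Inputs are hypothetical per-sample arrays.
import numpy as np

def ppo_clip_loss_sketch(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(logp_new - logp_old)                        # pi_new / pi_old
    surr1 = ratio * adv                                        # unclipped surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))                  # L^CLIP, negated for minimization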
def evaluate_cnet_losses(self, do_log=True):
    '''Compute losses alone, without gradients'''
    losses = []
    if self.do_dummy_cnet_update:
        batch_generator = self.n_batches_this_epoch * [None]
    else:
        batch_generator = self.constraint_demonstration_buffer.iterate_epoch(
            self.extra_args.cnet_batch_size
        )  # Iterate through all examples exactly once

    for i_batch, batch_demonstrations in enumerate(batch_generator):
        if self.do_dummy_cnet_update:
            test_losses = self.cnet_losses_and_scores_dummy
        else:
            batch_cnet_observations = np.stack(
                [_e.state for _e in batch_demonstrations])
            batch_cnet_actions = np.stack(
                [_e.action for _e in batch_demonstrations])
            batch_cnet_action_indicators = np.array(
                [_e.action_indicator for _e in batch_demonstrations])
            test_losses_all = self.cnet_compute_losses_and_scores(
                batch_cnet_observations, batch_cnet_actions,
                batch_cnet_action_indicators)
            *test_losses_base, n_positive_satisfied, n_positive_violated, \
                n_negative_satisfied, n_negative_violated = test_losses_all
            test_classification_accuracy = self.calc_classification_accuracy(
                n_positive_satisfied, n_positive_violated,
                n_negative_satisfied, n_negative_violated)
            test_losses = test_losses_base + test_classification_accuracy
        losses.append(test_losses)
        if i_batch > self.n_batches_this_epoch:
            break  # allow early stop if max number of batches per epoch is set

    meanlosses, _, _ = mpi_moments(losses, axis=0)
    mean_classification_accuracy_test = meanlosses[-3:]

    if len(self.extra_args.adaptive_constraint_activation) > 0:
        if 'average' in self.extra_args.adaptive_constraint_activation:
            activation_probability = mean_classification_accuracy_test[2]
        elif 'positive' in self.extra_args.adaptive_constraint_activation:
            activation_probability = mean_classification_accuracy_test[0]
        elif 'negative' in self.extra_args.adaptive_constraint_activation:
            activation_probability = mean_classification_accuracy_test[1]
        elif 'min' in self.extra_args.adaptive_constraint_activation:
            activation_probability = min(
                mean_classification_accuracy_test[:2])
        else:
            raise ValueError(
                'Invalid argument adaptive_constraint_activation={0}'.format(
                    self.extra_args.adaptive_constraint_activation))
    else:
        activation_probability = 1.

    if do_log:
        self.logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses,
                                       self.cnet_loss_and_accuracy_names):
            self.logger.record_tabular('loss_' + name, lossval)
    return meanlosses, activation_probability
def __init__(self,
             sess,
             env,
             hid_size,
             num_hid_layers,
             clip_param,
             entcoeff,
             adam_epsilon,
             gaussian_fixed_var=True):
    self.sess = sess
    self.ob_space = env.observation_space
    self.ac_space = env.action_space
    self.ob_dim = env.observation_space.shape[0]
    self.ac_dim = env.action_space.shape[0]
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.pdtype = pdtype = make_pdtype(self.ac_space)

    self.ob = tf.placeholder(name="ob",
                             dtype=tf.float32,
                             shape=[None] + list(self.ob_space.shape))
    self.ac = self.pdtype.sample_placeholder([None])

    with tf.variable_scope('pi'):
        self.ob_rms, self.vpred, self.pd, self.sample_ac = self.build_network(
            sess, 'pi', self.ob)
        self.pi_scope = tf.get_variable_scope().name
    with tf.variable_scope('old_pi'):
        _, _, self.old_pd, _ = self.build_network(sess, 'old_pi', self.ob)
        self.old_pi_scope = tf.get_variable_scope().name

    # Setup losses and stuff
    self.atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    self.ret = tf.placeholder(dtype=tf.float32,
                              shape=[None])  # Empirical return
    self.lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    self.total_loss, self.loss_names, self.losses, self.var_list = self.setup_ppo_loss(
        clip_param, entcoeff, adam_epsilon)

    self.gradients = tf.gradients(self.total_loss, self.var_list)

    self.assign_old_eq_new_op = [
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(self.get_old_variables(),
                                    self.get_variables())
    ]

    self.learning_rate = tf.placeholder(dtype=tf.float32)
    self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(
        self.total_loss, var_list=self.var_list)
def callback(locals, globals):
    if len(locals['rewbuffer']):
        meanlosses = locals['meanlosses']
        loss_names = locals['loss_names']
        data = {
            'iters_so_far': locals['iters_so_far'],
            'episode_reward_mean': np.mean(locals['rewbuffer']),
        }
        for (lossval, name) in zipsame(meanlosses, loss_names):
            data['loss_' + name] = float(lossval)
        env.post_data(data)
def build_graph(self):
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = self.clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = self.pi.ob
    ac = self.pi.pdtype.sample_placeholder([None])

    kloldnew = self.oldpi.pd.kl(self.pi.pd)
    ent = self.pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-self.entcoeff) * meanent

    ratio = tf.exp(self.pi.pd.logp(ac) - self.oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(self.pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss

    self.losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = self.pi.get_trainable_variables()
    self.lossandgrad = U.function([ac, atarg, ret, lrmult] + ob,
                                  self.losses +
                                  [U.flatgrad(total_loss, var_list)])
    self.compute_losses = U.function([ac, atarg, ret, lrmult] + ob,
                                     self.losses)

    self.adam = MpiAdam(var_list, epsilon=self.adam_epsilon)
    self.assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(self.oldpi.get_variables(),
                                        self.pi.get_variables())
        ])
def __init__(self, name, target):
    super(TargetMlpPolicy, self).__init__(
        name=name,
        action_shape=target.action_shape,
        observation_shape=target.observation_shape,
        hid_size=target.hid_size,
        num_hid_layers=target.num_hid_layers)
    self.update = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(self.get_variables(),
                                        target.get_variables())
        ])
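# The update function built above is the usual "copy source into target"
# pattern. A framework-free sketch of the same idea, assuming parameters are
# held as dictionaries of NumPy arrays (names here are hypothetical).
import numpy as np

def sync_params_sketch(target_params, source_params):
    """Overwrite every target array in place with its source counterpart."""
    assert target_params.keys() == source_params.keys()
    for name, src in source_params.items():
        np.copyto(target_params[name], src)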
def load_policy(
        env,
        policy_func,
        *,
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        adam_epsilon=1e-5,
        model_path,
        checkpoint):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32,
                           shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    U.load_state(os.path.join(model_path, 'model-{}'.format(checkpoint)))

    return pi
def train(self, seg, optim_batchsize, optim_epochs):
    cur_lrmult = 1.0
    add_vtarg_and_adv(seg, self.gamma, self.lam)
    ob, unnorm_ac, atarg, tdlamret = seg["ob"], seg["unnorm_ac"], seg["adv"], seg["tdlamret"]
    vpredbefore = seg["vpred"]  # predicted value function before update
    atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
    d = Dataset(dict(ob=ob, ac=unnorm_ac, atarg=atarg, vtarg=tdlamret),
                shuffle=not self.pi.recurrent)

    if hasattr(self.pi, "ob_rms"):
        self.pi.update_obs_rms(ob)  # update running mean/std for policy
    self.assign_old_eq_new()  # set old parameter values to new parameter values
    logger.log2("Optimizing...")
    logger.log2(fmt_row(13, self.loss_names))

    # Here we do a bunch of optimization epochs over the data
    for _ in range(optim_epochs):
        losses = []  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            lg = self.lossandgrad(batch["ac"], batch["atarg"],
                                  batch["vtarg"], cur_lrmult,
                                  *self.fix_ob2feed(batch["ob"]))
            new_losses, g = lg[:-1], lg[-1]
            self.adam.update(g, self.optim_stepsize * cur_lrmult)
            losses.append(new_losses)
        logger.log2(fmt_row(13, np.mean(losses, axis=0)))

    logger.log2("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                        batch["vtarg"], cur_lrmult,
                                        *self.fix_ob2feed(batch["ob"]))
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments(losses, axis=0)
    logger.log2(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, self.loss_names):
        logger.record_tabular("loss_" + name, lossval)
    logger.record_tabular("ev_tdlam_before",
                          explained_variance(vpredbefore, tdlamret))
    return meanlosses
def update_policy(pi, seg, gamma, lam, logger, optim_epochs, optim_batchsize,
                  optim_stepsize, cur_lrmult, loss_names, lossandgrad, adam,
                  assign_old_eq_new, compute_losses, mpi_moments_fn):
    add_vtarg_and_adv(seg, gamma, lam)
    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
    ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
    vpredbefore = seg["vpred"]  # predicted value function before update
    atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
    d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                shuffle=not pi.recurrent)
    optim_batchsize = optim_batchsize or ob.shape[0]

    if hasattr(pi, "ob_rms"):
        pi.ob_rms.update(ob)  # update running mean/std for policy

    assign_old_eq_new()  # set old parameter values to new parameter values
    logger.log("Optimizing...")
    logger.log(fmt_row(13, loss_names))
    # Here we do a bunch of optimization epochs over the data
    for _ in range(optim_epochs):
        losses = []  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            # *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            adam.update(g, optim_stepsize * cur_lrmult)
            losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(losses, axis=0)))

    logger.log("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"],
                                   batch["vtarg"], cur_lrmult)
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments_fn(losses)
    logger.log(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, loss_names):
        logger.record_tabular("loss_" + name, lossval)
    return vpredbefore, tdlamret, optim_batchsize
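# Both train() and update_policy() assume add_vtarg_and_adv has already filled
# in seg["adv"] and seg["tdlamret"]. A minimal NumPy sketch of GAE(lambda)
# under the usual conventions: seg["new"] flags episode starts and
# seg["nextvpred"] is the value prediction for the state after the last step.
import numpy as np

def add_vtarg_and_adv_sketch(seg, gamma, lam):
    new = np.append(seg["new"], 0)                      # episode-start flags
    vpred = np.append(seg["vpred"], seg["nextvpred"])   # bootstrap value
    T = len(seg["rew"])
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]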
def test_runningmeanstd():
    comm = MPI.COMM_WORLD
    np.random.seed(0)
    for (triple, axis) in [
            ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0),
            ((np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), 0),
            ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1),
    ]:
        x = np.concatenate(triple, axis=axis)
        ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]
        ms2 = mpi_moments(triple[comm.Get_rank()], axis=axis)
        for (a1, a2) in zipsame(ms1, ms2):
            print(a1, a2)
            assert np.allclose(a1, a2)
        print("ok!")
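# mpi_moments pools per-rank means and standard deviations into global ones.
# A single-process NumPy sketch of the same pooling rule (count-weighted
# combination of mean and second moment), assuming the chunks only differ in
# size along the reduction axis, as in the test above.
import numpy as np

def pooled_moments_sketch(chunks, axis=0):
    counts = np.array([c.shape[axis] for c in chunks], dtype=np.float64)
    means = np.stack([c.mean(axis=axis) for c in chunks])
    sqmeans = np.stack([(c ** 2).mean(axis=axis) for c in chunks])
    total = counts.sum()
    w = (counts / total).reshape((-1,) + (1,) * (means.ndim - 1))
    mean = (w * means).sum(axis=0)
    var = (w * sqmeans).sum(axis=0) - mean ** 2
    return mean, np.sqrt(np.maximum(var, 0)), int(total)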
def _build_ppo(self):
    config = self._config
    pi = self.policy
    oldpi = self.old_policy

    # input placeholders
    obs = pi.obs
    ac = pi.pdtype.sample_placeholder([None], name='action')
    atarg = tf.placeholder(dtype=tf.float32, shape=[None], name='advantage')
    ret = tf.placeholder(dtype=tf.float32, shape=[None], name='return')
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])
    self._clip_param = config.clip_param * lrmult

    # policy
    var_list = pi.get_trainable_variables()
    self._adam = MpiAdam(var_list)

    fetch_dict = self.policy_loss_ppo(pi, oldpi, ac, atarg, ret)
    if self._is_chef:
        self.summary_name += ['ppo/' + key for key in fetch_dict.keys()]
        self.summary_name += ['ppo/grad_norm', 'ppo/grad_norm_clipped']
    fetch_dict['g'] = U.flatgrad(fetch_dict['total_loss'], var_list)
    self._loss = U.function([lrmult] + obs + [ac, atarg, ret], fetch_dict)

    self._update_oldpi = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])

    # initialize and sync
    U.initialize()
    self._adam.sync()
def __init__(self, env, world, policies, nsteps, load_path, rho, max_kl, ent_coef, vf_coef, max_grad_norm, sync): self.sess = sess = U.get_session() self.env = env self.world = world self.sync = sync self.max_kl = max_kl if hasattr(env, 'num_envs'): self.n_batches = n_batches = nsteps * env.num_envs else: self.n_batches = n_batches = nsteps if MPI is not None: self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() else: self.nworkers = 1 self.rank = 0 cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) # GLOBAL PLACEHOLDERS self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) self.pi_n, self.oldpi_n, self.vfadam_n, self.exchange_n, self.to_exchange_n = [], [], [], [], [] self.compute_jtvp_n, self.compute_fvp_n, self.compute_losses_n, self.compute_vfloss_n = [], [], [], [] self.set_from_flat_n, self.get_flat_n = [], [] for i in range(world.n): name_scope = world.agents[i].name.replace(' ', '') with tf.variable_scope(name_scope): # OBSERVATION PLACEHOLDER ob_dtype = env.observation_space[i].dtype ob_shape = env.observation_space[i].shape OB = tf.placeholder(dtype=ob_dtype, shape=(None, ) + ob_shape) # Policy with tf.variable_scope("pi"): pi = policies[i](n_batches, observ_placeholder=OB) with tf.variable_scope("oldpi"): oldpi = policies[i](n_batches, observ_placeholder=OB) # CREATE OTHER PLACEHOLDERS AC = pi.pdtype.sample_placeholder([None]) ADV = tf.placeholder(dtype=tf.float32, shape=[None]) R = tf.placeholder(dtype=tf.float32, shape=[None]) OLDVPRED = tf.placeholder(dtype=tf.float32, shape=[None]) NB = tf.placeholder(dtype=tf.int32, shape=None) A = tf.placeholder(dtype=tf.float32, shape=None) ratio = tf.exp( pi.pd.logp(AC) - oldpi.pd.logp(AC) ) # Be careful about the dimensionality!!!!!!!!!!!!!!!! 
surrgain = tf.reduce_mean(ADV * ratio) kloldnew = oldpi.pd.kl(pi.pd) meankl = tf.reduce_mean(kloldnew) sync_err = A * tf.reshape(ratio, (self.n_batches, )) - tf.reshape( tf.gather(pi.net.z, NB), (self.n_batches, )) sync_loss = tf.reduce_sum(tf.reshape(tf.gather(pi.net.z, NB), (self.n_batches,)) * sync_err) + \ 0.5 * rho * tf.reduce_sum(tf.square(sync_err)) lagrange_loss = -surrgain + sync_loss losses = [lagrange_loss, surrgain, meankl] dist = meankl var_list = pi.net.w klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append( tf.reshape(flat_tangent[start:start + sz], shape)) start += sz jjvp = [tf.zeros(shape, dtype=tf.float32) for shape in shapes] jtvp = [tf.zeros(shape, dtype=tf.float32) for shape in shapes] right_b = -ADV + A * tf.gather( pi.net.p, NB) - rho * A * tf.gather(pi.net.z, NB) for i in range(self.n_batches): ratio_i_grad = tf.gradients(ratio[i], var_list) jvp_i = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(ratio_i_grad, tangents) ]) jjvp = [ tf.add_n([jj, gg * jvp_i]) for (jj, gg) in zipsame(jjvp, ratio_i_grad) ] jtvp = [ tf.add_n([jt, gt * right_b[i]]) for (jt, gt) in zipsame(jtvp, ratio_i_grad) ] print(i) jjvp = tf.concat( axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in jjvp]) jtvp = tf.concat( axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in jtvp]) gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = tf.add_n([U.flatgrad(gvp, var_list), rho * jjvp]) # Define the value loss vpredclipped = OLDVPRED + tf.clip_by_value( pi.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # vpredclipped = tf.clip_by_value(pi.vf, OLDVPRED*(1-CLIPRANGE), OLDVPRED*(1+CLIPRANGE)) vferr = tf.square(pi.vf - R) vferr2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vferr, vferr2)) vfadam = MpiAdam(pi.net.v) compute_jtvp = U.function([OB, AC, ADV, A, NB], jtvp) compute_fvp = U.function([flat_tangent, OB, AC, ADV], fvp) compute_losses = U.function([OB, AC, ADV, A, NB], losses) compute_vfloss = U.function([OB, R, OLDVPRED, CLIPRANGE], vf_loss) exchange = pi.net.exchange(sess, OB, AC, CLIPRANGE, NB, rho) to_exchange = U.function( [OB, AC, ADV, NB, CLIPRANGE], [ratio, tf.gather(pi.net.p, NB)]) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) self.pi_n.append(pi) self.oldpi_n.append(oldpi) self.get_flat_n.append(get_flat) self.set_from_flat_n.append(set_from_flat) self.vfadam_n.append(vfadam) self.exchange_n.append(exchange) self.to_exchange_n.append(to_exchange) self.compute_jtvp_n.append(compute_jtvp) self.compute_fvp_n.append(compute_fvp) self.compute_losses_n.append(compute_losses) self.compute_vfloss_n.append(compute_vfloss) # Update old plicy network updates = [] for i in range(len(world.agents)): name_scope = world.agents[i].name.replace(' ', '') old_vars = get_trainable_variables("{}/oldpi".format(name_scope)) now_vars = get_trainable_variables("{}/pi".format(name_scope)) updates += [ tf.assign(oldv, nowv) for (oldv, nowv) in zipsame(old_vars, now_vars) ] updates += [ tf.assign(self.pi_n[i].net.z, tf.ones_like(self.pi_n[i].net.z)) ] self.assign_old_eq_new = U.function([], [], updates=updates) @contextmanager def timed(msg): print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) 
self.timed = timed def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= self.nworkers else: out = np.copy(x) return out self.allmean = allmean # Initialization U.initialize() if load_path is not None: self.load(load_path) # for i in range(len(self.pi_n)): th_init = self.get_flat_n[i]() self.set_from_flat_n[i](th_init) print("Init param sum", th_init.sum(), flush=True) for vfadam in self.vfadam_n: vfadam.sync()
def enjoy( env, policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) save_name=None, save_per_acts=3, reload_name=None): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() if reload_name: saver = tf.train.Saver() saver.restore(tf.get_default_session(), reload_name) print("Loaded model successfully.") # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg 
= seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses)
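# The optimization loops above repeatedly draw shuffled minibatches through
# Dataset.iterate_once. A minimal NumPy sketch of that iterator pattern,
# assuming a dict of equally sized arrays (the field names are illustrative).
import numpy as np

def iterate_minibatches_sketch(data, batch_size, shuffle=True, rng=np.random):
    n = len(next(iter(data.values())))
    idx = rng.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n, batch_size):
        mb = idx[start:start + batch_size]
        yield {key: arr[mb] for key, arr in data.items()}

# for batch in iterate_minibatches_sketch(
#         {"ob": ob, "ac": ac, "atarg": atarg, "vtarg": tdlamret}, 64):
#     ...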
def learn( env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, # 0 g_step, d_step, entcoeff, save_per_iter, # 1024 ckpt_dir, log_dir, timesteps_per_batch, task_name, robot_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, callback=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") ] vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 
0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) if robot_name == 'scara': summary_writer = tf.summary.FileWriter( '/home/yue/gym-gazebo/Tensorboard/scara', graph=tf.get_default_graph()) elif robot_name == 'mara': # summary_writer=tf.summary.FileWriter('/home/yue/gym-gazebo/Tensorboard/mara/down-home_position',graph=tf.get_default_graph()) summary_writer = tf.summary.FileWriter( '/home/yue/gym-gazebo/Tensorboard/mara/collisions_model/', graph=tf.get_default_graph()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy if nworkers != 1: g = allmean(compute_vflossandgrad(mbob, mbret)) else: g = compute_vflossandgrad(mbob, mbret) vfadam.update(g, vf_stepsize) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) batch_size = len(ob) // d_step d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) if nworkers != 1: d_adam.update(allmean(g), d_stepsize) else: d_adam.update(g, d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) g_loss_summary = tf.Summary(value=[ tf.Summary.Value(tag="g_loss", simple_value=np.mean(d_losses[0][0])) ]) summary_writer.add_summary(g_loss_summary, timesteps_so_far) d_loss_summary = tf.Summary(value=[ tf.Summary.Value(tag="d_loss", simple_value=np.mean(d_losses[0][1])) ]) summary_writer.add_summary(d_loss_summary, timesteps_so_far) entropy_summary = tf.Summary(value=[ tf.Summary.Value(tag="entropy", simple_value=np.mean(d_losses[0][2])) ]) summary_writer.add_summary(entropy_summary, timesteps_so_far) entropy_loss_summary = tf.Summary(value=[ tf.Summary.Value(tag="entropy_loss", simple_value=np.mean(d_losses[0][3])) ]) summary_writer.add_summary(entropy_loss_summary, timesteps_so_far) g_acc_summary = tf.Summary(value=[ tf.Summary.Value(tag="g_acc", simple_value=np.mean(d_losses[0][4])) ]) summary_writer.add_summary(g_acc_summary, timesteps_so_far) expert_acc_summary = tf.Summary(value=[ tf.Summary.Value(tag="expert_acc", simple_value=np.mean(d_losses[0][5])) ]) summary_writer.add_summary(expert_acc_summary, timesteps_so_far) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) summary = tf.Summary(value=[ tf.Summary.Value(tag="MeanDiscriminator", simple_value=np.mean(rewbuffer)) ]) summary_writer.add_summary(summary, timesteps_so_far) truesummary = tf.Summary(value=[ tf.Summary.Value(tag="MeanGenerator", simple_value=np.mean(true_rewbuffer)) ]) summary_writer.add_summary(truesummary, timesteps_so_far) true_rets_summary = tf.Summary(value=[ 
tf.Summary.Value(tag="Generator", simple_value=np.mean(true_rets)) ]) summary_writer.add_summary(true_rets_summary, timesteps_so_far) len_summary = tf.Summary(value=[ tf.Summary.Value(tag="Length", simple_value=np.mean(lenbuffer)) ]) summary_writer.add_summary(len_summary, timesteps_so_far) optimgain_summary = tf.Summary(value=[ tf.Summary.Value(tag="Optimgain", simple_value=np.mean(meanlosses[0])) ]) summary_writer.add_summary(optimgain_summary, timesteps_so_far) meankl_summary = tf.Summary(value=[ tf.Summary.Value(tag="Meankl", simple_value=np.mean(meanlosses[1])) ]) summary_writer.add_summary(meankl_summary, timesteps_so_far) entloss_summary = tf.Summary(value=[ tf.Summary.Value(tag="Entloss", simple_value=np.mean(meanlosses[2])) ]) summary_writer.add_summary(entloss_summary, timesteps_so_far) surrgain_summary = tf.Summary(value=[ tf.Summary.Value(tag="Surrgain", simple_value=np.mean(meanlosses[3])) ]) summary_writer.add_summary(surrgain_summary, timesteps_so_far) entropy_summary = tf.Summary(value=[ tf.Summary.Value(tag="Entropy", simple_value=np.mean(meanlosses[4])) ]) summary_writer.add_summary(entropy_summary, timesteps_so_far) epThisIter_summary = tf.Summary(value=[ tf.Summary.Value(tag="EpThisIter", simple_value=np.mean(len(lens))) ]) summary_writer.add_summary(epThisIter_summary, timesteps_so_far) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("MeanDiscriminator", np.mean(rewbuffer)) # Save model if robot_name == 'scara': if iters_so_far % save_per_iter == 0: if np.mean(rewbuffer) <= 200 or np.mean( true_rewbuffer) >= -100: task_name = str(iters_so_far) fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) if iters_so_far == 2000: break elif robot_name == 'mara': if iters_so_far % save_per_iter == 0: # if np.mean(rewbuffer) <= 300 or np.mean(true_rewbuffer) >= -400: task_name = str(iters_so_far) fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) if iters_so_far == 5000: break logger.record_tabular("MeanGenerator", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
def learn(base_env, policy_fn, *, max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, timesteps_per_actorbatch, max_timesteps = 0, max_episodes = 0, max_iters = 0, max_seconds = 0, seed = 0 ): set_global_seeds(seed) # Setup losses and stuff # ---------------------------------------- ob_space = base_env.observation_space ac_space = base_env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy backup_pi = policy_fn("backup_pi", ob_space, ac_space) # Construct a network for every individual to adapt during the es evolution U.initialize() pi_set_from_flat_params = U.SetFromFlat(pi.get_trainable_variables()) pi_get_flat_params = U.GetFlat(pi.get_trainable_variables()) global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer,best_fitness, eval_seq episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen = 100) # rolling buffer for episode lengths rewbuffer = deque(maxlen = 100) # rolling buffer for episode rewards assign_backup_eq_new = U.function([], [], updates = [tf.assign(backup_v, newv) for (backup_v, newv) in zipsame( backup_pi.get_variables(), pi.get_variables())]) assign_new_eq_backup = U.function([], [], updates = [tf.assign(newv, backup_v) for (newv, backup_v) in zipsame( pi.get_variables(), backup_pi.get_variables())]) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" # Build generator for all solutions actors = [] best_fitness = -np.inf eval_seq = traj_segment_generator_eval(pi, base_env, timesteps_per_actorbatch, stochastic = True) for i in range(popsize): newActor = traj_segment_generator(pi, base_env, timesteps_per_actorbatch, stochastic = True, eval_iters = eval_iters) actors.append(newActor) flatten_weights = pi_get_flat_params() opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 # opt['seed'] = seed opt['AdaptSigma'] = True # opt['bounds'] = bounds es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt) costs = None best_solution = None while True: if max_timesteps and timesteps_so_far >= max_timesteps: logger.log("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: logger.log("Max episodes") break elif max_iters and iters_so_far >= max_iters: logger.log("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: logger.log("Max time") break elif es.countiter >= opt['maxiter']: logger.log("Max generations") break assign_backup_eq_new() # backup current policy logger.log("********** Generation %i ************" % iters_so_far) eval_seg = eval_seq.__next__() rewbuffer.extend(eval_seg["ep_rets"]) lenbuffer.extend(eval_seg["ep_lens"]) if iters_so_far == 0: result_record() solutions = es.ask() if costs is not None: solutions[np.argmax(costs)] = np.copy(best_solution) ob_segs = None segs = [] costs = [] lens = [] for id, solution in enumerate(solutions): # pi.set_Flat_variables(solution) pi_set_from_flat_params(solution) seg = actors[id].__next__() costs.append(-np.mean(seg["ep_rets"])) lens.append(np.sum(seg["ep_lens"])) segs.append(seg) if ob_segs is None: ob_segs = {'ob': np.copy(seg['ob'])} else: ob_segs['ob'] = np.append(ob_segs['ob'], seg['ob'], axis=0) assign_new_eq_backup() fit_idx = np.array(costs).flatten().argsort()[:len(costs)] solutions = np.array(solutions)[fit_idx] costs = 
np.array(costs)[fit_idx] segs = np.array(segs)[fit_idx] # Weights decay # costs, real_costs = fitness_shift(costs) # costs, real_costs = compute_centered_ranks(costs) l2_decay = compute_weight_decay(0.01, solutions) costs += l2_decay costs, real_costs = fitness_normalization(costs) # best_solution = np.copy(solutions[0]) # best_fitness = -real_costs[0] # rewbuffer.extend(segs[0]["ep_rets"]) # lenbuffer.extend(segs[0]["ep_lens"]) es.tell_real_seg(solutions = solutions, function_values = costs, real_f = real_costs, segs = segs) best_solution = np.copy(es.result[0]) best_fitness = -es.result[1] rewbuffer.extend(es.result[3]["ep_rets"]) lenbuffer.extend(es.result[3]["ep_lens"]) logger.log("Generation:", es.countiter) logger.log("Best Solution Fitness:", best_fitness) pi_set_from_flat_params(best_solution) ob = ob_segs["ob"] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for observation normalization iters_so_far += 1 episodes_so_far += sum(lens)
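# compute_weight_decay and fitness_normalization above are project helpers
# whose definitions are not shown here. A hedged sketch of common choices: an
# L2 penalty added to each candidate's cost, and a z-score normalization that
# also returns the raw costs for logging. These are assumptions, not the
# project's actual implementations.
import numpy as np

def compute_weight_decay_sketch(decay_coef, solutions):
    solutions = np.asarray(solutions)
    return decay_coef * np.mean(solutions ** 2, axis=1)   # one penalty per candidate

def fitness_normalization_sketch(costs):
    costs = np.asarray(costs, dtype=np.float64)
    normalized = (costs - costs.mean()) / (costs.std() + 1e-8)
    return normalized, costs   # (values handed to the optimizer, raw values)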
def learn( make_env, make_policy, *, n_episodes, horizon, delta, gamma, max_iters, sampler=None, use_natural_gradient=False, #can be 'exact', 'approximate' fisher_reg=1e-2, iw_method='is', iw_norm='none', bound='J', line_search_type='parabola', save_weights=False, improvement_tol=0., center_return=False, render_after=None, max_offline_iters=100, callback=None, clipping=False, entropy='none', positive_return=False, reward_clustering='none'): np.set_printoptions(precision=3) max_samples = horizon * n_episodes if line_search_type == 'binary': line_search = line_search_binary elif line_search_type == 'parabola': line_search = line_search_parabola else: raise ValueError() # Building the environment env = make_env() ob_space = env.observation_space ac_space = env.action_space # Building the policy pi = make_policy('pi', ob_space, ac_space) oldpi = make_policy('oldpi', ob_space, ac_space) all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split('/')[1].startswith('pol') ] shapes = [U.intprod(var.get_shape().as_list()) for var in var_list] n_parameters = sum(shapes) # Placeholders ob_ = ob = U.get_placeholder_cached(name='ob') ac_ = pi.pdtype.sample_placeholder([max_samples], name='ac') mask_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='mask') rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='rew') disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='disc_rew') gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient') iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number') losses_with_name = [] # Policy densities target_log_pdf = pi.pd.logp(ac_) behavioral_log_pdf = oldpi.pd.logp(ac_) log_ratio = target_log_pdf - behavioral_log_pdf # Split operations disc_rew_split = tf.stack(tf.split(disc_rew_ * mask_, n_episodes)) rew_split = tf.stack(tf.split(rew_ * mask_, n_episodes)) log_ratio_split = tf.stack(tf.split(log_ratio * mask_, n_episodes)) target_log_pdf_split = tf.stack( tf.split(target_log_pdf * mask_, n_episodes)) behavioral_log_pdf_split = tf.stack( tf.split(behavioral_log_pdf * mask_, n_episodes)) mask_split = tf.stack(tf.split(mask_, n_episodes)) # Renyi divergence emp_d2_split = tf.stack( tf.split(pi.pd.renyi(oldpi.pd, 2) * mask_, n_episodes)) emp_d2_cum_split = tf.reduce_sum(emp_d2_split, axis=1) empirical_d2 = tf.reduce_mean(tf.exp(emp_d2_cum_split)) # Return ep_return = tf.reduce_sum(mask_split * disc_rew_split, axis=1) if clipping: rew_split = tf.clip_by_value(rew_split, -1, 1) if center_return: ep_return = ep_return - tf.reduce_mean(ep_return) rew_split = rew_split - (tf.reduce_sum(rew_split) / (tf.reduce_sum(mask_split) + 1e-24)) discounter = [pow(gamma, i) for i in range(0, horizon)] # Decreasing gamma discounter_tf = tf.constant(discounter) disc_rew_split = rew_split * discounter_tf return_mean = tf.reduce_mean(ep_return) return_std = U.reduce_std(ep_return) return_max = tf.reduce_max(ep_return) return_min = tf.reduce_min(ep_return) return_abs_max = tf.reduce_max(tf.abs(ep_return)) return_step_max = tf.reduce_max(tf.abs(rew_split)) # Max step reward return_step_mean = tf.abs(tf.reduce_mean(rew_split)) positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split)) negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split)) return_step_maxmin = tf.abs(positive_step_return_max - negative_step_return_max) losses_with_name.extend([(return_mean, 'InitialReturnMean'), (return_max, 'InitialReturnMax'), (return_min, 'InitialReturnMin'), 
(return_std, 'InitialReturnStd'), (empirical_d2, 'EmpiricalD2'), (return_step_max, 'ReturnStepMax'), (return_step_maxmin, 'ReturnStepMaxmin')]) if iw_method == 'pdis': # log_ratio_split cumulative sum log_ratio_cumsum = tf.cumsum(log_ratio_split, axis=1) # Exponentiate ratio_cumsum = tf.exp(log_ratio_cumsum) # Multiply by the step-wise reward (not episode) ratio_reward = ratio_cumsum * disc_rew_split # Average on episodes ratio_reward_per_episode = tf.reduce_sum(ratio_reward, axis=1) w_return_mean = tf.reduce_sum(ratio_reward_per_episode, axis=0) / n_episodes # Get d2(w0:t) with mask d2_w_0t = tf.exp(tf.cumsum(emp_d2_split, axis=1)) * mask_split # LEAVE THIS OUTSIDE # Sum d2(w0:t) over timesteps episode_d2_0t = tf.reduce_sum(d2_w_0t, axis=1) # Sample variance J_sample_variance = (1 / (n_episodes - 1)) * tf.reduce_sum( tf.square(ratio_reward_per_episode - w_return_mean)) losses_with_name.append((J_sample_variance, 'J_sample_variance')) losses_with_name.extend([(tf.reduce_max(ratio_cumsum), 'MaxIW'), (tf.reduce_min(ratio_cumsum), 'MinIW'), (tf.reduce_mean(ratio_cumsum), 'MeanIW'), (U.reduce_std(ratio_cumsum), 'StdIW')]) losses_with_name.extend([(tf.reduce_max(d2_w_0t), 'MaxD2w0t'), (tf.reduce_min(d2_w_0t), 'MinD2w0t'), (tf.reduce_mean(d2_w_0t), 'MeanD2w0t'), (U.reduce_std(d2_w_0t), 'StdD2w0t')]) elif iw_method == 'is': iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1)) if iw_norm == 'none': iwn = iw / n_episodes w_return_mean = tf.reduce_sum(iwn * ep_return) J_sample_variance = (1 / (n_episodes - 1)) * tf.reduce_sum( tf.square(iw * ep_return - w_return_mean)) losses_with_name.append((J_sample_variance, 'J_sample_variance')) elif iw_norm == 'sn': iwn = iw / tf.reduce_sum(iw) w_return_mean = tf.reduce_sum(iwn * ep_return) elif iw_norm == 'regression': iwn = iw / n_episodes mean_iw = tf.reduce_mean(iw) beta = tf.reduce_sum( (iw - mean_iw) * ep_return * iw) / (tf.reduce_sum( (iw - mean_iw)**2) + 1e-24) w_return_mean = tf.reduce_mean(iw * ep_return - beta * (iw - 1)) else: raise NotImplementedError() ess_classic = tf.linalg.norm(iw, 1)**2 / tf.linalg.norm(iw, 2)**2 sqrt_ess_classic = tf.linalg.norm(iw, 1) / tf.linalg.norm(iw, 2) ess_renyi = n_episodes / empirical_d2 losses_with_name.extend([(tf.reduce_max(iwn), 'MaxIWNorm'), (tf.reduce_min(iwn), 'MinIWNorm'), (tf.reduce_mean(iwn), 'MeanIWNorm'), (U.reduce_std(iwn), 'StdIWNorm'), (tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'), (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'), (ess_classic, 'ESSClassic'), (ess_renyi, 'ESSRenyi')]) elif iw_method == 'rbis': # Check if we need to cluster rewards rew_clustering_options = reward_clustering.split(':') if reward_clustering == 'none': pass # Do nothing elif rew_clustering_options[0] == 'global': assert len( rew_clustering_options ) == 2, "Reward clustering: Provide the correct number of parameters" N = int(rew_clustering_options[1]) tf.add_to_collection( 'prints', tf.Print(ep_return, [ep_return], 'ep_return', summarize=20)) global_rew_min = tf.Variable(float('+inf'), trainable=False) global_rew_max = tf.Variable(float('-inf'), trainable=False) rew_min = tf.reduce_min(ep_return) rew_max = tf.reduce_max(ep_return) global_rew_min = tf.assign(global_rew_min, tf.minimum(global_rew_min, rew_min)) global_rew_max = tf.assign(global_rew_max, tf.maximum(global_rew_max, rew_max)) interval_size = (global_rew_max - global_rew_min) / N ep_return = tf.floordiv(ep_return, interval_size) * interval_size elif rew_clustering_options[0] == 'batch': assert len( rew_clustering_options ) == 2, "Reward 
clustering: Provide the correct number of parameters" N = int(rew_clustering_options[1]) rew_min = tf.reduce_min(ep_return) rew_max = tf.reduce_max(ep_return) interval_size = (rew_max - rew_min) / N ep_return = tf.floordiv(ep_return, interval_size) * interval_size elif rew_clustering_options[0] == 'manual': assert len( rew_clustering_options ) == 4, "Reward clustering: Provide the correct number of parameters" N, rew_min, rew_max = map(int, rew_clustering_options[1:]) interval_size = (rew_max - rew_min) / N # Clip to avoid overflow and cluster ep_return = tf.clip_by_value(ep_return, rew_min, rew_max) ep_return = tf.floordiv(ep_return, interval_size) * interval_size else: raise Exception('Unrecognized reward clustering scheme.') # Get pdfs for episodes target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1) behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdf_split, axis=1) # Normalize log_proba (avoid as overflows as possible) normalization_factor = tf.reduce_mean( tf.stack([target_log_pdf_episode, behavioral_log_pdf_episode])) target_norm_log_pdf_episode = target_log_pdf_episode - normalization_factor behavioral_norm_log_pdf_episode = behavioral_log_pdf_episode - normalization_factor # Exponentiate target_pdf_episode = tf.clip_by_value( tf.cast(tf.exp(target_norm_log_pdf_episode), tf.float64), 1e-300, 1e+300) behavioral_pdf_episode = tf.clip_by_value( tf.cast(tf.exp(behavioral_norm_log_pdf_episode), tf.float64), 1e-300, 1e+300) tf.add_to_collection( 'asserts', tf.assert_positive(target_pdf_episode, name='target_pdf_positive')) tf.add_to_collection( 'asserts', tf.assert_positive(behavioral_pdf_episode, name='behavioral_pdf_positive')) # Compute the merging matrix (reward-clustering) and the number of clusters reward_unique, reward_indexes = tf.unique(ep_return) episode_clustering_matrix = tf.cast( tf.one_hot(reward_indexes, n_episodes), tf.float64) max_index = tf.reduce_max(reward_indexes) + 1 trajectories_per_cluster = tf.reduce_sum(episode_clustering_matrix, axis=0)[:max_index] tf.add_to_collection( 'asserts', tf.assert_positive(tf.reduce_sum(episode_clustering_matrix, axis=0)[:max_index], name='clustering_matrix')) # Get the clustered pdfs clustered_target_pdf = tf.matmul( tf.reshape(target_pdf_episode, (1, -1)), episode_clustering_matrix)[0][:max_index] clustered_behavioral_pdf = tf.matmul( tf.reshape(behavioral_pdf_episode, (1, -1)), episode_clustering_matrix)[0][:max_index] tf.add_to_collection( 'asserts', tf.assert_positive(clustered_target_pdf, name='clust_target_pdf_positive')) tf.add_to_collection( 'asserts', tf.assert_positive(clustered_behavioral_pdf, name='clust_behavioral_pdf_positive')) # Compute the J ratio_clustered = clustered_target_pdf / clustered_behavioral_pdf #ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique # ---- No cluster cardinality ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique * tf.cast( trajectories_per_cluster, tf.float32) # ---- Cluster cardinality #w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(max_index, tf.float32) # ---- No cluster cardinality w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast( n_episodes, tf.float32) # ---- Cluster cardinality # Divergences ess_classic = tf.linalg.norm(ratio_reward, 1)**2 / tf.linalg.norm( ratio_reward, 2)**2 sqrt_ess_classic = tf.linalg.norm(ratio_reward, 1) / tf.linalg.norm( ratio_reward, 2) ess_renyi = n_episodes / empirical_d2 # Summaries losses_with_name.extend([(tf.reduce_max(ratio_clustered), 'MaxIW'), (tf.reduce_min(ratio_clustered), 
'MinIW'), (tf.reduce_mean(ratio_clustered), 'MeanIW'), (U.reduce_std(ratio_clustered), 'StdIW'), (1 - (max_index / n_episodes), 'RewardCompression'), (ess_classic, 'ESSClassic'), (ess_renyi, 'ESSRenyi')]) else: raise NotImplementedError() if bound == 'J': bound_ = w_return_mean elif bound == 'std-d2': bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi)) * return_std elif bound == 'max-d2': var_estimate = tf.sqrt( (1 - delta) / (delta * ess_renyi)) * return_abs_max bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi)) * return_abs_max elif bound == 'max-ess': bound_ = w_return_mean - tf.sqrt( (1 - delta) / delta) / sqrt_ess_classic * return_abs_max elif bound == 'std-ess': bound_ = w_return_mean - tf.sqrt( (1 - delta) / delta) / sqrt_ess_classic * return_std elif bound == 'pdis-max-d2': # Discount factor if gamma >= 1: discounter = [ float(1 + 2 * (horizon - t - 1)) for t in range(0, horizon) ] else: def f(t): return pow(gamma, 2 * t) + ( 2 * pow(gamma, t) * (pow(gamma, t + 1) - pow(gamma, horizon))) / (1 - gamma) discounter = [f(t) for t in range(0, horizon)] discounter_tf = tf.constant(discounter) mean_episode_d2 = tf.reduce_sum( d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24) discounted_d2 = mean_episode_d2 * discounter_tf # Discounted d2 discounted_total_d2 = tf.reduce_sum(discounted_d2, axis=0) # Sum over time bound_ = w_return_mean - tf.sqrt( (1 - delta) * discounted_total_d2 / (delta * n_episodes)) * return_step_max elif bound == 'pdis-mean-d2': # Discount factor if gamma >= 1: discounter = [ float(1 + 2 * (horizon - t - 1)) for t in range(0, horizon) ] else: def f(t): return pow(gamma, 2 * t) + ( 2 * pow(gamma, t) * (pow(gamma, t + 1) - pow(gamma, horizon))) / (1 - gamma) discounter = [f(t) for t in range(0, horizon)] discounter_tf = tf.constant(discounter) mean_episode_d2 = tf.reduce_sum( d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24) discounted_d2 = mean_episode_d2 * discounter_tf # Discounted d2 discounted_total_d2 = tf.reduce_sum(discounted_d2, axis=0) # Sum over time bound_ = w_return_mean - tf.sqrt( (1 - delta) * discounted_total_d2 / (delta * n_episodes)) * return_step_mean else: raise NotImplementedError() # Policy entropy for exploration ent = pi.pd.entropy() meanent = tf.reduce_mean(ent) losses_with_name.append((meanent, 'MeanEntropy')) # Add policy entropy bonus if entropy != 'none': scheme, v1, v2 = entropy.split(':') if scheme == 'step': entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1), lambda: float(0.0)) losses_with_name.append((entcoeff, 'EntropyCoefficient')) entbonus = entcoeff * meanent bound_ = bound_ + entbonus elif scheme == 'lin': ip = tf.cast(iter_number_ / max_iters, tf.float32) entcoeff_decay = tf.maximum( 0.0, float(v2) + (float(v1) - float(v2)) * (1.0 - ip)) losses_with_name.append((entcoeff_decay, 'EntropyCoefficient')) entbonus = entcoeff_decay * meanent bound_ = bound_ + entbonus elif scheme == 'exp': ent_f = tf.exp( -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1) losses_with_name.append((ent_f, 'EntropyCoefficient')) bound_ = bound_ + ent_f * meanent else: raise Exception('Unrecognized entropy scheme.') losses_with_name.append((w_return_mean, 'ReturnMeanIW')) losses_with_name.append((bound_, 'Bound')) losses, loss_names = map(list, zip(*losses_with_name)) if use_natural_gradient: p = tf.placeholder(dtype=tf.float32, shape=[None]) target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1) grad_logprob = U.flatgrad( tf.stop_gradient(iwn) * 
target_logpdf_episode, var_list) dot_product = tf.reduce_sum(grad_logprob * p) hess_logprob = U.flatgrad(dot_product, var_list) compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob]) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assert_ops = tf.group(*tf.get_collection('asserts')) print_ops = tf.group(*tf.get_collection('prints')) compute_lossandgrad = U.function( [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_grad = U.function( [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_bound = U.function( [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], [bound_, assert_ops, print_ops]) compute_losses = U.function( [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], losses) #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, mask_], [ratio_cumsum, discounted_ratio]) set_parameter = U.SetFromFlat(var_list) get_parameter = U.GetFlat(var_list) if sampler is None: seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True) sampler = type("SequentialSampler", (object, ), { "collect": lambda self, _: seg_gen.__next__() })() U.initialize() # Starting optimizing episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=n_episodes) rewbuffer = deque(maxlen=n_episodes) while True: iters_so_far += 1 if render_after is not None and iters_so_far % render_after == 0: if hasattr(env, 'render'): render(env, pi, horizon) if callback: callback(locals(), globals()) if iters_so_far >= max_iters: print('Finised...') break logger.log('********** Iteration %i ************' % iters_so_far) theta = get_parameter() with timed('sampling'): seg = sampler.collect(theta) add_disc_rew(seg, gamma) lens, rets = seg['ep_lens'], seg['ep_rets'] lenbuffer.extend(lens) rewbuffer.extend(rets) episodes_so_far += len(lens) timesteps_so_far += sum(lens) args = ob, ac, rew, disc_rew, mask, iter_number = seg['ob'], seg[ 'ac'], seg['rew'], seg['disc_rew'], seg['mask'], iters_so_far assign_old_eq_new() def evaluate_loss(): loss = compute_bound(*args) return loss[0] def evaluate_gradient(): gradient = compute_grad(*args) return gradient[0] if use_natural_gradient: def evaluate_fisher_vector_prod(x): return compute_linear_operator(x, *args)[0] + fisher_reg * x def evaluate_natural_gradient(g): return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0) else: evaluate_natural_gradient = None with timed('summaries before'): logger.record_tabular("Iteration", iters_so_far) logger.record_tabular("InitialBound", evaluate_loss()) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if save_weights: logger.record_tabular('Weights', str(get_parameter())) import pickle file = open('checkpoint.pkl', 'wb') pickle.dump(theta, file) with timed("offline optimization"): theta, improvement = optimize_offline( theta, set_parameter, line_search, evaluate_loss, evaluate_gradient, evaluate_natural_gradient, max_offline_ite=max_offline_iters) set_parameter(theta) with timed('summaries after'): meanlosses = np.array(compute_losses(*args)) for (lossname, 
lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.dump_tabular() env.close()
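# --- Illustrative sketch (not part of the original source) ---------------------
# The learn() above builds off-policy return estimators from per-step importance
# weights (plain IS, self-normalized IS, per-decision IS, reward-clustered IS) and
# penalizes them with a confidence term driven by an effective sample size derived
# from the exponentiated 2-Renyi divergence. Below is a minimal NumPy sketch of the
# plain and self-normalized estimators and of the ESSClassic / ESSRenyi diagnostics
# and 'std-d2' style bound logged above; the toy inputs and the function name are
# assumptions, not part of the original code.
import numpy as np

def is_return_estimates(log_ratio_per_step, ep_return, emp_d2, delta=0.2):
    """log_ratio_per_step: (n_episodes, horizon) per-step log pi_target/pi_behavioral
    ep_return:           (n_episodes,) per-episode returns
    emp_d2:              scalar estimate of exp(D_2(pi_target || pi_behavioral))"""
    iw = np.exp(log_ratio_per_step.sum(axis=1))            # per-episode importance weights
    n = len(ep_return)
    j_plain = np.sum(iw / n * ep_return)                   # iw_norm == 'none'
    j_sn = np.sum(iw / iw.sum() * ep_return)               # iw_norm == 'sn'
    ess_classic = np.linalg.norm(iw, 1) ** 2 / np.linalg.norm(iw, 2) ** 2
    ess_renyi = n / emp_d2
    # 'std-d2' style lower bound on the return, with the empirical std of the
    # returns standing in for return_std.
    bound = j_plain - np.sqrt((1 - delta) / (delta * ess_renyi)) * ep_return.std()
    return j_plain, j_sn, ess_classic, ess_renyi, bound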
def __init__(self, a_name, env, policy_func, par): self.env = env self.timesteps_per_batch = par.timesteps_per_batch self.max_kl = par.max_kl self.cg_iters = par.cg_iters self.gamma = par.gamma self.lam = par.lam # advantage estimation self.entcoeff = par.entcoeff self.cg_damping = par.cg_damping self.vf_stepsize = par.vf_stepsize self.vf_iters = par.vf_iters self.max_timesteps = par.max_timesteps self.max_episodes = par.max_episodes self.max_iters = par.max_iters self.callback = par.callback, # you can do anything in the callback, since it takes locals(), globals() self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- self.ob_space = self.env.observation_space self.ac_space = self.env.action_space self.pi = policy_func(a_name, self.ob_space, self.ac_space) self.oldpi = policy_func("oldpi" + a_name, self.ob_space, self.ac_space) self.atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) self.ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return self.ob = U.get_placeholder_cached(name="ob" + str(TRPO_agent_new.index2)) self.ac = self.pi.pdtype.sample_placeholder([None]) self.kloldnew = self.oldpi.pd.kl(self.pi.pd) self.ent = self.pi.pd.entropy() meankl = U.mean(self.kloldnew) meanent = U.mean(self.ent) entbonus = self.entcoeff * meanent self.vferr = U.mean(tf.square(self.pi.vpred - self.ret)) ratio = tf.exp(self.pi.pd.logp(self.ac) - self.oldpi.pd.logp(self.ac)) # advantage * pnew / pold surrgain = U.mean(ratio * self.atarg) optimgain = surrgain + entbonus self.losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] self.dist = meankl all_var_list = self.pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] self.vfadam = MpiAdam(vf_var_list) self.get_flat = U.GetFlat(var_list) self.set_from_flat = U.SetFromFlat(var_list) self.klgrads = tf.gradients(self.dist, var_list) self.flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan" + str(TRPO_agent_new.index2)) shapes = [var.get_shape().as_list() for var in var_list] start = 0 self.tangents = [] for shape in shapes: sz = U.intprod(shape) self.tangents.append( tf.reshape(self.flat_tangent[start:start + sz], shape)) start += sz self.gvp = tf.add_n([ U.sum(g * tangent) for (g, tangent) in zipsame(self.klgrads, self.tangents) ]) #pylint: disable=E1111 self.fvp = U.flatgrad(self.gvp, var_list) self.assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( self.oldpi.get_variables(), self.pi.get_variables()) ]) self.compute_losses = U.function([self.ob, self.ac, self.atarg], self.losses) self.compute_lossandgrad = U.function( [self.ob, self.ac, self.atarg], self.losses + [U.flatgrad(optimgain, var_list)]) self.compute_fvp = U.function( [self.flat_tangent, self.ob, self.ac, self.atarg], self.fvp) self.compute_vflossandgrad = U.function([self.ob, self.ret], U.flatgrad( self.vferr, vf_var_list)) TRPO_agent_new.index2 += 1 U.initialize() self.th_init = self.get_flat() MPI.COMM_WORLD.Bcast(self.th_init, root=0) self.set_from_flat(self.th_init) self.vfadam.sync() print("Init param sum", self.th_init.sum(), flush=True)
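# --- Illustrative sketch (not part of the original source) ---------------------
# The constructor above exposes compute_fvp, a Fisher-vector product obtained as a
# Hessian-vector product of the mean KL (the gradient of (grad KL . tangent)). Such
# an operator is typically consumed by conjugate gradient to solve F x = g for the
# natural-gradient direction; a generic NumPy version of that solver is sketched
# below. The damping constant in the usage comment is an assumption.
import numpy as np

def conjugate_gradient(fvp, g, cg_iters=10, residual_tol=1e-10):
    """Solve F x = g given only the matrix-vector product fvp(v) = F v."""
    x = np.zeros_like(g)
    r = g.copy()                      # residual g - F x, with x = 0
    p = r.copy()                      # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = fvp(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x

# Example use with the agent above (hypothetical damping constant):
#   damped_fvp = lambda v: agent.compute_fvp(v, ob, ac, atarg) + 0.1 * v
#   stepdir = conjugate_gradient(damped_fvp, policy_gradient, cg_iters=agent.cg_iters)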
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation # CMAES max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, max_v_train_iter, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) seed, env_id): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy backup_pi = policy_fn( "backup_pi", ob_space, ac_space ) # Construct a network for every individual to adapt during the es evolution pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule layer_clip = tf.placeholder( name='layer_clip', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule bound_coeff = tf.placeholder( name='bound_coeff', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult * layer_clip # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - (oldpi.pd.logp(ac) + 1e-8)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) vf_losses = [vf_loss] vf_loss_names = ["vf_loss"] pol_loss = pol_surr + pol_entpen total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") ] pol_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") ] layer_var_list = [] for i in range(pi.num_hid_layers): layer_var_list.append([ v for v in pol_var_list if v.name.split("/")[2].startswith('fc%i' % (i + 1)) ]) logstd_var_list = [ v for v in pol_var_list if v.name.split("/")[2].startswith("logstd") ] if len(logstd_var_list) != 0: layer_var_list.append([ v for v in pol_var_list if v.name.split("/")[2].startswith("final") ] + logstd_var_list) vf_lossandgrad = U.function([ob, ac, ret, lrmult], vf_losses + [U.flatgrad(vf_loss, vf_var_list)]) lossandgrad = U.function([ob, ac, atarg, ret, lrmult, layer_clip], losses + [U.flatgrad(total_loss, var_list)]) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) adam = 
MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for ( backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables()) ]) assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v ) in zipsame(pi.get_variables(), backup_pi.get_variables()) ]) # Compute all losses compute_pol_losses = U.function([ob, ac, atarg, ret, lrmult, layer_clip], [pol_loss, pol_surr, pol_entpen, meankl]) compute_v_pred = U.function([ob], [pi.vpred]) a_prob = tf.exp(pi.pd.logp(ac)) compute_a_prob = U.function([ob, ac], [a_prob]) U.initialize() layer_set_operate_list = [] layer_get_operate_list = [] for var in layer_var_list: set_pi_layer_flat_params = U.SetFromFlat(var) layer_set_operate_list.append(set_pi_layer_flat_params) get_pi_layer_flat_params = U.GetFlat(var) layer_get_operate_list.append(get_pi_layer_flat_params) # get_pi_layer_flat_params = U.GetFlat(pol_var_list) # set_pi_layer_flat_params = U.SetFromFlat(pol_var_list) vf_adam.sync() adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness episodes_so_far = 0 timesteps_so_far = 0 ppo_timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards best_fitness = -np.inf eval_seq = traj_segment_generator_eval(pi, env, timesteps_per_actorbatch, stochastic=False) # eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic = True) # For evaluation seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_seq=eval_seq) # For train V Func assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" indices = [] # maintain all selected indices for each iteration opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 # opt['seed'] = seed opt['AdaptSigma'] = True # opt['bounds'] = bounds # opt['tolstagnation'] = 20 ess = [] seg = None segs = None sum_vpred = [] while True: if max_timesteps and timesteps_so_far >= max_timesteps: print("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: print("Max episodes") break elif max_iters and iters_so_far >= max_iters: print("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: print("Max time") break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps), 0) else: raise NotImplementedError # epsilon = max(0.5 - float(timesteps_so_far) / (max_timesteps), 0) * cur_lrmult epsilon = max(0.5 * cur_lrmult, 0) # epsilon = 0.2 sigma_adapted = max(sigma * cur_lrmult, 1e-8) # sigma_adapted = max(max(sigma - float(timesteps_so_far) / (5000 * max_timesteps), 0) * cur_lrmult, 1e-8) # cmean_adapted = max(1.0 - float(timesteps_so_far) / (max_timesteps), 1e-8) # cmean_adapted = max(0.8 - float(time˚steps_so_far) / (2*max_timesteps), 1e-8) # if timesteps_so_far % max_timesteps == 10: max_v_train_iter = int( max( max_v_train_iter * (1 - timesteps_so_far / (0.5 * max_timesteps)), 1)) logger.log("********** Iteration %i 
************" % iters_so_far) if iters_so_far == 0: eval_seg = eval_seq.__next__() rewbuffer.extend(eval_seg["ep_rets"]) lenbuffer.extend(eval_seg["ep_lens"]) result_record() # Repository Train train_segs = {} seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) if hasattr(pi, "ob_rms"): pi.ob_rms.update( seg["ob"]) # update running mean/std for normalization # rewbuffer.extend(seg["ep_rets"]) # lenbuffer.extend(seg["ep_lens"]) # # if iters_so_far == 0: # result_record() assign_old_eq_new() # set old parameter values to new parameter values if segs is None: segs = seg segs["v_target"] = np.zeros(len(seg["ob"]), 'float32') elif len(segs["ob"]) >= 50000: segs["ob"] = np.take(segs["ob"], np.arange(timesteps_per_actorbatch, len(segs["ob"])), axis=0) segs["next_ob"] = np.take(segs["next_ob"], np.arange(timesteps_per_actorbatch, len(segs["next_ob"])), axis=0) segs["ac"] = np.take(segs["ac"], np.arange(timesteps_per_actorbatch, len(segs["ac"])), axis=0) segs["rew"] = np.take(segs["rew"], np.arange(timesteps_per_actorbatch, len(segs["rew"])), axis=0) segs["vpred"] = np.take(segs["vpred"], np.arange(timesteps_per_actorbatch, len(segs["vpred"])), axis=0) segs["act_props"] = np.take(segs["act_props"], np.arange(timesteps_per_actorbatch, len(segs["act_props"])), axis=0) segs["new"] = np.take(segs["new"], np.arange(timesteps_per_actorbatch, len(segs["new"])), axis=0) segs["adv"] = np.take(segs["adv"], np.arange(timesteps_per_actorbatch, len(segs["adv"])), axis=0) segs["tdlamret"] = np.take(segs["tdlamret"], np.arange(timesteps_per_actorbatch, len(segs["tdlamret"])), axis=0) segs["ep_rets"] = np.take(segs["ep_rets"], np.arange(timesteps_per_actorbatch, len(segs["ep_rets"])), axis=0) segs["ep_lens"] = np.take(segs["ep_lens"], np.arange(timesteps_per_actorbatch, len(segs["ep_lens"])), axis=0) segs["v_target"] = np.take(segs["v_target"], np.arange(timesteps_per_actorbatch, len(segs["v_target"])), axis=0) segs["ob"] = np.append(segs['ob'], seg['ob'], axis=0) segs["next_ob"] = np.append(segs['next_ob'], seg['next_ob'], axis=0) segs["ac"] = np.append(segs['ac'], seg['ac'], axis=0) segs["rew"] = np.append(segs['rew'], seg['rew'], axis=0) segs["vpred"] = np.append(segs['vpred'], seg['vpred'], axis=0) segs["act_props"] = np.append(segs['act_props'], seg['act_props'], axis=0) segs["new"] = np.append(segs['new'], seg['new'], axis=0) segs["adv"] = np.append(segs['adv'], seg['adv'], axis=0) segs["tdlamret"] = np.append(segs['tdlamret'], seg['tdlamret'], axis=0) segs["ep_rets"] = np.append(segs['ep_rets'], seg['ep_rets'], axis=0) segs["ep_lens"] = np.append(segs['ep_lens'], seg['ep_lens'], axis=0) segs["v_target"] = np.append(segs['v_target'], np.zeros(len(seg["ob"]), 'float32'), axis=0) else: segs["ob"] = np.append(segs['ob'], seg['ob'], axis=0) segs["next_ob"] = np.append(segs['next_ob'], seg['next_ob'], axis=0) segs["ac"] = np.append(segs['ac'], seg['ac'], axis=0) segs["rew"] = np.append(segs['rew'], seg['rew'], axis=0) segs["vpred"] = np.append(segs['vpred'], seg['vpred'], axis=0) segs["act_props"] = np.append(segs['act_props'], seg['act_props'], axis=0) segs["new"] = np.append(segs['new'], seg['new'], axis=0) segs["adv"] = np.append(segs['adv'], seg['adv'], axis=0) segs["tdlamret"] = np.append(segs['tdlamret'], seg['tdlamret'], axis=0) segs["ep_rets"] = np.append(segs['ep_rets'], seg['ep_rets'], axis=0) segs["ep_lens"] = np.append(segs['ep_lens'], seg['ep_lens'], axis=0) segs["v_target"] = np.append(segs['v_target'], np.zeros(len(seg["ob"]), 'float32'), axis=0) if iters_so_far == 0: ob, ac, tdlamret = 
seg["ob"], seg["ac"], seg["tdlamret"] d = Dataset(dict(ob=ob, ac=ac, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Train V function # logger.log("Catchup Training V Func and Evaluating V Func Losses") for _ in range(max_v_train_iter): for batch in d.iterate_once(optim_batchsize): *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0))) else: # Update v target new = segs["new"] rew = segs["rew"] act_prob = np.asarray(compute_a_prob(segs["ob"], segs["ac"])).T importance_ratio = np.squeeze(act_prob) / ( segs["act_props"] + np.ones(segs["act_props"].shape) * 1e-8) segs["v_target"] = importance_ratio * (1 / np.sum(importance_ratio)) * \ np.squeeze( rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(segs["next_ob"])) # train_segs["v_target"] = rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(train_segs["next_ob"]) if len(segs["ob"]) >= 20000: train_times = int(max_v_train_iter / 2) if int(max_v_train_iter / 2) > 0 else 1 else: train_times = 2 for i in range(train_times): selected_train_index = np.random.choice( range(len(segs["ob"])), timesteps_per_actorbatch, replace=False) train_segs["ob"] = np.take(segs["ob"], selected_train_index, axis=0) train_segs["next_ob"] = np.take(segs["next_ob"], selected_train_index, axis=0) train_segs["ac"] = np.take(segs["ac"], selected_train_index, axis=0) train_segs["rew"] = np.take(segs["rew"], selected_train_index, axis=0) train_segs["vpred"] = np.take(segs["vpred"], selected_train_index, axis=0) train_segs["new"] = np.take(segs["new"], selected_train_index, axis=0) train_segs["adv"] = np.take(segs["adv"], selected_train_index, axis=0) train_segs["tdlamret"] = np.take(segs["tdlamret"], selected_train_index, axis=0) train_segs["v_target"] = np.take(segs["v_target"], selected_train_index, axis=0) # ob, ac, v_target = train_segs["ob"], train_segs[ "ac"], train_segs["v_target"] d = Dataset(dict(ob=ob, ac=ac, vtarg=v_target), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Train V function # logger.log("Training V Func and Evaluating V Func Losses") # Train V function # logger.log("Catchup Training V Func and Evaluating V Func Losses") # logger.log("Train V - "+str(_)) for _ in range(max_v_train_iter): for batch in d.iterate_once(optim_batchsize): *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0))) # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape) # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1]) # add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, v_target = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=v_target), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Local search for _ in range(optim_epochs): for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, 1 / 4) adam.update(g, optim_stepsize * cur_lrmult) # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape) # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1]) # add_vtarg_and_adv(seg, gamma, lam) ob_po, ac_po, 
atarg_po, tdlamret_po = seg["ob"], seg["ac"], seg[ "adv"], seg["tdlamret"] atarg_po = (atarg_po - atarg_po.mean()) / atarg_po.std( ) # standardized advantage function estimate # opt['CMA_cmean'] = cmean_adapted # assign_old_eq_new() # set old parameter values to new parameter values for i in range(len(layer_var_list)): # CMAES Train Policy assign_backup_eq_new() # backup current policy flatten_weights = layer_get_operate_list[i]() if len(indices) < len(layer_var_list): selected_index, init_weights = uniform_select( flatten_weights, 0.5) # 0.5 means 50% proportion of params are selected indices.append(selected_index) else: rand = np.random.uniform() # print("Random-Number:", rand) # print("Epsilon:", epsilon) if rand < epsilon: selected_index, init_weights = uniform_select( flatten_weights, 0.5) indices.append(selected_index) # logger.log("Random: select new weights") else: selected_index = indices[i] init_weights = np.take(flatten_weights, selected_index) es = cma.CMAEvolutionStrategy(init_weights, sigma_adapted, opt) while True: if es.countiter >= gensize: # logger.log("Max generations for current layer") break # logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter)) # logger.log("Sigma=" + str(es.sigma)) # solutions = es.ask(sigma_fac = max(cur_lrmult, 1e-8)) solutions = es.ask() # solutions = [np.clip(solution, -5.0, 5.0).tolist() for solution in solutions] costs = [] lens = [] assign_backup_eq_new() # backup current policy for id, solution in enumerate(solutions): np.put(flatten_weights, selected_index, solution) layer_set_operate_list[i](flatten_weights) cost = compute_pol_losses(ob_po, ac_po, atarg_po, tdlamret_po, cur_lrmult, 1 / 4 * (i + 1)) costs.append(cost[0]) assign_new_eq_backup() # Weights decay l2_decay = compute_weight_decay(0.01, solutions) costs += l2_decay costs, real_costs = fitness_rank(costs) # logger.log("real_costs:"+str(real_costs)) # best_solution = np.copy(es.result[0]) # best_fitness = -es.result[1] es.tell_real_seg(solutions=solutions, function_values=costs, real_f=real_costs, segs=None) # best_solution = np.copy(solutions[np.argmin(costs)]) # best_fitness = -real_costs[np.argmin(costs)] best_solution = es.result[0] best_fitness = es.result[1] np.put(flatten_weights, selected_index, best_solution) layer_set_operate_list[i](flatten_weights) # logger.log("Update the layer") # best_solution = es.result[0] # best_fitness = es.result[1] # logger.log("Best Solution Fitness:" + str(best_fitness)) # set_pi_flat_params(best_solution) import gc gc.collect() iters_so_far += 1 episodes_so_far += sum(lens)
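# --- Illustrative sketch (not part of the original source) ---------------------
# In the loop above each policy layer is refined by CMA-ES over a randomly chosen
# 50% subset of its flattened weights, with an L2 weight-decay penalty added to the
# candidate costs before they are handed back to the optimizer. The stripped-down
# ask/tell loop below shows that pattern with the standard `cma` API (the original
# uses a customized tell_real_seg with rank-shaped fitness); the objective callable
# and the constants here are assumptions.
import numpy as np
import cma

def cmaes_refine_subset(flat_weights, objective, sigma0=0.1, gens=10, frac=0.5):
    """Optimize a random subset of a flat weight vector with a CMA-ES ask/tell loop."""
    idx = np.random.choice(len(flat_weights), int(frac * len(flat_weights)), replace=False)
    es = cma.CMAEvolutionStrategy(flat_weights[idx], sigma0, {'verb_disp': 0})
    for _ in range(gens):
        candidates = es.ask()                              # sample candidate sub-vectors
        costs = []
        for cand in candidates:
            trial = flat_weights.copy()
            np.put(trial, idx, cand)                       # patch the subset into the full vector
            costs.append(objective(trial) + 0.01 * np.mean(np.square(cand)))  # cost + L2 decay
        es.tell(candidates, costs)
    np.put(flat_weights, idx, es.result[0])                # keep the best sub-vector found
    return flat_weights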
def learn( # =========== modified part begins =========== # env_id, seed, robot, # robot class with GMM params joint_optimization_iters, # total number of joint optimization iterations design_iters, # number of samples when updating physical design in each joint optimization iteration policy_iters, # number of samples when updating robot policy in each joint optimization iteration # ============ modified part ends ============ # policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # ================================== modification 1 ================================== # """ input: replace "env" (env class) with "env_id" (string) add "seed" (int) reason: to enable env.make() during training modification detail: add following lines into learn() env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) env.seed(seed) env.close() # added at the end of learn() """ import roboschool, gym from baselines import bench env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) env.seed(seed) # ================================== modification 1 ================================== # # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space # policy_func is the initialization of NN # NN structure: # state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value) # num_hid_layers, hid_size: set in the file calls "learn" pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # placeholder for "ob" # created in mlppolicy.py ob = U.get_placeholder_cached(name="ob") # placeholder for "ac" # in common/distribution.py ac = pi.pdtype.sample_placeholder([None]) # KL divergence and Entropy, defined in common/distribution.py kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) # pol_entpen: Entropy Bounus encourages exploration # entcoeff: entropy coefficient, defined in PPO page 5, Equ. (9) pol_entpen = (-entcoeff) * meanent # probability ration, defined in PPO page 3 ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold # Surrogate Goal # defined in PPO page 3, Equ (7) surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) # Value Function Loss: square error loss for ||v_pred - v_target|| vf_loss = U.mean(tf.square(pi.vpred - ret)) # Total_loss = L^CLIP - Value Function Loss + Entropy Bounus # defined in PPO page 5, Equ. 
(9) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) # adam optimizer? adam = MpiAdam(var_list, epsilon=adam_epsilon) # oldpi = pi assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) # Why we need this line? compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # ================================== modification 2 ================================== # for joint_optimization_iter in range(joint_optimization_iters): U.save_state('/home/yetong/Desktop/Project/models/model{}.ckpt'.format( joint_optimization_iter)) logger.log("joint optimization progree: {}/{}".format( joint_optimization_iter, joint_optimization_iters)) # ================================== update physical design ================================== # if joint_optimization_iter > 20: Rewards_plus = np.zeros(design_iters) Rewards_minum = np.zeros(design_iters) params = robot.sample(design_iters, to_update=True) for i, param in enumerate(params): robot.modify_file(param) env = gym.make(env_id) # myenv = env.env # pdb.set_trace() env = bench.Monitor(env, logger.get_dir()) R = episode_generator(pi, env, gamma, stochastic=True) logger.log("\t update physical design: %d/%d, rew: %f" % (i, 2 * design_iters, R)) if i % 2 == 0: Rewards_plus[int(i / 2)] = R else: Rewards_minum[int(i / 2)] = R logger.log("prev_mu: ", robot.params_mu) logger.log("prev_sig: ", robot.params_sig) robot.update(Rewards_plus, Rewards_minum) logger.log("mu: ", robot.params_mu) logger.log("sig: ", robot.params_sig) # ================================== update policy ================================== # # params = robot.sample(design_iters) params = [robot.params_mu] for param in params: # reinitialize env robot.modify_file(param) env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) # ================================== modification 2 ================================== # # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([ max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0 ]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break # annealing for stepsize parameters (epsilon and adam) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max( 1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg[ "vpred"] # 
predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy # oldpi = pi # set old parameter values to new parameter values assign_old_eq_new() logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular( "ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather( lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() # ================================== modification 1 ================================== # env.close()
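# --- Illustrative sketch (not part of the original source) ---------------------
# Both the design-update and the policy-update loops above call add_vtarg_and_adv()
# to attach GAE(lambda) advantages and lambda-returns to a sampled segment before
# the PPO epochs. The sketch below mirrors the standard baselines-style computation
# this code appears to rely on; field names follow the seg dictionaries used above
# ('new' marks episode starts, 'nextvpred' is the bootstrap value).
import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    """Compute seg['adv'] (GAE advantages) and seg['tdlamret'] (lambda-returns) in place."""
    new = np.append(seg["new"], 0)                         # episode-start flags, padded
    vpred = np.append(seg["vpred"], seg["nextvpred"])      # bootstrap after the last step
    T = len(seg["rew"])
    adv = np.empty(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]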
def learn( env, policy_fn, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entc=0.5, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, i_trial): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) oldpi = policy_fn("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) entp = tf.placeholder(dtype=tf.float32, shape=[]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entp * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "loss_ent"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, entp], losses) compute_lossandgrad = U.function([ob, ac, atarg, entp], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() tf.global_variables_initializer() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, gamma=gamma) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards drwdsbuffer = deque(maxlen=40) 
assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # entcoeff = max(entc - float(iters_so_far) / float(max_iters), 0.01) entcoeff = 0.0 # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args, entcoeff) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): print("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args, entcoeff))) improve = surr - surrbefore print("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): print("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: print("violated KL constraint. shrinking step.") elif improve < 0: print("surrogate didn't improve. 
shrinking step.") else: print("Stepsize OK!") break stepsize *= .5 else: print("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.logkv(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.logkv("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_drwds"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, drwds = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) drwdsbuffer.extend(drwds) logger.logkv("EpLenMean", np.mean(lenbuffer)) logger.logkv("EpRewMean", np.mean(rewbuffer)) logger.logkv("EpThisIter", len(lens)) logger.logkv("EpDRewMean", np.mean(drwdsbuffer)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.logkv("EpisodesSoFar", episodes_so_far) logger.logkv("TimestepsSoFar", timesteps_so_far) logger.logkv("TimeElapsed", time.time() - tstart) logger.logkv('trial', i_trial) logger.logkv("Iteration", iters_so_far) logger.logkv("Name", 'TRPO') if rank == 0: logger.dump_tabular()
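# --- Illustrative sketch (not part of the original source) ---------------------
# The TRPO update above rescales the conjugate-gradient direction so that the
# quadratic KL model 0.5 * s^T F s equals max_kl, then backtracks by halving the
# step until the surrogate improves without exceeding 1.5 * max_kl. The standalone
# sketch below isolates that step-sizing and line-search logic; the callable
# arguments (fvp, surrogate_and_kl) are placeholders for the functions built above.
import numpy as np

def trpo_line_search(theta, stepdir, fvp, surrogate_and_kl, max_kl, max_backtracks=10):
    """Scale a CG direction to the trust region, then backtrack on the surrogate."""
    shs = 0.5 * stepdir.dot(fvp(stepdir))                  # 0.5 * s^T F s
    lm = np.sqrt(shs / max_kl)                             # Lagrange multiplier
    fullstep = stepdir / lm                                # predicted KL of this step == max_kl
    surr_before, _ = surrogate_and_kl(theta)
    stepsize = 1.0
    for _ in range(max_backtracks):
        theta_new = theta + stepsize * fullstep
        surr, kl = surrogate_and_kl(theta_new)
        if np.isfinite([surr, kl]).all() and kl <= 1.5 * max_kl and surr > surr_before:
            return theta_new                               # step accepted
        stepsize *= 0.5
    return theta                                           # no acceptable step; keep old params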
def learn( *, network, env, eval_env, make_eval_env, env_id, total_timesteps, timesteps_per_batch, sil_update, sil_loss, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, lr=3e-4, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=5, sil_value=0.01, sil_alpha=0.6, sil_beta=0.1, max_episodes=0, max_iters=0, # time constraint callback=None, save_interval=0, load_path=None, # MBL # For train mbl mbl_train_freq=5, # For eval num_eval_episodes=5, eval_freq=5, vis_eval=False, eval_targs=('mbmf', ), #eval_targs=('mf',), quant=2, # For mbl.step #num_samples=(1500,), num_samples=(1, ), horizon=(2, ), #horizon=(2,1), #num_elites=(10,), num_elites=(1, ), mbl_lamb=(1.0, ), mbl_gamma=0.99, #mbl_sh=1, # Number of step for stochastic sampling mbl_sh=10000, #vf_lookahead=-1, #use_max_vf=False, reset_per_step=(0, ), # For get_model num_fc=2, num_fwd_hidden=500, use_layer_norm=False, # For MBL num_warm_start=int(1e4), init_epochs=10, update_epochs=5, batch_size=512, update_with_validation=False, use_mean_elites=1, use_ent_adjust=0, adj_std_scale=0.5, # For data loading validation_set_path=None, # For data collect collect_val_data=False, # For traj collect traj_collect='mf', # For profile measure_time=True, eval_val_err=False, measure_rew=True, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, vf_coef=0.5, max_grad_norm=0.5, log_interval=1, nminibatches=4, noptepochs=4, cliprange=0.2, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. 
See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if not isinstance(num_samples, tuple): num_samples = (num_samples, ) if not isinstance(horizon, tuple): horizon = (horizon, ) if not isinstance(num_elites, tuple): num_elites = (num_elites, ) if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, ) if not isinstance(reset_per_step, tuple): reset_per_step = (reset_per_step, ) if validation_set_path is None: if collect_val_data: validation_set_path = os.path.join(logger.get_dir(), 'val.pkl') else: validation_set_path = os.path.join('dataset', '{}-val.pkl'.format(env_id)) if eval_val_err: eval_val_err_path = os.path.join('dataset', '{}-combine-val.pkl'.format(env_id)) logger.log(locals()) logger.log('MBL_SH', mbl_sh) logger.log('Traj_collect', traj_collect) if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) policy = build_policy(env, network, value_network='copy', **network_kwargs) nenvs = env.num_envs np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * timesteps_per_batch nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) make_model = lambda: Model( policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=timesteps_per_batch, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, sil_update=sil_update, sil_value=sil_value, sil_alpha=sil_alpha, sil_beta=sil_beta, sil_loss=sil_loss, # fn_reward=env.process_reward, fn_reward=None, # fn_obs=env.process_obs, fn_obs=None, ppo=False, prev_pi='pi', silm=pi) model = make_model() with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) make_old_model = lambda: Model( policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=timesteps_per_batch, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, sil_update=sil_update, sil_value=sil_value, sil_alpha=sil_alpha, sil_beta=sil_beta, sil_loss=sil_loss, # fn_reward=env.process_reward, fn_reward=None, # fn_obs=env.process_obs, fn_obs=None, ppo=False, prev_pi='oldpi', silm=oldpi) old_model = make_old_model() # MBL # --------------------------------------- #viz = Visdom(env=env_id) win = None eval_targs = list(eval_targs) logger.log(eval_targs) make_model_f = get_make_mlp_model(num_fc=num_fc, num_fwd_hidden=num_fwd_hidden, layer_norm=use_layer_norm) mbl = MBL(env=eval_env, env_id=env_id, make_model=make_model_f, num_warm_start=num_warm_start, init_epochs=init_epochs, update_epochs=update_epochs, batch_size=batch_size, **network_kwargs) val_dataset = {'ob': None, 'ac': None, 'ob_next': None} if update_with_validation: logger.log('Update with validation') val_dataset = load_val_data(validation_set_path) if eval_val_err: logger.log('Log val error') eval_val_dataset = load_val_data(eval_val_err_path) if 
collect_val_data: logger.log('Collect validation data') val_dataset_collect = [] def _mf_pi(ob, t=None): stochastic = True ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) return ac, vpred def _mf_det_pi(ob, t=None): #ac, vpred, _, _ = pi.step(ob, stochastic=False) ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob) return ac, vpred def _mf_ent_pi(ob, t=None): mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob) ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape) return ac, vpred ################### use_ent_adjust======> adj_std_scale????????pi action sample def _mbmf_inner_pi(ob, t=0): if use_ent_adjust: return _mf_ent_pi(ob) else: #return _mf_pi(ob) if t < mbl_sh: return _mf_pi(ob) else: return _mf_det_pi(ob) # --------------------------------------- # Run multiple configuration once all_eval_descs = [] def make_mbmf_pi(n, h, e, l): def _mbmf_pi(ob): ac, rew = mbl.step(ob=ob, pi=_mbmf_inner_pi, horizon=h, num_samples=n, num_elites=e, gamma=mbl_gamma, lamb=l, use_mean_elites=use_mean_elites) return ac[None], rew return Policy(step=_mbmf_pi, reset=None) for n in num_samples: for h in horizon: for l in mbl_lamb: for e in num_elites: if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew', 'MBL_TRPO_SIL', make_mbmf_pi(n, h, e, l))) #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l))) if 'mf' in eval_targs: all_eval_descs.append( ('MeanRew', 'TRPO_SIL', Policy(step=_mf_pi, reset=None))) logger.log('List of evaluation targets') for it in all_eval_descs: logger.log(it[0]) pool = Pool(mp.cpu_count()) warm_start_done = False # ---------------------------------------- atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) 
compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- if traj_collect == 'mf': seg_gen = traj_segment_generator(env, timesteps_per_batch, model, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() if traj_collect == 'mf-random' or traj_collect == 'mf-mb': seg_mbl = seg_gen_mbl.__next__() else: seg_mbl = seg add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] # Val data collection if collect_val_data: for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...], ob[1:, 0, ...]): val_dataset_collect.append( (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_))) # ----------------------------- # MBL update else: ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"] mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...], ob_mbl[1:, 0, ...]) mbl.update_forward_dynamic(require_update=iters_so_far % mbl_train_freq == 0, ob_val=val_dataset['ob'], ac_val=val_dataset['ac'], ob_next_val=val_dataset['ob_next']) # ----------------------------- if traj_collect == 'mf': #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb': vpredbefore = seg[ "vpred"] # predicted value function before udpate model = seg["model"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) with timed("SIL"): lrnow = lr(1.0 - timesteps_so_far / total_timesteps) l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train( lrnow) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if sil_update > 0: logger.record_tabular("SilSamples", sil_samples) if rank == 0: # MBL evaluation if not collect_val_data: #set_global_seeds(seed) default_sess = tf.get_default_session() def multithread_eval_policy(env_, pi_, num_episodes_, vis_eval_, seed): with default_sess.as_default(): if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'): env_.ob_rms = env.ob_rms res = eval_policy(env_, pi_, num_episodes_, vis_eval_, seed, measure_time, measure_rew) try: env_.close() except: pass return res if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0: warm_start_done = mbl.is_warm_start_done() if num_eval_episodes > 0: targs_names = {} with timed('eval'): num_descs = len(all_eval_descs) list_field_names = [e[0] for e in all_eval_descs] list_legend_names = [e[1] for e in all_eval_descs] list_pis = [e[2] for e in all_eval_descs] list_eval_envs = [ make_eval_env() for _ in range(num_descs) ] list_seed = [seed for _ in range(num_descs)] list_num_eval_episodes = [ num_eval_episodes for _ in range(num_descs) ] print(list_field_names) 
print(list_legend_names) list_vis_eval = [ vis_eval for _ in range(num_descs) ] for i in range(num_descs): field_name, legend_name = list_field_names[ i], list_legend_names[i], res = multithread_eval_policy( list_eval_envs[i], list_pis[i], list_num_eval_episodes[i], list_vis_eval[i], seed) #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed)) #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results): perf, elapsed_time, eval_rew = res logger.record_tabular(field_name, perf) if measure_time: logger.record_tabular( 'Time-%s' % (field_name), elapsed_time) if measure_rew: logger.record_tabular( 'SimRew-%s' % (field_name), eval_rew) targs_names[field_name] = legend_name if eval_val_err: fwd_dynamics_err = mbl.eval_forward_dynamic( obs=eval_val_dataset['ob'], acs=eval_val_dataset['ac'], obs_next=eval_val_dataset['ob_next']) logger.record_tabular('FwdValError', fwd_dynamics_err) logger.dump_tabular() #print(logger.get_dir()) #print(targs_names) #if num_eval_episodes > 0: # win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best') # ----------- #logger.dump_tabular() yield pi if collect_val_data: with open(validation_set_path, 'wb') as f: pickle.dump(val_dataset_collect, f) logger.log('Save {} validation data'.format(len(val_dataset_collect)))
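# ----------------------------------------
# Illustrative sketch (not part of the original code): the MBL lookahead above
# (mbl.step with horizon / num_samples / num_elites / use_mean_elites) behaves
# roughly like random-shooting MPC over a learned forward model. `dynamics_fn`,
# `reward_fn` and `sample_ac` below are hypothetical stand-ins for the learned
# dynamics, the reward estimate and the action sampler; this sketches the idea,
# it is not the MBL implementation itself.
import numpy as np

def random_shooting_step(ob, dynamics_fn, reward_fn, sample_ac, horizon=10,
                         num_samples=100, num_elites=10, gamma=0.99,
                         use_mean_elites=True):
    """Return the first action of the best sampled action sequence and a score."""
    obs = np.repeat(ob[None], num_samples, axis=0)                    # (N, ob_dim)
    acs = np.stack([sample_ac(num_samples) for _ in range(horizon)])  # (H, N, ac_dim)
    returns = np.zeros(num_samples)
    discount = 1.0
    for t in range(horizon):
        returns += discount * reward_fn(obs, acs[t])                  # predicted reward
        obs = dynamics_fn(obs, acs[t])                                # predicted next obs
        discount *= gamma
    elites = np.argsort(-returns)[:num_elites]                        # best sequences
    score = returns[elites].mean() if use_mean_elites else returns[elites[0]]
    return acs[0, elites[0]], score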
def __init__(self, timed, policy, ob_space, ac_space, max_kl=0.001, cg_iters=10,
             ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3,
             load_path=None, num_reward=1, index=1):
    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0
    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker))

    #################################################################
    # ob, ac, ret and atarg are all placeholders;
    # ret and atarg are expected to be vectors here
    ob = observation_placeholder(ob_space)

    # Create pi and oldpi
    with tf.variable_scope(str(index) + "pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope(str(index) + "oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # Each reward can have its own atarg
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    ##################################
    # The KL divergence and entropy here do not depend on the reward
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    # entbonus is the entropy bonus term
    entbonus = ent_coef * meanent
    #################################

    ###########################################################
    # vferr is used to update the value network
    vferr = tf.reduce_mean(tf.square(pi.vf - ret))
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)
    # optimgain is used to update the policy network; there should be one per reward
    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
    ###########################################################

    dist = meankl

    # Define the variables to optimize and the Adam optimizer for the value network
    all_var_list = get_trainable_variables(str(index) + "pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables(str(index) + "pi")
    vf_var_list = get_vf_trainable_variables(str(index) + "pi")

    vfadam = MpiAdam(vf_var_list)

    # Helper that flattens the variables into a single vector
    get_flat = U.GetFlat(var_list)
    # Helper that slices a flat vector and assigns the pieces to the variables in var_list
    set_from_flat = U.SetFromFlat(var_list)
    # Gradient of the KL divergence
    klgrads = tf.gradients(dist, var_list)

    ####################################################################
    # Flattened tangent vector
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    # Split the flat vector back into one tensor per variable
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    ####################################################################

    ####################################################################
    # Sum the elementwise products of the KL gradients and the tangents
    gvp = tf.add_n([tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    # Flatten the gradient of gvp to get the Fisher-vector product
    fvp = U.flatgrad(gvp, var_list)
    ####################################################################

    # Overwrite the old policy with the newly learned policy
    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(get_variables(str(index) + "oldpi"), get_variables(str(index) + "pi"))])
    # Compute the losses
    compute_losses = U.function([ob, ac, atarg], losses)
    # Compute the losses and the policy gradient
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    # Compute the Fisher-vector product
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    # Compute the value-network gradient
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    # Initialize the variables
    U.initialize()
    if load_path is not None:
        pi.load(load_path)
    # Get the initial parameter vector
    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)
    # Assign slices of th_init back to the variables in var_list
    set_from_flat(th_init)
    # Synchronize the value-function optimizer across workers
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    self.MPI = MPI
    self.pi = pi
    self.oldpi = oldpi
    self.compute_losses = compute_losses
    self.compute_lossandgrad = compute_lossandgrad
    self.compute_fvp = compute_fvp
    self.compute_vflossandgrad = compute_vflossandgrad
    self.assign_old_eq_new = assign_old_eq_new
    self.get_flat = get_flat
    self.set_from_flat = set_from_flat
    self.vfadam = vfadam
    # params
    self.max_kl = max_kl
    self.cg_iters = cg_iters
    self.ent_coef = ent_coef
    self.cg_damping = cg_damping
    self.vf_stepsize = vf_stepsize
    self.vf_iters = vf_iters
    self.rank = rank
    self.index = index
    self.timed = timed
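# ----------------------------------------
# Illustrative sketch (not the baselines `cg` implementation): the compute_fvp
# function built above is consumed by a conjugate-gradient solver to get the
# step direction stepdir ~ F^-1 g, with cg_damping added for stability, i.e.
# solving (F + damping * I) x = g given only the matrix-vector product.
import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b for symmetric positive-definite A, given x -> A x."""
    x = np.zeros_like(b)
    r = b.copy()                       # residual b - A x, with x = 0
    p = r.copy()                       # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# Usage mirroring fisher_vector_product in the learn loops (sketch):
# stepdir = conjugate_gradient(lambda p: allmean(compute_fvp(p, *fvpargs)) + cg_damping * p, g)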
def learn(env, policy_func, med_func, expert_dataset, pretrained, pretrained_weight, g_step, m_step, e_step, inner_iters, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name, max_kl=0.01, max_timesteps=0, max_episodes=0, max_iters=0, batch_size=64, med_stepsize=1e-3, pi_stepsize=1e-3, callback=None, writer=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) med = med_func("mediator", ob_space, ac_space) pi_var_list = pi.get_trainable_variables() med_var_list = med.get_trainable_variables() g_ob = U.get_placeholder(name="g_ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) g_ac = U.get_placeholder(name='g_ac', dtype=tf.float32, shape=[None] + list(ac_space.shape)) e_ob = U.get_placeholder(name='e_ob', dtype=tf.float32, shape=[None] + list(ob_space.shape)) e_ac = U.get_placeholder(name='e_ac', dtype=tf.float32, shape=[None] + list(ac_space.shape)) med_loss = -tf.reduce_mean(med.g_pd.logp(g_ac) + med.e_pd.logp(e_ac)) * 0.5 #pi_loss = -0.5 * (tf.reduce_mean(pi.pd.logp(ac) - med.pd.logp(ac))) g_pdf = tfd.MultivariateNormalDiag(loc=pi.pd.mean, scale_diag=pi.pd.std) m_pdf = tfd.MultivariateNormalDiag(loc=med.g_pd.mean, scale_diag=med.g_pd.std) pi_loss = tf.reduce_mean(g_pdf.cross_entropy(m_pdf) - g_pdf.entropy()) # tf.reduce_mean(pi.pd.kl(med.pd)) kloldnew = oldpi.pd.kl(pi.pd) meankl = tf.reduce_mean(kloldnew) dist = meankl expert_loss = -tf.reduce_mean(pi.pd.logp(e_ac)) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_med_loss = U.function([g_ob, g_ac, e_ob, e_ac], med_loss) compute_pi_loss = U.function([g_ob], pi_loss) compute_exp_loss = U.function([e_ob, e_ac], expert_loss) # compute_kl_loss = U.function([ob], dist) # compute_fvp = U.function([flat_tangent, ob, ac], fvp) compute_med_lossandgrad = U.function([g_ob, g_ac, e_ob, e_ac], [med_loss, U.flatgrad(med_loss, med_var_list)]) compute_pi_lossandgrad = U.function([g_ob], [pi_loss, U.flatgrad(pi_loss, pi_var_list)]) compute_exp_lossandgrad = U.function([e_ob, e_ac], [expert_loss, U.flatgrad(expert_loss, pi_var_list)]) get_flat = U.GetFlat(pi_var_list) set_from_flat = U.SetFromFlat(pi_var_list) med_adam = MpiAdam(med_var_list) pi_adam = MpiAdam(pi_var_list) def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() # th_init = get_flat() # MPI.COMM_WORLD.Bcast(th_init, root=0) # set_from_flat(th_init) med_adam.sync() pi_adam.sync() # if rank == 0: # print("Init pi param sum %d, init med param sum %d." 
% (th_pi_init.sum(), th_med_init.sum()), flush=True) seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 loss_stats = stats(["med_loss", "pi_loss"]) ep_stats = stats(["True_rewards", "Episode_length"]) if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi_var_list) med_losses = [] pi_losses = [] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break # Save model if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) logger.log("********** Iteration %i ************" % iters_so_far) # ======= Optimize Mediator========= seg = seg_gen.__next__() g_ob, g_ac = seg['ob'], seg['ac'] #assign_old_eq_new() #stepsize = 3e-4 # thbefore = get_flat() d = dataset.Dataset(dict(ob=g_ob, ac=g_ac)) optim_batchsize = min(batch_size, len(g_ob)) g_loss = [] logger.log("Optimizing Generator...") for _ in range(1): g_batch = d.next_batch(optim_batchsize) g_batch_ob, g_batch_ac = g_batch['ob'], g_batch['ac'] if hasattr(pi, "obs_rms"): pi.obs_rms.update(g_batch_ob) pi_loss, g = compute_pi_lossandgrad(g_batch_ob) # kl = compute_kl_loss(g_ob) # if kl > max_kl * 1.5: # logger.log("violated KL constraint. Shrinking step.") # # stepsize *= 0.1 # break # else: # logger.log("Stepsize OK!") pi_adam.update(allmean(g), pi_stepsize) g_loss.append(pi_loss) pi_losses.append(np.mean(np.array(g_loss))) med_loss = [] logger.log("Optimizing Mediator...") for g_ob_batch, g_ac_batch in dataset.iterbatches((seg['ob'], seg['ac']), include_final_partial_batch=False, batch_size=batch_size): # g_batch = d.next_batch(optim_batchsize) # g_ob_batch, g_ac_batch = g_batch['ob'], g_batch['ac'] e_ob_batch, e_ac_batch = expert_dataset.get_next_batch(optim_batchsize) if hasattr(med, "obs_rms"): med.obs_rms.update(np.concatenate((g_ob_batch, e_ob_batch), 0)) newlosses, g = compute_med_lossandgrad(g_ob_batch, g_ac_batch, e_ob_batch, e_ac_batch) med_adam.update(allmean(g), med_stepsize) med_loss.append(newlosses) med_losses.append(np.mean(np.array(med_loss))) #logger.record_tabular("med_loss_each_iter", np.mean(np.array(med_losses))) #logger.record_tabular("gen_loss_each_iter", np.mean(np.array(pi_losses))) #logger.record_tabular("expert_loss_each_iter", np.mean(np.array(exp_losses))) logger.record_tabular("med_loss_each_iter", np.mean(np.array(med_losses))) logger.record_tabular("gen_loss_each_iter", np.mean(np.array(pi_losses))) lrlocal = (seg["ep_lens"], seg["ep_true_rets"]) listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) lens, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) 
logger.record_tabular("TimeElapsed", time.time() - tstart) if writer is not None: loss_stats.add_all_summary(writer, [np.mean(np.array(med_losses)), np.mean(np.array(pi_losses))], episodes_so_far) ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer), np.mean(lenbuffer)], episodes_so_far) if rank == 0: logger.dump_tabular()
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) 
get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular() return pi
def learn(env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], 
seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular()
def learn(env, policy_func, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes 
and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular()
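# ----------------------------------------
# Illustrative sketch of what add_vtarg_and_adv(seg, gamma, lam) computes for
# the rollout segments above: GAE(lambda) advantages and TD(lambda) value
# targets. Field names follow the segment dict used here (new, vpred,
# nextvpred, rew); this mirrors the standard baselines helper but is written
# out as a sketch, not copied from it.
import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    new = np.append(seg["new"], 0)                        # 1 if the episode ended at t
    vpred = np.append(seg["vpred"], seg["nextvpred"])     # bootstrap with the next value
    T = len(seg["rew"])
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]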
def learn( *, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) 
get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() return pi
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() ## losses + [U.flatgrad(total_loss, var_list)] 这个是怎么相加的 lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) test_a = U.function([ob, ac, atarg, ret, lrmult], [ kloldnew, ent, meankl, meanent, pol_entpen, pi.pd.logp(ac), oldpi.pd.logp(ac), ratio, surr1, surr2, pi.vpred ]) #################### pi_parms = U.function([], var_list) old_list = oldpi.get_trainable_variables() old_parms = U.function([], old_list) #################### U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) 
/ max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() # print("ac",np.shape(seg["ac"]), seg["ac"]) # print("rew",np.shape(seg["rew"]), seg["rew"]) # print("vpred",np.shape(seg["vpred"]), seg["vpred"]) # print("new",np.shape(seg["new"]), seg["new"]) # print("prevac",np.shape(seg["prevac"]), seg["prevac"]) # print("nextvpred",np.shape(seg["nextvpred"]), seg["nextvpred"]) # print("ep_rets",np.shape(seg["ep_rets"]), seg["ep_rets"]) # print("ep_lens",np.shape(seg["ep_lens"]), seg["ep_lens"]) add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), deterministic=pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") # ############ # for p in pi_parms(): # print("pi", np.sum(p)) # for p in old_parms(): # print("old", np.sum(p)) # ############ logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) # kloldnew,ent, meankl, meanent, pol_entpen, piac, oldpiac, ratio, surr1, surr2, pivpred = \ # test_a(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) # print("kloldnew",kloldnew) # print("ent",ent) # print("meankl",meankl) # print("meanent",meanent) # print("pol_entpen",pol_entpen) # print("piac",piac) # print("oldpiac",oldpiac) # print("ratio",ratio) # print("surr1",surr1) # print("surr2",surr2) # print("pivpred",pivpred) for p in pi_parms(): print("pi", np.sum(p)) for p in old_parms(): print("old", np.sum(p)) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: 
logger.dump_tabular() return pi
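# ----------------------------------------
# Illustrative sketch of the shuffled minibatch iteration performed by
# Dataset.iterate_once(optim_batchsize) over the rollout arrays above; this is
# a sketch of the idea, not the baselines Dataset class.
import numpy as np

def iterate_once(data, batch_size, shuffle=True):
    """data: dict of equally long arrays -> yields dicts of minibatches."""
    n = len(next(iter(data.values())))
    idx = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n, batch_size):
        mb = idx[start:start + batch_size]
        yield {k: v[mb] for k, v in data.items()}

# e.g. for batch in iterate_once(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), 64): ...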
def learn( base_env, policy_fn, *, max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, timesteps_per_actorbatch, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, seed=0, optim_stepsize=3e-4, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): set_global_seeds(seed) # Setup losses and stuff # ---------------------------------------- ob_space = base_env.observation_space ac_space = base_env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy backup_pi = policy_fn( "backup_pi", ob_space, ac_space ) # Construct a network for every individual to adapt during the es evolution sol_dim = int( np.sum([ np.prod(v.get_shape().as_list()) for v in pi.get_trainable_variables() ])) pop_size = tf.placeholder(dtype=tf.float32, shape=[]) lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule tfkids_fit = tf.placeholder(dtype=tf.float32, shape=[ popsize, ]) tfkids = tf.placeholder(dtype=tf.float32, shape=[popsize, sol_dim]) tfmean = tf.Variable(initial_value=tf.random_normal([ sol_dim, ], 0., 1.), dtype=tf.float32) tfcov = tf.Variable(initial_value=tf.eye(sol_dim), dtype=tf.float32) mvn = MultivariateNormalFullCovariance(loc=tfmean, covariance_matrix=tfcov) loss = -tf.reduce_mean(mvn.log_prob(tfkids) * tfkids_fit) train_op = tf.train.GradientDescentOptimizer(lrmult).minimize(loss) optimize = U.function([tfkids, tfkids_fit, lrmult], [train_op]) reproduce = U.function([pop_size], [mvn.sample(popsize)]) get_mean = U.function([], [tfmean]) input_mean = tf.placeholder(dtype=tf.float32, shape=[ sol_dim, ]) assign_weights_to_mean = U.function([input_mean], [tf.assign(tfmean, input_mean)]) U.initialize() pi_set_from_flat_params = U.SetFromFlat(pi.get_trainable_variables()) pi_get_flat_params = U.GetFlat(pi.get_trainable_variables()) global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, best_fitness, eval_seq episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for ( backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables()) ]) assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v ) in zipsame(pi.get_variables(), backup_pi.get_variables()) ]) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" # Build generator for all solutions actors = [] best_fitness = -np.inf eval_seq = traj_segment_generator_eval(pi, base_env, timesteps_per_actorbatch, stochastic=True) for i in range(popsize): newActor = traj_segment_generator(pi, base_env, timesteps_per_actorbatch, stochastic=True, eval_iters=eval_iters) actors.append(newActor) while True: if max_timesteps and timesteps_so_far >= max_timesteps: logger.log("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: logger.log("Max episodes") break elif max_iters and iters_so_far >= max_iters: logger.log("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: logger.log("Max time") break assign_backup_eq_new() # backup current policy if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max( 1.0 - 
float(timesteps_so_far) / (max_timesteps / 2), 0) else: raise NotImplementedError logger.log("********** Generation %i ************" % iters_so_far) eval_seg = eval_seq.__next__() rewbuffer.extend(eval_seg["ep_rets"]) lenbuffer.extend(eval_seg["ep_lens"]) if iters_so_far == 0: result_record() assign_weights_to_mean(pi_get_flat_params()) # mean = pi_get_flat_params() solutions = reproduce(popsize) ob_segs = None segs = [] costs = [] lens = [] for id, solution in enumerate(solutions[0]): # pi.set_Flat_variables(solution) pi_set_from_flat_params(solution) seg = actors[id].__next__() costs.append(-np.mean(seg["ep_rets"])) lens.append(np.sum(seg["ep_lens"])) segs.append(seg) if ob_segs is None: ob_segs = {'ob': np.copy(seg['ob'])} else: ob_segs['ob'] = np.append(ob_segs['ob'], seg['ob'], axis=0) assign_new_eq_backup() optimize(solutions[0], np.array(costs), cur_lrmult * optim_stepsize) # fit_idx = np.array(costs).flatten().argsort()[:len(costs)] # solutions = np.array(solutions)[fit_idx] # costs = np.array(costs)[fit_idx] # segs = np.array(segs)[fit_idx] # # Weights decay # # costs, real_costs = fitness_shift(costs) # # costs, real_costs = compute_centered_ranks(costs) # l2_decay = compute_weight_decay(0.01, solutions) # costs += l2_decay # costs, real_costs = fitness_normalization(costs) # # best_solution = np.copy(solutions[0]) # # best_fitness = -real_costs[0] # # rewbuffer.extend(segs[0]["ep_rets"]) # # lenbuffer.extend(segs[0]["ep_lens"]) # es.tell_real_seg(solutions = solutions, function_values = costs, real_f = real_costs, segs = segs) # best_solution = np.copy(es.result[0]) # best_fitness = -es.result[1] # rewbuffer.extend(es.result[3]["ep_rets"]) # lenbuffer.extend(es.result[3]["ep_lens"]) # logger.log("Generation:", es.countiter) # logger.log("Best Solution Fitness:", best_fitness) pi_set_from_flat_params(get_mean()[0]) ob = ob_segs["ob"] if hasattr(pi, "ob_rms"): pi.ob_rms.update( ob) # update running mean/std for observation normalization iters_so_far += 1 episodes_so_far += sum(lens)
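# ----------------------------------------
# Illustrative NumPy sketch of the distribution update used above: sample a
# population of flat parameter vectors from a Gaussian, evaluate each one, and
# move the mean along the score-function gradient of the expected fitness.
# `evaluate` is a hypothetical black box returning an episode return; note the
# code above feeds negated returns ("costs") into its log-prob-weighted loss,
# so its sign convention differs from this reward-maximizing sketch.
import numpy as np

def nes_update(mean, sigma, evaluate, popsize=16, lr=0.05):
    eps = np.random.randn(popsize, mean.size)             # standard normal noise
    kids = mean + sigma * eps                             # sampled parameter vectors
    fitness = np.array([evaluate(k) for k in kids])
    fitness = (fitness - fitness.mean()) / (fitness.std() + 1e-8)   # fitness shaping
    # grad_mean E[f(theta)] ~= mean over kids of fitness * (theta - mean) / sigma^2
    grad = (eps * fitness[:, None]).mean(axis=0) / sigma
    return mean + lr * grad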