def __call__(self, *args, **kwargs):
    data_gen = self._data_generator(self.train_data_size, stochastic=True)
    while True:
        batch = next(data_gen)
        batch = self._add_vtarg_and_adv(batch, self.gamma, self.lambda_)
        obs, acs, advs, td_lam_ret = (batch['obs'], batch['acs'],
                                      batch['advs'], batch['td_lam_ret'])
        # standardized advantage function estimate
        advs = (advs - advs.mean()) / advs.std()
        d = Dataset(dict(obs=obs, acs=acs, advs=advs, td_lam_ret=td_lam_ret),
                    shuffle=not self.ppo.pi.recurrent)
        # update obs normalization
        self.ppo.update_ob_norm(obs)
        # update old by new
        self.ppo.update_old_by_new()
        # set current learning rate
        cur_lrmult = 1.0 if not self.lr_decay else max(
            1.0 - float(self.steps) / self.max_steps, 0)
        # learn
        optimize_size = self.optimize_size or obs.shape[0]
        for _ in range(self.optimize_epochs):
            for batch in d.iterate_once(optimize_size):
                self.ppo.learn(batch["obs"], batch["acs"], batch["advs"],
                               batch["td_lam_ret"], cur_lrmult,
                               self.optimize_step_size)
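# `_add_vtarg_and_adv` is not defined in this file. Below is a hedged, minimal
# sketch of what a GAE(lambda) advantage/target computation typically looks
# like in baselines-style PPO. The output keys 'advs'/'td_lam_ret' mirror the
# ones read above; the input keys 'rews', 'vpreds', 'news', 'nextvpred' are
# assumptions, not the project's confirmed interface.
import numpy as np

def gae_advantages_sketch(batch, gamma, lam):
    rews, vpreds, news = batch['rews'], batch['vpreds'], batch['news']
    T = len(rews)
    vpred_ext = np.append(vpreds, batch['nextvpred'])  # bootstrap value at T
    new_ext = np.append(news, 0)
    advs = np.zeros(T, 'float32')
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new_ext[t + 1]
        delta = rews[t] + gamma * vpred_ext[t + 1] * nonterminal - vpred_ext[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advs[t] = lastgaelam
    batch['advs'] = advs
    batch['td_lam_ret'] = advs + vpreds  # TD(lambda) value-function targets
    return batch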
def fit_data(self, training_X, training_Y, iter_num=200, batch_size=64,
             stepsize=0.001, save_model_callback=None):
    dataset = Dataset(dict(X=np.array(training_X), Y=np.array(training_Y)),
                      shuffle=True)
    losses = []
    for it in range(iter_num):
        loss_epoch = []
        for batch in dataset.iterate_once(batch_size):
            inputs = [batch["X"], True, batch["Y"]]
            loss, g = self.lossandgrad(*inputs)
            self.updater.update(g, stepsize)
            loss_epoch.append(loss)
        losses.append(np.mean(loss_epoch))
        if it % 5 == 0:
            print('iter: ', it, 'loss: ', np.mean(loss_epoch))
            if save_model_callback is not None:
                save_model_callback(self.model, self.model.name, it)
    return losses
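# The `Dataset` / `iterate_once` helper is used throughout this file but not
# defined here. Hedged, minimal sketch of the baselines-style minibatch
# iterator it appears to be: it holds a dict of equally sized arrays,
# optionally shuffles them, and yields dict minibatches until exhausted.
import numpy as np

class DatasetSketch:
    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map
        self.shuffle = shuffle
        self.n = next(iter(data_map.values())).shape[0]

    def iterate_once(self, batch_size):
        idx = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(idx)
        for start in range(0, self.n, batch_size):
            sel = idx[start:start + batch_size]
            yield {k: v[sel] for k, v in self.data_map.items()}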
def train(self, seg, optim_batchsize, optim_epochs):
    cur_lrmult = 1.0
    add_vtarg_and_adv(seg, self.gamma, self.lam)
    ob, unnorm_ac, atarg, tdlamret = seg["ob"], seg["unnorm_ac"], seg[
        "adv"], seg["tdlamret"]
    vpredbefore = seg["vpred"]  # predicted value function before update
    # standardized advantage function estimate
    atarg = (atarg - atarg.mean()) / atarg.std()
    d = Dataset(dict(ob=ob, ac=unnorm_ac, atarg=atarg, vtarg=tdlamret),
                shuffle=not self.pi.recurrent)

    if hasattr(self.pi, "ob_rms"):
        self.pi.update_obs_rms(ob)  # update running mean/std for policy

    self.assign_old_eq_new()  # set old parameter values to new parameter values
    logger.log2("Optimizing...")
    logger.log2(fmt_row(13, self.loss_names))
    # Here we do a bunch of optimization epochs over the data
    for _ in range(optim_epochs):
        losses = []  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            lg = self.lossandgrad(batch["ac"], batch["atarg"], batch["vtarg"],
                                  cur_lrmult, *self.fix_ob2feed(batch["ob"]))
            new_losses, g = lg[:-1], lg[-1]
            self.adam.update(g, self.optim_stepsize * cur_lrmult)
            losses.append(new_losses)
        logger.log2(fmt_row(13, np.mean(losses, axis=0)))

    logger.log2("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                        batch["vtarg"], cur_lrmult,
                                        *self.fix_ob2feed(batch["ob"]))
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments(losses, axis=0)
    logger.log2(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, self.loss_names):
        logger.record_tabular("loss_" + name, lossval)
    logger.record_tabular("ev_tdlam_before",
                          explained_variance(vpredbefore, tdlamret))
    return meanlosses
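# `mpi_moments` above averages the evaluation losses across MPI workers.
# Hedged sketch of one way such a helper can be written (an illustration of
# the idea, not the confirmed implementation): local sum, squared sum and
# count are all-reduced, then mean and std are recovered from the totals.
from mpi4py import MPI
import numpy as np

def mpi_moments_sketch(x, axis=0, comm=None):
    comm = MPI.COMM_WORLD if comm is None else comm
    x = np.asarray(x, dtype='float64')
    local_sum = x.sum(axis=axis)
    local_sqsum = np.square(x).sum(axis=axis)
    local_n = float(x.shape[axis])
    send = np.concatenate([local_sum.ravel(), local_sqsum.ravel(), [local_n]])
    recv = np.zeros_like(send)
    comm.Allreduce(send, recv, op=MPI.SUM)
    k = local_sum.size
    total_sum, total_sqsum, total_n = recv[:k], recv[k:2 * k], recv[-1]
    mean = total_sum / total_n
    std = np.sqrt(np.maximum(total_sqsum / total_n - np.square(mean), 0))
    return mean.reshape(local_sum.shape), std.reshape(local_sum.shape), int(total_n)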
def update_policy(pi, seg, gamma, lam, logger, optim_epochs, optim_batchsize,
                  optim_stepsize, cur_lrmult, loss_names, lossandgrad, adam,
                  assign_old_eq_new, compute_losses, mpi_moments_fn):
    add_vtarg_and_adv(seg, gamma, lam)
    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
    ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
    vpredbefore = seg["vpred"]  # predicted value function before update
    # standardized advantage function estimate
    atarg = (atarg - atarg.mean()) / atarg.std()
    d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                shuffle=not pi.recurrent)
    optim_batchsize = optim_batchsize or ob.shape[0]

    if hasattr(pi, "ob_rms"):
        pi.ob_rms.update(ob)  # update running mean/std for policy

    assign_old_eq_new()  # set old parameter values to new parameter values
    logger.log("Optimizing...")
    logger.log(fmt_row(13, loss_names))
    # Here we do a bunch of optimization epochs over the data
    for _ in range(optim_epochs):
        losses = []  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                        batch["atarg"], batch["vtarg"],
                                        cur_lrmult)
            adam.update(g, optim_stepsize * cur_lrmult)
            losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(losses, axis=0)))

    logger.log("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"],
                                   batch["vtarg"], cur_lrmult)
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments_fn(losses)
    logger.log(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, loss_names):
        logger.record_tabular("loss_" + name, lossval)
    return vpredbefore, tdlamret, optim_batchsize
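# Hedged usage sketch (not from the source): one way `update_policy` above
# could be driven from a rollout loop. `max_iters` and the function name are
# hypothetical; the callables (lossandgrad, adam, assign_old_eq_new,
# compute_losses, mpi_moments_fn) are assumed to be constructed the same way
# the learn() functions later in this file construct them.
def run_update_policy_sketch(pi, env, horizon, gamma, lam, logger, optim_epochs,
                             optim_batchsize, optim_stepsize, loss_names,
                             lossandgrad, adam, assign_old_eq_new,
                             compute_losses, mpi_moments_fn, max_iters):
    seg_gen = traj_segment_generator(pi, env, horizon, stochastic=True)
    for it in range(max_iters):
        seg = seg_gen.__next__()
        cur_lrmult = max(1.0 - float(it) / max_iters, 0)  # linear annealing
        update_policy(pi, seg, gamma, lam, logger, optim_epochs,
                      optim_batchsize, optim_stepsize, cur_lrmult, loss_names,
                      lossandgrad, adam, assign_old_eq_new, compute_losses,
                      mpi_moments_fn)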
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return td_v_target = tf.placeholder(dtype=tf.float32, shape=[1, 1]) # V target for RAC lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule # adv = tf.placeholder(dtype = tf.float32, shape = [1, 1]) # Advantage function for RAC clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] vf_rac_loss = tf.reduce_mean(tf.square(pi.vpred - td_v_target)) vf_rac_losses = [vf_rac_loss] vf_rac_loss_names = ["vf_rac_loss"] pol_rac_loss_surr1 = atarg * pi.pd.neglogp(ac) * ratio pol_rac_loss_surr2 = tf.clip_by_value( ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg * pi.pd.neglogp( ac) # pol_rac_loss = tf.reduce_mean( tf.minimum(pol_rac_loss_surr1, pol_rac_loss_surr2)) pol_rac_losses = [pol_rac_loss] pol_rac_loss_names = ["pol_rac_loss"] var_list = pi.get_trainable_variables() vf_final_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") and v.name.split("/")[2].startswith("final") ] pol_final_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") and v.name.split("/")[2].startswith("final") ] # Train V function vf_lossandgrad = U.function([ob, td_v_target, lrmult], vf_rac_losses + [U.flatgrad(vf_rac_loss, vf_final_var_list)]) vf_adam = MpiAdam(vf_final_var_list, epsilon=adam_epsilon) # Train Policy pol_lossandgrad = U.function( [ob, ac, atarg, lrmult], pol_rac_losses + [U.flatgrad(pol_rac_loss, pol_final_var_list)]) pol_adam = MpiAdam(pol_final_var_list, epsilon=adam_epsilon) lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = 
U.function([ob, ac, atarg, ret, lrmult], losses) compute_v_pred = U.function([ob], [pi.vpred]) U.initialize() adam.sync() pol_adam.sync() vf_adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer,best_fitness episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" seg = None while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps), 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) t = 0 ac = env.action_space.sample( ) # not used, just so we have the datatype new = True # marks if we're on first timestep of an episode ob = env.reset() cur_ep_ret = 0 # return in current episode cur_ep_len = 0 # len of current episode ep_rets = [] # returns of completed episodes in this segment ep_lens = [] # lengths of ... horizon = timesteps_per_actorbatch # Initialize history arrays obs = np.array([ob for _ in range(horizon)]) rews = np.zeros(horizon, 'float32') vpreds = np.zeros(horizon, 'float32') news = np.zeros(horizon, 'int32') acs = np.array([ac for _ in range(horizon)]) prevacs = acs.copy() rac_alpha = optim_stepsize * cur_lrmult * 0.1 rac_beta = optim_stepsize * cur_lrmult * 0.01 assign_old_eq_new() # set old parameter values to new parameter values while True: if timesteps_so_far % 10000 == 0 and timesteps_so_far > 0: result_record() prevac = ac ac, vpred = pi.act(stochastic=True, ob=ob) # Slight weirdness here because we need value function at time T # before returning segment [0, T-1] so we get the correct # terminal value if t > 0 and t % horizon == 0: seg = { "ob": obs, "rew": rews, "vpred": vpreds, "new": news, "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new), "ep_rets": ep_rets, "ep_lens": ep_lens } ep_rets = [] ep_lens = [] break i = t % horizon obs[i] = ob vpreds[i] = vpred news[i] = new acs[i] = ac prevacs[i] = prevac if env.spec._env_name == "LunarLanderContinuous": ac = np.clip(ac, -1.0, 1.0) next_ob, rew, new, _ = env.step(ac) # Compute v target and TD v_target = rew + gamma * np.array( compute_v_pred(next_ob.reshape((1, ob.shape[0])))) adv = v_target - np.array( compute_v_pred(ob.reshape((1, ob.shape[0])))) # Update V and Update Policy vf_loss, vf_g = vf_lossandgrad(ob.reshape((1, ob.shape[0])), v_target, rac_alpha) vf_adam.update(vf_g, rac_alpha) pol_loss, pol_g = pol_lossandgrad(ob.reshape((1, ob.shape[0])), ac.reshape((1, ac.shape[0])), adv.reshape(adv.shape[0], ), rac_beta) pol_adam.update(pol_g, rac_beta) rews[i] = rew cur_ep_ret += rew cur_ep_len += 1 timesteps_so_far += 1 ob = next_ob if new: # print( # "Episode {} - Total reward = {}, Total Steps = {}".format(episodes_so_far, cur_ep_ret, cur_ep_len)) ep_rets.append(cur_ep_ret) ep_lens.append(cur_ep_len) rewbuffer.extend(ep_rets) lenbuffer.extend(ep_lens) cur_ep_ret = 0 cur_ep_len = 0 ob = env.reset() episodes_so_far += 1 t += 1 add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, 
ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values # logger.log("Optimizing...") # logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) # logger.log(fmt_row(13, np.mean(losses, axis=0))) # logger.log("Current Iteration Training Performance:" + str(np.mean(seg["ep_rets"]))) if iters_so_far == 0: result_record() iters_so_far += 1
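# The learn() above builds PPO's clipped ("pessimistic") surrogate in
# TensorFlow. As a hedged, framework-free illustration (toy numbers, not from
# the source), the same objective can be written with numpy as follows.
import numpy as np

def clipped_surrogate_sketch(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(logp_new - logp_old)            # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * adv                             # unclipped surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))       # loss to minimize

# toy check: once the ratio exceeds 1 + clip_param for a positive advantage,
# the clipped term caps the objective, so further ratio growth is not rewarded
loss = clipped_surrogate_sketch(np.log([1.5, 0.7]), np.log([1.0, 1.0]),
                                np.array([2.0, -1.0]))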
def learn(self):
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(self.pi, self.env,
                                     self.timesteps_per_actorbatch,
                                     stochastic=True)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    assert sum([
        self.max_iters > 0, self.max_timesteps > 0, self.max_episodes > 0,
        self.max_seconds > 0
    ]) == 1, "Only one time constraint permitted"

    while True:
        if (timesteps_so_far >= self.max_timesteps) and self.max_timesteps:
            break
        elif (episodes_so_far >= self.max_episodes) and self.max_episodes:
            break
        elif (iters_so_far >= self.max_iters) and self.max_iters:
            break
        elif self.max_seconds and (time.time() - tstart >= self.max_seconds):
            break

        if self.schedule == 'constant':
            cur_lrmult = 1.0
        elif self.schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / self.max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, self.gamma, self.lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        self.ob, self.ac, self.atarg, tdlamret = seg["ob"], seg["ac"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        # standardized advantage function estimate
        self.atarg = (self.atarg - self.atarg.mean()) / self.atarg.std()
        d = Dataset(dict(ob=self.ob, ac=self.ac, atarg=self.atarg,
                         vtarg=tdlamret),
                    shuffle=not self.pi.recurrent)
        self.optim_batchsize = self.optim_batchsize or self.ob.shape[0]

        if hasattr(self.pi, "ob_rms"):
            self.pi.ob_rms.update(self.ob)  # update running mean/std for policy

        self.assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(self.optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(self.optim_batchsize):
                *newlosses, g = self.lossandgrad(batch["ob"], batch["ac"],
                                                 batch["atarg"],
                                                 batch["vtarg"], cur_lrmult)
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(self.optim_batchsize):
            newlosses = self.compute_losses(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
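# `traj_segment_generator` is called throughout this file but not shown.
# Hedged, minimal sketch of the baselines-style rollout generator it appears
# to be; it yields fixed-length segments carrying the keys used elsewhere in
# this file ("ob", "rew", "vpred", "new", "ac", "nextvpred", "ep_rets",
# "ep_lens").
import numpy as np

def traj_segment_generator_sketch(pi, env, horizon, stochastic):
    t = 0
    ac = env.action_space.sample()  # placeholder, only used for the dtype
    new = True                      # True on the first step of an episode
    ob = env.reset()
    cur_ep_ret, cur_ep_len = 0, 0
    ep_rets, ep_lens = [], []
    obs = np.array([ob for _ in range(horizon)])
    rews = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    news = np.zeros(horizon, 'int32')
    acs = np.array([ac for _ in range(horizon)])
    while True:
        ac, vpred = pi.act(stochastic=stochastic, ob=ob)
        # yield value at time T before returning segment [0, T-1] so the
        # caller can bootstrap the terminal value correctly
        if t > 0 and t % horizon == 0:
            yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
                   "ac": acs, "nextvpred": vpred * (1 - new),
                   "ep_rets": ep_rets, "ep_lens": ep_lens}
            ep_rets, ep_lens = [], []
        i = t % horizon
        obs[i], vpreds[i], news[i], acs[i] = ob, vpred, new, ac
        ob, rew, new, _ = env.step(ac)
        rews[i] = rew
        cur_ep_ret += rew
        cur_ep_len += 1
        if new:
            ep_rets.append(cur_ep_ret)
            ep_lens.append(cur_ep_len)
            cur_ep_ret, cur_ep_len = 0, 0
            ob = env.reset()
        t += 1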
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation # CMAES max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) seed, env_id): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy backup_pi = policy_fn("backup_pi", ob_space, ac_space) # Network for cmaes individual to train atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return reward = tf.placeholder(dtype=tf.float32, shape=[None]) # step rewards lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") next_ob = U.get_placeholder_cached( name="next_ob") # next step observation for updating q function ac = U.get_placeholder_cached( name="act") # action placeholder for computing q function kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) y = reward + gamma * tf.squeeze(pi.vpred) qf_loss = tf.reduce_mean(tf.square(y - pi.qpred)) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen # v function is independently trained qf_losses = [qf_loss] vf_losses = [vf_loss] losses = [pol_surr, pol_entpen, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "kl", "ent"] var_list = pi.get_trainable_variables() # print(var_list) if isinstance(pi, CnnPolicy): lin_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("lin") ] vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("logits") ] pol_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("value") ] # Policy + Value function, the final layer, all trainable variables # Remove vf variables var_list = lin_var_list + pol_var_list else: fc2_var_list = [ v for v in var_list if v.name.split("/")[2].startswith("fc2") ] final_var_list = [ v for v in var_list if v.name.split("/")[2].startswith("final") ] # var_list = vf_var_list + pol_var_list var_list = fc2_var_list + final_var_list print(var_list) # print(var_list) qf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("qf") ] vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") ] qf_lossandgrad = U.function([ob, ac, next_ob, lrmult, reward], qf_losses + [U.flatgrad(qf_loss, qf_var_list)]) vf_lossandgrad = 
U.function([ob, ac, atarg, ret, lrmult], vf_losses + [U.flatgrad(vf_loss, vf_var_list)]) lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) # Assign pi to backup (only backup trainable variables) assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for (backup_v, newv) in zipsame(backup_pi.get_trainable_variables(), pi.get_trainable_variables()) ]) # Assign backup back to pi assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v) in zipsame(pi.get_trainable_variables(), backup_pi.get_trainable_variables()) ]) # Compute all losses compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) # compute the Advantage estimations: A = Q - V for pi get_A_estimation = U.function([ob, next_ob, ac], [pi.qpred - pi.vpred]) # compute the Advantage estimations: A = Q - V for evalpi # compute the mean action for given states under pi mean_actions = U.function([ob], [pi.pd.mode()]) # compute the mean action for given states under evalpi U.initialize() adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness episodes_so_far = 0 timesteps_so_far = 0 ppo_timesteps_so_far = 0 # cmaes_timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards # Prepare for rollouts # ---------------------------------------- # assign pi to eval_pi actors = [] best_fitness = 0 seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" i = 0 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) # PPO Train V and Q seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, next_ob, ac, atarg, tdlamret, reward = seg["ob"], seg[ "next_ob"], seg["ac"], seg["adv"], seg["tdlamret"], seg["rew"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy # Re-train V 
function for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) # Random select tansitions to train Q random_idx = [] len_repo = len(seg["ob"]) optim_epochs_q = int(len_repo / optim_batchsize) for _ in range(optim_epochs_q): random_idx.append( np.random.choice(range(len_repo), optim_batchsize)) # Re-train q function for _ in range(optim_epochs_q): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for idx in random_idx: *qf_losses, g = qf_lossandgrad(seg["next_ob"][idx], seg["ac"][idx], seg["ob"][idx], cur_lrmult, seg["rew"][idx]) qf_adam.update(g, optim_stepsize * cur_lrmult) # CMAES weights = pi.get_trainable_variables() if i >= len(weights): i = 0 while i < len(weights): # Consider both q-function and v-function if weights[i].name.split("/")[1] == "vf" or weights[i].name.split( "/")[1] == "qf": i += 1 continue print("Layer: ", i, '+', i + 1) print("Layer-Name", weights[i].name) if i + 1 < len(weights): layer_params = [weights[i], weights[i + 1]] else: layer_params = [weights[i]] if len(layer_params) <= 1: layer_params = [weights[i - 1], weights[i]] layer_params_flat = pi.get_Layer_Flat_variables(layer_params)() index, init_uniform_layer_weights = uniform_select( layer_params_flat, 500) opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 opt['seed'] = seed opt['AdaptSigma'] = True # opt['bounds'] = bounds sigma1 = sigma - 0.001 * iters_so_far if sigma1 < 0.0001: sigma1 = 0.0001 print("Sigma=", sigma1) es = cma.CMAEvolutionStrategy(init_uniform_layer_weights, sigma1, opt) best_solution = np.copy( init_uniform_layer_weights.astype(np.float64)) costs = None while True: if es.countiter >= opt['maxiter']: break solutions = es.ask() segs = [] ob_segs = None costs = [] lens = [] # Evaluation assign_backup_eq_new( ) #backup current policy, after Q and V have been trained a_func = get_A_estimation( ob, ob, np.array(mean_actions(ob)).transpose().reshape( (len(ob), 1))) # a_func = (a_func - np.mean(a_func)) / np.std(a_func) print("A-pi0:", np.mean(a_func)) print() for id, solution in enumerate(solutions): new_variable = set_uniform_weights(layer_params_flat, solution, index) pi.set_Layer_Flat_variables(layer_params, new_variable) new_a_func = get_A_estimation( ob, ob, np.array(mean_actions(ob)).transpose().reshape( (len(ob), 1))) # new_a_func = (new_a_func - np.mean(new_a_func)) / np.std(new_a_func) print("A-pi" + str(id + 1), ":", np.mean(new_a_func)) costs.append(-np.mean(new_a_func)) assign_new_eq_backup() # Restore the backup # l2_decay = compute_weight_decay(0.999, solutions).reshape((np.array(costs).shape)) # costs += l2_decay # costs, real_costs = fitness_normalization(costs) print(costs) costs, real_costs = fitness_rank(costs) # es.tell(solutions=solutions, function_values = costs) es.tell_real_seg(solutions=solutions, function_values=costs, real_f=costs, segs=None) # if -es.result[1] >= best_fitness: print("Update Policy by CMAES") best_solution = np.copy(es.result[0]) best_fitness = -es.result[1] best_layer_params_flat = set_uniform_weights( layer_params_flat, best_solution, index) pi.set_Layer_Flat_variables(layer_params, best_layer_params_flat) print("Generation:", es.countiter) print("Best Solution Fitness:", best_fitness) # set old 
parameter values to new parameter values i += 2 # break assign_old_eq_new() # Reestimate Advantage function based on the newly updated Pi # seg = seg_gen.__next__() # add_vtarg_and_adv(seg, gamma, lam) # # # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) # ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ # "tdlamret"] # vpredbefore = seg["vpred"] # predicted value function before udpate # atarg = ( # atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate # d = Dataset(dict(ob = ob, ac = ac, atarg = atarg, vtarg = tdlamret), # shuffle = not pi.recurrent) # optim_batchsize = optim_batchsize or ob.shape[0] # PPO training # assign_old_eq_new() # set old parameter values to new parameter values # logger.log("Optimizing...") # logger.log(fmt_row(13, loss_names)) # # Optimize the value function to keep it up. # for _ in range(optim_epochs): # losses = [] # list of tuples, each of which gives the loss for a minibatch # for batch in d.iterate_once(optim_batchsize): # *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"], # batch["atarg"], batch["vtarg"], # cur_lrmult) # vf_adam.update(g, optim_stepsize * cur_lrmult) # logger.log(fmt_row(13, np.mean(vf_losses, axis=0))) # Here we do a bunch of optimization epochs over the data # for _ in range(optim_epochs): # losses = [] # list of tuples, each of which gives the loss for a minibatch # for batch in d.iterate_once(optim_batchsize): # *newlosses, g = lossandgrad(batch["ob"], batch["ac"], # batch["atarg"], batch["vtarg"], # cur_lrmult) # adam.update(g, optim_stepsize * cur_lrmult) # losses.append(newlosses) # logger.log(fmt_row(13, np.mean(losses, axis = 0))) # logger.log("Evaluating losses...") # losses = [] # for batch in d.iterate_once(optim_batchsize): # newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], # batch["vtarg"], cur_lrmult) # losses.append(newlosses) # meanlosses, _, _ = mpi_moments(losses, axis = 0) iters_so_far += 1 episodes_so_far += sum(lens)
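# `fitness_rank` used in the CMA-ES loop above is not defined in this file.
# Hedged sketch of the role it appears to play (an assumption, not the
# confirmed implementation): rank-based fitness shaping replaces raw costs
# with centered ranks so CMA-ES sees a scale-free objective, while the raw
# costs are kept for logging.
import numpy as np

def fitness_rank_sketch(costs):
    costs = np.asarray(costs, dtype='float64')
    ranks = np.empty_like(costs)
    ranks[np.argsort(costs)] = np.arange(len(costs))   # 0 = best (lowest cost)
    shaped = ranks / max(len(costs) - 1, 1) - 0.5       # centered in [-0.5, 0.5]
    return shaped, costs                                # (shaped costs, raw costs)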
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) restore_model_from_file=None, save_model_with_prefix, # this is the naming of the saved model file. Usually here we set indication of the target goal: # for example 3dof_ppo1_H. # That way we can only select which networks we can execute to the real robot. We do not have to send all files or folder. # Naming of the model file should be self explanatory. job_id=None, # this variable is used for indentifing Spearmint iteration number. It is usually set by the Spearmint iterator outdir="/tmp/rosrl/experiments/continuous/ppo1/"): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() """ Here we add a possibility to resume from a previously saved model if a model file is provided """ if restore_model_from_file: # saver = tf.train.Saver(tf.all_variables()) saver = tf.train.import_meta_graph(restore_model_from_file) saver.restore( tf.get_default_session(), tf.train.latest_checkpoint('./')) #restore_model_from_file) logger.log("Loaded model from {}".format(restore_model_from_file)) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer 
for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" if save_model_with_prefix: if job_id is not None: basePath = '/tmp/rosrl/' + str( env.__class__.__name__) + '/ppo1/' + job_id else: basePath = '/tmp/rosrl/' + str(env.__class__.__name__) + '/ppo1/' # Create the writer for TensorBoard logs summary_writer = tf.summary.FileWriter(outdir, graph=tf.get_default_graph()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), deterministic=pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpRewSEM", np.std(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) """ Save the model at every iteration """ if save_model_with_prefix: # if np.mean(rewbuffer) > 10.0: if iters_so_far % 10 == 0 or np.mean(rewbuffer) > 10.0: basePath = outdir + "/models/" if not os.path.exists(basePath): os.makedirs(basePath) modelF = basePath + save_model_with_prefix + "_afterIter_" + str( iters_so_far) + ".model" U.save_state(modelF) logger.log("Saved model to file :{}".format(modelF)) iters_so_far 
+= 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() summary = tf.Summary(value=[ tf.Summary.Value(tag="EpRewMean", simple_value=np.mean(rewbuffer)) ]) summary_writer.add_summary(summary, timesteps_so_far) return pi
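# `explained_variance` backs the "ev_tdlam_before" diagnostic recorded above.
# Hedged sketch of the usual baselines-style definition,
# 1 - Var[y - ypred] / Var[y]: a value of 1 means the value function predicts
# the lambda-returns perfectly, 0 means it does no better than a constant.
import numpy as np

def explained_variance_sketch(ypred, y):
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary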
def train(self, seg):
    # if callback: callback(locals(), globals())
    if self.schedule == 'constant':
        cur_lrmult = 1.0
    elif self.schedule == 'linear':
        cur_lrmult = max(
            1.0 - float(self.iters_so_far) / self.max_iters_ppo, 1e-3)
    else:
        raise NotImplementedError
    self.iters_so_far += 1

    logger.log("********** Iteration %i ************" % self.iters_so_far)

    add_vtarg_and_adv(seg, self.gamma, self.lam)
    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
    ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
    vpredbefore = seg["vpred"]  # predicted value function before update
    # standardized advantage function estimate
    atarg = (atarg - atarg.mean()) / atarg.std()
    d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=True)
    optim_batchsize = min(self.optim_batchsize, ob.shape[0])

    if hasattr(self.pi, "ob_rms"):
        self.pi.ob_rms.update(ob)  # update running mean/std for policy

    self.assign_old_eq_new()  # set old parameter values to new parameter values
    # logger.log("Optimizing...")
    # logger.log(fmt_row(13, self.loss_names))
    # Here we do a bunch of optimization epochs over the data
    for _ in range(self.optim_epochs):
        losses = []  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            *newlosses, g = self.lossandgrad(batch["ob"], batch["ac"],
                                             batch["atarg"], batch["vtarg"],
                                             cur_lrmult)
            self.adam.update(g, self.optim_stepsize * cur_lrmult)
            losses.append(newlosses)
        # logger.log(fmt_row(13, np.mean(losses, axis=0)))

    # logger.log("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = self.compute_losses(batch["ob"], batch["ac"],
                                        batch["atarg"], batch["vtarg"],
                                        cur_lrmult)
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments(losses, axis=0)

    n_data = seg["ob"].shape[0]
    self.timesteps_so_far += n_data
    for (lossval, name) in zipsame(meanlosses, self.loss_names):
        logger.log("loss_" + name + ": %f" % lossval)
    logger.log("ev_tdlam_before: %f" % explained_variance(vpredbefore, tdlamret))
    logger.log("EpisodesThisIter: %d" % len(seg["rewAccumulated"]))
    logger.log("TimestepsThisIter: %d" % n_data)
    logger.log("TimestepsSoFar: %d" % self.timesteps_so_far)
    logger.log("LearningRate: %f" % (self.optim_stepsize * cur_lrmult))
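# `MpiAdam` (used as `adam.update(grad, stepsize)` and `adam.sync()` across
# this file) is not shown. Hedged sketch of the idea only: the flat local
# gradient is averaged over MPI workers and a plain Adam step is computed on
# a flat parameter vector. The real class also flattens/restores the TF
# variables; `step` returning a parameter delta is a simplification here.
import numpy as np
from mpi4py import MPI

class MpiAdamSketch:
    def __init__(self, size, beta1=0.9, beta2=0.999, epsilon=1e-5):
        self.m = np.zeros(size)
        self.v = np.zeros(size)
        self.t = 0
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon

    def step(self, localg, stepsize, comm=MPI.COMM_WORLD):
        g = np.zeros_like(localg)
        comm.Allreduce(localg, g, op=MPI.SUM)
        g /= comm.Get_size()                      # average gradient over workers
        self.t += 1
        self.m = self.beta1 * self.m + (1 - self.beta1) * g
        self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
        mhat = self.m / (1 - self.beta1 ** self.t)
        vhat = self.v / (1 - self.beta2 ** self.t)
        return -stepsize * mhat / (np.sqrt(vhat) + self.epsilon)  # parameter delta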
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) **kwargs, ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return atarg_novel = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function for the novelty reward term ret_novel = tf.placeholder( dtype=tf.float32, shape=[None]) # Empirical return for the novelty reward term lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # surr1_novel = ratio * atarg_novel # surrogate loss of the novelty term surr2_novel = tf.clip_by_value( ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg_novel # surrogate loss of the novelty term pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) pol_surr_novel = -tf.reduce_mean(tf.minimum( surr1_novel, surr2_novel)) # PPO's surrogate for the novelty part vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) vf_loss_novel = tf.reduce_mean(tf.square(pi.vpred_novel - ret_novel)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] total_loss_novel = pol_surr_novel + pol_entpen + vf_loss_novel losses_novel = [pol_surr_novel, pol_entpen, vf_loss_novel, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] policy_var_list = pi.get_trainable_variables(scope='pi/pol') policy_var_count = 0 for vars in policy_var_list: count_in_var = 1 for dim in vars.shape._dims: count_in_var *= dim policy_var_count += count_in_var noise_count = pi.get_trainable_variables( scope='pi/pol/logstd')[0].shape._dims[1] var_list = pi.get_trainable_variables( scope='pi/pol') + pi.get_trainable_variables(scope='pi/vf/') var_list_novel = pi.get_trainable_variables( scope='pi/pol') + pi.get_trainable_variables(scope='pi/vf_novel/') var_list_pi = pi.get_trainable_variables( scope='pi/pol') + pi.get_trainable_variables( scope='pi/vf/') + pi.get_trainable_variables(scope='pi/vf_novel/') lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) lossandgrad_novel = U.function( [ob, ac, atarg_novel, ret_novel, lrmult], losses_novel + [U.flatgrad(total_loss_novel, 
var_list_novel)]) # adam = MpiAdam(var_list, epsilon=adam_epsilon) # adam_novel = MpiAdam(var_list_novel, epsilon=adam_epsilon) adam_all = MpiAdam(var_list_pi, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) compute_losses_novel = U.function([ob, ac, atarg_novel, ret_novel, lrmult], losses_novel) U.initialize() adam_all.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 novelty_update_iter_cycle = 10 novelty_start_iter = 50 novelty_update = True tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards rewnovelbuffer = deque( maxlen=100) # rolling buffer for episode novelty rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" # This for debug purpose # from collections import defaultdict # sum_batch = {} # sum_batch = defaultdict(lambda: 0, sum_batch) total_task_gradients = [] total_novelty_gradients = [] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, atarg_novel, tdlamret, tdlamret_novel = seg["ob"], seg[ "ac"], seg["adv"], seg["adv_novel"], seg["tdlamret"], seg[ "tdlamret_novel"] vpredbefore = seg["vpred"] # predicted value function before udpate vprednovelbefore = seg[ 'vpred_novel'] # predicted novelty value function before update atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate atarg_novel = (atarg_novel - atarg_novel.mean()) / atarg_novel.std( ) # standartized novelty advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret, atarg_novel=atarg_novel, vtarg_novel=tdlamret_novel), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) same_update_direction = [] # True task_gradient_mag = [] novel_gradient_mag = [] task_gradients = [] novel_gradients = [] same_dir_cnt = 0 oppo_dir_cnt = 0 # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) *newlosses_novel, g_novel = lossandgrad_novel( batch["ob"], batch["ac"], batch["atarg_novel"], batch["vtarg_novel"], cur_lrmult) pol_g = 
g[0:policy_var_count] pol_g_novel = g_novel[0:policy_var_count] comm = MPI.COMM_WORLD pol_g_reduced = np.zeros_like(pol_g) pol_g_novel_reduced = np.zeros_like(pol_g_novel) comm.Allreduce(pol_g, pol_g_reduced, op=MPI.SUM) pol_g_reduced /= comm.Get_size() comm.Allreduce(pol_g_novel, pol_g_novel_reduced, op=MPI.SUM) pol_g_novel_reduced /= comm.Get_size() final_gradient = np.zeros( len(g) + len(g_novel) - policy_var_count) final_gradient[policy_var_count::] = np.concatenate( (g[policy_var_count::], g_novel[policy_var_count::])) # pol_g_normalized = pol_g / np.linalg.norm(pol_g) # pol_g_novel_normalized = pol_g_novel / np.linalg.norm(pol_g_novel) pol_g_reduced_no_noise = pol_g_reduced[:(len(pol_g_reduced) - noise_count)] pol_g_novel_reduced_no_noise = pol_g_novel_reduced[:( len(pol_g_novel_reduced) - noise_count)] pol_g_reduced_no_noise_normalized = pol_g_reduced_no_noise / np.linalg.norm( pol_g_reduced_no_noise) pol_g_novel_reduced_no_noise_normalized = pol_g_novel_reduced_no_noise / np.linalg.norm( pol_g_novel_reduced_no_noise) dot = np.dot(pol_g_reduced_no_noise_normalized, pol_g_novel_reduced_no_noise_normalized) task_gradients.append(pol_g_reduced_no_noise) novel_gradients.append(pol_g_novel_reduced_no_noise) task_gradient_mag.append( np.linalg.norm(pol_g_reduced_no_noise)) novel_gradient_mag.append( np.linalg.norm(pol_g_novel_reduced_no_noise)) same_update_direction.append(dot) # pol_g_normalized = pol_g_reduced_normalized # pol_g_novel_normalized = pol_g_novel_reduced_normalized pol_g_reduced_normalized = pol_g_reduced / np.linalg.norm( pol_g_reduced) pol_g_novel_reduced_normalized = pol_g_novel_reduced / np.linalg.norm( pol_g_novel_reduced) if (dot > 0): same_dir_cnt += 1 bisector_no_noise = (pol_g_reduced_normalized + pol_g_novel_reduced_normalized) bisector_no_noise_normalized = bisector_no_noise / np.linalg.norm( bisector_no_noise) # quarterSector_no_noise = (pol_g_reduced_normalized + bisector_no_noise_normalized) # quarterSector_no_noise_normalized = quarterSector_no_noise / np.linalg.norm(quarterSector_no_noise) # # octSector_no_noise = (pol_g_reduced_normalized + quarterSector_no_noise_normalized) # octSector_no_noise_normalized = octSector_no_noise / np.linalg.norm(octSector_no_noise) target_dir = bisector_no_noise_normalized final_gradient[0:policy_var_count] = 0.5 * ( np.dot(pol_g_reduced, target_dir) + np.dot(pol_g_novel_reduced, target_dir)) * target_dir adam_all.update(final_gradient, optim_stepsize * cur_lrmult) else: oppo_dir_cnt += 1 task_projection_no_noise = np.dot( pol_g_reduced, pol_g_novel_reduced_normalized ) * pol_g_novel_reduced_normalized final_pol_gradient_no_noise = pol_g_reduced - task_projection_no_noise final_gradient[ 0:policy_var_count] = final_pol_gradient_no_noise adam_all.update(final_gradient, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) # newlosses_novel = compute_losses_novel(batch["ob"], batch["ac"], batch["atarg_novel"], batch["vtarg_novel"], # cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"], 
seg['ep_rets_novel'] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, rews_novel = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) rewnovelbuffer.extend(rews_novel) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpRNoveltyRewMean", np.mean(rewnovelbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 if iters_so_far >= novelty_start_iter and iters_so_far % novelty_update_iter_cycle == 0: novelty_update = not novelty_update logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("RelativeDirection", np.array(same_update_direction).mean()) logger.record_tabular("SameDirectionCount", same_dir_cnt) logger.record_tabular("OppoDirectionCount", oppo_dir_cnt) logger.record_tabular("TaskGradMag", np.array(task_gradient_mag).mean()) logger.record_tabular("NoveltyGradMag", np.array(novel_gradient_mag).mean()) task_gradients = np.array(task_gradients).mean(axis=0) total_task_gradients.append(task_gradients) novel_gradients = np.array(novel_gradients).mean(axis=0) total_novelty_gradients.append(novel_gradients) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() if MPI.COMM_WORLD.Get_rank() == 0: gradient_info = {} gradient_info['task_gradients'] = np.array(total_task_gradients) gradient_info['novelty_gradients'] = np.array(total_novelty_gradients) print(np.array(total_task_gradients).shape) print(np.array(total_novelty_gradients).shape) joblib.dump(gradient_info, logger.get_dir() + '/gradientinfo.pkl', compress=True) return pi
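# The update in the learn() above combines a task policy gradient with a
# novelty policy gradient. Hedged toy illustration (plain vectors, not the
# source code): when the two gradients agree (positive dot product) both are
# projected onto their normalized bisector; when they conflict, the task
# gradient's component along the novelty direction is removed.
import numpy as np

def combine_gradients_sketch(g_task, g_novel):
    g_task_n = g_task / np.linalg.norm(g_task)
    g_novel_n = g_novel / np.linalg.norm(g_novel)
    if np.dot(g_task_n, g_novel_n) > 0:
        bisector = g_task_n + g_novel_n
        bisector /= np.linalg.norm(bisector)
        return 0.5 * (np.dot(g_task, bisector) +
                      np.dot(g_novel, bisector)) * bisector
    # conflicting directions: keep only the part of the task gradient that is
    # orthogonal to the novelty gradient
    return g_task - np.dot(g_task, g_novel_n) * g_novel_n

combined = combine_gradients_sketch(np.array([1.0, 0.0]), np.array([1.0, 1.0]))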
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) sym_loss_weight=0.0, return_threshold=None, # termiante learning if reaches return_threshold op_after_init=None, init_policy_params=None, policy_scope=None, max_threshold=None, positive_rew_enforce=False, reward_drop_bound=None, min_iters=0, ref_policy_params=None, rollout_length_thershold=None): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space if policy_scope is None: pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy else: pi = policy_func(policy_scope, ob_space, ac_space) # Construct network for new policy oldpi = policy_func("old" + policy_scope, ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent sym_loss = sym_loss_weight * U.mean( tf.square(pi.mean - pi.mirrored_mean)) # mirror symmetric loss ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) + sym_loss # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent, sym_loss] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent", "sym_loss"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() if init_policy_params is not None: cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name. 
find('/')] orig_scope = list(init_policy_params.keys() )[0][0:list(init_policy_params.keys())[0].find('/')] for i in range(len(pi.get_variables())): assign_op = pi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) assign_op = oldpi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) if ref_policy_params is not None: ref_pi = policy_func("ref_pi", ob_space, ac_space) cur_scope = ref_pi.get_variables()[0].name[0:ref_pi.get_variables()[0]. name.find('/')] orig_scope = list(ref_policy_params.keys() )[0][0:list(ref_policy_params.keys())[0].find('/')] for i in range(len(ref_pi.get_variables())): assign_op = ref_pi.get_variables()[i].assign( ref_policy_params[ref_pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) env.env.env.ref_policy = ref_pi adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" max_thres_satisfied = max_threshold is None adjust_ratio = 0.0 prev_avg_rew = -1000000 revert_parameters = {} variables = pi.get_variables() for i in range(len(variables)): cur_val = variables[i].eval() revert_parameters[variables[i].name] = cur_val revert_data = [0, 0, 0] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() if reward_drop_bound is not None: lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) revert_iteration = False if np.mean( rewbuffer ) < prev_avg_rew - reward_drop_bound: # detect significant drop in performance, revert to previous iteration print("Revert Iteration!!!!!") revert_iteration = True else: prev_avg_rew = np.mean(rewbuffer) logger.record_tabular("Revert Rew", prev_avg_rew) if revert_iteration: # revert iteration for i in range(len(pi.get_variables())): assign_op = pi.get_variables()[i].assign( revert_parameters[pi.get_variables()[i].name]) U.get_session().run(assign_op) episodes_so_far = revert_data[0] timesteps_so_far = revert_data[1] iters_so_far = revert_data[2] continue else: variables = pi.get_variables() for i in range(len(variables)): cur_val = variables[i].eval() revert_parameters[variables[i].name] = np.copy(cur_val) revert_data[0] = episodes_so_far revert_data[1] = timesteps_so_far revert_data[2] = iters_so_far if positive_rew_enforce: rewlocal = (seg["pos_rews"], seg["neg_pens"], seg["rew"] ) # local values listofrews = 
MPI.COMM_WORLD.allgather(rewlocal) # list of tuples pos_rews, neg_pens, rews = map(flatten_lists, zip(*listofrews)) if np.mean(rews) < 0.0: #min_id = np.argmin(rews) #adjust_ratio = pos_rews[min_id]/np.abs(neg_pens[min_id]) adjust_ratio = np.max([adjust_ratio, np.mean(pos_rews) / np.abs(np.mean(neg_pens))]) for i in range(len(seg["rew"])): if np.abs(seg["rew"][i] - seg["pos_rews"][i] - seg["neg_pens"][i]) > 1e-5: print(seg["rew"][i], seg["pos_rews"][i], seg["neg_pens"][i]) raise ValueError('Reward wrong!') seg["rew"][i] = seg["pos_rews"][i] + seg["neg_pens"][i] * adjust_ratio add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) if reward_drop_bound is None: lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("Iter", iters_so_far) if positive_rew_enforce: if adjust_ratio is not None: logger.record_tabular("RewardAdjustRatio", adjust_ratio) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() if max_threshold is not None: print('Current max return: ', np.max(rewbuffer)) if np.max(rewbuffer) > max_threshold: max_thres_satisfied = True else: max_thres_satisfied = False return_threshold_satisfied = True if return_threshold is not None: if not (np.mean(rewbuffer) > return_threshold and iters_so_far > min_iters): return_threshold_satisfied = False rollout_length_thershold_satisfied = True if rollout_length_thershold is not None: rewlocal = (seg["avg_vels"], seg["rew"])
# local values listofrews = MPI.COMM_WORLD.allgather(rewlocal) # list of tuples avg_vels, rews = map(flatten_lists, zip(*listofrews)) if not (np.mean(lenbuffer) > rollout_length_thershold and np.mean(avg_vels) > 0.5 * env.env.env.final_tv): rollout_length_thershold_satisfied = False if rollout_length_thershold is not None or return_threshold is not None: if rollout_length_thershold_satisfied and return_threshold_satisfied: break return pi, np.mean(rewbuffer)
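# Every learn() variant in this file calls add_vtarg_and_adv(seg, gamma, lam) before building the
# minibatch Dataset, but the helper itself is defined elsewhere. Below is a minimal sketch, assuming
# the rollout segment follows the baselines convention with "rew", "vpred", "new" (episode-start
# flags) and "nextvpred" entries; those field names are an assumption, not something this file shows.
import numpy as np

def add_vtarg_and_adv_sketch(seg, gamma, lam):
    """Compute GAE(lambda) advantages (seg["adv"]) and TD(lambda) returns (seg["tdlamret"]) in place."""
    new = np.append(seg["new"], 0)  # last element only used for bootstrapping
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype=np.float32)
    rew = seg["rew"]
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]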
o, r, d, _ = policy.act(random=True) length += 1 collected_data_size += 1 if d: lengths.append(length) break print('Average rollout length: ', np.mean(lengths)) dataset = Dataset(dict(X=np.array(input_data), Y=np.array(output_data)), shuffle=True) losses = [] for epoch in range(300): loss_epoch = [] for batch in dataset.iterate_once(64): batch["X"] = torch.tensor(batch["X"], dtype=torch.float32, device=device) batch["Y"] = torch.tensor(batch["Y"], dtype=torch.float32, device=device) optimizer.zero_grad() outputs = osi(batch["X"]) loss = criterion(outputs, batch["Y"]) loss.backward() optimizer.step() loss_epoch.append(float(loss)) losses.append(np.mean(loss_epoch)) if epoch % 5 == 0: print('epoch: ', epoch, 'loss: ', np.mean(loss_epoch))
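# The supervised OSI loop above and the PPO loops below all consume minibatches through a
# Dataset(...).iterate_once(batch_size) helper. A minimal stand-in is sketched here, assuming the
# baselines-style behaviour of shuffling once per pass and yielding dict minibatches; the class
# name MiniDataset and its exact semantics are illustrative assumptions, not this file's code.
import numpy as np

class MiniDataset:
    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map
        self.shuffle = shuffle
        self.n = next(iter(data_map.values())).shape[0]

    def iterate_once(self, batch_size):
        idx = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(idx)
        for start in range(0, self.n, batch_size):
            sel = idx[start:start + batch_size]
            yield {key: value[sel] for key, value in self.data_map.items()}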
def learn( env, policy_func, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) load_model=None, action_bias=0.4, action_repeat=0, action_repeat_rand=False, warmup_frames=0, target_kl=0.01, vf_loss_mult=1, vfloss_optim_stepsize=0.003, vfloss_optim_batchsize=8, vfloss_optim_epochs=10): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule # Not sure why they anneal clip and learning rate with the same parameter #clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen losses = [pol_surr, pol_entpen, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) lossandgrad_vfloss = U.function([ob, ac, atarg, ret], [vf_loss] + [U.flatgrad(vf_loss, var_list)]) adam_vfloss = MpiAdam(var_list, epsilon=adam_epsilon) compute_vfloss = U.function([ob, ac, atarg, ret], [vf_loss]) U.initialize() adam.sync() adam_vfloss.sync() if load_model: logger.log('Loading model: %s' % load_model) pi.load(load_model) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, action_bias=action_bias, action_repeat=action_repeat, action_repeat_rand=action_repeat_rand, warmup_frames=warmup_frames) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" ep_rew_file = None if MPI.COMM_WORLD.Get_rank() == 0: import wandb ep_rew_file = 
open( os.path.join(wandb.run.dir, 'episode_rewards.jsonl'), 'w') checkpoint_dir = 'checkpoints-%s' % wandb.run.id os.mkdir(checkpoint_dir) cur_lrmult = 1.0 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) elif schedule == 'target_kl': pass else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.next() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): result = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) newlosses = result[:-1] g = result[-1] adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) # vfloss optimize logger.log("Optimizing value function...") logger.log(fmt_row(13, ['vf'])) for _ in range(vfloss_optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(vfloss_optim_batchsize): result = lossandgrad_vfloss(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"]) newlosses = result[:-1] g = result[-1] adam_vfloss.update(g, vfloss_optim_stepsize) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) newlosses += compute_vfloss(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"]) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names + ['vf']): logger.record_tabular("loss_" + name, lossval) # check kl if schedule == 'target_kl': if meanlosses[2] > target_kl * 1.1: cur_lrmult /= 1.5 elif meanlosses[2] < target_kl / 1.1: cur_lrmult *= 1.5 logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) if rewbuffer: logger.record_tabular('CurLrMult', cur_lrmult) logger.record_tabular('StepSize', optim_stepsize * cur_lrmult) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) 
logger.record_tabular("EpRewMax", np.max(rewbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpRewMin", np.min(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) time_elapsed = time.time() - tstart logger.record_tabular("TimeElapsed", time_elapsed) if MPI.COMM_WORLD.Get_rank() == 0: import wandb ep_rew_file.write('%s\n' % json.dumps({ 'TimeElapsed': time_elapsed, 'Rewards': rews })) ep_rew_file.flush() data = logger.Logger.CURRENT.name2val wandb.run.history.add(data) summary_data = {} for k, v in data.items(): if 'Rew' in k: summary_data[k] = v wandb.run.summary.update(summary_data) pi.save( os.path.join(checkpoint_dir, 'model-%s.ckpt' % (iters_so_far - 1))) logger.dump_tabular() else: logger.log('No episodes complete yet')
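# The schedule='target_kl' branch above shrinks or grows the learning-rate multiplier after each
# iteration based on the measured mean KL between the old and new policy. The same rule, factored
# out for clarity; the 1.1 band and 1.5 factor mirror the code above, while clamping the multiplier
# to [low, high] is an extra assumption added here.
def adapt_lrmult(cur_lrmult, mean_kl, target_kl, low=1e-3, high=10.0):
    if mean_kl > target_kl * 1.1:
        cur_lrmult /= 1.5
    elif mean_kl < target_kl / 1.1:
        cur_lrmult *= 1.5
    return min(max(cur_lrmult, low), high)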
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) load_model_path, test_only, stochastic, symmetric_training=False, obs_names=None, single_episode=False, horizon_hack=False, running_avg_len=100, init_three=False, actions=None, symmetric_training_trick=False, seeds_fn=None, bootstrap_seeds=False, ): global seeds # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Network for new policy old_pi = policy_func("old_pi", ob_space, ac_space) # Network for old policy adv_targ = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return mask = tf.placeholder(dtype=tf.bool, shape=[None]) # Mask for the trick lr_mult = tf.placeholder( name='lr_mult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lr_mult # Annealed clipping parameter epsilon ob = U.get_placeholder_cached(name="ob") st = U.get_placeholder_cached(name="st") ac = pi.pdtype.sample_placeholder([None]) kl = old_pi.pd.kl(pi.pd) ent = pi.pd.entropy() mean_kl = U.mean(tf.boolean_mask(kl, mask)) # Mean over the batch mean_ent = U.mean(tf.boolean_mask(ent, mask)) entropy_penalty = -entcoeff * mean_ent ratio = tf.exp(pi.pd.logp(ac) - old_pi.pd.logp(ac)) # pi_new / pi_old surr_1 = ratio * adv_targ # surrogate from conservative policy iteration surr_2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ # surr_loss = -U.mean(tf.boolean_mask( tf.minimum(surr_1, surr_2), mask)) # PPO's pessimistic surrogate (L^CLIP), mean over the batch vf_loss = U.mean(tf.boolean_mask(tf.square(pi.vpred - ret), mask)) total_loss = surr_loss + entropy_penalty + vf_loss losses = [surr_loss, entropy_penalty, vf_loss, mean_kl, mean_ent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() comp_loss_and_grad = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(old_v, new_v) for (old_v, new_v) in zipsame(old_pi.get_variables(), pi.get_variables()) ]) comp_loss = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask], losses) if init_three: assign_init_three_1 = U.function( [], [], updates=[ tf.assign(new_v, old_v) for (old_v, new_v) in zipsame( pi.get_orig_variables(), pi.get_part_variables(1)) ]) assign_init_three_2 = U.function( [], [], updates=[ tf.assign(new_v, old_v) for (old_v, new_v) in zipsame( pi.get_orig_variables(), pi.get_part_variables(2)) ]) U.initialize() if load_model_path is not None: U.load_state(load_model_path) if init_three: assign_init_three_1() assign_init_three_2() adam.sync() if seeds_fn is not None: with open(seeds_fn) as f: seeds = [int(seed) for seed in f.readlines()] # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, 
timesteps_per_batch, stochastic=stochastic, single_episode=test_only or single_episode, actions=actions, bootstrap_seeds=bootstrap_seeds) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() len_buffer = deque( maxlen=running_avg_len) # rolling buffer for episode lengths rew_buffer = deque( maxlen=running_avg_len) # rolling buffer for episode rewards origrew_buffer = deque( maxlen=running_avg_len) # rolling buffer for original episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam, horizon_hack=horizon_hack) # ob, ac, adv_targ, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, st, ac, adv_targ, tdlamret = seg["ob"], seg["step"], seg[ "ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate if symmetric_training_trick: first_75 = st < 75 mask = ~np.concatenate((np.zeros_like(first_75), first_75)) else: mask = np.concatenate( (np.ones_like(st, dtype=np.bool), np.ones_like(st, dtype=np.bool))) if symmetric_training: sym_obss = [] sym_acc = [] for i in range(timesteps_per_batch): obs = OrderedDict(zip(obs_names, ob[i])) sym_obs = obs.copy() swap_legs(sym_obs) sym_ac = ac[i].copy() sym_ac = np.concatenate((sym_ac[9:], sym_ac[:9])) sym_obss.append(np.asarray(list(sym_obs.values()))) sym_acc.append(sym_ac) sym_obss = np.asarray(sym_obss) sym_acc = np.asarray(sym_acc) ob = np.concatenate((ob, sym_obss)) ac = np.concatenate((ac, sym_acc)) adv_targ = np.concatenate((adv_targ, adv_targ)) tdlamret = np.concatenate((tdlamret, tdlamret)) vpredbefore = np.concatenate((vpredbefore, vpredbefore)) st = np.concatenate((st, st)) # Compute stats before updating if bootstrap_seeds: lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"], seg["easy_seeds"], seg["hard_seeds"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, orig_rews, easy_seeds, hard_seeds = map( flatten_lists, zip(*listoflrpairs)) easy_seeds = [x for x in easy_seeds if x != 0] hard_seeds = [x for x in hard_seeds if x != 0] print('seeds set sizes:', len(seeds), len(easy_seeds), len(hard_seeds)) seeds = list((set(seeds) - set(easy_seeds)) | set(hard_seeds)) else: lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, orig_rews = map(flatten_lists, zip(*listoflrpairs)) len_buffer.extend(lens) rew_buffer.extend(rews) origrew_buffer.extend(orig_rews) logger.record_tabular("Iter", iters_so_far) logger.record_tabular("EpLenMean", np.mean(len_buffer)) logger.record_tabular("EpRewMean", np.mean(rew_buffer)) logger.record_tabular("EpOrigRewMean", np.mean(origrew_buffer)) logger.record_tabular("EpOrigRewStd", np.std(origrew_buffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) logger.record_tabular("EpisodesSoFar", 
episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) n_completed = 0 sum_completed = 0 for ep_len, orig_rew in zip(lens, orig_rews): if ep_len == 1000: n_completed += 1 sum_completed += orig_rew avg_completed = sum_completed / n_completed if n_completed > 0 else 0 logger.record_tabular("AvgCompleted", avg_completed) perc_completed = 100 * n_completed / len(lens) if len(lens) > 0 else 0 logger.record_tabular("PercCompleted", perc_completed) if callback: callback(locals(), globals()) adv_targ = (adv_targ - adv_targ.mean()) / adv_targ.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, st=st, ac=ac, atarg=adv_targ, vtarg=tdlamret, mask=mask), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") if not test_only: logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data. I log results only for the first worker (rank=0) for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *batch_losses, grads = comp_loss_and_grad( batch["ob"], batch["st"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, batch["mask"]) if not test_only: adam.update(grads, optim_stepsize * cur_lrmult) losses.append(batch_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): batch_losses = comp_loss(batch["ob"], batch["st"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, batch["mask"]) losses.append(batch_losses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() iters_so_far += 1
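# When symmetric_training is enabled above, every transition is duplicated with a mirrored
# observation (via swap_legs) and the two 9-dim leg halves of the action swapped, doubling the
# batch. A compact sketch of that augmentation, assuming swap_legs mutates the observation dict in
# place as in the code above; the helper name mirror_augment is illustrative.
import numpy as np
from collections import OrderedDict

def mirror_augment(ob, ac, obs_names, swap_legs):
    sym_obs, sym_ac = [], []
    for o, a in zip(ob, ac):
        od = OrderedDict(zip(obs_names, o))
        swap_legs(od)  # mirror left/right observation features in place
        sym_obs.append(np.asarray(list(od.values())))
        sym_ac.append(np.concatenate((a[9:], a[:9])))  # swap the leg halves of the action
    return (np.concatenate((ob, np.asarray(sym_obs))),
            np.concatenate((ac, np.asarray(sym_ac))))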
def ppo_learn(env, policy, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation args, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) save_obs=False): # Setup losses and stuff # ---------------------------------------- pi = policy oldpi = create_policy("oldpi", env) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() logger.log("trainable variables:", var_list) lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Initializing oldpi = pi. assign_old_eq_new() # Prepare for rollouts seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 ep_suc_so_far = 0 # success episodes num so far during training tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" ep_mean_rews = list() ep_mean_lens = list() eval_success_rates = list() # this is for saving global info for multiple evaluation results. 
eval_suc_buffer = deque(maxlen=2) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break """ Learning rate scheduler """ if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': # cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) cur_lrmult = 1.0 cur_lrmult = max(cur_lrmult * np.power(0.95, float(iters_so_far) / max_iters), 0.7) else: raise NotImplementedError logger.log("********** Iteration %i ************" % (iters_so_far+1)) # Current iteration index seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] rews = seg['rew'] ep_rets = seg['ep_rets'] train_sucs = seg['suc'] mc_rets = seg['mcreturn'] vpredbefore = seg['vpred'] tdtarget = seg['tdtarget'] """ In case of collecting real-time sim data and its vpred for further debugging """ sim_data_name = 'sim_data' with open(args['RUN_DIR'] + '/' + sim_data_name + '.csv', 'a') as f: vpred_shaped = vpredbefore.reshape(-1, 1) atarg_shaped = atarg.reshape(-1,1) tdlamret_shaped = tdlamret.reshape(-1,1) tdtarget_shaped = tdtarget.reshape(-1,1) rews_shaped = rews.reshape(-1,1) log_data = np.concatenate((ob, vpred_shaped, atarg_shaped, tdlamret_shaped, tdtarget_shaped, rews_shaped), axis=1) if args['gym_env'] == 'QuadTakeOffHoverEnv-v0': log_df = pd.DataFrame(log_data, columns=['z', 'vx', 'vy', 'vz', 'roll', 'pitch', 'yaw', 'roll_w', 'pitch_w', 'yaw_w', 'vpred', 'atarg', 'tdlamret', 'tdtarget','rews']) else: raise ValueError("invalid env !!!") log_df.to_csv(f, header=True) """ Optimization """ atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # update pi.ob_rms based on the most recent ob if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data kl_threshold = 0.05 for _ in range(optim_epochs): losses = [] # list of sublists, each of which gives the loss based on a set of samples with size "optim_batchsize" grads = [] # list of sublists, each of which gives the gradients w.r.t all variables based on a set of samples with size "optim_batchsize" for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) if any(np.isnan(g)): logger.log("there are nan in gradients, skip further updating!") break if newlosses[3] < kl_threshold: adam.update(g, optim_stepsize * cur_lrmult) else: logger.log("KL loss is %f larger than kl_threshold %f, early stop further updating!" 
% (newlosses[3], kl_threshold)) break # break only jump out of the inner loop grads.append(g) losses.append(newlosses) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) ep_mean_lens.append(np.mean(lenbuffer)) ep_mean_rews.append(np.mean(rewbuffer)) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("EpRewMeanThisIter", np.mean(seg["ep_rets"])) logger.record_tabular("EpSuccessThisIter", Counter(train_sucs)[True]) logger.record_tabular("SucRateThisIter", Counter(train_sucs)[True] / len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 ep_suc_so_far += Counter(train_sucs)[True] logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("EpSuccessSoFar", ep_suc_so_far) logger.record_tabular("SucRateSoFar", ep_suc_so_far/episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular() """ Evaluation """ EVALUATION_FREQUENCY = 10 # 10 if iters_so_far % EVALUATION_FREQUENCY == 0: eval_max_iters = 5 eval_iters_so_far = 0 eval_timesteps_per_actorbatch = timesteps_per_actorbatch eval_lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths eval_rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards eval_episodes_so_far = 0 eval_timesteps_so_far = 0 eval_success_episodes_so_far = 0 # prepare eval episode generator eval_seg_gen = traj_segment_generator(pi, env, eval_timesteps_per_actorbatch, stochastic=False) logger.log("********** Start evaluating ... 
************") while True: if eval_max_iters and eval_iters_so_far >= eval_max_iters: break logger.log("********** Eval Iteration %i ************" %(eval_iters_so_far+1)) eval_seg = eval_seg_gen.__next__() eval_lrlocal = (eval_seg["ep_lens"], eval_seg["ep_rets"]) # local values eval_listoflrpairs = MPI.COMM_WORLD.allgather(eval_lrlocal) # list of tuples eval_lens, eval_rews = map(flatten_lists, zip(*eval_listoflrpairs)) eval_lenbuffer.extend(eval_lens) eval_rewbuffer.extend(eval_rews) logger.record_tabular("EpLenMean", np.mean(eval_lenbuffer)) logger.record_tabular("EpRewMean", np.mean(eval_rewbuffer)) logger.record_tabular("EpThisIter", len(eval_lens)) eval_sucs = eval_seg["suc"] logger.record_tabular("EpSuccessThisIter", Counter(eval_sucs)[True]) eval_episodes_so_far += len(eval_lens) eval_timesteps_so_far += sum(eval_lens) eval_success_episodes_so_far += Counter(eval_sucs)[True] logger.record_tabular("EpisodesSoFar", eval_episodes_so_far) logger.record_tabular("TimestepsSoFar", eval_timesteps_so_far) logger.record_tabular("EpisodesSuccessSoFar", eval_success_episodes_so_far) logger.record_tabular("SuccessRateSoFar", eval_success_episodes_so_far * 1.0 / eval_episodes_so_far) eval_iters_so_far += 1 if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() # save success rate from each evaluation into global list eval_success_rates.append(eval_success_episodes_so_far * 1.0 / eval_episodes_so_far) eval_suc_buffer.append(eval_success_episodes_so_far * 1.0 / eval_episodes_so_far) """ Saving model and statistics """ MODEL_SAVING_FREQ = 30 # 30 is enough for some learning if iters_so_far % MODEL_SAVING_FREQ == 0: pi.save_model(args['MODEL_DIR'], iteration=iters_so_far) # save necessary training statistics with open(args['RESULT_DIR'] + '/train_reward_' + 'iter_' + str(iters_so_far) + '.pkl', 'wb') as f_train: pickle.dump(ep_mean_rews, f_train) # save necessary evaluation statistics with open(args['RESULT_DIR'] + '/eval_success_rate_' + 'iter_' + str(iters_so_far) + '.pkl', 'wb') as f_eval: pickle.dump(eval_success_rates, f_eval) """ Plotting and saving statistics """ PLOT_FREQUENCY = 10 # 10 if iters_so_far % PLOT_FREQUENCY == 0: # plot training reward performance train_plot_x = np.arange(len(ep_mean_rews)) + 1 train_plot_x = np.insert(train_plot_x, 0, 0) train_plot_y = np.insert(ep_mean_rews, 0, ep_mean_rews[0]) plot_performance(x=train_plot_x, y=train_plot_y, ylabel=r'episode mean reward at each iteration', xlabel='ppo iterations', figfile=os.path.join(args['FIGURE_DIR'], 'train_reward'), title='TRAIN') # plot evaluation success rate eval_plot_x = (np.arange(len(eval_success_rates)) + 1) * EVALUATION_FREQUENCY eval_plot_x = np.insert(eval_plot_x, 0, 0) eval_plot_y = np.insert(eval_success_rates, 0, 0) plot_performance(x=eval_plot_x, y = eval_plot_y, ylabel=r'eval success rate', xlabel='ppo iterations', figfile=os.path.join(args['FIGURE_DIR'], 'eval_success_rate'), title="EVAL") return pi
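# ppo_learn's inner optimization loop above guards each Adam step: it skips updating when the
# gradient contains NaNs and stops the epoch early once the batch KL (losses[3] = meankl) exceeds
# kl_threshold. The same logic as a small helper; the (losses, stop) return convention is an
# illustrative choice, not taken from this file.
import numpy as np

def guarded_update(adam, lossandgrad, batch, cur_lrmult, optim_stepsize, kl_threshold=0.05):
    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                batch["atarg"], batch["vtarg"], cur_lrmult)
    if np.any(np.isnan(g)):
        return newlosses, True   # NaN gradients: stop further updating, as in the loop above
    if newlosses[3] >= kl_threshold:
        return newlosses, True   # KL above threshold: early-stop this epoch
    adam.update(g, optim_stepsize * cur_lrmult)
    return newlosses, False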
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update log_every=None, log_dir=None, episodes_so_far=0, timesteps_so_far=0, iters_so_far=0, clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) **kwargs): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy # Target advantage function (if applicable) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return # learning rate multiplier, updated with schedule lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- # GRASPING saver = tf.train.Saver(var_list=U.ALREADY_INITIALIZED, max_to_keep=1) checkpoint = tf.train.latest_checkpoint(log_dir) if checkpoint: print("Restoring checkpoint: {}".format(checkpoint)) saver.restore(U.get_session(), checkpoint) if hasattr(env, "set_actor"): def actor(obs): return pi.act(False, obs)[0] env.set_actor(actor) if not checkpoint and hasattr(env, "warm_init_eps"): pretrain(pi, env) saver.save(U.get_session(), osp.join(log_dir, "model")) # /GRASPING seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) tstart = time.time() assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) should_break = False if max_timesteps and timesteps_so_far >= max_timesteps: should_break = True elif max_episodes and episodes_so_far >= max_episodes: should_break = True elif max_iters and iters_so_far >= max_iters: should_break = True elif max_seconds and time.time() - tstart >= max_seconds: should_break = True if log_every and 
log_dir: if (iters_so_far + 1) % log_every == 0 or should_break: # To reduce space, don't specify global step. saver.save(U.get_session(), osp.join(log_dir, "model")) job_info = { 'episodes_so_far': episodes_so_far, 'iters_so_far': iters_so_far, 'timesteps_so_far': timesteps_so_far } with open(osp.join(log_dir, "job_info_new.yaml"), 'w') as file: yaml.dump(job_info, file, default_flow_style=False) # Make sure write is instantaneous. file.flush() os.fsync(file) os.rename(osp.join(log_dir, "job_info_new.yaml"), osp.join(log_dir, "job_info.yaml")) if should_break: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / ( atarg.std() + 1e-10) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values # logger.log("Optimizing...") # logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) # logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) logger.record_tabular("EpLenMean", np.mean(lens)) logger.record_tabular("EpRewMean", np.mean(rews)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular()
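# The checkpointing learn() above persists its progress counters next to the TensorFlow checkpoint
# by writing job_info_new.yaml, flushing and fsyncing it, then renaming it over job_info.yaml so a
# crash never leaves a half-written file behind. The same pattern as a standalone helper; the
# function name save_job_info is illustrative.
import os
import os.path as osp
import yaml

def save_job_info(log_dir, episodes_so_far, timesteps_so_far, iters_so_far):
    job_info = {
        'episodes_so_far': episodes_so_far,
        'iters_so_far': iters_so_far,
        'timesteps_so_far': timesteps_so_far,
    }
    tmp_path = osp.join(log_dir, "job_info_new.yaml")
    with open(tmp_path, 'w') as f:
        yaml.dump(job_info, f, default_flow_style=False)
        f.flush()
        os.fsync(f.fileno())
    os.rename(tmp_path, osp.join(log_dir, "job_info.yaml"))  # atomic replace on POSIX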
def learn(env, policy_fn, *, timesteps_per_actorbatch, clip_param, entcoeff, optim_epochs, optim_stepsize, optim_batchsize, gamma, lam, max_timesteps, alphas, schedule, return_mv_avg, adam_epsilon=1e-5): ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) oldpi = policy_fn("oldpi", ob_space, ac_space) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) new = tf.placeholder(dtype=tf.float32, shape=[None]) ret = tf.placeholder(dtype=tf.float32, shape=[None]) lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent log_ratio = pi.pd.logp(ac) - oldpi.pd.logp(ac) bool_new = tf.cast(new, tf.bool) def get_shift(i): shift = tf.concat([tf.zeros((i, )), log_ratio[:-i]], 0) shift = tf.where(bool_new, tf.zeros_like(shift), shift) for _ in range(1, i): shift = tf.where( tf.concat([tf.ones((i, ), dtype=tf.bool), bool_new[:-i]], 0), tf.zeros_like(shift), shift) return shift shifts = [log_ratio] + [get_shift(i) for i in range(1, len(alphas))] is_log_ratio = sum(b * s for b, s in zip(alphas, shifts)) ratio = tf.exp(is_log_ratio) surr1 = ratio * atarg surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2)) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult, new], losses + [U.flatgrad(total_loss, var_list)]) adam = AdamOptimizer(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, new], losses) U.initialize() seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=200) rewbuffer = deque(maxlen=return_mv_avg) results = [] while True: if timesteps_so_far > max_timesteps: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError() logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] atarg = (atarg - atarg.mean()) / atarg.std() d = Dataset( dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret, new=seg["new"])) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) assign_old_eq_new() logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) for _ in range(optim_epochs): losses = [] for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, batch["new"]) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = 
compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, batch["new"]) losses.append(newlosses) meanlosses = np.vstack(losses).mean(0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) listoflrpairs = [(seg["ep_lens"], seg["ep_rets"])] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) results.append(np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("AvgEpisodeLen", np.mean(lenbuffer)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed [h]", (time.time() - tstart) / 3600) logger.record_tabular("Timesteps/sec", timesteps_so_far / (time.time() - tstart)) logger.dump_tabular() return results
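# Every variant in this file minimizes the same clipped PPO surrogate; in the importance-sampling
# version above the per-step log-ratio is first mixed with shifted copies weighted by alphas. The
# core objective written out in plain NumPy for reference (a sketch, not the TF graph used above):
import numpy as np

def clipped_surrogate(logp_new, logp_old, adv, clip_param):
    ratio = np.exp(logp_new - logp_old)           # pi_new / pi_old
    surr1 = ratio * adv                           # conservative policy iteration surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))     # negated pessimistic bound (L^CLIP)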
def learn( args, env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) writer=None): print("\nBeginning learning...\n") # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.compat.v1.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.compat.v1.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = {} ob['adj'] = U.get_placeholder_cached(name="adj") ob['node'] = U.get_placeholder_cached(name="node") ob_gen = {} ob_gen['adj'] = U.get_placeholder( shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_gen') ob_gen['node'] = U.get_placeholder( shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_gen') ob_real = {} ob_real['adj'] = U.get_placeholder( shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_real') ob_real['node'] = U.get_placeholder( shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_real') ac = tf.compat.v1.placeholder(dtype=tf.int64, shape=[None, 4], name='ac_real') ## PPO loss kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent pi_logp = pi.pd.logp(ac) oldpi_logp = oldpi.pd.logp(ac) ratio_log = pi.pd.logp(ac) - oldpi.pd.logp(ac) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] ## Expert loss loss_expert = -tf.reduce_mean(pi_logp) ## Discriminator loss step_pred_real, step_logit_real = discriminator_net(ob_real, args, name='d_step') step_pred_gen, step_logit_gen = discriminator_net(ob_gen, args, name='d_step') loss_d_step_real = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_real, labels=tf.ones_like(step_logit_real) * 0.9)) loss_d_step_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen))) loss_d_step = loss_d_step_real + loss_d_step_gen if args.gan_type == 'normal': loss_g_step_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen))) elif args.gan_type == 'recommend': loss_g_step_gen = tf.reduce_mean( 
tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.ones_like(step_logit_gen) * 0.9)) elif args.gan_type == 'wgan': loss_d_step, _, _ = discriminator(ob_real, ob_gen, args, name='d_step') loss_d_step = loss_d_step * -1 loss_g_step_gen, _ = discriminator_net(ob_gen, args, name='d_step') final_pred_real, final_logit_real = discriminator_net(ob_real, args, name='d_final') final_pred_gen, final_logit_gen = discriminator_net(ob_gen, args, name='d_final') loss_d_final_real = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_real, labels=tf.ones_like(final_logit_real) * 0.9)) loss_d_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen))) loss_d_final = loss_d_final_real + loss_d_final_gen if args.gan_type == 'normal': loss_g_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen))) elif args.gan_type == 'recommend': loss_g_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.ones_like(final_logit_gen) * 0.9)) elif args.gan_type == 'wgan': loss_d_final, _, _ = discriminator(ob_real, ob_gen, args, name='d_final') loss_d_final = loss_d_final * -1 loss_g_final_gen, _ = discriminator_net(ob_gen, args, name='d_final') var_list_pi = pi.get_trainable_variables() var_list_pi_stop = [ var for var in var_list_pi if ('emb' in var.name) or ('gcn' in var.name) or ('stop' in var.name) ] var_list_d_step = [ var for var in tf.compat.v1.global_variables() if 'd_step' in var.name ] var_list_d_final = [ var for var in tf.compat.v1.global_variables() if 'd_final' in var.name ] ## debug debug = {} ## loss update function lossandgrad_ppo = U.function([ ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret, lrmult ], losses + [U.flatgrad(total_loss, var_list_pi)]) lossandgrad_expert = U.function( [ob['adj'], ob['node'], ac, pi.ac_real], [loss_expert, U.flatgrad(loss_expert, var_list_pi)]) lossandgrad_expert_stop = U.function( [ob['adj'], ob['node'], ac, pi.ac_real], [loss_expert, U.flatgrad(loss_expert, var_list_pi_stop)]) lossandgrad_d_step = U.function( [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']], [loss_d_step, U.flatgrad(loss_d_step, var_list_d_step)]) lossandgrad_d_final = U.function( [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']], [loss_d_final, U.flatgrad(loss_d_final, var_list_d_final)]) loss_g_gen_step_func = U.function([ob_gen['adj'], ob_gen['node']], loss_g_step_gen) loss_g_gen_final_func = U.function([ob_gen['adj'], ob_gen['node']], loss_g_final_gen) adam_pi = MpiAdam(var_list_pi, epsilon=adam_epsilon) adam_pi_stop = MpiAdam(var_list_pi_stop, epsilon=adam_epsilon) adam_d_step = MpiAdam(var_list_d_step, epsilon=adam_epsilon) adam_d_final = MpiAdam(var_list_d_final, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.compat.v1.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret, lrmult ], losses) # Prepare for rollouts # ---------------------------------------- episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths lenbuffer_valid = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_env = 
deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_d_step = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_d_final = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_final = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_final_stat = deque( maxlen=100) # rolling buffer for episode rewardsn seg_gen = traj_segment_generator(args, pi, env, timesteps_per_actorbatch, True, loss_g_gen_step_func, loss_g_gen_final_func) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" if args.load == 1: try: fname = './ckpt/' + args.name_full_load sess = tf.get_default_session() # sess.run(tf.compat.v1.global_variables_initializer()) saver = tf.train.Saver(var_list_pi) saver.restore(sess, fname) iters_so_far = int(fname.split('_')[-1]) + 1 print('model restored!', fname, 'iters_so_far:', iters_so_far) except: print(fname, 'ckpt not found, start with iters 0') U.initialize() adam_pi.sync() adam_pi_stop.sync() adam_d_step.sync() adam_d_final.sync() counter = 0 level = 0 ## start training while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError # logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob_adj, ob_node, ac, atarg, tdlamret = seg["ob_adj"], seg[ "ob_node"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob_adj=ob_adj, ob_node=ob_node, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob_adj.shape[0] # inner training loop, train policy for i_optim in range(optim_epochs): loss_expert = 0 loss_expert_stop = 0 g_expert = 0 g_expert_stop = 0 loss_d_step = 0 loss_d_final = 0 g_ppo = 0 g_d_step = 0 g_d_final = 0 pretrain_shift = 5 ## Expert if iters_so_far >= args.expert_start and iters_so_far <= args.expert_end + pretrain_shift: ## Expert train # # # learn how to stop ob_expert, ac_expert = env.get_expert(optim_batchsize) loss_expert, g_expert = lossandgrad_expert( ob_expert['adj'], ob_expert['node'], ac_expert, ac_expert) loss_expert = np.mean(loss_expert) ## PPO if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end: assign_old_eq_new( ) # set old parameter values to new parameter values batch = d.next_batch(optim_batchsize) # ppo if iters_so_far >= args.rl_start + pretrain_shift: # start generator after discriminator trained a well.. 
*newlosses, g_ppo = lossandgrad_ppo( batch["ob_adj"], batch["ob_node"], batch["ac"], batch["ac"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses_ppo = newlosses if args.has_d_step == 1 and i_optim >= optim_epochs // 2: # update step discriminator ob_expert, _ = env.get_expert( optim_batchsize, curriculum=args.curriculum, evel_total=args.curriculum_num, evel=level) loss_d_step, g_d_step = lossandgrad_d_step( ob_expert["adj"], ob_expert["node"], batch["ob_adj"], batch["ob_node"]) adam_d_step.update(g_d_step, optim_stepsize * cur_lrmult) loss_d_step = np.mean(loss_d_step) if args.has_d_final == 1 and i_optim >= optim_epochs // 4 * 3: # update final discriminator ob_expert, _ = env.get_expert( optim_batchsize, is_final=True, curriculum=args.curriculum, level_total=args.curriculum_num, level=level) seg_final_adj, seg_final_node = traj_final_generator( pi, copy.deepcopy(env), optim_batchsize, True) # update final discriminator loss_d_final, g_d_final = lossandgrad_d_final( ob_expert["adj"], ob_expert["node"], seg_final_adj, seg_final_node) adam_d_final.update(g_d_final, optim_stepsize * cur_lrmult) # update generator adam_pi.update(0.2 * g_ppo + 0.05 * g_expert, optim_stepsize * cur_lrmult) # WGAN # if args.has_d_step == 1: # clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_step] # if args.has_d_final == 1: # clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_final] # ## PPO val # if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end: # logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob_adj"], batch["ob_node"], batch["ac"], batch["ac"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) # logger.log(fmt_row(13, meanlosses)) if writer is not None: writer.add_scalar("loss_expert", loss_expert, iters_so_far) writer.add_scalar("loss_expert_stop", loss_expert_stop, iters_so_far) writer.add_scalar("loss_d_step", loss_d_step, iters_so_far) writer.add_scalar("loss_d_final", loss_d_final, iters_so_far) writer.add_scalar('grad_expert_min', np.amin(g_expert), iters_so_far) writer.add_scalar('grad_expert_max', np.amax(g_expert), iters_so_far) writer.add_scalar('grad_expert_norm', np.linalg.norm(g_expert), iters_so_far) writer.add_scalar('grad_expert_stop_min', np.amin(g_expert_stop), iters_so_far) writer.add_scalar('grad_expert_stop_max', np.amax(g_expert_stop), iters_so_far) writer.add_scalar('grad_expert_stop_norm', np.linalg.norm(g_expert_stop), iters_so_far) writer.add_scalar('grad_rl_min', np.amin(g_ppo), iters_so_far) writer.add_scalar('grad_rl_max', np.amax(g_ppo), iters_so_far) writer.add_scalar('grad_rl_norm', np.linalg.norm(g_ppo), iters_so_far) writer.add_scalar('g_d_step_min', np.amin(g_d_step), iters_so_far) writer.add_scalar('g_d_step_max', np.amax(g_d_step), iters_so_far) writer.add_scalar('g_d_step_norm', np.linalg.norm(g_d_step), iters_so_far) writer.add_scalar('g_d_final_min', np.amin(g_d_final), iters_so_far) writer.add_scalar('g_d_final_max', np.amax(g_d_final), iters_so_far) writer.add_scalar('g_d_final_norm', np.linalg.norm(g_d_final), iters_so_far) writer.add_scalar('learning_rate', optim_stepsize * cur_lrmult, iters_so_far) for (lossval, name) in zipsame(meanlosses, loss_names): # logger.record_tabular("loss_"+name, lossval) if writer is not None: writer.add_scalar("loss_" + name, lossval, iters_so_far) # logger.record_tabular("ev_tdlam_before", 
explained_variance(vpredbefore, tdlamret)) if writer is not None: writer.add_scalar("ev_tdlam_before", explained_variance(vpredbefore, tdlamret), iters_so_far) lrlocal = (seg["ep_lens"], seg["ep_lens_valid"], seg["ep_rets"], seg["ep_rets_env"], seg["ep_rets_d_step"], seg["ep_rets_d_final"], seg["ep_final_rew"], seg["ep_final_rew_stat"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, lens_valid, rews, rews_env, rews_d_step, rews_d_final, rews_final, rews_final_stat = map( flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) lenbuffer_valid.extend(lens_valid) rewbuffer.extend(rews) rewbuffer_d_step.extend(rews_d_step) rewbuffer_d_final.extend(rews_d_final) rewbuffer_env.extend(rews_env) rewbuffer_final.extend(rews_final) rewbuffer_final_stat.extend(rews_final_stat) # logger.record_tabular("EpLenMean", np.mean(lenbuffer)) # logger.record_tabular("EpRewMean", np.mean(rewbuffer)) # logger.record_tabular("EpThisIter", len(lens)) if writer is not None: writer.add_scalar("EpLenMean", np.mean(lenbuffer), iters_so_far) writer.add_scalar("EpLenValidMean", np.mean(lenbuffer_valid), iters_so_far) writer.add_scalar("EpRewMean", np.mean(rewbuffer), iters_so_far) writer.add_scalar("EpRewDStepMean", np.mean(rewbuffer_d_step), iters_so_far) writer.add_scalar("EpRewDFinalMean", np.mean(rewbuffer_d_final), iters_so_far) writer.add_scalar("EpRewEnvMean", np.mean(rewbuffer_env), iters_so_far) writer.add_scalar("EpRewFinalMean", np.mean(rewbuffer_final), iters_so_far) writer.add_scalar("EpRewFinalStatMean", np.mean(rewbuffer_final_stat), iters_so_far) writer.add_scalar("EpThisIter", len(lens), iters_so_far) episodes_so_far += len(lens) timesteps_so_far += sum(lens) # logger.record_tabular("EpisodesSoFar", episodes_so_far) # logger.record_tabular("TimestepsSoFar", timesteps_so_far) # logger.record_tabular("TimeElapsed", time.time() - tstart) if writer is not None: writer.add_scalar("EpisodesSoFar", episodes_so_far, iters_so_far) writer.add_scalar("TimestepsSoFar", timesteps_so_far, iters_so_far) writer.add_scalar("TimeElapsed", time.time() - tstart, iters_so_far) if MPI.COMM_WORLD.Get_rank() == 0: with open('molecule_gen/' + args.name_full + '.csv', 'a') as f: f.write('***** Iteration {} *****\n'.format(iters_so_far)) # save if iters_so_far % args.save_every == 0: fname = './ckpt/' + args.name_full + '_' + str(iters_so_far) saver = tf.compat.v1.train.Saver(var_list_pi) saver.save(tf.compat.v1.get_default_session(), fname) print('model saved!', fname) # fname = os.path.join(ckpt_dir, task_name) # os.makedirs(os.path.dirname(fname), exist_ok=True) # saver = tf.train.Saver() # saver.save(tf.get_default_session(), fname) # if iters_so_far==args.load_step: iters_so_far += 1 counter += 1 if counter % args.curriculum_step and counter // args.curriculum_step < args.curriculum_num: level += 1
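
# The "ev_tdlam_before" scalar logged above comes from explained_variance(vpredbefore, tdlamret).
# A minimal numpy sketch of that statistic, assuming the usual baselines definition
# 1 - Var[y - ypred] / Var[y]; the helper name ev_explained_variance is hypothetical:
import numpy as np

def ev_explained_variance(ypred, y):
    # 1.0  -> the value function predicts the TD(lambda) returns perfectly
    # 0.0  -> no better than predicting the mean return
    # <0.0 -> worse than predicting the mean
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary
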
def learn( env, pi, oldpi, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) test_envs=[ ] # can add a list of test environment to collect rewards if needed ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space #pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy #oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) if test_envs: test_gens = [] for test_env in test_envs: test_gen = traj_segment_generator(pi, test_env, timesteps_per_batch, stochastic=True) test_gens.append(test_gen) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=50) # rolling buffer for episode lengths rewbuffer = deque(maxlen=50) # rolling buffer for episode rewards test_rewbuffers = [deque(maxlen=50) for test_env in test_envs] # Maithra edits: add lists to return logs ep_lengths = [] ep_rewards = [] ep_labels = [] ep_actions = [] ep_correct_actions = [] ep_obs = [] # log results for test environment ep_rewards_test = [[] for test_env in test_envs] assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break 
elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() if test_envs: segs_test = [] for test_gen in test_gens: segs_test.append(test_gen.__next__()) add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret, label = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"], seg["label"] if test_envs: for i, seg_test in enumerate(segs_test): test_rews = seg_test["ep_rets"] test_rewbuffers[i].extend(test_rews) ep_rewards_test[i].append(np.mean(test_rewbuffers[i])) vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) # Maithra edit: append intermediate results onto returned logs ep_lengths.append(np.mean(lenbuffer)) ep_rewards.append(np.mean(rewbuffer)) ep_labels.append(deepcopy(label)) ep_actions.append(deepcopy(ac)) ep_obs.append(deepcopy(ob)) # compute mean of correct actions and append, ignoring actions # where either choice could be right count = 0 idxs = np.all((label == [1, 1]), axis=1) # removing for now: count += np.sum(idxs) new_label = label[np.invert(idxs)] new_ac = ac[np.invert(idxs)] count += np.sum((new_ac == np.argmax(new_label, axis=1))) # changing ep_correct_actions.append(count/len(label)) ep_correct_actions.append(count / (len(label) - np.sum(idxs))) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if 
MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() #Maithra edit return pi, { "lengths": ep_lengths, "rewards": ep_rewards, "labels": ep_labels, "actions": ep_actions, "correct_actions": ep_correct_actions, "obs": ep_obs, "test_rews": ep_rewards_test }
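
# The TF graph above builds PPO's pessimistic surrogate from surr1/surr2 before taking gradients.
# A minimal numpy sketch of the same L^CLIP quantity for one batch, for illustration only;
# the names logp_new, logp_old and adv are assumptions, not variables from the code above:
import numpy as np

def clipped_surrogate_loss(logp_new, logp_old, adv, clip_param):
    ratio = np.exp(logp_new - logp_old)                                 # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * adv                                                 # unclipped objective
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv   # clipped objective
    return -np.mean(np.minimum(surr1, surr2))                           # negated so it is minimized
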
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) identifier, save_result=True, save_interval=100, reward_list=[], cont=False, play=False, iter, action_repeat=1): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space mirror = hasattr(env, 'mirror_id') mirror_id = env.mirror_id if mirror else None pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) if mirror: mirror_ob = U.get_placeholder(name="mirror_ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) mirror_ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) sym_loss = 4 * tf.reduce_mean(tf.square(ac - mirror_ac)) if mirror else 0 total_loss = pol_surr + pol_entpen + vf_loss + sym_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] if mirror: losses.append(sym_loss) loss_names.append("sym_loss") var_list = pi.get_trainable_variables() inputs = [ob, ac, atarg, ret, lrmult] if mirror: inputs += [mirror_ob, mirror_ac] lossandgrad = U.function(inputs, losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function(inputs, losses) if play: return pi if cont: load_state(identifier, iter) else: U.initialize() iter = 0 adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, mirror_id=mirror_id, action_repeat=action_repeat) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = int(iter) tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_ori = deque(maxlen=100) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: 
if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError if MPI.COMM_WORLD.Get_rank() == 0: logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] if mirror: mirror_ob, mirror_ac = seg["mirror_ob"], seg["mirror_ac"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d_dict = dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret) if mirror: d_dict["mirror_ob"] = mirror_ob d_dict["mirror_ac"] = mirror_ac d = Dataset(d_dict, shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): batches = [ batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult ] if mirror: batches += [batch["mirror_ob"], batch["mirror_ac"]] *newlosses, g = lossandgrad(*batches) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) losses = [] for batch in d.iterate_once(optim_batchsize): batches = [ batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult ] if mirror: batches += [batch["mirror_ob"], batch["mirror_ac"]] newlosses = compute_losses(*batches) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_rets_ori"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, rews_ori = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) rewbuffer_ori.extend(rews_ori) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpRewOriMean", np.mean(rewbuffer_ori)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() reward_list.append(np.mean(rewbuffer_ori)) if save_result and iters_so_far % save_interval == 0: save_state(identifier, iters_so_far) save_rewards(reward_list, identifier, iters_so_far) logger.log('Model and reward saved') return pi
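
# d.iterate_once(optim_batchsize) above relies on baselines' Dataset helper.
# A minimal sketch of equivalent minibatching, assuming the usual semantics
# (shuffle the arrays once per pass, then yield aligned dict minibatches until exhausted):
import numpy as np

def iterate_once_sketch(arrays, batch_size, shuffle=True):
    n = len(next(iter(arrays.values())))
    idx = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n, batch_size):
        sel = idx[start:start + batch_size]
        yield {key: value[sel] for key, value in arrays.items()}
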
def learn( # =========== modified part begins =========== # env_id, seed, robot, # robot class with GMM params joint_optimization_iters, # total number of joint optimization iterations design_iters, # number of samples when updating physical design in each joint optimization iteration policy_iters, # number of samples when updating robot policy in each joint optimization iteration # ============ modified part ends ============ # policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # ================================== modification 1 ================================== # """ input: replace "env" (env class) with "env_id" (string) add "seed" (int) reason: to enable env.make() during training modification detail: add following lines into learn() env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) env.seed(seed) env.close() # added at the end of learn() """ import roboschool, gym from baselines import bench env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) env.seed(seed) # ================================== modification 1 ================================== # # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space # policy_func is the initialization of NN # NN structure: # state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value) # num_hid_layers, hid_size: set in the file calls "learn" pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # placeholder for "ob" # created in mlppolicy.py ob = U.get_placeholder_cached(name="ob") # placeholder for "ac" # in common/distribution.py ac = pi.pdtype.sample_placeholder([None]) # KL divergence and Entropy, defined in common/distribution.py kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) # pol_entpen: Entropy Bounus encourages exploration # entcoeff: entropy coefficient, defined in PPO page 5, Equ. (9) pol_entpen = (-entcoeff) * meanent # probability ration, defined in PPO page 3 ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold # Surrogate Goal # defined in PPO page 3, Equ (7) surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) # Value Function Loss: square error loss for ||v_pred - v_target|| vf_loss = U.mean(tf.square(pi.vpred - ret)) # Total_loss = L^CLIP - Value Function Loss + Entropy Bounus # defined in PPO page 5, Equ. 
(9) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) # adam optimizer? adam = MpiAdam(var_list, epsilon=adam_epsilon) # oldpi = pi assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) # Why we need this line? compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # ================================== modification 2 ================================== # for joint_optimization_iter in range(joint_optimization_iters): U.save_state('/home/yetong/Desktop/Project/models/model{}.ckpt'.format( joint_optimization_iter)) logger.log("joint optimization progree: {}/{}".format( joint_optimization_iter, joint_optimization_iters)) # ================================== update physical design ================================== # if joint_optimization_iter > 20: Rewards_plus = np.zeros(design_iters) Rewards_minum = np.zeros(design_iters) params = robot.sample(design_iters, to_update=True) for i, param in enumerate(params): robot.modify_file(param) env = gym.make(env_id) # myenv = env.env # pdb.set_trace() env = bench.Monitor(env, logger.get_dir()) R = episode_generator(pi, env, gamma, stochastic=True) logger.log("\t update physical design: %d/%d, rew: %f" % (i, 2 * design_iters, R)) if i % 2 == 0: Rewards_plus[int(i / 2)] = R else: Rewards_minum[int(i / 2)] = R logger.log("prev_mu: ", robot.params_mu) logger.log("prev_sig: ", robot.params_sig) robot.update(Rewards_plus, Rewards_minum) logger.log("mu: ", robot.params_mu) logger.log("sig: ", robot.params_sig) # ================================== update policy ================================== # # params = robot.sample(design_iters) params = [robot.params_mu] for param in params: # reinitialize env robot.modify_file(param) env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir()) # ================================== modification 2 ================================== # # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([ max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0 ]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break # annealing for stepsize parameters (epsilon and adam) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max( 1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg[ "vpred"] # 
predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy # oldpi = pi # set old parameter values to new parameter values assign_old_eq_new() logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular( "ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather( lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() # ================================== modification 1 ================================== # env.close()
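
# pi.ob_rms.update(ob) above maintains a running mean/std used to normalize observations.
# A minimal sketch of such an estimator, as a simplified stand-in for baselines' RunningMeanStd;
# the batch update uses the standard parallel-variance (Chan) formula:
import numpy as np

class RunningMeanStdSketch:
    def __init__(self, shape):
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)
        self.count = 1e-4  # small prior count avoids division by zero on the first batch

    def update(self, x):
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total
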
def learn( env, genv, i_trial, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return gpi = policy_fn("gpi", ob_space, ac_space) # Construct network for new policy goldpi = policy_fn("goldpi", ob_space, ac_space) # Network for old policy gatarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) gret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) # gob = U.get_placeholder_cached(name='ob') gac = gpi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent gkloldnew = goldpi.pd.kl(gpi.pd) gent = gpi.pd.entropy() gmeankl = tf.reduce_mean(gkloldnew) gmeanent = tf.reduce_mean(gent) gpol_entpen = (-entcoeff) * gmeanent ratio = tf.exp(pi.pd.logp(gac) - goldpi.pd.logp(gac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] gratio = tf.exp(gpi.pd.logp(ac) - oldpi.pd.logp(ac)) gsurr1 = gratio * gatarg gsurr2 = tf.clip_by_value(gratio, 1.0 - clip_param, 1.0 + clip_param) * gatarg gpol_surr = -tf.reduce_mean(tf.minimum(gsurr1, gsurr2)) gvf_loss = tf.reduce_mean(tf.square(gpi.vpred - gret)) gtotal_loss = gpol_surr + gpol_entpen + gvf_loss glosses = [gpol_surr, gpol_entpen, gvf_loss, gmeankl, gmeanent] gloss_names = ["gpol_surr", "gpol_entpen", "gvf_loss", "gkl", "gent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, gac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) gvar_list = gpi.get_trainable_variables() glossandgrad = U.function([ob, ac, gatarg, gret, lrmult], glosses + [U.flatgrad(gtotal_loss, gvar_list)]) gadam = MpiAdam(gvar_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) gassign_old_eq_new = U.function( [], [], updates=[ 
tf.assign(oldv, newv) for (oldv, newv) in zipsame(goldpi.get_variables(), gpi.get_variables()) ]) compute_losses = U.function([ob, gac, atarg, ret, lrmult], losses) gcompute_losses = U.function([ob, ac, gatarg, gret, lrmult], glosses) U.initialize() adam.sync() gadam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, gpi, env, timesteps_per_actorbatch, stochastic=True) gseg_gen = traj_segment_generator(gpi, pi, genv, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards glenbuffer = deque(maxlen=100) # rolling buffer for episode lengths grewbuffer = deque(maxlen=100) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" def standarize(value): return (value - value.mean()) / (value.std()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError print("********** Iteration %i ************" % iters_so_far) print("********** Guided Policy ************") gseg = gseg_gen.__next__() add_vtarg_and_adv(gseg, gamma, lam) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) gob, gac, gatarg, gatarg_, gtdlamret, gtdlamret_ , gvpredbefore, gvpredbefore_ = gseg["ob"], gseg["ac"], \ gseg["adv"], gseg["adv_"], gseg["tdlamret"], gseg["tdlamret_"], gseg["vpred"], gseg["vpred_"] standarize(gatarg_) standarize(gatarg) gd = Dataset(dict(gob=gob, gac=gac, gatarg=gatarg, gatarg_=gatarg_, gvtarg=gtdlamret, gvtarg_=gtdlamret_), shuffle=not gpi.recurrent) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, atarg_, tdlamret, tdlamret_, vpredbefore, vpredbefore_ = seg["ob"], seg["ac"],\ seg["adv"], seg["adv_"], seg["tdlamret"], seg["tdlamret_"], seg["vpred"], gseg["vpred_"] standarize(atarg) standarize(atarg_) d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, atarg_=atarg_, vtarg=tdlamret, vtarg_=tdlamret_), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(gpi, "ob_rms"): gpi.ob_rms.update(ob) if hasattr(pi, "ob_rms"): pi.ob_rms.update(gob) # update running mean/std for policy gassign_old_eq_new() print("Optimizing...Guided Policy") # print(fmt_row(13, gloss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): glosses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = glossandgrad(batch["ob"], batch["ac"], batch["atarg_"], batch["vtarg_"], cur_lrmult) gadam.update(g, optim_stepsize * cur_lrmult) glosses.append(newlosses) # print(fmt_row(13, np.mean(glosses, axis=0))) # print("Evaluating losses...") glosses = [] for batch in d.iterate_once(optim_batchsize): newlosses = gcompute_losses(batch["ob"], batch["ac"], batch["atarg_"], batch["vtarg_"], cur_lrmult) glosses.append(newlosses) gmeanlosses, _, _ = mpi_moments(glosses, axis=0) # print(fmt_row(13, gmeanlosses)) for (lossval, name) in 
zipsame(gmeanlosses, gloss_names): logger.record_tabular("gloss_" + name, lossval) # logger.record_tabular("gev_tdlam_before", explained_variance(vpredbefore, tdlamret)) assign_old_eq_new() # set old parameter values to new parameter values print("Optimizing...Training Policy") # print(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data optim_batchsize = optim_batchsize or gob.shape[0] for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in gd.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["gob"], batch["gac"], batch["gatarg_"], batch["gvtarg_"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) # print(fmt_row(13, np.mean(losses, axis=0))) # print("Evaluating losses...") losses = [] for batch in gd.iterate_once(optim_batchsize): newlosses = compute_losses(batch["gob"], batch["gac"], batch["gatarg_"], batch["gvtarg_"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) # print(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) glrlocal = (gseg["ep_lens"], gseg["ep_rets"]) # local values glistoflrpairs = MPI.COMM_WORLD.allgather(glrlocal) # list of tuples glens, grews = map(flatten_lists, zip(*glistoflrpairs)) # lenbuffer.extend(lens) rewbuffer.extend(rews) grewbuffer.extend(grews) # logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("GEpRewMean", np.mean(grewbuffer)) # logger.record_tabular("EpThisIter", len(lens)) # episodes_so_far += len(lens) # timesteps_so_far += sum(lens) iters_so_far += 1 # logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.logkv('trial', i_trial) logger.logkv("Iteration", iters_so_far) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular()
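
# mpi_moments(losses, axis=0) above averages per-minibatch losses across all MPI workers.
# A minimal sketch of just the cross-worker mean, as a simplified stand-in
# (baselines' mpi_moments also returns the global std and count):
import numpy as np
from mpi4py import MPI

def mpi_mean_sketch(local_values, comm=MPI.COMM_WORLD):
    local = np.asarray(local_values, dtype=np.float64)          # shape: (n_local_batches, n_losses)
    local_sum = np.concatenate([local.sum(axis=0), [len(local)]])
    global_sum = np.zeros_like(local_sum)
    comm.Allreduce(local_sum, global_sum, op=MPI.SUM)           # sum the loss totals and counts
    return global_sum[:-1] / global_sum[-1]                     # global per-loss mean
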
def enjoy( env, policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) save_name=None, save_per_acts=3, reload_name=None): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() if reload_name: saver = tf.train.Saver() saver.restore(tf.get_default_session(), reload_name) print("Loaded model successfully.") # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg 
= seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before update atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses)
def learn( env, test_env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation # CMAES max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, max_v_train_iter, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) seed, env_id): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy backup_pi = policy_fn( "backup_pi", ob_space, ac_space ) # Construct a network for every individual to adapt during the es evolution pi_zero = policy_fn( "zero_pi", ob_space, ac_space) # pi_0 will only be updated along with iterations reward = tf.placeholder(dtype=tf.float32, shape=[None]) # step rewards atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule ob = U.get_placeholder_cached(name="ob") next_ob = U.get_placeholder_cached( name="next_ob") # next step observation for updating q function ac = U.get_placeholder_cached( name="act") # action placeholder for computing q function kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent pi_adv = pi.qpred - pi.vpred adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0]) normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var) qf_loss = tf.reduce_mean(tf.square(reward + gamma * pi.vpred - pi.qpred)) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) qf_losses = [qf_loss] vf_losses = [vf_loss] pol_loss = -tf.reduce_mean(normalized_pi_adv) # Advantage function should be improved losses = [pol_loss, pol_entpen, meankl, meanent] loss_names = ["pol_surr_2", "pol_entpen", "kl", "ent"] var_list = pi.get_trainable_variables() qf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("qf") ] vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") ] pol_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") ] vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult], vf_losses + [U.flatgrad(vf_loss, vf_var_list)]) qf_lossandgrad = U.function([ob, ac, next_ob, lrmult, reward], qf_losses + [U.flatgrad(qf_loss, qf_var_list)]) qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for ( backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables()) ]) assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v ) in zipsame(pi.get_variables(), backup_pi.get_variables()) ]) # Compute all losses mean_pi_actions = 
U.function( [ob], [pi.pd.mode()]) # later for computing pol_loss compute_pol_losses = U.function([ob, next_ob, ac], [pol_loss]) U.initialize() get_pi_flat_params = U.GetFlat(pol_var_list) set_pi_flat_params = U.SetFromFlat(pol_var_list) vf_adam.sync() qf_adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness episodes_so_far = 0 timesteps_so_far = 0 ppo_timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards best_fitness = np.inf eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic=True) # For evaluation seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_gen=eval_gen) # For train V Func # Build generator for all solutions actors = [] best_fitness = 0 for i in range(popsize): newActor = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_gen=eval_gen) actors.append(newActor) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if max_timesteps and timesteps_so_far >= max_timesteps: print("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: print("Max episodes") break elif max_iters and iters_so_far >= max_iters: print("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: print("Max time") break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) # Generate new samples # Train V func for i in range(max_v_train_iter): logger.log("Iteration:" + str(iters_so_far) + " - sub-train iter for V func:" + str(i)) logger.log("Generate New Samples") seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = seg["ob"], seg["ac"], seg["next_ob"], seg["adv"], seg["rew"], seg["tdlamret"], \ seg["traj_index"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update( ob) # update running mean/std for normalization assign_old_eq_new( ) # set old parameter values to new parameter values # Train V function logger.log("Training V Func and Evaluating V Func Losses") for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) losses.append(vf_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) d_q = Dataset(dict(ob=ob, ac=ac, next_ob=next_ob, reward=reward, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) # Re-train q function logger.log("Training Q Func Evaluating Q Func Losses") for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d_q.iterate_once(optim_batchsize): *qf_losses, g = qf_lossandgrad(batch["next_ob"], batch["ac"], batch["ob"], cur_lrmult, batch["reward"]) qf_adam.update(g, optim_stepsize * 
cur_lrmult) losses.append(qf_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) # CMAES Train Policy assign_old_eq_new() # set old parameter values to new parameter values assign_backup_eq_new() # backup current policy flatten_weights = get_pi_flat_params() opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 opt['seed'] = seed opt['AdaptSigma'] = True es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt) while True: if es.countiter >= gensize: logger.log("Max generations for current layer") break logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter)) logger.log("Sigma=" + str(es.sigma)) solutions = es.ask() costs = [] lens = [] assign_backup_eq_new() # backup current policy for id, solution in enumerate(solutions): set_pi_flat_params(solution) losses = [] cost = compute_pol_losses(ob, ob, mean_pi_actions(ob)[0]) costs.append(cost[0]) assign_new_eq_backup() # Weights decay l2_decay = compute_weight_decay(0.99, solutions) costs += l2_decay # costs, real_costs = fitness_normalization(costs) costs, real_costs = fitness_rank(costs) es.tell_real_seg(solutions=solutions, function_values=costs, real_f=real_costs, segs=None) best_solution = es.result[0] best_fitness = es.result[1] logger.log("Best Solution Fitness:" + str(best_fitness)) set_pi_flat_params(best_solution) iters_so_far += 1 episodes_so_far += sum(lens)
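
# The loop above drives policy search with CMA-ES through a customized es.tell_real_seg(...)
# interface. For reference, a minimal sketch of the standard ask/tell loop from the cma
# package on a toy objective (illustrative only, not the training objective used above):
import cma
import numpy as np

def sphere(x):
    return float(np.sum(np.square(x)))

es = cma.CMAEvolutionStrategy(8 * [0.0], 0.5, {"popsize": 16, "maxiter": 50})
while not es.stop():
    solutions = es.ask()                      # sample a population of candidate parameter vectors
    costs = [sphere(s) for s in solutions]    # evaluate each candidate (here: a toy cost)
    es.tell(solutions, costs)                 # update the search distribution
best_solution = es.result[0]                  # best parameter vector found so far
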
def learn(env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], 
seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular()
class PPO(object): def __init__(self, env, policy, emb_network, emb_size, clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint adam_epsilon=1e-5, schedule='constant', joint_training=False ): # Setup variables self.optim_epochs = optim_epochs self.optim_stepsize = optim_stepsize self.optim_batchsize = optim_batchsize self.gamma = gamma self.lam = lam self.max_timesteps = max_timesteps self.adam_epsilon = adam_epsilon self.schedule = schedule # Setup losses and stuff # ---------------------------------------- with tf.name_scope('ppo'): ob_space = env.observation_space ac_space = env.action_space self.pi = policy # Construct network for new policy oldpi = Policy("old_policy", env.action_space, joint_training, emb_size, emb_network) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) if joint_training: ob = U.get_placeholder_cached(name="ob_f") else: ob = U.get_placeholder_cached(name="ob") ac = self.pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(self.pi.pd) ent = self.pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(self.pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(self.pi.vpred - ret)) self.total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = self.pi.get_trainable_variables() self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(self.total_loss, var_list)]) self.adam = MpiAdam(var_list, epsilon=adam_epsilon) self.assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), self.pi.get_variables())]) self.compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() self.adam.sync() # Prepare for rollouts # ---------------------------------------- self.episodes_so_far = 0 self.timesteps_so_far = 0 self.iters_so_far = 0 self.tstart = time.time() self.lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths self.rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards # self.train_step = tf.train.AdamOptimizer(adam_epsilon).minimize(self.total_loss, var_list=var_list) # self.train = U.function([ob, ac, atarg, ret, lrmult], self.train_step) def prepare(self, batch): if self.timesteps_so_far >= self.max_timesteps: return False if self.schedule == 'constant': self.cur_lrmult = 1.0 elif self.schedule == 'linear': self.cur_lrmult = max(1.0 - float(self.timesteps_so_far) / (self.max_timesteps * 1.1), 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%self.iters_so_far) # seg = 
seg_gen.__next__() # generate next sequence self.seg = batch self.add_vtarg_and_adv(self.seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, self.tdlamret = self.seg["ob"], self.seg["ac"], self.seg["adv"], self.seg["tdlamret"] self.vpredbefore = self.seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate self.d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=self.tdlamret), shuffle=not self.pi.recurrent) self.optim_batchsize = self.optim_batchsize or ob.shape[0] self.assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, self.loss_names)) return True def step(self): # Here we do a bunch of optimization epochs over the data for _ in range(self.optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for b in self.d.iterate_once(self.optim_batchsize): # self.train(b["ob"], b["ac"], b["atarg"], b["vtarg"], self.cur_lrmult) *newlosses, g = self.lossandgrad(b["ob"], b["ac"], b["atarg"], b["vtarg"], self.cur_lrmult) self.adam.update(g, self.optim_stepsize * self.cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) def log(self): logger.log("Evaluating losses...") losses = [] for b in self.d.iterate_once(self.optim_batchsize): newlosses = self.compute_losses(b["ob"], b["ac"], b["atarg"], b["vtarg"], self.cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, self.loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(self.vpredbefore, self.tdlamret)) lrlocal = (self.seg["ep_lens"], self.seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(self.flatten_lists, zip(*listoflrpairs)) self.lenbuffer.extend(lens) self.rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(self.lenbuffer)) logger.record_tabular("EpRewMean", np.mean(self.rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) self.episodes_so_far += len(lens) self.timesteps_so_far += sum(lens) self.iters_so_far += 1 logger.record_tabular("EpisodesSoFar", self.episodes_so_far) logger.record_tabular("TimestepsSoFar", self.timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - self.tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular() def add_vtarg_and_adv(self, seg, gamma, lam): """ Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) """ new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 vpred = np.append(seg["vpred"], seg["nextvpred"]) T = len(seg["rew"]) seg["adv"] = gaelam = np.empty(T, 'float32') rew = seg["rew"] lastgaelam = 0 for t in reversed(range(T)): nonterminal = 1-new[t+1] delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam seg["tdlamret"] = seg["adv"] + seg["vpred"] def flatten_lists(self, listoflists): return [el for list_ in listoflists for el in list_]
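# --- Illustrative sketch, not part of the original sources: the pol_surr term built in
# the PPO class above is the clipped surrogate L^CLIP. A plain NumPy version of the same
# objective (clipped_surrogate_loss is a name used only here), handy for sanity-checking
# the TensorFlow graph on small inputs:
import numpy as np

def clipped_surrogate_loss(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(np.asarray(logp_new) - np.asarray(logp_old))       # pnew / pold
    surr1 = ratio * adv                                               # CPI surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv  # clipped surrogate
    return -np.mean(np.minimum(surr1, surr2))                         # pessimistic bound

adv = np.array([1.2, -0.8, 0.3, -0.7])
adv = (adv - adv.mean()) / adv.std()   # standardized, as in prepare() above
print(clipped_surrogate_loss(np.log([0.30, 0.20, 0.40, 0.10]),
                             np.log([0.25, 0.25, 0.25, 0.25]), adv))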
def train(self, seg, optim_batchsize, optim_epochs):
    # normalize the reward
    rffs_int = np.array([self.rff_int.update(rew) for rew in seg["rew_int"]])
    self.rff_rms_int.update(rffs_int.ravel())
    seg["rew_int"] = seg["rew_int"] / np.sqrt(self.rff_rms_int.var)
    cur_lrmult = 1.0
    add_vtarg_and_adv(seg, self.gamma, self.lam)
    ob, unnorm_ac, atarg_ext, tdlamret_ext, atarg_int, tdlamret_int = seg["ob"], seg["unnorm_ac"], seg["adv_ext"], seg["tdlamret_ext"], seg["adv_int"], seg["tdlamret_int"]
    vpredbefore_ext, vpredbefore_int = seg["vpred_ext"], seg["vpred_int"]  # predicted value function before update
    atarg_ext = (atarg_ext - atarg_ext.mean()) / atarg_ext.std()  # standardized advantage function estimate
    atarg_int = (atarg_int - atarg_int.mean()) / atarg_int.std()
    atarg = self.int_coeff * atarg_int + self.ext_coeff * atarg_ext
    d = Dataset(dict(ob=ob, ac=unnorm_ac, atarg=atarg, vtarg_ext=tdlamret_ext, vtarg_int=tdlamret_int),
                shuffle=not self.pi.recurrent)
    if hasattr(self.pi, "ob_rms"):
        self.pi.update_obs_rms(ob)  # update running mean/std for policy
    if hasattr(self.int_rew, "ob_rms"):
        self.int_rew.update_obs_rms(ob)  # update running mean/std for int_rew
    self.assign_old_eq_new()  # set old parameter values to new parameter values
    logger.log2("Optimizing...")
    logger.log2(fmt_row(13, self.loss_names))
    # Here we do a bunch of optimization epochs over the data
    for _ in range(optim_epochs):
        losses = []  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            lg = self.lossandgrad(batch["ac"], batch["atarg"], batch["vtarg_ext"],
                                  batch["vtarg_int"], cur_lrmult, *zip(*batch["ob"].tolist()))
            new_losses, g = lg[:-1], lg[-1]
            self.adam.update(g, self.optim_stepsize * cur_lrmult)
            losses.append(new_losses)
        logger.log2(fmt_row(13, np.mean(losses, axis=0)))
    logger.log2("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = self.compute_losses(batch["ac"], batch["atarg"], batch["vtarg_ext"],
                                        batch["vtarg_int"], cur_lrmult, *zip(*batch["ob"].tolist()))
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments(losses, axis=0)
    logger.log2(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, self.loss_names):
        logger.record_tabular("loss_" + name, lossval)
    logger.record_tabular("ev_tdlam_ext_before",
                          explained_variance(vpredbefore_ext, tdlamret_ext))
    return meanlosses
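# --- Illustrative sketch, not part of the original sources: train() above assumes
# self.rff_int behaves like a discounted "reward forward filter" and self.rff_rms_int
# like a running-variance tracker. A minimal, self-contained version of that intrinsic
# reward normalization (class and variable names here are assumptions, not the repo's):
import numpy as np

class RewardForwardFilter:
    def __init__(self, gamma):
        self.gamma, self.rewems = gamma, None
    def update(self, rew):
        self.rewems = rew if self.rewems is None else self.rewems * self.gamma + rew
        return self.rewems

class RunningVar:
    def __init__(self):
        self.count, self.mean, self.m2 = 1e-4, 0.0, 0.0
    def update(self, xs):
        for x in np.asarray(xs, dtype=np.float64).ravel():   # Welford's incremental update
            self.count += 1
            delta = x - self.mean
            self.mean += delta / self.count
            self.m2 += delta * (x - self.mean)
    @property
    def var(self):
        return self.m2 / max(self.count - 1, 1)

rff, rms = RewardForwardFilter(gamma=0.99), RunningVar()
rew_int = np.random.rand(128).astype(np.float32)
rms.update(np.array([rff.update(r) for r in rew_int]))      # filter, then track variance
rew_int_normed = rew_int / np.sqrt(rms.var + 1e-8)          # as in seg["rew_int"] above
print(rew_int_normed[:5])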
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation # CMAES max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, max_v_train_iter, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) seed, env_id): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy backup_pi = policy_fn( "backup_pi", ob_space, ac_space ) # Construct a network for every individual to adapt during the es evolution pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule layer_clip = tf.placeholder( name='layer_clip', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule bound_coeff = tf.placeholder( name='bound_coeff', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult * layer_clip # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - (oldpi.pd.logp(ac) + 1e-8)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) vf_losses = [vf_loss] vf_loss_names = ["vf_loss"] pol_loss = pol_surr + pol_entpen total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") ] pol_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") ] layer_var_list = [] for i in range(pi.num_hid_layers): layer_var_list.append([ v for v in pol_var_list if v.name.split("/")[2].startswith('fc%i' % (i + 1)) ]) logstd_var_list = [ v for v in pol_var_list if v.name.split("/")[2].startswith("logstd") ] if len(logstd_var_list) != 0: layer_var_list.append([ v for v in pol_var_list if v.name.split("/")[2].startswith("final") ] + logstd_var_list) vf_lossandgrad = U.function([ob, ac, ret, lrmult], vf_losses + [U.flatgrad(vf_loss, vf_var_list)]) lossandgrad = U.function([ob, ac, atarg, ret, lrmult, layer_clip], losses + [U.flatgrad(total_loss, var_list)]) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) adam = 
MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for ( backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables()) ]) assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v ) in zipsame(pi.get_variables(), backup_pi.get_variables()) ]) # Compute all losses compute_pol_losses = U.function([ob, ac, atarg, ret, lrmult, layer_clip], [pol_loss, pol_surr, pol_entpen, meankl]) compute_v_pred = U.function([ob], [pi.vpred]) a_prob = tf.exp(pi.pd.logp(ac)) compute_a_prob = U.function([ob, ac], [a_prob]) U.initialize() layer_set_operate_list = [] layer_get_operate_list = [] for var in layer_var_list: set_pi_layer_flat_params = U.SetFromFlat(var) layer_set_operate_list.append(set_pi_layer_flat_params) get_pi_layer_flat_params = U.GetFlat(var) layer_get_operate_list.append(get_pi_layer_flat_params) # get_pi_layer_flat_params = U.GetFlat(pol_var_list) # set_pi_layer_flat_params = U.SetFromFlat(pol_var_list) vf_adam.sync() adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness episodes_so_far = 0 timesteps_so_far = 0 ppo_timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards best_fitness = -np.inf eval_seq = traj_segment_generator_eval(pi, env, timesteps_per_actorbatch, stochastic=False) # eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic = True) # For evaluation seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_seq=eval_seq) # For train V Func assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" indices = [] # maintain all selected indices for each iteration opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 # opt['seed'] = seed opt['AdaptSigma'] = True # opt['bounds'] = bounds # opt['tolstagnation'] = 20 ess = [] seg = None segs = None sum_vpred = [] while True: if max_timesteps and timesteps_so_far >= max_timesteps: print("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: print("Max episodes") break elif max_iters and iters_so_far >= max_iters: print("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: print("Max time") break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps), 0) else: raise NotImplementedError # epsilon = max(0.5 - float(timesteps_so_far) / (max_timesteps), 0) * cur_lrmult epsilon = max(0.5 * cur_lrmult, 0) # epsilon = 0.2 sigma_adapted = max(sigma * cur_lrmult, 1e-8) # sigma_adapted = max(max(sigma - float(timesteps_so_far) / (5000 * max_timesteps), 0) * cur_lrmult, 1e-8) # cmean_adapted = max(1.0 - float(timesteps_so_far) / (max_timesteps), 1e-8) # cmean_adapted = max(0.8 - float(time˚steps_so_far) / (2*max_timesteps), 1e-8) # if timesteps_so_far % max_timesteps == 10: max_v_train_iter = int( max( max_v_train_iter * (1 - timesteps_so_far / (0.5 * max_timesteps)), 1)) logger.log("********** Iteration %i 
************" % iters_so_far) if iters_so_far == 0: eval_seg = eval_seq.__next__() rewbuffer.extend(eval_seg["ep_rets"]) lenbuffer.extend(eval_seg["ep_lens"]) result_record() # Repository Train train_segs = {} seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) if hasattr(pi, "ob_rms"): pi.ob_rms.update( seg["ob"]) # update running mean/std for normalization # rewbuffer.extend(seg["ep_rets"]) # lenbuffer.extend(seg["ep_lens"]) # # if iters_so_far == 0: # result_record() assign_old_eq_new() # set old parameter values to new parameter values if segs is None: segs = seg segs["v_target"] = np.zeros(len(seg["ob"]), 'float32') elif len(segs["ob"]) >= 50000: segs["ob"] = np.take(segs["ob"], np.arange(timesteps_per_actorbatch, len(segs["ob"])), axis=0) segs["next_ob"] = np.take(segs["next_ob"], np.arange(timesteps_per_actorbatch, len(segs["next_ob"])), axis=0) segs["ac"] = np.take(segs["ac"], np.arange(timesteps_per_actorbatch, len(segs["ac"])), axis=0) segs["rew"] = np.take(segs["rew"], np.arange(timesteps_per_actorbatch, len(segs["rew"])), axis=0) segs["vpred"] = np.take(segs["vpred"], np.arange(timesteps_per_actorbatch, len(segs["vpred"])), axis=0) segs["act_props"] = np.take(segs["act_props"], np.arange(timesteps_per_actorbatch, len(segs["act_props"])), axis=0) segs["new"] = np.take(segs["new"], np.arange(timesteps_per_actorbatch, len(segs["new"])), axis=0) segs["adv"] = np.take(segs["adv"], np.arange(timesteps_per_actorbatch, len(segs["adv"])), axis=0) segs["tdlamret"] = np.take(segs["tdlamret"], np.arange(timesteps_per_actorbatch, len(segs["tdlamret"])), axis=0) segs["ep_rets"] = np.take(segs["ep_rets"], np.arange(timesteps_per_actorbatch, len(segs["ep_rets"])), axis=0) segs["ep_lens"] = np.take(segs["ep_lens"], np.arange(timesteps_per_actorbatch, len(segs["ep_lens"])), axis=0) segs["v_target"] = np.take(segs["v_target"], np.arange(timesteps_per_actorbatch, len(segs["v_target"])), axis=0) segs["ob"] = np.append(segs['ob'], seg['ob'], axis=0) segs["next_ob"] = np.append(segs['next_ob'], seg['next_ob'], axis=0) segs["ac"] = np.append(segs['ac'], seg['ac'], axis=0) segs["rew"] = np.append(segs['rew'], seg['rew'], axis=0) segs["vpred"] = np.append(segs['vpred'], seg['vpred'], axis=0) segs["act_props"] = np.append(segs['act_props'], seg['act_props'], axis=0) segs["new"] = np.append(segs['new'], seg['new'], axis=0) segs["adv"] = np.append(segs['adv'], seg['adv'], axis=0) segs["tdlamret"] = np.append(segs['tdlamret'], seg['tdlamret'], axis=0) segs["ep_rets"] = np.append(segs['ep_rets'], seg['ep_rets'], axis=0) segs["ep_lens"] = np.append(segs['ep_lens'], seg['ep_lens'], axis=0) segs["v_target"] = np.append(segs['v_target'], np.zeros(len(seg["ob"]), 'float32'), axis=0) else: segs["ob"] = np.append(segs['ob'], seg['ob'], axis=0) segs["next_ob"] = np.append(segs['next_ob'], seg['next_ob'], axis=0) segs["ac"] = np.append(segs['ac'], seg['ac'], axis=0) segs["rew"] = np.append(segs['rew'], seg['rew'], axis=0) segs["vpred"] = np.append(segs['vpred'], seg['vpred'], axis=0) segs["act_props"] = np.append(segs['act_props'], seg['act_props'], axis=0) segs["new"] = np.append(segs['new'], seg['new'], axis=0) segs["adv"] = np.append(segs['adv'], seg['adv'], axis=0) segs["tdlamret"] = np.append(segs['tdlamret'], seg['tdlamret'], axis=0) segs["ep_rets"] = np.append(segs['ep_rets'], seg['ep_rets'], axis=0) segs["ep_lens"] = np.append(segs['ep_lens'], seg['ep_lens'], axis=0) segs["v_target"] = np.append(segs['v_target'], np.zeros(len(seg["ob"]), 'float32'), axis=0) if iters_so_far == 0: ob, ac, tdlamret = 
seg["ob"], seg["ac"], seg["tdlamret"] d = Dataset(dict(ob=ob, ac=ac, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Train V function # logger.log("Catchup Training V Func and Evaluating V Func Losses") for _ in range(max_v_train_iter): for batch in d.iterate_once(optim_batchsize): *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0))) else: # Update v target new = segs["new"] rew = segs["rew"] act_prob = np.asarray(compute_a_prob(segs["ob"], segs["ac"])).T importance_ratio = np.squeeze(act_prob) / ( segs["act_props"] + np.ones(segs["act_props"].shape) * 1e-8) segs["v_target"] = importance_ratio * (1 / np.sum(importance_ratio)) * \ np.squeeze( rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(segs["next_ob"])) # train_segs["v_target"] = rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(train_segs["next_ob"]) if len(segs["ob"]) >= 20000: train_times = int(max_v_train_iter / 2) if int(max_v_train_iter / 2) > 0 else 1 else: train_times = 2 for i in range(train_times): selected_train_index = np.random.choice( range(len(segs["ob"])), timesteps_per_actorbatch, replace=False) train_segs["ob"] = np.take(segs["ob"], selected_train_index, axis=0) train_segs["next_ob"] = np.take(segs["next_ob"], selected_train_index, axis=0) train_segs["ac"] = np.take(segs["ac"], selected_train_index, axis=0) train_segs["rew"] = np.take(segs["rew"], selected_train_index, axis=0) train_segs["vpred"] = np.take(segs["vpred"], selected_train_index, axis=0) train_segs["new"] = np.take(segs["new"], selected_train_index, axis=0) train_segs["adv"] = np.take(segs["adv"], selected_train_index, axis=0) train_segs["tdlamret"] = np.take(segs["tdlamret"], selected_train_index, axis=0) train_segs["v_target"] = np.take(segs["v_target"], selected_train_index, axis=0) # ob, ac, v_target = train_segs["ob"], train_segs[ "ac"], train_segs["v_target"] d = Dataset(dict(ob=ob, ac=ac, vtarg=v_target), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Train V function # logger.log("Training V Func and Evaluating V Func Losses") # Train V function # logger.log("Catchup Training V Func and Evaluating V Func Losses") # logger.log("Train V - "+str(_)) for _ in range(max_v_train_iter): for batch in d.iterate_once(optim_batchsize): *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0))) # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape) # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1]) # add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, v_target = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=v_target), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Local search for _ in range(optim_epochs): for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, 1 / 4) adam.update(g, optim_stepsize * cur_lrmult) # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape) # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1]) # add_vtarg_and_adv(seg, gamma, lam) ob_po, ac_po, 
atarg_po, tdlamret_po = seg["ob"], seg["ac"], seg[ "adv"], seg["tdlamret"] atarg_po = (atarg_po - atarg_po.mean()) / atarg_po.std( ) # standardized advantage function estimate # opt['CMA_cmean'] = cmean_adapted # assign_old_eq_new() # set old parameter values to new parameter values for i in range(len(layer_var_list)): # CMAES Train Policy assign_backup_eq_new() # backup current policy flatten_weights = layer_get_operate_list[i]() if len(indices) < len(layer_var_list): selected_index, init_weights = uniform_select( flatten_weights, 0.5) # 0.5 means 50% proportion of params are selected indices.append(selected_index) else: rand = np.random.uniform() # print("Random-Number:", rand) # print("Epsilon:", epsilon) if rand < epsilon: selected_index, init_weights = uniform_select( flatten_weights, 0.5) indices.append(selected_index) # logger.log("Random: select new weights") else: selected_index = indices[i] init_weights = np.take(flatten_weights, selected_index) es = cma.CMAEvolutionStrategy(init_weights, sigma_adapted, opt) while True: if es.countiter >= gensize: # logger.log("Max generations for current layer") break # logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter)) # logger.log("Sigma=" + str(es.sigma)) # solutions = es.ask(sigma_fac = max(cur_lrmult, 1e-8)) solutions = es.ask() # solutions = [np.clip(solution, -5.0, 5.0).tolist() for solution in solutions] costs = [] lens = [] assign_backup_eq_new() # backup current policy for id, solution in enumerate(solutions): np.put(flatten_weights, selected_index, solution) layer_set_operate_list[i](flatten_weights) cost = compute_pol_losses(ob_po, ac_po, atarg_po, tdlamret_po, cur_lrmult, 1 / 4 * (i + 1)) costs.append(cost[0]) assign_new_eq_backup() # Weights decay l2_decay = compute_weight_decay(0.01, solutions) costs += l2_decay costs, real_costs = fitness_rank(costs) # logger.log("real_costs:"+str(real_costs)) # best_solution = np.copy(es.result[0]) # best_fitness = -es.result[1] es.tell_real_seg(solutions=solutions, function_values=costs, real_f=real_costs, segs=None) # best_solution = np.copy(solutions[np.argmin(costs)]) # best_fitness = -real_costs[np.argmin(costs)] best_solution = es.result[0] best_fitness = es.result[1] np.put(flatten_weights, selected_index, best_solution) layer_set_operate_list[i](flatten_weights) # logger.log("Update the layer") # best_solution = es.result[0] # best_fitness = es.result[1] # logger.log("Best Solution Fitness:" + str(best_fitness)) # set_pi_flat_params(best_solution) import gc gc.collect() iters_so_far += 1 episodes_so_far += sum(lens)
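# --- Illustrative sketch, not part of the original sources: the loop above evolves a
# randomly chosen 50% subset of one layer's flattened weights with CMA-ES. The repo's
# tell_real_seg / fitness_rank / compute_weight_decay helpers are custom, so this compact
# version uses only the standard `cma` API and a toy cost in place of compute_pol_losses:
import numpy as np
import cma

def uniform_select(flat_weights, proportion=0.5):
    n = max(1, int(len(flat_weights) * proportion))
    idx = np.random.choice(len(flat_weights), n, replace=False)
    return idx, flat_weights[idx]

def toy_policy_cost(full_weights):          # stand-in for compute_pol_losses(...)
    return float(np.sum(full_weights ** 2))

flat = np.random.randn(20)                  # one layer's flattened parameters
idx, init = uniform_select(flat, 0.5)
es = cma.CMAEvolutionStrategy(init, 0.1, {'popsize': 8, 'maxiter': 10, 'verb_disp': 0})
while not es.stop():
    solutions = es.ask()
    costs = []
    for sol in solutions:
        candidate = flat.copy()
        np.put(candidate, idx, sol)          # overwrite only the selected subset
        costs.append(toy_policy_cost(candidate))
    es.tell(solutions, costs)
np.put(flat, idx, es.result[0])              # keep the best subset found (xbest)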
def learn(env_list, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) end_timesteps, newround ): env = env_list.popleft() # Open a file to record the accumulated rewards rewFile = open("reward/%d.txt" % (env.seed), "ab") resptimeFile = open("respTime/%d.txt" % (env.seed), "ab") pktnumFile = open("pktNum/%d.txt" % (env.seed), "ab") # Setup losses and stuff # ---------------------------------------- vf_ob_space = env.vf_observation_space # ac_ob_space = env.ac_observation_space ac_space = env.action_space pi = policy_fn("pi1", vf_ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", vf_ob_space, ac_space) # Network for old policy atarg = tf.placeholder(name="atarg", dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(name="ret", dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed clipping parameter epislon vf_ob = U.get_placeholder_cached(name="vf_ob") nn_in = U.get_placeholder_cached(name="nn_in") # placeholder for nn input ac = pi.pdtype.sample_placeholder([None]) # kloldnew = oldpi.pd.kl(pi.pd) # ent = pi.pd.entropy() pb_old_holder = tf.placeholder(name="pd_old", dtype=tf.float32, shape=[None, ac_space.n]) pb_new_holder = tf.placeholder(name="pd_new", dtype=tf.float32, shape=[None, ac_space.n]) oldpd = CategoricalPd(pb_old_holder) pd = CategoricalPd(pb_new_holder) kloldnew = oldpd.kl(pd) ent = pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent # ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold ratio = tf.placeholder(dtype=tf.float32, shape=[None]) surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() vf_var_list = [v for v in var_list if v.name.split("/")[1].startswith("vf")] pol_var_list = [v for v in var_list if v.name.split("/")[1].startswith("pol")] vf_grad = U.function([vf_ob, ret], U.flatgrad(vf_loss, vf_var_list)) # gradient of value function pol_nn_grad = U.function([nn_in], U.flatgrad(pi.nn_out, pol_var_list)) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon) clip_para = U.function([lrmult], [clip_param]) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([vf_ob, atarg, ret, lrmult, ratio, pb_new_holder, pb_old_holder], losses) U.initialize() vf_adam.sync() pol_adam.sync() # Prepare for rollouts # 
---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) end_timestep = end_timesteps.popleft() new = newround.popleft() episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=10) # rolling buffer for episode lengths rewbuffer = deque(maxlen=10) # rolling buffer for episode rewards env_so_far = 1 assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: rewFile.close() resptimeFile.close() pktnumFile.close() para = {} for vf in range(len(vf_var_list)): # para[vf_var_list[vf].name] = vf_var_list[vf].eval() para[vf] = vf_var_list[vf].eval() for pol in range(len(pol_var_list)): # para[pol_var_list[pol].name] = pol_var_list[pol].eval() para[pol + len(vf_var_list)] = pol_var_list[pol].eval() f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb") pickle.dump(para, f) f.close() print("============================= policy is stored =================================") break elif end_timestep and timesteps_so_far >= end_timestep: env = env_list.popleft() seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) end_timestep = end_timesteps.popleft() new = newround.popleft() env_so_far += 1 if True: para = {} for vf in range(len(vf_var_list)): # para[vf_var_list[vf].name] = vf_var_list[vf].eval() para[vf] = vf_var_list[vf].eval() for pol in range(len(pol_var_list)): # para[pol_var_list[pol].name] = pol_var_list[pol].eval() para[pol + len(vf_var_list)] = pol_var_list[pol].eval() f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb") pickle.dump(para, f) f.close() print("======================== new environment (%s network settings left) ===========================" % len(env_list)) elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break elif timesteps_so_far == 0: para = {} for vf in range(len(vf_var_list)): # para[vf_var_list[vf].name] = vf_var_list[vf].eval() para[vf] = vf_var_list[vf].eval() for pol in range(len(pol_var_list)): # para[pol_var_list[pol].name] = pol_var_list[pol].eval() para[pol + len(vf_var_list)] = pol_var_list[pol].eval() f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb") pickle.dump(para, f) f.close() if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i, Environment %i ************" % (iters_so_far, env_so_far)) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # for vf in range(len(vf_var_list)): # print(vf_var_list[vf].name, vf_var_list[vf].eval()) # for pol in range(len(pol_var_list)): # print(pol_var_list[pol].name, pol_var_list[pol].eval()) record_reward(rewFile, sum(seg["rew"])) record_reward(resptimeFile, sum(seg["resptime"])) record_reward(pktnumFile, sum(seg["pktnum"])) print("total rewards for Iteration %s: %s" % (iters_so_far, sum(seg["rew"]))) print("average response time: %s, num of pkts: %s" % (sum(seg["resptime"])/sum(seg["pktnum"]), sum(seg["pktnum"]))) prob = collections.Counter(seg["ac"]) # a dict where elements are stored as dictionary keys and their counts are stored as dictionary values. 
for key in prob: prob[key] = prob[key]/len(seg["ac"]) print("percentage of choosing each controller: %s" % (prob)) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) vf_ob, ac_ob, ac, atarg, tdlamret = seg["vf_ob"], seg['ac_ob'], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(vf_ob=vf_ob, ac_ob=ac_ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or vf_ob.shape[0] # if hasattr(pi, "vf_ob_rms"): pi.vf_ob_rms.update(vf_ob) # update running mean/std for policy # if hasattr(pi, "nn_in_rms"): # temp = ac_ob.reshape(-1,ac_ob.shape[2]) # pi.nn_in_rms.update(temp) assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): # calculate the value function gradient vf_g = vf_grad(batch["vf_ob"], batch["vtarg"]) vf_adam.update(vf_g, optim_stepsize * cur_lrmult) # calculate the policy gradient pol_g = [] ratios = [] pbs_new_batch = [] pbs_old_batch = [] e = clip_para(cur_lrmult)[0] for sample_id in range(optim_batchsize): sample_ac_ob = batch["ac_ob"][sample_id] sample_ac = batch["ac"][sample_id] probs_new = pi.calculate_ac_prob(sample_ac_ob) prob_new = probs_new[sample_ac] probs_old = oldpi.calculate_ac_prob(sample_ac_ob) prob_old = probs_old[sample_ac] if prob_old == 0: logger.error("pi_old = 0 in %s th iteration %s th epoch %s th sample..." % (iters_so_far, _, sample_id)) r = prob_new / prob_old ratios.append(r) pbs_new_batch.append(probs_new) pbs_old_batch.append(probs_old) if (r > 1.0 + e and batch["atarg"][sample_id] > 0) or (r < 1.0 - e and batch["atarg"][sample_id] < 0) or r == 0: dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1)) pol_g.append(0.*dnn_dtheta) else: nn = pi.calculate_ac_value(sample_ac_ob) denominator = np.power(sum(nn), 2) sorted_ind = np.argsort(nn) # sort the array in ascending order if len(probs_new) == 2: if sample_ac == 0: numerator1 = nn[1]*pol_nn_grad(sample_ac_ob[0].reshape(1,-1)) numerator2 = nn[0] * pol_nn_grad(sample_ac_ob[1].reshape(1, -1)) dpi_dtheta = -(numerator1-numerator2)/denominator else: numerator1 = nn[1]*pol_nn_grad(sample_ac_ob[0].reshape(1,-1)) numerator2 = nn[0]*pol_nn_grad(sample_ac_ob[1].reshape(1,-1)) dpi_dtheta = -(numerator2 - numerator1)/denominator # numerator1 = nn[sorted_ind[0]]*pol_nn_grad(sample_ac_ob[sorted_ind[1]].reshape(1,-1)) # numerator2 = nn[sorted_ind[1]]*pol_nn_grad(sample_ac_ob[sorted_ind[0]].reshape(1,-1)) # dpi_dtheta = (numerator1-numerator2)/denominator elif len(probs_new) == 3: if sample_ac == sorted_ind[0]: # the controller with lowest probability will still possible to be chosen because the probability is not zero dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1)) pol_g.append(0. 
* dnn_dtheta) else: numerator1 = sum(nn) * (pol_nn_grad(sample_ac_ob[sample_ac].reshape(1,-1)) + 0.5 * pol_nn_grad( sample_ac_ob[sorted_ind[0]].reshape(1, -1))) numerator2 = (nn[sample_ac] + 0.5 * nn[sorted_ind[0]]) * pol_nn_grad(sample_ac_ob) dpi_dtheta = -(numerator1 - numerator2) / denominator else: if sample_ac == sorted_ind[-1] or sample_ac == sorted_ind[-2]: numerator1 = sum(nn) * (pol_nn_grad(sample_ac_ob[sample_ac] .reshape(1,-1))+0.5*pol_nn_grad(sample_ac_ob[sorted_ind[0:-2]])) numerator2 = (nn[sample_ac]+0.5*sum(nn[sorted_ind[0:-2]])) * pol_nn_grad(sample_ac_ob) dpi_dtheta = -(numerator1 - numerator2) / denominator else: dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1)) pol_g.append(0. * dnn_dtheta) pol_g.append(batch["atarg"][sample_id] * dpi_dtheta / prob_old) pol_g_mean = np.mean(np.array(pol_g), axis=0) pol_adam.update(pol_g_mean, optim_stepsize * cur_lrmult) newlosses = compute_losses(batch["vf_ob"], batch["atarg"], batch["vtarg"], cur_lrmult, np.array(ratios), np.array(pbs_new_batch), np.array(pbs_old_batch)) # adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") # losses = [] # for batch in d.iterate_once(optim_batchsize): # newlosses = compute_losses(batch["vf_ob"], batch["ac_ob"], batch["ac"], batch["atarg"], batch["vtarg"], # cur_lrmult) # losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) if len(lenbuffer) == 0: logger.record_tabular("EpLenMean", 0) logger.record_tabular("EpRewMean", 0) else: logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular()
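# --- Illustrative sketch, not part of the original sources: the per-sample policy
# gradient above enforces PPO's clipping by dropping a sample's gradient once the
# probability ratio has already left [1-eps, 1+eps] in the direction its advantage
# would push it. The gate in isolation (ppo_gradient_gate is a name used only here):
def ppo_gradient_gate(ratio, advantage, eps):
    """Return 0.0 to zero out this sample's policy gradient, 1.0 to keep it."""
    if (ratio > 1.0 + eps and advantage > 0) or (ratio < 1.0 - eps and advantage < 0) or ratio == 0:
        return 0.0
    return 1.0

for r, adv in [(1.3, +1.0), (1.3, -1.0), (0.6, -1.0), (1.0, +1.0)]:
    print(r, adv, ppo_gradient_gate(r, adv, eps=0.2))
# Samples already past 1+eps with positive advantage (or below 1-eps with negative
# advantage) contribute no gradient, mirroring the flat region of the clipped surrogate.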
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() ## losses + [U.flatgrad(total_loss, var_list)]: how are these added together? lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) test_a = U.function([ob, ac, atarg, ret, lrmult], [ kloldnew, ent, meankl, meanent, pol_entpen, pi.pd.logp(ac), oldpi.pd.logp(ac), ratio, surr1, surr2, pi.vpred ]) #################### pi_parms = U.function([], var_list) old_list = oldpi.get_trainable_variables() old_parms = U.function([], old_list) #################### U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) 
/ max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() # print("ac",np.shape(seg["ac"]), seg["ac"]) # print("rew",np.shape(seg["rew"]), seg["rew"]) # print("vpred",np.shape(seg["vpred"]), seg["vpred"]) # print("new",np.shape(seg["new"]), seg["new"]) # print("prevac",np.shape(seg["prevac"]), seg["prevac"]) # print("nextvpred",np.shape(seg["nextvpred"]), seg["nextvpred"]) # print("ep_rets",np.shape(seg["ep_rets"]), seg["ep_rets"]) # print("ep_lens",np.shape(seg["ep_lens"]), seg["ep_lens"]) add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), deterministic=pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") # ############ # for p in pi_parms(): # print("pi", np.sum(p)) # for p in old_parms(): # print("old", np.sum(p)) # ############ logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) # kloldnew,ent, meankl, meanent, pol_entpen, piac, oldpiac, ratio, surr1, surr2, pivpred = \ # test_a(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) # print("kloldnew",kloldnew) # print("ent",ent) # print("meankl",meankl) # print("meanent",meanent) # print("pol_entpen",pol_entpen) # print("piac",piac) # print("oldpiac",oldpiac) # print("ratio",ratio) # print("surr1",surr1) # print("surr2",surr2) # print("pivpred",pivpred) for p in pi_parms(): print("pi", np.sum(p)) for p in old_parms(): print("old", np.sum(p)) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: 
logger.dump_tabular() return pi
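# --- Illustrative sketch, not part of the original sources: the learn() above anneals
# both the Adam step size and the clip range by the same scalar cur_lrmult. The schedule
# on its own (lr_multiplier is a name used only here):
def lr_multiplier(schedule, timesteps_so_far, max_timesteps):
    if schedule == 'constant':
        return 1.0
    elif schedule == 'linear':
        return max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
    raise NotImplementedError(schedule)

optim_stepsize, clip_param = 3e-4, 0.2
for t in (0, 500000, 1000000):
    m = lr_multiplier('linear', t, 1000000)
    print(t, optim_stepsize * m, clip_param * m)   # step size and clip range shrink together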
def learn(env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, clip_param, g_step, d_step, entcoeff, save_per_iter, optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers ckpt_dir, log_dir, timesteps_per_batch, task_name, gamma, lam, d_stepsize=3e-4, adam_epsilon=1e-5, max_timesteps=0, max_episodes=0, max_iters=0, mix_reward=False, r_lambda=0.44, callback=None, schedule='constant', # annealing for stepsize parameters (epsilon and adam), frame_stack=1 ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ob_space.shape = (ob_space.shape[0] * frame_stack,) print(ob_space) ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent # kloldnew = oldpi.pd.kl(pi.pd) # ent = pi.pd.entropy() # meankl = tf.reduce_mean(kloldnew) # meanent = tf.reduce_mean(ent) # entbonus = entcoeff * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] # vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) # ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold # surrgain = tf.reduce_mean(ratio * atarg) # optimgain = surrgain + entbonus # losses = [optimgain, meankl, entbonus, surrgain, meanent] # loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) d_adam = MpiAdam(reward_giver.get_trainable_variables()) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) # dist = meankl # all_var_list = pi.get_trainable_variables() # var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")] # vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] # assert len(var_list) == len(vf_var_list) + 1 # d_adam = MpiAdam(reward_giver.get_trainable_variables()) # vfadam = MpiAdam(vf_var_list) # get_flat = U.GetFlat(var_list) # set_from_flat = U.SetFromFlat(var_list) # klgrads = tf.gradients(dist, var_list) # flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], 
name="flat_tan") # shapes = [var.get_shape().as_list() for var in var_list] # start = 0 # tangents = [] # for shape in shapes: # sz = U.intprod(shape) # tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) # start += sz # gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 # fvp = U.flatgrad(gvp, var_list) # assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) # for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) # compute_losses = U.function([ob, ac, atarg], losses) # compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) # compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) # compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) if rank == 0: generator_loss = tf.placeholder(tf.float32, [], name='generator_loss') expert_loss = tf.placeholder(tf.float32, [], name='expert_loss') entropy = tf.placeholder(tf.float32, [], name='entropy') entropy_loss = tf.placeholder(tf.float32, [], name='entropy_loss') generator_acc = tf.placeholder(tf.float32, [], name='genrator_acc') expert_acc = tf.placeholder(tf.float32, [], name='expert_acc') eplenmean = tf.placeholder(tf.int32, [], name='eplenmean') eprewmean = tf.placeholder(tf.float32, [], name='eprewmean') eptruerewmean = tf.placeholder(tf.float32, [], name='eptruerewmean') # _meankl = tf.placeholder(tf.float32, [], name='meankl') # _optimgain = tf.placeholder(tf.float32, [], name='optimgain') # _surrgain = tf.placeholder(tf.float32, [], name='surrgain') _ops_to_merge = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc, eplenmean, eprewmean, eptruerewmean] ops_to_merge = [ tf.summary.scalar(op.name, op) for op in _ops_to_merge] merged = tf.summary.merge(ops_to_merge) ### TODO: report these stats ### # generator_loss = tf.placeholder(tf.float32, [], name='generator_loss') # expert_loss = tf.placeholder(tf.float32, [], name='expert_loss') # generator_acc = tf.placeholder(tf.float32, [], name='genrator_acc') # expert_acc = tf.placeholder(tf.float32, [], name='expert_acc') # eplenmean = tf.placeholder(tf.int32, [], name='eplenmean') # eprewmean = tf.placeholder(tf.float32, [], name='eprewmean') # eptruerewmean = tf.placeholder(tf.float32, [], name='eptruerewmean') @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() adam.sync() d_adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, mix_reward, r_lambda, stochastic=True, frame_stack=frame_stack) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=100) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 g_loss_stats = stats(loss_names) d_loss_stats = stats(reward_giver.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) if rank == 0: 
filenames = [f for f in os.listdir(log_dir) if 'logs' in f] writer = tf.summary.FileWriter('{}/logs-{}'.format(log_dir, len(filenames))) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break # Save model if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) from tensorflow.core.protobuf import saver_pb2 saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V1) saver.save(tf.get_default_session(), fname) # U.save_state(fname) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) # ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] # vpredbefore = seg["vpred"] # predicted value function before udpate # atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy # args = seg["ob"], seg["ac"], atarg # fvpargs = [arr[::5] for arr in args] assign_old_eq_new() # set old parameter values to new parameter values with timed("policy optimization"): logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # g_losses = meanlosses # for (lossname, lossval) in zip(loss_names, meanlosses): # logger.record_tabular(lossname, lossval) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = 
expert_dataset.get_next_batch(len(ob)) batch_size = len(ob) // d_step d_losses = [] # list of tuples, each of which gives the loss for a minibatch for _ in range(optim_epochs // 10): for ob_batch, ac_batch in dataset.iterbatches((ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # update running mean/std for reward_giver ob_batch = ob_batch[:, -ob_expert.shape[1]:][:-30] if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)[:, :-30]) # *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) *newlosses, g = reward_giver.lossandgrad(ob_batch[:, :-30], ob_expert[:, :-30]) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0 and iters_so_far % 10 == 0: disc_losses = np.mean(d_losses, axis=0) res = tf.get_default_session().run(merged, feed_dict={ generator_loss: disc_losses[0], expert_loss: disc_losses[1], entropy: disc_losses[2], entropy_loss: disc_losses[3], generator_acc: disc_losses[4], expert_acc: disc_losses[5], eplenmean: np.mean(lenbuffer), eprewmean: np.mean(rewbuffer), eptruerewmean: np.mean(true_rewbuffer), }) writer.add_summary(res, iters_so_far) writer.flush() if rank == 0: logger.dump_tabular()
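# --- Illustrative sketch, not part of the original sources: the discriminator step above
# averages its gradient over all MPI workers (allmean) before the MpiAdam update. The
# averaging on its own, runnable under mpirun with mpi4py installed:
import numpy as np
from mpi4py import MPI

def allmean(x, comm=MPI.COMM_WORLD):
    """Element-wise mean of a NumPy array across all MPI ranks."""
    assert isinstance(x, np.ndarray)
    out = np.empty_like(x)
    comm.Allreduce(x, out, op=MPI.SUM)   # sum the local arrays...
    out /= comm.Get_size()               # ...then divide by the number of workers
    return out

local_grad = np.random.randn(10)         # this worker's discriminator gradient
global_grad = allmean(local_grad)        # identical result on every rank
print(MPI.COMM_WORLD.Get_rank(), global_grad[:3])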