def learn(env, policy_func, dataset, task_name, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None):
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # Placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    if hasattr(pi, "ob_rms"):
        pi.ob_rms.update(dataset[0])
    for _ in range(int(max_iters)):
        for batch in iterbatches(dataset, batch_size=optim_batch_size):
            train_loss, g = lossandgrad(*batch, True)
            adam.update(g, optim_stepsize)

    if ckpt_dir is None:
        savedir_fname = tempfile.NamedTemporaryFile().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name + "_bc")
    U.save_variables(savedir_fname, variables=pi.get_variables())
    return savedir_fname
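A minimal usage sketch for the behavior-cloning learn above, assuming the OpenAI baselines layout (mlp_policy.MlpPolicy, tf_util session helpers); the expert arrays here are hypothetical zero-filled stand-ins for a real demonstration set:

import gym
import numpy as np
from baselines.common import tf_util as U
from baselines.ppo1 import mlp_policy

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

U.make_session(num_cpu=1).__enter__()
env = gym.make("Hopper-v2")
# Hypothetical expert demonstrations: [N, ob_dim] observations, [N, ac_dim] actions.
expert_obs = np.zeros((256, env.observation_space.shape[0]), dtype=np.float32)
expert_acs = np.zeros((256, env.action_space.shape[0]), dtype=np.float32)
ckpt = learn(env, policy_fn, (expert_obs, expert_acs), task_name="hopper", max_iters=10)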
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4, verbose=False):
    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # Placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    # loss = tf.reduce_mean(tf.square(ac - pi.ac))
    loss = tf.reduce_mean(pi.pd.neglogp(ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    if hasattr(pi, "obs_rms"):
        pi.obs_rms.update(dataset.obs)  # update running mean/std for policy
        print("Update obs normalization.")
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, False)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))
            eval_infos = runner(env, policy_func, None, timesteps_per_batch=1024,
                                number_trajs=10, stochastic_policy=args.stochastic_policy,
                                save=args.save_sample, reuse=True)
            logger.record_tabular("iter_so_far", iter_so_far + 1)
            for (key, value) in eval_infos.items():
                logger.record_tabular(key, value)
            logger.dump_tabular()
def getActionProbabilities(self, ob, ac_avail):
    with tf.variable_scope(self.scope):
        stochastic = True
        sequence_length = None
        ob_tfvar = tf_util.get_placeholder_cached(name=self.scope + "ob")
        ac_avail_tfvar = tf_util.get_placeholder_cached(name=self.scope + "acavail")
        logits = self.pd.logits.eval(feed_dict={
            ob_tfvar: ob[None],
            ac_avail_tfvar: ac_avail[None]
        })
        probs = np.exp(logits) / np.sum(np.exp(logits))
        return probs[0]
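The softmax in getActionProbabilities exponentiates raw logits, which can overflow for large values. A numerically stable variant (the standard max-shift trick, not part of the original code) would be:

import numpy as np

def stable_softmax(logits):
    # Subtracting the row-wise max leaves the softmax unchanged but keeps
    # np.exp from overflowing on large logits.
    z = logits - np.max(logits, axis=-1, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=-1, keepdims=True)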
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, log_dir=None,
          task_name=None, verbose=False):
    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # Placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    if hasattr(pi, "ob_rms"):
        pi.ob_rms.update(dataset.obs)  # update running mean/std for policy
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    # U.save_variables(savedir_fname, var_list=pi.get_variables())
    U.save_state(savedir_fname)
    return savedir_fname
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, log_dir=None,
          task_name=None, verbose=False):
    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # Placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    ret = tf.placeholder(dtype=tf.float32, shape=[None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    policy_loss = tf.reduce_mean(tf.square(ac - pi.ac))
    value_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    all_var_list = pi.get_trainable_variables()
    policy_var_list = [v for v in all_var_list
                       if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
    value_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    assert len(policy_var_list) == len(value_var_list) + 1
    policy_adam = MpiAdam(policy_var_list, epsilon=adam_epsilon)
    value_adam = MpiAdam(value_var_list, epsilon=adam_epsilon)
    policy_lossandgrad = U.function([ob, ac, stochastic],
                                    [policy_loss] + [U.flatgrad(policy_loss, policy_var_list)])
    value_lossandgrad = U.function([ob, ret, stochastic],
                                   U.flatgrad(value_loss, value_var_list))

    U.initialize()
    policy_adam.sync()
    value_adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert, ret = dataset.get_next_batch(optim_batch_size, 'train')
        policy_train_loss, policy_g = policy_lossandgrad(ob_expert, ac_expert, True)
        value_g = value_lossandgrad(ob_expert, ret, True)
        policy_adam.update(policy_g, optim_stepsize)
        value_adam.update(value_g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert, ret = dataset.get_next_batch(-1, 'val')
            policy_val_loss, _ = policy_lossandgrad(ob_expert, ac_expert, True)
            logger.log("[Policy] Training loss: {}, Validation loss: {}".format(
                policy_train_loss, policy_val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    # U.save_state(savedir_fname, var_list=pi.get_variables())
    U.save_variables(savedir_fname, variables=pi.get_trainable_variables())
    return savedir_fname
def learn(env, policy_func, timesteps_per_batch, max_kl, cg_iters, gamma, lam,
          entcoeff, cg_damping, vf_stepsize, vf_iters,
          max_timesteps, max_episodes, max_iters, callback=None):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    # Setup losses and stuff
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # target advantage function
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # empirical return
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
def learn(env, policy_func, dataset, task_name, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None):
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # Placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    if type(ac_space) is gym.spaces.Discrete:
        discrete = True
        loss = tf.reduce_mean(pi.pd.neglogp(ac))
    else:
        discrete = False
        loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    if hasattr(pi, "ob_rms"):
        pi.ob_rms.update(dataset[0])
    logger.info("Start Behavioral cloning...")
    logger.info("Iter, train_loss")
    for i in range(int(max_iters)):
        iter_train_losses = []
        for batch in iterbatches(dataset, batch_size=optim_batch_size):
            if discrete:
                batch = (batch[0], np.argmax(batch[1], axis=-1))
            train_loss, g = lossandgrad(*batch, True)
            adam.update(g, optim_stepsize)
            iter_train_losses.append(train_loss)
        logger.info(str(i + 1) + "," + str(np.mean(iter_train_losses)))

    if ckpt_dir is None:
        savedir_fname = tempfile.NamedTemporaryFile().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name + "_bc")
    U.save_variables(savedir_fname, variables=pi.get_variables())
    return savedir_fname
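The two continuous-action BC losses used above coincide up to constants: for a diagonal Gaussian policy with fixed standard deviation, the negative log-likelihood is 0.5 * ||a - mu||^2 / sigma^2 plus terms independent of mu, so minimizing neglogp with fixed sigma is equivalent to MSE on the mean action. A small numpy check (illustrative only; gaussian_neglogp mirrors baselines' DiagGaussianPd.neglogp):

import numpy as np

def gaussian_neglogp(ac, mean, logstd):
    # Negative log-density of a diagonal Gaussian.
    std = np.exp(logstd)
    return (0.5 * np.sum(np.square((ac - mean) / std), axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * ac.shape[-1]
            + np.sum(logstd, axis=-1))

ac = np.array([[0.3, -1.2]])
mean = np.array([[0.1, -1.0]])
logstd = np.zeros((1, 2))  # sigma = 1: neglogp = 0.5*||ac - mean||^2 + const
mse_term = 0.5 * np.sum(np.square(ac - mean), axis=-1)
const = 0.5 * np.log(2.0 * np.pi) * 2
assert np.allclose(gaussian_neglogp(ac, mean, logstd), mse_term + const)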
def pretrain(pi, env):
    print("Running {} initialization episodes...".format(env.warm_init_eps), flush=True)
    n_rollouts = env.warm_init_eps
    tf_ob = U.get_placeholder_cached(name="ob")
    ob = env.reset()
    obs = np.array([ob for _ in range(n_rollouts * (env.spec.max_episode_steps + 1))])
    obs_len = 0
    graph = tf.get_default_graph()
    pdparam = graph.get_tensor_by_name("pi/pdparam:0")
    pdparam_shape = pdparam.shape[1].value
    mean, _, logstd, _ = tf.split(pdparam, [
        len(SIMPLE_AC), pdparam_shape // 2 - len(SIMPLE_AC),
        len(SIMPLE_AC), pdparam_shape // 2 - len(SIMPLE_AC)
    ], 1)
    ac_mean = tf.constant(SIMPLE_AC, dtype=tf.float32)
    ac_logstd = tf.constant(np.array([0] * len(SIMPLE_AC)), dtype=tf.float32)
    print("Completed:", flush=True)
    for ep in range(n_rollouts):
        ob = env.reset()
        obs[obs_len] = ob
        obs_len += 1
        done = False
        while not done:
            ac, vpred = pi.act(True, ob)
            ac[:4] = SIMPLE_AC + 0.01 * np.random.randn(4)
            ac[4:] = 0
            ob, _, done, _ = env.step(ac)
            obs[obs_len] = ob
            obs_len += 1
        print(ep + 1, flush=True)
    obs = obs[:obs_len]

    with tf.variable_scope("pretrain"):
        loss = tf.nn.l2_loss(mean - ac_mean) + tf.nn.l2_loss(logstd - ac_logstd)
        opt = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)
    batch_size = 32
    num_epochs = 10
    U.get_session().run(tf.variables_initializer(
        set(tf.global_variables()) - U.ALREADY_INITIALIZED))
    for ep in range(num_epochs):
        for i in range(len(obs) // batch_size):
            idx = np.random.choice(len(obs), batch_size)
            U.get_session().run([opt, loss], feed_dict={tf_ob: obs[idx]})
    env.n_episodes = 0
    print("Policy initialized!", flush=True)
def evaluate(env, policy_func, load_model_path, video_prefix, record, render, *,
             timesteps_per_batch  # what to evaluate on
             ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    U.initialize()
    U.load_state(load_model_path)

    ep_gen = traj_episode_generator(pi, env, timesteps_per_batch,
                                    stochastic=False, record=record, render=render)
    ep_lens = []
    ep_rets = []
    visual_obs = []
    if record:
        record_dir = os.path.join(os.path.dirname(load_model_path), 'video')
        os.makedirs(record_dir, exist_ok=True)
    for ep_idx in tqdm(range(10)):
        ep_traj = ep_gen.__next__()
        ep_lens.append(ep_traj["ep_len"])
        ep_rets.append(ep_traj["ep_ret"])
        # Video recording (every other episode)
        if ep_idx % 2 == 0 and record:
            visual_obs = ep_traj["visual_obs"]
            if video_prefix is None:
                video_path = os.path.join(record_dir, '{}.mp4'.format(ep_idx))
            else:
                video_path = os.path.join(record_dir, '{}-{}.mp4'.format(video_prefix, ep_idx))
            fps = 15.

            def f(t):
                frame_length = len(visual_obs)
                new_fps = 1. / (1. / fps + 1. / frame_length)
                idx = min(int(t * new_fps), frame_length - 1)
                return visual_obs[idx]

            video = mpy.VideoClip(f, duration=len(visual_obs) / fps + 2)
            video.write_videofile(video_path, fps, verbose=False)
    print('Episode Length: {}'.format(sum(ep_lens) / 10.))
    print('Episode Rewards: {}'.format(sum(ep_rets) / 10.))
def __init__(self, ob_space, ac_space, lr=5e-4, ent_coef=0.00):
    self.sess = tf.get_default_session()
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.lr = lr
    self.pi = Policy(name="pi", ob_space=ob_space, ac_space=ac_space,
                     reuse=False, hid_size=64, num_hid_layers=2)
    ob = U.get_placeholder_cached(name="ob")
    ac = self.pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    # Behavior-cloning loss with an entropy bonus. Build it before lossandgrad
    # so the entropy term actually contributes to the gradient (in the original,
    # the entropy-regularized loss was defined after lossandgrad and never used).
    self.loss = (tf.reduce_mean(tf.square(ac - self.pi.ac))
                 - ent_coef * tf.reduce_mean(self.pi.pd.entropy()))
    var_list = self.pi.get_trainable_variables()
    self.adam = MpiAdam(var_list)
    self.lossandgrad = U.function([ob, ac, stochastic],
                                  [self.loss] + [U.flatgrad(self.loss, var_list)])
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, log_dir=None):
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # Placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size)
        # Named train_loss so the loop does not shadow the graph tensor `loss`.
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)

    savedir_fname = tempfile.TemporaryDirectory().name
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, log_dir=None,
          task_name=None, verbose=False):
    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # Placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
def bc_learn(bool_evaluate, robot, policy_func, dataset, optim_batch_size=64,
             max_iters=5 * 1e3, adam_epsilon=1e-5, optim_stepsize=3e-4,
             ckpt_dir=None, log_dir=None, task_name=None, verbose=False):
    val_per_iter = int(max_iters / 10)
    pi = policy_func("pi", robot.observation_space, robot.action_space)  # Construct network for new policy
    saver = tf.train.Saver()
    if bool_evaluate:
        saver.restore(tf.get_default_session(), U_.getPath() + '/model/bc.ckpt')
        return pi
    # Placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    print("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            print("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))
    saver.save(tf.get_default_session(), 'model/bc.ckpt')
    return pi
def load_policy(env, policy_func, *,
                clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
                adam_epsilon=1e-5, model_path, checkpoint):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
    ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()
    U.load_state(os.path.join(model_path, 'model-{}'.format(checkpoint)))
    return pi
def build_policy_training_vars(pi, oldpi, clip_param, entcoeff, adam_epsilon):
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
    ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)
    return loss_names, var_list, lossandgrad, adam, assign_old_eq_new, compute_losses
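Several of the learners above and below build the same clipped surrogate, L^CLIP = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)], and then negate its mean to obtain a loss to minimize. A toy numpy evaluation of the objective in isolation (illustrative only, not part of the original code):

import numpy as np

def ppo_clip_objective(ratio, adv, clip_param=0.2):
    # Pessimistic clipped surrogate: large ratios with positive advantage
    # are clipped at 1 + eps, so they stop contributing extra gradient.
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return np.mean(np.minimum(surr1, surr2))

ratio = np.array([0.5, 1.0, 1.5])  # pi_new / pi_old per sample
adv = np.array([1.0, -2.0, 3.0])   # advantage estimates
print(ppo_clip_objective(ratio, adv))  # the third sample is clipped at 1.2 * 3.0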
def learn(
        # =========== modified part begins =========== #
        env_id, seed,
        robot,  # robot class with GMM params
        joint_optimization_iters,  # total number of joint optimization iterations
        design_iters,  # number of samples when updating physical design in each joint optimization iteration
        policy_iters,  # number of samples when updating robot policy in each joint optimization iteration
        # ============ modified part ends ============ #
        policy_func, *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
        gamma, lam,  # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # ================================== modification 1 ================================== #
    """
    input:
        replace "env" (env class) with "env_id" (string); add "seed" (int)
    reason: to enable gym.make() during training
    modification detail: add the following lines into learn()
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        env.close()  # added at the end of learn()
    """
    import roboschool, gym
    from baselines import bench
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    # ================================== modification 1 ================================== #

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    # policy_func initializes the NN.
    # NN structure:
    #   state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value)
    # num_hid_layers, hid_size: set in the file that calls learn()
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # placeholder for "ob", created in mlppolicy.py
    ob = U.get_placeholder_cached(name="ob")
    # placeholder for "ac", in common/distribution.py
    ac = pi.pdtype.sample_placeholder([None])

    # KL divergence and entropy, defined in common/distribution.py
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    # pol_entpen: entropy bonus, encourages exploration
    # entcoeff: entropy coefficient, defined in PPO page 5, Equ. (9)
    pol_entpen = (-entcoeff) * meanent

    # probability ratio, defined in PPO page 3
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    # Surrogate goal, defined in PPO page 3, Equ. (7)
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    # Value function loss: squared error ||v_pred - v_target||
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    # total_loss (minimized) = -L^CLIP + value function loss - entropy bonus;
    # cf. PPO page 5, Equ. (9), which is stated for maximization
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    # MPI-synchronized Adam optimizer
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    # oldpi = pi
    assign_old_eq_new = U.function([], [], updates=[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
    ])
    # Evaluates the losses without computing gradients, for logging
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # ================================== modification 2 ================================== #
    for joint_optimization_iter in range(joint_optimization_iters):
        U.save_state('/home/yetong/Desktop/Project/models/model{}.ckpt'.format(joint_optimization_iter))
        logger.log("joint optimization progress: {}/{}".format(
            joint_optimization_iter, joint_optimization_iters))
        # ================================== update physical design ================================== #
        if joint_optimization_iter > 20:
            Rewards_plus = np.zeros(design_iters)
            Rewards_minus = np.zeros(design_iters)
            params = robot.sample(design_iters, to_update=True)
            for i, param in enumerate(params):
                robot.modify_file(param)
                env = gym.make(env_id)
                # myenv = env.env
                # pdb.set_trace()
                env = bench.Monitor(env, logger.get_dir())
                R = episode_generator(pi, env, gamma, stochastic=True)
                logger.log("\t update physical design: %d/%d, rew: %f" % (i, 2 * design_iters, R))
                if i % 2 == 0:
                    Rewards_plus[int(i / 2)] = R
                else:
                    Rewards_minus[int(i / 2)] = R
            logger.log("prev_mu: ", robot.params_mu)
            logger.log("prev_sig: ", robot.params_sig)
            robot.update(Rewards_plus, Rewards_minus)
            logger.log("mu: ", robot.params_mu)
            logger.log("sig: ", robot.params_sig)
        # ================================== update policy ================================== #
        # params = robot.sample(design_iters)
        params = [robot.params_mu]
        for param in params:
            # reinitialize env
            robot.modify_file(param)
            env = gym.make(env_id)
            env = bench.Monitor(env, logger.get_dir())
            # ================================== modification 2 ================================== #

            # Prepare for rollouts
            # ----------------------------------------
            seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            tstart = time.time()
            lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
            rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

            assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                        max_seconds > 0]) == 1, "Only one time constraint permitted"

            while True:
                if callback:
                    callback(locals(), globals())
                if max_timesteps and timesteps_so_far >= max_timesteps:
                    break
                elif max_episodes and episodes_so_far >= max_episodes:
                    break
                elif max_iters and iters_so_far >= max_iters:
                    break
                elif max_seconds and time.time() - tstart >= max_seconds:
                    break

                # annealing for stepsize parameters (epsilon and adam)
                if schedule == 'constant':
                    cur_lrmult = 1.0
                elif schedule == 'linear':
                    cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i ************" % iters_so_far)

                seg = seg_gen.__next__()
                add_vtarg_and_adv(seg, gamma, lam)

                # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
                vpredbefore = seg["vpred"]  # predicted value function before update
                atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
                d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                            shuffle=not pi.recurrent)
                optim_batchsize = optim_batchsize or ob.shape[0]

                if hasattr(pi, "ob_rms"):
                    pi.ob_rms.update(ob)  # update running mean/std for policy

                # oldpi = pi: set old parameter values to new parameter values
                assign_old_eq_new()
                logger.log("Optimizing...")
                logger.log(fmt_row(13, loss_names))
                # Here we do a bunch of optimization epochs over the data
                for _ in range(optim_epochs):
                    losses = []  # list of tuples, each of which gives the loss for a minibatch
                    for batch in d.iterate_once(optim_batchsize):
                        *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"], batch["vtarg"], cur_lrmult)
                        adam.update(g, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in d.iterate_once(optim_batchsize):
                    newlosses = compute_losses(batch["ob"], batch["ac"],
                                               batch["atarg"], batch["vtarg"], cur_lrmult)
                    losses.append(newlosses)
                meanlosses, _, _ = mpi_moments(losses, axis=0)
                logger.log(fmt_row(13, meanlosses))
                for (lossval, name) in zipsame(meanlosses, loss_names):
                    logger.record_tabular("loss_" + name, lossval)
                logger.record_tabular("ev_tdlam_before",
                                      explained_variance(vpredbefore, tdlamret))

                lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)
                logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                timesteps_so_far += sum(lens)
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                logger.record_tabular("TimeElapsed", time.time() - tstart)
                if MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()

    # ================================== modification 1 ================================== #
    env.close()
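Several functions here call add_vtarg_and_adv without defining it. This is the GAE(lambda) computation from baselines ppo1 (Schulman et al., 2016), sketched below for reference under the assumption that the seg dict carries "new", "rew", "vpred", and a bootstrap value "nextvpred":

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # GAE(lambda): delta_t = r_t + gamma*V(s_{t+1}) - V(s_t);
    # adv_t = sum_k (gamma*lam)^k * delta_{t+k}, reset at episode boundaries.
    new = np.append(seg["new"], 0)  # 1 marks the first step of an episode
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype="float32")
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]  # TD(lambda) value targets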
def __init__(self, state_dim, action_min, action_max,
             clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
             optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
             gamma, lam,  # advantage estimation
             max_iters_ppo=5000, adam_epsilon=1e-5,
             schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
             interpolate=False, hid_size=None, activation='tanh'):
    from baselines.ppo1 import mlp_policy
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=hid_size, activation=activation,
                                    interpolate=interpolate)

    high = 100 * np.ones(state_dim)
    low = -high
    self.ob_space = spaces.Box(low=low, high=high, dtype=np.float32)
    self.ac_space = spaces.Box(low=action_min, high=action_max, dtype=np.float32)
    self.pi = policy_fn("pi", self.ob_space, self.ac_space)  # Construct network for new policy
    self.oldpi = policy_fn("oldpi", self.ob_space, self.ac_space)  # Network for old policy
    self.atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    self.ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    self.lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * self.lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = self.pi.pdtype.sample_placeholder([None])

    kloldnew = self.oldpi.pd.kl(self.pi.pd)
    ent = self.pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(self.pi.pd.logp(ac) - self.oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * self.atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * self.atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(self.pi.vpred - self.ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = self.pi.get_trainable_variables()
    self.lossandgrad = U.function([ob, ac, self.atarg, self.ret, self.lrmult],
                                  losses + [U.flatgrad(total_loss, var_list)])
    self.adam = MpiAdam(var_list, epsilon=adam_epsilon)

    self.assign_old_eq_new = U.function([], [], updates=[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(self.oldpi.get_variables(), self.pi.get_variables())
    ])
    self.compute_losses = U.function([ob, ac, self.atarg, self.ret, self.lrmult], losses)

    U.initialize()
    self.adam.sync()

    self.episodes_so_far = 0
    self.timesteps_so_far = 0
    self.iters_so_far = 0
    self.gamma = gamma
    self.lam = lam
    self.optim_epochs = optim_epochs
    self.optim_stepsize = optim_stepsize
    self.optim_batchsize = optim_batchsize
    self.max_iters_ppo = max_iters_ppo
    self.schedule = schedule
def learn(
        env, test_env, policy_fn, *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
        gamma, lam,  # advantage estimation
        # CMA-ES
        max_fitness,  # has to be negative, as CMA-ES performs minimization
        popsize, gensize, bounds, sigma, eval_iters, max_v_train_iter,
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        seed, env_id):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    backup_pi = policy_fn("backup_pi", ob_space, ac_space)  # Network each individual adapts during the ES evolution
    pi_zero = policy_fn("zero_pi", ob_space, ac_space)  # pi_0 is only updated along with iterations

    reward = tf.placeholder(dtype=tf.float32, shape=[None])  # step rewards
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    next_ob = U.get_placeholder_cached(name="next_ob")  # next-step observation for updating the Q function
    ac = U.get_placeholder_cached(name="act")  # action placeholder for computing the Q function

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    pi_adv = pi.qpred - pi.vpred
    adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0])
    normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var)

    qf_loss = tf.reduce_mean(tf.square(reward + gamma * pi.vpred - pi.qpred))
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    qf_losses = [qf_loss]
    vf_losses = [vf_loss]
    pol_loss = -tf.reduce_mean(normalized_pi_adv)  # the advantage function should be improved
    losses = [pol_loss, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr_2", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    qf_var_list = [v for v in var_list if v.name.split("/")[1].startswith("qf")]
    vf_var_list = [v for v in var_list if v.name.split("/")[1].startswith("vf")]
    pol_var_list = [v for v in var_list if v.name.split("/")[1].startswith("pol")]

    vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                                vf_losses + [U.flatgrad(vf_loss, vf_var_list)])
    qf_lossandgrad = U.function([ob, ac, next_ob, lrmult, reward],
                                qf_losses + [U.flatgrad(qf_loss, qf_var_list)])

    qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon)
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
    ])
    assign_backup_eq_new = U.function([], [], updates=[
        tf.assign(backup_v, newv)
        for (backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
    ])
    assign_new_eq_backup = U.function([], [], updates=[
        tf.assign(newv, backup_v)
        for (newv, backup_v) in zipsame(pi.get_variables(), backup_pi.get_variables())
    ])

    # Compute all losses
    mean_pi_actions = U.function([ob], [pi.pd.mode()])  # later used for computing pol_loss
    compute_pol_losses = U.function([ob, next_ob, ac], [pol_loss])

    U.initialize()

    get_pi_flat_params = U.GetFlat(pol_var_list)
    set_pi_flat_params = U.SetFromFlat(pol_var_list)

    vf_adam.sync()
    qf_adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, ppo_timesteps_so_far, best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    ppo_timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    best_fitness = np.inf

    eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch,
                                           stochastic=True)  # for evaluation
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch,
                                     stochastic=True, eval_gen=eval_gen)  # for training the V function

    # Build a rollout generator for each CMA-ES solution
    actors = []
    best_fitness = 0
    for i in range(popsize):
        newActor = traj_segment_generator(pi, env, timesteps_per_actorbatch,
                                          stochastic=True, eval_gen=eval_gen)
        actors.append(newActor)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            print("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            print("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            print("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            print("Max time")
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        # Generate new samples and train the V function
        for i in range(max_v_train_iter):
            logger.log("Iteration: " + str(iters_so_far) +
                       " - sub-train iter for V func: " + str(i))
            logger.log("Generate New Samples")
            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = \
                seg["ob"], seg["ac"], seg["next_ob"], seg["adv"], seg["rew"], \
                seg["tdlamret"], seg["traj_index"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for normalization
            assign_old_eq_new()  # set old parameter values to new parameter values

            # Train the V function
            logger.log("Training V Func and Evaluating V Func Losses")
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                   batch["atarg"], batch["vtarg"], cur_lrmult)
                    vf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(vf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            d_q = Dataset(dict(ob=ob, ac=ac, next_ob=next_ob, reward=reward,
                               atarg=atarg, vtarg=tdlamret),
                          shuffle=not pi.recurrent)
            # Re-train the Q function
            logger.log("Training Q Func and Evaluating Q Func Losses")
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d_q.iterate_once(optim_batchsize):
                    *qf_losses, g = qf_lossandgrad(batch["next_ob"], batch["ac"],
                                                   batch["ob"], cur_lrmult, batch["reward"])
                    qf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(qf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # Train the policy with CMA-ES
        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_backup_eq_new()  # backup current policy
        flatten_weights = get_pi_flat_params()
        opt = cma.CMAOptions()
        opt['tolfun'] = max_fitness
        opt['popsize'] = popsize
        opt['maxiter'] = gensize
        opt['verb_disp'] = 0
        opt['verb_log'] = 0
        opt['seed'] = seed
        opt['AdaptSigma'] = True
        es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt)
        while True:
            if es.countiter >= gensize:
                logger.log("Max generations for current layer")
                break
            logger.log("Iteration: " + str(iters_so_far) +
                       " - sub-train Generation for Policy: " + str(es.countiter))
            logger.log("Sigma=" + str(es.sigma))
            solutions = es.ask()
            costs = []
            lens = []
            assign_backup_eq_new()  # backup current policy
            for id, solution in enumerate(solutions):
                set_pi_flat_params(solution)
                losses = []
                cost = compute_pol_losses(ob, ob, mean_pi_actions(ob)[0])
                costs.append(cost[0])
                assign_new_eq_backup()
            # Weight decay: add the per-solution L2 penalty elementwise
            # (the original `costs += l2_decay` would extend the Python list
            # with the array's elements instead of adding them).
            l2_decay = compute_weight_decay(0.99, solutions)
            costs = np.array(costs) + l2_decay
            # costs, real_costs = fitness_normalization(costs)
            costs, real_costs = fitness_rank(costs)
            es.tell_real_seg(solutions=solutions, function_values=costs,
                             real_f=real_costs, segs=None)
            best_solution = es.result[0]
            best_fitness = es.result[1]
            logger.log("Best Solution Fitness: " + str(best_fitness))
            set_pi_flat_params(best_solution)

        iters_so_far += 1
        episodes_so_far += sum(lens)
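compute_weight_decay is not defined in this snippet. In ES codebases (e.g. estool) it is typically a penalty proportional to the mean squared parameters of each candidate; sign conventions vary between implementations, so the sketch below assumes larger weights should increase the minimized cost:

import numpy as np

def compute_weight_decay(weight_decay, solutions):
    # Per-candidate L2 penalty: weight_decay * mean(theta^2), one value per row.
    params = np.array(solutions)  # shape [popsize, n_parameters]
    return weight_decay * np.mean(params * params, axis=1)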
def learn(
        env, policy_fn, *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
        gamma, lam,  # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        **kwargs,
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    atarg_novel = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function for the novelty reward term
    ret_novel = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return for the novelty reward term

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    surr1_novel = ratio * atarg_novel  # surrogate loss of the novelty term
    surr2_novel = tf.clip_by_value(ratio, 1.0 - clip_param,
                                   1.0 + clip_param) * atarg_novel  # surrogate loss of the novelty term

    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    pol_surr_novel = -tf.reduce_mean(tf.minimum(surr1_novel, surr2_novel))  # PPO's surrogate for the novelty part

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    vf_loss_novel = tf.reduce_mean(tf.square(pi.vpred_novel - ret_novel))

    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    total_loss_novel = pol_surr_novel + pol_entpen + vf_loss_novel
    losses_novel = [pol_surr_novel, pol_entpen, vf_loss_novel, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    # Count the flat parameters of the policy head
    policy_var_list = pi.get_trainable_variables(scope='pi/pol')
    policy_var_count = 0
    for var in policy_var_list:
        count_in_var = 1
        for dim in var.shape._dims:
            count_in_var *= dim
        policy_var_count += count_in_var
    noise_count = pi.get_trainable_variables(scope='pi/pol/logstd')[0].shape._dims[1]

    var_list = (pi.get_trainable_variables(scope='pi/pol')
                + pi.get_trainable_variables(scope='pi/vf/'))
    var_list_novel = (pi.get_trainable_variables(scope='pi/pol')
                      + pi.get_trainable_variables(scope='pi/vf_novel/'))
    var_list_pi = (pi.get_trainable_variables(scope='pi/pol')
                   + pi.get_trainable_variables(scope='pi/vf/')
                   + pi.get_trainable_variables(scope='pi/vf_novel/'))

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    lossandgrad_novel = U.function([ob, ac, atarg_novel, ret_novel, lrmult],
                                   losses_novel + [U.flatgrad(total_loss_novel, var_list_novel)])

    # adam = MpiAdam(var_list, epsilon=adam_epsilon)
    # adam_novel = MpiAdam(var_list_novel, epsilon=adam_epsilon)
    adam_all = MpiAdam(var_list_pi, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
    ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)
    compute_losses_novel = U.function([ob, ac, atarg_novel, ret_novel, lrmult], losses_novel)

    U.initialize()
    adam_all.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    novelty_update_iter_cycle = 10
    novelty_start_iter = 50
    novelty_update = True
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    rewnovelbuffer = deque(maxlen=100)  # rolling buffer for episode novelty rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    # For debugging purposes:
    # from collections import defaultdict
    # sum_batch = {}
    # sum_batch = defaultdict(lambda: 0, sum_batch)
    total_task_gradients = []
    total_novelty_gradients = []
    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, atarg_novel, tdlamret, tdlamret_novel = \
            seg["ob"], seg["ac"], seg["adv"], seg["adv_novel"], \
            seg["tdlamret"], seg["tdlamret_novel"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        vprednovelbefore = seg['vpred_novel']  # predicted novelty value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        atarg_novel = (atarg_novel - atarg_novel.mean()) / atarg_novel.std()  # standardized novelty advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret,
                         atarg_novel=atarg_novel, vtarg_novel=tdlamret_novel),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))

        same_update_direction = []
        task_gradient_mag = []
        novel_gradient_mag = []
        task_gradients = []
        novel_gradients = []
        same_dir_cnt = 0
        oppo_dir_cnt = 0
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"], cur_lrmult)
                *newlosses_novel, g_novel = lossandgrad_novel(
                    batch["ob"], batch["ac"], batch["atarg_novel"],
                    batch["vtarg_novel"], cur_lrmult)

                # The first policy_var_count entries of each flat gradient
                # belong to the shared policy head.
                pol_g = g[0:policy_var_count]
                pol_g_novel = g_novel[0:policy_var_count]

                # Average the policy gradients across MPI workers
                comm = MPI.COMM_WORLD
                pol_g_reduced = np.zeros_like(pol_g)
                pol_g_novel_reduced = np.zeros_like(pol_g_novel)
                comm.Allreduce(pol_g, pol_g_reduced, op=MPI.SUM)
                pol_g_reduced /= comm.Get_size()
                comm.Allreduce(pol_g_novel, pol_g_novel_reduced, op=MPI.SUM)
                pol_g_novel_reduced /= comm.Get_size()

                final_gradient = np.zeros(len(g) + len(g_novel) - policy_var_count)
                final_gradient[policy_var_count::] = np.concatenate(
                    (g[policy_var_count::], g_novel[policy_var_count::]))

                # pol_g_normalized = pol_g / np.linalg.norm(pol_g)
                # pol_g_novel_normalized = pol_g_novel / np.linalg.norm(pol_g_novel)
                pol_g_reduced_no_noise = pol_g_reduced[:(len(pol_g_reduced) - noise_count)]
                pol_g_novel_reduced_no_noise = pol_g_novel_reduced[:(len(pol_g_novel_reduced) - noise_count)]
                pol_g_reduced_no_noise_normalized = \
                    pol_g_reduced_no_noise / np.linalg.norm(pol_g_reduced_no_noise)
                pol_g_novel_reduced_no_noise_normalized = \
                    pol_g_novel_reduced_no_noise / np.linalg.norm(pol_g_novel_reduced_no_noise)
                dot = np.dot(pol_g_reduced_no_noise_normalized,
                             pol_g_novel_reduced_no_noise_normalized)

                task_gradients.append(pol_g_reduced_no_noise)
                novel_gradients.append(pol_g_novel_reduced_no_noise)
                task_gradient_mag.append(np.linalg.norm(pol_g_reduced_no_noise))
                novel_gradient_mag.append(np.linalg.norm(pol_g_novel_reduced_no_noise))
                same_update_direction.append(dot)

                pol_g_reduced_normalized = pol_g_reduced / np.linalg.norm(pol_g_reduced)
                pol_g_novel_reduced_normalized = \
                    pol_g_novel_reduced / np.linalg.norm(pol_g_novel_reduced)

                if dot > 0:
                    # Gradients agree: step along their normalized bisector
                    same_dir_cnt += 1
                    bisector_no_noise = pol_g_reduced_normalized + pol_g_novel_reduced_normalized
                    bisector_no_noise_normalized = \
                        bisector_no_noise / np.linalg.norm(bisector_no_noise)
                    # quarterSector_no_noise = (pol_g_reduced_normalized + bisector_no_noise_normalized)
                    # quarterSector_no_noise_normalized = quarterSector_no_noise / np.linalg.norm(quarterSector_no_noise)
                    # octSector_no_noise = (pol_g_reduced_normalized + quarterSector_no_noise_normalized)
                    # octSector_no_noise_normalized = octSector_no_noise / np.linalg.norm(octSector_no_noise)
                    target_dir = bisector_no_noise_normalized
                    final_gradient[0:policy_var_count] = 0.5 * (
                        np.dot(pol_g_reduced, target_dir)
                        + np.dot(pol_g_novel_reduced, target_dir)) * target_dir
                    adam_all.update(final_gradient, optim_stepsize * cur_lrmult)
                else:
                    # Gradients conflict: project the novelty component out of
                    # the task gradient
                    oppo_dir_cnt += 1
                    task_projection_no_noise = np.dot(
                        pol_g_reduced,
                        pol_g_novel_reduced_normalized) * pol_g_novel_reduced_normalized
                    final_pol_gradient_no_noise = pol_g_reduced - task_projection_no_noise
                    final_gradient[0:policy_var_count] = final_pol_gradient_no_noise
                    adam_all.update(final_gradient, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"], cur_lrmult)
            # newlosses_novel = compute_losses_novel(batch["ob"], batch["ac"],
            #                                        batch["atarg_novel"], batch["vtarg_novel"],
            #                                        cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg['ep_rets_novel'])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, rews_novel = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        rewnovelbuffer.extend(rews_novel)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpRNoveltyRewMean", np.mean(rewnovelbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if iters_so_far >= novelty_start_iter and iters_so_far % novelty_update_iter_cycle == 0:
            novelty_update = not novelty_update
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("RelativeDirection", np.array(same_update_direction).mean())
        logger.record_tabular("SameDirectionCount", same_dir_cnt)
        logger.record_tabular("OppoDirectionCount", oppo_dir_cnt)
        logger.record_tabular("TaskGradMag", np.array(task_gradient_mag).mean())
        logger.record_tabular("NoveltyGradMag", np.array(novel_gradient_mag).mean())
        task_gradients = np.array(task_gradients).mean(axis=0)
        total_task_gradients.append(task_gradients)
        novel_gradients = np.array(novel_gradients).mean(axis=0)
        total_novelty_gradients.append(novel_gradients)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    if MPI.COMM_WORLD.Get_rank() == 0:
        gradient_info = {}
        gradient_info['task_gradients'] = np.array(total_task_gradients)
        gradient_info['novelty_gradients'] = np.array(total_novelty_gradients)
        print(np.array(total_task_gradients).shape)
        print(np.array(total_novelty_gradients).shape)
        joblib.dump(gradient_info, logger.get_dir() + '/gradientinfo.pkl', compress=True)
    return pi
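The gradient-combination rule above, stripped of the MPI and flat-parameter bookkeeping: when task and novelty gradients agree (positive dot product of their directions), both are projected onto their normalized bisector; when they conflict, the novelty component is projected out of the task gradient. A small numpy sketch of that rule (illustrative, not the original helper):

import numpy as np

def combine_gradients(g_task, g_novel):
    u = g_task / np.linalg.norm(g_task)
    v = g_novel / np.linalg.norm(g_novel)
    if np.dot(u, v) > 0:
        # Agreement: average the projections of both gradients onto the bisector.
        bisector = (u + v) / np.linalg.norm(u + v)
        return 0.5 * (np.dot(g_task, bisector) + np.dot(g_novel, bisector)) * bisector
    # Conflict: remove the component of the task gradient along the novelty direction.
    return g_task - np.dot(g_task, v) * v

# Conflicting case: the result is orthogonal to the novelty direction.
g = combine_gradients(np.array([1.0, 0.0]), np.array([-1.0, 1.0]))  # -> [0.5, 0.5]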
def learn(env, policy_fn, *,
          timesteps_per_actorbatch,  # timesteps per actor per update
          clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          gamma, lam,  # advantage estimation
          max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
          callback=None,  # you can do anything in the callback, since it takes locals(), globals()
          adam_epsilon=1e-5,
          schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
          ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, \
        "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                            batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"],
                                       batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
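# NOTE: the PPO loops above call `add_vtarg_and_adv(seg, gamma, lam)` to fill
# seg["adv"] and seg["tdlamret"], but the helper is not shown in this excerpt.
# A minimal sketch of the baselines GAE(lambda) version, assuming the rollout
# dict carries "new", "rew", "vpred" and a bootstrap value "nextvpred":
import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    """Compute GAE(lambda) advantages and TD(lambda) value targets in place."""
    new = np.append(seg["new"], 0)                     # episode-start flags; extra 0 for t+1 lookahead
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap with V(s_T)
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]        # value target = advantage + baseline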
def learn( make_env, make_policy, *, n_episodes, horizon, delta, gamma, max_iters, sampler=None, use_natural_gradient=False, #can be 'exact', 'approximate' fisher_reg=1e-2, iw_method='is', iw_norm='none', bound='J', line_search_type='parabola', save_weights=False, improvement_tol=0., center_return=False, render_after=None, max_offline_iters=100, callback=None, clipping=False, entropy='none', positive_return=False, reward_clustering='none'): np.set_printoptions(precision=3) max_samples = horizon * n_episodes if line_search_type == 'binary': line_search = line_search_binary elif line_search_type == 'parabola': line_search = line_search_parabola else: raise ValueError() # Building the environment env = make_env() ob_space = env.observation_space ac_space = env.action_space # Building the policy pi = make_policy('pi', ob_space, ac_space) oldpi = make_policy('oldpi', ob_space, ac_space) all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split('/')[1].startswith('pol') ] shapes = [U.intprod(var.get_shape().as_list()) for var in var_list] n_parameters = sum(shapes) # Placeholders ob_ = ob = U.get_placeholder_cached(name='ob') ac_ = pi.pdtype.sample_placeholder([max_samples], name='ac') mask_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='mask') rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='rew') disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='disc_rew') gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient') iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number') losses_with_name = [] # Policy densities target_log_pdf = pi.pd.logp(ac_) behavioral_log_pdf = oldpi.pd.logp(ac_) log_ratio = target_log_pdf - behavioral_log_pdf # Split operations disc_rew_split = tf.stack(tf.split(disc_rew_ * mask_, n_episodes)) rew_split = tf.stack(tf.split(rew_ * mask_, n_episodes)) log_ratio_split = tf.stack(tf.split(log_ratio * mask_, n_episodes)) target_log_pdf_split = tf.stack( tf.split(target_log_pdf * mask_, n_episodes)) behavioral_log_pdf_split = tf.stack( tf.split(behavioral_log_pdf * mask_, n_episodes)) mask_split = tf.stack(tf.split(mask_, n_episodes)) # Renyi divergence emp_d2_split = tf.stack( tf.split(pi.pd.renyi(oldpi.pd, 2) * mask_, n_episodes)) emp_d2_cum_split = tf.reduce_sum(emp_d2_split, axis=1) empirical_d2 = tf.reduce_mean(tf.exp(emp_d2_cum_split)) # Return ep_return = tf.reduce_sum(mask_split * disc_rew_split, axis=1) if clipping: rew_split = tf.clip_by_value(rew_split, -1, 1) if center_return: ep_return = ep_return - tf.reduce_mean(ep_return) rew_split = rew_split - (tf.reduce_sum(rew_split) / (tf.reduce_sum(mask_split) + 1e-24)) discounter = [pow(gamma, i) for i in range(0, horizon)] # Decreasing gamma discounter_tf = tf.constant(discounter) disc_rew_split = rew_split * discounter_tf return_mean = tf.reduce_mean(ep_return) return_std = U.reduce_std(ep_return) return_max = tf.reduce_max(ep_return) return_min = tf.reduce_min(ep_return) return_abs_max = tf.reduce_max(tf.abs(ep_return)) return_step_max = tf.reduce_max(tf.abs(rew_split)) # Max step reward return_step_mean = tf.abs(tf.reduce_mean(rew_split)) positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split)) negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split)) return_step_maxmin = tf.abs(positive_step_return_max - negative_step_return_max) losses_with_name.extend([(return_mean, 'InitialReturnMean'), (return_max, 'InitialReturnMax'), (return_min, 'InitialReturnMin'), 
(return_std, 'InitialReturnStd'), (empirical_d2, 'EmpiricalD2'), (return_step_max, 'ReturnStepMax'), (return_step_maxmin, 'ReturnStepMaxmin')]) if iw_method == 'pdis': # log_ratio_split cumulative sum log_ratio_cumsum = tf.cumsum(log_ratio_split, axis=1) # Exponentiate ratio_cumsum = tf.exp(log_ratio_cumsum) # Multiply by the step-wise reward (not episode) ratio_reward = ratio_cumsum * disc_rew_split # Average on episodes ratio_reward_per_episode = tf.reduce_sum(ratio_reward, axis=1) w_return_mean = tf.reduce_sum(ratio_reward_per_episode, axis=0) / n_episodes # Get d2(w0:t) with mask d2_w_0t = tf.exp(tf.cumsum(emp_d2_split, axis=1)) * mask_split # LEAVE THIS OUTSIDE # Sum d2(w0:t) over timesteps episode_d2_0t = tf.reduce_sum(d2_w_0t, axis=1) # Sample variance J_sample_variance = (1 / (n_episodes - 1)) * tf.reduce_sum( tf.square(ratio_reward_per_episode - w_return_mean)) losses_with_name.append((J_sample_variance, 'J_sample_variance')) losses_with_name.extend([(tf.reduce_max(ratio_cumsum), 'MaxIW'), (tf.reduce_min(ratio_cumsum), 'MinIW'), (tf.reduce_mean(ratio_cumsum), 'MeanIW'), (U.reduce_std(ratio_cumsum), 'StdIW')]) losses_with_name.extend([(tf.reduce_max(d2_w_0t), 'MaxD2w0t'), (tf.reduce_min(d2_w_0t), 'MinD2w0t'), (tf.reduce_mean(d2_w_0t), 'MeanD2w0t'), (U.reduce_std(d2_w_0t), 'StdD2w0t')]) elif iw_method == 'is': iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1)) if iw_norm == 'none': iwn = iw / n_episodes w_return_mean = tf.reduce_sum(iwn * ep_return) J_sample_variance = (1 / (n_episodes - 1)) * tf.reduce_sum( tf.square(iw * ep_return - w_return_mean)) losses_with_name.append((J_sample_variance, 'J_sample_variance')) elif iw_norm == 'sn': iwn = iw / tf.reduce_sum(iw) w_return_mean = tf.reduce_sum(iwn * ep_return) elif iw_norm == 'regression': iwn = iw / n_episodes mean_iw = tf.reduce_mean(iw) beta = tf.reduce_sum( (iw - mean_iw) * ep_return * iw) / (tf.reduce_sum( (iw - mean_iw)**2) + 1e-24) w_return_mean = tf.reduce_mean(iw * ep_return - beta * (iw - 1)) else: raise NotImplementedError() ess_classic = tf.linalg.norm(iw, 1)**2 / tf.linalg.norm(iw, 2)**2 sqrt_ess_classic = tf.linalg.norm(iw, 1) / tf.linalg.norm(iw, 2) ess_renyi = n_episodes / empirical_d2 losses_with_name.extend([(tf.reduce_max(iwn), 'MaxIWNorm'), (tf.reduce_min(iwn), 'MinIWNorm'), (tf.reduce_mean(iwn), 'MeanIWNorm'), (U.reduce_std(iwn), 'StdIWNorm'), (tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'), (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'), (ess_classic, 'ESSClassic'), (ess_renyi, 'ESSRenyi')]) elif iw_method == 'rbis': # Check if we need to cluster rewards rew_clustering_options = reward_clustering.split(':') if reward_clustering == 'none': pass # Do nothing elif rew_clustering_options[0] == 'global': assert len( rew_clustering_options ) == 2, "Reward clustering: Provide the correct number of parameters" N = int(rew_clustering_options[1]) tf.add_to_collection( 'prints', tf.Print(ep_return, [ep_return], 'ep_return', summarize=20)) global_rew_min = tf.Variable(float('+inf'), trainable=False) global_rew_max = tf.Variable(float('-inf'), trainable=False) rew_min = tf.reduce_min(ep_return) rew_max = tf.reduce_max(ep_return) global_rew_min = tf.assign(global_rew_min, tf.minimum(global_rew_min, rew_min)) global_rew_max = tf.assign(global_rew_max, tf.maximum(global_rew_max, rew_max)) interval_size = (global_rew_max - global_rew_min) / N ep_return = tf.floordiv(ep_return, interval_size) * interval_size elif rew_clustering_options[0] == 'batch': assert len( rew_clustering_options ) == 2, "Reward 
clustering: Provide the correct number of parameters" N = int(rew_clustering_options[1]) rew_min = tf.reduce_min(ep_return) rew_max = tf.reduce_max(ep_return) interval_size = (rew_max - rew_min) / N ep_return = tf.floordiv(ep_return, interval_size) * interval_size elif rew_clustering_options[0] == 'manual': assert len( rew_clustering_options ) == 4, "Reward clustering: Provide the correct number of parameters" N, rew_min, rew_max = map(int, rew_clustering_options[1:]) interval_size = (rew_max - rew_min) / N # Clip to avoid overflow and cluster ep_return = tf.clip_by_value(ep_return, rew_min, rew_max) ep_return = tf.floordiv(ep_return, interval_size) * interval_size else: raise Exception('Unrecognized reward clustering scheme.') # Get pdfs for episodes target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1) behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdf_split, axis=1) # Normalize log_proba (avoid as overflows as possible) normalization_factor = tf.reduce_mean( tf.stack([target_log_pdf_episode, behavioral_log_pdf_episode])) target_norm_log_pdf_episode = target_log_pdf_episode - normalization_factor behavioral_norm_log_pdf_episode = behavioral_log_pdf_episode - normalization_factor # Exponentiate target_pdf_episode = tf.clip_by_value( tf.cast(tf.exp(target_norm_log_pdf_episode), tf.float64), 1e-300, 1e+300) behavioral_pdf_episode = tf.clip_by_value( tf.cast(tf.exp(behavioral_norm_log_pdf_episode), tf.float64), 1e-300, 1e+300) tf.add_to_collection( 'asserts', tf.assert_positive(target_pdf_episode, name='target_pdf_positive')) tf.add_to_collection( 'asserts', tf.assert_positive(behavioral_pdf_episode, name='behavioral_pdf_positive')) # Compute the merging matrix (reward-clustering) and the number of clusters reward_unique, reward_indexes = tf.unique(ep_return) episode_clustering_matrix = tf.cast( tf.one_hot(reward_indexes, n_episodes), tf.float64) max_index = tf.reduce_max(reward_indexes) + 1 trajectories_per_cluster = tf.reduce_sum(episode_clustering_matrix, axis=0)[:max_index] tf.add_to_collection( 'asserts', tf.assert_positive(tf.reduce_sum(episode_clustering_matrix, axis=0)[:max_index], name='clustering_matrix')) # Get the clustered pdfs clustered_target_pdf = tf.matmul( tf.reshape(target_pdf_episode, (1, -1)), episode_clustering_matrix)[0][:max_index] clustered_behavioral_pdf = tf.matmul( tf.reshape(behavioral_pdf_episode, (1, -1)), episode_clustering_matrix)[0][:max_index] tf.add_to_collection( 'asserts', tf.assert_positive(clustered_target_pdf, name='clust_target_pdf_positive')) tf.add_to_collection( 'asserts', tf.assert_positive(clustered_behavioral_pdf, name='clust_behavioral_pdf_positive')) # Compute the J ratio_clustered = clustered_target_pdf / clustered_behavioral_pdf #ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique # ---- No cluster cardinality ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique * tf.cast( trajectories_per_cluster, tf.float32) # ---- Cluster cardinality #w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(max_index, tf.float32) # ---- No cluster cardinality w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast( n_episodes, tf.float32) # ---- Cluster cardinality # Divergences ess_classic = tf.linalg.norm(ratio_reward, 1)**2 / tf.linalg.norm( ratio_reward, 2)**2 sqrt_ess_classic = tf.linalg.norm(ratio_reward, 1) / tf.linalg.norm( ratio_reward, 2) ess_renyi = n_episodes / empirical_d2 # Summaries losses_with_name.extend([(tf.reduce_max(ratio_clustered), 'MaxIW'), (tf.reduce_min(ratio_clustered), 
'MinIW'), (tf.reduce_mean(ratio_clustered), 'MeanIW'), (U.reduce_std(ratio_clustered), 'StdIW'), (1 - (max_index / n_episodes), 'RewardCompression'), (ess_classic, 'ESSClassic'), (ess_renyi, 'ESSRenyi')]) else: raise NotImplementedError() if bound == 'J': bound_ = w_return_mean elif bound == 'std-d2': bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi)) * return_std elif bound == 'max-d2': var_estimate = tf.sqrt( (1 - delta) / (delta * ess_renyi)) * return_abs_max bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi)) * return_abs_max elif bound == 'max-ess': bound_ = w_return_mean - tf.sqrt( (1 - delta) / delta) / sqrt_ess_classic * return_abs_max elif bound == 'std-ess': bound_ = w_return_mean - tf.sqrt( (1 - delta) / delta) / sqrt_ess_classic * return_std elif bound == 'pdis-max-d2': # Discount factor if gamma >= 1: discounter = [ float(1 + 2 * (horizon - t - 1)) for t in range(0, horizon) ] else: def f(t): return pow(gamma, 2 * t) + ( 2 * pow(gamma, t) * (pow(gamma, t + 1) - pow(gamma, horizon))) / (1 - gamma) discounter = [f(t) for t in range(0, horizon)] discounter_tf = tf.constant(discounter) mean_episode_d2 = tf.reduce_sum( d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24) discounted_d2 = mean_episode_d2 * discounter_tf # Discounted d2 discounted_total_d2 = tf.reduce_sum(discounted_d2, axis=0) # Sum over time bound_ = w_return_mean - tf.sqrt( (1 - delta) * discounted_total_d2 / (delta * n_episodes)) * return_step_max elif bound == 'pdis-mean-d2': # Discount factor if gamma >= 1: discounter = [ float(1 + 2 * (horizon - t - 1)) for t in range(0, horizon) ] else: def f(t): return pow(gamma, 2 * t) + ( 2 * pow(gamma, t) * (pow(gamma, t + 1) - pow(gamma, horizon))) / (1 - gamma) discounter = [f(t) for t in range(0, horizon)] discounter_tf = tf.constant(discounter) mean_episode_d2 = tf.reduce_sum( d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24) discounted_d2 = mean_episode_d2 * discounter_tf # Discounted d2 discounted_total_d2 = tf.reduce_sum(discounted_d2, axis=0) # Sum over time bound_ = w_return_mean - tf.sqrt( (1 - delta) * discounted_total_d2 / (delta * n_episodes)) * return_step_mean else: raise NotImplementedError() # Policy entropy for exploration ent = pi.pd.entropy() meanent = tf.reduce_mean(ent) losses_with_name.append((meanent, 'MeanEntropy')) # Add policy entropy bonus if entropy != 'none': scheme, v1, v2 = entropy.split(':') if scheme == 'step': entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1), lambda: float(0.0)) losses_with_name.append((entcoeff, 'EntropyCoefficient')) entbonus = entcoeff * meanent bound_ = bound_ + entbonus elif scheme == 'lin': ip = tf.cast(iter_number_ / max_iters, tf.float32) entcoeff_decay = tf.maximum( 0.0, float(v2) + (float(v1) - float(v2)) * (1.0 - ip)) losses_with_name.append((entcoeff_decay, 'EntropyCoefficient')) entbonus = entcoeff_decay * meanent bound_ = bound_ + entbonus elif scheme == 'exp': ent_f = tf.exp( -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1) losses_with_name.append((ent_f, 'EntropyCoefficient')) bound_ = bound_ + ent_f * meanent else: raise Exception('Unrecognized entropy scheme.') losses_with_name.append((w_return_mean, 'ReturnMeanIW')) losses_with_name.append((bound_, 'Bound')) losses, loss_names = map(list, zip(*losses_with_name)) if use_natural_gradient: p = tf.placeholder(dtype=tf.float32, shape=[None]) target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1) grad_logprob = U.flatgrad( tf.stop_gradient(iwn) * 
target_logpdf_episode, var_list) dot_product = tf.reduce_sum(grad_logprob * p) hess_logprob = U.flatgrad(dot_product, var_list) compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob]) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assert_ops = tf.group(*tf.get_collection('asserts')) print_ops = tf.group(*tf.get_collection('prints')) compute_lossandgrad = U.function( [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_grad = U.function( [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_bound = U.function( [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], [bound_, assert_ops, print_ops]) compute_losses = U.function( [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], losses) #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, mask_], [ratio_cumsum, discounted_ratio]) set_parameter = U.SetFromFlat(var_list) get_parameter = U.GetFlat(var_list) if sampler is None: seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True) sampler = type("SequentialSampler", (object, ), { "collect": lambda self, _: seg_gen.__next__() })() U.initialize() # Starting optimizing episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=n_episodes) rewbuffer = deque(maxlen=n_episodes) while True: iters_so_far += 1 if render_after is not None and iters_so_far % render_after == 0: if hasattr(env, 'render'): render(env, pi, horizon) if callback: callback(locals(), globals()) if iters_so_far >= max_iters: print('Finised...') break logger.log('********** Iteration %i ************' % iters_so_far) theta = get_parameter() with timed('sampling'): seg = sampler.collect(theta) add_disc_rew(seg, gamma) lens, rets = seg['ep_lens'], seg['ep_rets'] lenbuffer.extend(lens) rewbuffer.extend(rets) episodes_so_far += len(lens) timesteps_so_far += sum(lens) args = ob, ac, rew, disc_rew, mask, iter_number = seg['ob'], seg[ 'ac'], seg['rew'], seg['disc_rew'], seg['mask'], iters_so_far assign_old_eq_new() def evaluate_loss(): loss = compute_bound(*args) return loss[0] def evaluate_gradient(): gradient = compute_grad(*args) return gradient[0] if use_natural_gradient: def evaluate_fisher_vector_prod(x): return compute_linear_operator(x, *args)[0] + fisher_reg * x def evaluate_natural_gradient(g): return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0) else: evaluate_natural_gradient = None with timed('summaries before'): logger.record_tabular("Iteration", iters_so_far) logger.record_tabular("InitialBound", evaluate_loss()) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if save_weights: logger.record_tabular('Weights', str(get_parameter())) import pickle file = open('checkpoint.pkl', 'wb') pickle.dump(theta, file) with timed("offline optimization"): theta, improvement = optimize_offline( theta, set_parameter, line_search, evaluate_loss, evaluate_gradient, evaluate_natural_gradient, max_offline_ite=max_offline_iters) set_parameter(theta) with timed('summaries after'): meanlosses = np.array(compute_losses(*args)) for (lossname, 
lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.dump_tabular() env.close()
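# NOTE: for intuition, the `iw_method == 'is'` branch above boils down to the
# following NumPy computation (names here are illustrative, not from the
# source). `iw` is a per-episode importance weight; the 'none' normalization
# divides by the number of episodes, while 'sn' self-normalizes by the sum of
# the weights:
import numpy as np

def is_return_estimate(target_logp, behavioral_logp, ep_return, self_normalize=False):
    """Importance-sampling estimate of the expected return under the target policy.

    target_logp, behavioral_logp: (n_episodes, horizon) per-step action log-densities.
    ep_return: (n_episodes,) discounted episode returns.
    """
    iw = np.exp((target_logp - behavioral_logp).sum(axis=1))  # per-episode weight
    iwn = iw / iw.sum() if self_normalize else iw / len(iw)
    return float((iwn * ep_return).sum())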
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation # CMAES max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, max_v_train_iter, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) seed, env_id): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy backup_pi = policy_fn( "backup_pi", ob_space, ac_space ) # Construct a network for every individual to adapt during the es evolution pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule layer_clip = tf.placeholder( name='layer_clip', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule bound_coeff = tf.placeholder( name='bound_coeff', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult * layer_clip # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - (oldpi.pd.logp(ac) + 1e-8)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) vf_losses = [vf_loss] vf_loss_names = ["vf_loss"] pol_loss = pol_surr + pol_entpen total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") ] pol_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") ] layer_var_list = [] for i in range(pi.num_hid_layers): layer_var_list.append([ v for v in pol_var_list if v.name.split("/")[2].startswith('fc%i' % (i + 1)) ]) logstd_var_list = [ v for v in pol_var_list if v.name.split("/")[2].startswith("logstd") ] if len(logstd_var_list) != 0: layer_var_list.append([ v for v in pol_var_list if v.name.split("/")[2].startswith("final") ] + logstd_var_list) vf_lossandgrad = U.function([ob, ac, ret, lrmult], vf_losses + [U.flatgrad(vf_loss, vf_var_list)]) lossandgrad = U.function([ob, ac, atarg, ret, lrmult, layer_clip], losses + [U.flatgrad(total_loss, var_list)]) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) adam = 
MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for ( backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables()) ]) assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v ) in zipsame(pi.get_variables(), backup_pi.get_variables()) ]) # Compute all losses compute_pol_losses = U.function([ob, ac, atarg, ret, lrmult, layer_clip], [pol_loss, pol_surr, pol_entpen, meankl]) compute_v_pred = U.function([ob], [pi.vpred]) a_prob = tf.exp(pi.pd.logp(ac)) compute_a_prob = U.function([ob, ac], [a_prob]) U.initialize() layer_set_operate_list = [] layer_get_operate_list = [] for var in layer_var_list: set_pi_layer_flat_params = U.SetFromFlat(var) layer_set_operate_list.append(set_pi_layer_flat_params) get_pi_layer_flat_params = U.GetFlat(var) layer_get_operate_list.append(get_pi_layer_flat_params) # get_pi_layer_flat_params = U.GetFlat(pol_var_list) # set_pi_layer_flat_params = U.SetFromFlat(pol_var_list) vf_adam.sync() adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness episodes_so_far = 0 timesteps_so_far = 0 ppo_timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards best_fitness = -np.inf eval_seq = traj_segment_generator_eval(pi, env, timesteps_per_actorbatch, stochastic=False) # eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic = True) # For evaluation seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_seq=eval_seq) # For train V Func assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" indices = [] # maintain all selected indices for each iteration opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 # opt['seed'] = seed opt['AdaptSigma'] = True # opt['bounds'] = bounds # opt['tolstagnation'] = 20 ess = [] seg = None segs = None sum_vpred = [] while True: if max_timesteps and timesteps_so_far >= max_timesteps: print("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: print("Max episodes") break elif max_iters and iters_so_far >= max_iters: print("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: print("Max time") break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps), 0) else: raise NotImplementedError # epsilon = max(0.5 - float(timesteps_so_far) / (max_timesteps), 0) * cur_lrmult epsilon = max(0.5 * cur_lrmult, 0) # epsilon = 0.2 sigma_adapted = max(sigma * cur_lrmult, 1e-8) # sigma_adapted = max(max(sigma - float(timesteps_so_far) / (5000 * max_timesteps), 0) * cur_lrmult, 1e-8) # cmean_adapted = max(1.0 - float(timesteps_so_far) / (max_timesteps), 1e-8) # cmean_adapted = max(0.8 - float(time˚steps_so_far) / (2*max_timesteps), 1e-8) # if timesteps_so_far % max_timesteps == 10: max_v_train_iter = int( max( max_v_train_iter * (1 - timesteps_so_far / (0.5 * max_timesteps)), 1)) logger.log("********** Iteration %i 
************" % iters_so_far) if iters_so_far == 0: eval_seg = eval_seq.__next__() rewbuffer.extend(eval_seg["ep_rets"]) lenbuffer.extend(eval_seg["ep_lens"]) result_record() # Repository Train train_segs = {} seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) if hasattr(pi, "ob_rms"): pi.ob_rms.update( seg["ob"]) # update running mean/std for normalization # rewbuffer.extend(seg["ep_rets"]) # lenbuffer.extend(seg["ep_lens"]) # # if iters_so_far == 0: # result_record() assign_old_eq_new() # set old parameter values to new parameter values if segs is None: segs = seg segs["v_target"] = np.zeros(len(seg["ob"]), 'float32') elif len(segs["ob"]) >= 50000: segs["ob"] = np.take(segs["ob"], np.arange(timesteps_per_actorbatch, len(segs["ob"])), axis=0) segs["next_ob"] = np.take(segs["next_ob"], np.arange(timesteps_per_actorbatch, len(segs["next_ob"])), axis=0) segs["ac"] = np.take(segs["ac"], np.arange(timesteps_per_actorbatch, len(segs["ac"])), axis=0) segs["rew"] = np.take(segs["rew"], np.arange(timesteps_per_actorbatch, len(segs["rew"])), axis=0) segs["vpred"] = np.take(segs["vpred"], np.arange(timesteps_per_actorbatch, len(segs["vpred"])), axis=0) segs["act_props"] = np.take(segs["act_props"], np.arange(timesteps_per_actorbatch, len(segs["act_props"])), axis=0) segs["new"] = np.take(segs["new"], np.arange(timesteps_per_actorbatch, len(segs["new"])), axis=0) segs["adv"] = np.take(segs["adv"], np.arange(timesteps_per_actorbatch, len(segs["adv"])), axis=0) segs["tdlamret"] = np.take(segs["tdlamret"], np.arange(timesteps_per_actorbatch, len(segs["tdlamret"])), axis=0) segs["ep_rets"] = np.take(segs["ep_rets"], np.arange(timesteps_per_actorbatch, len(segs["ep_rets"])), axis=0) segs["ep_lens"] = np.take(segs["ep_lens"], np.arange(timesteps_per_actorbatch, len(segs["ep_lens"])), axis=0) segs["v_target"] = np.take(segs["v_target"], np.arange(timesteps_per_actorbatch, len(segs["v_target"])), axis=0) segs["ob"] = np.append(segs['ob'], seg['ob'], axis=0) segs["next_ob"] = np.append(segs['next_ob'], seg['next_ob'], axis=0) segs["ac"] = np.append(segs['ac'], seg['ac'], axis=0) segs["rew"] = np.append(segs['rew'], seg['rew'], axis=0) segs["vpred"] = np.append(segs['vpred'], seg['vpred'], axis=0) segs["act_props"] = np.append(segs['act_props'], seg['act_props'], axis=0) segs["new"] = np.append(segs['new'], seg['new'], axis=0) segs["adv"] = np.append(segs['adv'], seg['adv'], axis=0) segs["tdlamret"] = np.append(segs['tdlamret'], seg['tdlamret'], axis=0) segs["ep_rets"] = np.append(segs['ep_rets'], seg['ep_rets'], axis=0) segs["ep_lens"] = np.append(segs['ep_lens'], seg['ep_lens'], axis=0) segs["v_target"] = np.append(segs['v_target'], np.zeros(len(seg["ob"]), 'float32'), axis=0) else: segs["ob"] = np.append(segs['ob'], seg['ob'], axis=0) segs["next_ob"] = np.append(segs['next_ob'], seg['next_ob'], axis=0) segs["ac"] = np.append(segs['ac'], seg['ac'], axis=0) segs["rew"] = np.append(segs['rew'], seg['rew'], axis=0) segs["vpred"] = np.append(segs['vpred'], seg['vpred'], axis=0) segs["act_props"] = np.append(segs['act_props'], seg['act_props'], axis=0) segs["new"] = np.append(segs['new'], seg['new'], axis=0) segs["adv"] = np.append(segs['adv'], seg['adv'], axis=0) segs["tdlamret"] = np.append(segs['tdlamret'], seg['tdlamret'], axis=0) segs["ep_rets"] = np.append(segs['ep_rets'], seg['ep_rets'], axis=0) segs["ep_lens"] = np.append(segs['ep_lens'], seg['ep_lens'], axis=0) segs["v_target"] = np.append(segs['v_target'], np.zeros(len(seg["ob"]), 'float32'), axis=0) if iters_so_far == 0: ob, ac, tdlamret = 
seg["ob"], seg["ac"], seg["tdlamret"] d = Dataset(dict(ob=ob, ac=ac, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Train V function # logger.log("Catchup Training V Func and Evaluating V Func Losses") for _ in range(max_v_train_iter): for batch in d.iterate_once(optim_batchsize): *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0))) else: # Update v target new = segs["new"] rew = segs["rew"] act_prob = np.asarray(compute_a_prob(segs["ob"], segs["ac"])).T importance_ratio = np.squeeze(act_prob) / ( segs["act_props"] + np.ones(segs["act_props"].shape) * 1e-8) segs["v_target"] = importance_ratio * (1 / np.sum(importance_ratio)) * \ np.squeeze( rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(segs["next_ob"])) # train_segs["v_target"] = rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(train_segs["next_ob"]) if len(segs["ob"]) >= 20000: train_times = int(max_v_train_iter / 2) if int(max_v_train_iter / 2) > 0 else 1 else: train_times = 2 for i in range(train_times): selected_train_index = np.random.choice( range(len(segs["ob"])), timesteps_per_actorbatch, replace=False) train_segs["ob"] = np.take(segs["ob"], selected_train_index, axis=0) train_segs["next_ob"] = np.take(segs["next_ob"], selected_train_index, axis=0) train_segs["ac"] = np.take(segs["ac"], selected_train_index, axis=0) train_segs["rew"] = np.take(segs["rew"], selected_train_index, axis=0) train_segs["vpred"] = np.take(segs["vpred"], selected_train_index, axis=0) train_segs["new"] = np.take(segs["new"], selected_train_index, axis=0) train_segs["adv"] = np.take(segs["adv"], selected_train_index, axis=0) train_segs["tdlamret"] = np.take(segs["tdlamret"], selected_train_index, axis=0) train_segs["v_target"] = np.take(segs["v_target"], selected_train_index, axis=0) # ob, ac, v_target = train_segs["ob"], train_segs[ "ac"], train_segs["v_target"] d = Dataset(dict(ob=ob, ac=ac, vtarg=v_target), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Train V function # logger.log("Training V Func and Evaluating V Func Losses") # Train V function # logger.log("Catchup Training V Func and Evaluating V Func Losses") # logger.log("Train V - "+str(_)) for _ in range(max_v_train_iter): for batch in d.iterate_once(optim_batchsize): *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0))) # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape) # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1]) # add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, v_target = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=v_target), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] # Local search for _ in range(optim_epochs): for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, 1 / 4) adam.update(g, optim_stepsize * cur_lrmult) # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape) # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1]) # add_vtarg_and_adv(seg, gamma, lam) ob_po, ac_po, 
atarg_po, tdlamret_po = seg["ob"], seg["ac"], seg[ "adv"], seg["tdlamret"] atarg_po = (atarg_po - atarg_po.mean()) / atarg_po.std( ) # standardized advantage function estimate # opt['CMA_cmean'] = cmean_adapted # assign_old_eq_new() # set old parameter values to new parameter values for i in range(len(layer_var_list)): # CMAES Train Policy assign_backup_eq_new() # backup current policy flatten_weights = layer_get_operate_list[i]() if len(indices) < len(layer_var_list): selected_index, init_weights = uniform_select( flatten_weights, 0.5) # 0.5 means 50% proportion of params are selected indices.append(selected_index) else: rand = np.random.uniform() # print("Random-Number:", rand) # print("Epsilon:", epsilon) if rand < epsilon: selected_index, init_weights = uniform_select( flatten_weights, 0.5) indices.append(selected_index) # logger.log("Random: select new weights") else: selected_index = indices[i] init_weights = np.take(flatten_weights, selected_index) es = cma.CMAEvolutionStrategy(init_weights, sigma_adapted, opt) while True: if es.countiter >= gensize: # logger.log("Max generations for current layer") break # logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter)) # logger.log("Sigma=" + str(es.sigma)) # solutions = es.ask(sigma_fac = max(cur_lrmult, 1e-8)) solutions = es.ask() # solutions = [np.clip(solution, -5.0, 5.0).tolist() for solution in solutions] costs = [] lens = [] assign_backup_eq_new() # backup current policy for id, solution in enumerate(solutions): np.put(flatten_weights, selected_index, solution) layer_set_operate_list[i](flatten_weights) cost = compute_pol_losses(ob_po, ac_po, atarg_po, tdlamret_po, cur_lrmult, 1 / 4 * (i + 1)) costs.append(cost[0]) assign_new_eq_backup() # Weights decay l2_decay = compute_weight_decay(0.01, solutions) costs += l2_decay costs, real_costs = fitness_rank(costs) # logger.log("real_costs:"+str(real_costs)) # best_solution = np.copy(es.result[0]) # best_fitness = -es.result[1] es.tell_real_seg(solutions=solutions, function_values=costs, real_f=real_costs, segs=None) # best_solution = np.copy(solutions[np.argmin(costs)]) # best_fitness = -real_costs[np.argmin(costs)] best_solution = es.result[0] best_fitness = es.result[1] np.put(flatten_weights, selected_index, best_solution) layer_set_operate_list[i](flatten_weights) # logger.log("Update the layer") # best_solution = es.result[0] # best_fitness = es.result[1] # logger.log("Best Solution Fitness:" + str(best_fitness)) # set_pi_flat_params(best_solution) import gc gc.collect() iters_so_far += 1 episodes_so_far += sum(lens)
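# NOTE: `fitness_rank` and `compute_weight_decay` used above are not defined in
# this excerpt. The sketches below are reconstructions in the style of common
# ES utilities (centered rank shaping as in OpenAI-ES, L2 penalty as in
# estool); the actual definitions and sign conventions may differ:
import numpy as np

def compute_weight_decay(weight_decay, solutions):
    """Per-solution L2 penalty; the caller must match its sign to the cost convention."""
    params = np.array(solutions)
    return weight_decay * np.mean(params * params, axis=1)

def fitness_rank(costs):
    """Centered rank transform of costs into [-0.5, 0.5]; returns (shaped, raw)."""
    costs = np.asarray(costs, dtype=np.float64)
    ranks = np.empty(len(costs))
    ranks[np.argsort(costs)] = np.arange(len(costs))
    shaped = ranks / (len(costs) - 1) - 0.5
    return shaped, costs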
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) identifier, save_result=True, save_interval=100, reward_list=[], cont=False, play=False, iter, action_repeat=1): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space mirror = hasattr(env, 'mirror_id') mirror_id = env.mirror_id if mirror else None pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) if mirror: mirror_ob = U.get_placeholder(name="mirror_ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) mirror_ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) sym_loss = 4 * tf.reduce_mean(tf.square(ac - mirror_ac)) if mirror else 0 total_loss = pol_surr + pol_entpen + vf_loss + sym_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] if mirror: losses.append(sym_loss) loss_names.append("sym_loss") var_list = pi.get_trainable_variables() inputs = [ob, ac, atarg, ret, lrmult] if mirror: inputs += [mirror_ob, mirror_ac] lossandgrad = U.function(inputs, losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function(inputs, losses) if play: return pi if cont: load_state(identifier, iter) else: U.initialize() iter = 0 adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, mirror_id=mirror_id, action_repeat=action_repeat) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = int(iter) tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_ori = deque(maxlen=100) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: 
if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError if MPI.COMM_WORLD.Get_rank() == 0: logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] if mirror: mirror_ob, mirror_ac = seg["mirror_ob"], seg["mirror_ac"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d_dict = dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret) if mirror: d_dict["mirror_ob"] = mirror_ob d_dict["mirror_ac"] = mirror_ac d = Dataset(d_dict, shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): batches = [ batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult ] if mirror: batches += [batch["mirror_ob"], batch["mirror_ac"]] *newlosses, g = lossandgrad(*batches) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) losses = [] for batch in d.iterate_once(optim_batchsize): batches = [ batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult ] if mirror: batches += [batch["mirror_ob"], batch["mirror_ac"]] newlosses = compute_losses(*batches) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_rets_ori"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, rews_ori = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) rewbuffer_ori.extend(rews_ori) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpRewOriMean", np.mean(rewbuffer_ori)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() reward_list.append(np.mean(rewbuffer_ori)) if save_result and iters_so_far % save_interval == 0: save_state(identifier, iters_so_far) save_rewards(reward_list, identifier, iters_so_far) logger.log('Model and reward saved') return pi
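# NOTE: the symmetry loss above penalizes the squared difference between the
# policy's action and the mirrored action that the rollout generator derives
# from env.mirror_id. The encoding of mirror_id is not shown here; one common
# convention (purely illustrative, not taken from this source) stores a signed,
# 1-based index permutation, where a negative entry means "take that dimension
# and flip its sign":
import numpy as np

def apply_signed_permutation(x, signed_perm):
    """Mirror a batch of observations/actions given signed 1-based indices."""
    idx = np.abs(signed_perm) - 1   # which source dimension feeds each output dim
    sign = np.sign(signed_perm)     # negative entries flip the sign of that dimension
    return sign * x[..., idx]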
def learn( env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, # 0 g_step, d_step, entcoeff, save_per_iter, # 1024 ckpt_dir, log_dir, timesteps_per_batch, task_name, robot_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, callback=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") ] vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 
0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) if robot_name == 'scara': summary_writer = tf.summary.FileWriter( '/home/yue/gym-gazebo/Tensorboard/scara', graph=tf.get_default_graph()) elif robot_name == 'mara': # summary_writer=tf.summary.FileWriter('/home/yue/gym-gazebo/Tensorboard/mara/down-home_position',graph=tf.get_default_graph()) summary_writer = tf.summary.FileWriter( '/home/yue/gym-gazebo/Tensorboard/mara/collisions_model/', graph=tf.get_default_graph()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy if nworkers != 1: g = allmean(compute_vflossandgrad(mbob, mbret)) else: g = compute_vflossandgrad(mbob, mbret) vfadam.update(g, vf_stepsize) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) batch_size = len(ob) // d_step d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) if nworkers != 1: d_adam.update(allmean(g), d_stepsize) else: d_adam.update(g, d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) g_loss_summary = tf.Summary(value=[ tf.Summary.Value(tag="g_loss", simple_value=np.mean(d_losses[0][0])) ]) summary_writer.add_summary(g_loss_summary, timesteps_so_far) d_loss_summary = tf.Summary(value=[ tf.Summary.Value(tag="d_loss", simple_value=np.mean(d_losses[0][1])) ]) summary_writer.add_summary(d_loss_summary, timesteps_so_far) entropy_summary = tf.Summary(value=[ tf.Summary.Value(tag="entropy", simple_value=np.mean(d_losses[0][2])) ]) summary_writer.add_summary(entropy_summary, timesteps_so_far) entropy_loss_summary = tf.Summary(value=[ tf.Summary.Value(tag="entropy_loss", simple_value=np.mean(d_losses[0][3])) ]) summary_writer.add_summary(entropy_loss_summary, timesteps_so_far) g_acc_summary = tf.Summary(value=[ tf.Summary.Value(tag="g_acc", simple_value=np.mean(d_losses[0][4])) ]) summary_writer.add_summary(g_acc_summary, timesteps_so_far) expert_acc_summary = tf.Summary(value=[ tf.Summary.Value(tag="expert_acc", simple_value=np.mean(d_losses[0][5])) ]) summary_writer.add_summary(expert_acc_summary, timesteps_so_far) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) summary = tf.Summary(value=[ tf.Summary.Value(tag="MeanDiscriminator", simple_value=np.mean(rewbuffer)) ]) summary_writer.add_summary(summary, timesteps_so_far) truesummary = tf.Summary(value=[ tf.Summary.Value(tag="MeanGenerator", simple_value=np.mean(true_rewbuffer)) ]) summary_writer.add_summary(truesummary, timesteps_so_far) true_rets_summary = tf.Summary(value=[ 
tf.Summary.Value(tag="Generator", simple_value=np.mean(true_rets)) ]) summary_writer.add_summary(true_rets_summary, timesteps_so_far) len_summary = tf.Summary(value=[ tf.Summary.Value(tag="Length", simple_value=np.mean(lenbuffer)) ]) summary_writer.add_summary(len_summary, timesteps_so_far) optimgain_summary = tf.Summary(value=[ tf.Summary.Value(tag="Optimgain", simple_value=np.mean(meanlosses[0])) ]) summary_writer.add_summary(optimgain_summary, timesteps_so_far) meankl_summary = tf.Summary(value=[ tf.Summary.Value(tag="Meankl", simple_value=np.mean(meanlosses[1])) ]) summary_writer.add_summary(meankl_summary, timesteps_so_far) entloss_summary = tf.Summary(value=[ tf.Summary.Value(tag="Entloss", simple_value=np.mean(meanlosses[2])) ]) summary_writer.add_summary(entloss_summary, timesteps_so_far) surrgain_summary = tf.Summary(value=[ tf.Summary.Value(tag="Surrgain", simple_value=np.mean(meanlosses[3])) ]) summary_writer.add_summary(surrgain_summary, timesteps_so_far) entropy_summary = tf.Summary(value=[ tf.Summary.Value(tag="Entropy", simple_value=np.mean(meanlosses[4])) ]) summary_writer.add_summary(entropy_summary, timesteps_so_far) epThisIter_summary = tf.Summary(value=[ tf.Summary.Value(tag="EpThisIter", simple_value=np.mean(len(lens))) ]) summary_writer.add_summary(epThisIter_summary, timesteps_so_far) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("MeanDiscriminator", np.mean(rewbuffer)) # Save model if robot_name == 'scara': if iters_so_far % save_per_iter == 0: if np.mean(rewbuffer) <= 200 or np.mean( true_rewbuffer) >= -100: task_name = str(iters_so_far) fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) if iters_so_far == 2000: break elif robot_name == 'mara': if iters_so_far % save_per_iter == 0: # if np.mean(rewbuffer) <= 300 or np.mean(true_rewbuffer) >= -400: task_name = str(iters_so_far) fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) if iters_so_far == 5000: break logger.record_tabular("MeanGenerator", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
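# NOTE: `reward_giver` above is the GAIL discriminator; in baselines'
# TransitionClassifier the surrogate reward handed to the generator is
# -log(1 - D(s, a)). A minimal sketch of that reward head, assuming a sigmoid
# discriminator output:
import numpy as np

def gail_surrogate_reward(d_logits):
    """Reward used in place of the env reward when training the generator."""
    d = 1.0 / (1.0 + np.exp(-d_logits))  # D(s, a) = P((s, a) came from the expert)
    return -np.log(1.0 - d + 1e-8)       # large when (s, a) looks expert-like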
def learn( env, policy_fn, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entc=0.5, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, i_trial): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) oldpi = policy_fn("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) entp = tf.placeholder(dtype=tf.float32, shape=[]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entp * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "loss_ent"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, entp], losses) compute_lossandgrad = U.function([ob, ac, atarg, entp], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() tf.global_variables_initializer() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, gamma=gamma) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards drwdsbuffer = deque(maxlen=40) 
assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1, \
        "Only one time constraint permitted"
    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)
        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        # entcoeff = max(entc - float(iters_so_far) / float(max_iters), 0.01)
        entcoeff = 0.0
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]  # subsample for cheaper Fisher-vector products

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args, entcoeff)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            print("Got zero gradient. Not updating.")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            # Backtracking line search: accept the step only if it improves the
            # surrogate and stays within the KL trust region.
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args, entcoeff)))
                improve = surr - surrbefore
                print("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    print("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    print("Violated KL constraint. Shrinking step.")
                elif improve < 0:
                    print("Surrogate didn't improve. Shrinking step.")
                else:
                    print("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                print("Couldn't compute a good step.")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                # Periodically verify that parameters stay in sync across workers
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.logkv(lossname, lossval)
        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False,
                                                         batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)
        logger.logkv("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_drwds"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, drwds = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        drwdsbuffer.extend(drwds)
        logger.logkv("EpLenMean", np.mean(lenbuffer))
        logger.logkv("EpRewMean", np.mean(rewbuffer))
        logger.logkv("EpThisIter", len(lens))
        logger.logkv("EpDRewMean", np.mean(drwdsbuffer))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.logkv("EpisodesSoFar", episodes_so_far)
        logger.logkv("TimestepsSoFar", timesteps_so_far)
        logger.logkv("TimeElapsed", time.time() - tstart)
        logger.logkv('trial', i_trial)
        logger.logkv("Iteration", iters_so_far)
        logger.logkv("Name", 'TRPO')
        if rank == 0:
            logger.dump_tabular()
def __init__(self, a_name, env, policy_func, par): self.env = env self.timesteps_per_batch = par.timesteps_per_batch self.max_kl = par.max_kl self.cg_iters = par.cg_iters self.gamma = par.gamma self.lam = par.lam # advantage estimation self.entcoeff = par.entcoeff self.cg_damping = par.cg_damping self.vf_stepsize = par.vf_stepsize self.vf_iters = par.vf_iters self.max_timesteps = par.max_timesteps self.max_episodes = par.max_episodes self.max_iters = par.max_iters self.callback = par.callback, # you can do anything in the callback, since it takes locals(), globals() self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- self.ob_space = self.env.observation_space self.ac_space = self.env.action_space self.pi = policy_func(a_name, self.ob_space, self.ac_space) self.oldpi = policy_func("oldpi" + a_name, self.ob_space, self.ac_space) self.atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) self.ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return self.ob = U.get_placeholder_cached(name="ob" + str(TRPO_agent_new.index2)) self.ac = self.pi.pdtype.sample_placeholder([None]) self.kloldnew = self.oldpi.pd.kl(self.pi.pd) self.ent = self.pi.pd.entropy() meankl = U.mean(self.kloldnew) meanent = U.mean(self.ent) entbonus = self.entcoeff * meanent self.vferr = U.mean(tf.square(self.pi.vpred - self.ret)) ratio = tf.exp(self.pi.pd.logp(self.ac) - self.oldpi.pd.logp(self.ac)) # advantage * pnew / pold surrgain = U.mean(ratio * self.atarg) optimgain = surrgain + entbonus self.losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] self.dist = meankl all_var_list = self.pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] self.vfadam = MpiAdam(vf_var_list) self.get_flat = U.GetFlat(var_list) self.set_from_flat = U.SetFromFlat(var_list) self.klgrads = tf.gradients(self.dist, var_list) self.flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan" + str(TRPO_agent_new.index2)) shapes = [var.get_shape().as_list() for var in var_list] start = 0 self.tangents = [] for shape in shapes: sz = U.intprod(shape) self.tangents.append( tf.reshape(self.flat_tangent[start:start + sz], shape)) start += sz self.gvp = tf.add_n([ U.sum(g * tangent) for (g, tangent) in zipsame(self.klgrads, self.tangents) ]) #pylint: disable=E1111 self.fvp = U.flatgrad(self.gvp, var_list) self.assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( self.oldpi.get_variables(), self.pi.get_variables()) ]) self.compute_losses = U.function([self.ob, self.ac, self.atarg], self.losses) self.compute_lossandgrad = U.function( [self.ob, self.ac, self.atarg], self.losses + [U.flatgrad(optimgain, var_list)]) self.compute_fvp = U.function( [self.flat_tangent, self.ob, self.ac, self.atarg], self.fvp) self.compute_vflossandgrad = U.function([self.ob, self.ret], U.flatgrad( self.vferr, vf_var_list)) TRPO_agent_new.index2 += 1 U.initialize() self.th_init = self.get_flat() MPI.COMM_WORLD.Bcast(self.th_init, root=0) self.set_from_flat(self.th_init) self.vfadam.sync() print("Init param sum", self.th_init.sum(), flush=True)
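# GetFlat/SetFromFlat and the flat_tangent reshaping above move between a
# list of parameter tensors and one flat vector. A NumPy sketch of that
# round trip (function names are illustrative):
import numpy as np

def flatten(params):
    """Concatenate a list of arrays into one flat vector (cf. U.GetFlat)."""
    return np.concatenate([p.ravel() for p in params])

def unflatten(flat, shapes):
    """Split a flat vector back into arrays of the given shapes
    (cf. U.SetFromFlat and the tangent reshaping loop above)."""
    params, start = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        params.append(flat[start:start + size].reshape(shape))
        start += size
    return params

theta = [np.arange(6.0).reshape(3, 2), np.array([7.0, 8.0])]
flat = flatten(theta)
restored = unflatten(flat, [(3, 2), (2,)])
assert all(np.array_equal(a, b) for a, b in zip(restored, theta))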
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) sym_loss_weight=0.0, return_threshold=None, # termiante learning if reaches return_threshold op_after_init=None, init_policy_params=None, policy_scope=None, max_threshold=None, positive_rew_enforce=False, reward_drop_bound=None, min_iters=0, ref_policy_params=None, rollout_length_thershold=None): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space if policy_scope is None: pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy else: pi = policy_func(policy_scope, ob_space, ac_space) # Construct network for new policy oldpi = policy_func("old" + policy_scope, ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent sym_loss = sym_loss_weight * U.mean( tf.square(pi.mean - pi.mirrored_mean)) # mirror symmetric loss ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) + sym_loss # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent, sym_loss] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent", "sym_loss"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() if init_policy_params is not None: cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name. 
find('/')] orig_scope = list(init_policy_params.keys() )[0][0:list(init_policy_params.keys())[0].find('/')] for i in range(len(pi.get_variables())): assign_op = pi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) assign_op = oldpi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) if ref_policy_params is not None: ref_pi = policy_func("ref_pi", ob_space, ac_space) cur_scope = ref_pi.get_variables()[0].name[0:ref_pi.get_variables()[0]. name.find('/')] orig_scope = list(ref_policy_params.keys() )[0][0:list(ref_policy_params.keys())[0].find('/')] for i in range(len(ref_pi.get_variables())): assign_op = ref_pi.get_variables()[i].assign( ref_policy_params[ref_pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) env.env.env.ref_policy = ref_pi adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" max_thres_satisfied = max_threshold is None adjust_ratio = 0.0 prev_avg_rew = -1000000 revert_parameters = {} variables = pi.get_variables() for i in range(len(variables)): cur_val = variables[i].eval() revert_parameters[variables[i].name] = cur_val revert_data = [0, 0, 0] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() if reward_drop_bound is not None: lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) revert_iteration = False if np.mean( rewbuffer ) < prev_avg_rew - reward_drop_bound: # detect significant drop in performance, revert to previous iteration print("Revert Iteration!!!!!") revert_iteration = True else: prev_avg_rew = np.mean(rewbuffer) logger.record_tabular("Revert Rew", prev_avg_rew) if revert_iteration: # revert iteration for i in range(len(pi.get_variables())): assign_op = pi.get_variables()[i].assign( revert_parameters[pi.get_variables()[i].name]) U.get_session().run(assign_op) episodes_so_far = revert_data[0] timesteps_so_far = revert_data[1] iters_so_far = revert_data[2] continue else: variables = pi.get_variables() for i in range(len(variables)): cur_val = variables[i].eval() revert_parameters[variables[i].name] = np.copy(cur_val) revert_data[0] = episodes_so_far revert_data[1] = timesteps_so_far revert_data[2] = iters_so_far if positive_rew_enforce: rewlocal = (seg["pos_rews"], seg["neg_pens"], seg["rew"] ) # local values listofrews = 
MPI.COMM_WORLD.allgather(rewlocal)  # list of tuples
            pos_rews, neg_pens, rews = map(flatten_lists, zip(*listofrews))
            if np.mean(rews) < 0.0:
                # min_id = np.argmin(rews)
                # adjust_ratio = pos_rews[min_id] / np.abs(neg_pens[min_id])
                adjust_ratio = np.max([adjust_ratio,
                                       np.mean(pos_rews) / np.abs(np.mean(neg_pens))])
                for i in range(len(seg["rew"])):
                    if np.abs(seg["rew"][i] - seg["pos_rews"][i] - seg["neg_pens"][i]) > 1e-5:
                        print(seg["rew"][i], seg["pos_rews"][i], seg["neg_pens"][i])
                        raise ValueError('Reward wrong!')
                    seg["rew"][i] = seg["pos_rews"][i] + seg["neg_pens"][i] * adjust_ratio
        add_vtarg_and_adv(seg, gamma, lam)
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                            batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))
        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"],
                                       batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        if reward_drop_bound is None:
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("Iter", iters_so_far)
        if positive_rew_enforce:
            if adjust_ratio is not None:
                logger.record_tabular("RewardAdjustRatio", adjust_ratio)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        if max_threshold is not None:
            print('Current max return: ', np.max(rewbuffer))
            if np.max(rewbuffer) > max_threshold:
                max_thres_satisfied = True
            else:
                max_thres_satisfied = False
        return_threshold_satisfied = True
        if return_threshold is not None:
            if not (np.mean(rewbuffer) > return_threshold and iters_so_far > min_iters):
                return_threshold_satisfied = False
        rollout_length_thershold_satisfied = True
        if rollout_length_thershold is not None:
            rewlocal = (seg["avg_vels"], seg["rew"])  # local values
            listofrews = MPI.COMM_WORLD.allgather(rewlocal)  # list of tuples
            avg_vels, rews = map(flatten_lists, zip(*listofrews))
            if not (np.mean(lenbuffer) > rollout_length_thershold
                    and np.mean(avg_vels) > 0.5 * env.env.env.final_tv):
                rollout_length_thershold_satisfied = False
        if rollout_length_thershold is not None or return_threshold is not None:
            if rollout_length_thershold_satisfied and return_threshold_satisfied:
                break
    return pi, np.mean(rewbuffer)
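# The reward_drop_bound branch above snapshots parameters every iteration and
# rolls back when the rolling mean reward drops too far. A minimal sketch of
# that guard, detached from TF (class and names are illustrative):
import numpy as np

class RevertGuard:
    def __init__(self, drop_bound):
        self.drop_bound = drop_bound
        self.prev_mean = -np.inf
        self.snapshot = None

    def step(self, params, mean_reward):
        """Return (params_to_use, reverted): snapshot on progress, revert on a drop."""
        if self.snapshot is not None and mean_reward < self.prev_mean - self.drop_bound:
            return self.snapshot, True     # significant drop: restore the snapshot
        self.prev_mean = mean_reward
        self.snapshot = np.copy(params)
        return params, False

guard = RevertGuard(drop_bound=50.0)
theta, reverted = guard.step(np.zeros(4), mean_reward=100.0)
theta, reverted = guard.step(np.ones(4), mean_reward=20.0)   # 20 < 100 - 50
assert reverted and np.array_equal(theta, np.zeros(4))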
def learn( env, make_policy, *, n_episodes, horizon, delta, gamma, max_iters, sampler=None, use_natural_gradient=False, #can be 'exact', 'approximate' fisher_reg=1e-2, iw_method='is', iw_norm='none', bound='J', line_search_type='parabola', save_weights=0, improvement_tol=0., center_return=False, render_after=None, max_offline_iters=100, callback=None, clipping=False, entropy='none', positive_return=False, reward_clustering='none', capacity=10, warm_start=True): np.set_printoptions(precision=3) max_samples = horizon * n_episodes if line_search_type == 'binary': line_search = line_search_binary elif line_search_type == 'parabola': line_search = line_search_parabola else: raise ValueError() # Building the environment ob_space = env.observation_space ac_space = env.action_space # Creating the memory buffer memory = Memory(capacity=capacity, batch_size=n_episodes, horizon=horizon, ob_space=ob_space, ac_space=ac_space) # Building the target policy and saving its parameters pi = make_policy('pi', ob_space, ac_space) all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split('/')[1].startswith('pol') ] shapes = [U.intprod(var.get_shape().as_list()) for var in var_list] n_parameters = sum(shapes) # Building a set of behavioral policies behavioral_policies = memory.build_policies(make_policy, pi) # Placeholders ob_ = ob = U.get_placeholder_cached(name='ob') ac_ = pi.pdtype.sample_placeholder([None], name='ac') mask_ = tf.placeholder(dtype=tf.float32, shape=(None), name='mask') rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='rew') disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='disc_rew') clustered_rew_ = tf.placeholder(dtype=tf.float32, shape=(None)) gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient') iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number') active_policies = tf.placeholder(dtype=tf.float32, shape=(capacity), name='active_policies') losses_with_name = [] # Total number of trajectories N_total = tf.reduce_sum(active_policies) * n_episodes # Split operations disc_rew_split = tf.reshape(disc_rew_ * mask_, [-1, horizon]) rew_split = tf.reshape(rew_ * mask_, [-1, horizon]) mask_split = tf.reshape(mask_, [-1, horizon]) # Policy densities target_log_pdf = pi.pd.logp(ac_) * mask_ target_log_pdf_split = tf.reshape(target_log_pdf, [-1, horizon]) behavioral_log_pdfs = tf.stack([ bpi.pd.logp(ac_) * mask_ for bpi in memory.policies ]) # Shape is (capacity, ntraj*horizon) behavioral_log_pdfs_split = tf.reshape(behavioral_log_pdfs, [memory.capacity, -1, horizon]) # Compute renyi divergencies and sum over time, then exponentiate emp_d2_split = tf.reshape( tf.stack([pi.pd.renyi(bpi.pd, 2) * mask_ for bpi in memory.policies]), [memory.capacity, -1, horizon]) emp_d2_split_cum = tf.exp(tf.reduce_sum(emp_d2_split, axis=2)) # Compute arithmetic and harmonic mean of emp_d2 emp_d2_mean = tf.reduce_mean(emp_d2_split_cum, axis=1) emp_d2_arithmetic = tf.reduce_sum( emp_d2_mean * active_policies) / tf.reduce_sum(active_policies) emp_d2_harmonic = tf.reduce_sum(active_policies) / tf.reduce_sum( 1 / emp_d2_mean) # Return processing: clipping, centering, discounting ep_return = clustered_rew_ #tf.reduce_sum(mask_split * disc_rew_split, axis=1) if clipping: rew_split = tf.clip_by_value(rew_split, -1, 1) if center_return: ep_return = ep_return - tf.reduce_mean(ep_return) rew_split = rew_split - (tf.reduce_sum(rew_split) / (tf.reduce_sum(mask_split) + 1e-24)) discounter = [pow(gamma, i) for i in range(0, 
horizon)] # Decreasing gamma discounter_tf = tf.constant(discounter) disc_rew_split = rew_split * discounter_tf # Reward statistics return_mean = tf.reduce_mean(ep_return) return_std = U.reduce_std(ep_return) return_max = tf.reduce_max(ep_return) return_min = tf.reduce_min(ep_return) return_abs_max = tf.reduce_max(tf.abs(ep_return)) return_step_max = tf.reduce_max(tf.abs(rew_split)) # Max step reward return_step_mean = tf.abs(tf.reduce_mean(rew_split)) positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split)) negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split)) return_step_maxmin = tf.abs(positive_step_return_max - negative_step_return_max) losses_with_name.extend([(return_mean, 'InitialReturnMean'), (return_max, 'InitialReturnMax'), (return_min, 'InitialReturnMin'), (return_std, 'InitialReturnStd'), (emp_d2_arithmetic, 'EmpiricalD2Arithmetic'), (emp_d2_harmonic, 'EmpiricalD2Harmonic'), (return_step_max, 'ReturnStepMax'), (return_step_maxmin, 'ReturnStepMaxmin')]) if iw_method == 'is': # Sum the log prob over time. Shapes: target(Nep, H), behav (Cap, Nep, H) target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1) behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdfs_split, axis=2) # To avoid numerical instability, compute the inversed ratio log_ratio = target_log_pdf_split - behavioral_log_pdfs_split inverse_log_ratio_episode = -tf.reduce_sum(log_ratio, axis=2) iw = 1 / tf.reduce_sum(tf.exp(inverse_log_ratio_episode) * tf.expand_dims(active_policies, -1), axis=0) # Compute also the balance-heuristic weights iw_split = tf.reshape(iw, (memory.capacity, -1)) iw_by_behavioral = tf.reduce_mean(iw_split, axis=1) losses_with_name.append( (iw_by_behavioral[0] / tf.reduce_sum(iw_by_behavioral), 'MultiIWFirstRatio')) losses_with_name.append( (tf.reduce_max(iw_by_behavioral), 'MultiIWMax')) losses_with_name.append( (tf.reduce_sum(iw_by_behavioral), 'MultiIWSum')) losses_with_name.append( (tf.reduce_min(iw_by_behavioral), 'MultiIWMin')) # Get the probability by exponentiation #target_pdf_episode = tf.exp(target_log_pdf_episode) #behavioral_pdf_episode = tf.exp(behavioral_log_pdf_episode) # Get the denominator by averaging over behavioral policies #behavioral_pdf_mixture = tf.reduce_mean(behavioral_pdf_episode, axis=0) + 1e-24 #iw = target_pdf_episode / behavioral_pdf_mixture iwn = iw / n_episodes # Compute the J w_return_mean = tf.reduce_sum(ep_return * iwn) # Empirical D2 of the mixture and relative ESS ess_renyi_arithmetic = N_total / emp_d2_arithmetic ess_renyi_harmonic = N_total / emp_d2_harmonic # Log quantities losses_with_name.extend([ (tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'), (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'), (tf.reduce_min(target_log_pdf_episode), 'MinTargetPdf'), (tf.reduce_min(behavioral_log_pdf_episode), 'MinBehavPdf'), (ess_renyi_arithmetic, 'ESSRenyiArithmetic'), (ess_renyi_harmonic, 'ESSRenyiHarmonic') ]) else: raise NotImplementedError() if bound == 'J': bound_ = w_return_mean elif bound == 'max-d2-harmonic': bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi_harmonic)) * return_abs_max elif bound == 'max-d2-arithmetic': bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi_arithmetic)) * return_abs_max else: raise NotImplementedError() # Policy entropy for exploration ent = pi.pd.entropy() meanent = tf.reduce_mean(ent) losses_with_name.append((meanent, 'MeanEntropy')) # Add policy entropy bonus if entropy != 'none': scheme, v1, v2 = entropy.split(':') if scheme 
== 'step': entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1), lambda: float(0.0)) losses_with_name.append((entcoeff, 'EntropyCoefficient')) entbonus = entcoeff * meanent bound_ = bound_ + entbonus elif scheme == 'lin': ip = tf.cast(iter_number_ / max_iters, tf.float32) entcoeff_decay = tf.maximum( 0.0, float(v2) + (float(v1) - float(v2)) * (1.0 - ip)) losses_with_name.append((entcoeff_decay, 'EntropyCoefficient')) entbonus = entcoeff_decay * meanent bound_ = bound_ + entbonus elif scheme == 'exp': ent_f = tf.exp( -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1) losses_with_name.append((ent_f, 'EntropyCoefficient')) bound_ = bound_ + ent_f * meanent else: raise Exception('Unrecognized entropy scheme.') losses_with_name.append((w_return_mean, 'ReturnMeanIW')) losses_with_name.append((bound_, 'Bound')) losses, loss_names = map(list, zip(*losses_with_name)) ''' if use_natural_gradient: p = tf.placeholder(dtype=tf.float32, shape=[None]) target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1) grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list) dot_product = tf.reduce_sum(grad_logprob * p) hess_logprob = U.flatgrad(dot_product, var_list) compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob]) ''' assert_ops = tf.group(*tf.get_collection('asserts')) print_ops = tf.group(*tf.get_collection('prints')) compute_lossandgrad = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_grad = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_bound = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], [bound_, assert_ops, print_ops]) compute_losses = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], losses) #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], [log_inverse_ratio, abc, iw]) set_parameter = U.SetFromFlat(var_list) get_parameter = U.GetFlat(var_list) policy_reinit = tf.variables_initializer(var_list) if sampler is None: seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True, gamma=gamma) sampler = type("SequentialSampler", (object, ), { "collect": lambda self, _: seg_gen.__next__() })() U.initialize() # Starting optimizing episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=n_episodes) rewbuffer = deque(maxlen=n_episodes) while True: iters_so_far += 1 if render_after is not None and iters_so_far % render_after == 0: if hasattr(env, 'render'): render(env, pi, horizon) if callback: callback(locals(), globals()) if iters_so_far >= max_iters: print('Finished...') break logger.log('********** Iteration %i ************' % iters_so_far) theta = get_parameter() with timed('sampling'): seg = sampler.collect(theta) lens, rets = seg['ep_lens'], seg['ep_rets'] lenbuffer.extend(lens) rewbuffer.extend(rets) episodes_so_far += len(lens) timesteps_so_far += sum(lens) # Adding batch of trajectories to memory memory.add_trajectory_batch(seg) # Get multiple batches from memory seg_with_memory = memory.get_trajectories() # Get clustered reward reward_matrix = np.reshape( seg_with_memory['disc_rew'] * seg_with_memory['mask'], (-1, horizon)) ep_reward = np.sum(reward_matrix, 
axis=1)
        ep_reward = cluster_rewards(ep_reward, reward_clustering)
        args = ob, ac, rew, disc_rew, clustered_rew, mask, iter_number, active_policies = (
            seg_with_memory['ob'], seg_with_memory['ac'],
            seg_with_memory['rew'], seg_with_memory['disc_rew'],
            ep_reward, seg_with_memory['mask'],
            iters_so_far, memory.get_active_policies_mask())

        def evaluate_loss():
            loss = compute_bound(*args)
            return loss[0]

        def evaluate_gradient():
            gradient = compute_grad(*args)
            return gradient[0]

        if use_natural_gradient:
            def evaluate_fisher_vector_prod(x):
                return compute_linear_operator(x, *args)[0] + fisher_reg * x

            def evaluate_natural_gradient(g):
                return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0)
        else:
            evaluate_natural_gradient = None

        with timed('summaries before'):
            logger.record_tabular("Iteration", iters_so_far)
            logger.record_tabular("InitialBound", evaluate_loss())
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)

        if save_weights > 0 and iters_so_far % save_weights == 0:
            logger.record_tabular('Weights', str(get_parameter()))
            import pickle
            # Use a context manager so the checkpoint file is closed deterministically
            with open('checkpoint' + str(iters_so_far) + '.pkl', 'wb') as checkpoint_file:
                pickle.dump(theta, checkpoint_file)

        if not warm_start or memory.get_current_load() == capacity:
            # Optimize
            with timed("offline optimization"):
                theta, improvement = optimize_offline(
                    theta, set_parameter, line_search,
                    evaluate_loss, evaluate_gradient, evaluate_natural_gradient,
                    max_offline_ite=max_offline_iters)
            set_parameter(theta)
            print(theta)
            with timed('summaries after'):
                meanlosses = np.array(compute_losses(*args))
                for (lossname, lossval) in zip(loss_names, meanlosses):
                    logger.record_tabular(lossname, lossval)
        else:
            # Reinitialize the policy
            tf.get_default_session().run(policy_reinit)

        logger.dump_tabular()
    env.close()
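# The iw_method == 'is' branch above forms per-episode weights for a mixture
# of behavioral policies through the inverse ratio, so only log-space
# differences are exponentiated. A NumPy sketch of that computation with toy
# inputs (assumes all k behavioral policies are active; the 1/n normalization
# lives in iwn = iw / n_episodes above):
import numpy as np

def mixture_importance_weights(target_logp, behavioral_logp):
    """target_logp: (n,) episode log-probs under the target policy.
    behavioral_logp: (k, n) episode log-probs under k behavioral policies.
    Returns w_i = p_target(tau_i) / sum_j p_j(tau_i)."""
    inverse_log_ratio = behavioral_logp - target_logp[None, :]
    return 1.0 / np.sum(np.exp(inverse_log_ratio), axis=0)

tgt = np.log(np.array([0.2, 0.5, 0.1]))
beh = np.log(np.array([[0.2, 0.4, 0.3],
                       [0.1, 0.6, 0.2]]))
w = mixture_importance_weights(tgt, beh)
assert np.allclose(w, np.exp(tgt) / np.exp(beh).sum(axis=0))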
def learn( env, genv, i_trial, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return gpi = policy_fn("gpi", ob_space, ac_space) # Construct network for new policy goldpi = policy_fn("goldpi", ob_space, ac_space) # Network for old policy gatarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) gret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) # gob = U.get_placeholder_cached(name='ob') gac = gpi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent gkloldnew = goldpi.pd.kl(gpi.pd) gent = gpi.pd.entropy() gmeankl = tf.reduce_mean(gkloldnew) gmeanent = tf.reduce_mean(gent) gpol_entpen = (-entcoeff) * gmeanent ratio = tf.exp(pi.pd.logp(gac) - goldpi.pd.logp(gac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] gratio = tf.exp(gpi.pd.logp(ac) - oldpi.pd.logp(ac)) gsurr1 = gratio * gatarg gsurr2 = tf.clip_by_value(gratio, 1.0 - clip_param, 1.0 + clip_param) * gatarg gpol_surr = -tf.reduce_mean(tf.minimum(gsurr1, gsurr2)) gvf_loss = tf.reduce_mean(tf.square(gpi.vpred - gret)) gtotal_loss = gpol_surr + gpol_entpen + gvf_loss glosses = [gpol_surr, gpol_entpen, gvf_loss, gmeankl, gmeanent] gloss_names = ["gpol_surr", "gpol_entpen", "gvf_loss", "gkl", "gent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, gac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) gvar_list = gpi.get_trainable_variables() glossandgrad = U.function([ob, ac, gatarg, gret, lrmult], glosses + [U.flatgrad(gtotal_loss, gvar_list)]) gadam = MpiAdam(gvar_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) gassign_old_eq_new = U.function( [], [], updates=[ 
tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(goldpi.get_variables(), gpi.get_variables())
        ])
    compute_losses = U.function([ob, gac, atarg, ret, lrmult], losses)
    gcompute_losses = U.function([ob, ac, gatarg, gret, lrmult], glosses)

    U.initialize()
    adam.sync()
    gadam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, gpi, env, timesteps_per_actorbatch, stochastic=True)
    gseg_gen = traj_segment_generator(gpi, pi, genv, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)   # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)   # rolling buffer for episode rewards
    glenbuffer = deque(maxlen=100)  # rolling buffer for guided-policy episode lengths
    grewbuffer = deque(maxlen=100)  # rolling buffer for guided-policy episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, \
        "Only one time constraint permitted"

    def standardize(value):
        return (value - value.mean()) / value.std()

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        print("********** Iteration %i ************" % iters_so_far)
        print("********** Guided Policy ************")
        gseg = gseg_gen.__next__()
        add_vtarg_and_adv(gseg, gamma, lam)
        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        gob, gac, gatarg, gatarg_, gtdlamret, gtdlamret_, gvpredbefore, gvpredbefore_ = \
            gseg["ob"], gseg["ac"], gseg["adv"], gseg["adv_"], \
            gseg["tdlamret"], gseg["tdlamret_"], gseg["vpred"], gseg["vpred_"]
        # standardize() is pure, so its result must be assigned back
        gatarg_ = standardize(gatarg_)
        gatarg = standardize(gatarg)
        gd = Dataset(dict(gob=gob, gac=gac, gatarg=gatarg, gatarg_=gatarg_,
                          gvtarg=gtdlamret, gvtarg_=gtdlamret_),
                     shuffle=not gpi.recurrent)

        ob, ac, atarg, atarg_, tdlamret, tdlamret_, vpredbefore, vpredbefore_ = \
            seg["ob"], seg["ac"], seg["adv"], seg["adv_"], \
            seg["tdlamret"], seg["tdlamret_"], seg["vpred"], seg["vpred_"]
        atarg = standardize(atarg)
        atarg_ = standardize(atarg_)
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, atarg_=atarg_,
                         vtarg=tdlamret, vtarg_=tdlamret_),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(gpi, "ob_rms"):
            gpi.ob_rms.update(ob)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(gob)  # update running mean/std for policy

        gassign_old_eq_new()
        print("Optimizing...Guided Policy")
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            glosses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = glossandgrad(batch["ob"], batch["ac"],
                                             batch["atarg_"], batch["vtarg_"], cur_lrmult)
                gadam.update(g, optim_stepsize * cur_lrmult)
                glosses.append(newlosses)
        glosses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = gcompute_losses(batch["ob"], batch["ac"],
                                        batch["atarg_"], batch["vtarg_"], cur_lrmult)
            glosses.append(newlosses)
        gmeanlosses, _, _ = mpi_moments(glosses, axis=0)
        for (lossval, name) in
zipsame(gmeanlosses, gloss_names): logger.record_tabular("gloss_" + name, lossval) # logger.record_tabular("gev_tdlam_before", explained_variance(vpredbefore, tdlamret)) assign_old_eq_new() # set old parameter values to new parameter values print("Optimizing...Training Policy") # print(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data optim_batchsize = optim_batchsize or gob.shape[0] for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in gd.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["gob"], batch["gac"], batch["gatarg_"], batch["gvtarg_"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) # print(fmt_row(13, np.mean(losses, axis=0))) # print("Evaluating losses...") losses = [] for batch in gd.iterate_once(optim_batchsize): newlosses = compute_losses(batch["gob"], batch["gac"], batch["gatarg_"], batch["gvtarg_"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) # print(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) glrlocal = (gseg["ep_lens"], gseg["ep_rets"]) # local values glistoflrpairs = MPI.COMM_WORLD.allgather(glrlocal) # list of tuples glens, grews = map(flatten_lists, zip(*glistoflrpairs)) # lenbuffer.extend(lens) rewbuffer.extend(rews) grewbuffer.extend(grews) # logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("GEpRewMean", np.mean(grewbuffer)) # logger.record_tabular("EpThisIter", len(lens)) # episodes_so_far += len(lens) # timesteps_so_far += sum(lens) iters_so_far += 1 # logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.logkv('trial', i_trial) logger.logkv("Iteration", iters_so_far) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular()
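# Both segments above pass through add_vtarg_and_adv before the PPO epochs.
# A minimal single-episode sketch of that generalized advantage estimation
# (assumes no episode-boundary flags; names are illustrative):
import numpy as np

def gae(rewards, values, last_value, gamma, lam):
    T = len(rewards)
    adv = np.zeros(T)
    vals = np.append(values, last_value)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * vals[t + 1] - vals[t]   # TD residual
        lastgaelam = delta + gamma * lam * lastgaelam
        adv[t] = lastgaelam
    return adv, adv + values   # advantages and TD(lambda) return targets

adv, tdlamret = gae(np.ones(5), np.zeros(5), last_value=0.0, gamma=0.99, lam=0.95)
assert adv.shape == tdlamret.shape == (5,)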
def enjoy( env, policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) save_name=None, save_per_acts=3, reload_name=None): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() if reload_name: saver = tf.train.Saver() saver.restore(tf.get_default_session(), reload_name) print("Loaded model successfully.") # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg 
= seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
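# enjoy() restores weights with tf.train.Saver when reload_name is given. A
# self-contained TF1-style save/restore round trip (the variable and path
# below are illustrative):
import os
import tensorflow as tf

tf.reset_default_graph()
v = tf.get_variable("demo/w", shape=[2], initializer=tf.ones_initializer())
saver = tf.train.Saver()
os.makedirs("/tmp/enjoy_demo", exist_ok=True)  # Saver.save needs an existing parent dir
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    path = saver.save(sess, "/tmp/enjoy_demo/model.ckpt")
with tf.Session() as sess:
    saver.restore(sess, path)   # restored variables need no extra initialization
    assert (sess.run(v) == 1.0).all()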
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) num_options=1, app='', saves=False, wsaves=False, epoch=-1, seed=1, dc=0): optim_batchsize_ideal = optim_batchsize np.random.seed(seed) tf.set_random_seed(seed) env.seed(seed) ### Book-keeping gamename = env.spec.id[:-3].lower() gamename += 'seed' + str(seed) gamename += app version_name = 'FINAL_NORM-ACT-LOWER-LR-len-400-wNoise-update1-ppo-ESCH-1-2-5-nI' dirname = '{}_{}_{}opts_saves/'.format(version_name, gamename, num_options) print(dirname) #input ("wait here after dirname") if wsaves: first = True if not os.path.exists(dirname): os.makedirs(dirname) first = False # while os.path.exists(dirname) and first: # dirname += '0' files = ['pposgd_simple.py', 'mlp_policy.py', 'run_mujoco.py'] first = True for i in range(len(files)): src = os.path.join( '/home/nfunk/Code_MA/ppoc_off_tryout/baselines/baselines/ppo1/' ) + files[i] print(src) #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname dest = dirname + "src_code/" if (first): os.makedirs(dest) first = False print(dest) shutil.copy2(src, dest) # brute force copy normal env file at end of copying process: src = os.path.join( '/home/nfunk/Code_MA/ppoc_off_tryout/nfunk/envs_nf/pendulum_nf.py') shutil.copy2(src, dest) ### # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space max_action = env.action_space.high # add the dimension in the observation space! 
ob_space.shape = ((ob_space.shape[0] + ac_space.shape[0]), ) print(ob_space.shape) print(ac_space.shape) #input ("wait here where the spaces are printed!!!") pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return pol_ov_op_ent = tf.placeholder(dtype=tf.float32, shape=None) # Empirical return # option = tf.placeholder(dtype=tf.int32, shape=[None]) lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # pdb.set_trace() ob = U.get_placeholder_cached(name="ob") option = U.get_placeholder_cached(name="option") term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None]) ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold atarg_clip = atarg #tf.clip_by_value(atarg,-10,10) surr1 = ratio * atarg_clip #atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg_clip #atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) #vf_loss = U.mean(tf.square(tf.clip_by_value(pi.vpred - ret, -10.0, 10.0))) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] term_loss = pi.tpred * term_adv force_pi_loss = U.mean( tf.square( tf.clip_by_value(pi.op_pi, 1e-5, 1.0) - tf.constant([[0.05, 0.95]]))) log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0)) #log_pi = tf.Print(log_pi, [log_pi, tf.shape(tf.transpose(log_pi))]) old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0)) entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1) ratio_pol_ov_op = tf.exp( tf.transpose(log_pi)[option[0]] - tf.transpose(old_log_pi)[option[0]]) # pnew / pold term_adv_clip = term_adv #tf.clip_by_value(term_adv,-10,10) surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip # surrogate from conservative policy iteration surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param, 1.0 + clip_param) * term_adv_clip # pol_surr_pol_ov_op = -U.mean( tf.minimum(surr1_pol_ov_op, surr2_pol_ov_op)) # PPO's pessimistic surrogate (L^CLIP) op_loss = pol_surr_pol_ov_op - pol_ov_op_ent * tf.reduce_sum(entropy) #op_loss = pol_surr_pol_ov_op #total_loss += force_pi_loss total_loss += op_loss var_list = pi.get_trainable_variables() term_list = var_list[6:8] lossandgrad = U.function( [ob, ac, atarg, ret, lrmult, option, term_adv, pol_ov_op_ent], losses + [U.flatgrad(total_loss, var_list)]) termloss = U.function([ob, option, term_adv], [U.flatgrad(term_loss, var_list) ]) # Since we will use a different step size. 
adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses) U.initialize() adam.sync() saver = tf.train.Saver(max_to_keep=10000) saver_best = tf.train.Saver(max_to_keep=1) ### More book-kepping results = [] if saves: results = open( version_name + '_' + gamename + '_' + str(num_options) + 'opts_' + '_results.csv', 'w') results_best_model = open( dirname + version_name + '_' + gamename + '_' + str(num_options) + 'opts_' + '_bestmodel.csv', 'w') out = 'epoch,avg_reward' for opt in range(num_options): out += ',option {} dur'.format(opt) for opt in range(num_options): out += ',option {} std'.format(opt) for opt in range(num_options): out += ',option {} term'.format(opt) for opt in range(num_options): out += ',option {} adv'.format(opt) out += '\n' results.write(out) # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n') results.flush() if epoch >= 0: dirname = '{}_{}opts_saves/'.format(gamename, num_options) print("Loading weights from iteration: " + str(epoch)) filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch) saver.restore(U.get_session(), filename) ### episodes_so_far = 0 timesteps_so_far = 0 global iters_so_far iters_so_far = 0 des_pol_op_ent = 0.1 max_val = -100000 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options, saves=saves, results=results, rewbuffer=rewbuffer, dc=dc) datas = [0 for _ in range(num_options)] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) opt_d = [] for i in range(num_options): dur = np.mean( seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0. opt_d.append(dur) std = [] for i in range(num_options): logstd = np.mean( seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0. 
std.append(np.exp(logstd)) print("mean opt dur:", opt_d) print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0)) print("mean term p:", np.mean(np.array(seg['term_p']), axis=0)) print("mean value val:", np.mean(np.array(seg['value_val']), axis=0)) ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy if hasattr(pi, "ob_rms_only"): pi.ob_rms_only.update(ob[:, :-ac_space.shape[0]] ) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values if (iters_so_far + 1) % 1000 == 0: des_pol_op_ent = des_pol_op_ent / 10 if iters_so_far % 50 == 0 and wsaves: print("weights are saved...") filename = dirname + '{}_epoch_{}.ckpt'.format( gamename, iters_so_far) save_path = saver.save(U.get_session(), filename) # adaptively save best run: if (np.mean(rewbuffer) > max_val) and wsaves: max_val = np.mean(rewbuffer) results_best_model.write('epoch: ' + str(iters_so_far) + 'rew: ' + str(np.mean(rewbuffer)) + '\n') results_best_model.flush() filename = dirname + 'best.ckpt'.format(gamename, iters_so_far) save_path = saver_best.save(U.get_session(), filename) min_batch = 160 # Arbitrary t_advs = [[] for _ in range(num_options)] for opt in range(num_options): indices = np.where(opts == opt)[0] print("batch size:", indices.size) opt_d[opt] = indices.size if not indices.size: t_advs[opt].append(0.) continue ### This part is only necessasry when we use options. We proceed to these verifications in order not to discard any collected trajectories. if datas[opt] != 0: if (indices.size < min_batch and datas[opt].n > min_batch): datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue elif indices.size + datas[opt].n < min_batch: # pdb.set_trace() oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) t_advs[opt].append(0.) 
continue elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch) or (indices.size > min_batch and datas[opt].n < min_batch): oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) if (indices.size > min_batch and datas[opt].n > min_batch): datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) elif datas[opt] == 0: datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) ### optim_batchsize = optim_batchsize or ob.shape[0] optim_epochs = np.clip( np.int(10 * (indices.size / (timesteps_per_batch / num_options))), 10, 10) if num_options > 1 else optim_epochs print("optim epochs:", optim_epochs) logger.log("Optimizing...") # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): #tadv,nodc_adv = pi.get_term_adv(batch["ob"],[opt]) tadv, nodc_adv = pi.get_opt_adv(batch["ob"], [opt]) tadv = tadv if num_options > 1 else np.zeros_like(tadv) t_advs[opt].append(nodc_adv) #if (opt==1): # *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) #else: # *newlosses, grads = lossandgrad0(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv, des_pol_op_ent) #*newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) #termg = termloss(batch["ob"], [opt], tadv) #adam.update(termg[0], 5e-7 * cur_lrmult) adam.update(grads, optim_stepsize * cur_lrmult) losses.append(newlosses) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() ### Book keeping if saves: out = "{},{}" for _ in range(num_options): out += ",{},{},{},{}" out += "\n" info = [iters_so_far, np.mean(rewbuffer)] for i in range(num_options): info.append(opt_d[i]) for i in range(num_options): info.append(std[i]) for i in range(num_options): info.append(np.mean(np.array(seg['term_p']), axis=0)[i]) for i in range(num_options): info.append(np.mean(t_advs[i])) results.write(out.format(*info)) results.flush()
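# The option branch above pools an option's transitions across iterations
# until at least min_batch of them exist before optimizing. A small sketch of
# that accumulate-or-flush policy (a dict of arrays stands in for Dataset;
# names and thresholds are illustrative):
import numpy as np

def merge_option_batch(buffer, new_data, min_batch):
    merged = ({k: np.concatenate([buffer[k], new_data[k]]) for k in new_data}
              if buffer else new_data)
    if len(merged["ob"]) < min_batch:
        return merged, None    # too few samples: keep accumulating, skip update
    return {}, merged          # enough samples: flush and train on merged batch

buf, batch = merge_option_batch({}, {"ob": np.zeros((100, 4))}, min_batch=160)
assert batch is None
buf, batch = merge_option_batch(buf, {"ob": np.zeros((80, 4))}, min_batch=160)
assert batch is not None and len(batch["ob"]) == 180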
def learn(env, policy_fn, *,
          timesteps_per_actorbatch,  # timesteps per actor per update
          clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          gamma, lam,  # advantage estimation
          max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
          callback=None,  # you can do anything in the callback, since it takes locals(), globals()
          adam_epsilon=1e-5,
          schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
          ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    ## losses + [U.flatgrad(total_loss, var_list)] -- how are these added together?
    ## (Plain Python list concatenation: the outputs are the losses followed by the flat gradient.)
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)
    test_a = U.function([ob, ac, atarg, ret, lrmult], [
        kloldnew, ent, meankl, meanent, pol_entpen,
        pi.pd.logp(ac), oldpi.pd.logp(ac), ratio, surr1, surr2, pi.vpred
    ])

    ####################
    pi_parms = U.function([], var_list)
    old_list = oldpi.get_trainable_variables()
    old_parms = U.function([], old_list)
    ####################

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        # print("ac", np.shape(seg["ac"]), seg["ac"])
        # print("rew", np.shape(seg["rew"]), seg["rew"])
        # print("vpred", np.shape(seg["vpred"]), seg["vpred"])
        # print("new", np.shape(seg["new"]), seg["new"])
        # print("prevac", np.shape(seg["prevac"]), seg["prevac"])
        # print("nextvpred", np.shape(seg["nextvpred"]), seg["nextvpred"])
        # print("ep_rets", np.shape(seg["ep_rets"]), seg["ep_rets"])
        # print("ep_lens", np.shape(seg["ep_lens"]), seg["ep_lens"])

        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    deterministic=pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                # Debug (disabled): inspect intermediate tensors via test_a
                # kloldnew, ent, meankl, meanent, pol_entpen, piac, oldpiac, ratio, surr1, surr2, pivpred = \
                #     test_a(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                # Debug (disabled): dumping parameter sums every minibatch floods
                # the log and slows training.
                # for p in pi_parms():
                #     print("pi", np.sum(p))
                # for p in old_parms():
                #     print("old", np.sum(p))
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
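# A minimal usage sketch for the PPO learn() above, in the style of the
# baselines MuJoCo runner. The import path, environment id, and hyperparameters
# are illustrative assumptions, not taken from this file.
import gym
from baselines.ppo1 import mlp_policy  # assumed helper, as in OpenAI baselines

def policy_fn(name, ob_space, ac_space):
    # Two hidden layers of 64 units, as commonly used for MuJoCo benchmarks.
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

env = gym.make("Hopper-v2")  # illustrative environment
pi = learn(env, policy_fn,
           timesteps_per_actorbatch=2048,
           clip_param=0.2, entcoeff=0.0,
           optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
           gamma=0.99, lam=0.95,
           max_timesteps=1000000, schedule='linear')
env.close()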
def learn(env, policy_func, *,
          timesteps_per_batch,  # what to train on
          max_kl, cg_iters,
          gamma, lam,  # advantage estimation
          entcoeff=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
          callback=None
          ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False,
                                                         batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
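# The TRPO loop above calls cg(fisher_vector_product, g, ...) to solve F x = g
# for the step direction, then rescales it so the quadratic form 0.5 * x^T F x
# equals max_kl (via lm = sqrt(shs / max_kl)). The solver itself is not shown in
# this file; below is a minimal conjugate-gradient sketch in the spirit of
# baselines' common cg helper, not its exact implementation.
import numpy as np

def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10, verbose=False):
    """Solve A x = b given only the matrix-vector product f_Ax
    (here, the damped Fisher-vector product)."""
    x = np.zeros_like(b)
    r = b.copy()   # residual b - A x, with x initialized to zero
    p = r.copy()   # search direction
    rdotr = r.dot(r)
    for i in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        newrdotr = r.dot(r)
        if verbose:
            print("cg iter %d residual %.2e" % (i, newrdotr))
        if newrdotr < residual_tol:
            break
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
    return x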