def learn(
        env,
        test_env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    normalizer = Normalizer(1)

    # Prepare for rollouts
    # ----------------------------------------
    eval_gen = traj_segment_generator_eval(
        pi, test_env, timesteps_per_actorbatch, stochastic=False)
    seg_gen = traj_segment_generator(
        pi, env, timesteps_per_actorbatch, stochastic=True,
        normalizer=normalizer)

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]
    ) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        eval_seg = eval_gen.__next__()
        lrlocal = (eval_seg["ep_lens"], eval_seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        if timesteps_so_far == 0:
            result_record()

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(
            dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
            shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # logger.log("Current Iteration Training Performance:" + str(np.mean(seg["ep_rets"])))
        # logger.log("Evaluating losses...")
        # losses = []
        # for batch in d.iterate_once(optim_batchsize):
        #     newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
        #     losses.append(newlosses)
        # meanlosses, _, _ = mpi_moments(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))
        # for (lossval, name) in zipsame(meanlosses, loss_names):
        #     logger.record_tabular("loss_" + name, lossval)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        # logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        # timesteps_so_far += sum(lens)
        # if iters_so_far == 0:
        #     result_record()
        iters_so_far += 1
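# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original code): a minimal driver
# for the PPO-style `learn` above, assuming a Gym MuJoCo-style environment and
# a baselines MlpPolicy. The environment id, network sizes, and hyperparameter
# values are illustrative assumptions, not values taken from this code.
def _example_run_ppo():
    import gym
    import baselines.common.tf_util as U
    from baselines.ppo1 import mlp_policy

    def policy_fn(name, ob_space, ac_space):
        # Two hidden layers of 64 units, as in the baselines PPO1 defaults.
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    with U.make_session(num_cpu=1):
        env = gym.make("Hopper-v2")       # training environment (assumed id)
        test_env = gym.make("Hopper-v2")  # held-out evaluation environment
        learn(env, test_env, policy_fn,
              timesteps_per_actorbatch=2048,
              clip_param=0.2, entcoeff=0.0,
              optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
              gamma=0.99, lam=0.95,
              max_timesteps=int(1e6), schedule='linear')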
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        shift=0,
        schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy

    td_v_target = tf.placeholder(dtype=tf.float32, shape=[1, 1])  # V target
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    G_t_inv = tf.placeholder(dtype=tf.float32, shape=[None, None])
    alpha = tf.placeholder(dtype=tf.float32, shape=[1])

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([])
    adv = tf.placeholder(dtype=tf.float32, shape=[1, 1])
    step = tf.placeholder(dtype=tf.float32, shape=[1])

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - td_v_target))
    vf_losses = [vf_loss]
    vf_loss_names = ["vf_loss"]

    pol_loss = -tf.reduce_mean(adv * pi.pd.logp(ac))
    pol_losses = [pol_loss]
    pol_loss_names = ["pol_loss"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
    ]

    compatible_feature = U.flatgrad(pi.pd.neglogp(ac), pol_var_list)
    G_t_inv_next = 1 / (1 - alpha) * (
        G_t_inv - alpha * (G_t_inv * compatible_feature) *
        tf.transpose(G_t_inv * compatible_feature) /
        (1 - alpha + alpha * tf.transpose(compatible_feature) *
         G_t_inv * compatible_feature))

    # Train V function
    vf_lossandgrad = U.function(
        [ob, td_v_target, lrmult],
        vf_losses + [U.flatgrad(vf_loss, vf_var_list, 20.0)])
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)
    # vf_optimizer = tf.train.AdamOptimizer(learning_rate=lrmult, epsilon=adam_epsilon)
    # vf_train_op = vf_optimizer.minimize(vf_loss, vf_var_list)

    # Train Policy
    pol_lossandgrad = U.function(
        [ob, ac, adv, lrmult, td_v_target],
        pol_losses + [U.flatgrad(pol_loss, pol_var_list, 20.0)])
    pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon)
    # pol_optimizer = tf.train.AdamOptimizer(learning_rate=0.1 * lrmult, epsilon=adam_epsilon)
    # pol_train_op = pol_optimizer.minimize(pol_loss, pol_var_list)

    # Computation
    compute_v_pred = U.function([ob], [pi.vpred])
    get_pol_weights_num = np.sum(
        [np.prod(v.get_shape().as_list()) for v in pol_var_list])
    get_compatible_feature = U.function([ob, ac], [compatible_feature])

    # vf_update = U.function([ob, td_v_target], [vf_train_op])
    # pol_update = U.function([ob, ac, adv], [pol_train_op])

    U.initialize()
    vf_adam.sync()
    pol_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(
        pi, env, timesteps_per_actorbatch, stochastic=False)

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    Transition = collections.namedtuple(
        "Transition", ["ob", "ac", "reward", "next_ob", "done"])

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]
    ) == 1, "Only one time constraint permitted"

    # Step-based learning: this outer loop now iterates over episodes
    omega_t = np.zeros(get_pol_weights_num)
    normalizer = Normalizer(1)
    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # logger.log("********** Episode %i ************" % episodes_so_far)
        rac_alpha = optim_stepsize * cur_lrmult
        rac_beta = optim_stepsize * cur_lrmult * 0.05
        # print("rac_alpha=", rac_alpha)
        # print("rac_beta=", rac_beta)

        if timesteps_so_far == 0:
            # result_record()
            seg = seg_gen.__next__()
            lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(flatten_lists, zip(*listoflrpairs))
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
            result_record()

        ob = env.reset()
        # episode = []
        cur_ep_ret = 0  # return in current episode
        cur_ep_len = 0  # len of current episode
        ep_rets = []  # returns of completed episodes in this segment
        ep_lens = []  # lengths of ...
        obs = []
        record = False
        for t in itertools.count():
            ac, vpred = pi.act(stochastic=True, ob=ob)
            origin_ac = ac
            ac = np.clip(ac, ac_space.low, ac_space.high)
            obs.append(ob)
            next_ob, rew, done, _ = env.step(ac)
            if env.spec._env_name == "MountainCarContinuous":
                rew = rew - np.abs(next_ob[0] - env.unwrapped.goal_position)
            ac = origin_ac
            # rew = np.clip(rew, -1., 1.)
            # episode.append(Transition(ob=ob.reshape((1, ob.shape[0])), ac=ac.reshape((1, ac.shape[0])),
            #                           reward=rew, next_ob=next_ob.reshape((1, ob.shape[0])), done=done))
            original_rew = rew
            if env.spec._env_name != "InvertedPendulumBulletEnv":
                normalizer.update(rew)
                rew = normalizer.normalize(rew)
            cur_ep_ret += (original_rew - shift)
            cur_ep_len += 1
            timesteps_so_far += 1

            # Compute V target and TD error
            v_target = rew + gamma * np.array(
                compute_v_pred(next_ob.reshape((1, ob.shape[0]))))
            adv = v_target - np.array(
                compute_v_pred(ob.reshape((1, ob.shape[0]))))

            # Update V and update policy
            vf_loss, vf_g = vf_lossandgrad(
                ob.reshape((1, ob.shape[0])), v_target, rac_alpha)
            vf_adam.update(vf_g, rac_alpha)
            # td_v_target is unused by pol_loss, so it is not fed here
            pol_loss, pol_g = pol_lossandgrad(
                ob.reshape((1, ob.shape[0])), ac, adv, rac_beta)
            compatible_feature = np.array(
                get_compatible_feature(ob.reshape((1, ob.shape[0])), ac))
            compatible_feature_product = compatible_feature * compatible_feature.T
            # Incremental estimate of the (natural-gradient-style) update
            # direction omega_t from the compatible features
            omega_t = (np.eye(compatible_feature_product.shape[0]) -
                       0.1 * rac_alpha * compatible_feature_product).dot(omega_t) \
                + 0.1 * rac_alpha * pol_g
            pol_adam.update(omega_t, rac_beta)

            ob = next_ob
            if timesteps_so_far % 10000 == 0:
                record = True
            if done:
                # print("Episode {} - Total reward = {}, Total Steps = {}".format(
                #     episodes_so_far, cur_ep_ret, cur_ep_len))
                # ep_rets.append(cur_ep_ret)  # returns of completed episodes in this segment
                # ep_lens.append(cur_ep_len)  # lengths of ...
                # lenbuffer.append(cur_ep_len)
                # rewbuffer.append(cur_ep_ret)
                if hasattr(pi, "ob_rms"):
                    pi.ob_rms.update(np.array(obs))  # update running mean/std for normalization
                iters_so_far += 1
                episodes_so_far += 1
                ob = env.reset()
                if record:
                    seg = seg_gen.__next__()
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    result_record()
                    record = False
                break
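# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original code): a minimal driver
# for the step-wise actor-critic `learn` above. It assumes a baselines
# MlpPolicy whose value-function and policy variable names start with "vf"
# and "pol" (which is what the vf_var_list / pol_var_list filters expect);
# the environment id and hyperparameter values are illustrative assumptions.
def _example_run_actor_critic():
    import gym
    import baselines.common.tf_util as U
    from baselines.ppo1 import mlp_policy

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    with U.make_session(num_cpu=1):
        env = gym.make("MountainCarContinuous-v0")
        learn(env, policy_fn,
              timesteps_per_actorbatch=2048,
              clip_param=0.2, entcoeff=0.0,  # required keywords, unused by this variant
              optim_epochs=10, optim_stepsize=1e-3, optim_batchsize=64,
              gamma=0.99, lam=0.95,
              max_timesteps=int(1e6), schedule='constant')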
def train(env, policy, normalizer, hp):
    global lenbuffer, rewbuffer, iters_so_far, timesteps_so_far, \
        episodes_so_far, tstart
    tstart = time.time()
    rewbuffer.extend(evaluate(env, normalizer, policy))
    # print(rewbuffer)
    result_record()
    record = False
    rw_normalizer = Normalizer(1)
    for episode in range(hp.main_loop_size):
        cur_lrmult = 1.0
        # cur_lrmult = max(1.0 - float(timesteps_so_far) / (0.5 * hp.max_timesteps), 1e-8)
        if timesteps_so_far >= hp.max_timesteps:
            result_record()
            break

        # init deltas and rewards
        deltas = policy.sample_deltas()
        reward_positive = [0] * hp.n_directions
        reward_negative = [0] * hp.n_directions
        record = False

        # positive directions
        for k in range(hp.n_directions):
            state = env.reset()
            done = False
            num_plays = 0.
            while not done and num_plays < hp.horizon:
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = policy.positive_perturbation(state, deltas[k])
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
                state, reward, done, _ = env.step(action)
                # reward = max(min(reward, 1), -1)
                if env.spec._env_name != "InvertedPendulumBulletEnv":
                    rw_normalizer.update(reward)
                    reward = rw_normalizer.normalize(reward)
                reward_positive[k] += reward
                num_plays += 1
                timesteps_so_far += 1
                if timesteps_so_far % 10000 == 0 and timesteps_so_far > 0:
                    record = True
            episodes_so_far += 1
            if record:
                # print(total_steps)
                rewbuffer.extend(evaluate(env, normalizer, policy))
                # print(rewbuffer)
                # print("Average Rewards:", np.mean(rewbuffer))
                result_record()
                record = False

        # negative directions
        for k in range(hp.n_directions):
            state = env.reset()
            done = False
            num_plays = 0.
            while not done and num_plays < hp.horizon:
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = policy.negative_perturbation(state, deltas[k])
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
                state, reward, done, _ = env.step(action)
                # reward = max(min(reward, 1), -1)
                if env.spec._env_name != "InvertedPendulumBulletEnv":
                    rw_normalizer.update(reward)
                    reward = rw_normalizer.normalize(reward)
                reward_negative[k] += reward
                num_plays += 1
                timesteps_so_far += 1
                if timesteps_so_far % 10000 == 0 and timesteps_so_far > 0:
                    record = True
            episodes_so_far += 1
            if record:
                # print(total_steps)
                # print(rewbuffer)
                rewbuffer.extend(evaluate(env, normalizer, policy))
                # print("Average Rewards:", np.mean(rewbuffer))
                result_record()
                record = False

        all_rewards = np.array(reward_negative + reward_positive)
        sigma_r = all_rewards.std()

        # sort rollouts wrt max(r_pos, r_neg) and keep the hp.b best
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos, r_neg) in enumerate(
                zip(reward_positive, reward_negative))
        }
        order = sorted(scores.keys(), key=lambda x: scores[x])[-hp.b:]
        rollouts = [(reward_positive[k], reward_negative[k], deltas[k])
                    for k in order[::-1]]

        hp.step_size = hp.step_size * cur_lrmult
        # update policy
        policy.update(rollouts, sigma_r)
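# ---------------------------------------------------------------------------
# Hypothetical helper (not part of the original code): a minimal container for
# the hyperparameters that `train` reads from `hp` (main_loop_size,
# max_timesteps, n_directions, b, horizon, step_size). The default values are
# illustrative, ARS-style assumptions rather than values from this code;
# `policy` is still expected to provide sample_deltas(),
# positive_perturbation(), negative_perturbation(), and update(), and
# `normalizer` to provide observe() and normalize().
class Hp:
    def __init__(self,
                 main_loop_size=1000,   # number of outer ARS iterations
                 max_timesteps=int(1e6),
                 n_directions=16,       # perturbation directions per iteration
                 b=8,                   # top-b directions kept for the update
                 horizon=1000,          # max steps per rollout
                 step_size=0.02):
        self.main_loop_size = main_loop_size
        self.max_timesteps = max_timesteps
        self.n_directions = n_directions
        self.b = b
        self.horizon = horizon
        self.step_size = step_size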