def learn( env, policy_fn, *, timesteps_per_batch, # what to train on epsilon, beta, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, TRPO=False): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space discrete_ac_space = isinstance(ac_space, gym.spaces.Discrete) print("ob_space: " + str(ob_space)) print("ac_space: " + str(ac_space)) pi = policy_fn("pi", ob_space, ac_space) oldpi = policy_fn("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() old_entropy = oldpi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "Entropy"] dist = meankl all_var_list = pi.get_trainable_variables() all_var_list = [ v for v in all_var_list if v.name.split("/")[0].startswith("pi") ] var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz #????gvp and fvp??? 
gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Initialize eta, omega optimizer if discrete_ac_space: init_eta = 1 init_omega = 0.5 eta_omega_optimizer = EtaOmegaOptimizerDiscrete( beta, epsilon, init_eta, init_omega) else: init_eta = 0.5 init_omega = 2.0 #????eta_omega_optimizer details????? eta_omega_optimizer = EtaOmegaOptimizer(beta, epsilon, init_eta, init_omega) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate #print(ob[:20]) #print(ac[:20]) if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): print(pi.ob_rms.mean) pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() if TRPO: # # TRPO specific code. 
# Find correct step size using line search # shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / epsilon) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > epsilon * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 '''else: logger.log("couldn't compute a good step") set_from_flat(thbefore)''' if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) else: # # COPOS specific implementation. # copos_update_dir = stepdir # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts w_theta, w_beta = pi.split_w(copos_update_dir) tmp_ob = np.zeros( (1, ) + env.observation_space.shape ) # We assume that entropy does not depend on the NN # Optimize eta and omega if discrete_ac_space: entropy = lossbefore[4] #entropy = - 1/timesteps_per_batch * np.sum(np.sum(pi.get_action_prob(ob) * pi.get_log_action_prob(ob), axis=1)) eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy) else: Waa, Wsa = pi.w2W(w_theta) wa = pi.get_wa(ob, w_beta) varphis = pi.get_varphis(ob) #old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0] old_ent = lossbefore[4] eta, omega = eta_omega_optimizer.optimize( w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent) logger.log("Initial eta: " + str(eta) + " and omega: " + str(omega)) current_theta_beta = get_flat() prev_theta, prev_beta = pi.all_to_theta_beta( current_theta_beta) if discrete_ac_space: # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, epsilon, args, discrete_ac_space) logger.log("Updated eta, eta: " + str(eta)) set_from_flat(pi.theta_beta_to_all(prev_theta, prev_beta)) # Find proper omega for new eta. Use old policy parameters first. 
eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # do line search for ratio for non-linear "beta" parameter values #ratio = beta_ratio_line_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, # epsilon, beta, args) # set ratio to 1 if we do not use beta ratio line search ratio = 1 #print("ratio from line search: " + str(ratio)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + ratio * w_beta.reshape(-1, ) / eta else: for i in range(2): # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, epsilon, args) logger.log("Updated eta, eta: " + str(eta) + " and omega: " + str(omega)) # Find proper omega for new eta. Use old policy parameters first. set_from_flat( pi.theta_beta_to_all(prev_theta, prev_beta)) eta, omega = \ eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # Use final policy logger.log("Final eta: " + str(eta) + " and omega: " + str(omega)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + w_beta.reshape(-1, ) / eta paramnew = allmean(pi.theta_beta_to_all(cur_theta, cur_beta)) set_from_flat(paramnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (paramnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) ##copos specific over #cg over for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) #policy update over with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) print("Reward max: " + str(max(rewbuffer))) print("Reward min: " + str(min(rewbuffer))) logger.record_tabular( "EpLenMean", np.mean(lenbuffer) if np.sum(lenbuffer) != 0.0 else 0.0) logger.record_tabular( "EpRewMean", np.mean(rewbuffer) if np.sum(rewbuffer) != 0.0 else 0.0) logger.record_tabular( "AverageReturn", np.mean(rewbuffer) if np.sum(rewbuffer) != 0.0 else 0.0) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
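# ----------------------------------------
# Sketch: how the TRPO branch above sizes and backtracks its update, reduced
# to plain NumPy. This is a minimal illustration, assuming the caller supplies
# `surrogate(theta)` and `kl(theta)` callables, a Fisher-vector product `fvp`,
# and a search direction `stepdir` already returned by conjugate gradient;
# none of these names come from the code above.
import numpy as np

def trpo_step_sketch(theta, stepdir, fvp, surrogate, kl, epsilon, max_backtracks=10):
    """Scale `stepdir` to the KL trust region, then backtrack until the
    surrogate improves and the KL constraint (with 50% slack) holds."""
    shs = 0.5 * stepdir.dot(fvp(stepdir))      # 0.5 * s^T H s
    lm = np.sqrt(shs / epsilon)                # Lagrange multiplier
    fullstep = stepdir / lm                    # step that just hits the KL bound
    surr_before = surrogate(theta)
    stepsize = 1.0
    for _ in range(max_backtracks):
        theta_new = theta + stepsize * fullstep
        surr_new = surrogate(theta_new)
        if np.isfinite(surr_new) and kl(theta_new) <= 1.5 * epsilon \
                and surr_new >= surr_before:
            return theta_new                   # "Stepsize OK!"
        stepsize *= 0.5                        # shrink the step and retry
    return theta                               # no acceptable step found

# Example on a toy linear surrogate with a quadratic KL proxy:
#   theta0 = np.zeros(3); g = np.ones(3)
#   trpo_step_sketch(theta0, g, lambda v: v, lambda th: g.dot(th),
#                    lambda th: 0.5 * th.dot(th), epsilon=0.01)
# ----------------------------------------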
def learn(env, policy_func, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=1e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, callback=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func( "pi", ob_space, ac_space, ) # reuse=(pretrained_weight != None) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return #ob = U.get_placeholder_cached(name="ob") ob_config = U.get_placeholder_cached(name="ob") ob_target = U.get_placeholder_cached(name="goal") obs_pos = U.get_placeholder_cached(name="obs_pos") #obs_pos2 = U.get_placeholder_cached(name="obs_pos2") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") or v.name.startswith("pi/obs") ] vf_var_list = [ v for v in all_var_list if v.name.startswith("pi/vf") or v.name.startswith("pi/obs") ] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob_config, ob_target, obs_pos, ac, atarg], losses) compute_lossandgrad = U.function( [ob_config, ob_target, obs_pos, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function( [flat_tangent, ob_config, ob_target, obs_pos, ac, atarg], fvp) compute_vflossandgrad = U.function([ob_config, ob_target, obs_pos, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts 
# ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) max_trm = -5 true_reward_mean = 0 assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 g_loss_stats = stats(loss_names) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: #U.load_variables(pretrained_weight, variables=pi.get_variables()) saver = tf.train.Saver() saver.restore(tf.get_default_session(), pretrained_weight) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break # Save model if rank == 0 and ckpt_dir is not None and true_reward_mean > max_trm: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) max_trm = true_reward_mean logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(p): v1 = allmean(compute_fvp(p, *fvpargs)) # print("norm(v1):%.2e, norm(p):%.2e, cg_damping:%.2e"%(np.linalg.norm(v1), np.linalg.norm(p), cg_damping)) return v1 + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy config, goal, obstacle_pos = [], [], [] for o in seg["ob"]: config.append(o["joint"]) goal.append(o["target"]) obstacle_pos.append(o["obstacle_pos1"]) #obstacle_pos2.append(o["obstacle_pos2"]) config, goal, obstacle_pos = map(np.array, [config, goal, obstacle_pos]) args = config, goal, obstacle_pos, seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) logger.log( 'iter:{:d}, norm of g: {:.4f}, error of cg: {:.4f}'. 
format( cg_iters, np.linalg.norm(g), np.linalg.norm(g - compute_fvp(stepdir, *fvpargs)))) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbg, mbop, mbret) in dataset.iterbatches( (config, goal, obstacle_pos, seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy g = allmean( compute_vflossandgrad(mbob, mbg, mbop, mbret)) vfadam.update(g, vf_stepsize) g_losses = meanlosses for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) true_reward_mean = np.mean(true_rewbuffer) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
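# ----------------------------------------
# Sketch: the advantage targets that `add_vtarg_and_adv(seg, gamma, lam)`
# fills in before each policy update above. A minimal NumPy take on
# generalized advantage estimation, GAE(lambda), assuming `rew`, `vpred` and
# `new` (episode-start flags) arrays like those stored in `seg`; the trailing
# bootstrap value `nextvpred` is an assumed extra input.
import numpy as np

def gae_sketch(rew, vpred, new, nextvpred, gamma, lam):
    """Return (adv, tdlamret): GAE(lambda) advantages and lambda-returns."""
    T = len(rew)
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - (new[t + 1] if t + 1 < T else 0.0)
        nextv = vpred[t + 1] if t + 1 < T else nextvpred
        delta = rew[t] + gamma * nextv * nonterminal - vpred[t]   # TD residual
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        adv[t] = lastgaelam
    return adv, adv + vpred        # tdlamret is the value-function target

# Example:
#   adv, tdlamret = gae_sketch(np.array([1., 1., 1.]), np.zeros(3),
#                              np.array([1., 0., 0.]), 0.0, gamma=0.99, lam=0.95)
# ----------------------------------------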
def train( env: ConfMDP, policy: Policy, model_approximator: ModelApproximator, eval_steps: int = 4, eval_freq: int = 5, n_trajectories: int = 20, iteration_number: int = 2000, gamma: float = 1, render=False, checkpoint_file: str = "tf_checkpoint/general/model.ckpt", restore_variables: bool = False, save_variables: bool = True, logdir: str = None, log: bool = False, omega=5, kappa: float = 1e-5, training_set_size: int = 500, normalize_data: bool = False, dual_reg: float = 0.0, policy_reg: float = 0.0, exact: bool = False, num_processes: int = 1, load_data: bool = True, **kwargs, ): """ Runner for the REMPS algorithm. Setup logging, initialize agent, takes care of fitting or loading things. Executes the main training loop by managing workers :param env: Environment (Conf-MDP) :param policy: The agent policy :param model_approximator: the approximation of the model or the true model :param eval_steps: how many steps in order to perform evaluation :param eval_freq: the frequency of evaluation :param n_trajectories: number of trajectories to collect :param iteration_number: number of iterations of REMPS :param gamma: discount factor :param render: render or not episodes :param checkpoint_file: where to store checkpoints :param restore_variables: restore variables or not from checkpoint :param save_variables: save variables in checkpoint :param logdir: directory containing logs :param log: if true the agents logs the actions probability :param omega: initial environment parameters :param kappa: parameter of remps environment :param training_set_size: number of samples contained in the training set :param normalize_data: Whether to normalize data from the training set :param dual_reg: regularization on the dual :param policy_reg: regularization on the policy :param exact: whether the model approximation is exact or not :param num_processes: number of processing :param load_data: whether to load stored data :param kwargs: :return: """ # setup logging writer = tf.summary.FileWriter(logdir) logger.configure(dir=logdir, format_strs=["stdout", "csv"]) # setup agent agent = REMPS( policy=policy, model=model_approximator, env=env, kappa=kappa, projection_type=Projection.STATE_KERNEL, use_features=False, training_set_size=training_set_size, L2_reg_dual=dual_reg, L2_reg_loss=policy_reg, exact=exact, ) # create parallel samplers # Split work among workers n_steps = n_trajectories nb_episodes_per_worker = n_steps // num_processes inputQs = [Queue() for _ in range(num_processes)] outputQ = Queue() workers = [ SamplingWorker( policy, env, nb_episodes_per_worker, inputQs[i], outputQ, env.action_space.n, env.observation_space_size, ) for i in range(num_processes) ] # Start the workers for w in workers: w.start() # Collect data for model fitting # torcs model fitting needs to be done before the session initialization # due to multiprocessing issues if not load_data and isinstance(env, Torcs): if isinstance(env, Torcs): x, y, avg_rew, ret = collect_data( env, policy=policy, total_n_samples=training_set_size, n_params=2, initial_port=env.port + 1000, ) logger.log( f"Data collection terminate. Avg rew: {np.mean(avg_rew)}, Avg ret: {np.mean(ret)}", logger.INFO, ) with U.single_threaded_session() as sess: # initialization with session agent.initialize(sess, writer, omega) # to save variables saver = tf.train.Saver() # initialize all if restore_variables: # Add ops to save and restore all the variables. 
saver.restore(sess, tf.train.latest_checkpoint(checkpoint_file)) else: init = tf.global_variables_initializer() sess.run(init) # make sure all variables are initialized sess.run(tf.assert_variables_initialized()) logger.log("Collecting Data", level=logger.INFO) if not load_data and not isinstance(env, Torcs): x, y = run_env( env, episode_count=1, bins=200, omega_max=30, omega_min=1, n_samples_per_omega=500, policy=agent, grid=True, total_n_samples=training_set_size, ) # store data in the agent agent.store_data(x, y, normalize_data) logger.log("Data Stored", logger.INFO) # fit the model agent.fit() logger.log("Model fitted", logger.INFO) # set configurable parameters env.set_params(omega) get_parameters = U.GetFlat(agent.get_policy_params()) # ------------------------------------- # --------- Training Loop ------------- # ------------------------------------- for n in range(iteration_number): states = list() next_states = list() rewards = list() actions_one_hot = list() actions = list() timesteps = list() paths = list() # statistics wins = 0 small_vel = 0 traj = 0 confort_violation = 0 reward_list = list() policy_ws = get_parameters() # Run parallel sampling: # for each worker send message sample with # policy weights and environment parameters for i in range(num_processes): inputQs[i].put(("sample", policy_ws, omega)) # Collect results when ready with timed("sampling"): for i in range(num_processes): _, stats = outputQ.get() states.extend(stats["states"]) paths.extend(stats["paths"]) next_states.extend(stats["next_states"]) rewards.extend(stats["rewards"]) actions_one_hot.extend(stats["actions_one_hot"]) actions.extend(stats["actions"]) timesteps.extend(stats["timesteps"]) reward_list.extend(stats["reward_list"]) wins += stats["wins"] small_vel += stats["small_vel"] traj += stats["traj"] confort_violation += stats["confort_violation"] samples_data = { "actions": np.matrix(actions).transpose(), "actions_one_hot": np.array(actions_one_hot), "observations": states, "paths": paths, "rewards": np.transpose(np.expand_dims(np.array(rewards), axis=0)), "reward_list": reward_list, "timesteps": timesteps, "wins": (wins / traj) * 100, "omega": omega, "traj": traj, "confort_violation": confort_violation, } # print statistics logger.log(f"Training steps: {n}", logger.INFO) logger.log(f"Number of wins: {wins}", logger.INFO) logger.log(f"Percentage of wins: {(wins/n_trajectories)*100}", logger.INFO) logger.log(f"Average reward: {np.mean(reward_list)}", logger.INFO) logger.log(f"Avg timesteps: {np.mean(timesteps)}") # learning routine with timed("training"): omega = agent.train(samples_data) # Configure environments with # parameters returned by the agent env.set_params(omega) # Only TORCS: we kill torcs every 10 iterations due to a memory leak if n % 10 == 0 and isinstance(env, Torcs): print("Killing torcs") os.system("ps | grep torcs | awk '{print $1}' | xargs kill -9") # ------------------------------------- # --------- Evaluation ---------------- # ------------------------------------- if ((n + 1) % eval_freq) == 0: # for plotting eval_rewards = [] # evaluation loop for i in range(eval_steps): logger.log("Evaluating...", logger.INFO) state = env.reset() done = False # gamma_cum is gamma^t gamma_cum = 1 cum_reward = 0 t = 0 # here starts an episode while not done: if render: env.render() # sample one action at random action = agent.pi(state[np.newaxis, :], log=log) # observe the next state, reward etc newState, reward, done, info = env.step(action) cum_reward += reward * gamma_cum gamma_cum = gamma * 
gamma_cum state = newState if done: break t = t + 1 eval_rewards.append(cum_reward) # save variables if save_variables: save_path = saver.save(sess, checkpoint_file) logger.log(f"Steps: {n}", logger.INFO) logger.log(f"Model saved in path: {save_path}", logger.INFO) # Close the env env.close() # save variables if save_variables: save_path = saver.save(sess, checkpoint_file) logger.log(f"Model saved in path: {save_path}") # exit workers for i in range(num_processes): inputQs[i].put(("exit", None, None))
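# ----------------------------------------
# Sketch: the worker protocol used above, reduced to its core. Each worker
# blocks on its own input queue, answers "sample" requests on a shared output
# queue, and shuts down on "exit". The `fake_rollout` payload stands in for
# the real SamplingWorker rollout and is an assumption for illustration only.
from multiprocessing import Process, Queue

def _worker_sketch(worker_id, input_q, output_q):
    while True:
        cmd, weights, omega = input_q.get()
        if cmd == "exit":
            break
        # pretend to run the assigned rollouts with `weights` and `omega`
        fake_rollout = {"worker": worker_id, "rewards": [0.0], "omega": omega}
        output_q.put((worker_id, fake_rollout))

if __name__ == "__main__":
    num_processes = 2
    input_qs = [Queue() for _ in range(num_processes)]
    output_q = Queue()
    workers = [Process(target=_worker_sketch, args=(i, input_qs[i], output_q))
               for i in range(num_processes)]
    for w in workers:
        w.start()
    for q in input_qs:
        q.put(("sample", None, 5))            # dispatch one sampling job each
    results = [output_q.get() for _ in range(num_processes)]
    for q in input_qs:
        q.put(("exit", None, None))           # ask workers to terminate
    for w in workers:
        w.join()
    print(results)
# ----------------------------------------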
def learn( env, policy_fn, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) oldpi = policy_fn("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- eval_seq = traj_segment_generator_eval(pi, env, timesteps_per_batch, stochastic=False) seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, eval_seq=eval_seq) global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, best_fitness episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths 
rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) if iters_so_far == 0: eval_seg = eval_seq.__next__() rewbuffer.extend(eval_seg["ep_rets"]) lenbuffer.extend(eval_seg["ep_lens"]) result_record() with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) # for (lossname, lossval) in zip(loss_names, meanlosses): # logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # # lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values # listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples # lens, rews = map(flatten_lists, zip(*listoflrpairs)) # lenbuffer.extend(lens) # rewbuffer.extend(rews) # # logger.record_tabular("EpLenMean", np.mean(lenbuffer)) # logger.record_tabular("EpRewMean", np.mean(rewbuffer)) # logger.record_tabular("EpThisIter", len(lens)) # episodes_so_far += len(lens) # timesteps_so_far += sum(lens) iters_so_far += 1
def train(env, nb_epochs, nb_episodes, nb_epoch_cycles, episode_length, nb_train_steps, eval_freq, save_freq, nb_eval_episodes, actor, critic, memory, gamma, normalize_returns, normalize_observations, critic_l2_reg, action_noise, param_noise, popart, clip_norm, batch_size, reward_scale, action_repeat, full, exclude_centering_frame, visualize, fail_reward, num_processes, num_processes_to_wait, num_testing_processes, learning_session, min_buffer_length, integrator_accuracy=5e-5, max_env_traj=100, tau=0.01): """ Parameters ---------- nb_epochs : the number of epochs to train. nb_episodes : the number of episodes for each epoch. episode_length : the maximum number of steps for each episode. gamma : discount factor. tau : soft update coefficient. clip_norm : clip on the norm of the gradient. """ assert action_repeat > 0 assert nb_episodes >= num_processes # Get params from learning session checkpoint_dir = learning_session.checkpoint_dir log_dir = learning_session.log_dir training_step = learning_session.last_training_step # Initialize DDPG agent (target network and replay buffer) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=None, critic_l2_reg=critic_l2_reg, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, training_step=training_step) # We need max_action because the NN output layer is a tanh. # So we must scale it back. max_action = env.action_space.high # Build Workers events = [Event() for _ in range(num_processes)] inputQs = [Queue() for _ in range(num_processes)] outputQ = Queue() # Split work among workers nb_episodes_per_worker = nb_episodes // num_processes workers = [ SamplingWorker(i, actor, critic, episode_length, nb_episodes_per_worker, action_repeat, max_action, gamma, tau, normalize_returns, batch_size, normalize_observations, param_noise, critic_l2_reg, popart, clip_norm, reward_scale, events[i], inputQs[i], outputQ, full, exclude_centering_frame, integrator_accuracy, max_env_traj, visualize, fail_reward) for i in range(num_processes) ] # Run the Workers for w in workers: w.start() # Create Round Robin tester tester = RoundRobinTester( num_testing_processes, actor, critic, episode_length, nb_eval_episodes, action_repeat, max_action, gamma, tau, normalize_returns, batch_size, normalize_observations, critic_l2_reg, popart, clip_norm, reward_scale, full, exclude_centering_frame, integrator_accuracy, max_env_traj, visualize, fail_reward) # Start training loop with U.single_threaded_session() as sess: agent.initialize(sess) writer = tf.summary.FileWriter(log_dir) writer.add_graph(sess.graph) # Initialize writer and statistics stats = EvaluationStatistics(tf_session=sess, tf_writer=writer) # setup saver saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2) get_parameters = U.GetFlat(actor.trainable_vars) global_step = 0 obs = env.reset() agent.reset() # Processes waiting for a new sampling task waiting_indices = [i for i in range(num_processes)] for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # If we have sampling workers waiting, dispatch a sampling job if waiting_indices: actor_ws = get_parameters() # Run parallel sampling for i in waiting_indices: inputQs[i].put(('sample', actor_ws)) events[i].set() # Notify worker: sample baby, sample! 
waiting_indices.clear() # Collect results when ready for i in range(num_processes_to_wait): process_index, transitions = outputQ.get() waiting_indices.append(process_index) print('Collecting transition samples from Worker {}/{}'. format(i + 1, num_processes_to_wait)) for t in transitions: agent.store_transition(*t) # try to collect other samples if available for i in range(num_processes): try: process_index, transitions = outputQ.get_nowait() if process_index not in waiting_indices: waiting_indices.append(process_index) print('Collecting transition samples from Worker {}'. format(process_index)) for t in transitions: agent.store_transition(*t) except queue.Empty: # No sampling ready, keep on training. pass # Training phase if agent.memory.nb_entries > min_buffer_length: for _ in range(nb_train_steps): critic_loss, actor_loss = agent.train() agent.update_target_net() # Plot statistics stats.add_critic_loss(critic_loss, global_step) stats.add_actor_loss(actor_loss, global_step) global_step += 1 # Evaluation phase if cycle % eval_freq == 0: print("Cycle number: ", cycle + epoch * nb_epoch_cycles) print("Sending testing job...") actor_ws = get_parameters() # Send a testing job tester.test(actor_ws, global_step) # Print stats (if any) tester.log_stats(stats, logger) if cycle % save_freq == 0: # Save weights save_path = saver.save(sess, checkpoint_dir, global_step=global_step) print("Model saved in path: %s" % save_path) # Dump learning session learning_session.dump(agent.training_step) print("Learning session dumped to: %s" % str(learning_session.session_path)) else: print("Not enough entry in memory buffer") # Stop workers for i in range(num_processes): inputQs[i].put(('exit', None)) events[i].set() # Notify worker: exit! tester.close() # Stop testing workers env.close()
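# ----------------------------------------
# Sketch: the soft target-network update implied by `tau` in the docstring and
# by `agent.update_target_net()` above, written out for plain NumPy parameter
# arrays. The DDPG class itself is not shown here, so the function name and
# the dictionary-of-arrays layout are assumptions for illustration; the agent
# applies the same rule to its TF variables.
import numpy as np

def soft_update_sketch(target_params, online_params, tau):
    """target <- (1 - tau) * target + tau * online, for every parameter."""
    return {name: (1.0 - tau) * target_params[name] + tau * online_params[name]
            for name in target_params}

# Example: with tau=0.01 the target network tracks the online network slowly,
# which keeps the critic's bootstrap targets stable.
#   tgt = {"w": np.zeros(3)}
#   onl = {"w": np.ones(3)}
#   tgt = soft_update_sketch(tgt, onl, tau=0.01)   # -> w == [0.01, 0.01, 0.01]
# ----------------------------------------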
def learn(make_env, make_policy, *, n_episodes, horizon, delta, gamma, max_iters, sampler=None, use_natural_gradient=False, #can be 'exact', 'approximate' fisher_reg=1e-2, iw_method='is', iw_norm='none', bound='J', line_search_type='parabola', save_weights=0, improvement_tol=0., center_return=False, render_after=None, max_offline_iters=100, callback=None, clipping=False, entropy='none', positive_return=False, reward_clustering='none'): np.set_printoptions(precision=3) max_samples = horizon * n_episodes if line_search_type == 'binary': line_search = line_search_binary elif line_search_type == 'parabola': line_search = line_search_parabola else: raise ValueError() # Building the environment env = make_env() ob_space = env.observation_space ac_space = env.action_space # Building the policy pi = make_policy('pi', ob_space, ac_space) oldpi = make_policy('oldpi', ob_space, ac_space) all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.split('/')[1].startswith('pol')] shapes = [U.intprod(var.get_shape().as_list()) for var in var_list] n_parameters = sum(shapes) # Placeholders ob_ = ob = U.get_placeholder_cached(name='ob') ac_ = pi.pdtype.sample_placeholder([max_samples], name='ac') mask_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='mask') rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='rew') disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='disc_rew') clustered_rew_ = tf.placeholder(dtype=tf.float32, shape=(n_episodes)) gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient') iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number') losses_with_name = [] # Policy densities target_log_pdf = pi.pd.logp(ac_) behavioral_log_pdf = oldpi.pd.logp(ac_) log_ratio = target_log_pdf - behavioral_log_pdf # Split operations disc_rew_split = tf.stack(tf.split(disc_rew_ * mask_, n_episodes)) rew_split = tf.stack(tf.split(rew_ * mask_, n_episodes)) log_ratio_split = tf.stack(tf.split(log_ratio * mask_, n_episodes)) target_log_pdf_split = tf.stack(tf.split(target_log_pdf * mask_, n_episodes)) behavioral_log_pdf_split = tf.stack(tf.split(behavioral_log_pdf * mask_, n_episodes)) mask_split = tf.stack(tf.split(mask_, n_episodes)) # Renyi divergence emp_d2_split = tf.stack(tf.split(pi.pd.renyi(oldpi.pd, 2) * mask_, n_episodes)) emp_d2_cum_split = tf.reduce_sum(emp_d2_split, axis=1) empirical_d2 = tf.reduce_mean(tf.exp(emp_d2_cum_split)) # Return ep_return = clustered_rew_ #tf.reduce_sum(mask_split * disc_rew_split, axis=1) if clipping: rew_split = tf.clip_by_value(rew_split, -1, 1) if center_return: ep_return = ep_return - tf.reduce_mean(ep_return) rew_split = rew_split - (tf.reduce_sum(rew_split) / (tf.reduce_sum(mask_split) + 1e-24)) discounter = [pow(gamma, i) for i in range(0, horizon)] # Decreasing gamma discounter_tf = tf.constant(discounter) disc_rew_split = rew_split * discounter_tf #tf.add_to_collection('prints', tf.Print(ep_return, [ep_return], 'ep_return_not_clustered', summarize=20)) # Reward clustering ''' rew_clustering_options = reward_clustering.split(':') if reward_clustering == 'none': pass # Do nothing elif rew_clustering_options[0] == 'global': assert len(rew_clustering_options) == 2, "Reward clustering: Provide the correct number of parameters" N = int(rew_clustering_options[1]) tf.add_to_collection('prints', tf.Print(ep_return, [ep_return], 'ep_return', summarize=20)) global_rew_min = tf.Variable(float('+inf'), trainable=False) global_rew_max = 
tf.Variable(float('-inf'), trainable=False) rew_min = tf.reduce_min(ep_return) rew_max = tf.reduce_max(ep_return) global_rew_min = tf.assign(global_rew_min, tf.minimum(global_rew_min, rew_min)) global_rew_max = tf.assign(global_rew_max, tf.maximum(global_rew_max, rew_max)) interval_size = (global_rew_max - global_rew_min) / N ep_return = tf.floordiv(ep_return, interval_size) * interval_size elif rew_clustering_options[0] == 'batch': assert len(rew_clustering_options) == 2, "Reward clustering: Provide the correct number of parameters" N = int(rew_clustering_options[1]) rew_min = tf.reduce_min(ep_return) rew_max = tf.reduce_max(ep_return) interval_size = (rew_max - rew_min) / N ep_return = tf.floordiv(ep_return, interval_size) * interval_size elif rew_clustering_options[0] == 'manual': assert len(rew_clustering_options) == 4, "Reward clustering: Provide the correct number of parameters" N, rew_min, rew_max = map(int, rew_clustering_options[1:]) print("N:", N) print("Min reward:", rew_min) print("Max reward:", rew_max) interval_size = (rew_max - rew_min) / N print("Interval size:", interval_size) # Clip to avoid overflow and cluster ep_return = tf.clip_by_value(ep_return, rew_min, rew_max) ep_return = tf.cast(tf.floordiv(ep_return, interval_size) * interval_size, tf.float32) tf.add_to_collection('prints', tf.Print(ep_return, [ep_return], 'ep_return_clustered', summarize=20)) else: raise Exception('Unrecognized reward clustering scheme.') ''' return_mean = tf.reduce_mean(ep_return) return_std = U.reduce_std(ep_return) return_max = tf.reduce_max(ep_return) return_min = tf.reduce_min(ep_return) return_abs_max = tf.reduce_max(tf.abs(ep_return)) return_step_max = tf.reduce_max(tf.abs(rew_split)) # Max step reward return_step_mean = tf.abs(tf.reduce_mean(rew_split)) positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split)) negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split)) return_step_maxmin = tf.abs(positive_step_return_max - negative_step_return_max) losses_with_name.extend([(return_mean, 'InitialReturnMean'), (return_max, 'InitialReturnMax'), (return_min, 'InitialReturnMin'), (return_std, 'InitialReturnStd'), (empirical_d2, 'EmpiricalD2'), (return_step_max, 'ReturnStepMax'), (return_step_maxmin, 'ReturnStepMaxmin')]) if iw_method == 'pdis': # log_ratio_split cumulative sum log_ratio_cumsum = tf.cumsum(log_ratio_split, axis=1) # Exponentiate ratio_cumsum = tf.exp(log_ratio_cumsum) # Multiply by the step-wise reward (not episode) ratio_reward = ratio_cumsum * disc_rew_split # Average on episodes ratio_reward_per_episode = tf.reduce_sum(ratio_reward, axis=1) w_return_mean = tf.reduce_sum(ratio_reward_per_episode, axis=0) / n_episodes # Get d2(w0:t) with mask d2_w_0t = tf.exp(tf.cumsum(emp_d2_split, axis=1)) * mask_split # LEAVE THIS OUTSIDE # Sum d2(w0:t) over timesteps episode_d2_0t = tf.reduce_sum(d2_w_0t, axis=1) # Sample variance J_sample_variance = (1/(n_episodes-1)) * tf.reduce_sum(tf.square(ratio_reward_per_episode - w_return_mean)) losses_with_name.append((J_sample_variance, 'J_sample_variance')) losses_with_name.extend([(tf.reduce_max(ratio_cumsum), 'MaxIW'), (tf.reduce_min(ratio_cumsum), 'MinIW'), (tf.reduce_mean(ratio_cumsum), 'MeanIW'), (U.reduce_std(ratio_cumsum), 'StdIW'), (tf.reduce_mean(U.reduce_std(ratio_cumsum, axis=0)), 'StdIW_mean')]) losses_with_name.extend([(tf.reduce_max(d2_w_0t), 'MaxD2w0t'), (tf.reduce_min(d2_w_0t), 'MinD2w0t'), (tf.reduce_mean(d2_w_0t), 'MeanD2w0t'), (U.reduce_std(d2_w_0t), 'StdD2w0t')]) ''' # TMP: adding also IS logging to 
compare iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1)) iwn = iw / n_episodes IS_w_return_mean = tf.reduce_sum(iwn * ep_return) IS_J_sample_variance = (1/(n_episodes-1)) * tf.reduce_sum(tf.square(iw * ep_return - w_return_mean)) losses_with_name.append((IS_J_sample_variance, 'IS_J_sample_variance')) losses_with_name.append((IS_w_return_mean, 'IS_ReturnMeanIW')) losses_with_name.extend([(tf.reduce_max(iwn), 'IS_MaxIWNorm'), (tf.reduce_min(iwn), 'IS_MinIWNorm'), (tf.reduce_mean(iwn), 'IS_MeanIWNorm'), (U.reduce_std(iwn), 'IS_StdIWNorm'), (tf.reduce_max(iw), 'IS_MaxIW'), (tf.reduce_min(iw), 'IS_MinIW'), (tf.reduce_mean(iw), 'IS_MeanIW'), (U.reduce_std(iw), 'IS_StdIW')]) ''' elif iw_method == 'is': iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1)) if iw_norm == 'none': iwn = iw / n_episodes w_return_mean = tf.reduce_sum(iwn * ep_return) J_sample_variance = (1/(n_episodes-1)) * tf.reduce_sum(tf.square(iw * ep_return - w_return_mean)) losses_with_name.append((J_sample_variance, 'J_sample_variance')) elif iw_norm == 'sn': iwn = iw / tf.reduce_sum(iw) w_return_mean = tf.reduce_sum(iwn * ep_return) elif iw_norm == 'regression': # Get optimized beta mean_iw = tf.reduce_mean(iw) beta = tf.reduce_sum((iw - mean_iw) * ep_return * iw) / (tf.reduce_sum((iw - mean_iw) ** 2) + 1e-24) # Get the estimator w_return_mean = tf.reduce_sum(ep_return * iw + beta * (iw - 1)) / n_episodes else: raise NotImplementedError() ess_classic = tf.linalg.norm(iw, 1) ** 2 / tf.linalg.norm(iw, 2) ** 2 sqrt_ess_classic = tf.linalg.norm(iw, 1) / tf.linalg.norm(iw, 2) ess_renyi = n_episodes / empirical_d2 losses_with_name.extend([(tf.reduce_max(iwn), 'MaxIWNorm'), (tf.reduce_min(iwn), 'MinIWNorm'), (tf.reduce_mean(iwn), 'MeanIWNorm'), (U.reduce_std(iwn), 'StdIWNorm'), (tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'), (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'), (ess_classic, 'ESSClassic'), (ess_renyi, 'ESSRenyi')]) elif iw_method == 'rbis': # Get pdfs for episodes target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1) behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdf_split, axis=1) # Normalize log_proba (avoid as overflows as possible) normalization_factor = tf.reduce_mean(tf.stack([target_log_pdf_episode, behavioral_log_pdf_episode])) target_norm_log_pdf_episode = target_log_pdf_episode - normalization_factor behavioral_norm_log_pdf_episode = behavioral_log_pdf_episode - normalization_factor # Exponentiate target_pdf_episode = tf.clip_by_value(tf.cast(tf.exp(target_norm_log_pdf_episode), tf.float64), 1e-300, 1e+300) behavioral_pdf_episode = tf.clip_by_value(tf.cast(tf.exp(behavioral_norm_log_pdf_episode), tf.float64), 1e-300, 1e+300) tf.add_to_collection('asserts', tf.assert_positive(target_pdf_episode, name='target_pdf_positive')) tf.add_to_collection('asserts', tf.assert_positive(behavioral_pdf_episode, name='behavioral_pdf_positive')) # Compute the merging matrix (reward-clustering) and the number of clusters reward_unique, reward_indexes = tf.unique(ep_return) episode_clustering_matrix = tf.cast(tf.one_hot(reward_indexes, n_episodes), tf.float64) max_index = tf.reduce_max(reward_indexes) + 1 trajectories_per_cluster = tf.reduce_sum(episode_clustering_matrix, axis=0)[:max_index] tf.add_to_collection('asserts', tf.assert_positive(tf.reduce_sum(episode_clustering_matrix, axis=0)[:max_index], name='clustering_matrix')) # Get the clustered pdfs clustered_target_pdf = tf.matmul(tf.reshape(target_pdf_episode, (1, -1)), episode_clustering_matrix)[0][:max_index] 
clustered_behavioral_pdf = tf.matmul(tf.reshape(behavioral_pdf_episode, (1, -1)), episode_clustering_matrix)[0][:max_index] tf.add_to_collection('asserts', tf.assert_positive(clustered_target_pdf, name='clust_target_pdf_positive')) tf.add_to_collection('asserts', tf.assert_positive(clustered_behavioral_pdf, name='clust_behavioral_pdf_positive')) # Compute the J ratio_clustered = clustered_target_pdf / clustered_behavioral_pdf #ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique # ---- No cluster cardinality ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique * tf.cast(trajectories_per_cluster, tf.float32) # ---- Cluster cardinality #w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(max_index, tf.float32) # ---- No cluster cardinality w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(n_episodes, tf.float32) # ---- Cluster cardinality # Divergences ess_classic = tf.linalg.norm(ratio_reward, 1) ** 2 / tf.linalg.norm(ratio_reward, 2) ** 2 sqrt_ess_classic = tf.linalg.norm(ratio_reward, 1) / tf.linalg.norm(ratio_reward, 2) ess_renyi = n_episodes / empirical_d2 # Summaries losses_with_name.extend([(tf.reduce_max(ratio_clustered), 'MaxIW'), (tf.reduce_min(ratio_clustered), 'MinIW'), (tf.reduce_mean(ratio_clustered), 'MeanIW'), (U.reduce_std(ratio_clustered), 'StdIW'), (1-(max_index / n_episodes), 'RewardCompression'), (ess_classic, 'ESSClassic'), (ess_renyi, 'ESSRenyi')]) else: raise NotImplementedError() if bound == 'J': bound_ = w_return_mean elif bound == 'std-d2': bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_std elif bound == 'max-d2': var_estimate = tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_abs_max bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_abs_max elif bound == 'max-ess': bound_ = w_return_mean - tf.sqrt((1 - delta) / delta) / sqrt_ess_classic * return_abs_max elif bound == 'std-ess': bound_ = w_return_mean - tf.sqrt((1 - delta) / delta) / sqrt_ess_classic * return_std elif bound == 'pdis-max-d2': # Discount factor if gamma >= 1: discounter = [float(1+2*(horizon-t-1)) for t in range(0, horizon)] else: def f(t): return pow(gamma, 2*t) + (2*pow(gamma,t)*(pow(gamma, t+1) - pow(gamma, horizon))) / (1-gamma) discounter = [f(t) for t in range(0, horizon)] discounter_tf = tf.constant(discounter) mean_episode_d2 = tf.reduce_sum(d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24) discounted_d2 = mean_episode_d2 * discounter_tf # Discounted d2 discounted_total_d2 = tf.reduce_sum(discounted_d2, axis=0) # Sum over time bound_ = w_return_mean - tf.sqrt((1-delta) * discounted_total_d2 / (delta*n_episodes)) * return_step_max elif bound == 'pdis-mean-d2': # Discount factor if gamma >= 1: discounter = [float(1+2*(horizon-t-1)) for t in range(0, horizon)] else: def f(t): return pow(gamma, 2*t) + (2*pow(gamma,t)*(pow(gamma, t+1) - pow(gamma, horizon))) / (1-gamma) discounter = [f(t) for t in range(0, horizon)] discounter_tf = tf.constant(discounter) mean_episode_d2 = tf.reduce_sum(d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24) discounted_d2 = mean_episode_d2 * discounter_tf # Discounted d2 discounted_total_d2 = tf.reduce_sum(discounted_d2, axis=0) # Sum over time bound_ = w_return_mean - tf.sqrt((1-delta) * discounted_total_d2 / (delta*n_episodes)) * return_step_mean else: raise NotImplementedError() # Policy entropy for exploration ent = pi.pd.entropy() meanent = tf.reduce_mean(ent) losses_with_name.append((meanent, 'MeanEntropy')) # Add policy entropy bonus if 
entropy != 'none': scheme, v1, v2 = entropy.split(':') if scheme == 'step': entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1), lambda: float(0.0)) losses_with_name.append((entcoeff, 'EntropyCoefficient')) entbonus = entcoeff * meanent bound_ = bound_ + entbonus elif scheme == 'lin': ip = tf.cast(iter_number_ / max_iters, tf.float32) entcoeff_decay = tf.maximum(0.0, float(v2) + (float(v1) - float(v2)) * (1.0 - ip)) losses_with_name.append((entcoeff_decay, 'EntropyCoefficient')) entbonus = entcoeff_decay * meanent bound_ = bound_ + entbonus elif scheme == 'exp': ent_f = tf.exp(-tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1) losses_with_name.append((ent_f, 'EntropyCoefficient')) bound_ = bound_ + ent_f * meanent else: raise Exception('Unrecognized entropy scheme.') losses_with_name.append((w_return_mean, 'ReturnMeanIW')) losses_with_name.append((bound_, 'Bound')) losses, loss_names = map(list, zip(*losses_with_name)) if use_natural_gradient: p = tf.placeholder(dtype=tf.float32, shape=[None]) target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1) grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list) dot_product = tf.reduce_sum(grad_logprob * p) hess_logprob = U.flatgrad(dot_product, var_list) compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob]) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) assert_ops = tf.group(*tf.get_collection('asserts')) print_ops = tf.group(*tf.get_collection('prints')) compute_lossandgrad = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_grad = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_], [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_bound = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_], [bound_, assert_ops, print_ops]) compute_losses = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_], losses) #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, mask_], [ratio_cumsum, discounted_ratio]) set_parameter = U.SetFromFlat(var_list) get_parameter = U.GetFlat(var_list) if sampler is None: seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True) sampler = type("SequentialSampler", (object,), {"collect": lambda self, _: seg_gen.__next__()})() U.initialize() # Starting optimizing episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=n_episodes) rewbuffer = deque(maxlen=n_episodes) while True: iters_so_far += 1 if render_after is not None and iters_so_far % render_after == 0: if hasattr(env, 'render'): render(env, pi, horizon) if callback: callback(locals(), globals()) if iters_so_far >= max_iters: print('Finised...') break logger.log('********** Iteration %i ************' % iters_so_far) theta = get_parameter() with timed('sampling'): seg = sampler.collect(theta) add_disc_rew(seg, gamma) lens, rets = seg['ep_lens'], seg['ep_rets'] lenbuffer.extend(lens) rewbuffer.extend(rets) episodes_so_far += len(lens) timesteps_so_far += sum(lens) #print('------') #print(np.reshape(seg['ob'], (n_episodes, horizon, -1))[:,:,0]) #print(np.reshape(seg['mask'], (n_episodes, horizon))) # Get clustered reward reward_matrix = np.reshape(seg['disc_rew'] * seg['mask'], (n_episodes, horizon)) ep_reward = np.sum(reward_matrix, 
axis=1) ep_reward = cluster_rewards(ep_reward, reward_clustering) args = ob, ac, rew, disc_rew, clustered_rew, mask, iter_number = seg['ob'], seg['ac'], seg['rew'], seg['disc_rew'], ep_reward, seg['mask'], iters_so_far assign_old_eq_new() def evaluate_loss(): loss = compute_bound(*args) return loss[0] def evaluate_gradient(): gradient = compute_grad(*args) return gradient[0] if use_natural_gradient: def evaluate_fisher_vector_prod(x): return compute_linear_operator(x, *args)[0] + fisher_reg * x def evaluate_natural_gradient(g): return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0) else: evaluate_natural_gradient = None with timed('summaries before'): logger.record_tabular("Iteration", iters_so_far) logger.record_tabular("InitialBound", evaluate_loss()) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if save_weights > 0 and iters_so_far % save_weights == 0: logger.record_tabular('Weights', str(get_parameter())) import pickle file = open('checkpoint' + str(iters_so_far) + '.pkl', 'wb') pickle.dump(theta, file) with timed("offline optimization"): theta, improvement = optimize_offline(theta, set_parameter, line_search, evaluate_loss, evaluate_gradient, evaluate_natural_gradient, max_offline_ite=max_offline_iters) set_parameter(theta) with timed('summaries after'): meanlosses = np.array(compute_losses(*args)) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.dump_tabular() env.close()
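# The per-episode returns above come from add_disc_rew plus a masked row sum.
# A hedged stand-in (not the repository's add_disc_rew), assuming rewards arrive
# as a flat array of n_episodes * horizon entries with a 0/1 padding mask and
# that disc_rew[t] is gamma**t * r[t] within each episode:
import numpy as np

def discounted_rewards(rew, mask, gamma, n_episodes, horizon):
    """Return gamma^t-discounted per-timestep rewards, episode by episode."""
    rew = (np.asarray(rew) * np.asarray(mask)).reshape(n_episodes, horizon)
    discounts = gamma ** np.arange(horizon)      # [1, gamma, gamma^2, ...]
    return (rew * discounts).reshape(-1)

# The clustered episode return is then just the masked row sum used above:
# ep_reward = discounted_rewards(seg['rew'], seg['mask'], gamma, n_episodes, horizon)
# ep_reward = ep_reward.reshape(n_episodes, horizon).sum(axis=1)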
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, lr=0.03, momentum=0.9): ob_dim, ac_dim = policy.ob_dim, policy.ac_dim dbpi = GaussianMlpPolicy(ob_dim, ac_dim, 'dbp') oldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'oe') dboldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'doi') # with tf.variable_scope('dbp'): # with tf.variable_scope('oe'): # with tf.variable_scope('doi'): pi = policy do_std = U.function([], [pi.std_1a, pi.logstd_1a]) kloldnew = oldpi.pd.kl(pi.pd) dbkloldnew = dboldpi.pd.kl(dbpi.pd) dist = meankl = tf.reduce_mean(kloldnew) dbkl = tf.reduce_mean(dbkloldnew) obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(lr)), name='stepsize') inputs, loss, loss_sampled = policy.update_info var_list = [v for v in tf.global_variables() if "pi" in v.name] db_var_list = [v for v in tf.global_variables() if "dbp" in v.name] old_var_list = [v for v in tf.global_variables() if "oe" in v.name] db_old_var_list = [v for v in tf.global_variables() if "doi" in v.name] print(len(var_list), len(db_var_list), len(old_var_list), len(db_old_var_list)) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(old_var_list, var_list) ]) assign_db = U.function( [], [], updates=[ tf.assign(db, o) for (db, o) in zipsame(db_var_list, var_list) ] + [ tf.assign(dbold, dbnew) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list) ]) assign_old_eq_newr = U.function( [], [], updates=[ tf.assign(newv, oldv) for (oldv, newv) in zipsame(old_var_list, var_list) ]) # assign_dbr = U.function([], [], updates= # [tf.assign(o, db) for (db, o) in zipsame(db_var_list, var_list)] + # [tf.assign(dbnew, dbold) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)]) klgrads = tf.gradients(dist, var_list) dbklgrads = tf.gradients(dbkl, db_var_list) p_grads = [tf.ones_like(v) for v in dbklgrads] get_flat = U.GetFlat(var_list) get_old_flat = U.GetFlat(old_var_list) set_from_flat = U.SetFromFlat(var_list) flat_tangent2 = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan2") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents2 = [] for shape in shapes: sz = U.intprod(shape) tangents2.append(tf.reshape(flat_tangent2[start:start + sz], shape)) start += sz gvp2 = tf.add_n([ tf.reduce_sum(g * tangent2) for (g, tangent2) in zipsame(dbklgrads, tangents2) ]) gvp2_grads = tf.gradients(gvp2, db_var_list) neg_term = tf.add_n([ tf.reduce_sum(g * tangent2) for (g, tangent2) in zipsame(gvp2_grads, tangents2) ]) / 2. 
ng1 = tf.gradients(neg_term, db_var_list) ng2 = tf.gradients(neg_term, db_old_var_list) neg_term_grads = [ a + b for (a, b) in zip(tf.gradients(neg_term, db_var_list), tf.gradients(neg_term, db_old_var_list)) ] neg_term = neg_term_grads # neg_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in neg_term_grads]) pos_term = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(gvp2_grads, p_grads) ]) pos_term_grads = [ a + b for (a, b) in zip(tf.gradients(pos_term, db_var_list), tf.gradients(pos_term, db_old_var_list)) ] pos_term_sum = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(pos_term_grads, tangents2) ]) pos_term_grads = tf.gradients(pos_term_sum, p_grads) pos_term = pos_term_grads # pos_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in pos_term_grads]) geo_term = [(p - n) * 0.5 for p, n in zip(pos_term, neg_term)] optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=momentum, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) grads = optim.compute_gradients(loss, var_list=pi_var_list) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) geo_term = [g1 + g2[0] for g1, g2 in zip(geo_term, grads)] geo_grads = list(zip(geo_term, var_list)) update_geo_op, q_runner_geo = optim.apply_gradients(geo_grads) do_update = U.function(inputs, update_op) inputs_tangent = list(inputs) + [flat_tangent2] do_update_geo = U.function(inputs_tangent, update_geo_op) do_get_geo_term = U.function(inputs_tangent, [ng1, ng2]) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner, q_runner_geo]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) assign_old_eq_new() # set old parameter values to new parameter values assign_db() # Policy update do_update(ob_no, action_na, standardized_adv_n) # ft2 = get_flat() - get_old_flat() # assign_old_eq_newr() # assign back # gnp = do_get_geo_term(ob_no, action_na, standardized_adv_n, ft2) # def check_nan(bs): # return 
[~np.isnan(b).all() for b in bs] # print(gnp[0]) # print('.....asdfasdfadslfkadsjfaksdfalsdkfjaldskf') # print(gnp[1]) # do_update_geo(ob_no, action_na, standardized_adv_n, ft2) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) # if kl > desired_kl * 2: # logger.log("kl too high") # tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() # elif kl < desired_kl / 2: # logger.log("kl too low") # tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() # else: # logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) print(do_std()) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
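# The advantage estimated above is GAE(gamma, lam): the TD residuals are
# discounted with gamma * lam via common.discount. A minimal sketch of that
# discounting helper (the Baselines version uses scipy.signal.lfilter; this
# loop is the same recurrence y[t] = x[t] + gamma * y[t + 1], right to left):
import numpy as np

def discount(x, gamma):
    y = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y

# With delta[t] = r[t] + gamma * V(s[t+1]) - V(s[t]), the advantage used for
# the policy update is discount(delta, gamma * lam).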
def train(self, samples_data, normalize_rewards=False): rewards = samples_data["rewards"] reward_list = samples_data["reward_list"] actions = samples_data["actions"] timesteps = samples_data["timesteps"] actions_one_hot = samples_data["actions_one_hot"] wins = samples_data.get("wins", 0) observations = samples_data["observations"] obs_flat_and_padded = flat_and_pad( samples_data["paths_full"]["states"]) next_states_flat_and_padded = flat_and_pad( samples_data["paths_full"]["next_states_centred"]) # preprocess rewards if normalize_rewards: mean_rew = np.mean(rewards) std_rew = np.maximum(np.std(rewards), 1e-5) for i in range(len(samples_data["paths_full"]["rewards"])): samples_data["paths_full"]["rewards"][i] = ( samples_data["paths_full"]["rewards"][i] - mean_rew) / std_rew rew_flat_and_padded = flat_and_pad( samples_data["paths_full"]["rewards"]) mask = rew_flat_and_padded != 0 actions_one_hot_flat_and_padded = flat_and_pad( samples_data["paths_full"]["actions_one_hot"]) actions = np.zeros((obs_flat_and_padded.shape[0], 1)) actions = np.hstack((actions - 1, actions + 1)) omega_before = self.sess.run(self.model.get_omega()) variables_before = U.GetFlat(self.policy.trainable_vars)() inputs_dict = { self.rewards_ph: rew_flat_and_padded, self.actions_one_hot_ph: actions_one_hot_flat_and_padded, self.observations_ph: obs_flat_and_padded, self.next_states_ph: next_states_flat_and_padded, self.actions_ph: actions, self.returns_ph: reward_list, self.timesteps_ph: timesteps, self.mask: mask, } inputs_dict.update(self.model.get_feed_dict()) _, summary_str, ac_prob, model_prob, log_prob = self.sess.run( [ self.train_op, self.summarize, self.policy_tf, self.model_tf, self.log_prob, ], feed_dict=inputs_dict, ) self.global_step += 1 self.summary_writer.add_summary(summary_str, self.global_step) omega = self.sess.run(self.model.get_omega()) variables_after = U.GetFlat(self.policy.trainable_vars)() delta_variables = variables_after - variables_before norm_delta_var = np.linalg.norm(delta_variables) delta_omega = omega - omega_before norm_delta_omega = np.linalg.norm(delta_omega) # theta = self.sess.run(self.policy.getTheta()) # record all logger.record_tabular("ITERATIONS", self.iteration) # logger.record_tabular("Theta", theta) logger.record_tabular("OmegaBefore", omega_before) logger.record_tabular("Omega", omega) logger.record_tabular("NormDeltaOmega", norm_delta_omega) logger.record_tabular("NormDeltaVar", norm_delta_var) logger.record_tabular("DeltaOmega", delta_omega) logger.record_tabular("ReturnsMean", np.mean(reward_list)) logger.record_tabular("ReturnsStd", np.std(reward_list)) logger.record_tabular("RewardMean", np.mean(rewards)) logger.record_tabular("RewardStd", np.std(rewards)) logger.record_tabular("TimestepsMean", np.mean(timesteps)) logger.record_tabular("TimestepsStd", np.std(timesteps)) logger.record_tabular("Wins", wins) logger.record_tabular("Traj", samples_data["traj"]) logger.record_tabular("ConfortViolation", samples_data["confort_violation"]) logger.dump_tabular() self.iteration += 1 return omega
def learn(make_env, make_policy, horizon, gamma=0.99, max_iters=1000, filename=None, grid_size=100, feature_fun=None, plot_bound=False): # Build the environment env = make_env() ob_space = env.observation_space ac_space = env.action_space # Build the higher level policy pi = make_policy('pi', ob_space, ac_space) # Get all pi's learnable parameters all_var_list = pi.get_trainable_variables() var_list = \ [v for v in all_var_list if v.name.split('/')[1].startswith('higher')] # TF functions set_parameters = U.SetFromFlat(var_list) get_parameters = U.GetFlat(var_list) # Generate the grid of parameters to evaluate gain_grid = np.linspace(-1, 1, grid_size) rho = get_parameters() std_too = (len(rho) == 2) if std_too: grid_size_std = int(grid_size) logstd_grid = np.linspace(-4, 0, grid_size_std) x, y = np.meshgrid(gain_grid, logstd_grid) X = x.reshape((np.prod(x.shape),)) Y = y.reshape((np.prod(y.shape),)) rho_grid = list(zip(X, Y)) else: rho_grid = [[x] for x in gain_grid] # initialize loop variables n_selections = np.zeros(len(rho_grid)) ret_sums = np.zeros(len(rho_grid)) regret = 0 iter = 0 # Learning loop tstart = time.time() while True: iter += 1 # Exit loop in the end if iter - 1 >= max_iters: print('Finished...') break # Learning iteration logger.log('********** Iteration %i ************' % iter) ub = [] ub_best = 0 i_best = 0 average_ret = [] bonus = [] for i, rho in enumerate(rho_grid): if n_selections[i] > 0: average_ret_rho = ret_sums[i] / n_selections[i] bonus_rho = np.sqrt(2 * np.log(iter) / n_selections[i]) ub_rho = average_ret_rho + bonus_rho ub.append(ub_rho) if not std_too: average_ret.append(average_ret_rho) bonus.append(bonus_rho) else: ub_rho = 1e100 ub.append(ub_rho) average_ret.append(0) bonus.append(1e100) if ub_rho > ub_best: ub_best = ub_rho rho_best = rho i_best = i # Sample actor's parameters from chosen arm set_parameters(rho_best) _ = pi.resample() # Sample a trajectory with the newly parametrized actor _, disc_ret, _ = eval_trajectory( env, pi, gamma, horizon, feature_fun) ret_sums[i_best] += disc_ret regret += (0.96512 - disc_ret) n_selections[i_best] += 1 # Store info about variables of interest if env.spec.id == 'LQG1D-v0': mu1_actor = pi.eval_actor_mean([[1]])[0][0] mu1_higher = pi.eval_higher_mean()[0] sigma = pi.eval_higher_std()[0] logger.record_tabular("LQGmu1_actor", mu1_actor) logger.record_tabular("LQGmu1_higher", mu1_higher) logger.record_tabular("LQGsigma_higher", sigma) logger.record_tabular("ReturnLastEpisode", disc_ret) logger.record_tabular("ReturnMean", sum(ret_sums) / iter) logger.record_tabular("Regret", regret) logger.record_tabular("Regret/t", regret / iter) logger.record_tabular("Iteration", iter) logger.record_tabular("TimeElapsed", time.time() - tstart) # Plot the profile of the bound and its components if plot_bound: if std_too: ub = np.array(ub).reshape((grid_size_std, grid_size)) plot3D_bound_profile(x, y, ub, rho_best, ub_best, iter, filename) else: plot_bound_profile(gain_grid, ub, average_ret, bonus, rho_best, ub_best, iter, filename) # Print all info in a table logger.dump_tabular() # Close environment in the end env.close()
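# The arm-selection rule above is UCB1 over the parameter grid. A standalone
# sketch of the same index (unpulled arms get an effectively infinite bonus,
# mirroring the 1e100 used above, so every arm is tried at least once):
import numpy as np

def ucb1_index(ret_sums, n_selections, t):
    """Upper confidence bound for each arm after t pulls in total."""
    ub = np.full(len(ret_sums), np.inf)
    pulled = n_selections > 0
    mean = ret_sums[pulled] / n_selections[pulled]
    bonus = np.sqrt(2.0 * np.log(t) / n_selections[pulled])
    ub[pulled] = mean + bonus
    return ub

# i_best = int(np.argmax(ucb1_index(ret_sums, n_selections, iter)))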
def learn( env, test_env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation # CMAES max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, max_v_train_iter, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) seed, env_id): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy backup_pi = policy_fn( "backup_pi", ob_space, ac_space ) # Construct a network for every individual to adapt during the es evolution reward = tf.placeholder(dtype=tf.float32, shape=[None]) # step rewards pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule bound_coeff = tf.placeholder( name='bound_coeff', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") next_ob = U.get_placeholder_cached( name="next_ob") # next step observation for updating q function ac = U.get_placeholder_cached( name="act") # action placeholder for computing q function mean_ac = U.get_placeholder_cached( name="mean_act") # action placeholder for computing q function kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent pi_adv = pi.qpred - pi.vpred adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0]) normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * normalized_pi_adv # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * normalized_pi_adv # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) qf_loss = tf.reduce_mean( tf.square(reward + gamma * pi.mean_qpred - pi.qpred)) # qf_loss = tf.reduce_mean(U.huber_loss(pi.qpred - tf.stop_gradient(reward + gamma * pi.mean_qpred))) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) qf_losses = [qf_loss] vf_losses = [vf_loss] # pol_loss = -tf.reduce_mean(pi_adv) pol_loss = pol_surr + pol_entpen losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") ] pol_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") ] qf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("qf") ] mean_qf_var_list = [ v for v in var_list if 
v.name.split("/")[1].startswith("meanqf") ] vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(vf_loss, vf_var_list)]) qf_lossandgrad = U.function( [ob, ac, next_ob, mean_ac, lrmult, reward, atarg], qf_losses + [U.flatgrad(qf_loss, qf_var_list)]) qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) assign_target_q_eq_eval_q = U.function( [], [], updates=[ tf.assign(target_q, eval_q) for (target_q, eval_q) in zipsame(mean_qf_var_list, qf_var_list) ]) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for ( backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables()) ]) assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v ) in zipsame(pi.get_variables(), backup_pi.get_variables()) ]) # Compute all losses mean_pi_actions = U.function([ob], [pi.pd.mode()]) compute_pol_losses = U.function([ob, ac, atarg, ret, lrmult], [pol_loss, pol_surr, pol_entpen, meankl]) U.initialize() get_pi_flat_params = U.GetFlat(pol_var_list) set_pi_flat_params = U.SetFromFlat(pol_var_list) vf_adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness episodes_so_far = 0 timesteps_so_far = 0 ppo_timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic=True) # For evaluation seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_gen=eval_gen) # For train V Func # Build generator for all solutions actors = [] best_fitness = 0 for i in range(popsize): newActor = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_gen=eval_gen) actors.append(newActor) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if max_timesteps and timesteps_so_far >= max_timesteps: print("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: print("Max episodes") break elif max_iters and iters_so_far >= max_iters: print("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: print("Max time") break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) # Generate new samples # Train V func for i in range(max_v_train_iter): logger.log("Iteration:" + str(iters_so_far) + " - sub-train iter for V func:" + str(i)) logger.log("Generate New Samples") seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = seg["ob"], seg["ac"], seg["next_ob"], seg["adv"], seg[ "rew"], seg["tdlamret"], \ seg["traj_index"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update( ob) # update running mean/std for normalization 
assign_old_eq_new( ) # set old parameter values to new parameter values # Train V function logger.log("Training V Func and Evaluating V Func Losses") for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) losses.append(vf_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) d_q = Dataset(dict(ob=ob, ac=ac, next_ob=next_ob, reward=reward, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) # Re-train q function logger.log("Training Q Func Evaluating Q Func Losses") for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d_q.iterate_once(optim_batchsize): *qf_losses, g = qf_lossandgrad( batch["ob"], batch["ac"], batch["next_ob"], mean_pi_actions(batch["ob"])[0], cur_lrmult, batch["reward"], batch["atarg"]) qf_adam.update(g, optim_stepsize * cur_lrmult) losses.append(qf_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) assign_target_q_eq_eval_q() # CMAES Train Policy assign_old_eq_new() # set old parameter values to new parameter values assign_backup_eq_new() # backup current policy flatten_weights = get_pi_flat_params() opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 opt['seed'] = seed opt['AdaptSigma'] = True es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt) while True: if es.countiter >= gensize: logger.log("Max generations for current layer") break logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter)) logger.log("Sigma=" + str(es.sigma)) solutions = es.ask() costs = [] lens = [] assign_backup_eq_new() # backup current policy for id, solution in enumerate(solutions): set_pi_flat_params(solution) losses = [] cost = compute_pol_losses(ob, ac, atarg, tdlamret, cur_lrmult) costs.append(cost[0]) assign_new_eq_backup() # Weights decay l2_decay = compute_weight_decay(0.99, solutions) costs += l2_decay costs, real_costs = fitness_rank(costs) es.tell_real_seg(solutions=solutions, function_values=costs, real_f=real_costs, segs=None) best_solution = es.result[0] best_fitness = es.result[1] logger.log("Best Solution Fitness:" + str(best_fitness)) set_pi_flat_params(best_solution) iters_so_far += 1 episodes_so_far += sum(lens)
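# compute_weight_decay is not defined in this excerpt. A common ES-style form
# (a hedged guess, not necessarily the implementation used here) penalizes the
# mean squared parameter value of each CMA-ES candidate before ranking:
import numpy as np

def compute_weight_decay(decay_coef, solutions):
    """L2 penalty per candidate parameter vector."""
    solutions = np.asarray(solutions)
    return decay_coef * np.mean(solutions * solutions, axis=1)

# costs = np.asarray(costs) + compute_weight_decay(0.99, solutions)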
def learn(env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, ckpt_dir, timesteps_per_batch, task_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, rnd_iter=200, callback=None, dyn_norm=False, mmd=False): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") ] vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = pi.vlossandgrad def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_variables(pretrained_weight, variables=pi.get_variables()) else: if not dyn_norm: pi.ob_rms.update(expert_dataset[0]) if not mmd: 
reward_giver.train(*expert_dataset, iter=rnd_iter) #inspect the reward learned # for batch in iterbatches(expert_dataset, batch_size=32): # print(reward_giver.get_reward(*batch)) best = -2000 save_ind = 0 max_save = 3 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ # logger.log("Optimizing Policy...") for _ in range(g_step): seg = seg_gen.__next__() #mmd reward if mmd: reward_giver.set_b2(seg["ob"], seg["ac"]) seg["rew"] = reward_giver.get_reward(seg["ob"], seg["ac"]) #report stats and save policy if any good lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) true_rew_avg = np.mean(true_rewbuffer) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpTrueRewMean", true_rew_avg) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("Best so far", best) # Save model if ckpt_dir is not None and true_rew_avg >= best: best = true_rew_avg fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) pi.save_policy(fname + "_" + str(save_ind)) save_ind = (save_ind + 1) % max_save #compute gradient towards next policy add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms") and dyn_norm: pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=False) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) if pi.use_popart: pi.update_popart(tdlamret) for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms") and dyn_norm: pi.ob_rms.update( mbob) # update running mean/std for policy vfadam.update(allmean(compute_vflossandgrad(mbob, mbret)), vf_stepsize) g_losses = meanlosses for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) if rank == 0: logger.dump_tabular()
def learn(env, eval_env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, evaluation_freq, task_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, num_epochs=1000, callback=None): # configure log logger.configure(dir=log_dir) nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") ] vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] # assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) episodes_so_far = 0 
timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) g_loss_stats = stats(loss_names) d_loss_stats = stats(reward_giver.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) for epoch in range(num_epochs): if callback: callback(locals(), globals()) # Save model if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) logger.log("********** Epoch %i ************" % epoch) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") total_obs = [] total_acs = [] total_ep_rets = [] total_ep_lens = [] total_ep_true_rets = [] for g_step_num in range(g_step): with timed("sampling"): seg = seg_gen.__next__() # Add seg into total_seg total_obs.append(seg["ob"]) total_acs.append(seg["ac"]) total_ep_rets.append(seg["ep_rets"]) total_ep_lens.append(seg["ep_lens"]) total_ep_true_rets.append(seg["ep_true_rets"]) add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) # Evaluate current policy if (g_step * epoch + g_step_num) % evaluation_freq == 0: evaluate_policy(pi, reward_giver, eval_env, g_step * epoch + g_step_num, timesteps_per_batch, tstart) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") total_obs = np.vstack(total_obs) total_acs = np.vstack(total_acs) total_ep_rets = np.concatenate(total_ep_rets) total_ep_lens = np.concatenate(total_ep_lens) total_ep_true_rets = np.concatenate(total_ep_true_rets) logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(len(total_obs)) batch_size = len(total_obs) // d_step d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (total_obs, total_acs), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # Update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
def learn(env, last_ob, last_jpos, run_reach, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, callback=None ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi_grasp", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) # Changes are made in order to use tensorboard # ------------------------------------------- #train_writer = tf.compat.v1.summary.FileWriter('../../logs/trpo_mpi') # sets log dir to GailPart folder #sess = tf.compat.v1.Session() # create a session?? # ------------------------------------------- kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.startswith("pi_grasp/pol") or v.name.startswith("pi_grasp/logstd")] vf_var_list = [v for v in all_var_list if v.name.startswith("pi_grasp/vff")] assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([], [], updates=[tf.compat.v1.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() 
MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, last_ob, last_jpos, run_reach, policy_func, env, reward_giver, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 g_loss_stats = stats(loss_names) d_loss_stats = stats(reward_giver.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break # Save model if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: t_name = task_name + "_" + str(iters_so_far) fname = os.path.join(ckpt_dir, t_name) # changed from task_name os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.compat.v1.train.Saver() saver.save(tf.compat.v1.get_default_session(), fname) logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() #print("trpo_mpi, seg = seg_gen.__next__() call output: ", seg ) add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) #logger.log("trpo_mpi.py, what should be logged with loss names ie. 
meanlosses:_", meanlosses) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update(mbob) # update running mean/std for policy g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) g_losses = meanlosses #logger.log("trpo_mpi.py, mean losses before logging wiht loss names: \n") #logger.log(meanlosses) # This is where the nan values are tabulated for some of the entries #logger.log("trpo_mpi.py, view whats being printed with (loss_names, lossvalues)") for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) batch_size = len(ob) // d_step d_losses = [] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches((ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) # This is to see what the d_losses are #logger.log("trpo_mpi.py, see what is being logged in d_losses") #logger.log("trpo_mpi.py, d_losses") #logger.log(d_losses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) # For Tensorboard Logging # --------------------------- #tf.compat.v1.summary.scalar("Generator Accuracy", tf.convert_to_tensor( np.mean(d_losses, axis=0)[4] ) ) # 5 position #tf.compat.v1.summary.scalar("Expert Accuracy", tf.convert_to_tensor( np.mean(d_losses, axis=0)[5] ) ) # 6 position #tf.compat.v1.summary.scalar("Entropy Loss", tf.convert_to_tensor( np.mean(d_losses, axis=0)[3] ) ) # 4 position #merge = tf.compat.v1.summary.merge_all() # merge summaries #summary = sess.run([merge]) #train_writer.add_summary(summary, iters_so_far) # Is there a need to reset metric after every epoch? I dont think so? 
# --------------------------- #logger.log("trpo_mpi.py, after logging, but before recordeing timesteps so far") lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values, truly confirmed is empty after call listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) # Could it be that the seg locals for lens and rets are ommitted since has no use in gail algorithm? # Probably dont have to worry about it, check the scalar part logger.record_tabular("EpLenMean", np.mean(lenbuffer)) # This has nan values logger.record_tabular("EpRewMean", np.mean(rewbuffer)) # This has nan values logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) # This has nan values logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) #timesteps_so_far += sum(lens) timesteps_so_far += seg["steps"] # changed to match setup with no finishing condition iters_so_far += 1 #env.reset() #reset the environment after a new iteration, therefore in traj generator check ob logger.record_tabular("EpisodesSoFar", episodes_so_far) # This is 0 ? if lens which is the number of entries for episode length doesnt exist, doesnt make sense for it to have a return. logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) # I think the entloss, entrpoy, ev_.... and the useful ones arent from the environment called using the trpo if rank == 0: logger.dump_tabular()
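# explained_variance above is the usual value-function diagnostic
# 1 - Var[y - ypred] / Var[y]: 1 means the value function predicts the empirical
# returns perfectly, 0 means it does no better than a constant. A minimal sketch:
import numpy as np

def explained_variance(ypred, y):
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary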
def render_evaluate( env, policy_func, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n( [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out # set up saver sess = tf.get_default_session() saver = tf.train.Saver() U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) print("loading pretrained model") saver.restore(sess, callback.model_dir) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters > 
0, max_timesteps > 0, max_episodes > 0]) == 1
    # Roll out the restored policy for 50 episodes, rendering every step.
    for _ in range(50):
        done = False
        ob = env.reset()
        env.render()
        stochastic = 1
        while not done:
            ac, vpred = pi.act(stochastic, ob)
            ob, rew, done, _ = env.step(ac)
            env.render()
    if rank == 0:
        logger.dump_tabular()
    if callback is not None:
        callback(locals(), globals())
def get_Flat_variables(self):
    # weights = [v for v in self.get_trainable_variables()]
    return U.GetFlat(self.get_trainable_variables())
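# GetFlat / SetFromFlat (used throughout the learners above) pack a list of
# variables into one flat vector and unpack it again. The same idea in plain
# numpy, for illustration only:
import numpy as np

def get_flat(variables):
    """Concatenate every array into a single 1-D parameter vector."""
    return np.concatenate([v.reshape(-1) for v in variables])

def set_from_flat(variables, theta):
    """Slice a flat vector back into the original shapes, in place."""
    start = 0
    for v in variables:
        size = v.size
        v[...] = theta[start:start + size].reshape(v.shape)
        start += size

# weights = [np.zeros((2, 3)), np.zeros(4)]
# theta = get_flat(weights)
# set_from_flat(weights, theta + 1.0)   # every parameter becomes 1.0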
def learn(
        *,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.002,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.00,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        num_reward=1,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------
    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets
    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)
    timesteps_per_batch     timesteps per gradient estimation batch
    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )
    ent_coef                coefficient of policy entropy term in the optimization objective
    cg_iters                number of iterations of conjugate gradient algorithm
    cg_damping              conjugate gradient damping
    vf_stepsize             learning rate for adam optimizer used to optimize value function loss
    vf_iters                number of iterations of value function optimization iterations per each policy optimization step
    total_timesteps         max number of timesteps
    max_episodes            max number of episodes
    max_iters               maximum number of policy optimization iterations
    callback                function to be called with (locals(), globals()) each policy optimization step
    load_path               str, path to load the model from (default: None, i.e. no model is loaded)
    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                            and arguments to a particular type of network

    Returns:
    -------
    learnt model
    '''
    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    set_global_seeds(seed)

    # Build the policy
    policy = build_policy(env, network, value_network='copy',
                          num_reward=num_reward, **network_kwargs)

    process_dir = logger.get_dir()
    save_dir = process_dir.split('Data')[-2] + 'log/l2/seed' + process_dir[-1] + '/'
    os.makedirs(save_dir, exist_ok=True)
    coe_save = []
    impro_save = []
    grad_save = []
    adj_save = []
    coe = np.ones((num_reward)) / num_reward

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    # ob, ac, ret and atarg are all placeholders; ret and atarg are vector-valued here
    ob = observation_placeholder(ob_space)

    # Build pi and oldpi
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # every reward gets its own atarg
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None, num_reward])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    # the KL divergence and the entropy here do not depend on the reward
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    # entbonus is the entropy loss
    entbonus = ent_coef * meanent

    # vferr is used to update the V network
    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)
    # optimgain is used to update the policy network; there should be one per reward
    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    # define the variables to optimize and the Adam optimizer for the V network
    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")
    vfadam = MpiAdam(vf_var_list)

    # flattens the variables into a single vector
    get_flat = U.GetFlat(var_list)
    # slices a flat vector back into the variables of var_list
    set_from_flat = U.SetFromFlat(var_list)
    # gradients of the KL divergence
    klgrads = tf.gradients(dist, var_list)

    # the flattened tangent vector
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    # split the flattened vector back into per-variable tensors
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz

    # sum of the products of the KL gradients and the tangents
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    # flatten the gradient of gvp into a vector
    fvp = U.flatgrad(gvp, var_list)

    # copy the learned policy into the old policy
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                get_variables("oldpi"), get_variables("pi"))
        ])
    # compute the losses
    compute_losses = U.function([ob, ac, atarg], losses)
    # compute the losses and the policy gradient
    compute_lossandgrad = U.function([ob, ac, atarg],
                                     losses + [U.flatgrad(optimgain, var_list)])
    # compute fvp
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    # compute the value-network gradient
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)
        return out

    # initialize variables
    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    # get the initial parameter vector
    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    # slice the values of th_init back into var_list
    set_from_flat(th_init)
    # synchronize
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    # iterator that generates rollout data
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch,
                                     stochastic=True, num_reward=num_reward)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    # rolling buffers (deques) for episode lengths and rewards
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback:
            callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        # compute cumulative returns
        add_vtarg_and_adv(seg, gamma, lam, num_reward=num_reward)

        # TODO
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        # ob, ac, atarg and tdlamret are all ndarrays
        #ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        _, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        #print(seg['ob'].shape,type(seg['ob']))
        #print(seg['ac'],type(seg['ac']))
        #print(seg['adv'],type(seg['adv']))
        #print(seg["tdlamret"].shape,type(seg['tdlamret']))
        vpredbefore = seg["vpred"]  # predicted value function before update
        # standardize
        #print("============================== atarg =========================================================")
        #print(atarg)
        atarg = (atarg - np.mean(atarg, axis=0)) / np.std(atarg, axis=0)  # standardized advantage function estimate
        #atarg = (atarg) / np.max(np.abs(atarg),axis=0)
        #print('======================================= standardized atarg ====================================')
        #print(atarg)
        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        # set old parameter values to new parameter values
        assign_old_eq_new()
        G = None
        S = None
        mr_lossbefore = np.zeros((num_reward, len(loss_names)))
        grad_norm = np.zeros((num_reward + 1))
        for i in range(num_reward):
            args = seg["ob"], seg["ac"], atarg[:, i]
            #print(atarg[:,i])
            # a subsample of args, taking every fifth entry
            fvpargs = [arr[::5] for arr in args]

            # this function computes the product of the Fisher matrix and a vector p
            def fisher_vector_product(p):
                return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

            with timed("computegrad of " + str(i + 1) + ".th reward"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            mr_lossbefore[i] = lossbefore
            g = allmean(g)
            #print("***************************************************************")
            #print(g)
            if isinstance(G, np.ndarray):
                G = np.vstack((G, g))
            else:
                G = g

            # g is the gradient of the objective
            # use conjugate gradient to obtain the update direction
            if np.allclose(g, 0):
                logger.log("Got zero gradient.
not updating") else: with timed("cg of " + str(i + 1) + ".th reward"): # stepdir 是更新方向 stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm grad_norm[i] = np.linalg.norm(fullstep) assert np.isfinite(stepdir).all() if isinstance(S, np.ndarray): S = np.vstack((S, stepdir)) else: S = stepdir #print('======================================= G ====================================') #print(G) #print('======================================= S ====================================') #print(S) try: new_coe = get_coefficient(G, S) #coe = 0.99 * coe + 0.01 * new_coe coe = new_coe coe_save.append(coe) #根据梯度的夹角调整参数 # GG = np.dot(S, S.T) # D = np.sqrt(np.diag(1/np.diag(GG))) # GG = np.dot(np.dot(D,GG),D) # #print('======================================= inner product ====================================') # #print(GG) # adj = np.sum(GG) / (num_reward ** 2) adj = 1 #print('======================================= adj ====================================') #print(adj) adj_save.append(adj) adj_max_kl = adj * max_kl ################################################################# grad_norm = grad_norm * np.sqrt(adj) stepdir = np.dot(coe, S) g = np.dot(coe, G) lossbefore = np.dot(coe, mr_lossbefore) ################################################################# shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / adj_max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm grad_norm[num_reward] = np.linalg.norm(fullstep) grad_save.append(grad_norm) expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() def compute_mr_losses(): mr_losses = np.zeros((num_reward, len(loss_names))) for i in range(num_reward): args = seg["ob"], seg["ac"], atarg[:, i] one_reward_loss = allmean(np.array(compute_losses(*args))) mr_losses[i] = one_reward_loss mr_loss = np.dot(coe, mr_losses) return mr_loss, mr_losses # 做10次搜索 for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) mr_loss_new, mr_losses_new = compute_mr_losses() mr_impro = mr_losses_new - mr_lossbefore meanlosses = surr, kl, *_ = allmean(np.array(mr_loss_new)) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > adj_max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") impro_save.append(np.hstack((mr_impro[:, 0], improve))) break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): #print('======================================= tdlamret ====================================') #print(seg["tdlamret"]) for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): #with tf.Session() as sess: # sess.run(tf.global_variables_initializer()) # aaa = sess.run(pi.vf,feed_dict={ob:mbob,ret:mbret}) # print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") # print(aaa.shape) # print(mbret.shape) g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) except: print('error') #print(mbob,mbret) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() #pdb.set_trace() np.save(save_dir + 'coe.npy', coe_save) np.save(save_dir + 'grad.npy', grad_save) np.save(save_dir + 'improve.npy', impro_save) np.save(save_dir + 'adj.npy', adj_save) return pi
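# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the training path): how the multi-reward
# update above combines per-reward search directions.  Each reward i yields a
# gradient g_i and a conjugate-gradient direction s_i; the rows are stacked
# into G and S, mixed with coefficients `coe`, and the mixed direction is
# rescaled so that 0.5 * step^T F step <= max_kl.  Uniform coefficients are
# used here only as a stand-in for get_coefficient(), which is defined
# elsewhere.
# ---------------------------------------------------------------------------
import numpy as np


def combine_and_scale_directions(G, S, fisher_vector_product, max_kl, coe=None):
    # G, S: arrays of shape [num_reward, num_params]
    num_reward = G.shape[0]
    if coe is None:
        coe = np.ones(num_reward) / num_reward   # assumed uniform mixing
    g = np.dot(coe, G)                           # mixed objective gradient
    stepdir = np.dot(coe, S)                     # mixed natural-gradient direction
    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))
    lm = np.sqrt(shs / max_kl)                   # Lagrange multiplier of the KL constraint
    fullstep = stepdir / lm                      # largest step inside the trust region
    expected_improve = g.dot(fullstep)
    return fullstep, expected_improve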
def get_Layer_Flat_variables(self, var_list):
    return U.GetFlat(var_list)
def learn(base_env, policy_fn, *, max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, truncation_size, sigma, eval_iters, timesteps_per_actorbatch, max_timesteps = 0, max_episodes = 0, max_iters = 0, max_seconds = 0, seed = 0 ): # Setup losses and stuff # ---------------------------------------- ob_space = base_env.observation_space ac_space = base_env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy best_pi = policy_fn("best_pi", ob_space, ac_space) # Construct network for new policy backup_pi = policy_fn("backup_pi", ob_space, ac_space) # Construct a network for every individual to adapt during the es evolution U.initialize() pi_set_from_flat_params = U.SetFromFlat(pi.get_trainable_variables()) pi_get_flat_params = U.GetFlat(pi.get_trainable_variables()) global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer,best_fitness episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen = 100) # rolling buffer for episode lengths rewbuffer = deque(maxlen = 100) # rolling buffer for episode rewards assign_backup_eq_new = U.function([], [], updates = [tf.assign(backup_v, newv) for (backup_v, newv) in zipsame( backup_pi.get_variables(), pi.get_variables())]) assign_new_eq_backup = U.function([], [], updates = [tf.assign(newv, backup_v) for (newv, backup_v) in zipsame( pi.get_variables(), backup_pi.get_variables())]) assign_best_eq_pi = U.function([], [], updates = [tf.assign(bestv, newv) for (bestv, newv) in zipsame( best_pi.get_variables(), pi.get_variables())]) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" # Build generator for all solutions eval_seq = traj_segment_generator_eval(best_pi, base_env, timesteps_per_actorbatch, stochastic=False) actors = [] best_fitness = 0 for i in range(popsize): newActor = traj_segment_generator(pi, base_env, timesteps_per_actorbatch, stochastic = False, eval_iters = eval_iters, eval_seq=eval_seq) actors.append(newActor) flatten_weights = pi_get_flat_params() indv_len = len(flatten_weights) pop = {} pop["solutions"] = np.random.randn(popsize, indv_len) pop["parents"] = pop["solutions"][:, truncation_size] pop["fitness"] = np.empty([popsize, 1], dtype = float) gen_counter = 0 while True: if max_timesteps and timesteps_so_far >= max_timesteps: logger.log("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: logger.log("Max episodes") break elif max_iters and iters_so_far >= max_iters: logger.log("Max iterations (Generations)") break elif max_seconds and time.time() - tstart >= max_seconds: logger.log("Max time") break elif gen_counter >= gensize: logger.log("Max iterations (Generations)") break assign_backup_eq_new() # backup current policy assign_best_eq_pi() #get the best pi equal to current pi cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps), 1e-8) logger.log("********** Generation %i ************" % iters_so_far) if iters_so_far == 0: eval_seg = eval_seq.__next__() rewbuffer.extend(eval_seg["ep_rets"]) lenbuffer.extend(eval_seg["ep_lens"]) result_record() ob_segs = None for i in range(popsize): # First generation if gen_counter == 0: pop["solutions"][i] = flatten_weights + sigma*cur_lrmult * np.random.normal(0.0, 1.0, indv_len) pi_set_from_flat_params(pop["solutions"][i]) seg = actors[i].__next__() pop["fitness"][i] = np.mean(seg["ep_rets"]) else: if i != 0: k = np.random.randint(1, truncation_size) 
pop["solutions"][i] = pop["parents"][k] + sigma*cur_lrmult * np.random.normal(0.0, 1.0, indv_len) pi_set_from_flat_params(pop["solutions"][i]) seg = actors[i].__next__() pop["fitness"][i] = np.mean(seg["ep_rets"]) if ob_segs is None: ob_segs = {'ob': np.copy(seg['ob'])} else: ob_segs['ob'] = np.append(ob_segs['ob'], seg['ob'], axis=0) assign_new_eq_backup() pop["fitness"], real_costs = fitness_normalization(pop["fitness"]) fit_idx = pop["fitness"].flatten().argsort()[::-1][:popsize] pop["solutions"] = pop["solutions"][fit_idx] pop["parents"] = pop["solutions"][:, truncation_size] pop["fitness"] = pop["fitness"][fit_idx] # print(pop["fitness"]) # pop["fitness"], real_fitness = fitness_normalization(pop["fitness"][fit_idx]) # logger.log("Best Solution Fitness:", pop["fitness"][0]) pi_set_from_flat_params(pop["solutions"][0]) ob = ob_segs["ob"] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for observation normalization gen_counter += 1 iters_so_far += 1
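# ---------------------------------------------------------------------------
# Illustrative sketch of the truncation-selection loop used by the evolution
# strategy above: perturb parent parameter vectors with Gaussian noise,
# evaluate each offspring, and keep the `truncation_size` fittest solutions
# as the next parents.  `evaluate` stands in for rolling out the policy in
# the environment (the real code uses traj_segment_generator for this).
# ---------------------------------------------------------------------------
import numpy as np


def truncation_es_generation(parents, sigma, popsize, truncation_size, evaluate):
    indv_len = parents.shape[1]
    solutions = np.empty((popsize, indv_len))
    fitness = np.empty(popsize)
    for i in range(popsize):
        k = np.random.randint(len(parents))      # pick a random parent
        solutions[i] = parents[k] + sigma * np.random.normal(0.0, 1.0, indv_len)
        fitness[i] = evaluate(solutions[i])
    order = fitness.argsort()[::-1]              # best fitness first
    solutions, fitness = solutions[order], fitness[order]
    return solutions[:truncation_size], solutions[0], fitness[0]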
def _init(self, ob_name, ob_space, ac_space, hid_size, num_hid_layers, init_std=1.0, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None self.varphi_dim = hid_size self.ob = utils.get_placeholder(name=ob_name, dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) # self.ob = utils.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) # self.ob = tf.placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope('vf'): obz = tf.clip_by_value( (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=utils.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=utils.normc_initializer(1.0))[:, 0] with tf.variable_scope('pol'): last_out = obz # Create 'num_hid_layers' hidden layers for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=utils.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): self.action_dim = ac_space.shape[0] # mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) # logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) self.dist_diagonal = True self.varphi = last_out self.varphi_dim = hid_size if self.dist_diagonal: stddev_init = np.ones([1, self.action_dim]) * init_std prec_init = 1. / (np.multiply(stddev_init, stddev_init) ) # 1 x |a| self.prec = tf.get_variable( name="prec", shape=[1, self.action_dim], initializer=tf.constant_initializer(prec_init)) kt_init = np.ones([self.varphi_dim, self.action_dim ]) * 0.5 / self.varphi_dim ktprec_init = kt_init * prec_init self.ktprec = tf.get_variable( name="ktprec", shape=[self.varphi_dim, self.action_dim], initializer=tf.constant_initializer(ktprec_init)) kt = tf.divide(self.ktprec, self.prec) mean = tf.matmul(last_out, kt) logstd = tf.log(tf.sqrt(1. 
/ self.prec)) else: # Not implemented yet raise NotImplementedError self.prec_get_flat = utils.GetFlat([self.prec]) self.prec_set_from_flat = utils.SetFromFlat([self.prec]) self.ktprec_get_flat = utils.GetFlat([self.ktprec]) self.ktprec_set_from_flat = utils.SetFromFlat([self.ktprec]) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name='final', kernel_initializer=utils.normc_initializer(0.01)) self.scope = tf.get_variable_scope().name self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = utils.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = utils.function([stochastic, self.ob], [ac, self.vpred]) # Get all policy parameters vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope + '/pol') # Remove log-linear parameters ktprec and prec to get only non-linear parameters del vars[-1] del vars[-1] beta_params = vars # Flat w_beta beta_len = np.sum( [np.prod(p.get_shape().as_list()) for p in beta_params]) w_beta_var = tf.placeholder(dtype=tf.float32, shape=[beta_len]) # Unflatten w_beta beta_shapes = list(map(tf.shape, beta_params)) w_beta_unflat_var = self.unflatten_tensor_variables( w_beta_var, beta_shapes) # w_beta^T * \grad_beta \varphi(s)^T v = tf.placeholder(dtype=self.varphi.dtype, shape=self.varphi.get_shape(), name="v_in_Rop") features_beta = self.alternative_Rop(self.varphi, beta_params, w_beta_unflat_var, v) self.features_beta = utils.function([self.ob, w_beta_var, v], features_beta)
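# ---------------------------------------------------------------------------
# Illustrative sketch of the log-linear Gaussian head built above: the policy
# stores a precision vector `prec` (1 x |a|) and the product `ktprec`
# (feature_dim x |a|); the action mean is varphi(s) @ (ktprec / prec) and the
# per-dimension std is 1 / sqrt(prec).  NumPy stand-in for the TF graph.
# ---------------------------------------------------------------------------
import numpy as np


def gaussian_head(varphi, prec, ktprec):
    # varphi: [batch, feature_dim], prec: [1, action_dim], ktprec: [feature_dim, action_dim]
    kt = ktprec / prec                 # recover the feature-to-mean weights
    mean = varphi.dot(kt)              # [batch, action_dim]
    std = 1.0 / np.sqrt(prec)          # fixed (state-independent) std
    return mean, np.broadcast_to(std, mean.shape)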
def _init(self, ob_space, ac_space, hid_layers=[], deterministic=True, diagonal=True, use_bias=False, use_critic=False, seed=None, verbose=True, zero_init=False): """Params: ob_space: task observation space ac_space : task action space hid__layers: list with width of each hidden layer deterministic: whether the actor is deterministic diagonal: whether the higher order policy has a diagonal covariance matrix use_bias: whether to include bias in neurons use_critic: whether to include a critic network seed: optional random seed """ assert isinstance(ob_space, gym.spaces.Box) assert len(ac_space.shape) == 1 self.diagonal = diagonal self.use_bias = use_bias batch_length = None #Accepts a sequence of episodes of arbitrary length self.observation_space = ob_space self.action_space = ac_space self.ac_dim = ac_space.shape[0] self.ob_dim = ob_space.shape[0] self.hid_layers = hid_layers self.deterministic = deterministic self.use_critic = use_critic self.linear = not hid_layers self.verbose = verbose if seed is not None: set_global_seeds(seed) self._ob = ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) #Critic (normally not used) if use_critic: with tf.variable_scope('critic'): last_out = ob for i, hid_size in enumerate(hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] #Actor (N.B.: weight initialization is irrelevant) with tf.variable_scope('actor'): last_out = ob for i, hid_size in enumerate(hid_layers): #Mlp feature extraction last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=tf.initializers.constant(0.), use_bias=use_bias)) if deterministic and isinstance(ac_space, gym.spaces.Box): #Determinisitc action selection self.actor_mean = actor_mean = tf.layers.dense( last_out, ac_space.shape[0], name='action', kernel_initializer=tf.initializers.constant(0.), use_bias=use_bias) else: raise NotImplementedError #Currently supports only deterministic action policies #Higher order policy (Gaussian) with tf.variable_scope('actor') as scope: self.actor_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \ scope=scope.name) self.flat_actor_weights = tf.concat([tf.reshape(w, [-1]) for w in \ self.actor_weights], axis=0) #flatten self._n_actor_weights = n_actor_weights = self.flat_actor_weights.shape[ 0] with tf.variable_scope('higher'): if zero_init: higher_mean_init = tf.where( tf.not_equal(self.flat_actor_weights, tf.constant(0, dtype=tf.float32)), tf.zeros(shape=[n_actor_weights.value]), tf.zeros(shape=[n_actor_weights])) else: #Initial means sampled from a normal distribution N(0,1) higher_mean_init = tf.where( tf.not_equal(self.flat_actor_weights, tf.constant(0, dtype=tf.float32)), tf.random_normal(shape=[n_actor_weights.value], stddev=0.01), tf.zeros(shape=[n_actor_weights])) self.higher_mean = higher_mean = tf.get_variable( name='higher_mean', initializer=higher_mean_init) if diagonal: #Diagonal covariance matrix; all stds initialized to 0 self.higher_logstd = higher_logstd = tf.get_variable( name='higher_logstd', shape=[n_actor_weights], initializer=tf.initializers.constant(0.)) pdparam = tf.concat( [higher_mean, higher_mean * 0. 
+ higher_logstd], axis=0) self.pdtype = pdtype = DiagGaussianPdType( n_actor_weights.value) else: #Cholesky covariance matrix self.higher_logstd = higher_logstd = tf.get_variable( name='higher_logstd', shape=[n_actor_weights * (n_actor_weights + 1) // 2], initializer=tf.initializers.constant(0.)) pdparam = tf.concat([higher_mean, higher_logstd], axis=0) self.pdtype = pdtype = CholeskyGaussianPdType( n_actor_weights.value) #Sample actor weights self.pd = pdtype.pdfromflat(pdparam) sampled_actor_params = self.pd.sample() symm_sampled_actor_params = self.pd.sample_symmetric() self._sample_symm_actor_params = U.function( [], list(symm_sampled_actor_params)) self._sample_actor_params = U.function([], [sampled_actor_params]) #Assign actor weights with tf.variable_scope('actor') as scope: actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \ scope=scope.name) self._use_sampled_actor_params = U.assignFromFlat( actor_params, sampled_actor_params) self._set_actor_params = U.SetFromFlat(actor_params) self._get_actor_params = U.GetFlat(actor_params) #Act self._action = action = actor_mean self._act = U.function([ob], [action]) #Higher policy weights with tf.variable_scope('higher') as scope: self._higher_params = higher_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \ scope=scope.name) self.flat_higher_params = tf.concat([tf.reshape(w, [-1]) for w in \ self._higher_params], axis=0) #flatten self._n_higher_params = self.flat_higher_params.shape[0] self._get_flat_higher_params = U.GetFlat(higher_params) self._set_higher_params = U.SetFromFlat(self._higher_params) #Batch PGPE self._actor_params_in = actor_params_in = \ U.get_placeholder(name='actor_params_in', dtype=tf.float32, shape=[batch_length] + [n_actor_weights]) self._rets_in = rets_in = U.get_placeholder(name='returns_in', dtype=tf.float32, shape=[batch_length]) ret_mean, ret_std = tf.nn.moments(rets_in, axes=[0]) self._get_ret_mean = U.function([self._rets_in], [ret_mean]) self._get_ret_std = U.function([self._rets_in], [ret_std]) self._logprobs = logprobs = self.pd.logp(actor_params_in) pgpe_times_n = U.flatgrad(logprobs * rets_in, higher_params) self._get_pgpe_times_n = U.function([actor_params_in, rets_in], [pgpe_times_n]) #One-episode PGPE #Used N times to compute the baseline -> can we do better? self._one_actor_param_in = one_actor_param_in = U.get_placeholder( name='one_actor_param_in', dtype=tf.float32, shape=[n_actor_weights]) one_logprob = self.pd.logp(one_actor_param_in) score = U.flatgrad(one_logprob, higher_params) score_norm = tf.norm(score) self._get_score = U.function([one_actor_param_in], [score]) self._get_score_norm = U.function([one_actor_param_in], [score_norm]) #Batch off-policy PGPE self._probs = tf.exp(logprobs) self._behavioral = None self._renyi_other = None #One episode off-PGPE self._one_prob = tf.exp(one_logprob) #Renyi computation self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd)) #Fisher computation (diagonal case) mean_fisher_diag = tf.exp(-2 * self.higher_logstd) cov_fisher_diag = mean_fisher_diag * 0 + 2 self._fisher_diag = tf.concat([mean_fisher_diag, cov_fisher_diag], axis=0) self._get_fisher_diag = U.function([], [self._fisher_diag]) #Multiple importance sampling self._memory = None
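# ---------------------------------------------------------------------------
# Illustrative sketch of the PGPE estimator wired up above: actor weights
# theta_i are sampled from the higher-order Gaussian, each is evaluated for
# one episode with return R_i, and the hyperpolicy gradient is the
# return-weighted score  (1/N) * sum_i grad log p(theta_i | mu, sigma) * R_i.
# Diagonal case, gradient with respect to the mean only; a NumPy stand-in for
# what U.flatgrad(logprobs * rets_in, higher_params) computes in the graph.
# ---------------------------------------------------------------------------
import numpy as np


def pgpe_mean_gradient(actor_params, returns, higher_mean, higher_logstd):
    # actor_params: [N, d], returns: [N], higher_mean / higher_logstd: [d]
    var = np.exp(2.0 * higher_logstd)
    score_mean = (actor_params - higher_mean) / var       # d/d mu of log N(theta; mu, sigma)
    return (score_mean * returns[:, None]).mean(axis=0)   # [d]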
def learn_hoof_a2c( network, env, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, load_path=None, # Baselines default settings till here optimiser='RMSProp', lr_upper_bound=None, ent_upper_bound=None, num_lr=None, num_ent_coeff=None, max_kl=-1.0, # -1.0 is for no KL constraint **network_kwargs): ''' Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm. Parameters: ----------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) seed: seed to make random number sequence in the alorightm reproducible. By default is None which means seed from system noise generator (not reproducible) nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int, total number of timesteps to train on (default: 80M) max_gradient_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) gamma: float, reward discounting parameter (default: 0.99) log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) # Get the nb of env nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) # overwrite default params if using HOOF if lr_upper_bound is not None: lr = 1.0 lrschedule = 'constant' else: num_lr = 1 if ent_upper_bound is None: num_ent_coeff = 1 # Instantiate the model object (that creates step_model and train_model) model = HOOF_Model( policy=policy, env=env, nsteps=nsteps, optimiser=optimiser, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, total_timesteps=total_timesteps, alpha=alpha, epsilon=epsilon # defaults for RMSProp ) runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Calculate the batch_size nbatch = nenvs * nsteps # model helper functions model_params = find_trainable_variables("a2c_model") get_flat = U.GetFlat(model_params) set_from_flat = U.SetFromFlat(model_params) # for Gaussian policies def kl(new_mean, new_sd, old_mean, old_sd): approx_kl = np.log(new_sd / old_sd) + ( old_sd**2 + (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5 approx_kl = np.sum(approx_kl, axis=1) approx_kl = np.mean(approx_kl) return approx_kl if max_kl == -1.0: # set max kl to a high val in case there is no constraint max_kl = 10**8 # Start total timer tstart = time.time() for update in range(1, int(total_timesteps // nbatch + 1)): obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run( ) epinfobuf.extend(epinfos) old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions) for step in range(len(obs)): cur_lr = lr.value() opt_pol_val = -10**8 old_params = get_flat() rms_weights_before_upd = model.get_opt_state() approx_kl = np.zeros((num_ent_coeff, num_lr)) epv = np.zeros((num_ent_coeff, num_lr)) rand_lr = lr_upper_bound * np.random.rand( num_lr) if lr_upper_bound is not None else [cur_lr] rand_lr = np.sort(rand_lr) rand_ent_coeff = ent_upper_bound * np.random.rand( num_ent_coeff) if ent_upper_bound is not None else [ent_coef] for nec in range(num_ent_coeff): # reset policy and optimiser set_from_flat(old_params) model.set_opt_state(rms_weights_before_upd) # get grads for loss fn with given entropy coeff policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values, rand_ent_coeff[nec]) new_params = get_flat() ent_grads = new_params - old_params # enumerate over different LR for nlr in range(num_lr): new_params = old_params + rand_lr[nlr] * ent_grads set_from_flat(new_params) new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll( obs, actions) lik_ratio = np.exp(-new_neg_ll + old_neg_ll) est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds, lik_ratio) approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd) epv[nec, nlr] = est_pol_val if (nec == 0 and nlr == 0) or (est_pol_val > opt_pol_val and approx_kl[nec, nlr] < max_kl): opt_pol_val = est_pol_val opt_pol_params = get_flat() opt_rms_wts = model.get_opt_state() opt_lr = rand_lr[nlr] opt_ent_coeff = rand_ent_coeff[nec] opt_kl = approx_kl[nec, nlr] # update policy and rms prop to optimal wts set_from_flat(opt_pol_params) model.set_opt_state(opt_rms_wts) # Shrink LR search space if too many get rejected if lr_upper_bound is not None: rejections = np.sum(approx_kl > max_kl) / num_lr if rejections > 0.8: lr_upper_bound *= 0.8 if rejections == 0: lr_upper_bound *= 1.25 nseconds = time.time() - tstart # Calculate the fps (frame per second) fps = int((update * nbatch) / nseconds) if update % 
log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular("opt_lr", float(opt_lr)) logger.record_tabular("opt_ent_coeff", float(opt_ent_coeff)) logger.record_tabular("approx_kl", float(opt_kl)) if lr_upper_bound is not None: logger.record_tabular("rejections", rejections) logger.record_tabular("lr_ub", lr_upper_bound) logger.record_tabular( "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
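# ---------------------------------------------------------------------------
# Illustrative sketch of the weighted importance sampling (WIS) value that
# HOOF uses above to rank candidate learning rates without extra rollouts:
# each environment's nsteps-long segment gets a weight equal to the product
# of its per-step likelihood ratios under the candidate policy, and the
# estimate is the weight-normalised sum of segment rewards.  This is only a
# stand-in for wis_estimate, whose actual implementation lives elsewhere.
# ---------------------------------------------------------------------------
import numpy as np


def wis_value_sketch(nenvs, nsteps, rewards, lik_ratio):
    # rewards, lik_ratio: flat arrays of length nenvs * nsteps
    rewards = rewards.reshape(nenvs, nsteps)
    ratios = lik_ratio.reshape(nenvs, nsteps)
    seg_weights = np.prod(ratios, axis=1)                  # one weight per segment
    seg_returns = rewards.sum(axis=1)
    return np.sum(seg_weights * seg_returns) / (np.sum(seg_weights) + 1e-8)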
def ars_optimize(env, policy_func, perturb_mag, learning_rate, eval_epoch, params_per_thread, top_perturb, maxiter, policy_scope="pi", init_policy_params = None, eval_func = evaluate_performance, callback=None, skilldim=3, task_num=5, inner_iters=10 ): ob_space = env.observation_space ac_space = env.action_space from gym import spaces obs_dim_base = ob_space.low.shape[0] + skilldim high = np.inf * np.ones(obs_dim_base) low = -high skill_ob_space = spaces.Box(low, high) np.random.seed(MPI.COMM_WORLD.Get_rank() * 1023+1) pi = policy_func(policy_scope, skill_ob_space, ac_space) # Construct network for new policy pol_var_list = [v for v in pi.get_trainable_variables() if 'pol' in v.name and 'logstd' not in v.name] pol_var_size = np.sum([np.prod(v.shape) for v in pol_var_list]) print(pol_var_list) print('pol_var_size: ', pol_var_size) get_pol_flat = U.GetFlat(pol_var_list) set_pol_from_flat = U.SetFromFlat(pol_var_list) adam = MpiAdam(pol_var_list) U.initialize() adam.sync() if init_policy_params is not None: cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name.find('/')] orig_scope = list(init_policy_params.keys())[0][0:list(init_policy_params.keys())[0].find('/')] print(cur_scope, orig_scope) for i in range(len(pi.get_variables())): if pi.get_variables()[i].name.replace(cur_scope, orig_scope, 1) in init_policy_params: assign_op = pi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace(cur_scope, orig_scope, 1)]) tf.get_default_session().run(assign_op) # Used to construct the task-embedding mapping skill_optimizer = UPOptimizer(env, pi, skilldim, eval_num=1, verbose=False, bayesian_opt=True) param_dim = len(get_pol_flat()) episodes_so_far = 0 iters_so_far = 0 current_parameters = np.copy(get_pol_flat()) for it in range(maxiter): logger.log("********** Iteration %i ************" % it) if callback is not None: callback(locals(), globals()) current_parameters = np.copy(get_pol_flat()) perturbations = [] performances = [] all_obs = [] if it % inner_iters == 0: logger.log("==== Reconstruct task set at iteration %i ======" % it) # Construct the task_embedding first task_embeddings = [] for task_id in range(int(np.max([task_num / MPI.COMM_WORLD.Get_size(), 1]))): task_parameters = env.env.env.resample_task() optimized_embedding = None skill_optimizer.reset() if skilldim > 0: skill_optimizer.optimize(maxiter=20, max_steps=50000, custom_bound=[-1.0, 1.0]) optimized_embedding = skill_optimizer.best_x print(task_parameters, optimized_embedding, skill_optimizer.best_f) else: optimized_embedding = [] task_embeddings.append([task_parameters, optimized_embedding]) print(optimized_embedding) all_task_embeddings = MPI.COMM_WORLD.allgather(task_embeddings) task_embeddings = [] for te in all_task_embeddings: task_embeddings += te task_embeddings = task_embeddings[0:task_num] for sp in range(params_per_thread): sampled_perturbation = np.random.normal(0, 1, param_dim) positive_perf = [] negative_perf = [] for task_emb in task_embeddings: pert_param = current_parameters + sampled_perturbation * perturb_mag obs, positive_pert_performance, episodes = eval_func(env, pi, pert_param, set_pol_from_flat, eval_epoch, task_emb[0], task_emb[1]) all_obs += obs episodes_so_far += episodes positive_perf.append(positive_pert_performance) pert_param = current_parameters - sampled_perturbation * perturb_mag obs, negative_pert_performance, episodes = eval_func(env, pi, pert_param, set_pol_from_flat, eval_epoch, task_emb[0], task_emb[1]) all_obs += obs episodes_so_far += episodes 
negative_perf.append(negative_pert_performance) perturbations.append(sampled_perturbation) performances.append([np.mean(positive_perf), np.mean(negative_perf)]) all_performances = np.concatenate(MPI.COMM_WORLD.allgather(performances), axis=0) all_perturbations = np.concatenate(MPI.COMM_WORLD.allgather(perturbations), axis=0) max_perf_list = np.max(all_performances, axis=1) max_perf_list.sort() top_k_perf = max_perf_list[-top_perturb] parameter_update = current_parameters * 0 utilized_rewards = [] for sp in range(len(all_performances)): if np.max(all_performances[sp]) >= top_k_perf: parameter_update += all_perturbations[sp] * (all_performances[sp][0] - all_performances[sp][1]) utilized_rewards.append(all_performances[sp][0]) utilized_rewards.append(all_performances[sp][1]) parameter_update /= top_perturb lr_scaler = np.std(utilized_rewards) if hasattr(pi, "ob_rms"): pi.ob_rms.update(np.array(all_obs)) # update running mean/std for policy delta_parameters = parameter_update * learning_rate / lr_scaler current_parameters += delta_parameters set_pol_from_flat(current_parameters) total_episodes_sofar = np.sum(MPI.COMM_WORLD.allgather(episodes_so_far)) final_performance = [] for task_emb in task_embeddings: obs, perf, episodes = eval_func(env, pi, current_parameters, set_pol_from_flat, eval_epoch, task_emb[0], task_emb[1]) final_performance.append(perf) if MPI.COMM_WORLD.Get_rank()==0: logger.record_tabular('EpRewMean', np.mean(final_performance)) logger.record_tabular('EpisodesSoFar', total_episodes_sofar) logger.record_tabular('Parameter magnitude', np.linalg.norm(current_parameters)) logger.record_tabular('Delta magnitude', np.linalg.norm(delta_parameters)) logger.dump_tabular() iters_so_far += 1 print('Optimized parameter: ', current_parameters)
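# ---------------------------------------------------------------------------
# Illustrative sketch of the augmented-random-search style update performed
# above: keep the `top_perturb` directions whose better side scored highest,
# move along each by the gap between its positive and negative evaluation,
# and scale the step by the learning rate over the std of the rewards used.
# ---------------------------------------------------------------------------
import numpy as np


def ars_update(theta, deltas, pos_perf, neg_perf, top_perturb, learning_rate):
    # deltas: [num_dirs, dim]; pos_perf / neg_perf: [num_dirs]
    best_side = np.maximum(pos_perf, neg_perf)
    top_idx = np.argsort(best_side)[-top_perturb:]
    update = np.zeros_like(theta)
    used_rewards = []
    for i in top_idx:
        update += deltas[i] * (pos_perf[i] - neg_perf[i])
        used_rewards.extend([pos_perf[i], neg_perf[i]])
    update /= top_perturb
    return theta + learning_rate * update / (np.std(used_rewards) + 1e-8)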
def learn( *, network, env, eval_env, make_eval_env, env_id, total_timesteps, timesteps_per_batch, sil_update, sil_loss, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, lr=3e-4, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=5, sil_value=0.01, sil_alpha=0.6, sil_beta=0.1, max_episodes=0, max_iters=0, # time constraint callback=None, save_interval=0, load_path=None, # MBL # For train mbl mbl_train_freq=5, # For eval num_eval_episodes=5, eval_freq=5, vis_eval=False, #eval_targs=('mbmf',), eval_targs=('mf', ), quant=2, # For mbl.step #num_samples=(1500,), num_samples=(1, ), horizon=(2, ), #horizon=(2,1), #num_elites=(10,), num_elites=(1, ), mbl_lamb=(1.0, ), mbl_gamma=0.99, #mbl_sh=1, # Number of step for stochastic sampling mbl_sh=10000, #vf_lookahead=-1, #use_max_vf=False, reset_per_step=(0, ), # For get_model num_fc=2, num_fwd_hidden=500, use_layer_norm=False, # For MBL num_warm_start=int(1e4), init_epochs=10, update_epochs=5, batch_size=512, update_with_validation=False, use_mean_elites=1, use_ent_adjust=0, adj_std_scale=0.5, # For data loading validation_set_path=None, # For data collect collect_val_data=False, # For traj collect traj_collect='mf', # For profile measure_time=True, eval_val_err=False, measure_rew=True, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, vf_coef=0.5, max_grad_norm=0.5, log_interval=1, nminibatches=4, noptepochs=4, cliprange=0.2, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. 
See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if not isinstance(num_samples, tuple): num_samples = (num_samples, ) if not isinstance(horizon, tuple): horizon = (horizon, ) if not isinstance(num_elites, tuple): num_elites = (num_elites, ) if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, ) if not isinstance(reset_per_step, tuple): reset_per_step = (reset_per_step, ) if validation_set_path is None: if collect_val_data: validation_set_path = os.path.join(logger.get_dir(), 'val.pkl') else: validation_set_path = os.path.join('dataset', '{}-val.pkl'.format(env_id)) if eval_val_err: eval_val_err_path = os.path.join('dataset', '{}-combine-val.pkl'.format(env_id)) logger.log(locals()) logger.log('MBL_SH', mbl_sh) logger.log('Traj_collect', traj_collect) if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) policy = build_policy(env, network, value_network='copy', **network_kwargs) nenvs = env.num_envs np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * timesteps_per_batch nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) make_model = lambda: Model( policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=timesteps_per_batch, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, sil_update=sil_update, sil_value=sil_value, sil_alpha=sil_alpha, sil_beta=sil_beta, sil_loss=sil_loss, # fn_reward=env.process_reward, fn_reward=None, # fn_obs=env.process_obs, fn_obs=None, ppo=False, prev_pi='pi', silm=pi) model = make_model() with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) make_old_model = lambda: Model( policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=timesteps_per_batch, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, sil_update=sil_update, sil_value=sil_value, sil_alpha=sil_alpha, sil_beta=sil_beta, sil_loss=sil_loss, # fn_reward=env.process_reward, fn_reward=None, # fn_obs=env.process_obs, fn_obs=None, ppo=False, prev_pi='oldpi', silm=oldpi) old_model = make_old_model() # MBL # --------------------------------------- #viz = Visdom(env=env_id) win = None eval_targs = list(eval_targs) logger.log(eval_targs) make_model_f = get_make_mlp_model(num_fc=num_fc, num_fwd_hidden=num_fwd_hidden, layer_norm=use_layer_norm) mbl = MBL(env=eval_env, env_id=env_id, make_model=make_model_f, num_warm_start=num_warm_start, init_epochs=init_epochs, update_epochs=update_epochs, batch_size=batch_size, **network_kwargs) val_dataset = {'ob': None, 'ac': None, 'ob_next': None} if update_with_validation: logger.log('Update with validation') val_dataset = load_val_data(validation_set_path) if eval_val_err: logger.log('Log val error') eval_val_dataset = load_val_data(eval_val_err_path) if 
collect_val_data: logger.log('Collect validation data') val_dataset_collect = [] def _mf_pi(ob, t=None): stochastic = True ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) return ac, vpred def _mf_det_pi(ob, t=None): #ac, vpred, _, _ = pi.step(ob, stochastic=False) ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob) return ac, vpred def _mf_ent_pi(ob, t=None): mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob) ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape) return ac, vpred ################### use_ent_adjust======> adj_std_scale????????pi action sample def _mbmf_inner_pi(ob, t=0): if use_ent_adjust: return _mf_ent_pi(ob) else: #return _mf_pi(ob) if t < mbl_sh: return _mf_pi(ob) else: return _mf_det_pi(ob) # --------------------------------------- # Run multiple configuration once all_eval_descs = [] def make_mbmf_pi(n, h, e, l): def _mbmf_pi(ob): ac, rew = mbl.step(ob=ob, pi=_mbmf_inner_pi, horizon=h, num_samples=n, num_elites=e, gamma=mbl_gamma, lamb=l, use_mean_elites=use_mean_elites) return ac[None], rew return Policy(step=_mbmf_pi, reset=None) for n in num_samples: for h in horizon: for l in mbl_lamb: for e in num_elites: if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew', 'MBL_TRPO_SIL', make_mbmf_pi(n, h, e, l))) #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l))) if 'mf' in eval_targs: all_eval_descs.append( ('MeanRew', 'TRPO_SIL', Policy(step=_mf_pi, reset=None))) logger.log('List of evaluation targets') for it in all_eval_descs: logger.log(it[0]) pool = Pool(mp.cpu_count()) warm_start_done = False # ---------------------------------------- atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) 
compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- if traj_collect == 'mf': seg_gen = traj_segment_generator(env, timesteps_per_batch, model, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() if traj_collect == 'mf-random' or traj_collect == 'mf-mb': seg_mbl = seg_gen_mbl.__next__() else: seg_mbl = seg add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] # Val data collection if collect_val_data: for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...], ob[1:, 0, ...]): val_dataset_collect.append( (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_))) # ----------------------------- # MBL update else: ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"] mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...], ob_mbl[1:, 0, ...]) mbl.update_forward_dynamic(require_update=iters_so_far % mbl_train_freq == 0, ob_val=val_dataset['ob'], ac_val=val_dataset['ac'], ob_next_val=val_dataset['ob_next']) # ----------------------------- if traj_collect == 'mf': #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb': vpredbefore = seg[ "vpred"] # predicted value function before udpate model = seg["model"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) with timed("SIL"): lrnow = lr(1.0 - timesteps_so_far / total_timesteps) l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train( lrnow) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if sil_update > 0: logger.record_tabular("SilSamples", sil_samples) if rank == 0: # MBL evaluation if not collect_val_data: #set_global_seeds(seed) default_sess = tf.get_default_session() def multithread_eval_policy(env_, pi_, num_episodes_, vis_eval_, seed): with default_sess.as_default(): if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'): env_.ob_rms = env.ob_rms res = eval_policy(env_, pi_, num_episodes_, vis_eval_, seed, measure_time, measure_rew) try: env_.close() except: pass return res if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0: warm_start_done = mbl.is_warm_start_done() if num_eval_episodes > 0: targs_names = {} with timed('eval'): num_descs = len(all_eval_descs) list_field_names = [e[0] for e in all_eval_descs] list_legend_names = [e[1] for e in all_eval_descs] list_pis = [e[2] for e in all_eval_descs] list_eval_envs = [ make_eval_env() for _ in range(num_descs) ] list_seed = [seed for _ in range(num_descs)] list_num_eval_episodes = [ num_eval_episodes for _ in range(num_descs) ] print(list_field_names) 
print(list_legend_names) list_vis_eval = [ vis_eval for _ in range(num_descs) ] for i in range(num_descs): field_name, legend_name = list_field_names[ i], list_legend_names[i], res = multithread_eval_policy( list_eval_envs[i], list_pis[i], list_num_eval_episodes[i], list_vis_eval[i], seed) #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed)) #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results): perf, elapsed_time, eval_rew = res logger.record_tabular(field_name, perf) if measure_time: logger.record_tabular( 'Time-%s' % (field_name), elapsed_time) if measure_rew: logger.record_tabular( 'SimRew-%s' % (field_name), eval_rew) targs_names[field_name] = legend_name if eval_val_err: fwd_dynamics_err = mbl.eval_forward_dynamic( obs=eval_val_dataset['ob'], acs=eval_val_dataset['ac'], obs_next=eval_val_dataset['ob_next']) logger.record_tabular('FwdValError', fwd_dynamics_err) logger.dump_tabular() #print(logger.get_dir()) #print(targs_names) #if num_eval_episodes > 0: # win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best') # ----------- #logger.dump_tabular() yield pi if collect_val_data: with open(validation_set_path, 'wb') as f: pickle.dump(val_dataset_collect, f) logger.log('Save {} validation data'.format(len(val_dataset_collect)))
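# ---------------------------------------------------------------------------
# Illustrative sketch of the model-based lookahead behind mbl.step for the
# 'mbmf' evaluation targets above: sample num_samples action sequences of
# length `horizon` from the model-free policy, roll them through the learned
# dynamics model, score them with model reward plus a discounted terminal
# value, and return the first action of the elite set.  `dynamics_fn` and
# `reward_fn` are placeholders for MBL's learned model, which is defined
# elsewhere; the real mbl.step may differ in detail.
# ---------------------------------------------------------------------------
import numpy as np


def mbl_step_sketch(ob, pi_fn, dynamics_fn, reward_fn, horizon, num_samples,
                    num_elites, gamma):
    obs = np.repeat(ob[None], num_samples, axis=0)
    total_rew = np.zeros(num_samples)
    first_acs = None
    for t in range(horizon):
        acs, _ = pi_fn(obs, t)                        # candidate actions from the policy
        if t == 0:
            first_acs = acs
        total_rew += (gamma ** t) * reward_fn(obs, acs)
        obs = dynamics_fn(obs, acs)                   # predicted next states
    _, vpreds = pi_fn(obs, horizon)
    total_rew += (gamma ** horizon) * vpreds          # bootstrap with the critic
    elite_idx = np.argsort(total_rew)[-num_elites:]
    return first_acs[elite_idx].mean(axis=0), total_rew[elite_idx].mean()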
def learn(env_name, make_env, seed, make_policy, *, max_iters, horizon, drho, delta, gamma, multiple_init=None, sampler=None, feature_fun=None, iw_norm='none', bound_type='max-ess', max_offline_iters=10, save_weights=False, render_after=None, grid_size_1d=None, mu_min=None, mu_max=None, truncated_mise=True, delta_t=None, k=2, filename=None, find_optimal_arm=False, plot_bound=False, plot_ess_profile=False, trainable_std=False, rescale_ep_return=False): """ Learns a policy from scratch make_env: environment maker make_policy: policy maker horizon: max episode length delta: probability of failure gamma: discount factor max_iters: total number of learning iteration """ # Print options np.set_printoptions(precision=3) losses_with_name = [] # Build the environment env = make_env() ob_space = env.observation_space ac_space = env.action_space env.seed(seed) # Build the higher level target and behavioral policies pi = make_policy('pi', ob_space, ac_space) oldpi = make_policy('oldpi', ob_space, ac_space) logger.record_tabular('NumTrainableParams', int(pi._n_higher_params)) # Get all pi's learnable parameters all_var_list = pi.get_trainable_variables() var_list = \ [v for v in all_var_list if v.name.split('/')[1].startswith('higher')] shapes = [U.intprod(var.get_shape().as_list()) for var in var_list] d = sum(shapes) # Get all oldpi's learnable parameters all_var_list_old = oldpi.get_trainable_variables() var_list_old = \ [v for v in all_var_list_old if v.name.split('/')[1].startswith('higher')] # Get hyperpolicy's logstd higher_logstd_list = [pi.get_higher_logstd()] # My Placeholders actor_params_ = tf.placeholder(shape=[max_iters, pi._n_actor_weights], name='actor_params', dtype=tf.float32) last_actor_param_ = tf.placeholder(shape=(pi._n_actor_weights), name='last_actor_params', dtype=tf.float32) den_mise_log_ = tf.placeholder(shape=[max_iters], dtype=tf.float32, name='den_mise') renyi_bound_ = tf.placeholder(dtype=tf.float32, name='renyi_bound') ret_ = tf.placeholder(dtype=tf.float32, shape=(max_iters), name='ret') disc_ret_ = tf.placeholder(dtype=tf.float32, shape=(max_iters), name='disc_ret') n_ = tf.placeholder(dtype=tf.float32, name='iter_number') n_int = tf.cast(n_, dtype=tf.int32) mask_iters_ = tf.placeholder(dtype=tf.float32, shape=(max_iters), name='mask_iters') # grad_ = tf.placeholder(dtype=tf,.float32, # shape=(d, 1), name='grad') # Multiple importance weights (with balance heuristic) target_log_pdf = tf.reduce_sum( pi.pd.independent_logps(actor_params_), axis=1) behavioral_log_pdf = tf.reduce_sum( oldpi.pd.independent_logps(actor_params_), axis=1) behavioral_log_pdf_last_sample = tf.reduce_sum( oldpi.pd.independent_logps(last_actor_param_)) log_ratio = target_log_pdf - den_mise_log_ miw = tf.exp(log_ratio) * mask_iters_ den_mise_log_mean = tf.reduce_sum(den_mise_log_) / n_ den_mise_log_last = den_mise_log_[n_int-1] losses_with_name.extend([(den_mise_log_mean, 'DenMISEMeanLog'), (den_mise_log_[0], 'DenMISELogFirst'), (den_mise_log_last, 'DenMISELogLast'), (miw[0], 'IWFirstEpisode'), (miw[n_int-1], 'IWLastEpisode'), (tf.reduce_sum(miw)/n_, 'IWMean'), (tf.reduce_max(miw), 'IWMax'), (tf.reduce_min(miw), 'IWMin')]) # Return ep_return = disc_ret_ return_mean = tf.reduce_sum(ep_return) / n_ return_last = ep_return[n_int - 1] return_max = tf.reduce_max(ep_return[:n_int]) return_min = tf.reduce_min(ep_return[:n_int]) return_abs_max = tf.reduce_max(tf.abs(ep_return[:n_int])) regret = n_ * 5 - tf.reduce_sum(ep_return) regret_over_t = 5 - return_mean losses_with_name.extend([(return_mean, 
'ReturnMean'), (return_max, 'ReturnMax'), (return_min, 'ReturnMin'), (return_last, 'ReturnLastEpisode'), (return_abs_max, 'ReturnAbsMax'), (regret, 'Regret'), (regret_over_t, 'Regret/t')]) # Regret # Exponentiated Renyi divergence between the target and one behavioral renyi_component = pi.pd.renyi(oldpi.pd) renyi_component = tf.cond(tf.is_nan(renyi_component), lambda: tf.constant(np.inf), lambda: renyi_component) renyi_component = tf.cond(renyi_component < 0., lambda: tf.constant(np.inf), lambda: renyi_component) renyi_component = tf.exp(renyi_component) if truncated_mise: # Bound to d2(target || mixture of behaviorals)/n mn = tf.sqrt((n_**2 * renyi_bound_) / tf.log(1 / delta)) mn_broadcasted = \ tf.ones(shape=miw.get_shape().as_list(), dtype=np.float32) * mn min = tf.where(tf.less(miw, mn_broadcasted), miw, mn_broadcasted) mise = tf.reduce_sum(min * ep_return * mask_iters_)/n_ else: # MISE mise = tf.reduce_sum(miw * ep_return * mask_iters_)/n_ losses_with_name.append((mise, 'MISE')) # Bounds if delta_t == 'continuous': tau = tf.ceil(n_**(1 / k)) delta_cst = delta delta = 6 * delta / ((np.pi * n_)**2 * (1 + tau**d)) elif delta_t == 'discrete': delta_cst = delta delta = 3 * delta / ((np.pi * n_)**2 * grid_size_1d) elif delta_t is None: grid_size_1d = 100 # ToDo correggiiiiiiiiiiiiiiii delta_cst = delta delta = tf.constant(delta) else: raise NotImplementedError losses_with_name.append((delta, 'Delta')) if bound_type == 'J': bound = mise elif bound_type == 'max-renyi': if truncated_mise: const = return_abs_max * (np.sqrt(2) + 1 / 3) \ * tf.sqrt(tf.log(1 / delta)) exploration_bonus = const * tf.sqrt(renyi_bound_) bound = mise + exploration_bonus else: const = return_abs_max * tf.sqrt(1 / delta - 1) exploration_bonus = const * tf.sqrt(renyi_bound_) bound = mise + exploration_bonus else: raise NotImplementedError losses_with_name.append((mise, 'BoundMISE')) losses_with_name.append((exploration_bonus, 'BoundBonus')) losses_with_name.append((bound, 'Bound')) # ESS estimation by d2 ess_d2 = n_ / renyi_bound_ # ESS estimation by miw norms eps = 1e-18 # for eps<1e-18 miw_2=0 if weights are zero miw_ess = (tf.exp(log_ratio) + eps) * mask_iters_ miw_1 = tf.linalg.norm(miw_ess, ord=1) miw_2 = tf.linalg.norm(miw_ess, ord=2) ess_miw = miw_1**2 / miw_2**2 # Infos losses, loss_names = map(list, zip(*losses_with_name)) # TF functions set_parameters = U.SetFromFlat(var_list) get_parameters = U.GetFlat(var_list) set_parameters_old = U.SetFromFlat(var_list_old) # set_higher_logstd = U.SetFromFlat(higher_logstd_list) # set_higher_logstd(np.log([0.15, 0.2])) compute_behav = U.function( [actor_params_], behavioral_log_pdf) compute_behav_last_sample = U.function( [last_actor_param_], behavioral_log_pdf_last_sample) compute_renyi = U.function( [], renyi_component) compute_bound = U.function( [actor_params_, disc_ret_, ret_, n_, mask_iters_, den_mise_log_, renyi_bound_], bound) compute_grad = U.function( [actor_params_, disc_ret_, ret_, n_, mask_iters_, den_mise_log_, renyi_bound_], U.flatgrad(bound, var_list)) compute_return_mean = U.function( [actor_params_, disc_ret_, ret_, n_, mask_iters_], return_mean) compute_losses = U.function( [actor_params_, disc_ret_, ret_, n_, mask_iters_, den_mise_log_, renyi_bound_], losses) compute_roba = U.function( [actor_params_, disc_ret_, ret_, n_, mask_iters_, den_mise_log_, renyi_bound_], [mise, exploration_bonus, ess_d2, ess_miw]) # Tf initialization U.initialize() # Store behaviorals' params and their trajectories old_rhos_list = [] all_eps = {} all_eps['actor_params'] = 
np.zeros(shape=[max_iters, pi._n_actor_weights]) all_eps['disc_ret'] = np.zeros(max_iters) all_eps['ret'] = np.zeros(max_iters) mask_iters = np.zeros(max_iters) # Set learning loop variables den_mise = np.zeros(mask_iters.shape).astype(np.float32) if delta_t == 'continuous': renyi_components_sum = None else: renyi_components_sum = np.zeros(grid_size_1d**d) new_grid = True grid_size_1d_old = 0 iters_so_far = 0 lens = [] tstart = time.time() # Sample actor's params before entering the learning loop rho = get_parameters() theta = pi.resample() all_eps['actor_params'][iters_so_far, :] = theta # Establish grid dimension if needed grid_dimension = ob_space.shape[0] # Learning loop ########################################################### while True: iters_so_far += 1 mask_iters[:iters_so_far] = 1 # Render one episode if render_after is not None and iters_so_far % render_after == 0: if hasattr(env, 'render'): render(env, pi, horizon) # Exit loop in the end if iters_so_far - 1 >= max_iters: print('Finished...') break # Learning iteration logger.log('********** Iteration %i ************' % iters_so_far) # Generate one trajectory with timed('sampling'): # Sample a trajectory with the newly parametrized actor ret, disc_ret, ep_len = eval_trajectory( env, pi, gamma, horizon, feature_fun, rescale_ep_return) all_eps['ret'][iters_so_far-1] = ret all_eps['disc_ret'][iters_so_far-1] = disc_ret lens.append(ep_len) # Store the parameters of the behavioral hyperpolicy old_rhos_list.append(rho) with timed('summaries before'): logger.record_tabular("Iteration", iters_so_far) logger.record_tabular("NumTrajectories", iters_so_far) logger.record_tabular("TimestepsSoFar", np.sum(lens)) logger.record_tabular('AvgEpLen', np.mean(lens)) logger.record_tabular('MinEpLen', np.min(lens)) logger.record_tabular("TimeElapsed", time.time() - tstart) # Save policy parameters to disk if save_weights: logger.record_tabular('Weights', str(get_parameters())) import pickle file = open('checkpoint.pkl', 'wb') pickle.dump(rho, file) # Tensor evaluations def evaluate_behav(): return compute_behav(all_eps['actor_params']) def evaluate_behav_last_sample(): args_behav_last = [all_eps['actor_params'][iters_so_far - 1]] return compute_behav_last_sample(*args_behav_last) def evaluate_renyi_component(): return compute_renyi() args = all_eps['actor_params'], all_eps['disc_ret'], \ all_eps['ret'], iters_so_far, mask_iters def evaluate_bound(den_mise_log, renyi_bound): args_bound = args + (den_mise_log, renyi_bound, ) return compute_bound(*args_bound) def evaluate_grad(den_mise_log, renyi_bound): args_grad = args + (den_mise_log, renyi_bound, ) return compute_grad(*args_grad) def evaluate_roba(den_mise_log, renyi_bound): args_roba = args + (den_mise_log, renyi_bound, ) return compute_roba(*args_roba) if bound_type == 'J': evaluate_renyi = None elif bound_type == 'max-renyi': evaluate_renyi = evaluate_renyi_component else: raise NotImplementedError with timed("Optimization"): if find_optimal_arm: pass elif multiple_init: bound = 0 improvement = 0 check = False for i in range(multiple_init): rho_init = [np.arctanh(np.random.uniform( pi.min_mean, pi.max_mean))] rho_i, improvement_i, den_mise_log_i, bound_i = \ optimize_offline(evaluate_roba, pi, rho_init, drho, old_rhos_list, iters_so_far, mask_iters, set_parameters, set_parameters_old, evaluate_behav, evaluate_renyi, evaluate_bound, evaluate_grad, max_offline_ite=max_offline_iters) if bound_i > bound: check = True rho = rho_i improvement = improvement_i den_mise_log = den_mise_log_i if not 
check: den_mise_log = den_mise_log_i else: if delta_t == 'continuous': grid_size_1d = int(np.ceil(iters_so_far**(1 / k))) if grid_size_1d > grid_size_1d_old: new_grid = True renyi_components_sum = np.zeros(grid_size_1d**d) grid_size_1d_old = grid_size_1d rho, improvement, den_mise_log, den_mise, \ renyi_components_sum, renyi_bound = \ best_of_grid(pi, grid_size_1d, mu_min, mu_max, grid_dimension, trainable_std, rho, old_rhos_list, iters_so_far, mask_iters, set_parameters, set_parameters_old, delta_cst, renyi_components_sum, evaluate_behav, den_mise, evaluate_behav_last_sample, evaluate_bound, evaluate_renyi, evaluate_roba, filename, plot_bound, plot_ess_profile, delta_t, new_grid) new_grid = False set_parameters(rho) with timed('summaries after'): # Sample actor's parameters from hyperpolicy and assign to actor if iters_so_far < max_iters: theta = pi.resample() all_eps['actor_params'][iters_so_far, :] = theta if env.spec is not None: if env.spec.id == 'LQG1D-v0': mu1_actor = pi.eval_actor_mean([[1]])[0][0] mu1_higher = pi.eval_higher_mean()[0] sigma = pi.eval_higher_std()[0] logger.record_tabular("LQGmu1_actor", mu1_actor) logger.record_tabular("LQGmu1_higher", mu1_higher) logger.record_tabular("LQGsigma_higher", sigma) elif env.spec.id == 'MountainCarContinuous-v0': ac1 = pi.eval_actor_mean([[1, 1]])[0][0] mu1_higher = pi.eval_higher_mean() sigma = pi.eval_higher_std() logger.record_tabular("ActionIn1", ac1) logger.record_tabular("MountainCar_mu0_higher", mu1_higher[0]) logger.record_tabular("MountainCar_mu1_higher", mu1_higher[1]) logger.record_tabular("MountainCar_std0_higher", sigma[0]) logger.record_tabular("MountainCar_std1_higher", sigma[1]) elif env.id is not None: if env.id == 'inverted_pendulum': ac1 = pi.eval_actor_mean([[1, 1, 1, 1]])[0][0] mu1_higher = pi.eval_higher_mean() sigma = pi.eval_higher_std() logger.record_tabular("ActionIn1", ac1) logger.record_tabular("InvPendulum_mu0_higher", mu1_higher[0]) logger.record_tabular("InvPendulum_mu1_higher", mu1_higher[1]) logger.record_tabular("InvPendulum_mu2_higher", mu1_higher[2]) logger.record_tabular("InvPendulum_mu3_higher", mu1_higher[3]) logger.record_tabular("InvPendulum_std0_higher", sigma[0]) logger.record_tabular("InvPendulum_std1_higher", sigma[1]) logger.record_tabular("InvPendulum_std2_higher", sigma[2]) logger.record_tabular("InvPendulum_std3_higher", sigma[3]) if find_optimal_arm: ret_mean = compute_return_mean(*args) logger.record_tabular('ReturnMean', ret_mean) else: args_losses = args + (den_mise_log, renyi_bound, ) meanlosses = np.array(compute_losses(*args_losses)) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) # Print all info in a table logger.dump_tabular() # Close environment in the end env.close()
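# --- Illustrative NumPy sketch (not from the original code) of the effective sample
# size estimate computed above as ess_miw = ||w||_1^2 / ||w||_2^2, where w are the
# multiple importance weights. With non-negative weights this lies in [1, n]; a small
# value signals that the MISE estimate is dominated by a few episodes.
import numpy as np

def effective_sample_size(weights, eps=1e-18):
    # eps mirrors the guard used above so that all-zero weights do not divide by zero
    w = np.asarray(weights, dtype=np.float64) + eps
    return np.sum(w) ** 2 / np.sum(w ** 2)

# Example: uniform weights give ESS == n, a single dominant weight gives ESS close to 1.
assert np.isclose(effective_sample_size(np.ones(10)), 10.0)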
def learn(env, policy_fn, *, timesteps_per_batch, # what to train on epsilon, beta, cg_iters, gamma, lam, # advantage estimation trial, sess, method, entcoeff=0.0, cg_damping=1e-2, kl_target=0.01, crosskl_coeff=0.01, vf_stepsize=3e-4, vf_iters =3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, TRPO=False ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- total_space = env.total_space ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space, ob_name="ob") oldpi = policy_fn("oldpi", ob_space, ac_space, ob_name="ob") gpi = policy_fn("gpi", total_space, ac_space, ob_name="gob") goldpi = policy_fn("goldpi", total_space, ac_space, ob_name="gob") atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return gatarg = tf.placeholder(dtype=tf.float32, shape=[None]) gret = tf.placeholder(dtype=tf.float32, shape=[None]) ob = U.get_placeholder_cached(name="ob") gob = U.get_placeholder_cached(name='gob') ac = pi.pdtype.sample_placeholder([None]) crosskl_c = tf.placeholder(dtype=tf.float32, shape=[]) # crosskl_c = 0.01 kloldnew = oldpi.pd.kl(pi.pd) gkloldnew = goldpi.pd.kl(gpi.pd) #TODO: check if it can work in this way # crosskl_ob = pi.pd.kl(goldpi.pd) # crosskl_gob = gpi.pd.kl(oldpi.pd) crosskl_gob = pi.pd.kl(gpi.pd) crosskl_ob = gpi.pd.kl(pi.pd) # crosskl pdmean = pi.pd.mean pdstd = pi.pd.std gpdmean = gpi.pd.mean gpdstd = gpi.pd.std ent = pi.pd.entropy() gent = gpi.pd.entropy() old_entropy = oldpi.pd.entropy() gold_entropy = goldpi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) meancrosskl = tf.reduce_mean(crosskl_ob) # meancrosskl = tf.maximum(tf.reduce_mean(crosskl_ob - 100), 0) gmeankl = tf.reduce_mean(gkloldnew) gmeanent = tf.reduce_mean(gent) gmeancrosskl = tf.reduce_mean(crosskl_gob) vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) gvferr = tf.reduce_mean(tf.square(gpi.vpred - gret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold gratio = tf.exp(gpi.pd.logp(ac) - goldpi.pd.logp(ac)) # Ratio objective # surrgain = tf.reduce_mean(ratio * atarg) # gsurrgain = tf.reduce_mean(gratio * gatarg) # Log objective surrgain = tf.reduce_mean(pi.pd.logp(ac) * atarg) gsurrgain = tf.reduce_mean(gpi.pd.logp(ac) * gatarg) # optimgain = surrgain + crosskl_c * meancrosskl optimgain = surrgain losses = [optimgain, meankl, meancrosskl, surrgain, meanent, tf.reduce_mean(ratio)] loss_names = ["optimgain", "meankl", "meancrosskl", "surrgain", "entropy", "ratio"] # goptimgain = gsurrgain + crosskl_c * gmeancrosskl goptimgain = gsurrgain glosses = [goptimgain, gmeankl, gmeancrosskl, gsurrgain, gmeanent, tf.reduce_mean(gratio)] gloss_names = ["goptimgain", "gmeankl","gmeancrosskl", "gsurrgain", "gentropy", "gratio"] dist = meankl gdist = gmeankl all_pi_var_list = pi.get_trainable_variables() all_var_list = [v for v in all_pi_var_list if v.name.split("/")[0].startswith("pi")] var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] vfadam = MpiAdam(vf_var_list) poladam = MpiAdam(var_list) gall_gpi_var_list = gpi.get_trainable_variables() gall_var_list = [v for v in gall_gpi_var_list if v.name.split("/")[0].startswith("gpi")] gvar_list = [v for v in 
gall_var_list if v.name.split("/")[1].startswith("pol")] gvf_var_list = [v for v in gall_var_list if v.name.split("/")[1].startswith("vf")] gvfadam = MpiAdam(gvf_var_list) # gpoladpam = MpiAdam(gvar_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) # crossklgrads = tf.gradients(meancrosskl, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) gget_flat = U.GetFlat(gvar_list) gset_from_flat = U.SetFromFlat(gvar_list) gklgrads = tf.gradients(gdist, gvar_list) # gcrossklgrads = tf.gradients(gmeancrosskl, gvar_list) gflat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="gflat_tan") gshapes = [var.get_shape().as_list() for var in gvar_list] gstart = 0 gtangents = [] for shape in gshapes: sz = U.intprod(shape) gtangents.append(tf.reshape(gflat_tangent[gstart:gstart+sz], shape)) gstart += sz ggvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(gklgrads, gtangents)]) #pylint: disable=E1111 gfvp = U.flatgrad(ggvp, gvar_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) gassign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(goldpi.get_variables(), gpi.get_variables())]) compute_losses = U.function([crosskl_c, gob, ob, ac, atarg], losses) compute_lossandgrad = U.function([crosskl_c, gob, ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) compute_crossklandgrad = U.function([ob, gob],U.flatgrad(meancrosskl, var_list)) gcompute_losses = U.function([crosskl_c, ob, gob, ac, gatarg], glosses) gcompute_lossandgrad = U.function([crosskl_c, ob, gob, ac, gatarg], glosses + [U.flatgrad(goptimgain, gvar_list)]) gcompute_fvp = U.function([gflat_tangent, gob, ac, gatarg], gfvp) gcompute_vflossandgrad = U.function([gob, gret], U.flatgrad(gvferr, gvf_var_list)) # compute_gcrossklandgrad = U.function([gob, ob], U.flatgrad(gmeancrosskl, gvar_list)) saver = tf.train.Saver() @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() guided_initilizer(gpol=gvar_list, gvf=gvf_var_list, fpol=var_list, fvf=vf_var_list) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() poladam.sync() print("Init final policy param sum", th_init.sum(), flush=True) gth_init = gget_flat() MPI.COMM_WORLD.Bcast(gth_init, root=0) gset_from_flat(gth_init) gvfadam.sync() # gpoladpam.sync() print("Init guided policy param sum", gth_init.sum(), flush=True) # Initialize eta, omega optimizer init_eta = 0.5 init_omega = 2.0 eta_omega_optimizer = EtaOmegaOptimizer(beta, epsilon, init_eta, init_omega) # Prepare for rollouts # 
---------------------------------------- seg_gen = traj_segment_generator(pi, gpi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() num_iters = max_timesteps // timesteps_per_batch lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] gob, gatarg, gtdlamret = seg["gob"], seg["gadv"], seg["gtdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate gvpredbefore = seg["gvpred"] atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate gatarg = (gatarg - gatarg.mean()) / gatarg.std() if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy if hasattr(gpi, "ret_rms"): gpi.ret_rms.update(gtdlamret) if hasattr(gpi, "ob_rms"): gpi.ob_rms.update(gob) args = crosskl_coeff, seg["gob"], seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args[2:]] gargs = crosskl_coeff, seg["ob"], seg["gob"], seg["ac"], gatarg gfvpargs = [arr[::5] for arr in gargs[2:]] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p def gfisher_vector_product(p): return allmean(gcompute_fvp(p, *gfvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values gassign_old_eq_new() with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) *glossbefore, gg = gcompute_lossandgrad(*gargs) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) glossbefore = allmean(np.array(glossbefore)) gg = allmean(gg) if np.allclose(g, 0) or np.allclose(gg, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) gstepdir = cg(gfisher_vector_product, gg, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(gstepdir).all() assert np.isfinite(stepdir).all() if TRPO: # # TRPO specific code. # Find correct step size using line search # #TODO: also enable guided learning for TRPO shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / epsilon) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > epsilon * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) else: # # COPOS specific implementation. # copos_update_dir = stepdir gcopos_update_dir = gstepdir # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts w_theta, w_beta = pi.split_w(copos_update_dir) gw_theta, gw_beta = gpi.split_w(gcopos_update_dir) # q_beta(s,a) = \grad_beta \log \pi(a|s) * w_beta # = features_beta(s) * K^T * Prec * a # q_beta = self.target.get_q_beta(features_beta, actions) Waa, Wsa = pi.w2W(w_theta) wa = pi.get_wa(ob, w_beta) gWaa, gWsa = gpi.w2W(gw_theta) gwa = gpi.get_wa(gob, gw_beta) varphis = pi.get_varphis(ob) gvarphis = gpi.get_varphis(gob) # Optimize eta and omega tmp_ob = np.zeros((1,) + ob_space.shape) # We assume that entropy does not depend on the NN old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0] eta, omega = eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent) logger.log("Initial eta of final policy: " + str(eta) + " and omega: " + str(omega)) gtmp_ob = np.zeros((1,) + total_space.shape) gold_ent = gold_entropy.eval({goldpi.ob: gtmp_ob})[0] geta, gomega = eta_omega_optimizer.optimize(gw_theta, gWaa, gWsa, gwa, gvarphis, gpi.get_kt(), gpi.get_prec_matrix(), gpi.is_new_policy_valid, gold_ent) logger.log("Initial eta of guided policy: " + str(geta) + " and omega: " + str(gomega)) current_theta_beta = get_flat() prev_theta, prev_beta = pi.all_to_theta_beta(current_theta_beta) gcurrent_theta_beta = gget_flat() gprev_theta, gprev_beta = gpi.all_to_theta_beta(gcurrent_theta_beta) for i in range(2): # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, epsilon, args) logger.log("Updated eta of final policy, eta: " + str(eta) + " and omega: " + str(omega)) # Find proper omega for new eta. Use old policy parameters first. 
set_from_flat(pi.theta_beta_to_all(prev_theta, prev_beta)) eta, omega = \ eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta) logger.log("Updated omega of final policy, eta: " + str(eta) + " and omega: " + str(omega)) geta = eta_search(gw_theta, gw_beta, geta, gomega, allmean, gcompute_losses, gget_flat, gset_from_flat, gpi, epsilon, gargs) logger.log("updated eta of guided policy, eta:" + str(geta) + "and omega:" + str(gomega)) gset_from_flat(gpi.theta_beta_to_all(gprev_theta, gprev_beta)) geta, gomega = eta_omega_optimizer.optimize(gw_theta, gWaa, gWsa, gwa, gvarphis, gpi.get_kt(), gpi.get_prec_matrix(), gpi.is_new_policy_valid, gold_ent, geta) logger.log("Updated omega of guided policy, eta:" + str(geta) + "and omega:" + str(gomega)) # Use final policy logger.log("Final eta of final policy: " + str(eta) + " and omega: " + str(omega)) logger.log("Final eta of guided policy: " + str(geta) + "and omega:" + str(gomega)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + w_beta.reshape(-1, ) / eta set_from_flat(pi.theta_beta_to_all(cur_theta, cur_beta)) gcur_theta = (geta * gprev_theta + gw_theta.reshape(-1, )) / (geta + gomega) gcur_beta = gprev_beta + gw_beta.reshape(-1, ) / geta gset_from_flat(gpi.theta_beta_to_all(gcur_theta, gcur_beta)) meanlosses = surr, kl, crosskl, *_ = allmean(np.array(compute_losses(*args))) gmeanlosses = gsurr, gkl, gcrosskl, *_ = allmean(np.array(gcompute_losses(*gargs))) # poladam.update(allmean(compute_crossklandgrad(ob, gob)), vf_stepsize) # gpoladpam.update(allmean(compute_gcrossklandgrad(gob, ob)), vf_stepsize) for _ in range(vf_iters): for (mbob, mbgob) in dataset.iterbatches((seg["ob"], seg["gob"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_crossklandgrad(mbob, mbgob)) poladam.update(g, vf_stepsize) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) for (lossname, lossval) in zip(gloss_names, gmeanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) for (mbob, mbret) in dataset.iterbatches((seg["gob"], seg["gtdlamret"]), include_final_partial_batch=False, batch_size=64): gg = allmean(gcompute_vflossandgrad(mbob, mbret)) gvfadam.update(gg, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) logger.record_tabular("gev_tdlam_before", explained_variance(gvpredbefore, gtdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("CrossKLCoeff :", crosskl_coeff) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("Name", method) logger.record_tabular("Iteration", iters_so_far) logger.record_tabular("trial", trial) if 
rank==0: logger.dump_tabular() if iters_so_far % 100 == 0 or iters_so_far == 1 or iters_so_far == num_iters: # sess = tf.get_default_session() checkdir = get_dir(osp.join(logger.get_dir(), 'checkpoints')) savepath = osp.join(checkdir, '%.5i.ckpt'%iters_so_far) saver.save(sess, save_path=savepath) print("save model to path:", savepath)
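# --- Schematic, framework-agnostic version (an illustration, not the original helper)
# of the TRPO backtracking line search used in the TRPO branch of the function above:
# halve the step until the surrogate improves and the mean KL stays within 1.5 * epsilon.
# The callable eval_surr_and_kl(theta) -> (surrogate, mean_kl) is a hypothetical stand-in
# for setting the flat parameters and calling compute_losses.
import numpy as np

def backtracking_line_search(theta0, fullstep, eval_surr_and_kl, epsilon,
                             surr_before, max_backtracks=10):
    stepsize = 1.0
    for _ in range(max_backtracks):
        theta = theta0 + stepsize * fullstep
        surr, kl = eval_surr_and_kl(theta)
        if np.isfinite([surr, kl]).all() and kl <= 1.5 * epsilon and surr > surr_before:
            return theta  # accepted step
        stepsize *= 0.5
    return theta0  # no acceptable step found; keep the old parameters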
def run_hoof_no_lamgam( network, env, total_timesteps, timesteps_per_batch, # what to train on kl_range, gamma_range, lam_range, # advantage estimation num_kl, num_gamma_lam, cg_iters=10, seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' MPI = None nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space # +2 for gamma, lambda ob = tf.placeholder(shape=(None, env.observation_space.shape[0] + 2), dtype=env.observation_space.dtype, name='Ob') with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = 
tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_ratio = U.function( [ob, ac, atarg], ratio) # IS ratio - used for computing IS weights compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator_with_gl(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' kl_range = np.atleast_1d(kl_range) gamma_range = np.atleast_1d(gamma_range) lam_range = np.atleast_1d(lam_range) while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() thbefore = get_flat() rand_gamma = gamma_range[0] + ( gamma_range[-1] - gamma_range[0]) * np.random.rand(num_gamma_lam) rand_lam = lam_range[0] + ( lam_range[-1] - lam_range[0]) * np.random.rand(num_gamma_lam) rand_kl = kl_range[0] + (kl_range[-1] - kl_range[0]) * np.random.rand(num_kl) opt_polval = -10**8 est_polval = np.zeros((num_gamma_lam, num_kl)) ob_lam_gam = [] tdlamret = [] vpred = [] for gl in range(num_gamma_lam): oblg, vpredbefore, atarg, tdlr = add_vtarg_and_adv_without_gl( pi, seg, rand_gamma[gl], rand_lam[gl]) ob_lam_gam += [oblg] tdlamret += [tdlr] vpred += [vpredbefore] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate pol_ob = np.concatenate( (seg['ob'], np.zeros(seg['ob'].shape[:-1] + (2, ))), axis=-1) args = pol_ob, seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) 
+ cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=False) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) surrbefore = lossbefore[0] for m, kl in enumerate(rand_kl): lm = np.sqrt(shs / kl) fullstep = stepdir / lm thnew = thbefore + fullstep set_from_flat(thnew) # compute the IS estimates lik_ratio = compute_ratio(*args) est_polval[gl, m] = wis_estimate(seg, lik_ratio) # update best policy found so far if est_polval[gl, m] > opt_polval: opt_polval = est_polval[gl, m] opt_th = thnew opt_kl = kl opt_gamma = rand_gamma[gl] opt_lam = rand_lam[gl] opt_vpredbefore = vpredbefore opt_tdlr = tdlr meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore expectedimprove = g.dot(fullstep) set_from_flat(thbefore) logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) set_from_flat(opt_th) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) ob_lam_gam = np.concatenate(ob_lam_gam, axis=0) tdlamret = np.concatenate(tdlamret, axis=0) vpred = np.concatenate(vpred, axis=0) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (ob_lam_gam, tdlamret), include_final_partial_batch=False, batch_size=num_gamma_lam * 64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpred, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("Opt_KL", opt_kl) logger.record_tabular("gamma", opt_gamma) logger.record_tabular("lam", opt_lam) if rank == 0: logger.dump_tabular() return pi
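# --- The exact wis_estimate helper used above is not shown here. For reference, the
# standard per-episode weighted importance sampling (WIS) estimate of a candidate
# policy's value from episode returns R_i and per-episode likelihood ratios w_i is
# V_hat = sum_i(w_i * R_i) / sum_i(w_i). A minimal NumPy sketch under that assumption:
import numpy as np

def wis_value_estimate(episode_returns, episode_ratios):
    w = np.asarray(episode_ratios, dtype=np.float64)
    r = np.asarray(episode_returns, dtype=np.float64)
    return np.sum(w * r) / (np.sum(w) + 1e-12)  # small constant avoids 0/0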
def learn( env, make_policy, *, n_episodes, horizon, delta, gamma, max_iters, sampler=None, use_natural_gradient=False, #can be 'exact', 'approximate' fisher_reg=1e-2, iw_method='is', iw_norm='none', bound='J', line_search_type='parabola', save_weights=0, improvement_tol=0., center_return=False, render_after=None, max_offline_iters=100, callback=None, clipping=False, entropy='none', positive_return=False, reward_clustering='none', capacity=10): np.set_printoptions(precision=3) max_samples = horizon * n_episodes if line_search_type == 'binary': line_search = line_search_binary elif line_search_type == 'parabola': line_search = line_search_parabola else: raise ValueError() # Building the environment ob_space = env.observation_space ac_space = env.action_space # Creating the memory buffer memory = Memory(capacity=capacity, batch_size=n_episodes, horizon=horizon, ob_space=ob_space, ac_space=ac_space) # Building the target policy and saving its parameters pi = make_policy('pi', ob_space, ac_space) all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split('/')[1].startswith('pol') ] shapes = [U.intprod(var.get_shape().as_list()) for var in var_list] n_parameters = sum(shapes) # Building a set of behavioral policies behavioral_policies = memory.build_policies(make_policy, pi) # Placeholders ob_ = ob = U.get_placeholder_cached(name='ob') ac_ = pi.pdtype.sample_placeholder([None], name='ac') mask_ = tf.placeholder(dtype=tf.float32, shape=(None), name='mask') rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='rew') disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='disc_rew') clustered_rew_ = tf.placeholder(dtype=tf.float32, shape=(None)) gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient') iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number') active_policies = tf.placeholder(dtype=tf.float32, shape=(capacity), name='active_policies') losses_with_name = [] # Total number of trajectories N_total = tf.reduce_sum(active_policies) * n_episodes # Split operations disc_rew_split = tf.reshape(disc_rew_ * mask_, [-1, horizon]) rew_split = tf.reshape(rew_ * mask_, [-1, horizon]) mask_split = tf.reshape(mask_, [-1, horizon]) # Policy densities target_log_pdf = pi.pd.logp(ac_) * mask_ target_log_pdf_split = tf.reshape(target_log_pdf, [-1, horizon]) behavioral_log_pdfs = tf.stack([ bpi.pd.logp(ac_) * mask_ for bpi in memory.policies ]) # Shape is (capacity, ntraj*horizon) behavioral_log_pdfs_split = tf.reshape(behavioral_log_pdfs, [memory.capacity, -1, horizon]) # Compute renyi divergencies and sum over time, then exponentiate emp_d2_split = tf.reshape( tf.stack([pi.pd.renyi(bpi.pd, 2) * mask_ for bpi in memory.policies]), [memory.capacity, -1, horizon]) emp_d2_split_cum = tf.exp(tf.reduce_sum(emp_d2_split, axis=2)) # Compute arithmetic and harmonic mean of emp_d2 emp_d2_mean = tf.reduce_mean(emp_d2_split_cum, axis=1) emp_d2_arithmetic = tf.reduce_sum( emp_d2_mean * active_policies) / tf.reduce_sum(active_policies) emp_d2_harmonic = tf.reduce_sum(active_policies) / tf.reduce_sum( 1 / emp_d2_mean) # Return processing: clipping, centering, discounting ep_return = clustered_rew_ #tf.reduce_sum(mask_split * disc_rew_split, axis=1) if clipping: rew_split = tf.clip_by_value(rew_split, -1, 1) if center_return: ep_return = ep_return - tf.reduce_mean(ep_return) rew_split = rew_split - (tf.reduce_sum(rew_split) / (tf.reduce_sum(mask_split) + 1e-24)) discounter = [pow(gamma, i) for i in range(0, horizon)] # 
Decreasing gamma discounter_tf = tf.constant(discounter) disc_rew_split = rew_split * discounter_tf # Reward statistics return_mean = tf.reduce_mean(ep_return) return_std = U.reduce_std(ep_return) return_max = tf.reduce_max(ep_return) return_min = tf.reduce_min(ep_return) return_abs_max = tf.reduce_max(tf.abs(ep_return)) return_step_max = tf.reduce_max(tf.abs(rew_split)) # Max step reward return_step_mean = tf.abs(tf.reduce_mean(rew_split)) positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split)) negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split)) return_step_maxmin = tf.abs(positive_step_return_max - negative_step_return_max) losses_with_name.extend([(return_mean, 'InitialReturnMean'), (return_max, 'InitialReturnMax'), (return_min, 'InitialReturnMin'), (return_std, 'InitialReturnStd'), (emp_d2_arithmetic, 'EmpiricalD2Arithmetic'), (emp_d2_harmonic, 'EmpiricalD2Harmonic'), (return_step_max, 'ReturnStepMax'), (return_step_maxmin, 'ReturnStepMaxmin')]) if iw_method == 'is': # Sum the log prob over time. Shapes: target(Nep, H), behav (Cap, Nep, H) target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1) behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdfs_split, axis=2) # To avoid numerical instability, compute the inversed ratio log_inverse_ratio = behavioral_log_pdf_episode - target_log_pdf_episode abc = tf.exp(log_inverse_ratio) * tf.expand_dims(active_policies, -1) iw = 1 / tf.reduce_sum( tf.exp(log_inverse_ratio) * tf.expand_dims(active_policies, -1), axis=0) # Get the probability by exponentiation #target_pdf_episode = tf.exp(target_log_pdf_episode) #behavioral_pdf_episode = tf.exp(behavioral_log_pdf_episode) # Get the denominator by averaging over behavioral policies #behavioral_pdf_mixture = tf.reduce_mean(behavioral_pdf_episode, axis=0) + 1e-24 #iw = target_pdf_episode / behavioral_pdf_mixture iwn = iw / n_episodes # Compute the J w_return_mean = tf.reduce_sum(ep_return * iwn) # Empirical D2 of the mixture and relative ESS ess_renyi_arithmetic = N_total / emp_d2_arithmetic ess_renyi_harmonic = N_total / emp_d2_harmonic # Log quantities losses_with_name.extend([ (tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'), (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'), (tf.reduce_min(target_log_pdf_episode), 'MinTargetPdf'), (tf.reduce_min(behavioral_log_pdf_episode), 'MinBehavPdf'), (ess_renyi_arithmetic, 'ESSRenyiArithmetic'), (ess_renyi_harmonic, 'ESSRenyiHarmonic') ]) else: raise NotImplementedError() if bound == 'J': bound_ = w_return_mean elif bound == 'max-d2-harmonic': bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi_harmonic)) * return_abs_max elif bound == 'max-d2-arithmetic': bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi_arithmetic)) * return_abs_max else: raise NotImplementedError() # Policy entropy for exploration ent = pi.pd.entropy() meanent = tf.reduce_mean(ent) losses_with_name.append((meanent, 'MeanEntropy')) # Add policy entropy bonus if entropy != 'none': scheme, v1, v2 = entropy.split(':') if scheme == 'step': entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1), lambda: float(0.0)) losses_with_name.append((entcoeff, 'EntropyCoefficient')) entbonus = entcoeff * meanent bound_ = bound_ + entbonus elif scheme == 'lin': ip = tf.cast(iter_number_ / max_iters, tf.float32) entcoeff_decay = tf.maximum( 0.0, float(v2) + (float(v1) - float(v2)) * (1.0 - ip)) losses_with_name.append((entcoeff_decay, 'EntropyCoefficient')) entbonus = entcoeff_decay * meanent bound_ 
= bound_ + entbonus elif scheme == 'exp': ent_f = tf.exp( -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1) losses_with_name.append((ent_f, 'EntropyCoefficient')) bound_ = bound_ + ent_f * meanent else: raise Exception('Unrecognized entropy scheme.') losses_with_name.append((w_return_mean, 'ReturnMeanIW')) losses_with_name.append((bound_, 'Bound')) losses, loss_names = map(list, zip(*losses_with_name)) ''' if use_natural_gradient: p = tf.placeholder(dtype=tf.float32, shape=[None]) target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1) grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list) dot_product = tf.reduce_sum(grad_logprob * p) hess_logprob = U.flatgrad(dot_product, var_list) compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob]) ''' assert_ops = tf.group(*tf.get_collection('asserts')) print_ops = tf.group(*tf.get_collection('prints')) compute_lossandgrad = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_grad = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_bound = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], [bound_, assert_ops, print_ops]) compute_losses = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], losses) #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], [log_inverse_ratio, abc, iw]) set_parameter = U.SetFromFlat(var_list) get_parameter = U.GetFlat(var_list) if sampler is None: seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True, gamma=gamma) sampler = type("SequentialSampler", (object, ), { "collect": lambda self, _: seg_gen.__next__() })() U.initialize() # Starting optimizing episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=n_episodes) rewbuffer = deque(maxlen=n_episodes) while True: iters_so_far += 1 if render_after is not None and iters_so_far % render_after == 0: if hasattr(env, 'render'): render(env, pi, horizon) if callback: callback(locals(), globals()) if iters_so_far >= max_iters: print('Finished...') break logger.log('********** Iteration %i ************' % iters_so_far) theta = get_parameter() with timed('sampling'): seg = sampler.collect(theta) lens, rets = seg['ep_lens'], seg['ep_rets'] lenbuffer.extend(lens) rewbuffer.extend(rets) episodes_so_far += len(lens) timesteps_so_far += sum(lens) # Adding batch of trajectories to memory memory.add_trajectory_batch(seg) # Get multiple batches from memory seg_with_memory = memory.get_trajectories() # Get clustered reward reward_matrix = np.reshape( seg_with_memory['disc_rew'] * seg_with_memory['mask'], (-1, horizon)) ep_reward = np.sum(reward_matrix, axis=1) ep_reward = cluster_rewards(ep_reward, reward_clustering) args = ob, ac, rew, disc_rew, clustered_rew, mask, iter_number, active_policies = ( seg_with_memory['ob'], seg_with_memory['ac'], seg_with_memory['rew'], seg_with_memory['disc_rew'], ep_reward, seg_with_memory['mask'], iters_so_far, memory.get_active_policies_mask()) def evaluate_loss(): loss = compute_bound(*args) return loss[0] def evaluate_gradient(): gradient = compute_grad(*args) return gradient[0] if use_natural_gradient: def 
evaluate_fisher_vector_prod(x): return compute_linear_operator(x, *args)[0] + fisher_reg * x def evaluate_natural_gradient(g): return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0) else: evaluate_natural_gradient = None with timed('summaries before'): logger.record_tabular("Iteration", iters_so_far) logger.record_tabular("InitialBound", evaluate_loss()) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if save_weights > 0 and iters_so_far % save_weights == 0: logger.record_tabular('Weights', str(get_parameter())) import pickle file = open('checkpoint' + str(iters_so_far) + '.pkl', 'wb') pickle.dump(theta, file) with timed("offline optimization"): theta, improvement = optimize_offline( theta, set_parameter, line_search, evaluate_loss, evaluate_gradient, evaluate_natural_gradient, max_offline_ite=max_offline_iters) set_parameter(theta) with timed('summaries after'): meanlosses = np.array(compute_losses(*args)) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.dump_tabular() env.close()
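# --- Illustrative NumPy sketch of the multiple importance weights with the balance
# heuristic computed in the function above: for each episode, the weight is the target
# density divided by the sum of the active behavioral densities, evaluated through
# log-densities for numerical stability (as in log_inverse_ratio). In the code above
# this weight is then divided by n_episodes (iwn) before weighting the returns.
import numpy as np

def balance_heuristic_weights(target_logp, behavioral_logp, active_mask):
    # target_logp: (n_episodes,); behavioral_logp: (capacity, n_episodes);
    # active_mask: (capacity,) with 1 for slots that hold a behavioral policy.
    log_inv_ratio = behavioral_logp - target_logp[None, :]
    denom = np.sum(np.exp(log_inv_ratio) * active_mask[:, None], axis=0)
    return 1.0 / denom  # = pi(tau) / sum_k q_k(tau), matching iw above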
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = 
U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular() return pi
def learn(env, make_policy, *,
          n_episodes,
          horizon,
          delta,
          gamma,
          max_iters,
          use_natural_gradient=False,  # can be 'exact', 'approximate'
          fisher_reg=1e-2,
          iw_method='is',
          iw_norm='none',
          bound='J',
          line_search_type='parabola',
          save_weights=False,
          improvement_tol=0.,
          center_return=False,
          render_after=None,
          max_offline_iters=100,
          callback=None):
    np.set_printoptions(precision=3)
    max_samples = horizon * n_episodes

    if line_search_type == 'binary':
        line_search = line_search_binary
    elif line_search_type == 'parabola':
        line_search = line_search_parabola
    else:
        raise ValueError()

    # Building the environment
    ob_space = env.observation_space
    ac_space = env.action_space

    # Building the policy
    pi = make_policy('pi', ob_space, ac_space)
    oldpi = make_policy('oldpi', ob_space, ac_space)

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split('/')[1].startswith('pol')]

    shapes = [U.intprod(var.get_shape().as_list()) for var in var_list]
    n_parameters = sum(shapes)

    # Placeholders
    ob_ = ob = U.get_placeholder_cached(name='ob')
    ac_ = pi.pdtype.sample_placeholder([max_samples], name='ac')
    mask_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='mask')
    disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='disc_rew')
    gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient')

    # Policy densities
    target_log_pdf = pi.pd.logp(ac_)
    behavioral_log_pdf = oldpi.pd.logp(ac_)
    log_ratio = target_log_pdf - behavioral_log_pdf

    # Split operations
    disc_rew_split = tf.stack(tf.split(disc_rew_ * mask_, n_episodes))
    log_ratio_split = tf.stack(tf.split(log_ratio * mask_, n_episodes))
    target_log_pdf_split = tf.stack(tf.split(target_log_pdf * mask_, n_episodes))
    mask_split = tf.stack(tf.split(mask_, n_episodes))

    # Renyi divergence
    emp_d2_split = tf.stack(tf.split(pi.pd.renyi(oldpi.pd, 2) * mask_, n_episodes))
    emp_d2_cum_split = tf.reduce_sum(emp_d2_split, axis=1)
    empirical_d2 = tf.reduce_mean(tf.exp(emp_d2_cum_split))

    # Return
    ep_return = tf.reduce_sum(mask_split * disc_rew_split, axis=1)
    if center_return:
        ep_return = ep_return - tf.reduce_mean(ep_return)

    return_mean = tf.reduce_mean(ep_return)
    return_std = U.reduce_std(ep_return)
    return_max = tf.reduce_max(ep_return)
    return_min = tf.reduce_min(ep_return)
    return_abs_max = tf.reduce_max(tf.abs(ep_return))

    if iw_method == 'pdis':
        raise NotImplementedError()
    elif iw_method == 'is':
        # Per-episode importance weights
        iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1))
        if iw_norm == 'none':
            iwn = iw / n_episodes
            w_return_mean = tf.reduce_sum(iwn * ep_return)
        elif iw_norm == 'sn':
            iwn = iw / tf.reduce_sum(iw)
            w_return_mean = tf.reduce_sum(iwn * ep_return)
        elif iw_norm == 'regression':
            iwn = iw / n_episodes
            mean_iw = tf.reduce_mean(iw)
            beta = tf.reduce_sum((iw - mean_iw) * ep_return * iw) / (tf.reduce_sum((iw - mean_iw) ** 2) + 1e-24)
            w_return_mean = tf.reduce_mean(iw * ep_return - beta * (iw - 1))
        else:
            raise NotImplementedError()
        ess_classic = tf.linalg.norm(iw, 1) ** 2 / tf.linalg.norm(iw, 2) ** 2
        sqrt_ess_classic = tf.linalg.norm(iw, 1) / tf.linalg.norm(iw, 2)
        ess_renyi = n_episodes / empirical_d2
    else:
        raise NotImplementedError()

    if bound == 'J':
        bound_ = w_return_mean
    elif bound == 'std-d2':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_std
    elif bound == 'max-d2':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_abs_max
    elif bound == 'max-ess':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / delta) / sqrt_ess_classic * return_abs_max
    elif bound == 'std-ess':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / delta) / sqrt_ess_classic * return_std
    else:
        raise NotImplementedError()

    losses = [bound_, return_mean, return_max, return_min, return_std, empirical_d2, w_return_mean,
              tf.reduce_max(iwn), tf.reduce_min(iwn), tf.reduce_mean(iwn), U.reduce_std(iwn),
              tf.reduce_max(iw), tf.reduce_min(iw), tf.reduce_mean(iw), U.reduce_std(iw),
              ess_classic, ess_renyi]
    loss_names = ['Bound', 'InitialReturnMean', 'InitialReturnMax', 'InitialReturnMin', 'InitialReturnStd',
                  'EmpiricalD2', 'ReturnMeanIW', 'MaxIWNorm', 'MinIWNorm', 'MeanIWNorm', 'StdIWNorm',
                  'MaxIW', 'MinIW', 'MeanIW', 'StdIW', 'ESSClassic', 'ESSRenyi']

    if use_natural_gradient:
        p = tf.placeholder(dtype=tf.float32, shape=[None])
        target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1)
        grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list)
        dot_product = tf.reduce_sum(grad_logprob * p)
        hess_logprob = U.flatgrad(dot_product, var_list)
        compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob])

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])

    compute_lossandgrad = U.function([ob_, ac_, disc_rew_, mask_], losses + [U.flatgrad(bound_, var_list)])
    compute_grad = U.function([ob_, ac_, disc_rew_, mask_], [U.flatgrad(bound_, var_list)])
    compute_bound = U.function([ob_, ac_, disc_rew_, mask_], [bound_])
    compute_losses = U.function([ob_, ac_, disc_rew_, mask_], losses)

    set_parameter = U.SetFromFlat(var_list)
    get_parameter = U.GetFlat(var_list)

    seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True, gamma=gamma)
    sampler = type("SequentialSampler", (object,), {"collect": lambda self, _: seg_gen.__next__()})()

    U.initialize()

    # Starting optimizing
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=n_episodes)
    rewbuffer = deque(maxlen=n_episodes)

    while True:
        iters_so_far += 1

        if render_after is not None and iters_so_far % render_after == 0:
            if hasattr(env, 'render'):
                render(env, pi, horizon)

        if callback:
            callback(locals(), globals())

        if iters_so_far >= max_iters:
            print('Finished...')
            break

        logger.log('********** Iteration %i ************' % iters_so_far)

        theta = get_parameter()

        with timed('sampling'):
            seg = sampler.collect(theta)

        lens, rets = seg['ep_lens'], seg['ep_rets']
        lenbuffer.extend(lens)
        rewbuffer.extend(rets)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        args = ob, ac, disc_rew, mask = seg['ob'], seg['ac'], seg['disc_rew'], seg['mask']

        assign_old_eq_new()

        def evaluate_loss():
            loss = compute_bound(*args)
            return loss[0]

        def evaluate_gradient():
            gradient = compute_grad(*args)
            return gradient[0]

        if use_natural_gradient:
            def evaluate_fisher_vector_prod(x):
                return compute_linear_operator(x, *args)[0] + fisher_reg * x

            def evaluate_natural_gradient(g):
                return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0)
        else:
            evaluate_natural_gradient = None

        with timed('summaries before'):
            logger.record_tabular("Iteration", iters_so_far)
            logger.record_tabular("InitialBound", evaluate_loss())
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)

        if save_weights:
            logger.record_tabular('Weights', str(get_parameter()))

        with timed("offline optimization"):
            theta, improvement = optimize_offline(theta, set_parameter, line_search,
                                                  evaluate_loss, evaluate_gradient,
                                                  evaluate_natural_gradient,
                                                  max_offline_ite=max_offline_iters)

        set_parameter(theta)

        with timed('summaries after'):
            meanlosses = np.array(compute_losses(*args))
            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

        logger.dump_tabular()

    env.close()
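# --- Hedged usage sketch (illustrative addition, not part of the original code) --------
# One plausible way to call the POIS-style learn() above. Only the keyword names come
# from the signature above; the environment id, the episode budget, and the bound/IW
# choices are illustrative assumptions. make_policy must build a policy exposing
# pd.logp and pd.renyi, as required by the graph construction above.
def _example_pois_run(make_policy):
    import gym

    env = gym.make("CartPole-v1")  # assumed environment
    learn(env, make_policy,
          n_episodes=100,
          horizon=500,
          delta=0.2,
          gamma=0.99,
          max_iters=200,
          iw_method='is',
          iw_norm='none',
          bound='max-d2',
          line_search_type='parabola',
          max_offline_iters=10)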
def learn(base_env,
          policy_fn,
          *,
          max_fitness,  # has to be negative, as CMA-ES considers minimization
          popsize,
          gensize,
          bounds,
          sigma,
          eval_iters,
          timesteps_per_actorbatch,
          max_timesteps=0,
          max_episodes=0,
          max_iters=0,
          max_seconds=0,
          seed=0):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = base_env.observation_space
    ac_space = base_env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    backup_pi = policy_fn("backup_pi", ob_space, ac_space)  # Construct a network for every individual to adapt during the ES evolution

    var_list = pi.get_trainable_variables()
    layer_var_list = []
    for i in range(pi.num_hid_layers):
        layer_var_list.append([v for v in var_list
                               if v.name.split("/")[2].startswith('fc%i' % (i + 1))])
    logstd_var_list = [v for v in var_list if v.name.split("/")[2].startswith("logstd")]
    if len(logstd_var_list) != 0:
        layer_var_list.append(
            [v for v in var_list if v.name.split("/")[2].startswith("final")] + logstd_var_list)

    U.initialize()

    layer_set_operate_list = []
    layer_get_operate_list = []
    for var in layer_var_list:
        layer_set_operate_list.append(U.SetFromFlat(var))
        layer_get_operate_list.append(U.GetFlat(var))

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assign_backup_eq_new = U.function(
        [], [],
        updates=[tf.assign(backup_v, newv)
                 for (backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables())])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[tf.assign(newv, backup_v)
                 for (newv, backup_v) in zipsame(pi.get_variables(), backup_pi.get_variables())])

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, \
        "Only one time constraint permitted"

    # Build generator for all solutions
    seg_gen = traj_segment_generator_eval(backup_pi, base_env, timesteps_per_actorbatch, stochastic=True)
    actors = []
    for i in range(popsize):
        newActor = traj_segment_generator(pi, base_env, timesteps_per_actorbatch,
                                          stochastic=True, eval_iters=eval_iters, seg_gen=seg_gen)
        actors.append(newActor)

    best_fitness = -np.inf

    opt = cma.CMAOptions()
    opt['tolfun'] = max_fitness
    opt['popsize'] = popsize
    opt['maxiter'] = gensize
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    # opt['seed'] = seed
    opt['AdaptSigma'] = True
    # opt['bounds'] = bounds

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            logger.log("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            logger.log("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            logger.log("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            logger.log("Max time")
            break

        # Linearly decay the exploration
        sigma_adapted = max(sigma - float(timesteps_so_far) / max_timesteps, 0)

        logger.log("********** Iteration %i ************" % iters_so_far)

        eval_seg = seg_gen.__next__()
        rewbuffer.extend(eval_seg["ep_rets"])
        lenbuffer.extend(eval_seg["ep_lens"])
        if iters_so_far == 0:
            result_record()

        # Evolve the policy one layer at a time
        for i in range(len(layer_var_list)):
            assign_backup_eq_new()  # backup current policy
            logger.log("Current Layer:" + str(layer_var_list[i]))
            flatten_weights = layer_get_operate_list[i]()

            es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt)
            costs = None
            best_solution = None
            die_out_count = 0
            while True:
                if es.countiter >= gensize:
                    logger.log("Max generations for current layer")
                    break
                solutions = es.ask()
                ob_segs = None
                segs = []
                costs = []
                lens = []
                for id, solution in enumerate(solutions):
                    layer_set_operate_list[i](solution)
                    seg = actors[id].__next__()
                    costs.append(-np.mean(seg["ep_rets"]))
                    lens.append(np.sum(seg["ep_lens"]))
                    segs.append(seg)
                    if ob_segs is None:
                        ob_segs = {'ob': np.copy(seg['ob'])}
                    else:
                        ob_segs['ob'] = np.append(ob_segs['ob'], seg['ob'], axis=0)
                    assign_new_eq_backup()
                # Weights decay
                l2_decay = compute_weight_decay(0.01, solutions)
                costs += l2_decay
                costs, real_costs = fitness_normalization(costs)
                es.tell_real_seg(solutions=solutions, function_values=costs, real_f=real_costs, segs=segs)
                best_solution = np.copy(es.result[0])
                best_fitness = -es.result[1]
                rewbuffer.extend(es.result[3]["ep_rets"])
                lenbuffer.extend(es.result[3]["ep_lens"])
                layer_set_operate_list[i](best_solution)
                logger.log("Update the layer")
                logger.log("Generation:", es.countiter)
                logger.log("Best Solution Fitness:", best_fitness)
                ob = ob_segs["ob"]
                if hasattr(pi, "ob_rms"):
                    pi.ob_rms.update(ob)  # update running mean/std for observation normalization
                episodes_so_far += sum(lens)
            es = None
            import gc
            gc.collect()
        iters_so_far += 1
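# --- Hedged usage sketch (illustrative addition, not part of the original code) --------
# One plausible way to call the layer-wise CMA-ES learn() above. Keyword names follow the
# signature above; the environment id and every value are illustrative assumptions. The
# policy built by policy_fn must expose num_hid_layers (and optionally ob_rms), as used
# above, and exactly one of the max_* time constraints may be non-zero.
def _example_cmaes_run(policy_fn):
    import gym

    base_env = gym.make("Swimmer-v2")  # assumed environment
    learn(base_env, policy_fn,
          max_fitness=-1e6,               # negative, since CMA-ES minimizes costs
          popsize=8,
          gensize=20,
          bounds=None,                    # box constraints are currently commented out above
          sigma=0.1,
          eval_iters=1,
          timesteps_per_actorbatch=2048,
          max_timesteps=1_000_000)        # the single active time constraint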