def train(env_id, num_timesteps, seed, num_cpu, batch, lr):
    from rl.common import set_global_seeds
    from rl.common.vec_env.vec_normalize import MAVecNormalize
    from rl.common.ma_wrappers import MAWrapper
    from sandbox.mppo import ppo2
    from sandbox.mppo.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def _make_env():
        env = gym.make(env_id)
        env = MAWrapper(env)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = SubprocVecEnv([_make_env for _ in range(num_cpu)], is_multi_agent=True)
    env = MAVecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=batch // num_cpu, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=lr, cliprange=0.2,
               total_timesteps=num_timesteps)
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu, nsteps=timesteps_per_batch // num_cpu, lr=lr,
          ent_coef=0.00, identical=make_env.get_identical(env_id))
    env.close()
def train(env_id, num_timesteps, seed, num_cpu, batch, lr):
    from rl.common import set_global_seeds
    from rl.common.vec_env.vec_normalize import MAVecNormalize
    from rl.common.ma_wrappers import MAWrapper
    from sandbox.mppo import ppo2
    from sandbox.mppo.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def _make_env(rank):
        env = gym.make('RoboSumo-Ant-vs-Ant-v0')
        env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        return env

    # Bind the rank as a default argument so each lambda captures its own value
    # rather than the final loop variable.
    env = SubprocVecEnv([lambda i=i: _make_env(i) for i in range(num_cpu)],
                        is_multi_agent=True)
    env = MAVecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    expert = MADataSet('/atlas/u/tsong/Projects/imitation/ant-vs-ant.pkl')
    ppo2.learn(policy=policy, env=env, nsteps=batch // num_cpu, nminibatches=160,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=lr, cliprange=0.2, total_timesteps=num_timesteps,
               expert=expert, clone_iters=1000)
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu,
          expert_path, traj_limitation, ret_threshold, dis_lr,
          disc_type='decentralized', bc_iters=500, l2=0.1, d_iters=1, rew_scale=0.1):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True)
    print(num_cpu)
    policy_fn = CategoricalPolicy
    expert = MADataSet(expert_path, ret_threshold=ret_threshold,
                       traj_limitation=traj_limitation, nobs_flag=True)
    learn(policy_fn, expert, env, env_id, seed,
          total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0,
          dis_lr=dis_lr, disc_type=disc_type, bc_iters=bc_iters,
          identical=make_env.get_identical(env_id), l2=l2, d_iters=d_iters,
          rew_scale=rew_scale)
    env.close()
def make_gym_env(env_id, num_env=2, seed=123, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, SubprocVecEnv for Gym Environments.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
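# --- Usage sketch (illustrative, not from the original source) ---
# A minimal example of how make_gym_env might be driven; it assumes gym and
# SubprocVecEnv are importable as above, and 'CartPole-v1' is just a stand-in env id.
if __name__ == '__main__':
    venv = make_gym_env('CartPole-v1', num_env=4, seed=0)
    obs = venv.reset()  # batched observations, one row per worker
    for _ in range(10):
        actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
        obs, rewards, dones, infos = venv.step(actions)  # vectorized step across the 4 workers
    venv.close()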
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.95, lam=0.92,
          log_interval=1, nprocs=32, nsteps=20, nstack=1, ent_coef=0.00,
          vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
          kfac_clip=0.001, save_interval=100, lrschedule='linear', identical=None):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=nsteps, nstack=nstack,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, identical=identical)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma, lam=lam)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True)
    #                    for q_runner in model.q_runner]
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards,
                                                              masks, actions, values)
        model.old_obs = obs

        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = [explained_variance(values[k], rewards[k]) for k in range(model.num_agents)]
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            for k in range(model.num_agents):
                # logger.record_tabular('reward %d' % k, np.mean(rewards[k]))
                logger.record_tabular("explained_variance %d" % k, float(ev[k]))
                logger.record_tabular("policy_entropy %d" % k, float(policy_entropy[k]))
                logger.record_tabular("policy_loss %d" % k, float(policy_loss[k]))
                logger.record_tabular("value_loss %d" % k, float(value_loss[k]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    # coord.join(enqueue_threads)
    env.close()
def train(logdir, env_id, lr, num_timesteps, seed, timesteps_per_batch, cont=False):
    from sandbox.ppo_sgd import mlp_policy
    from sandbox.ppo_sgd import pposgd_simple
    from rl import logger
    from rl.common import set_global_seeds, tf_util as U
    from rl import bench
    from gym.envs.registration import register
    import multiagent
    import make_env

    logger.configure(logdir, format_strs=['log', 'json', 'tensorboard'])
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = make_env.make_env(env_id)

    def policy_fn(name, ob_space, ac_space, id):
        pi = mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                  hid_size=64, num_hid_layers=2, id=id)
        return pi

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=timesteps_per_batch,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=lr, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear', cont=cont)
    env.close()
    return None
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, max_episode_len):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id, max_episode_len=max_episode_len)
            env.discrete_action_input = True
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    logger.configure(logdir, format_strs=['json'])
    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu, nsteps=timesteps_per_batch // num_cpu, lr=lr,
          ent_coef=0.00, identical=make_env.get_identical(env_id),
          log_interval=50, save_interval=int(num_timesteps / timesteps_per_batch),
          max_episode_len=max_episode_len)
    logger.Logger.CURRENT.close()
    env.close()
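# --- Invocation sketch (illustrative, not from the original source) ---
# One plausible way to drive the train() wrapper above. The logdir, the
# 'simple_spread' scenario id, and every hyperparameter value here are assumptions
# chosen for illustration, not values taken from the original code.
if __name__ == '__main__':
    train(logdir='/tmp/mack-simple-spread',   # hypothetical output directory
          env_id='simple_spread',             # hypothetical particle-env scenario
          num_timesteps=int(1e7),
          lr=0.1,
          timesteps_per_batch=1000,
          seed=1,
          num_cpu=8,
          max_episode_len=25)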
def learn(*,
          network,
          env,
          eval_policy,
          total_timesteps,
          timesteps_per_batch=1024,  # what to train on
          max_kl=0.001,
          cg_iters=10,
          gamma=0.99,
          lam=1.0,  # advantage estimation
          seed=None,
          ent_coef=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_episodes=0,
          max_iters=0,  # time constraint
          callback=None,
          load_path=None,
          checkpoint_path_in=None,
          checkpoint_dir_out=None,
          checkpoint_freq=100,  # in iterations!
          from_iter=0,
          eval_episodes=20,
          **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm.

    Parameters:
    ----------
    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple
                            (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of iterations of value function optimization per each policy optimization step

    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                            and arguments to a particular type of network

    Returns:
    -------
    learnt model
    '''
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(allow_soft_placement=True,
                                        inter_op_parallelism_threads=cpus_per_worker,
                                        intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = Box(low=-np.inf, high=np.inf, shape=(env.observation_space.n,))
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # Loading checkpoint
    if checkpoint_path_in is not None and os.path.isfile(checkpoint_path_in):
        pi.load(checkpoint_path_in)
        logger.log('Loaded policy weights from %s' % checkpoint_path_in)

    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # s = env.reset()
    # start = time.time()
    # for i in range(10000):
    #     pi.step(s, stochastic=True)
    # duration = time.time() - start
    # print(duration)
    # return
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    iters_eval = 0
    all_logs = []
    best_rew = -np.inf
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    online_scores = []
    offline_scores = []

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback:
            callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        if iters_so_far % checkpoint_freq == 0 and checkpoint_dir_out is not None:
            if not os.path.exists(checkpoint_dir_out):
                os.makedirs(checkpoint_dir_out)
            pi.save(os.path.join(checkpoint_dir_out, 'checkpoint_%d' % iters_so_far))
            logger.log('Saved policy weights as %s' % os.path.join(checkpoint_dir_out,
                                                                   'checkpoint_%d.npy' % iters_so_far))

            def pi_wrapper(ob):
                ac, vpred, _, _ = pi.step(ob, stochastic=True)
                return ac

            rew, _, logs, disc_rets, num_stops, avg_damages = eval_policy(
                pi=pi_wrapper, n_episodes=eval_episodes, verbose=True)
            offline_scores.append([np.mean(disc_rets), np.mean(num_stops), np.mean(avg_damages)])
            np.save(os.path.join(checkpoint_dir_out, 'offline_scores.npy'), offline_scores)
            for log in logs:
                log['iter'] = iters_eval
            all_logs = all_logs + logs
            iters_eval += 1

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False,
                                                         batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        ep_rew_mean = np.mean(rewbuffer)
        online_scores.append(ep_rew_mean)
        np.save(os.path.join(checkpoint_dir_out, 'online_scores.npy'), online_scores)

        # Saving best
        if iters_so_far % checkpoint_freq == 0 and ep_rew_mean > best_rew and checkpoint_dir_out is not None:
            pi.save(os.path.join(checkpoint_dir_out, 'best'))
            best_rew = ep_rew_mean
            logger.log('Saved policy weights as %s' % os.path.join(checkpoint_dir_out, 'best.npy'))

        if rank == 0:
            logger.dump_tabular()

    return pi
def learn(policy, expert, env, env_id, seed, total_timesteps=int(40e6), gamma=0.99, lam=0.95,
          log_interval=1, nprocs=32, nsteps=20, nstack=1, ent_coef=0.01, vf_coef=0.5,
          vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=100, lrschedule='linear', dis_lr=0.001, disc_type='decentralized',
          bc_iters=500, identical=None, l2=0.1, d_iters=1, rew_scale=0.1):
    tf.reset_default_graph()
    set_global_seeds(seed)

    buffer = None
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_agents = len(ob_space)
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=nsteps, nstack=nstack,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, identical=identical)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if disc_type == 'decentralized' or disc_type == 'decentralized-all':
        discriminator = [
            Discriminator(model.sess, ob_space, ac_space, state_only=True, discount=gamma,
                          nstack=nstack, index=k, disc_type=disc_type,
                          scope="Discriminator_%d" % k,
                          # gp_coef=gp_coef,
                          total_steps=total_timesteps // (nprocs * nsteps),
                          lr_rate=dis_lr, l2_loss_ratio=l2)
            for k in range(num_agents)
        ]
    else:
        assert False

    # add reward regularization
    if env_id == 'simple_tag':
        reward_reg_loss = tf.reduce_mean(
            tf.square(discriminator[0].reward + discriminator[3].reward) +
            tf.square(discriminator[1].reward + discriminator[3].reward) +
            tf.square(discriminator[2].reward + discriminator[3].reward)
        ) + rew_scale * tf.reduce_mean(
            tf.maximum(0.0, 1 - discriminator[0].reward) +
            tf.maximum(0.0, 1 - discriminator[1].reward) +
            tf.maximum(0.0, 1 - discriminator[2].reward) +
            tf.maximum(0.0, discriminator[3].reward + 1)
        )
        reward_reg_lr = tf.placeholder(tf.float32, ())
        reward_reg_optim = tf.train.AdamOptimizer(learning_rate=reward_reg_lr)
        reward_reg_train_op = reward_reg_optim.minimize(reward_reg_loss)

    tf.global_variables_initializer().run(session=model.sess)

    runner = Runner(env, model, discriminator, nsteps=nsteps, nstack=nstack,
                    gamma=gamma, lam=lam, disc_type=disc_type, nobs_flag=True)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True)
    #                    for q_runner in model.q_runner]

    # Behavior-cloning warm start
    for _ in range(bc_iters):
        e_obs, e_actions, e_nobs, _, _ = expert.get_next_batch(nenvs * nsteps)
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        lld_loss = model.clone(e_obs, e_a)
        # print(lld_loss)

    update_policy_until = 10

    for update in range(1, total_timesteps // nbatch + 1):
        obs, obs_next, states, rewards, report_rewards, masks, actions, values, all_obs, all_nobs, \
            mh_actions, mh_all_actions, mh_rewards, mh_true_rewards, mh_true_returns = runner.run()

        total_loss = np.zeros((num_agents, d_iters))

        idx = 0
        idxs = np.arange(len(all_obs))
        random.shuffle(idxs)
        all_obs = all_obs[idxs]
        mh_actions = [mh_actions[k][idxs] for k in range(num_agents)]
        mh_obs = [obs[k][idxs] for k in range(num_agents)]
        mh_obs_next = [obs_next[k][idxs] for k in range(num_agents)]
        mh_values = [values[k][idxs] for k in range(num_agents)]

        if buffer:
            buffer.update(mh_obs, mh_actions, mh_obs_next, all_obs, mh_values)
        else:
            buffer = Dset(mh_obs, mh_actions, mh_obs_next, all_obs, mh_values,
                          randomize=True, num_agents=num_agents, nobs_flag=True)

        d_minibatch = nenvs * nsteps

        for d_iter in range(d_iters):
            e_obs, e_actions, e_nobs, e_all_obs, _ = expert.get_next_batch(d_minibatch)
            g_obs, g_actions, g_nobs, g_all_obs, _ = buffer.get_next_batch(batch_size=d_minibatch)

            e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
            g_a = [np.argmax(g_actions[k], axis=1) for k in range(len(g_actions))]

            g_log_prob = model.get_log_action_prob(g_obs, g_a)
            e_log_prob = model.get_log_action_prob(e_obs, e_a)

            if disc_type == 'decentralized':
                for k in range(num_agents):
                    total_loss[k, d_iter] = discriminator[k].train(
                        g_obs[k], g_actions[k], g_nobs[k], g_log_prob[k].reshape([-1, 1]),
                        e_obs[k], e_actions[k], e_nobs[k], e_log_prob[k].reshape([-1, 1]))
            elif disc_type == 'decentralized-all':
                g_obs_all = np.concatenate(g_obs, axis=1)
                g_actions_all = np.concatenate(g_actions, axis=1)
                g_nobs_all = np.concatenate(g_nobs, axis=1)
                e_obs_all = np.concatenate(e_obs, axis=1)
                e_actions_all = np.concatenate(e_actions, axis=1)
                e_nobs_all = np.concatenate(e_nobs, axis=1)
                for k in range(num_agents):
                    total_loss[k, d_iter] = discriminator[k].train(
                        g_obs_all, g_actions_all, g_nobs_all, g_log_prob[k].reshape([-1, 1]),
                        e_obs_all, e_actions_all, e_nobs_all, e_log_prob[k].reshape([-1, 1]))
            else:
                assert False

            if env_id == 'simple_tag':
                if disc_type == 'decentralized':
                    feed_dict = {discriminator[k].obs: np.concatenate([g_obs[k], e_obs[k]], axis=0)
                                 for k in range(num_agents)}
                elif disc_type == 'decentralized-all':
                    feed_dict = {discriminator[k].obs: np.concatenate([g_obs_all, e_obs_all], axis=0)
                                 for k in range(num_agents)}
                else:
                    assert False
                feed_dict[reward_reg_lr] = discriminator[0].lr.value()
                model.sess.run(reward_reg_train_op, feed_dict=feed_dict)

            idx += 1

        if update > update_policy_until:  # 10
            policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards,
                                                                  masks, actions, values)
        model.old_obs = obs

        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = [explained_variance(values[k], rewards[k]) for k in range(model.num_agents)]
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            for k in range(model.num_agents):
                logger.record_tabular("explained_variance %d" % k, float(ev[k]))
                if update > update_policy_until:
                    logger.record_tabular("policy_entropy %d" % k, float(policy_entropy[k]))
                    logger.record_tabular("policy_loss %d" % k, float(policy_loss[k]))
                    logger.record_tabular("value_loss %d" % k, float(value_loss[k]))
                try:
                    logger.record_tabular('pearson %d' % k, float(
                        pearsonr(report_rewards[k].flatten(), mh_true_returns[k].flatten())[0]))
                    logger.record_tabular('spearman %d' % k, float(
                        spearmanr(report_rewards[k].flatten(), mh_true_returns[k].flatten())[0]))
                    logger.record_tabular('reward %d' % k, float(np.mean(rewards[k])))
                except:
                    pass
            total_loss_m = np.mean(total_loss, axis=1)
            for k in range(num_agents):
                logger.record_tabular("total_loss %d" % k, total_loss_m[k])
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'm_%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            if disc_type == 'decentralized' or disc_type == 'decentralized-all':
                for k in range(num_agents):
                    savepath = osp.join(logger.get_dir(), 'd_%d_%.5i' % (k, update))
                    discriminator[k].save(savepath)
            else:
                assert False

    coord.request_stop()
    # coord.join(enqueue_threads)
    env.close()
        env = gym.make('{}NoFrameskip-v4'.format(env_id))
        env.seed(seed + rank)
        return env
        return wrap_deepmind(env)  # unreachable after the return above
    return env_fn

if 1:
    from rl_algs.common.vec_env.mpi_vec_env1 import MpiVecEnv
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    nenvs = comm.Get_size()
    env = make_env(comm.Get_rank())()
    env = MpiVecEnv(env, comm)
    A = np.array([env.action_space.sample() for _ in range(env.num_envs)]) * 0
elif 1:
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    A = np.array([env.action_space.sample() for _ in range(env.num_envs)]) * 0
else:
    env = make_env(0)()
    A = env.action_space.sample() * 0
    env.num_envs = 1

env.reset()
nsteps = 1000
tstart = time.time()
blah = 0
for _ in range(nsteps):
    ob, rew, done, _ = env.step(A)
    for q in (ob, rew, done):
import numpy as np

from rl.a2c.a2c import Model
from rl.a2c.runner import Runner
from rl.common import set_global_seeds
from baselines.stochastic import Model as Stochastic
from baselines.rule import Model as Rule

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

if __name__ == '__main__':
    set_global_seeds(args.seed)
    logger = get_logger('trading')
    logger.info(str(args))
    env = Env('train')

    # Instantiate the model objects (that creates defender_model and adversary_model)
    model = Model(
        ob_size=env.ob_size,
        act_size=env.act_size,
        learning_rate=args.lr,
        latents=args.latents,
        activation=args.activation,
        optimizer=args.optimizer,
        vf_coef=args.vf_coef,