def _thunk():
    env = make_env.make_env(env_id)
    env.seed(seed + rank)
    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)
    gym.logger.setLevel(logging.WARN)
    return env
def _thunk():
    env = make_env.make_env(env_id, max_episode_len=max_episode_len)
    env.discrete_action_input = True
    env.seed(seed + rank)
    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)
    gym.logger.setLevel(logging.WARN)
    return env
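# Sketch: how _thunk factories like the two above are typically consumed.
# Each rank builds one seeded, monitored env, and the thunks are batched into
# a vectorized env so the learn() functions below see env.num_envs workers.
# The SubprocVecEnv import path and the make_parallel_envs helper name are
# assumptions (baselines-style layout), not confirmed by this file; make_env,
# bench, logger, and os are assumed module-level imports, as elsewhere here.
def make_parallel_envs(env_id, seed, nenvs):
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv  # assumed path

    def make_thunk(rank):
        # Bind rank now so each worker gets a distinct seed and monitor file.
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and
                                os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            return env
        return _thunk

    return SubprocVecEnv([make_thunk(i) for i in range(nenvs)])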
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.95, lam=0.92,
          log_interval=1, nprocs=32, nsteps=20, nstack=1, ent_coef=0.00,
          vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
          kfac_clip=0.001, save_interval=100, lrschedule='linear',
          identical=None):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs,
                               total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               nstack=nstack, ent_coef=ent_coef,
                               vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef,
                               lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule,
                               identical=identical)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma,
                    lam=lam)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True)
    #                    for q_runner in model.q_runner]
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = [explained_variance(values[k], rewards[k])
                  for k in range(model.num_agents)]
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            for k in range(model.num_agents):
                # logger.record_tabular('reward %d' % k, np.mean(rewards[k]))
                logger.record_tabular("explained_variance %d" % k,
                                      float(ev[k]))
                logger.record_tabular("policy_entropy %d" % k,
                                      float(policy_entropy[k]))
                logger.record_tabular("policy_loss %d" % k,
                                      float(policy_loss[k]))
                logger.record_tabular("value_loss %d" % k,
                                      float(value_loss[k]))
            logger.dump_tabular()
        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    # coord.join(enqueue_threads)
    env.close()
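# For reference: the learn() functions in this file call
# explained_variance(values, rewards), which is defined elsewhere in the repo.
# Below is a sketch consistent with the OpenAI baselines helper of the same
# name (the suffix marks it as illustrative, not the repo's definition).
# It returns 1 - Var[y - ypred] / Var[y]: 1 means the value function fits the
# returns perfectly, 0 means it explains nothing, negative means it is worse
# than predicting a constant.
def explained_variance_sketch(ypred, y):
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary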
def train(logdir, env_id, lr, num_timesteps, seed, timesteps_per_batch,
          cont=False):
    from sandbox.ppo_sgd import mlp_policy
    from sandbox.ppo_sgd import pposgd_simple
    from rl import logger
    from rl.common import set_global_seeds, tf_util as U
    from rl import bench
    from gym.envs.registration import register
    import multiagent
    import make_env

    logger.configure(logdir, format_strs=['log', 'json', 'tensorboard'])
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = make_env.make_env(env_id)

    def policy_fn(name, ob_space, ac_space, id):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space, hid_size=64,
                                    num_hid_layers=2, id=id)

    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=timesteps_per_batch,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=lr, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear', cont=cont)
    env.close()
def learn(policy, env, expert, seed, total_timesteps=int(40e6), gamma=0.99,
          lam=0.95, log_interval=1, nprocs=32, nsteps=20, nstack=1,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.05,
          max_grad_norm=0.5, kfac_clip=0.001, save_interval=100,
          lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    # Note: nsteps is hard-coded to 1024 (the cloning batch size) here,
    # overriding the nsteps argument.
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs,
                               total_timesteps, nprocs=nprocs, nsteps=1024,
                               nstack=nstack, ent_coef=ent_coef,
                               vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef,
                               lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    # Behavior cloning only: fit the policy to expert (obs, action) pairs by
    # maximizing log-likelihood; no environment interaction happens here.
    for _ in range(10000):
        e_obs, e_actions, _, _ = expert.get_next_batch(1024)
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        lld_loss = model.clone(e_obs, e_a)
        print(lld_loss)
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5,
          max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10,
          nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=20,
          expert=None, clone_iters=None):
    # Allow lr and cliprange to be either constants or schedules f(frac).
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    # Multi-agent envs expose a tuple of spaces; vec-env wrappers expose
    # .spaces instead, hence the fallback.
    try:
        ob_space = env.observation_space
        ac_space = env.action_space
        num_agents = len(ob_space)
    except:
        ob_space = env.observation_space.spaces
        ac_space = env.action_space.spaces
        num_agents = len(ob_space)
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy, ob_spaces=ob_space,
                               ac_spaces=ac_space, nbatch_act=nenvs,
                               nbatch_train=nbatch_train, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    # Optional behavior-cloning warm start from expert demonstrations.
    if expert and clone_iters:
        for i in range(clone_iters):
            e_obs, e_actions, _, _ = expert.get_next_batch(nbatch // nminibatches)
            lld = model.clone(lr(1.0), e_obs, e_actions)
            if i % 100 == 0:
                print([np.mean(l) for l in lld])

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = \
            runner.run()  # pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        # if states is None:  # nonrecurrent version
        #     advs = [returns[k] - values[k] for k in range(num_agents)]
        #     advs = [(advs[k] - advs[k].mean()) / (advs[k].std() + 1e-8)
        #             for k in range(num_agents)]
        inds = np.arange(nbatch)
        for _ in range(noptepochs):
            np.random.shuffle(inds)
            for start in range(0, nbatch, nbatch_train):
                end = start + nbatch_train
                mbinds = inds[start:end]
                slices = ([a[mbinds] for a in arr] for arr in
                          (obs, returns, masks, actions, values, neglogpacs))
                mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        # else:  # recurrent version
        #     assert nenvs % nminibatches == 0
        #     envsperbatch = nenvs // nminibatches
        #     envinds = np.arange(nenvs)
        #     flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
        #     envsperbatch = nbatch_train // nsteps
        #     for _ in range(noptepochs):
        #         np.random.shuffle(envinds)
        #         for start in range(0, nenvs, envsperbatch):
        #             end = start + envsperbatch
        #             mbenvinds = envinds[start:end]
        #             mbflatinds = flatinds[mbenvinds].ravel()
        #             slices = (arr[mbflatinds] for arr in
        #                       (obs, returns, masks, actions, values, neglogpacs))
        #             mbstates = states[mbenvinds]
        #             mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = [explained_variance(values[k], returns[k])
                  for k in range(num_agents)]
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            for k in range(num_agents):
                logger.logkv("explained_variance_{}".format(k), float(ev[k]))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                for k in range(num_agents):
                    logger.logkv(lossname + '{}'.format(k), lossval[k])
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            env.save(savepath + '.ob_rms')
    env.close()
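# For reference: the PPO learn() above assumes two small helpers, constfn and
# safemean, defined elsewhere in the repo. The sketches below are consistent
# with their OpenAI baselines counterparts; the repo's own definitions may
# differ slightly (the suffix marks them as illustrative).
def constfn_sketch(val):
    # Wrap a constant so lr/cliprange can always be called as a schedule f(frac).
    def f(_):
        return val
    return f

def safemean_sketch(xs):
    # Mean that returns nan (instead of raising) when the episode buffer is empty.
    return np.nan if len(xs) == 0 else np.mean(xs)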
def _make_env():
    env = gym.make(env_id)
    env = MAWrapper(env)
    env = bench.Monitor(env, logger.get_dir())
    return env
def _make_env():
    env = make_env(env_id)  # gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    return env
def learn(policy, expert, env, env_id, seed, total_timesteps=int(40e6),
          gamma=0.99, lam=0.95, log_interval=1, nprocs=32, nsteps=20,
          nstack=1, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25,
          max_grad_norm=0.5, kfac_clip=0.001, save_interval=100,
          lrschedule='linear', dis_lr=0.001, disc_type='decentralized',
          bc_iters=500, identical=None, l2=0.1, d_iters=1, rew_scale=0.1):
    tf.reset_default_graph()
    set_global_seeds(seed)

    buffer = None
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_agents = len(ob_space)

    make_model = lambda: Model(policy, ob_space, ac_space, nenvs,
                               total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               nstack=nstack, ent_coef=ent_coef,
                               vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef,
                               lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule,
                               identical=identical)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    # One discriminator per agent (AIRL-style learned reward).
    if disc_type == 'decentralized' or disc_type == 'decentralized-all':
        discriminator = [
            Discriminator(model.sess, ob_space, ac_space, state_only=True,
                          discount=gamma, nstack=nstack, index=k,
                          disc_type=disc_type, scope="Discriminator_%d" % k,
                          # gp_coef=gp_coef,
                          total_steps=total_timesteps // (nprocs * nsteps),
                          lr_rate=dis_lr, l2_loss_ratio=l2)
            for k in range(num_agents)
        ]
    else:
        assert False

    # Reward regularization for simple_tag: push the three predators
    # (agents 0-2) and the prey (agent 3) toward opposing rewards.
    if env_id == 'simple_tag':
        reward_reg_loss = tf.reduce_mean(
            tf.square(discriminator[0].reward + discriminator[3].reward) +
            tf.square(discriminator[1].reward + discriminator[3].reward) +
            tf.square(discriminator[2].reward + discriminator[3].reward)
        ) + rew_scale * tf.reduce_mean(
            tf.maximum(0.0, 1 - discriminator[0].reward) +
            tf.maximum(0.0, 1 - discriminator[1].reward) +
            tf.maximum(0.0, 1 - discriminator[2].reward) +
            tf.maximum(0.0, discriminator[3].reward + 1)
        )
        reward_reg_lr = tf.placeholder(tf.float32, ())
        reward_reg_optim = tf.train.AdamOptimizer(learning_rate=reward_reg_lr)
        reward_reg_train_op = reward_reg_optim.minimize(reward_reg_loss)

    tf.global_variables_initializer().run(session=model.sess)
    runner = Runner(env, model, discriminator, nsteps=nsteps, nstack=nstack,
                    gamma=gamma, lam=lam, disc_type=disc_type, nobs_flag=True)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True)
    #                    for q_runner in model.q_runner]

    # Behavior-cloning warm start before adversarial training.
    for _ in range(bc_iters):
        e_obs, e_actions, e_nobs, _, _ = expert.get_next_batch(nenvs * nsteps)
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        lld_loss = model.clone(e_obs, e_a)
        # print(lld_loss)

    update_policy_until = 10
    for update in range(1, total_timesteps // nbatch + 1):
        obs, obs_next, states, rewards, report_rewards, masks, actions, \
            values, all_obs, all_nobs, mh_actions, mh_all_actions, \
            mh_rewards, mh_true_rewards, mh_true_returns = runner.run()

        total_loss = np.zeros((num_agents, d_iters))
        idx = 0
        idxs = np.arange(len(all_obs))
        random.shuffle(idxs)
        all_obs = all_obs[idxs]
        mh_actions = [mh_actions[k][idxs] for k in range(num_agents)]
        mh_obs = [obs[k][idxs] for k in range(num_agents)]
        mh_obs_next = [obs_next[k][idxs] for k in range(num_agents)]
        mh_values = [values[k][idxs] for k in range(num_agents)]

        if buffer:
            buffer.update(mh_obs, mh_actions, mh_obs_next, all_obs, mh_values)
        else:
            buffer = Dset(mh_obs, mh_actions, mh_obs_next, all_obs, mh_values,
                          randomize=True, num_agents=num_agents,
                          nobs_flag=True)

        d_minibatch = nenvs * nsteps
        for d_iter in range(d_iters):
            e_obs, e_actions, e_nobs, e_all_obs, _ = \
                expert.get_next_batch(d_minibatch)
            g_obs, g_actions, g_nobs, g_all_obs, _ = \
                buffer.get_next_batch(batch_size=d_minibatch)
            e_a = [np.argmax(e_actions[k], axis=1)
                   for k in range(len(e_actions))]
            g_a = [np.argmax(g_actions[k], axis=1)
                   for k in range(len(g_actions))]
            g_log_prob = model.get_log_action_prob(g_obs, g_a)
            e_log_prob = model.get_log_action_prob(e_obs, e_a)

            if disc_type == 'decentralized':
                for k in range(num_agents):
                    total_loss[k, d_iter] = discriminator[k].train(
                        g_obs[k], g_actions[k], g_nobs[k],
                        g_log_prob[k].reshape([-1, 1]),
                        e_obs[k], e_actions[k], e_nobs[k],
                        e_log_prob[k].reshape([-1, 1]))
            elif disc_type == 'decentralized-all':
                g_obs_all = np.concatenate(g_obs, axis=1)
                g_actions_all = np.concatenate(g_actions, axis=1)
                g_nobs_all = np.concatenate(g_nobs, axis=1)
                e_obs_all = np.concatenate(e_obs, axis=1)
                e_actions_all = np.concatenate(e_actions, axis=1)
                e_nobs_all = np.concatenate(e_nobs, axis=1)
                for k in range(num_agents):
                    total_loss[k, d_iter] = discriminator[k].train(
                        g_obs_all, g_actions_all, g_nobs_all,
                        g_log_prob[k].reshape([-1, 1]),
                        e_obs_all, e_actions_all, e_nobs_all,
                        e_log_prob[k].reshape([-1, 1]))
            else:
                assert False

            if env_id == 'simple_tag':
                if disc_type == 'decentralized':
                    feed_dict = {
                        discriminator[k].obs:
                            np.concatenate([g_obs[k], e_obs[k]], axis=0)
                        for k in range(num_agents)
                    }
                elif disc_type == 'decentralized-all':
                    feed_dict = {
                        discriminator[k].obs:
                            np.concatenate([g_obs_all, e_obs_all], axis=0)
                        for k in range(num_agents)
                    }
                else:
                    assert False
                feed_dict[reward_reg_lr] = discriminator[0].lr.value()
                model.sess.run(reward_reg_train_op, feed_dict=feed_dict)
            idx += 1

        # Train the policy only after the discriminators have had a head start.
        if update > update_policy_until:
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = [explained_variance(values[k], rewards[k])
                  for k in range(model.num_agents)]
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            for k in range(model.num_agents):
                logger.record_tabular("explained_variance %d" % k,
                                      float(ev[k]))
                if update > update_policy_until:
                    logger.record_tabular("policy_entropy %d" % k,
                                          float(policy_entropy[k]))
                    logger.record_tabular("policy_loss %d" % k,
                                          float(policy_loss[k]))
                    logger.record_tabular("value_loss %d" % k,
                                          float(value_loss[k]))
                try:
                    # Correlation between learned and true returns.
                    logger.record_tabular('pearson %d' % k, float(
                        pearsonr(report_rewards[k].flatten(),
                                 mh_true_returns[k].flatten())[0]))
                    logger.record_tabular('spearman %d' % k, float(
                        spearmanr(report_rewards[k].flatten(),
                                  mh_true_returns[k].flatten())[0]))
                    logger.record_tabular('reward %d' % k,
                                          float(np.mean(rewards[k])))
                except:
                    pass
            total_loss_m = np.mean(total_loss, axis=1)
            for k in range(num_agents):
                logger.record_tabular("total_loss %d" % k, total_loss_m[k])
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'm_%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            if disc_type == 'decentralized' or disc_type == 'decentralized-all':
                for k in range(num_agents):
                    savepath = osp.join(logger.get_dir(),
                                        'd_%d_%.5i' % (k, update))
                    discriminator[k].save(savepath)
            else:
                assert False

    coord.request_stop()
    # coord.join(enqueue_threads)
    env.close()
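# Sketch of the batch interface that the adversarial learn() above assumes for
# both the expert dataset and the generator buffer (Dset): get_next_batch(n)
# returns per-agent lists of observations, one-hot actions, and next
# observations, plus the joint observation and per-agent values. The class and
# field names below are illustrative, not the repo's actual Dset.
class BatchBufferSketch:
    def __init__(self, obs, actions, next_obs, all_obs, values):
        # obs/actions/next_obs/values: length-num_agents lists of arrays with
        # a shared leading sample dimension; all_obs: joint observation array.
        self.obs, self.actions = obs, actions
        self.next_obs, self.all_obs, self.values = next_obs, all_obs, values
        self.n = len(all_obs)

    def get_next_batch(self, batch_size):
        # Sample with replacement, mirroring a randomized Dset.
        idx = np.random.randint(0, self.n, size=batch_size)
        return ([o[idx] for o in self.obs],
                [a[idx] for a in self.actions],
                [no[idx] for no in self.next_obs],
                self.all_obs[idx],
                [v[idx] for v in self.values])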
def learn(policy, env, expert, seed, total_timesteps=int(40e6), gamma=0.99,
          lam=0.95, log_interval=1, nprocs=4, nsteps=20, nstack=1,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.05,
          max_grad_norm=0.5, kfac_clip=0.001, save_interval=1000,
          lrschedule='linear', batch_size=1024):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs,
                               total_timesteps, nprocs=nprocs,
                               nsteps=batch_size, nstack=nstack,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True)
    #                    for q_runner in model.q_runner]
    print("-------------------------------")
    print(total_timesteps // batch_size + 1)
    print("-------------------------------")
    # Behavior cloning only: no environment rollouts in this variant.
    for update in range(total_timesteps // batch_size + 1):
        e_obs, e_actions, _, _ = expert.get_next_batch(batch_size)
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        nseconds = time.time() - tstart
        fps = int((update * batch_size) / nseconds)
        lld_loss = model.clone(e_obs, e_a)[0]
        # print(lld_loss)
        if update % log_interval == 0 or update == 1:
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * batch_size)
            logger.record_tabular("fps", fps)
            for k in range(model.num_agents):
                logger.record_tabular("lld_loss %d" % k, float(lld_loss[k]))
            logger.dump_tabular()
        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
def _make_env(rank):
    env = gym.make('RoboSumo-Ant-vs-Ant-v0')
    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    return env
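# Wiring sketch: how a driver script might combine the factories above with
# one of the learn() variants in this file. The policy_cls argument, the log
# directory, and the hyperparameter values are illustrative assumptions, not
# this repo's actual entry point.
def main_sketch(policy_cls):
    # policy_cls: any policy class accepted by the ACKTR-style learn() above.
    logger.configure('/tmp/ma_experiment')  # hypothetical log dir
    env = make_parallel_envs('simple_spread', seed=1, nenvs=8)  # sketch above
    learn(policy_cls, env, seed=1, total_timesteps=int(1e7))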