def get_arch(
    *,
    library="baselines",
    cnn="clear",
    use_lstm=0,
    stack_channels="16_32_32",
    emb_size=256,
    **kwargs,
):
    """Resolve a policy network constructor from configuration strings.

    Args:
        library: backend providing the model builders; only ``"baselines"``
            is supported.
        cnn: one of ``"impala"``, ``"nature"``, ``"clear"``.
        use_lstm: truthy -> wrap the CNN in an LSTM (``nlstm=256``).
        stack_channels: underscore-separated conv depths, e.g. ``"16_32_32"``
            (only consumed by the impala CNN).
        emb_size: embedding size passed to the impala CNN.
        **kwargs: ignored; accepted so a larger config dict can be splatted in.

    Returns:
        A network function (or network object) usable by baselines' policy
        builders.

    Raises:
        ValueError: on an unknown ``library`` or ``cnn`` value.
    """
    # Parsed up front so a malformed spec fails fast, as in the original flow.
    depths = [int(part) for part in stack_channels.split("_")]

    if library != "baselines":
        raise ValueError(f"Unsupported library: {library}")

    # Imports are deferred into the branches so only the selected backend's
    # dependencies need to be installed.
    if cnn == "impala":
        from baselines.common.models import build_impala_cnn

        conv_fn = lambda x: build_impala_cnn(x, depths=depths, emb_size=emb_size)
    elif cnn == "nature":
        from baselines.common.models import nature_cnn

        conv_fn = nature_cnn
    elif cnn == "clear":
        from lucid.scratch.rl_util.arch import clear_cnn

        conv_fn = clear_cnn
    else:
        raise ValueError(f"Unsupported cnn: {cnn}")

    if not use_lstm:
        return conv_fn

    from baselines.common.models import cnn_lstm

    return cnn_lstm(nlstm=256, conv_fn=conv_fn)
def test_fn(env_name, num_envs, config_path, load_path, recur=True):
    """Evaluate a saved policy on a domain-randomized Procgen environment.

    Args:
        env_name: Procgen environment id.
        num_envs: number of parallel environments in the vectorized env.
        config_path: domain-config filename, resolved under ``./procgen-adr/``.
        load_path: checkpoint path handed to ``test``.
        recur: wrap the IMPALA CNN in an LSTM.  Previously this was a
            hard-coded ``recur = True`` local; exposed as a keyword argument
            (default preserves the old behavior).

    Returns:
        ``(mean, std)`` episode-return statistics as returned by ``test``.
    """
    test_config_path = os.path.join(os.getcwd(), "procgen-adr", config_path)

    # Build the vectorized evaluation environment.
    test_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          domain_config_path=test_config_path,
                          render_mode="rgb_array")
    test_env = VecExtractDictObs(test_env, "rgb")
    test_env = VecMonitor(venv=test_env, filename=None, keep_buf=100)
    test_env = VecNormalize(venv=test_env, ob=False)

    # One TF session per call, entered manually so `test` runs inside it.
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()
    try:
        conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
        if recur:
            logger.info("Using CNN LSTM")
            conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)
        mean, std = test(conv_fn, test_env, load_path=load_path)
    finally:
        # Previously the session leaked if `test` raised.
        sess.close()
    return mean, std
def train_fn(env_name,
             num_envs,
             distribution_mode,
             num_levels,
             start_level,
             timesteps_per_proc,
             level_sampler_strategy,
             score_transform,
             model_name,
             is_test_worker=False,
             save_dir='./',
             comm=None):
    """Train a PPO agent with level sampling and save it under ``save_dir``."""
    # Fixed PPO hyperparameters.
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    # Test workers carry no gradient weight and see the full level distribution.
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    log_dir = save_dir + 'logs/' + model_name
    if log_dir is not None:  # always true here; kept for parity with the original flow
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = (['csv', 'stdout', 'tensorboard']
                       if log_comm.Get_rank() == 0 else [])
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    # Held-out evaluation environment (fixed 500 levels starting at 0).
    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100)
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    logger.info("creating tf session")
    setup_mpi_gpus()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    # IMPALA CNN policy trunk.
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    logger.info("training")
    model = ppo2.learn(network=conv_fn,
                       total_timesteps=timesteps_per_proc,
                       num_levels=num_levels,
                       eval_env=eval_env,
                       save_interval=0,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       level_sampler_strategy=level_sampler_strategy,
                       score_transform=score_transform)
    model.save(save_dir + 'models/' + model_name)
def main():
    """Evaluate a saved fruitbot checkpoint by running the PPO loop in test mode."""
    # Command-line arguments: checkpoint location and experiment config.
    parser = argparse.ArgumentParser(description="Parse testing arguments")
    parser.add_argument('--model_path', type=str, default=None,
                        help='Path to model checkpoint.')
    parser.add_argument('--config', type=str,
                        default='configurations/ppo_baseline_cuda.yaml',
                        help='Path to configuration file.')
    args = parser.parse_args()
    if args.model_path is None or not os.path.exists(args.model_path):
        raise OSError("Invalid model file supplied")

    # Load experiment configuration.
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config)

    # Derive the output directory from the training run's creation timestamp,
    # taken as the third-from-last path component of the checkpoint path.
    model_file_path = args.model_path
    exp_creation_time = os.path.normpath(model_file_path).split(os.sep)[-3]
    print(exp_creation_time)
    exp_dir = f"runs/{cfg.EXPERIMENT_NAME}/{exp_creation_time}_test/"
    os.makedirs(exp_dir, exist_ok=True)

    # Logging to csv + stdout.
    logger.configure(dir=exp_dir,
                     format_strs=['csv', 'stdout'],
                     log_suffix=datetime.now().strftime('%Y-%m-%d-%H-%M'))

    # Vectorized Procgen test environment.
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=cfg.TEST.NUM_ENVS,
                      env_name="fruitbot",
                      num_levels=cfg.TEST.NUM_LEVELS,
                      start_level=cfg.TEST.LEVEL_SEED,
                      distribution_mode="easy")
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # TensorFlow session.
    logger.info("creating tf session")
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    # Select the policy CNN from the config.  todo: make this less ugly
    logger.info("building cnn")
    conv_fn = None
    if cfg.TRAIN.NETWORK == "NATURE_CNN":
        conv_fn = lambda x: nature_cnn(x)
    elif cfg.TRAIN.NETWORK == "IMPALA_CNN":
        conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    # Run the PPO loop in test mode against the loaded checkpoint.
    logger.info("testing")
    ppo2.learn(env=venv,
               network=conv_fn,
               total_timesteps=cfg.TEST.TIMESTEPS,
               save_interval=0,
               nsteps=cfg.TEST.BATCH_SIZE,
               nminibatches=cfg.TRAIN.MINIBATCHES,
               lam=cfg.TRAIN.LAM,
               gamma=cfg.TRAIN.GAMMA,
               noptepochs=cfg.TRAIN.NUM_EPOCHS,
               log_interval=1,
               clip_vf=cfg.TRAIN.USE_VF_CLIPPING,
               lr=cfg.TRAIN.LR,
               cliprange=cfg.TRAIN.CLIP_RANGE,
               update_fn=None,
               init_fn=None,
               vf_coef=0.5,
               max_grad_norm=0.5,
               test=True,
               load_path=model_file_path)
def main():
    """Roll out a saved "vanilla" PPO policy on Procgen and log episode stats.

    Returns:
        List of 10-episode mean rewards, one entry per rollout batch.
    """
    # Fixed hyperparameters (mirrors the training script).
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 1_000_000  # now this counts steps in testing runs
    use_vf_clipping = True

    # From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    # default starting_level set past training levels to test on unseen levels!
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    args = parser.parse_args()

    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    # BUGFIX: record the *effective* step budget.  Previously this was
    # captured before the --nrollouts override, so the dumped args JSON
    # could hold a stale total_timesteps.
    args.total_timesteps = total_timesteps

    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    num_levels = args.num_levels
    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    # Portable replacement for `os.system("mkdir -p %s" % logpath)`.
    os.makedirs(logpath, exist_ok=True)

    # Persist the run arguments next to the logs.
    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")

    # Modified based on random_ppo.learn: build the model, load the
    # checkpoint, then only collect rollouts and log statistics.
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                  nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                  ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)
    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model params loaded from save")  # typo "pramas" fixed

    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time()  ## Not doing timing yet

    mean_rewards = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        # different from random_ppo!
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)
        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)
        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
def main():
    """Train PPO with the NetRand (random-network) runner and model on Procgen."""
    # Hyperparameters.
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000
    use_vf_clipping = True

    # Command-line arguments.
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    args = parser.parse_args()

    # Designate every `test_worker_interval`-th MPI rank as a test worker.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Env spec: test workers sample the full level distribution.
    env_name = args.env_name
    num_levels = 0 if is_test_worker else args.num_levels
    start_level = args.start_level

    # Logging: only rank 0 of each split writes files.
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR + f'/{args.env_name}/run_{args.run_id}',
                     format_strs=format_strs)

    # Vectorized Procgen environment.
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # TensorFlow session; pin this rank to its GPU when --gpus_id is given.
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    # Policy trunk.
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    # Training: swap in the NetRand runner/model before calling learn.
    logger.info("training")
    ppo2.Runner = NetRandRunner
    ppo2.build_policy = build_policy
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        model_fn=NetRandModel,
    )

    # Only rank 0 writes the final checkpoint.
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
# NOTE(review): orphan fragment — no enclosing `def` is visible in this chunk
# and the trailing `ppo2.learn(...)` call is truncated mid-argument-list (it
# continues past this chunk).  Code left byte-identical.  Presumably `env_fn`,
# `total_time`, `nsteps`, `nminibatches`, `lam`, `gamma`, `ppo_epochs`, and
# `ent_coef` are defined in the surrounding (unseen) scope — TODO confirm
# before editing.
envs = [env_fn for x in range(64)] venv = DummyVecEnv(envs) venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) logger.info("creating tf session") config = tf.ConfigProto() sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256) logger.info("training") final_model = ppo2.learn( env=venv, network=conv_fn, total_timesteps=total_time, save_interval=0, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=0,
def main():
    """Resume CVAE-augmented PPO training from saved policy/VAE checkpoints."""
    # Hyperparameters.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    last_step = 4587520  # where we have left off in training
    timesteps_per_proc = 25_000_000 - last_step
    use_vf_clipping = True

    # Checkpoints to resume from.
    model_path = '../train-procgen/saved_model/policy_bossfight_vae560'
    vae_path = '../train-procgen/saved_model/bossfight_vae560'

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    args = parser.parse_args()

    # MPI test-worker bookkeeping.
    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2_cvae.learn(env=venv,
                    network=conv_fn,
                    total_timesteps=timesteps_per_proc,
                    save_interval=10,
                    nsteps=nsteps,
                    nminibatches=nminibatches,
                    lam=lam,
                    gamma=gamma,
                    noptepochs=ppo_epochs,
                    log_interval=1,
                    ent_coef=ent_coef,
                    mpi_rank_weight=mpi_rank_weight,
                    clip_vf=use_vf_clipping,
                    comm=comm,
                    lr=learning_rate,
                    cliprange=clip_range,
                    update_fn=None,
                    init_fn=None,
                    vf_coef=0.5,
                    max_grad_norm=0.5,
                    load_path=model_path,
                    vae_path=vae_path)
def main():
    """Train PPO on Procgen with mixreg / data-augmentation options.

    Script entry point: parses CLI arguments, sets up MPI test workers,
    builds the eval environment and IMPALA CNN, then runs a customized
    `learn` with a mixreg model factory and saves the final checkpoint.
    """
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=500)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width',
                        type=str,
                        default='1x',
                        choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--timesteps_per_proc', type=int, default=1_000_000)
    parser.add_argument('--level_sampler_strategy', type=str, default='value_l1')
    parser.add_argument('--score_transform', type=str, default='rank')
    parser.add_argument('--save_dir', type=str,
                        default='gdrive/MyDrive/182 Project/')
    args = parser.parse_args()
    timesteps_per_proc = args.timesteps_per_proc
    log_dir = args.save_dir

    # Setup test worker
    # Every `test_worker_interval`-th MPI rank is a test worker: it carries
    # zero gradient weight and (below) sees the full level distribution.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    # "oracle" always trains on the unrestricted level distribution.
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger (only rank 0 of each split writes files)
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'tensorboard'
                   ] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=log_dir +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    # Held-out evaluation environment: fixed 500 levels starting at 0.
    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=args.distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(
        venv=eval_env,
        filename=None,
        keep_buf=100,
    )
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        # Pin this MPI rank to its assigned GPU before session creation.
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    # Conv depths scale with the requested model width.
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    # Random convolutions are disabled on test workers.
    conv_fn = lambda x: build_impala_cnn(
        x,
        depths=depths,
        use_bn=args.use_bn,
        randcnn=args.use_rand_conv and not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        num_levels=num_levels,
        start_level=start_level,
        eval_env=eval_env,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        level_sampler_strategy=args.level_sampler_strategy,
        score_transform=args.score_transform,
        model_fn=get_mixreg_model(mix_mode=args.mix_mode,
                                  mix_alpha=args.mix_alpha,
                                  use_l2reg=args.use_l2reg,
                                  l2reg_coeff=args.l2reg_coeff),
    )

    # Saving (rank 0 only)
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
def rollout_fn(num_steps,
               env_name,
               num_envs,
               distribution_mode,
               num_levels,
               start_level,
               timesteps_per_proc,
               is_test_worker=False,
               log_dir='/tmp/procgen',
               comm=None,
               load_path=None):
    """Collect rollouts on Procgen via ppo2.rollout, optionally from a checkpoint."""
    # Fixed PPO hyperparameters.
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    # Test workers carry no gradient weight and see the full level distribution.
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir,
                         format_strs=format_strs, filename="rollout")

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.rollout(env=venv,
                 network=conv_fn,
                 total_timesteps=timesteps_per_proc,
                 save_interval=0,
                 nsteps=nsteps,
                 nminibatches=nminibatches,
                 lam=lam,
                 gamma=gamma,
                 noptepochs=ppo_epochs,
                 log_interval=1,
                 ent_coef=ent_coef,
                 mpi_rank_weight=mpi_rank_weight,
                 clip_vf=use_vf_clipping,
                 comm=comm,
                 lr=learning_rate,
                 cliprange=clip_range,
                 update_fn=None,
                 init_fn=None,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 load_path=load_path,
                 num_steps=num_steps,
                 num_envs=num_envs,
                 env_name=env_name,
                 num_levels=num_levels,
                 start_level=start_level,
                 distribution_mode=distribution_mode)
def main():
    """Train PPO on Procgen with data augmentation, evaluating on unseen levels."""
    # Hyperparameters.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000  # 200_000_000: hard 25_000_000: easy
    use_vf_clipping = True
    LOG_DIR = './log/'

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--data_aug', type=str, default='normal')
    parser.add_argument('--exp_name', type=str, default='try1')
    # 500 for hard / 200 for easy
    parser.add_argument('--test_start_level', type=int, default=200)
    args = parser.parse_args()

    # MPI test-worker bookkeeping.
    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    # Logging under ./log/<env>/nlev_<n>_mode_<mode>/<aug>/<exp_name>.
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    LOG_DIR += args.env_name + '/nlev_' + str(args.num_levels) + '_mode_'
    LOG_DIR += args.distribution_mode + '/' + args.data_aug + '/' + args.exp_name
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    # Training environment.
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # Eval env, unlimited levels.
    eval_venv = ProcgenEnv(num_envs=num_envs,
                           env_name=args.env_name,
                           num_levels=0,
                           start_level=args.test_start_level,
                           distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")
    eval_venv = VecMonitor(venv=eval_venv, filename=None, keep_buf=100)
    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        eval_env=eval_venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=62,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        data_aug=args.data_aug,
    )
def main():
    """Train a vanilla PPO policy on Procgen and save it to a versioned path."""
    # Hyperparameters.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=1)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=int(-1))
    args = parser.parse_args()

    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = timesteps_per_proc  # use global 20_000_000 if not specified in args!

    run_ID = 'run_' + str(args.run_id).zfill(2)
    SAVE_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.run_id)
    load_path = None
    if args.load_id > -1:
        load_path = 'log/vanilla/saved_vanilla_v{}.tar'.format(args.load_id)

    # MPI test-worker bookkeeping.
    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []
    logpath = join(LOG_DIR, run_ID)
    # BUGFIX: os.makedirs replaces `os.system("mkdir -p %s" % logpath)` —
    # portable and not subject to shell interpolation of the path.
    os.makedirs(logpath, exist_ok=True)
    logger.configure(dir=logpath, format_strs=format_strs)

    # Persist the run arguments.
    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))
    logger.info("saving to filename ", SAVE_PATH)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto(log_device_placement=True)  # device_count={'GPU':0})
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=args.total_tsteps,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=args.log_interval,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        load_path=load_path,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        save_interval=300,
    )
    model.save(SAVE_PATH)
def main():
    """Train fruitbot with A2C / ACKTR / PPO, as selected by the config file."""
    parser = argparse.ArgumentParser(description="Process training arguments.")
    parser.add_argument('--config', type=str,
                        default="configurations/ppo_baseline_cuda.yaml",
                        help="config file name (located in config dir)")
    args = parser.parse_args()

    # Load experiment configuration.
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config)
    print(cfg.TRAIN.TOTAL_TIMESTEPS)

    # Timestamped experiment directory.
    exp_dir = f"runs/{cfg.EXPERIMENT_NAME}/{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    os.makedirs(exp_dir, exist_ok=True)

    # Logging to csv + stdout.
    logger.configure(dir=exp_dir,
                     format_strs=['csv', 'stdout'],
                     log_suffix=datetime.now().strftime('%Y-%m-%d-%H-%M'))

    # Training environment.
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=cfg.TRAIN.NUM_ENVS,
                      env_name="fruitbot",
                      num_levels=cfg.TRAIN.NUM_LEVELS,
                      start_level=cfg.TRAIN.LEVEL_SEED,
                      distribution_mode="easy")
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # Held-out evaluation environment.
    test_venv = ProcgenEnv(num_envs=cfg.TEST.NUM_ENVS,
                           env_name="fruitbot",
                           num_levels=cfg.TEST.NUM_LEVELS,
                           start_level=cfg.TEST.LEVEL_SEED,
                           distribution_mode="easy")
    test_venv = VecExtractDictObs(test_venv, "rgb")
    test_venv = VecMonitor(venv=test_venv, filename=None, keep_buf=100)
    test_venv = VecNormalize(venv=test_venv, ob=False)

    # TensorFlow session.
    logger.info("creating tf session")
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    # Policy CNN selection.  todo: make this less ugly
    logger.info("building cnn")
    conv_fn = None
    if cfg.TRAIN.NETWORK == "NATURE_CNN":
        conv_fn = lambda x: nature_cnn(x)
    elif cfg.TRAIN.NETWORK == "IMPALA_CNN":
        conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    # Dispatch on the configured policy-gradient algorithm.
    logger.info("training")
    if cfg.TRAIN.POLICY == "A2C":
        a2c.learn(env=venv,
                  network=conv_fn,
                  total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                  nsteps=cfg.TRAIN.BATCH_SIZE,
                  log_interval=1,
                  eval_env=test_venv,
                  augment=cfg.TRAIN.AUGMENT)
    elif cfg.TRAIN.POLICY == "ACKTR":
        acktr.learn(env=venv,
                    network=conv_fn,
                    total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                    nsteps=cfg.TRAIN.BATCH_SIZE,
                    log_interval=1,
                    eval_env=test_venv,
                    augment=cfg.TRAIN.AUGMENT,
                    seed=None)
    elif cfg.TRAIN.POLICY == "PPO":
        ppo2.learn(env=venv,
                   eval_env=test_venv,
                   network=conv_fn,
                   total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                   save_interval=5,
                   nsteps=cfg.TRAIN.BATCH_SIZE,
                   nminibatches=cfg.TRAIN.MINIBATCHES,
                   lam=cfg.TRAIN.LAM,
                   gamma=cfg.TRAIN.GAMMA,
                   noptepochs=cfg.TRAIN.NUM_EPOCHS,
                   log_interval=1,
                   clip_vf=cfg.TRAIN.USE_VF_CLIPPING,
                   lr=cfg.TRAIN.LR,
                   cliprange=cfg.TRAIN.CLIP_RANGE,
                   update_fn=None,
                   init_fn=None,
                   vf_coef=0.5,
                   max_grad_norm=0.5,
                   augment=cfg.TRAIN.AUGMENT,
                   load_path=cfg.TRAIN.PRETRAINED)
def main():
    """Train PPO on a procgen environment under MPI.

    Observation type is selectable via ``--obs``: raw 'rgb', label map
    'lbl', or one-hot label map 'onehot_lbl' (the one-hot path uses
    VecExtractDictObsOnehot and skips return normalization). Logs go to a
    fixed raid path keyed by env/obs/levels/seed.
    """
    # PPO hyperparameters (procgen-paper defaults)
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 5_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env-name', type=str, default='bigfish')
    parser.add_argument(
        '--distribution_mode', type=str, default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num-levels', type=int, default=200)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--obs', choices=['rgb', 'lbl', 'onehot_lbl'],
                        default='rgb')

    args = parser.parse_args()

    LOG_DIR = f'/raid0/dian/procgen_baseline/{args.env_name}/ppo_{args.obs}_{args.num_levels}_{SEED}'

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD

    is_test_worker = False
    if test_worker_interval > 0:
        # every test_worker_interval-th rank is reserved for evaluation
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = args.num_levels

    # log from every rank (per-rank comm splitting intentionally disabled)
    format_strs = ['csv', 'stdout']
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=args.env_name,
        num_levels=num_levels,
        start_level=args.start_level,
        distribution_mode=args.distribution_mode,
        rand_seed=SEED,
    )
    # num_levels=0 means the full (unrestricted) level set: generalization test
    test_venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=args.env_name,
        num_levels=0,
        start_level=args.start_level,
        distribution_mode=args.distribution_mode,
        rand_seed=SEED,
    )

    if args.obs == 'onehot_lbl':
        venv = VecExtractDictObsOnehot(venv, args.env_name)
        venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
        test_venv = VecExtractDictObsOnehot(test_venv, args.env_name)
        test_venv = VecMonitor(venv=test_venv, filename=None, keep_buf=100)
    else:
        venv = VecExtractDictObs(venv, args.obs)
        venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
        venv = VecNormalize(venv=venv, ob=False)
        test_venv = VecExtractDictObs(test_venv, args.obs)
        test_venv = VecMonitor(venv=test_venv, filename=None, keep_buf=100)
        test_venv = VecNormalize(venv=test_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        eval_env=test_venv,
        save_interval=100,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
def eval_fn(load_path, args, env_name='fruitbot', distribution_mode='easy',
            num_levels=500, start_level=500, log_dir='./tmp/procgen',
            comm=None, num_trials=3, gui=False):
    """Evaluate saved PPO checkpoint(s) on procgen.

    ``load_path`` may be a single checkpoint file or a directory of
    checkpoints; each is run for ``num_trials`` trials via alt_ppo2.eval.
    Prints a message and returns if the path does not exist.
    """
    # PPO hyperparameters; these must match the values the checkpoint was
    # trained with so the rebuilt graph matches the saved weights.
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True
    vf_coef = 0.5
    max_grad_norm = 0.5
    mpi_rank_weight = 1
    log_interval = 1
    seed = None

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=1, env_name=env_name, num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("evaluating")

    set_global_seeds(seed)
    policy = build_policy(venv, conv_fn)

    # Get the nb of env
    nenvs = venv.num_envs
    # Get state_space and action_space
    ob_space = venv.observation_space
    ac_space = venv.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    from .alternate_ppo2.model import Model
    model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm, comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    def _eval_checkpoint(ckpt_path):
        # Run alt_ppo2.eval on one checkpoint. Shared by the single-file and
        # directory branches below (previously duplicated verbatim).
        alt_ppo2.eval(
            network=conv_fn,
            nsteps=nsteps,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            gamma=gamma,
            lam=lam,
            log_interval=log_interval,
            nminibatches=nminibatches,
            noptepochs=ppo_epochs,
            load_path=ckpt_path,
            mpi_rank_weight=mpi_rank_weight,
            comm=comm,
            clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            policy=policy,
            nenvs=nenvs,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch=nbatch,
            nbatch_train=nbatch_train,
            model_fn=model_fn,
            model=model,
            num_trials=num_trials,
            num_levels=num_levels,
            start_level=start_level,
            gui=gui,
            args=args,
        )

    if os.path.isfile(load_path):
        _eval_checkpoint(load_path)
    elif os.path.isdir(load_path):
        for file in os.listdir(load_path):
            # re-point the logger at a per-checkpoint subdirectory
            log_comm = comm.Split(0, 0)
            format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
            logger.configure(comm=log_comm, dir=log_dir + '/' + file,
                             format_strs=format_strs)
            _eval_checkpoint(load_path + '/' + file)
    else:
        print('Model path does not exist.')
    return
def conv_fn(x):
    """IMPALA CNN feature extractor: depths 16/32/32, 256-dim embedding."""
    depths = [16, 32, 32]
    return build_impala_cnn(x, depths=depths, emb_size=256)
def main():
    """Train PPO on procgen, optionally replacing the environment reward
    with a learned reward model.

    If ``args.rm_id`` is set, the matching ``<rm_id>.rm`` state dict is
    located under the current tree, loaded into a RewardNet, and wrapped
    around the env via ProxyRewardWrapper; otherwise the true environment
    reward is used. The final model is saved into the run directory.

    Raises:
        FileNotFoundError: if ``args.rm_id`` is set but no matching ``.rm``
            file is found (previously an opaque IndexError from ``[0]``).
    """
    args = parse_config()
    run_dir = log_this(args, args.log_dir,
                       args.log_name + '_' + args.env_name + '_' + args.rm_id)

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD

    is_test_worker = False
    if test_worker_interval > 0:
        # every test_worker_interval-th rank is reserved for evaluation
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=run_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=args.num_envs, env_name=args.env_name,
                      num_levels=args.num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode,
                      use_sequential_levels=args.use_sequential_levels)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if args.rm_id:
        # load pretrained reward network
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        net = RewardNet().to(device)
        matches = glob.glob('./**/' + args.rm_id + '.rm', recursive=True)
        if not matches:
            # fail with a clear message instead of IndexError on [0]
            raise FileNotFoundError(
                f"no reward model file '{args.rm_id}.rm' found under ./")
        rm_path = matches[0]
        # device is already a torch.device; pass it directly as map_location
        net.load_state_dict(torch.load(rm_path, map_location=device))

        # use batch reward prediction function instead of the ground truth
        # reward function; pass through sigmoid if needed
        if args.use_sigmoid:
            rew_func = lambda x: 1 / (
                1 + np.exp(-net.predict_batch_rewards(x)))
        else:
            rew_func = lambda x: net.predict_batch_rewards(x)

        ## Uncomment the line below to train a live-long agent
        # rew_func = lambda x: x.shape[0] * [1]

        venv = ProxyRewardWrapper(venv, rew_func)
    else:
        # true environment rewards will be used
        pass

    venv = VecNormalize(venv=venv, ob=False, use_tf=False)

    # do the rest of the training as normal
    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32],
                                         emb_size=256)

    logger.info("training")
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=args.timesteps_per_proc,
        save_interval=args.save_interval,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        lam=args.lam,
        gamma=args.gamma,
        noptepochs=args.ppo_epochs,
        log_interval=args.log_interval,
        ent_coef=args.ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=args.use_vf_clipping,
        comm=comm,
        lr=args.learning_rate,
        cliprange=args.clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path=args.load_path,
    )
    model.save(os.path.join(run_dir, 'final_model.parameters'))
def main():
    """Train (or evaluate) PPO on procgen, fully configured from the CLI.

    Selects the CNN featurizer (--cnn_fn), value function (--value_fn) and
    entropy-coefficient schedule (--entropy_fn), then calls learn().
    """

    def _str2bool(s):
        # argparse's type=bool treats ANY non-empty string as True (so
        # "--eval False" was True); parse common spellings explicitly.
        return str(s).lower() in ('true', '1', 'yes', 'y', 't')

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot',
                        help='env to run on from procgen')
    parser.add_argument('--num_envs', type=int, default=64,
                        help='number of environments run simultaneously')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"],
                        help='level difficulty')
    parser.add_argument('--num_levels', type=int, default=0,
                        help='number of levels to train/test on')
    parser.add_argument('--start_level', type=int, default=0,
                        help='start level (used to avoid testing on seen levels)')
    parser.add_argument('--num_timesteps', type=int, default=0,
                        help='number of timesteps total to train/test on')
    parser.add_argument('--save_frequency', type=int, default=0,
                        help='checkpoint frequency')
    parser.add_argument('--model_loc', type=str, default=None,
                        help='location of pretrained model')
    parser.add_argument('--results_loc', type=str, default=None,
                        help='location of where to save current model/logs')
    # was type=bool, which made "--eval False" evaluate to True
    parser.add_argument('--eval', type=_str2bool, default=False,
                        help='if true, does not update model')
    parser.add_argument('--data_aug', type=str, default='normal',
                        help='whether to apply data augmentation')
    parser.add_argument('--gray_p', type=float, default=0.8,
                        help='p value for grayscale data augmentation')
    parser.add_argument('--value_fn', type=str, default='fc',
                        choices=['fc', 'gmm', 'lbmdp'],
                        help='value function for ppo2 critic')
    parser.add_argument('--cnn_fn', type=str, default='impala_cnn',
                        choices=['impala_cnn', 'nature_cnn', 'impala_cnn_lstm', 'lstm'],
                        help='cnn for featurization')
    parser.add_argument('--entropy_fn', type=str, default='constant',
                        choices=['constant', 'scaled'],
                        help='function for entropy loss coefficient')
    parser.add_argument('--ent_coef', type=float, default=0.01,
                        help='coefficient applied to entropy loss')
    parser.add_argument('--ent_scalar', type=float, default=1,
                        help='coefficient applied within sigmoid to scaled entropy coefficient')
    parser.add_argument('--seed', type=int, default=None,
                        help='seed for tensorflow')
    parser.add_argument('--gamma', type=float, default=0.999,
                        help='discount factor')
    parser.add_argument('--lam', type=float, default=0.95,
                        help='advantage discount factor')
    parser.add_argument('--lr', type=float, default=5e-4,
                        help='learning rate for Adam')
    # conv depths are channel counts, so parse as int (was type=float)
    parser.add_argument('--imp_h1', type=int, default=16,
                        help='impala cnn first hidden state')
    parser.add_argument('--imp_h2', type=int, default=64,
                        help='impala cnn second hidden state')
    parser.add_argument('--imp_h3', type=int, default=64,
                        help='impala cnn third hidden state')
    args = parser.parse_args()

    logger.configure(dir=args.results_loc, format_strs=['csv', 'stdout'])

    logger.info("Creating Environment")
    venv = ProcgenEnv(num_envs=args.num_envs, env_name=args.env_name,
                      num_levels=args.num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, 'rgb')
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("Creating Tensorflow Session")
    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    sess.__enter__()

    # pick the featurizer named on the command line
    if args.cnn_fn == 'impala_cnn':
        conv_fn = lambda x: build_impala_cnn(
            x, depths=[args.imp_h1, args.imp_h2, args.imp_h3], emb_size=256)
    elif args.cnn_fn == 'nature_cnn':
        conv_fn = lambda x: nature_cnn(x)
    elif args.cnn_fn == 'impala_cnn_lstm':
        conv_fn = impala_cnn_lstm()
    elif args.cnn_fn == 'lstm':
        conv_fn = lstm()
    else:
        conv_fn = mlp()

    logger.info("Training")
    learn(
        network=conv_fn,
        env=venv,
        total_timesteps=args.num_timesteps,
        eval_env=None,
        seed=args.seed,
        nsteps=256,
        ent_coef=args.ent_coef,
        lr=args.lr,
        vf_coef=0.5,
        max_grad_norm=0.5,
        gamma=args.gamma,
        lam=args.lam,
        log_interval=args.save_frequency,
        nminibatches=4,
        noptepochs=3,
        cliprange=0.2,
        save_interval=0,
        load_path=args.model_loc,
        data_aug=args.data_aug,
        args=args,
    )