def test_fn(env_name, num_envs, config_path, load_path, recur=True):
    """Evaluate a saved policy on a Procgen test environment.

    Args:
        env_name: Procgen environment name (e.g. 'coinrun').
        num_envs: number of parallel environments in the vectorized env.
        config_path: domain config file name, resolved relative to
            ./procgen-adr under the current working directory.
        load_path: checkpoint path forwarded to ``test``.
        recur: wrap the IMPALA CNN in an LSTM policy. Previously hard-coded
            to True; exposed as a parameter with the same default so existing
            callers are unaffected.

    Returns:
        (mean, std) episode-return statistics as reported by ``test``.
    """
    test_config_path = os.path.join(os.getcwd(), "procgen-adr", config_path)
    test_env = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                          domain_config_path=test_config_path,
                          render_mode="rgb_array")
    test_env = VecExtractDictObs(test_env, "rgb")
    test_env = VecMonitor(venv=test_env, filename=None, keep_buf=100)
    test_env = VecNormalize(venv=test_env, ob=False)

    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()
    try:
        conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
        if recur:
            logger.info("Using CNN LSTM")
            conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)
        mean, std = test(conv_fn, test_env, load_path=load_path)
    finally:
        # Release the session even if evaluation raises (the original leaked
        # it on error).
        sess.close()
    return mean, std
def main():
    """Evaluate a saved PPO policy (dropout IMPALA CNN) on held-out Procgen levels."""
    # Hyperparameters — only those actually consumed by the evaluation below;
    # unused training-only settings from the original script were removed.
    num_envs = 64
    ent_coef = .01
    vf_coef = 0.5
    max_grad_norm = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--total_timesteps', type=int, default=0)
    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    is_test_worker = False
    if test_worker_interval > 0:
        # Every test_worker_interval-th rank is reserved as a test worker.
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    # Only rank 0 of the log communicator writes formatted output.
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs,
                     log_suffix="_total_timesteps_{}_num_levels_{}".format(
                         args.total_timesteps, num_levels))

    logger.info("Creating dropout evaluation environment")
    # Held-out evaluation levels: 100 levels starting at seed 2000.
    eval_venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                           num_levels=100, start_level=2000,
                           distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")
    eval_venv = VecMonitor(venv=eval_venv, filename=None, keep_buf=100)
    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # is_train=False disables dropout at evaluation time.
    conv_fn = lambda x: build_impala_cnn(x, is_train=False,
                                         depths=[16, 32, 32], emb_size=256)

    logger.info("testing dropout")
    policy = build_policy(eval_venv, conv_fn)
    nenvs = eval_venv.num_envs
    ob_space = eval_venv.observation_space
    ac_space = eval_venv.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model);
    # modified from the baselines ppo2 learn function.
    from baselines.ppo2.model import Model
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                  nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                  ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm,
                  comm=comm, mpi_rank_weight=mpi_rank_weight)
    model.load(MODEL_PATH)

    # Use the hyperparameter locals instead of duplicating the literals
    # (originally gamma=.999, lam=.95 were repeated inline).
    eval_runner = Runner(env=eval_venv, model=model, nsteps=nsteps,
                         gamma=gamma, lam=lam)
    eval_epinfobuf = deque(maxlen=100)
    nupdates = args.total_timesteps // nbatch
    log_interval = 1
    for update in range(1, nupdates + 1):  # a single update suffices to test
        (eval_obs, eval_returns, eval_masks, eval_actions, eval_values,
         eval_neglogpacs, eval_states, eval_epinfos) = eval_runner.run()
        eval_epinfobuf.extend(eval_epinfos)
        if update % log_interval == 0 or update == 1:
            logger.logkv('eval_eprewmean',
                         safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
            logger.logkv('eval_eplenmean',
                         safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/total_timesteps', update * nbatch)
            logger.dumpkvs()
    eval_venv.close()
def main():
    """Resume PPO-CVAE training on a Procgen environment from saved checkpoints."""
    # PPO hyperparameters.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    last_step = 4587520  # where we have left off in training
    timesteps_per_proc = 25_000_000 - last_step
    use_vf_clipping = True
    # Checkpoints to resume from.
    model_path = '../train-procgen/saved_model/policy_bossfight_vae560'
    vae_path = '../train-procgen/saved_model/bossfight_vae560'

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    args = parser.parse_args()

    # MPI setup: optionally dedicate every N-th rank to testing.
    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    # Only rank 0 of the log communicator emits formatted logs.
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                      num_levels=num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2_cvae.learn(env=venv, network=conv_fn,
                    total_timesteps=timesteps_per_proc, save_interval=10,
                    nsteps=nsteps, nminibatches=nminibatches, lam=lam,
                    gamma=gamma, noptepochs=ppo_epochs, log_interval=1,
                    ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight,
                    clip_vf=use_vf_clipping, comm=comm, lr=learning_rate,
                    cliprange=clip_range, update_fn=None, init_fn=None,
                    vf_coef=0.5, max_grad_norm=0.5, load_path=model_path,
                    vae_path=vae_path)
def main():
    """Run DQN until the environment throws an exception."""
    # Hyperparameters
    learning_rate = 2.5e-4
    gamma = 0.99
    nstep_return = 3
    timesteps_per_proc = 50_000_000
    train_interval = 4
    target_interval = 8192
    batch_size = 512
    min_buffer_size = 20000

    # Parse arguments
    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='starpilot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--level_setup', type=str, default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode', type=str, default='nomix',
                        choices=['nomix', 'mixreg'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--data_aug', type=str, default='no_aug',
                        choices=['no_aug', 'cutout_color', 'crop'])
    parser.add_argument('--PER', type=lambda x: bool(strtobool(x)), default=True,
                        help='Whether to use PER')
    parser.add_argument('--num_envs', type=int, default=64)
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_envs = args.num_envs

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                      num_levels=num_levels, start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup Rainbow models
    logger.info("building models")
    online_net, target_net = rainbow_models(
        sess,
        venv.action_space.n,
        gym_space_vectorizer(venv.observation_space),
        min_val=REWARD_RANGE_FOR_C51[env_name][0],
        max_val=REWARD_RANGE_FOR_C51[env_name][1])
    dqn = MpiDQN(online_net, target_net, discount=gamma, comm=comm,
                 mpi_rank_weight=mpi_rank_weight, mix_mode=args.mix_mode,
                 mix_alpha=args.mix_alpha, use_l2reg=args.use_l2reg,
                 data_aug=args.data_aug)
    player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return)
    optimize = dqn.optimize(learning_rate=learning_rate)

    # Initialize and sync variables
    sess.run(tf.global_variables_initializer())
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    if comm.Get_size() > 1:
        sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E110

    # Training. The only difference between the PER and non-PER paths is the
    # replay buffer: alpha and beta equal to 0 give uniform prioritization and
    # no importance sampling.
    logger.info("training")
    if args.PER:
        replay_buffer = PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
    else:
        replay_buffer = PrioritizedReplayBuffer(500000, 0, 0, epsilon=0.1)
    dqn.train(num_steps=timesteps_per_proc,
              player=player,
              replay_buffer=replay_buffer,
              optimize_op=optimize,
              train_interval=train_interval,
              target_interval=target_interval,
              batch_size=batch_size,
              min_buffer_size=min_buffer_size)
def main():
    """Train PPO (optionally with mixreg/mixobs and data augmentation) on Procgen,
    evaluating on 500 held-out levels, then save the final model."""
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=500)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug', type=str, default='no_aug',
                        choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width', type=str, default='1x',
                        choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup', type=str, default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode', type=str, default='nomix',
                        choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--timesteps_per_proc', type=int, default=1_000_000)
    parser.add_argument('--level_sampler_strategy', type=str, default='value_l1')
    parser.add_argument('--score_transform', type=str, default='rank')
    parser.add_argument('--save_dir', type=str, default='gdrive/MyDrive/182 Project/')
    args = parser.parse_args()
    timesteps_per_proc = args.timesteps_per_proc
    log_dir = args.save_dir

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'tensorboard'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=log_dir + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env (500 held-out levels starting at seed 0 for evaluation)
    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=500,
                          start_level=0, distribution_mode=args.distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100)
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        # BUGFIX: wrap the rank with modulo so ranks beyond the number of
        # listed GPUs share devices instead of raising IndexError (matches
        # the sibling DQN launcher's gpus_id[rank % len(gpus_id)]).
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model: channel depths scale with the requested width multiplier.
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    conv_fn = lambda x: build_impala_cnn(
        x, depths=depths, use_bn=args.use_bn,
        randcnn=args.use_rand_conv and not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        num_levels=num_levels,
        start_level=start_level,
        eval_env=eval_env,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        level_sampler_strategy=args.level_sampler_strategy,
        score_transform=args.score_transform,
        model_fn=get_mixreg_model(mix_mode=args.mix_mode,
                                  mix_alpha=args.mix_alpha,
                                  use_l2reg=args.use_l2reg,
                                  l2reg_coeff=args.l2reg_coeff),
    )

    # Saving (rank 0 only, so workers don't race on the checkpoint file)
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
def main():
    """Train or test a two-model PPO ensemble on Procgen.

    In training mode, two environments are created on disjoint halves of the
    level range and passed to ``train``; in test mode (``--test``) one of the
    two saved models is evaluated via ``test_all``.
    """
    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=99)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=5)
    parser.add_argument('--load_id', type=int, default=int(-1))
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    parser.add_argument('--test', default=False, action="store_true")
    parser.add_argument('--use_model', type=int, default=1,
                        help="either model #1 or #2")
    parser.add_argument('--train_level', type=int, default=50)
    args = parser.parse_args()

    # timesteps_per_proc
    # NOTE(review): num_envs and nsteps are not defined in this function —
    # presumably module-level constants; verify. timesteps_per_proc and
    # total_timesteps also appear unused below (args.total_tsteps is what is
    # consumed downstream); kept for behavioral parity.
    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = TIMESTEPS_PER_PROC  ## use global 20_000_000 if not specified in args!
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)

    run_ID = 'run_' + str(args.run_id).zfill(2)
    if args.test:
        args.log_interval = 1
        args.total_tsteps = 1_000_000
        run_ID += '_test{}_model{}'.format(args.load_id, args.use_model)

    load_path = None
    if args.load_id > -1:
        load_path = join(SAVE_PATH, args.env_name,
                         'saved_ensemble2_v{}.tar'.format(args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []
    if args.test:
        logpath = join('log2/ensemble2', args.env_name, 'test', run_ID)
    else:
        logpath = join('log2/ensemble2', args.env_name, 'train', run_ID)
    save_path = join(SAVE_PATH, args.env_name,
                     'saved_ensemble2_v{}.tar'.format(args.run_id))
    logger.info("\n Model will be saved to file {}".format(save_path))

    # FIX: portable, shell-free directory creation instead of
    # os.system("mkdir -p %s" % logpath).
    os.makedirs(logpath, exist_ok=True)
    logger.configure(dir=logpath, format_strs=format_strs)

    # Persist the run configuration alongside the logs.
    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("creating tf session")
    setup_mpi_gpus()
    if not args.test:
        config = tf.compat.v1.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True)  # device_count={'GPU':0})
        config.gpu_options.allow_growth = True  # pylint: disable=E1101
        sess = tf.compat.v1.Session(config=config)

        # Split the level range in half: env1 trains on [0, n_levels),
        # env2 on [n_levels, 2*n_levels).
        logger.info("creating 2 environments")
        n_levels = int(args.num_levels / 2)
        env1 = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                          num_levels=n_levels, start_level=0,
                          distribution_mode=args.distribution_mode)
        env1 = VecExtractDictObs(env1, "rgb")
        env1 = VecMonitor(venv=env1, filename=None, keep_buf=100)
        env1 = VecNormalize(venv=env1, ob=False)
        env2 = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                          num_levels=n_levels, start_level=n_levels,
                          distribution_mode=args.distribution_mode)
        env2 = VecExtractDictObs(env2, "rgb")
        env2 = VecMonitor(venv=env2, filename=None, keep_buf=100)
        env2 = VecNormalize(venv=env2, ob=False)
        train(run_ID, save_path, load_path, env1, env2, sess, logger, args)
    else:
        use_model = args.use_model  ## 1 or 2
        alt_flag = use_model - 1
        test_all(alt_flag, load_path, logger, args)
def train_fn(env_name, num_envs, distribution_mode, num_levels, start_level,
             timesteps_per_proc, level_sampler_strategy, score_transform,
             model_name, is_test_worker=False, save_dir='./', comm=None):
    """Train a PPO agent with level sampling, then save it under save_dir.

    Evaluation runs on 500 levels starting at seed 0; the trained model is
    written to ``save_dir + 'models/' + model_name`` and logs go to
    ``save_dir + 'logs/' + model_name``.
    """
    # Fixed PPO hyperparameters.
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    log_dir = save_dir + 'logs/' + model_name
    if log_dir is not None:  # always true (string); kept for parity
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout', 'tensorboard'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                          num_levels=500, start_level=0,
                          distribution_mode=distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100)
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    logger.info("training")
    model = ppo2.learn(network=conv_fn,
                       total_timesteps=timesteps_per_proc,
                       num_levels=num_levels,
                       eval_env=eval_env,
                       save_interval=0,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       level_sampler_strategy=level_sampler_strategy,
                       score_transform=score_transform)
    model.save(save_dir + 'models/' + model_name)
def run(checkpoint_path, *, output_dir, load_kwargs={},
        trajectories_kwargs={}, observations_kwargs={}, **generate_kwargs):
    # NOTE(review): mutable default arguments ({}) are shared across calls and
    # load_kwargs is mutated below ("resample" is set) — confirm callers never
    # rely on the defaults across repeated invocations.
    """Generate an interface from a checkpoint file.

    Arguments:
        checkpoint_path: path to the checkpoint file, a joblib file containing
            a dictionary with these keys:
            - params: saved model parameters as a dictionary mapping tensor
              names to numpy arrays
            - args: dictionary of metadata with these keys:
                - env_name: Procgen environment name
                    (required if env_kind is 'procgen')
                - env_id: lowercase id of the Atari environment
                    (required if env_kind is 'atari')
                - env_kind: either 'procgen' or 'atari' (defaults to 'procgen')
                - gamma: GAE hyperparameter gamma used to train the model
                    (defaults to None)
                - lambda: GAE hyperparameter lambda used to train the model
                    (defaults to None)
                - cnn: model architecture, one of 'clear', 'impala' or
                  'nature' (defaults to 'clear')
                - any other optional arguments used to create the environment
                  or get the architecture
        output_dir: path to directory where the interface is to be saved
            (required).
        load_kwargs: dictionary with keys for any of the following:
            - resample: whether to process the checkpoint file from scratch,
              rather than reusing samples previously saved to a non-temporary
              location (defaults to True)
            - model_path: lucid model save location
            - metadata_path: metadata dictionary save location
            - trajectories_path: trajectories save location
            - observations_path: additional observations save location
            - full_resolution: whether to also save observations in
              human-scale resolution (significant performance cost;
              defaults to False)
            - temp_files: if any of the above paths is not specified, whether
              to default to a temporary location rather than a subdirectory of
              the checkpoint file's directory (defaults to False)
        trajectories_kwargs: dictionary (only used if resampling) with keys
            for any of the following:
            - num_envs: number of trajectories to collect (defaults to 8)
            - num_steps: length of each trajectory (defaults to 512)
        observations_kwargs: dictionary (only used if resampling) with keys
            for any of the following:
            - num_envs: number of environments to collect additional
              observations from in parallel (defaults to 32)
            - num_obs: number of additional observations to collect from each
              parallel environment (defaults to 128)
            - obs_every: number of steps to wait between each observation
              (defaults to 128)

    Remaining keyword arguments are forwarded to generate():
        model_bytes: lucid model, represented as a save file's bytes
            (defaults to being extracted automatically)
        observations: numpy array of additional observations used for feature
            visualization (defaults to being extracted automatically)
        observations_full: numpy array of the additional observations in
            human-scale resolution, or None to only use observations at the
            resolution seen by the model (defaults to being extracted
            automatically, or None if human-scale observations were not saved)
        trajectories: dictionary of trajectories with keys 'observations',
            'actions', 'rewards', either 'firsts' or 'dones', and optionally
            'observations_full', each value being a numpy array with first two
            dimensions batch and timestep (defaults to being extracted
            automatically)
        policy_logits_name: name of tensor of policy logits
            (defaults to being extracted automatically)
        value_function_name: name of tensor of value function
            (defaults to being extracted automatically)
        env_name: Procgen environment name, used to help infer action_combos
            if that is not provided (defaults to being extracted
            automatically, or 'unknown' if that fails)
        numpy_precision: number of significant figures to round numpy arrays
            in the HTML file to (defaults to 6)
        inline_js: whether to include the JavaScript in the HTML file inline,
            rather than referencing a separate file (defaults to True, to
            avoid ad-blocker issues)
        inline_large_json: whether to include large amounts of JSON data in
            the HTML file inline, rather than referencing separate files
            (defaults to whether output_dir does not contain '://')
        batch_size: size of minibatch of observations to pass through the
            model (defaults to 512)
        action_combos: list of tuples of strings describing the combinations
            of buttons triggered by each action (defaults to being extracted
            automatically, or [('0',), ..., ('<num_actions - 1>',)] if that
            fails)
        action_group_fns: list of function filters for grouping the action
            combos in different ways (defaults to [
                lambda combo: 'RIGHT' in combo,
                lambda combo: 'LEFT' in combo,
                lambda combo: 'UP' in combo,
                lambda combo: 'DOWN' in combo,
                lambda combo: 'RIGHT' not in combo and 'LEFT' not in combo
                    and 'UP' not in combo and 'DOWN' not in combo
            ])
        layer_kwargs: dictionary of options for choosing layers, with keys for
            any of the following:
            - name_contains_one_of: list of strings each layer name must
              contain one of, or None to not filter by name (defaults to None)
            - op_is_one_of: list of strings each layer op must be one of
              (defaults to ['relu'])
            - bottleneck_only: whether to only include layers such that every
              path to an earlier convolutional layer passes through a
              bottleneck of the network (defaults to True)
            - discard_first_n: number of first layers to discard
              (defaults to 0)
        input_layer_include: whether to additionally calculate gradients with
            respect to the input layer (defaults to False)
        input_layer_name: display name of the input layer
            (defaults to 'input')
        gae_gamma: gamma for computing advantages using GAE (defaults to
            being extracted automatically, or 0.999 if that fails)
        gae_lambda: lambda for computing advantages using GAE (defaults to
            being extracted automatically, or 0.95 if that fails)
        trajectory_bookmarks: number of links to display to highest advantage
            episodes and to lowest advantage episodes (defaults to 16)
        nmf_features: number of dimensions for NMF dimensionality reduction
            (defaults to 8)
        nmf_attr_opts: dictionary of options for computing attribution for NMF
            dimensionality reduction, the main one being integrate_steps
            (see attr_integrate_steps; defaults to {'integrate_steps': 10},
            though if a dictionary is provided without an 'integrate_steps'
            key, then integrate_steps defaults to 1)
        vis_subdiv_mults: list of values of subdiv_mult, the spatial
            resolution of the grid of dataset examples used for feature
            visualization, as a multiple of the resolution of the layer's
            activations (defaults to [0.25, 0.5, 1, 2])
        vis_subdiv_mult_default: default value of subdiv_mult
            (defaults to 1)
        vis_expand_mults: list of values of expand_mult, the height and width
            of each patch used for feature visualization, as a multiple of the
            number of pixels if the layer were overlaid on the observation
            (defaults to [1, 2, 4, 8])
        vis_expand_mult_default: default value of expand_mult (defaults to 4)
        vis_thumbnail_num_mult: spatial resolution of the grid of dataset
            examples used for feature visualization thumbnails (defaults to 4)
        vis_thumbnail_expand_mult: the height and width of each patch used for
            feature visualization thumbnails, as a multiple of the number of
            pixels if the layer were overlaid on the observation
            (defaults to 4)
        scrub_range: horizontal interval of observations and attribution used
            to construct scrubs (defaults to (42 / 64, 44 / 64))
        attr_integrate_steps: number of points on the path used for numerical
            integration for computing attribution (defaults to 10)
        attr_max_paths: maximum number of paths for multi-path attribution, or
            None to use single-path attribution (defaults to None)
        attr_policy: whether to compute attribution for the policy
            (defaults to False)
        attr_single_channels: whether to allow attribution for single channels
            to be displayed (defaults to True)
        observations_subdir: name of subdirectory containing additional
            observations (defaults to 'observations/')
        trajectories_subdir: name of subdirectory containing trajectory
            observations (defaults to 'trajectories/')
        trajectories_scrub_subdir: name of subdirectory containing scrubs of
            trajectory observations (defaults to 'trajectories_scrub/')
        features_subdir: name of subdirectory containing feature
            visualizations (defaults to 'features/')
        thumbnails_subdir: name of subdirectory containing feature thumbnails
            (defaults to 'thumbnails/')
        attribution_subdir: name of subdirectory containing attribution
            (defaults to 'attribution/')
        attribution_scrub_subdir: name of subdirectory containing scrubs of
            attribution (defaults to 'attribution_scrub/')
        video_height: css height of each video screen (defaults to '16em')
        video_width: css width of each video screen (defaults to '16em')
        video_speed: speed of videos in frames per second (defaults to 12)
        policy_display_height: css height of bar displaying policy
            (defaults to '2em')
        policy_display_width: css width of bar displaying policy
            (defaults to '40em')
        navigator_width: css width of navigator bar (defaults to '24em')
        scrubber_height: css height of each scrubber (defaults to '4em')
        scrubber_width: css width of each scrubber (defaults to '48em')
        scrubber_visible_duration: number of frames visible in each scrubber
            (defaults to 256)
        legend_item_height: css height of each legend item
            (defaults to '6em')
        legend_item_width: css width of each legend item (defaults to '6em')
        feature_viewer_height: css height of feature visualizations in the
            popup (defaults to '40em')
        feature_viewer_width: css width of feature visualizations in the
            popup (defaults to '40em')
        attribution_weight: css opacity of attribution when overlaid on
            observations, taking into account the fact that attribution is
            mostly transparent (defaults to 0.9)
        graph_colors: dictionary specifying css colors of graphs of each type
            (defaults to {'v': 'green', 'action': 'red',
            'action_group': 'orange', 'advantage': 'blue'})
        trajectory_color: css color of text displaying trajectory information
            such as actions and rewards (defaults to 'blue')
    """
    import tensorflow as tf
    from mpi4py import MPI
    from baselines.common.mpi_util import setup_mpi_gpus
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    setup_mpi_gpus()
    exn = None
    if rank == 0 and load_kwargs.get("resample", True):
        # Rank 0 alone performs the (expensive) resampling, then releases the
        # other ranks at the barrier.
        kwargs = load(checkpoint_path,
                      trajectories_kwargs=trajectories_kwargs,
                      observations_kwargs=observations_kwargs,
                      **load_kwargs)
        comm.barrier()
    else:
        # Every other rank waits for rank 0, then loads the already-saved
        # samples instead of resampling.
        comm.barrier()
        load_kwargs["resample"] = False
        try:
            kwargs = load(checkpoint_path,
                          trajectories_kwargs=trajectories_kwargs,
                          observations_kwargs=observations_kwargs,
                          **load_kwargs)
        except tf.errors.NotFoundError as e:
            # Saved samples may not be visible on this node (e.g. non-shared
            # temp storage); remember the failure and fall back to a
            # broadcast from rank 0 below.
            exn = e
            kwargs = None
    # Count how many ranks failed to load.
    errors = comm.allreduce(0 if exn is None else 1, op=MPI.SUM)
    if errors == size:
        # No rank could load the samples: give up.
        raise FileNotFoundError from exn
    elif errors > 0:
        # Some ranks failed: share rank 0's result with everyone.
        kwargs = comm.bcast(kwargs, root=0)
    kwargs["output_dir"] = output_dir
    kwargs.update(generate_kwargs)
    generate(**kwargs)
def main():
    """Train a PPO-variant agent (selected via --use) on a Procgen game.

    Side effects: writes logs + args JSON under log/<agent>/train/run_XX,
    and saves the final model to log/<agent>/saved_<agent>_vN.tar.
    """
    # Fixed hyperparameters (PPO defaults for Procgen easy mode).
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 30_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode', type=str, default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--use', type=str, default="randcrop")
    parser.add_argument('--log_interval', type=int, default=20)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=int(-1))
    args = parser.parse_args()

    # --nupdates overrides the default training length.
    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = timesteps_per_proc  ## fall back to timesteps_per_proc (30M default) when not given

    run_ID = 'run_' + str(args.run_id).zfill(2)
    ## select which ppo to use:
    agent_str = args.use
    LOG_DIR = join("log", agent_str, "train")
    save_model = join("log", agent_str,
                      "saved_{}_v{}.tar".format(agent_str, args.run_id))
    ppo_func = PPO_FUNCs[agent_str]

    # Optionally resume from a previously saved run of the same agent.
    load_path = None
    if args.load_id > -1:
        load_path = join("log", agent_str,
                         "saved_{}_v{}.tar".format(agent_str, args.load_id))

    # MPI bookkeeping: designated test workers get zero gradient weight
    # and an unrestricted level set.
    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []
    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        # os.makedirs is portable and immune to shell quoting issues,
        # unlike the previous os.system("mkdir -p %s").
        os.makedirs(logpath, exist_ok=True)
    logger.configure(dir=logpath, format_strs=format_strs)

    # Persist the exact arguments used for this run.
    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))
    logger.info("\n Saving model to file {}".format(save_model))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto(
        log_device_placement=True)  #device_count={'GPU':0, 'XLA_GPU':0})
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    #sess.__enter__()
    logger.info(venv.observation_space)

    logger.info("training")
    with sess.as_default():
        model = ppo_func.learn(
            sess=sess,
            env=venv,
            network=None,
            total_timesteps=args.total_tsteps,
            save_interval=1000,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=args.log_interval,
            ent_coef=ent_coef,
            # clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            # update_fn=None,
            # init_fn=None,
            save_path=save_model,
            load_path=load_path,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
        # Final save after training completes (learn also checkpoints
        # every save_interval updates via save_path).
        model.save(save_model)
def main():
    """Train PPO2 with a DARLA-style VAE encoder, on visual-cartpole or a
    Procgen game, with a held-out evaluation environment.

    NOTE(review): paths are hard-coded to /home/josh — confirm before reuse.
    """
    # Fixed hyperparameters.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    # nsteps = (128 // 8)
    nsteps = (128 // 8)  # short rollouts (16 steps per env per update)
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 1_000_000
    use_vf_clipping = True

    dist_mode = "easy"
    env_name = "visual-cartpole"
    num_levels = 100
    # disc_coeff = None
    disc_coeff = 0.  # discriminator coefficient; 0. disables, None means "ramping" log dir

    # Log directory encodes the experiment configuration.
    if disc_coeff is None:
        LOG_DIR = "/home/josh/" + env_name + "/" + env_name + "_disc_coeff_ramping2_num_levels_" + str(num_levels) + "_nsteps_" + str(nsteps)
    else:
        LOG_DIR = "/home/josh/" + env_name + "_easy_vae/" + env_name + "_disc_coeff_" + str(disc_coeff) + "_num_levels_" + str(num_levels) + "_nsteps_" + str(nsteps)

    # Test-worker machinery is effectively disabled (interval fixed at 0).
    test_worker_interval = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'tensorboard'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    if env_name == "visual-cartpole":
        # gym.vector env needs its spaces patched to look like a Procgen env.
        venv = gym.vector.make('cartpole-visual-v1', num_envs=num_envs, num_levels=num_levels)
        venv.observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
        venv.action_space = gym.spaces.Discrete(2)
    else:
        venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels,
                          start_level=0, distribution_mode=dist_mode)
        venv = VecExtractDictObs(venv, "rgb")
    # NOTE(review): wrapper nesting reconstructed from collapsed source —
    # monitor/normalize assumed to apply to both branches; confirm.
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    # Held-out evaluation environment (unseen levels / start_level=1000).
    if env_name == "visual-cartpole":
        test_venv = gym.vector.make('cartpole-visual-v1', num_envs=num_envs, num_levels=0)
        test_venv.observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
        test_venv.action_space = gym.spaces.Discrete(2)
    else:
        test_venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=0,
                               start_level=1000, distribution_mode=dist_mode)
        test_venv = VecExtractDictObs(test_venv, "rgb")
    test_venv = VecMonitor(
        venv=test_venv,
        filename=None,
        keep_buf=100,
    )
    # test_venv = VecExtractDictObs(test_venv, "rgb")
    test_venv = VecNormalize(venv=test_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    sess = tf.Session(config=config)
    sess.__enter__()  # session stays installed as default for the whole run

    # conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)
    # conv_fn = lambda x: nature_cnn(x)
    conv_fn = lambda x: build_darla_vae(x, emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        eval_env=test_venv,
        num_levels=num_levels,
        disc_coeff=disc_coeff,
    )
def main():
    """Evaluate a saved model (loaded from module-level MODEL_PATH) with an
    is_train=False IMPALA CNN, logging to module-level LOG_DIR.
    """
    # Fixed hyperparameters.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    # timesteps_per_proc = 50_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode', type=str, default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--total_timesteps', type=int, default=0)
    args = parser.parse_args()

    # Every test_worker_interval-th rank becomes a zero-weight test worker.
    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs,
                     log_suffix="_total_timesteps_{}_num_levels_{}".format(
                         args.total_timesteps, num_levels))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    # Evaluation on 100 held-out levels starting at level 2000.
    logger.info("creating evaluation environment")
    eval_venv = ProcgenEnv(num_envs=num_envs,
                           env_name=args.env_name,
                           num_levels=100,
                           start_level=2000,
                           distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")
    eval_venv = VecMonitor(
        venv=eval_venv,
        filename=None,
        keep_buf=100,
    )
    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(
        x, is_train=False, depths=[16, 32, 32], emb_size=256)
    #change conv_fn so that its set to testing (is_train=False freezes dropout)

    logger.info("testing")
    model = ppo2.learn(env=venv,
                       eval_env=eval_venv,
                       network=conv_fn,
                       total_timesteps=args.total_timesteps,
                       save_interval=0,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       load_path=MODEL_PATH)
    # Save the model
    model.save(
        "test_dropout_model/model_total_timesteps_{}_num_levels_{}".format(
            args.total_timesteps, num_levels))
def main():
    """Train PPO2 on Procgen with a chosen data-augmentation strategy
    (--data_aug), evaluating on held-out levels during training.
    """
    # Fixed hyperparameters.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000  # 200_000_000: hard 25_000_000: easy
    use_vf_clipping = True
    LOG_DIR = './log/'

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode', type=str, default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--data_aug', type=str, default='normal')
    parser.add_argument('--exp_name', type=str, default='try1')
    parser.add_argument('--test_start_level', type=int,
                        default=200)  # 500 for hard / 200 for easy
    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    #if args.num_levels < 50:
    #    timesteps_per_proc = 5_000_000
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    # Log directory encodes env, level count, mode, augmentation and run name.
    LOG_DIR += args.env_name + '/nlev_' + str(args.num_levels) + '_mode_'
    LOG_DIR += args.distribution_mode + '/' + args.data_aug + '/' + args.exp_name
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    # eval env, unlimited levels
    eval_venv = ProcgenEnv(num_envs=num_envs,
                           env_name=args.env_name,
                           num_levels=0,
                           start_level=args.test_start_level,
                           distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")
    eval_venv = VecMonitor(
        venv=eval_venv,
        filename=None,
        keep_buf=100,
    )
    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        eval_env=eval_venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=62,  # checkpoint every 62 updates
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        data_aug=args.data_aug,
    )
def main():
    """Train PPO2 on Procgen with a selectable observation type
    (--obs: raw rgb, label map, or one-hot label map), with a test env
    on the full level distribution. Uses module-level SEED.
    """
    # Fixed hyperparameters.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 5_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env-name', type=str, default='bigfish')
    parser.add_argument(
        '--distribution_mode', type=str, default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num-levels', type=int, default=200)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--obs', choices=['rgb', 'lbl', 'onehot_lbl'],
                        default='rgb')
    args = parser.parse_args()

    LOG_DIR = f'/raid0/dian/procgen_baseline/{args.env_name}/ppo_{args.obs}_{args.num_levels}_{SEED}'

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = args.num_levels

    # log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout']  # if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=args.env_name,
        num_levels=num_levels,
        start_level=args.start_level,
        distribution_mode=args.distribution_mode,
        rand_seed=SEED,
    )
    # Test env: num_levels=0 means the full level distribution.
    test_venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=args.env_name,
        num_levels=0,
        start_level=args.start_level,
        distribution_mode=args.distribution_mode,
        rand_seed=SEED,
    )

    if args.obs == 'onehot_lbl':
        # One-hot label observations: note no VecNormalize in this branch.
        venv = VecExtractDictObsOnehot(venv, args.env_name)
        venv = VecMonitor(
            venv=venv,
            filename=None,
            keep_buf=100,
        )
        test_venv = VecExtractDictObsOnehot(test_venv, args.env_name)
        test_venv = VecMonitor(
            venv=test_venv,
            filename=None,
            keep_buf=100,
        )
    else:
        venv = VecExtractDictObs(venv, args.obs)
        venv = VecMonitor(
            venv=venv,
            filename=None,
            keep_buf=100,
        )
        venv = VecNormalize(venv=venv, ob=False)
        test_venv = VecExtractDictObs(test_venv, args.obs)
        test_venv = VecMonitor(
            venv=test_venv,
            filename=None,
            keep_buf=100,
        )
        test_venv = VecNormalize(venv=test_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        eval_env=test_venv,
        save_interval=100,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
def eval_fn(load_path, args, env_name='fruitbot', distribution_mode='easy',
            num_levels=500, start_level=500, log_dir='./tmp/procgen',
            comm=None, num_trials=3, gui=False):
    """Evaluate saved checkpoint(s) with alternate PPO2.

    load_path may be a single checkpoint file or a directory of checkpoints;
    in the directory case every file is evaluated, each with its own logger
    subdirectory. comm must be a valid MPI communicator (no None default
    handling here). Returns None.
    """
    # Fixed hyperparameters matching training.
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True
    vf_coef = 0.5
    max_grad_norm = 0.5
    mpi_rank_weight = 1
    log_interval = 1
    seed = None  # no fixed seed for evaluation

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=1, env_name=env_name, num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    logger.info(f"evaluating")
    set_global_seeds(seed)
    policy = build_policy(venv, conv_fn)

    # Get the nb of env
    nenvs = venv.num_envs
    # Get state_space and action_space
    ob_space = venv.observation_space
    ac_space = venv.action_space
    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    from .alternate_ppo2.model import Model
    model_fn = Model
    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm, comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if os.path.isfile(load_path):
        # Single checkpoint file: evaluate it directly.
        alt_ppo2.eval(
            network=conv_fn,
            nsteps=nsteps,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            gamma=gamma,
            lam=lam,
            log_interval=log_interval,
            nminibatches=nminibatches,
            noptepochs=ppo_epochs,
            load_path=load_path,
            mpi_rank_weight=mpi_rank_weight,
            comm=comm,
            clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            policy=policy,
            nenvs=nenvs,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch=nbatch,
            nbatch_train=nbatch_train,
            model_fn=model_fn,
            model=model,
            num_trials=num_trials,
            num_levels=num_levels,
            start_level=start_level,
            gui=gui,
            args=args
        )
    elif os.path.isdir(load_path):
        # Directory of checkpoints: re-point the logger per file, then eval.
        for file in os.listdir(load_path):
            log_comm = comm.Split(0, 0)
            format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
            logger.configure(comm=log_comm, dir=log_dir+'/'+file,
                             format_strs=format_strs)
            alt_ppo2.eval(
                network=conv_fn,
                nsteps=nsteps,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                gamma=gamma,
                lam=lam,
                log_interval=log_interval,
                nminibatches=nminibatches,
                noptepochs=ppo_epochs,
                load_path=load_path+'/'+file,
                mpi_rank_weight=mpi_rank_weight,
                comm=comm,
                clip_vf=use_vf_clipping,
                lr=learning_rate,
                cliprange=clip_range,
                policy=policy,
                nenvs=nenvs,
                ob_space=ob_space,
                ac_space=ac_space,
                nbatch=nbatch,
                nbatch_train=nbatch_train,
                model_fn=model_fn,
                model=model,
                num_trials=num_trials,
                num_levels=num_levels,
                start_level=start_level,
                gui=gui,
                args=args
            )
    else:
        print('Model path does not exist.')
    return
def train(comm=None, *, save_dir=None, **kwargs):
    """
    Train a model using Baselines' PPO2, and save a checkpoint file in the
    required format.

    There is one required kwarg: either env_name (for env_kind="procgen") or
    env_id (for env_kind="atari"). All other kwargs fall back to the defaults
    set below. Models for the paper were trained with 16 parallel MPI workers.

    Returns the directory checkpoints were saved to (a fresh temp dir when
    save_dir is None). Note: this code has not been well-tested.
    """
    # Fill in defaults for every tunable knob so kwargs is self-describing
    # when persisted alongside the checkpoint.
    kwargs.setdefault("env_kind", "procgen")
    kwargs.setdefault("num_envs", 64)
    kwargs.setdefault("learning_rate", 5e-4)
    kwargs.setdefault("entropy_coeff", 0.01)
    kwargs.setdefault("gamma", 0.999)
    kwargs.setdefault("lambda", 0.95)
    kwargs.setdefault("num_steps", 256)
    kwargs.setdefault("num_minibatches", 8)
    kwargs.setdefault("library", "baselines")
    kwargs.setdefault("save_all", False)
    kwargs.setdefault("ppo_epochs", 3)
    kwargs.setdefault("clip_range", 0.2)
    kwargs.setdefault("timesteps_per_proc", 1_000_000_000)
    kwargs.setdefault("cnn", "clear")
    kwargs.setdefault("use_lstm", 0)
    kwargs.setdefault("stack_channels", "16_32_32")
    kwargs.setdefault("emb_size", 256)
    kwargs.setdefault("epsilon_greedy", 0.0)
    kwargs.setdefault("reward_scale", 1.0)
    kwargs.setdefault("frame_stack", 1)
    kwargs.setdefault("use_sticky_actions", 0)
    kwargs.setdefault("clip_vf", 1)
    kwargs.setdefault("reward_processing", "none")
    kwargs.setdefault("save_interval", 10)

    if comm is None:
        comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    setup_mpi_gpus()

    if save_dir is None:
        save_dir = tempfile.mkdtemp(prefix="rl_clarity_train_")

    # create_env consumes everything except num_envs, which it takes
    # positionally.
    create_env_kwargs = kwargs.copy()
    num_envs = create_env_kwargs.pop("num_envs")
    venv = create_env(num_envs, **create_env_kwargs)

    library = kwargs["library"]
    if library == "baselines":
        # Optional reward transformation before PPO sees the env.
        reward_processing = kwargs["reward_processing"]
        if reward_processing == "none":
            pass
        elif reward_processing == "clip":
            venv = VecClipReward(venv=venv)
        elif reward_processing == "normalize":
            venv = VecNormalize(venv=venv, ob=False, per_env=False)
        else:
            raise ValueError(f"Unsupported reward processing: {reward_processing}")

        scope = "ppo2_model"

        def update_fn(update, params=None):
            # Checkpoint from rank 0 only, every save_interval updates.
            # With save_all, each checkpoint keeps its update step.
            if rank == 0:
                save_interval = kwargs["save_interval"]
                if save_interval > 0 and update % save_interval == 0:
                    print("Saving...")
                    params = get_tf_params(scope)
                    save_path = save_data(
                        save_dir=save_dir,
                        args_dict=kwargs,
                        params=params,
                        step=(update if kwargs["save_all"] else None),
                    )
                    print(f"Saved to: {save_path}")

        sess = create_tf_session()
        sess.__enter__()  # install as default session for ppo2.learn
        if kwargs["use_lstm"]:
            raise ValueError("Recurrent networks not yet supported.")
        arch = get_arch(**kwargs)

        from baselines.ppo2 import ppo2
        ppo2.learn(
            env=venv,
            network=arch,
            total_timesteps=kwargs["timesteps_per_proc"],
            save_interval=0,  # saving handled by update_fn above
            nsteps=kwargs["num_steps"],
            nminibatches=kwargs["num_minibatches"],
            lam=kwargs["lambda"],
            gamma=kwargs["gamma"],
            noptepochs=kwargs["ppo_epochs"],
            log_interval=1,
            ent_coef=kwargs["entropy_coeff"],
            mpi_rank_weight=1.0,
            clip_vf=bool(kwargs["clip_vf"]),
            comm=comm,
            lr=kwargs["learning_rate"],
            cliprange=kwargs["clip_range"],
            update_fn=update_fn,
            init_fn=None,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
    else:
        raise ValueError(f"Unsupported library: {library}")
    return save_dir
def main():
    """Train PPO2 with network randomization (NetRandRunner/NetRandModel),
    saving the final model under <logdir>/checkpoints on rank 0.
    """
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode', type=str, default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    env_name = args.env_name
    num_levels = 0 if is_test_worker else args.num_levels
    start_level = args.start_level

    # Setup logger (LOG_DIR is a module-level constant)
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR + f'/{args.env_name}/run_{args.run_id}',
                     format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        # Pin each MPI rank to the GPU named at its index in --gpus_id.
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    # Training — monkey-patch ppo2's runner/policy factory with the
    # network-randomization variants before learn() picks them up.
    logger.info("training")
    ppo2.Runner = NetRandRunner
    ppo2.build_policy = build_policy
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        model_fn=NetRandModel,
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
def main():
    """Roll out a saved "vanilla" PPO checkpoint on unseen Procgen levels
    and log episode-reward statistics per rollout batch.

    Returns the list of 10-episode mean rewards, one entry per rollout.
    """
    # Fixed hyperparameters (mirroring training).
    num_envs = 64
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    total_timesteps = 1_000_000  ## counts steps in testing runs
    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode', type=str, default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    ## default starting_level set high to test on unseen levels!
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    args = parser.parse_args()

    # Apply the --nrollouts override BEFORE recording total_timesteps in
    # args, so the saved args JSON reflects the value actually used
    # (previously it recorded the stale 1M default).
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    args.total_timesteps = total_timesteps

    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []
    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        # os.makedirs is portable and immune to shell quoting issues,
        # unlike the previous os.system("mkdir -p %s").
        os.makedirs(logpath, exist_ok=True)

    # Persist the exact arguments used for this run.
    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    ## Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)
    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model params loaded from save")  # fixed "pramas" typo
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    # Rolling windows over the last 10 / 100 completed episodes.
    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time()  ## Not doing timing yet

    mean_rewards = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  ## different from random_ppo!
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)
        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])
        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)
        logger.info('----\n')
        logger.dumpkvs()

    env.close()
    print("Rewards history: ", mean_rewards)
    return mean_rewards
def rollout_fn(num_steps, env_name, num_envs, distribution_mode, num_levels,
               start_level, timesteps_per_proc, is_test_worker=False,
               log_dir='/tmp/procgen', comm=None, load_path=None):
    """Roll out PPO2 on a Procgen environment, mirroring the training setup.

    Builds the standard wrapped vector env, installs a GPU-friendly TF
    session, then delegates to ppo2.rollout (optionally restoring weights
    from load_path). Test workers get zero gradient weight and the full
    level distribution.
    """
    # PPO hyperparameters, identical to the training defaults.
    lr = 5e-4
    entropy_coeff = .01
    discount = .999
    gae_lambda = .95
    rollout_len = 256
    minibatch_count = 8
    opt_epochs = 3
    clip = .2
    clip_value_fn = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir,
                         format_strs=format_strs, filename="rollout")

    logger.info("creating environment")
    env = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                     num_levels=num_levels, start_level=start_level,
                     distribution_mode=distribution_mode)
    env = VecExtractDictObs(env, "rgb")
    env = VecMonitor(venv=env, filename=None, keep_buf=100)
    env = VecNormalize(venv=env, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  #pylint: disable=E1101
    # Enter the session so it becomes the default for ppo2.rollout.
    tf.Session(config=tf_config).__enter__()

    network_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    logger.info("training")
    ppo2.rollout(
        env=env,
        network=network_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=rollout_len,
        nminibatches=minibatch_count,
        lam=gae_lambda,
        gamma=discount,
        noptepochs=opt_epochs,
        log_interval=1,
        ent_coef=entropy_coeff,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=clip_value_fn,
        comm=comm,
        lr=lr,
        cliprange=clip,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path=load_path,
        num_steps=num_steps,
        num_envs=num_envs,
        env_name=env_name,
        num_levels=num_levels,
        start_level=start_level,
        distribution_mode=distribution_mode
    )
def main(env_name, paint_vel_info, distribution_mode, num_levels, start_level,
         log_interval, iter_loss, arch, eval, num_envs, learning_rate,
         lr_schedule, ent_coef, gamma, lam, nsteps, nminibatches, ppo_epochs,
         clip_range, timesteps_per_proc, use_vf_clipping, _run,
         is_test_worker, timestep_factor):
    """Sacred-driven training entry point for PPO with IBAC/iteration loss.

    All hyperparameters are injected (presumably by a sacred config —
    verify against the experiment definition); _run is the sacred Run object
    used both for log-dir naming and for metric reporting.
    """
    comm = MPI.COMM_WORLD
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    logger._run = _run  # expose the sacred run to the logger module

    # Configure logger
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir="{}/id_{}".format(LOG_DIR, _run._id),
                     format_strs=format_strs)

    # Add sacred logger:
    if log_comm.Get_rank() == 0:
        logger.get_current().output_formats.append(
            SacredOutputFormat(_run, timestep_factor))

    # Test workers see the full level distribution and carry no grad weight.
    num_levels = 0 if is_test_worker else num_levels
    mpi_rank_weight = 0 if is_test_worker else 1

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      paint_vel_info=paint_vel_info,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn_with_ibac(
        x, iter_loss=iter_loss, arch=arch, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo_iter.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        ## Iter
        iter_loss=iter_loss,
        arch=arch,
        _run=_run,
        ## Rest
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=log_interval,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        learning_rate=learning_rate,
        lr_schedule=lr_schedule,
        cliprange=clip_range,
        vf_coef=0.5,
        max_grad_norm=0.5,
        eval=eval,
    )
def main():
    """Train PPO on Procgen with mixreg/mixobs augmentation plus optional
    adversarial observation augmentation (the ``adv_*`` flags, tagged JAG).

    Parses CLI arguments, builds train (and optional eval) environments,
    runs a customized ``learn`` via ``ppo2``, and saves the final model on
    rank 0.
    """
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug', type=str, default='no_aug',
                        choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width', type=str, default='1x',
                        choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup', type=str, default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode', type=str, default='nomix',
                        choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    # JAG: Add second parameter beta to the beta distribution
    parser.add_argument('--mix_beta', type=float, default=0.2)
    # JAG: Parameters for adversarial RL
    # 1. The ending condition for adversarial gradient descent
    parser.add_argument('--adv_epsilon', type=float, default=5e-6)
    # 2. Learning rate for adversarial gradient descent
    parser.add_argument('--adv_lr', type=float, default=10)
    # 3. Adversarial penalty for observation euclidean distance
    parser.add_argument('--adv_gamma', type=float, default=0.01)
    # 4. We use adversarial after #threshold epochs of PPO training
    parser.add_argument('--adv_thresh', type=int, default=50)
    # 5. If we use evaluation environment
    # NOTE(review): `type=bool` on argparse means any non-empty string parses
    # as True — passing "--eval_env False" still yields True.
    parser.add_argument('--eval_env', type=bool, default=True)
    parser.add_argument('--eval_levels', type=int, default=0)
    # 6. The ratio of adversarial augmented data
    # adv = 1 means we replace original data with adversarial data
    # adv = 0 means we do not use adversarial
    parser.add_argument('--adv_adv', type=float, default=0.5)
    # 7. The ratio of mixup original data with augmented data
    # adv = 1 means we use augmented obs and value
    # adv = 0 means we use original obs and value
    parser.add_argument('--adv_obs', type=float, default=1)
    parser.add_argument('--adv_value', type=float, default=1)
    # 8. Determine what percentage of environments we use (For generalization)
    # nenv = 1 means we use all the environments
    parser.add_argument('--adv_nenv', type=float, default=1)
    # 9. We test the first 500 epochs
    parser.add_argument('--adv_epochs', type=int, default=500)
    args = parser.parse_args()

    # Setup test worker: every `test_worker_interval`-th rank evaluates.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        # Oracle setup trains on the full (unlimited) level distribution.
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger — only rank 0 of the split communicator writes output.
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs
    )

    # Create env
    logger.info("creating environment")
    # JAG: Limit the maximum training levels
    train_levels = int(num_levels * args.adv_nenv)
    venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=env_name,
        num_levels=train_levels,
        start_level=start_level,
        distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    # JAG: If we use eval_env
    if args.eval_env:
        eval_env = ProcgenEnv(
            num_envs=num_envs,
            env_name=env_name,
            num_levels=args.eval_levels,
            start_level=start_level,
            distribution_mode=args.distribution_mode)
        eval_env = VecExtractDictObs(eval_env, "rgb")
        eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100)
        eval_env = VecNormalize(venv=eval_env, ob=False)
    else:
        eval_env = None

    # Feed parameters to a dictionary consumed by the adversarial learner.
    adv_ratio = {
        'adv': args.adv_adv,
        'obs': args.adv_obs,
        'value': args.adv_value,
        #'nenv': args.adv_nenv,
    }

    # Setup Tensorflow; optionally pin each MPI rank to one of the listed GPUs.
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model: width multiplier scales the IMPALA CNN channel depths.
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    conv_fn = lambda x: build_impala_cnn(
        x,
        depths=depths,
        use_bn=args.use_bn,
        # Random convolutions only augment training workers, never test workers.
        randcnn=args.use_rand_conv and not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        use_rand_conv=args.use_rand_conv,
        model_fn=get_mixreg_model(
            mix_mode=args.mix_mode,
            mix_alpha=args.mix_alpha,
            mix_beta=args.mix_beta,
            use_l2reg=args.use_l2reg,
            l2reg_coeff=args.l2reg_coeff),
        # JAG: Pass adversarial parameters
        adv_epsilon=args.adv_epsilon,
        adv_lr=args.adv_lr,
        adv_gamma=args.adv_gamma,
        adv_thresh=args.adv_thresh,
        adv_ratio=adv_ratio,
        eval_env=eval_env,
        adv_epochs=args.adv_epochs,
    )

    # Saving — only rank 0 writes the final checkpoint.
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
def train_fn(env_name: str,
             num_train_envs: int,
             n_training_steps: int,
             adr_config: ADRConfig = None,
             experiment_dir: str = None,
             tunable_params_config_path: str = None,
             log_dir: str = None,
             is_test_worker: bool = False,
             comm=None,
             save_interval: int = 1000,
             log_interval: int = 20,
             recur: bool = True):
    """Train PPO with Automatic Domain Randomization (ADR) on a Procgen game.

    Sets up the experiment directory and domain configs, configures logging,
    builds the training environment and TF session, then delegates to
    ``ppo2_adr.learn``.

    Args:
        env_name: Procgen game name; must have defaults in
            ``DEFAULT_TUNABLE_PARAMS`` and ``DEFAULT_DOMAIN_CONFIGS``.
        num_train_envs: Number of parallel training environments.
        n_training_steps: Total training timesteps.
        adr_config: ADR settings; a default ``ADRConfig()`` is used if None.
        experiment_dir: Root directory for this run; auto-created with a
            date-time name if None.
        tunable_params_config_path: Unsupported — passing a value raises
            ``NotImplementedError``.
        log_dir: Subdirectory (under the experiment dir) for logger output;
            None disables logging setup.
        is_test_worker: If True this rank gets zero gradient weight.
        comm: MPI communicator; required (used by ``comm.Split``) when
            ``log_dir`` is given.
        save_interval: Checkpoint interval forwarded to the learner.
        log_interval: Logging interval forwarded to the learner.
        recur: If True, wrap the IMPALA CNN in an LSTM policy.

    Raises:
        KeyError: If no default params/config exist for ``env_name``.
        NotImplementedError: If ``tunable_params_config_path`` is given.
    """
    # Get the default ADR config if none is provided
    adr_config = ADRConfig() if adr_config is None else adr_config

    # Set up the experiment directory for this run. This will contain
    # everything, from the domain configs for the training environment and ADR
    # evaluation environments to the logs. If the directory path is not
    # provided, then we'll make one and use the date-time name to make it
    # unique.
    if experiment_dir is None:
        experiment_dir = pathlib.Path().absolute() / 'adr_experiments' / (
            'experiment-' + datetime_name())
        experiment_dir.mkdir(parents=True, exist_ok=False)
    else:
        experiment_dir = pathlib.Path(experiment_dir)

    # Make a config directory within the experiment directory to hold the
    # domain configs.
    config_dir = experiment_dir / 'domain_configs'
    config_dir.mkdir(parents=True, exist_ok=False)

    # Load the tunable parameters from a config file if it is provided,
    # otherwise get the default for the given game.
    if tunable_params_config_path is None:
        try:
            tunable_params = DEFAULT_TUNABLE_PARAMS[env_name]
        except KeyError as err:
            raise KeyError(
                f'No default tunable parameters exist for {env_name}') from err
    else:
        # BUG FIX: `raise NotImplemented(...)` raised a TypeError because
        # NotImplemented is a sentinel value, not an exception class.
        raise NotImplementedError(
            'Currently no way to load tunable parameters from a configuration file'
        )

    # Make a default config for the given game...
    train_domain_config_path = config_dir / 'train_config.json'
    try:
        train_domain_config = DEFAULT_DOMAIN_CONFIGS[env_name]
        train_domain_config.to_json(train_domain_config_path)
    except KeyError as err:
        raise KeyError(f'No default config exists for {env_name}') from err

    # ...then load the initial bounds for the tunable parameters into the
    # config.
    params = {}
    for param in tunable_params:
        params['min_' + param.name] = param.lower_bound
        params['max_' + param.name] = param.upper_bound
    train_domain_config.update_parameters(params, cache=False)

    # Configure the logger if we are given a log directory; only rank 0 of the
    # split communicator writes csv/stdout output.
    if log_dir is not None:
        log_dir = experiment_dir / log_dir
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=str(log_dir),
                         format_strs=format_strs)
    logger.info(f'env_name: {env_name}')
    logger.info(f'num_train_envs: {num_train_envs}')
    logger.info(f'n_training_steps: {n_training_steps}')
    logger.info(f'experiment_dir: {experiment_dir}')
    logger.info(f'tunable_params_config_path: {tunable_params_config_path}')
    logger.info(f'log_dir: {log_dir}')
    logger.info(f'save_interval: {save_interval}')

    # Standard Procgen-paper PPO hyperparameters.
    n_steps = 256
    ent_coef = .01
    lr = 5e-4
    vf_coef = .5
    max_grad_norm = .5
    gamma = .999
    lmbda = .95
    n_minibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True
    mpi_rank_weight = 0 if is_test_worker else 1

    logger.info('creating environment')
    training_env = ProcgenEnv(num_envs=num_train_envs, env_name=env_name,
                              domain_config_path=str(train_domain_config_path))
    training_env = VecExtractDictObs(training_env, "rgb")
    training_env = VecMonitor(venv=training_env, filename=None, keep_buf=100)
    training_env = VecNormalize(venv=training_env, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.__enter__()

    def conv_fn(x):
        # IMPALA CNN backbone used by all Procgen baselines.
        return build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    if recur:
        logger.info("Using CNN LSTM")
        conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)

    logger.info('training')
    ppo2_adr.learn(conv_fn, training_env, n_training_steps, config_dir,
                   adr_config, train_domain_config, tunable_params,
                   n_steps=n_steps,
                   ent_coef=ent_coef,
                   lr=lr,
                   vf_coef=vf_coef,
                   max_grad_norm=max_grad_norm,
                   gamma=gamma,
                   lmbda=lmbda,
                   log_interval=log_interval,
                   save_interval=save_interval,
                   n_minibatches=n_minibatches,
                   n_optepochs=ppo_epochs,
                   clip_range=clip_range,
                   mpi_rank_weight=mpi_rank_weight,
                   clip_vf=use_vf_clipping)
def main():
    """Evaluate a saved (optionally network-randomized) PPO policy on Procgen.

    Parses CLI args, rebuilds the test environment and model, loads the saved
    checkpoint, collects ``nrollouts`` batches of rollouts, logs running mean
    reward / episode length, and returns the per-rollout mean-reward history.

    Returns:
        list[float]: mean episode reward (over last 10 episodes) per rollout.
    """
    # Fixed evaluation hyperparameters (must match training settings).
    num_envs = 64
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    total_timesteps = 1_000_000  ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    parser.add_argument('--start_level', type=int, default=50)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=50)
    parser.add_argument('--use', type=str, default="randcrop")
    parser.add_argument('--arch', type=str, default="impala")
    parser.add_argument('--no_bn', dest='use_batch_norm', action='store_false')
    parser.add_argument('--netrand', dest='netrand', action='store_true')
    parser.set_defaults(use_batch_norm=True)
    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    arch = args.arch
    use_batch_norm = args.use_batch_norm
    if args.nrollouts:
        # Derive the timestep budget from the requested rollout count.
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)
    print(args.use)
    LOG_DIR = 'log/{}/test'.format(args.use)
    # Pick the randomized-CNN policy only when --netrand is given.
    if not args.netrand:
        policy = CnnPolicy
    else:
        policy = RandomCnnPolicy
    load_model = "log/{}/saved_{}_v{}.tar".format(args.use, args.use,
                                                  args.load_id)

    comm = MPI.COMM_WORLD
    num_levels = args.num_levels
    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []
    logpath = join(LOG_DIR, run_ID)
    # FIX: create the log directory with the stdlib instead of shelling out to
    # `mkdir -p` via os.system (portable, no shell injection via path).
    os.makedirs(logpath, exist_ok=True)
    # Persist the exact arguments used for this evaluation run.
    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))
    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                      num_levels=num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100,)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    model = Model(sess=sess, policy=policy, ob_space=ob_space,
                  ac_space=ac_space, nbatch_act=nenvs,
                  nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef,
                  vf_coef=vf_coef, max_grad_norm=max_grad_norm, arch=arch,
                  use_batch_norm=use_batch_norm, dropout=0)
    model.load(load_model)
    # FIX: corrected "pramas" typo in the log message.
    logger.info("Model params loaded from saved model: ", load_model)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam,
                    aug_func=None)

    # Rolling windows of episode stats over the last 10 / 100 episodes.
    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    mean_rewards = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = \
            runner.run()
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)
        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])
        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)
        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
def main():
    """Train PPO on Procgen, optionally replacing the environment reward with
    a learned reward model (``--rm_id``) loaded via PyTorch.

    Saves the final model parameters into the run directory.
    """
    args = parse_config()
    run_dir = log_this(args, args.log_dir,
                       args.log_name + '_' + args.env_name + '_' + args.rm_id)
    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    is_test_worker = False
    # Every `test_worker_interval`-th rank becomes an evaluation worker.
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=run_dir, format_strs=format_strs)
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=args.num_envs, env_name=args.env_name,
                      num_levels=args.num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode,
                      use_sequential_levels=args.use_sequential_levels)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    if args.rm_id:
        # load pretrained reward network; search recursively for the .rm file
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        net = RewardNet().to(device)
        rm_path = glob.glob('./**/' + args.rm_id + '.rm', recursive=True)[0]
        net.load_state_dict(
            torch.load(rm_path, map_location=torch.device(device)))
        # use batch reward prediction function instead of the ground truth
        # reward function; pass through sigmoid if needed
        if args.use_sigmoid:
            rew_func = lambda x: 1 / (1 + np.exp(-net.predict_batch_rewards(x))
                                      )
        else:
            rew_func = lambda x: net.predict_batch_rewards(x)
        ## Uncomment the line below to train a live-long agent
        # rew_func = lambda x: x.shape[0] * [1]
        venv = ProxyRewardWrapper(venv, rew_func)
    else:
        # true environment rewards will be used
        pass
    venv = VecNormalize(venv=venv, ob=False, use_tf=False)
    # do the rest of the training as normal
    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()
    # IMPALA CNN backbone used by all Procgen baselines.
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    logger.info("training")
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=args.timesteps_per_proc,
        save_interval=args.save_interval,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        lam=args.lam,
        gamma=args.gamma,
        noptepochs=args.ppo_epochs,
        log_interval=args.log_interval,
        ent_coef=args.ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=args.use_vf_clipping,
        comm=comm,
        lr=args.learning_rate,
        cliprange=args.clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path=args.load_path,
    )
    model.save(os.path.join(run_dir, 'final_model.parameters'))
def main():
    """Train a randomized-network PPO agent (``random_ppo``) on Procgen and
    save the final model to a versioned ``.tar`` path.
    """
    # Default hyperparameters; some are overridden by CLI args below.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 5_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=0)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--debug', default=False, action="store_true")
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=int(-1))
    args = parser.parse_args()
    if not args.total_tsteps:
        ## fall back to the module default (5_000_000) if not specified in args
        args.total_tsteps = timesteps_per_proc
    if args.nupdates:
        # Derive the timestep budget from an explicit update count instead.
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    if args.debug:
        LOG_DIR = 'log/random/debug'
        SAVE_PATH = 'log/random/debug_random_v{}.tar'.format(args.run_id)
    else:
        LOG_DIR = 'log/random/train'
        SAVE_PATH = 'log/random/random_v{}.tar'.format(args.run_id)
    # Optionally resume from a previously saved model version.
    load_path = None
    if args.load_id > -1:
        load_path = 'log/random/random_v{}.tar'.format(args.load_id)

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    # Every `test_worker_interval`-th rank becomes an evaluation worker.
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []
    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)
    logger.configure(dir=logpath, format_strs=format_strs)
    # Persist the exact arguments used for this run.
    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    logger.info("\n Saving to file {}".format(SAVE_PATH))
    logger.info("\nSaved args at:\n\t{}\n".format(fpath))

    #logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                      num_levels=num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100,)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    model = random_ppo.learn(
        env=venv,
        network=None,
        total_timesteps=args.total_tsteps,
        save_interval=2,  ## doesn't matter, only saving at the end
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        # clip_vf=use_vf_clipping,
        lr=learning_rate,
        cliprange=clip_range,
        #cliprange=lambda f : f * 0.2,
        # update_fn=None,
        # init_fn=None,
        save_path=SAVE_PATH,
        load_path=load_path,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
    model.save(SAVE_PATH)