def eval_fn(load_path, args, env_name='fruitbot', distribution_mode='easy',
            num_levels=500, start_level=500, log_dir='./tmp/procgen',
            comm=None, num_trials=3, gui=False):
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True
    vf_coef = 0.5
    max_grad_norm = 0.5
    mpi_rank_weight = 1
    log_interval = 1
    seed = None

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=1, env_name=env_name, num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("evaluating")
    set_global_seeds(seed)
    policy = build_policy(venv, conv_fn)

    # Get the number of envs
    nenvs = venv.num_envs
    # Get state_space and action_space
    ob_space = venv.observation_space
    ac_space = venv.action_space
    # Calculate the batch size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    from .alternate_ppo2.model import Model
    model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm, comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if os.path.isfile(load_path):
        alt_ppo2.eval(
            network=conv_fn,
            nsteps=nsteps,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            gamma=gamma,
            lam=lam,
            log_interval=log_interval,
            nminibatches=nminibatches,
            noptepochs=ppo_epochs,
            load_path=load_path,
            mpi_rank_weight=mpi_rank_weight,
            comm=comm,
            clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            policy=policy,
            nenvs=nenvs,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch=nbatch,
            nbatch_train=nbatch_train,
            model_fn=model_fn,
            model=model,
            num_trials=num_trials,
            num_levels=num_levels,
            start_level=start_level,
            gui=gui,
            args=args,
        )
    elif os.path.isdir(load_path):
        # Evaluate every checkpoint in the directory, reconfiguring the logger
        # so each checkpoint logs to its own subdirectory.
        for file in os.listdir(load_path):
            log_comm = comm.Split(0, 0)
            format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
            logger.configure(comm=log_comm, dir=log_dir + '/' + file,
                             format_strs=format_strs)
            alt_ppo2.eval(
                network=conv_fn,
                nsteps=nsteps,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                gamma=gamma,
                lam=lam,
                log_interval=log_interval,
                nminibatches=nminibatches,
                noptepochs=ppo_epochs,
                load_path=load_path + '/' + file,
                mpi_rank_weight=mpi_rank_weight,
                comm=comm,
                clip_vf=use_vf_clipping,
                lr=learning_rate,
                cliprange=clip_range,
                policy=policy,
                nenvs=nenvs,
                ob_space=ob_space,
                ac_space=ac_space,
                nbatch=nbatch,
                nbatch_train=nbatch_train,
                model_fn=model_fn,
                model=model,
                num_trials=num_trials,
                num_levels=num_levels,
                start_level=start_level,
                gui=gui,
                args=args,
            )
    else:
        print('Model path does not exist.')
    return
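# Hedged usage sketch (illustrative, not from the original source): evaluating a
# single checkpoint on held-out levels. `args` is assumed to come from this
# script's argument parser, and the checkpoint path is hypothetical.
#
#     from mpi4py import MPI
#     eval_fn('./checkpoints/00300', args, env_name='fruitbot',
#             num_levels=500, start_level=500, comm=MPI.COMM_WORLD)
#
# Passing a directory instead of a file evaluates every checkpoint inside it,
# each logging to its own subdirectory of log_dir.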
def train(comm=None, *, save_dir=None, **kwargs):
    """
    Train a model using Baselines' PPO2 and save a checkpoint file in the
    required format.

    There is one required kwarg: either env_name (for env_kind="procgen") or
    env_id (for env_kind="atari").

    Models for the paper were trained with 16 parallel MPI workers.

    Note: this code has not been well-tested.
    """
    kwargs.setdefault("env_kind", "procgen")
    kwargs.setdefault("num_envs", 64)
    kwargs.setdefault("learning_rate", 5e-4)
    kwargs.setdefault("entropy_coeff", 0.01)
    kwargs.setdefault("gamma", 0.999)
    kwargs.setdefault("lambda", 0.95)
    kwargs.setdefault("num_steps", 256)
    kwargs.setdefault("num_minibatches", 8)
    kwargs.setdefault("library", "baselines")
    kwargs.setdefault("save_all", False)
    kwargs.setdefault("ppo_epochs", 3)
    kwargs.setdefault("clip_range", 0.2)
    kwargs.setdefault("timesteps_per_proc", 1_000_000_000)
    kwargs.setdefault("cnn", "clear")
    kwargs.setdefault("use_lstm", 0)
    kwargs.setdefault("stack_channels", "16_32_32")
    kwargs.setdefault("emb_size", 256)
    kwargs.setdefault("epsilon_greedy", 0.0)
    kwargs.setdefault("reward_scale", 1.0)
    kwargs.setdefault("frame_stack", 1)
    kwargs.setdefault("use_sticky_actions", 0)
    kwargs.setdefault("clip_vf", 1)
    kwargs.setdefault("reward_processing", "none")
    kwargs.setdefault("save_interval", 10)

    if comm is None:
        comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    setup_mpi_gpus()
    if save_dir is None:
        save_dir = tempfile.mkdtemp(prefix="rl_clarity_train_")

    create_env_kwargs = kwargs.copy()
    num_envs = create_env_kwargs.pop("num_envs")
    venv = create_env(num_envs, **create_env_kwargs)

    library = kwargs["library"]
    if library == "baselines":
        reward_processing = kwargs["reward_processing"]
        if reward_processing == "none":
            pass
        elif reward_processing == "clip":
            venv = VecClipReward(venv=venv)
        elif reward_processing == "normalize":
            venv = VecNormalize(venv=venv, ob=False, per_env=False)
        else:
            raise ValueError(f"Unsupported reward processing: {reward_processing}")

        scope = "ppo2_model"

        def update_fn(update, params=None):
            if rank == 0:
                save_interval = kwargs["save_interval"]
                if save_interval > 0 and update % save_interval == 0:
                    print("Saving...")
                    params = get_tf_params(scope)
                    save_path = save_data(
                        save_dir=save_dir,
                        args_dict=kwargs,
                        params=params,
                        step=(update if kwargs["save_all"] else None),
                    )
                    print(f"Saved to: {save_path}")

        sess = create_tf_session()
        sess.__enter__()

        if kwargs["use_lstm"]:
            raise ValueError("Recurrent networks not yet supported.")
        arch = get_arch(**kwargs)

        from baselines.ppo2 import ppo2

        ppo2.learn(
            env=venv,
            network=arch,
            total_timesteps=kwargs["timesteps_per_proc"],
            save_interval=0,
            nsteps=kwargs["num_steps"],
            nminibatches=kwargs["num_minibatches"],
            lam=kwargs["lambda"],
            gamma=kwargs["gamma"],
            noptepochs=kwargs["ppo_epochs"],
            log_interval=1,
            ent_coef=kwargs["entropy_coeff"],
            mpi_rank_weight=1.0,
            clip_vf=bool(kwargs["clip_vf"]),
            comm=comm,
            lr=kwargs["learning_rate"],
            cliprange=kwargs["clip_range"],
            update_fn=update_fn,
            init_fn=None,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
    else:
        raise ValueError(f"Unsupported library: {library}")

    return save_dir
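# Hedged usage sketch (illustrative): a single-process training run with the
# defaults above. `create_env`, `get_arch`, and the checkpoint helpers are
# assumed to be importable from the same package as train().
#
#     from mpi4py import MPI
#     save_dir = train(comm=MPI.COMM_WORLD, env_name="coinrun",
#                      timesteps_per_proc=1_000_000)
#     print("checkpoints written to:", save_dir)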
def main():
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=99)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=5)
    parser.add_argument('--load_id', type=int, default=-1)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    parser.add_argument('--test', default=False, action="store_true")
    parser.add_argument('--use_model', type=int, default=1,
                        help="either model #1 or #2")
    parser.add_argument('--train_level', type=int, default=50)
    args = parser.parse_args()

    # num_envs, nsteps, TIMESTEPS_PER_PROC, and SAVE_PATH are assumed to be
    # module-level constants in this script. The original computed locals from
    # nupdates/nrollouts but never used them; assign to args.total_tsteps so
    # the overrides actually take effect.
    if args.nupdates:
        args.total_tsteps = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = TIMESTEPS_PER_PROC  # global default (20_000_000)
    if args.nrollouts:
        args.total_tsteps = int(args.nrollouts * num_envs * nsteps)

    run_ID = 'run_' + str(args.run_id).zfill(2)
    if args.test:
        args.log_interval = 1
        args.total_tsteps = 1_000_000
        run_ID += '_test{}_model{}'.format(args.load_id, args.use_model)

    load_path = None
    if args.load_id > -1:
        load_path = join(SAVE_PATH, args.env_name,
                         'saved_ensemble_v{}.tar'.format(args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    if args.test:
        logpath = join('log2/ensemble', args.env_name, 'test', run_ID)
    else:
        logpath = join('log2/ensemble', args.env_name, 'train', run_ID)
    save_path = join(SAVE_PATH, args.env_name,
                     'saved_ensemble_v{}.tar'.format(args.run_id))
    logger.info("\n Model will be saved to file {}".format(save_path))

    if not os.path.exists(logpath):
        os.makedirs(logpath, exist_ok=True)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("creating tf session")
    setup_mpi_gpus()
    if not args.test:
        config = tf.compat.v1.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True)  # device_count={'GPU': 0}
        config.gpu_options.allow_growth = True  # pylint: disable=E1101
        sess = tf.compat.v1.Session(config=config)

        logger.info("creating environment")
        venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                          num_levels=num_levels, start_level=args.start_level,
                          distribution_mode=args.distribution_mode)
        venv = VecExtractDictObs(venv, "rgb")
        venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
        venv = VecNormalize(venv=venv, ob=False)
        train(run_ID, save_path, load_path, venv, sess, logger, args)
    else:
        use_model = args.use_model  # 1 or 2
        alt_flag = use_model - 1
        test_all(alt_flag, load_path, logger, args)
def main():
    num_envs = 64  # 16?
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=10)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=-1)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--use', type=str, default="randcrop")
    parser.add_argument('--arch', type=str, default="impala")
    parser.add_argument('--no_bn', dest='use_batch_norm', action='store_false')
    parser.add_argument('--dropout', type=float, default=0)
    parser.add_argument('--netrand', type=float, default=0)
    parser.set_defaults(use_batch_norm=True)
    args = parser.parse_args()

    arch = args.arch
    dropout = args.dropout
    use_batch_norm = args.use_batch_norm
    netrand = args.netrand

    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        # use the 20_000_000 default if not specified in args!
        args.total_tsteps = timesteps_per_proc

    run_ID = 'run_' + str(args.run_id).zfill(2)
    agent_str = args.use
    LOG_DIR = join("log", agent_str, "train")
    save_model = join("log", agent_str,
                      "saved_{}_v{}.tar".format(agent_str, args.run_id))
    load_path = None
    if args.load_id > -1:
        load_path = join("log", agent_str,
                         "saved_{}_v{}.tar".format(agent_str, args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.makedirs(logpath, exist_ok=True)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    logger.info("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                      num_levels=num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)

    logger.info(venv.observation_space)
    logger.info("training")
    with sess.as_default():
        model = learn(
            agent_str=agent_str,
            use_netrand=netrand,
            sess=sess,
            env=venv,
            network=None,
            total_timesteps=args.total_tsteps,
            save_interval=1000,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=args.log_interval,
            ent_coef=ent_coef,
            lr=learning_rate,
            arch=arch,
            use_batch_norm=use_batch_norm,
            dropout=dropout,
            cliprange=clip_range,
            save_path=save_model,
            load_path=load_path,
            vf_coef=0.5,
            max_grad_norm=0.5,
            clip_vf=use_vf_clipping,
            update_fn=None,
            init_fn=None,
            comm=comm,
        )
    model.save(save_model)
def train(args):
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name,
                                             args.seed)

    venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name,
                      num_levels=args.num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    envs = VecPyTorchProcgen(venv, device)

    obs_shape = envs.observation_space.shape
    actor_critic = Policy(
        obs_shape,
        envs.action_space.n,
        base_kwargs={'recurrent': False, 'hidden_size': args.hidden_size})
    actor_critic.to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              aug_type=args.aug_type,
                              split_ratio=args.split_ratio)

    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)

    if args.use_ucb:
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size)
                    for t in list(aug_to_func.keys())]
        agent = algo.UCBDrAC(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_list=aug_list,
            aug_id=aug_id,
            aug_coef=args.aug_coef,
            num_aug_types=len(list(aug_to_func.keys())),
            ucb_exploration_coef=args.ucb_exploration_coef,
            ucb_window_length=args.ucb_window_length)
    elif args.use_meta_learning:
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size)
                    for t in list(aug_to_func.keys())]
        aug_model = AugCNN()
        aug_model.to(device)
        agent = algo.MetaDrAC(
            actor_critic,
            aug_model,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            meta_grad_clip=args.meta_grad_clip,
            meta_num_train_steps=args.meta_num_train_steps,
            meta_num_test_steps=args.meta_num_test_steps,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_id=aug_id,
            aug_coef=args.aug_coef)
    elif args.use_rl2:
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size)
                    for t in list(aug_to_func.keys())]
        rl2_obs_shape = [envs.action_space.n + 1]
        rl2_learner = Policy(
            rl2_obs_shape,
            len(list(aug_to_func.keys())),
            base_kwargs={'recurrent': True,
                         'hidden_size': args.rl2_hidden_size})
        rl2_learner.to(device)
        agent = algo.RL2DrAC(
            actor_critic,
            rl2_learner,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            args.rl2_entropy_coef,
            lr=args.lr,
            eps=args.eps,
            rl2_lr=args.rl2_lr,
            rl2_eps=args.rl2_eps,
            max_grad_norm=args.max_grad_norm,
            aug_list=aug_list,
            aug_id=aug_id,
            aug_coef=args.aug_coef,
            num_aug_types=len(list(aug_to_func.keys())),
            recurrent_hidden_size=args.rl2_hidden_size,
            num_actions=envs.action_space.n,
            device=device)
    else:
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)
        agent = algo.DrAC(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_id=aug_id,
            aug_func=aug_func,
            aug_coef=args.aug_coef,
            env_name=args.env_name)

    checkpoint_path = os.path.join(args.save_dir, "agent" + log_file + ".pt")
    if os.path.exists(checkpoint_path) and args.preempt:
        checkpoint = torch.load(checkpoint_path)
        agent.actor_critic.load_state_dict(checkpoint['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        init_epoch = checkpoint['epoch'] + 1
        logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout'],
                         log_suffix=log_file + "-e%s" % init_epoch)
    else:
        init_epoch = 0
        logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout'],
                         log_suffix=log_file)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(init_epoch, num_updates):
        actor_critic.train()
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = aug_id(rollouts.obs[step])
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    obs_id, rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            obs_id = aug_id(rollouts.obs[-1])
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)

        if args.use_ucb and j > 0:
            agent.update_ucb_values(rollouts)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # Save for every interval-th episode or for the last epoch
        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            # The original passed dist_entropy/value_loss/action_loss as extra,
            # unused format arguments; they are logged via logger below instead.
            print("\nUpdate {}, step {} \n Last {} training episodes: "
                  "mean/median reward {:.1f}/{:.1f}"
                  .format(j, total_num_steps, len(episode_rewards),
                          np.mean(episode_rewards), np.median(episode_rewards)))

            logger.logkv("train/nupdates", j)
            logger.logkv("train/total_num_steps", total_num_steps)
            logger.logkv("losses/dist_entropy", dist_entropy)
            logger.logkv("losses/value_loss", value_loss)
            logger.logkv("losses/action_loss", action_loss)
            logger.logkv("train/mean_episode_reward", np.mean(episode_rewards))
            logger.logkv("train/median_episode_reward", np.median(episode_rewards))

            ### Eval on the Full Distribution of Levels ###
            eval_episode_rewards = evaluate(args, actor_critic, device,
                                            aug_id=aug_id)

            logger.logkv("test/mean_episode_reward", np.mean(eval_episode_rewards))
            logger.logkv("test/median_episode_reward", np.median(eval_episode_rewards))
            logger.dumpkvs()

        # Save model
        if (j > 0 and j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(args.save_dir)
            except OSError:
                pass
            torch.save({
                'epoch': j,
                'model_state_dict': agent.actor_critic.state_dict(),
                'optimizer_state_dict': agent.optimizer.state_dict(),
            }, os.path.join(args.save_dir, "agent" + log_file + ".pt"))
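# Note on resumption (not in the original source): because of the --preempt
# branch above, re-running the same command after an interruption reloads
# "agent<log_file>.pt" and continues from checkpoint['epoch'] + 1 instead of
# restarting, provided args.save_dir and the run name are unchanged.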
def main():
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=500)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug', type=str, default='no_aug',
                        choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width', type=str, default='1x',
                        choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup', type=str, default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode', type=str, default='nomix',
                        choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--timesteps_per_proc', type=int, default=1_000_000)
    parser.add_argument('--level_sampler_strategy', type=str, default='value_l1')
    parser.add_argument('--score_transform', type=str, default='rank')
    parser.add_argument('--save_dir', type=str, default='gdrive/MyDrive/182 Project/')
    args = parser.parse_args()

    timesteps_per_proc = args.timesteps_per_proc
    log_dir = args.save_dir

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'tensorboard'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=log_dir + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=500,
                          start_level=0,
                          distribution_mode=args.distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100)
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    conv_fn = lambda x: build_impala_cnn(
        x, depths=depths, use_bn=args.use_bn,
        randcnn=args.use_rand_conv and not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        num_levels=num_levels,
        start_level=start_level,
        eval_env=eval_env,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        level_sampler_strategy=args.level_sampler_strategy,
        score_transform=args.score_transform,
        model_fn=get_mixreg_model(mix_mode=args.mix_mode,
                                  mix_alpha=args.mix_alpha,
                                  use_l2reg=args.use_l2reg,
                                  l2reg_coeff=args.l2reg_coeff),
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
def build_env(args, extra_args):
    if 'Lock-v0' in args.env:
        import Environments  # registers Lock-v0 with gym
        env = gym.make('Lock-v0')
        ep_dict = {
            'horizon': args.horizon,
            'dimension': 5,
            'switch': 0.1,
            'tabular': False,
        }
        env.init(env_config=ep_dict)
        return env
    elif 'diabcombolock-v0' in args.env:
        env = build_env_homer(args, extra_args)
        return env

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations)
        if env_type == 'mujoco':
            env = VecNormalize(env, use_tf=True)

    return env
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args)

    # By default, wrap with the identity; swap in the constraint monitor below.
    env_thunk = lambda x: x
    if args.constraints is not None:
        assert len(args.constraints) == len(args.rewards)
        constraints = [
            constraint.CONSTRAINT_DICT[s](r)
            for (s, r) in zip(args.constraints, args.rewards)
        ]
        env_thunk = lambda env: constraint.StepMonitor(
            constraint.ConstraintEnv(env, constraints,
                                     augmentation_type=args.augmentation,
                                     log_dir=logger.get_dir()),
            logger.get_dir())

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed,
                           wrapper_kwargs={'frame_stack': True})
            env = env_thunk(env)
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
            env = env_thunk(env)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale,
                               constraint_env_thunk=env_thunk)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations,
                           constraint_env_thunk=env_thunk)
        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 100_000  # now this counts steps in testing runs
    use_vf_clipping = True

    # From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    # default start_level set to 50 to test on unseen levels!
    parser.add_argument('--start_level', type=int, default=50)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    args = parser.parse_args()

    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    # LOG_DIR and LOAD_PATH are assumed to be module-level constants.
    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.makedirs(logpath, exist_ok=True)

    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                      num_levels=num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    # Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    policy = RandomCnnPolicy
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                  nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                  ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)
    model.load(LOAD_PATH)
    logger.info("Model params loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time()  # Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        clean_flag = 0  # since we are testing, ENABLE randomization
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            clean_flag)
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)
        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)
        logger.info('----\n')
        logger.dumpkvs()

    env.close()
    print("Rewards history: ", mean_rewards)
    return mean_rewards
def train_fn(env_name, num_envs, distribution_mode, num_levels, start_level,
             timesteps_per_proc, args, is_test_worker=False,
             log_dir='./tmp/procgen', comm=None, alternate_ppo=False,
             do_eval=False, eval_num_envs=None, eval_env_name=None,
             eval_num_levels=None, eval_start_level=None,
             eval_distribution_mode=None, do_test=False, test_num_envs=None,
             test_env_name=None, test_num_levels=None, test_start_level=None,
             test_distribution_mode=None):
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                      num_levels=num_levels, start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    eval_env = None
    if do_eval:
        eval_env = ProcgenEnv(num_envs=eval_num_envs, env_name=eval_env_name,
                              num_levels=eval_num_levels,
                              start_level=eval_start_level,
                              distribution_mode=eval_distribution_mode)
        eval_env = VecExtractDictObs(eval_env, "rgb")
        eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100)
        eval_env = VecNormalize(venv=eval_env, ob=False)

    test_env = None
    if do_test:
        test_env = ProcgenEnv(num_envs=test_num_envs, env_name=test_env_name,
                              num_levels=test_num_levels,
                              start_level=test_start_level,
                              distribution_mode=test_distribution_mode)
        test_env = VecExtractDictObs(test_env, "rgb")
        test_env = VecMonitor(venv=test_env, filename=None, keep_buf=100)
        test_env = VecNormalize(venv=test_env, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    if alternate_ppo:
        alt_ppo2.learn(
            env=venv,
            eval_env=eval_env,
            test_env=test_env,
            network=conv_fn,
            total_timesteps=timesteps_per_proc,
            save_interval=1,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=1,
            ent_coef=ent_coef,
            mpi_rank_weight=mpi_rank_weight,
            clip_vf=use_vf_clipping,
            comm=comm,
            lr=learning_rate,
            cliprange=clip_range,
            update_fn=None,
            init_fn=None,
            vf_coef=0.5,
            max_grad_norm=0.5,
            args=args,
            load_path=args.resume_path,
        )
    else:
        ppo2.learn(
            env=venv,
            eval_env=eval_env,
            network=conv_fn,
            total_timesteps=timesteps_per_proc,
            save_interval=1,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=1,
            ent_coef=ent_coef,
            mpi_rank_weight=mpi_rank_weight,
            clip_vf=use_vf_clipping,
            comm=comm,
            lr=learning_rate,
            cliprange=clip_range,
            update_fn=None,
            init_fn=None,
            vf_coef=0.5,
            max_grad_norm=0.5,
            args=args,
        )
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=99)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=20)
    parser.add_argument('--load_id', type=int, default=-1)
    args = parser.parse_args()

    # Note: the original assigned args.total_tsteps = timesteps_per_proc
    # unconditionally here, which made both fallbacks below dead code.
    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        # use the 20_000_000 default if not specified in args!
        args.total_tsteps = timesteps_per_proc

    run_ID = 'run_' + str(args.run_id).zfill(2)
    save_model = join(SAVE_PATH, "saved_recenter_v{}.tar".format(args.run_id))
    load_path = None
    if args.load_id > -1:
        load_path = 'log/recenter/recenter_v{}.tar'.format(args.load_id)

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.makedirs(logpath, exist_ok=True)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))
    logger.info("\n Saving model to file {}".format(save_model))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                      num_levels=num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto(
        log_device_placement=True)  # device_count={'GPU': 0}
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    # sess.__enter__()

    logger.info(venv.observation_space)
    logger.info("training")
    with sess.as_default():
        model = recenter_ppo.learn(
            sess=sess,
            env=venv,
            network=None,
            total_timesteps=args.total_tsteps,
            save_interval=2,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=args.log_interval,
            ent_coef=ent_coef,
            # clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            # update_fn=None,
            # init_fn=None,
            save_path=save_model,
            load_path=load_path,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
    model.save(save_model)
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 50_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=500)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                      num_levels=num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
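# Hedged usage sketch (illustrative): these Baselines/Procgen scripts are
# normally launched under MPI so that several workers train in parallel, e.g.
#
#     mpiexec -np 4 python train.py --env_name coinrun --num_levels 500
#
# The script filename is an assumption; substitute the actual entry point.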
def learn(*, network, total_timesteps, num_levels=50, start_level=500,
          eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, load_path=None, model_fn=None, update_fn=None,
          init_fn=None, mpi_rank_weight=1, comm=None, num_processes=64,
          num_steps=256, level_replay_temperature=0.1, level_replay_rho=1.0,
          level_replay_nu=0.5, level_replay_alpha=1.0, staleness_coef=0.1,
          staleness_temperature=1.0, level_sampler_strategy='value_l1',
          score_transform='rank', **network_kwargs):
    '''
    Learn policy using the PPO algorithm (https://arxiv.org/abs/1707.06347).

    Parameters:
    ----------

    network:           policy network architecture. Either a string (mlp, lstm,
                       lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see
                       baselines.common/models.py for the full list) specifying
                       a standard network architecture, or a function that takes
                       a tensorflow tensor as input and returns a tuple
                       (output_tensor, extra_feed), where extra_feed is None for
                       feed-forward nets, or a dictionary describing how to feed
                       state into the network for recurrent nets. See
                       common/models.py/lstm for more details on using recurrent
                       nets in policies.

    env:               baselines.common.vec_env.VecEnv environment. Needs to be
                       vectorized for parallel environment simulation. The
                       environments produced by gym.make can be wrapped using
                       the baselines.common.vec_env.DummyVecEnv class.

    nsteps:            int, number of steps of the vectorized environment per
                       update (i.e. batch size is nsteps * nenv, where nenv is
                       the number of environment copies simulated in parallel)

    total_timesteps:   int, number of timesteps (i.e. number of actions taken
                       in the environment)

    ent_coef:          float, policy entropy coefficient in the optimization
                       objective

    lr:                float or function, learning rate, constant or a schedule
                       function [0,1] -> R+, where 1 is the beginning of the
                       training and 0 is the end

    vf_coef:           float, value function loss coefficient in the
                       optimization objective

    max_grad_norm:     float or None, gradient norm clipping coefficient

    gamma:             float, discounting factor

    lam:               float, advantage estimation discounting factor (lambda
                       in the paper)

    log_interval:      int, number of updates between logging events

    nminibatches:      int, number of training minibatches per update. For
                       recurrent policies, should be smaller than or equal to
                       the number of environments run in parallel.

    noptepochs:        int, number of training epochs per update

    cliprange:         float or function, clipping range, constant or schedule
                       function [0,1] -> R+, where 1 is the beginning of the
                       training and 0 is the end

    save_interval:     int, number of updates between saving events

    load_path:         str, path to load the model from

    **network_kwargs:  keyword arguments to the policy / network builder. See
                       baselines.common/policies.py/build_policy and arguments
                       to a particular type of network. For instance, the 'mlp'
                       network architecture has arguments num_hidden and
                       num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    level_sampler_args = dict(num_actors=num_processes,
                              strategy=level_sampler_strategy,
                              replay_schedule='proportionate',
                              score_transform=score_transform,
                              temperature=level_replay_temperature,
                              rho=level_replay_rho,
                              nu=level_replay_nu,
                              alpha=level_replay_alpha,
                              staleness_coef=staleness_coef,
                              staleness_transform='power',
                              staleness_temperature=staleness_temperature)

    env = ProcgenEnv(num_envs=num_processes, env_name='fruitbot',
                     num_levels=1, start_level=start_level,
                     distribution_mode='easy', paint_vel_info=False)
    env = VecExtractDictObs(env, "rgb")
    env = VecMonitor(venv=env, filename=None, keep_buf=100)
    env = VecNormalize(venv=env, ob=False, ret=True)

    seeds = [start_level + i for i in range(num_levels)]
    level_sampler = LevelSampler(seeds, env.observation_space,
                                 env.action_space, **level_sampler_args)
    env = VecProcgen(env, level_sampler=level_sampler)

    rollouts = RolloutStorage(num_steps, num_processes,
                              env.observation_space.shape, env.action_space)

    level_seeds = np.zeros(num_processes)
    obs, level_seeds = env.reset()
    level_seeds = level_seeds.reshape(-1, 1)
    rollouts.obs[0] = obs

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of envs
    nenvs = env.num_envs
    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space
    # Calculate the batch size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm, comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam,
                    rollouts=rollouts)
    if eval_env is not None:
        eval_runner = EvalRunner(env=eval_env, model=model, nsteps=nsteps,
                                 gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            level_seeds=level_seeds)  # pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, \
                eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Update level sampler
        level_sampler.update_with_rollouts(rollouts)
        rollouts.after_update()
        level_sampler.after_update()

        # Here, for each minibatch, we calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in
                              (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in
                              (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the
            # returns (ev > 1) or if it's just worse than predicting nothing
            # (ev =< 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)
            logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
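# Hedged usage sketch (illustrative): invoking this level-replay variant of
# learn() directly. The CNN builder mirrors the other scripts here; every other
# argument falls back to the defaults in the signature above.
#
#     conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
#     model = learn(network=conv_fn, total_timesteps=25_000_000,
#                   num_levels=50, start_level=500,
#                   level_sampler_strategy='value_l1', score_transform='rank')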
def main():
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot',
                        help='env to run on from procgen')
    parser.add_argument('--num_envs', type=int, default=64,
                        help='number of environments run simultaneously')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"],
                        help='level difficulty')
    parser.add_argument('--num_levels', type=int, default=0,
                        help='number of levels to train/test on')
    parser.add_argument('--start_level', type=int, default=0,
                        help='start level (used to avoid testing on seen levels)')
    parser.add_argument('--num_timesteps', type=int, default=0,
                        help='number of timesteps total to train/test on')
    parser.add_argument('--save_frequency', type=int, default=0,
                        help='checkpoint frequency')
    parser.add_argument('--model_loc', type=str, default=None,
                        help='location of pretrained model')
    parser.add_argument('--results_loc', type=str, default=None,
                        help='location of where to save current model/logs')
    # store_true avoids the argparse type=bool footgun, where any non-empty
    # string (including "False") parses as True
    parser.add_argument('--eval', action='store_true', default=False,
                        help='if true, does not update model')
    parser.add_argument('--data_aug', type=str, default='normal',
                        help='whether to apply data augmentation')
    parser.add_argument('--gray_p', type=float, default=0.8,
                        help='p value for grayscale data augmentation')
    parser.add_argument('--value_fn', type=str, default='fc',
                        choices=['fc', 'gmm', 'lbmdp'],
                        help='value function for ppo2 critic')
    parser.add_argument('--cnn_fn', type=str, default='impala_cnn',
                        choices=['impala_cnn', 'nature_cnn', 'impala_cnn_lstm', 'lstm'],
                        help='cnn for featurization')
    parser.add_argument('--entropy_fn', type=str, default='constant',
                        choices=['constant', 'scaled'],
                        help='function for entropy loss coefficient')
    parser.add_argument('--ent_coef', type=float, default=0.01,
                        help='coefficient applied to entropy loss')
    parser.add_argument('--ent_scalar', type=float, default=1,
                        help='coefficient applied within sigmoid to scaled entropy coefficient')
    parser.add_argument('--seed', type=int, default=None,
                        help='seed for tensorflow')
    parser.add_argument('--gamma', type=float, default=0.999,
                        help='discount factor')
    parser.add_argument('--lam', type=float, default=0.95,
                        help='advantage discount factor')
    parser.add_argument('--lr', type=float, default=5e-4,
                        help='learning rate for Adam')
    # channel depths must be integers; the original declared these as floats
    parser.add_argument('--imp_h1', type=int, default=16,
                        help='impala cnn first hidden state')
    parser.add_argument('--imp_h2', type=int, default=64,
                        help='impala cnn second hidden state')
    parser.add_argument('--imp_h3', type=int, default=64,
                        help='impala cnn third hidden state')
    args = parser.parse_args()

    logger.configure(dir=args.results_loc, format_strs=['csv', 'stdout'])

    logger.info("Creating Environment")
    venv = ProcgenEnv(num_envs=args.num_envs, env_name=args.env_name,
                      num_levels=args.num_levels, start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, 'rgb')
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("Creating Tensorflow Session")
    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    sess.__enter__()

    if args.cnn_fn == 'impala_cnn':
        conv_fn = lambda x: build_impala_cnn(
            x, depths=[args.imp_h1, args.imp_h2, args.imp_h3], emb_size=256)
    elif args.cnn_fn == 'nature_cnn':
        conv_fn = lambda x: nature_cnn(x)
    elif args.cnn_fn == 'impala_cnn_lstm':
        conv_fn = impala_cnn_lstm()
    elif args.cnn_fn == 'lstm':
        conv_fn = lstm()
    else:
        conv_fn = mlp()

    logger.info("Training")
    learn(
        network=conv_fn,
        env=venv,
        total_timesteps=args.num_timesteps,
        eval_env=None,
        seed=args.seed,
        nsteps=256,
        ent_coef=args.ent_coef,
        lr=args.lr,
        vf_coef=0.5,
        max_grad_norm=0.5,
        gamma=args.gamma,
        lam=args.lam,
        log_interval=args.save_frequency,
        nminibatches=4,
        noptepochs=3,
        cliprange=0.2,
        save_interval=0,
        load_path=args.model_loc,
        data_aug=args.data_aug,
        args=args,
    )