def make_lr_venv(num_envs, env_name, seeds, device, **kwargs):
    level_sampler = kwargs.get('level_sampler')
    level_sampler_args = kwargs.get('level_sampler_args')
    ret_normalization = not kwargs.get('no_ret_normalization', False)

    if env_name in PROCGEN_ENVS:
        num_levels = kwargs.get('num_levels', 1)
        start_level = kwargs.get('start_level', 0)
        distribution_mode = kwargs.get('distribution_mode', 'easy')
        paint_vel_info = kwargs.get('paint_vel_info', False)
        venv = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                          num_levels=num_levels, start_level=start_level,
                          distribution_mode=distribution_mode,
                          paint_vel_info=paint_vel_info)
        venv = VecExtractDictObs(venv, "rgb")
        venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
        venv = VecNormalize(venv=venv, ob=False, ret=ret_normalization)
        if level_sampler_args:
            level_sampler = LevelSampler(
                seeds, venv.observation_space, venv.action_space,
                **level_sampler_args)
        envs = VecPyTorchProcgen(venv, device, level_sampler=level_sampler)
    elif env_name.startswith('MiniGrid'):
        venv = VecMinigrid(num_envs=num_envs, env_name=env_name, seeds=seeds)
        venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
        venv = VecNormalize(venv=venv, ob=False, ret=ret_normalization)
        if level_sampler_args:
            level_sampler = LevelSampler(
                seeds, venv.observation_space, venv.action_space,
                **level_sampler_args)
        elif seeds:
            level_sampler = LevelSampler(
                seeds, venv.observation_space, venv.action_space,
                strategy='random',
            )
        envs = VecPyTorchMinigrid(venv, device, level_sampler=level_sampler)
    else:
        raise ValueError(f'Unsupported env {env_name}')

    return envs, level_sampler
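# Usage sketch (not part of the original code): how make_lr_venv might be invoked
# to build a Procgen vector env with a level sampler. The env name, seed count, and
# level_sampler_args values below are illustrative assumptions.
def _example_make_lr_venv():
    import torch
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    seeds = list(range(200))  # candidate level seeds for the sampler
    envs, level_sampler = make_lr_venv(
        64, 'bigfish', seeds, device,
        num_levels=1, start_level=0, distribution_mode='easy',
        paint_vel_info=False, no_ret_normalization=False,
        level_sampler_args=dict(strategy='random'))
    return envs, level_sampler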
def test_one_env(alt_flag, model, start_level, num_levels, logger, args, env=None):
    ## Modified based on random_ppo.learn
    # num_envs, nsteps, gamma and lam are assumed to be module-level hyperparameters.
    if not env:
        venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                          num_levels=num_levels, start_level=start_level,
                          distribution_mode=args.distribution_mode)
        venv = VecExtractDictObs(venv, "rgb")
        venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
        venv = VecNormalize(venv=venv, ob=False)
        env = venv
    runner = TestRunner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    mean_rewards = []
    datapoints = []
    for rollout in range(1, args.nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(alt_flag)
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)
        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])
        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('start_level', start_level)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * args.nbatch)
        logger.info('----\n')
        logger.dumpkvs()
    env.close()
    logger.info("Average reward on levels {} ~ {}: {} ".format(
        start_level, start_level + num_levels, mean_rewards))
    return np.mean(mean_rewards)
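# safemean is used by the test loops above and below but is not defined in this
# file. A minimal sketch, assuming it matches the baselines-style helper that
# returns NaN (instead of raising) while the episode-info buffer is still empty:
def safemean(xs):
    return np.nan if len(xs) == 0 else np.mean(xs)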
def test_all(alt_flag, load_path, logger, args):
    train_end = int(args.train_level)
    config = tf.compat.v1.ConfigProto(log_device_placement=True)  # device_count={'GPU':0}
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)

    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name,
                      num_levels=train_end, start_level=0,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    env = venv

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = args.total_tsteps // nbatch
    args.nrollouts = nrollouts
    args.nbatch = nbatch

    model = Model(sess=sess, policy=EnsembleCnnPolicy, ob_space=ob_space,
                  ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                  nsteps=nsteps, ent_coef=ent_coef, vf_coef=0.5, max_grad_norm=0.5)
    model.load(load_path)
    logger.info("Model params loaded from saved model: ", load_path)

    mean_rewards = []
    ## first, test train performance
    mean_rewards.append(
        test_one_env(alt_flag, model, 0, train_end, logger, args, env=env))
    ## then, test on sampled intervals
    for l in TEST_START_LEVELS:
        mean_rewards.append(
            test_one_env(alt_flag, model, l, 100, logger, args, env=None))
    logger.info("All tests finished, mean reward history: ", mean_rewards)
    return
def __init__(self, model, config_dir: pathlib.Path, n_trajectories: int,
             tunable_params: List[EnvironmentParameter]):
    self._model = model
    self._n_trajectories = n_trajectories

    # Initialize the environment
    easy_config_path = config_dir / 'test_easy_config.json'
    easy_config = copy.copy(BossfightEasyConfig)
    easy_config.to_json(easy_config_path)
    easy_env = ProcgenEnv(num_envs=1,
                          env_name=str(easy_config.game),
                          domain_config_path=str(easy_config_path))
    easy_env = VecExtractDictObs(easy_env, "rgb")
    easy_env = VecMonitor(venv=easy_env, filename=None, keep_buf=100)
    self.easy_env = VecNormalize(venv=easy_env, ob=False)

    hard_config_path = config_dir / 'test_hard_config.json'
    hard_config = copy.copy(BossfightHardConfig)
    hard_config.to_json(hard_config_path)
    hard_env = ProcgenEnv(num_envs=1,
                          env_name=str(hard_config.game),
                          domain_config_path=str(hard_config_path))
    hard_env = VecExtractDictObs(hard_env, "rgb")
    hard_env = VecMonitor(venv=hard_env, filename=None, keep_buf=100)
    self.hard_env = VecNormalize(venv=hard_env, ob=False)

    # Make a default config for bossfight...
    test_domain_config_path = config_dir / 'test_full_config.json'
    test_domain_config = DEFAULT_DOMAIN_CONFIGS['dc_bossfight']
    test_domain_config.to_json(test_domain_config_path)
    params = {}
    for param in tunable_params:
        params['min_' + param.name] = param.clip_lower_bound
        params['max_' + param.name] = param.clip_upper_bound
    test_domain_config.update_parameters(params, cache=False)
    full_env = ProcgenEnv(num_envs=1,
                          env_name=str(test_domain_config.game),
                          domain_config_path=str(test_domain_config_path))
    full_env = VecExtractDictObs(full_env, "rgb")
    full_env = VecMonitor(venv=full_env, filename=None, keep_buf=100)
    self.full_env = VecNormalize(venv=full_env, ob=False)
def evaluate(args, actor_critic, device, num_processes=1, aug_id=None):
    actor_critic.eval()

    # Sample Levels From the Full Distribution
    venv = ProcgenEnv(num_envs=num_processes, env_name=args.env_name,
                      num_levels=0, start_level=0,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    eval_envs = VecPyTorchProcgen(venv, device)

    eval_episode_rewards = []
    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.ones(num_processes, 1, device=device)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            if aug_id:
                obs = aug_id(obs)
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs, eval_recurrent_hidden_states, eval_masks, deterministic=False)

        obs, _, done, infos = eval_envs.step(action)
        eval_masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                  dtype=torch.float32, device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()
    print("Last {} test episodes: mean/median reward {:.1f}/{:.1f}\n".format(
        len(eval_episode_rewards),
        np.mean(eval_episode_rewards), np.median(eval_episode_rewards)))

    return eval_episode_rewards
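# evaluate() treats `aug_id` as a callable applied to a batch of observations
# before the policy acts. A minimal sketch of the identity version used at
# evaluation time (assumption: DrAC-style interface, obs is a torch tensor batch):
def identity_aug(obs_batch):
    # No-op "augmentation": return the observations unchanged.
    return obs_batch

# e.g. eval_rewards = evaluate(args, actor_critic, device, aug_id=identity_aug)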
def __init__(self, model, train_config_path: Union[str, pathlib.Path], env_parameter: EnvironmentParameter, adr_config: ADRConfig): self._model = model # Model being evaluated self._gamma = adr_config.gamma # Discount rate self._lambda = adr_config.lmbda # Lambda used in GAE (General Advantage Estimation) self._env_parameter = env_parameter self._param_name = self._env_parameter.name self._max_buffer_size = adr_config.max_buffer_size self._n_trajectories = adr_config.n_eval_trajectories self._upper_sample_prob = adr_config.upper_sample_prob self._train_config_path = pathlib.Path(train_config_path) config_dir = self._train_config_path.parent config_name = self._param_name + '_adr_eval_config.json' # Initialize the config for the evaluation environment # This config will be updated regularly throughout training. When we boundary sample this environment's # parameter, the config will be modified to set the parameter to the selected boundary before running a number # of trajectories. self._boundary_config = DomainConfig.from_json(self._train_config_path) self._boundary_config_path = config_dir / config_name self._boundary_config.to_json(self._boundary_config_path) # Initialize the environment env = ProcgenEnv(num_envs=1, env_name=str(self._boundary_config.game), domain_config_path=str(self._boundary_config_path)) env = VecExtractDictObs(env, "rgb") env = VecMonitor(venv=env, filename=None, keep_buf=100) self._env = VecNormalize(venv=env, ob=False) # Initialize the performance buffers self._upper_performance_buffer, self._lower_performance_buffer = PerformanceBuffer( ), PerformanceBuffer() self._states = { 'lower': model.adr_initial_state, 'upper': model.adr_initial_state } self._obs = self._env.reset() self._dones = [False]
def make_vec_env(nenvs=4, recurrent=False, grayscale=True, frame_stack=4, num_agents=2):
    # monitor_filepath is expected to be defined at module level.
    venv = SubprocVecEnv([
        # Bind rank at definition time so each subprocess gets its own rank
        # (avoids the late-binding lambda pitfall).
        lambda rank=rank: make_env(rank, grayscale=grayscale, num_agents=num_agents)
        for rank in range(nenvs)
    ])
    # Uncomment this line in place of the one above for debugging.
    # venv = DummyVecEnv([lambda: make_env(0)])
    if not recurrent:
        # Perform the frame stack at the vectorized environment level as opposed to at
        # the individual environment level. I think this allows you to communicate fewer
        # images across processes.
        venv = VecFrameStack(venv, frame_stack)
    venv = MultiAgentToSingleAgent(venv, num_agents=num_agents)
    venv = VecMonitor(venv, filename=monitor_filepath)
    return venv
def test_fn(env_name, num_envs, config_path, load_path):
    test_config_path = os.path.join(os.getcwd(), "procgen-adr", config_path)
    test_env = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                          domain_config_path=test_config_path, render_mode="rgb_array")
    test_env = VecExtractDictObs(test_env, "rgb")
    test_env = VecMonitor(venv=test_env, filename=None, keep_buf=100)
    test_env = VecNormalize(venv=test_env, ob=False)

    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    recur = True
    if recur:
        logger.info("Using CNN LSTM")
        conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)

    mean, std = test(conv_fn, test_env, load_path=load_path)
    sess.close()
    return mean, std
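# Usage sketch for test_fn (the config filename and checkpoint path below are
# placeholders, not paths from the original repository):
def _example_test_fn():
    mean_reward, std_reward = test_fn('dc_bossfight', num_envs=8,
                                      config_path='test_easy_config.json',
                                      load_path='checkpoints/adr_policy_latest')
    print('test reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))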
def main(): num_envs = 64 learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 total_timesteps = 1_000_000 ## now this counts steps in testing runs use_vf_clipping = True ## From random_ppo.py max_grad_norm = 0.5 vf_coef = 0.5 L2_WEIGHT = 10e-4 FM_COEFF = 0.002 REAL_THRES = 0.1 parser = argparse.ArgumentParser( description='Process procgen testing arguments.') parser.add_argument('--env_name', type=str, default='fruitbot') parser.add_argument( '--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=1000) ## default starting_level set to 50 to test on unseen levels! parser.add_argument('--start_level', type=int, default=1000) parser.add_argument('--run_id', '-id', type=int, default=0) parser.add_argument('--load_id', type=int, default=0) parser.add_argument('--nrollouts', '-nroll', type=int, default=0) args = parser.parse_args() args.total_timesteps = total_timesteps if args.nrollouts: total_timesteps = int(args.nrollouts * num_envs * nsteps) run_ID = 'run_' + str(args.run_id).zfill(2) run_ID += '_load{}'.format(args.load_id) comm = MPI.COMM_WORLD rank = comm.Get_rank() mpi_rank_weight = 0 num_levels = args.num_levels log_comm = comm.Split(0, 0) format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else [] logpath = join(LOG_DIR, run_ID) if not os.path.exists(logpath): os.system("mkdir -p %s" % logpath) fpath = join(logpath, 'args_{}.json'.format(run_ID)) with open(fpath, 'w') as fh: json.dump(vars(args), fh, indent=4, sort_keys=True) print("\nSaved args at:\n\t{}\n".format(fpath)) logger.configure(dir=logpath, format_strs=format_strs) logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.compat.v1.Session(config=config) sess.__enter__() logger.info("Testing") ## Modified based on random_ppo.learn env = venv nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches nrollouts = total_timesteps // nbatch network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256) policy = build_policy(env, network) model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id) model.load(LOAD_PATH) logger.info("Model pramas loaded from save") runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf10 = deque(maxlen=10) epinfobuf100 = deque(maxlen=100) # tfirststart = time.time() ## Not doing timing yet # active_ep_buf = epinfobuf100 mean_rewards = [] datapoints = [] for rollout in range(1, nrollouts + 1): logger.info('collecting rollouts {}...'.format(rollout)) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) ## differnent from random_ppo! 
epinfobuf10.extend(epinfos) epinfobuf100.extend(epinfos) rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10]) rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100]) ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10]) ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100]) logger.info('\n----', rollout) mean_rewards.append(rew_mean_10) logger.logkv('eprew10', rew_mean_10) logger.logkv('eprew100', rew_mean_100) logger.logkv('eplenmean10', ep_len_mean_10) logger.logkv('eplenmean100', ep_len_mean_100) logger.logkv("misc/total_timesteps", rollout * nbatch) logger.info('----\n') logger.dumpkvs() env.close() print("Rewards history: ", mean_rewards) return mean_rewards
def train(args): args.cuda = not args.no_cuda and torch.cuda.is_available() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) log_dir = os.path.expanduser(args.log_dir) utils.cleanup_log_dir(log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name, args.seed) logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout'], log_suffix=log_file) venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \ num_levels=args.num_levels, start_level=args.start_level, \ distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) envs = VecPyTorchProcgen(venv, device) obs_shape = envs.observation_space.shape actor_critic = Policy(obs_shape, envs.action_space.n, base_kwargs={ 'recurrent': False, 'hidden_size': args.hidden_size }) actor_critic.to(device) if modelbased: rollouts = BiggerRolloutStorage( args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, aug_type=args.aug_type, split_ratio=args.split_ratio) else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, aug_type=args.aug_type, split_ratio=args.split_ratio) batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch) if args.use_ucb: aug_id = data_augs.Identity aug_list = [ aug_to_func[t](batch_size=batch_size) for t in list(aug_to_func.keys()) ] agent = algo.UCBDrAC(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_list=aug_list, aug_id=aug_id, aug_coef=args.aug_coef, num_aug_types=len(list(aug_to_func.keys())), ucb_exploration_coef=args.ucb_exploration_coef, ucb_window_length=args.ucb_window_length) elif args.use_meta_learning: aug_id = data_augs.Identity aug_list = [aug_to_func[t](batch_size=batch_size) \ for t in list(aug_to_func.keys())] aug_model = AugCNN() aug_model.to(device) agent = algo.MetaDrAC(actor_critic, aug_model, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, meta_grad_clip=args.meta_grad_clip, meta_num_train_steps=args.meta_num_train_steps, meta_num_test_steps=args.meta_num_test_steps, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_coef=args.aug_coef) elif args.use_rl2: aug_id = data_augs.Identity aug_list = [ aug_to_func[t](batch_size=batch_size) for t in list(aug_to_func.keys()) ] rl2_obs_shape = [envs.action_space.n + 1] rl2_learner = Policy(rl2_obs_shape, len(list(aug_to_func.keys())), base_kwargs={ 'recurrent': True, 'hidden_size': args.rl2_hidden_size }) rl2_learner.to(device) agent = algo.RL2DrAC(actor_critic, rl2_learner, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.rl2_entropy_coef, lr=args.lr, eps=args.eps, rl2_lr=args.rl2_lr, rl2_eps=args.rl2_eps, max_grad_norm=args.max_grad_norm, aug_list=aug_list, aug_id=aug_id, aug_coef=args.aug_coef, num_aug_types=len(list(aug_to_func.keys())), recurrent_hidden_size=args.rl2_hidden_size, num_actions=envs.action_space.n, device=device) elif False: # Regular Drac aug_id = data_augs.Identity aug_func = aug_to_func[args.aug_type](batch_size=batch_size) agent = algo.DrAC(actor_critic, 
args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_func=aug_func, aug_coef=args.aug_coef, env_name=args.env_name) elif False: # Model Free Planning Drac aug_id = data_augs.Identity aug_func = aug_to_func[args.aug_type](batch_size=batch_size) actor_critic = PlanningPolicy(obs_shape, envs.action_space.n, base_kwargs={ 'recurrent': False, 'hidden_size': args.hidden_size }) actor_critic.to(device) agent = algo.DrAC(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_func=aug_func, aug_coef=args.aug_coef, env_name=args.env_name) else: # Model based Drac aug_id = data_augs.Identity aug_func = aug_to_func[args.aug_type](batch_size=batch_size) actor_critic = ModelBasedPolicy(obs_shape, envs.action_space.n, base_kwargs={ 'recurrent': False, 'hidden_size': args.hidden_size }) actor_critic.to(device) agent = algo.ConvDrAC(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_func=aug_func, aug_coef=args.aug_coef, env_name=args.env_name) obs = envs.reset() rollouts.obs[0].copy_(obs) if modelbased: rollouts.next_obs[0].copy_(obs) # TODO: is this right? rollouts.to(device) episode_rewards = deque(maxlen=10) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in trange(num_updates): actor_critic.train() for step in range(args.num_steps): # Sample actions with torch.no_grad(): obs_id = aug_id(rollouts.obs[step]) value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( obs_id, rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): obs_id = aug_id(rollouts.obs[-1]) next_value = actor_critic.get_value( obs_id, rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.gae_lambda) if args.use_ucb and j > 0: agent.update_ucb_values(rollouts) if isinstance(agent, algo.ConvDrAC): value_loss, action_loss, dist_entropy, transition_model_loss, reward_model_loss = agent.update( rollouts) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "\nUpdate {}, step {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}" .format(j, total_num_steps, len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), dist_entropy, value_loss, action_loss)) logger.logkv("train/nupdates", j) logger.logkv("train/total_num_steps", total_num_steps) logger.logkv("losses/dist_entropy", dist_entropy) logger.logkv("losses/value_loss", value_loss) logger.logkv("losses/action_loss", action_loss) if isinstance(agent, algo.ConvDrAC): logger.logkv("losses/transition_model_loss", transition_model_loss) logger.logkv("losses/reward_model_loss", reward_model_loss) logger.logkv("train/mean_episode_reward", np.mean(episode_rewards)) logger.logkv("train/median_episode_reward", np.median(episode_rewards)) ### Eval on the Full Distribution of Levels ### eval_episode_rewards = evaluate(args, actor_critic, device, aug_id=aug_id) logger.logkv("test/mean_episode_reward", np.mean(eval_episode_rewards)) logger.logkv("test/median_episode_reward", np.median(eval_episode_rewards)) logger.dumpkvs()
def main(): num_envs = 64 learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 timesteps_per_proc = 30_000_000 use_vf_clipping = True parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='fruitbot') parser.add_argument( '--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=50) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', '-id', type=int, default=0) parser.add_argument('--use', type=str, default="randcrop") parser.add_argument('--log_interval', type=int, default=20) parser.add_argument('--nupdates', type=int, default=0) parser.add_argument('--total_tsteps', type=int, default=0) parser.add_argument('--load_id', type=int, default=int(-1)) args = parser.parse_args() if args.nupdates: timesteps_per_proc = int(args.nupdates * num_envs * nsteps) if not args.total_tsteps: args.total_tsteps = timesteps_per_proc ## use global 20_000_000 if not specified in args! run_ID = 'run_' + str(args.run_id).zfill(2) ## select which ppo to use: agent_str = args.use LOG_DIR = join("log", agent_str, "train") save_model = join("log", agent_str, "saved_{}_v{}.tar".format(agent_str, args.run_id)) ppo_func = PPO_FUNCs[agent_str] load_path = None if args.load_id > -1: load_path = join("log", agent_str, "saved_{}_v{}.tar".format(agent_str, args.load_id)) test_worker_interval = args.test_worker_interval comm = MPI.COMM_WORLD rank = comm.Get_rank() is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 num_levels = 0 if is_test_worker else args.num_levels log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else [] logpath = join(LOG_DIR, run_ID) if not os.path.exists(logpath): os.system("mkdir -p %s" % logpath) logger.configure(dir=logpath, format_strs=format_strs) fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID)) with open(fpath, 'w') as fh: json.dump(vars(args), fh, indent=4, sort_keys=True) print("\nSaved args at:\n\t{}\n".format(fpath)) logger.info("\n Saving model to file {}".format(save_model)) logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.compat.v1.ConfigProto( log_device_placement=True) #device_count={'GPU':0, 'XLA_GPU':0}) config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.compat.v1.Session(config=config) #sess.__enter__() logger.info(venv.observation_space) logger.info("training") with sess.as_default(): model = ppo_func.learn( sess=sess, env=venv, network=None, total_timesteps=args.total_tsteps, save_interval=1000, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=args.log_interval, ent_coef=ent_coef, # clip_vf=use_vf_clipping, lr=learning_rate, cliprange=clip_range, # update_fn=None, # init_fn=None, save_path=save_model, 
load_path=load_path, vf_coef=0.5, max_grad_norm=0.5, ) model.save(save_model)
def main(): args = parse_config() run_dir = log_this(args, args.log_dir, args.log_name + '_' + args.env_name + '_' + args.rm_id) test_worker_interval = args.test_worker_interval comm = MPI.COMM_WORLD is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure(dir=run_dir, format_strs=format_strs) logger.info("creating environment") venv = ProcgenEnv(num_envs=args.num_envs, env_name=args.env_name, num_levels=args.num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode, use_sequential_levels=args.use_sequential_levels) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) if args.rm_id: # load pretrained network device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") net = RewardNet().to(device) rm_path = glob.glob('./**/' + args.rm_id + '.rm', recursive=True)[0] net.load_state_dict( torch.load(rm_path, map_location=torch.device(device))) # use batch reward prediction function instead of the ground truth reward function # pass though sigmoid if needed if args.use_sigmoid: rew_func = lambda x: 1 / (1 + np.exp(-net.predict_batch_rewards(x)) ) else: rew_func = lambda x: net.predict_batch_rewards(x) ## Uncomment the line below to train a live-long agent # rew_func = lambda x: x.shape[0] * [1] venv = ProxyRewardWrapper(venv, rew_func) else: # true environment rewards will be use pass venv = VecNormalize(venv=venv, ob=False, use_tf=False) # do the rest of the training as normal logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256) logger.info("training") model = ppo2.learn( env=venv, network=conv_fn, total_timesteps=args.timesteps_per_proc, save_interval=args.save_interval, nsteps=args.nsteps, nminibatches=args.nminibatches, lam=args.lam, gamma=args.gamma, noptepochs=args.ppo_epochs, log_interval=args.log_interval, ent_coef=args.ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=args.use_vf_clipping, comm=comm, lr=args.learning_rate, cliprange=args.clip_range, update_fn=None, init_fn=None, vf_coef=0.5, max_grad_norm=0.5, load_path=args.load_path, ) model.save(os.path.join(run_dir, 'final_model.parameters'))
def train_fn(env_name: str,
             num_train_envs: int,
             n_training_steps: int,
             adr_config: ADRConfig = None,
             experiment_dir: str = None,
             tunable_params_config_path: str = None,
             log_dir: str = None,
             is_test_worker: bool = False,
             comm=None,
             save_interval: int = 1000,
             log_interval: int = 20,
             recur: bool = True):
    # Get the default ADR config if none is provided
    adr_config = ADRConfig() if adr_config is None else adr_config

    # Set up the experiment directory for this run. This will contain everything, from the domain configs for the
    # training environment and ADR evaluation environments to the logs. If the directory path is not provided, then
    # we'll make one and use the date-time name to make it unique
    if experiment_dir is None:
        experiment_dir = pathlib.Path().absolute() / 'adr_experiments' / (
            'experiment-' + datetime_name())
        experiment_dir.mkdir(parents=True, exist_ok=False)
    else:
        experiment_dir = pathlib.Path(experiment_dir)

    # Make a config directory within the experiment directory to hold the domain configs
    config_dir = experiment_dir / 'domain_configs'
    config_dir.mkdir(parents=True, exist_ok=False)

    # Load the tunable parameters from a config file if it is provided, otherwise get the default for the given game.
    if tunable_params_config_path is None:
        try:
            tunable_params = DEFAULT_TUNABLE_PARAMS[env_name]
        except KeyError:
            raise KeyError(f'No default tunable parameters exist for {env_name}')
    else:
        raise NotImplementedError(
            'Currently no way to load tunable parameters from a configuration file')

    # Make a default config for the given game...
    train_domain_config_path = config_dir / 'train_config.json'
    try:
        train_domain_config = DEFAULT_DOMAIN_CONFIGS[env_name]
        train_domain_config.to_json(train_domain_config_path)
    except KeyError:
        raise KeyError(f'No default config exists for {env_name}')

    # ...then load the initial bounds for the tunable parameters into the config.
    params = {}
    for param in tunable_params:
        params['min_' + param.name] = param.lower_bound
        params['max_' + param.name] = param.upper_bound
    train_domain_config.update_parameters(params, cache=False)

    # Configure the logger if we are given a log directory
    if log_dir is not None:
        log_dir = experiment_dir / log_dir
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=str(log_dir), format_strs=format_strs)

    logger.info(f'env_name: {env_name}')
    logger.info(f'num_train_envs: {num_train_envs}')
    logger.info(f'n_training_steps: {n_training_steps}')
    logger.info(f'experiment_dir: {experiment_dir}')
    logger.info(f'tunable_params_config_path: {tunable_params_config_path}')
    logger.info(f'log_dir: {log_dir}')
    logger.info(f'save_interval: {save_interval}')

    n_steps = 256
    ent_coef = .01
    lr = 5e-4
    vf_coef = .5
    max_grad_norm = .5
    gamma = .999
    lmbda = .95
    n_minibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1

    logger.info('creating environment')
    training_env = ProcgenEnv(num_envs=num_train_envs, env_name=env_name,
                              domain_config_path=str(train_domain_config_path))
    training_env = VecExtractDictObs(training_env, "rgb")
    training_env = VecMonitor(venv=training_env, filename=None, keep_buf=100)
    training_env = VecNormalize(venv=training_env, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.__enter__()

    def conv_fn(x):
        return build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    if recur:
        logger.info("Using CNN LSTM")
        conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)

    logger.info('training')
    ppo2_adr.learn(conv_fn,
                   training_env,
                   n_training_steps,
                   config_dir,
                   adr_config,
                   train_domain_config,
                   tunable_params,
                   n_steps=n_steps,
                   ent_coef=ent_coef,
                   lr=lr,
                   vf_coef=vf_coef,
                   max_grad_norm=max_grad_norm,
                   gamma=gamma,
                   lmbda=lmbda,
                   log_interval=log_interval,
                   save_interval=save_interval,
                   n_minibatches=n_minibatches,
                   n_optepochs=ppo_epochs,
                   clip_range=clip_range,
                   mpi_rank_weight=mpi_rank_weight,
                   clip_vf=use_vf_clipping)
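# Usage sketch for train_fn (argument values are illustrative assumptions; the MPI
# communicator is passed through because the logger is split by worker role):
def _example_train_fn():
    from mpi4py import MPI
    train_fn(env_name='dc_bossfight',
             num_train_envs=64,
             n_training_steps=50_000_000,
             log_dir='logs',
             is_test_worker=False,
             comm=MPI.COMM_WORLD,
             recur=True)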
def main(): # get model path parser = argparse.ArgumentParser(description="Parse testing arguments") parser.add_argument('--model_path', type=str, default=None, help='Path to model checkpoint.') parser.add_argument('--config', type=str, default='configurations/ppo_baseline_cuda.yaml', help='Path to configuration file.') args = parser.parse_args() if args.model_path is None or not os.path.exists(args.model_path): raise OSError("Invalid model file supplied") # create configuration cfg = get_cfg_defaults() cfg.merge_from_file(args.config) # create save directory model_file_path = args.model_path exp_creation_time = os.path.normpath(model_file_path).split(os.sep)[-3] print(exp_creation_time) exp_dir = f"runs/{cfg.EXPERIMENT_NAME}/{exp_creation_time}_test/" os.makedirs(exp_dir, exist_ok=True) # create logger format_strs = ['csv', 'stdout'] logger.configure(dir=exp_dir, format_strs=format_strs, log_suffix=datetime.now().strftime('%Y-%m-%d-%H-%M')) # create (vectorized) procgen environment logger.info("creating environment") venv = ProcgenEnv(num_envs=cfg.TEST.NUM_ENVS, env_name="fruitbot", num_levels=cfg.TEST.NUM_LEVELS, start_level=cfg.TEST.LEVEL_SEED, distribution_mode="easy") venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) # create tensorflow session logger.info("creating tf session") config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() # create cnn todo: make this less ugly conv_fn = None logger.info("building cnn") if cfg.TRAIN.NETWORK == "NATURE_CNN": conv_fn = lambda x: nature_cnn(x) elif cfg.TRAIN.NETWORK == "IMPALA_CNN": conv_fn = lambda x: build_impala_cnn( x, depths=[16, 32, 32], emb_size=256) # training logger.info("testing") ppo2.learn(env=venv, network=conv_fn, total_timesteps=cfg.TEST.TIMESTEPS, save_interval=0, nsteps=cfg.TEST.BATCH_SIZE, nminibatches=cfg.TRAIN.MINIBATCHES, lam=cfg.TRAIN.LAM, gamma=cfg.TRAIN.GAMMA, noptepochs=cfg.TRAIN.NUM_EPOCHS, log_interval=1, clip_vf=cfg.TRAIN.USE_VF_CLIPPING, lr=cfg.TRAIN.LR, cliprange=cfg.TRAIN.CLIP_RANGE, update_fn=None, init_fn=None, vf_coef=0.5, max_grad_norm=0.5, test=True, load_path=model_file_path)
def main(): num_envs = 64 learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 last_step = 4587520 # where we have left off in training timesteps_per_proc = 25_000_000 - last_step use_vf_clipping = True model_path = '../train-procgen/saved_model/policy_bossfight_vae560' vae_path = '../train-procgen/saved_model/bossfight_vae560' parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='coinrun') parser.add_argument( '--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=0) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) args = parser.parse_args() test_worker_interval = args.test_worker_interval comm = MPI.COMM_WORLD rank = comm.Get_rank() is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 num_levels = 0 if is_test_worker else args.num_levels log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure(dir=LOG_DIR, format_strs=format_strs) logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256) logger.info("training") ppo2_cvae.learn(env=venv, network=conv_fn, total_timesteps=timesteps_per_proc, save_interval=10, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=0.5, max_grad_norm=0.5, load_path=model_path, vae_path=vae_path)
def main(): """Run DQN until the environment throws an exception.""" # Hyperparameters learning_rate = 2.5e-4 gamma = 0.99 nstep_return = 3 timesteps_per_proc = 50_000_000 train_interval = 4 target_interval = 8192 batch_size = 512 min_buffer_size = 20000 # Parse arguments parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='starpilot') parser.add_argument( '--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=1) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', type=int, default=1) parser.add_argument('--gpus_id', type=str, default='') parser.add_argument('--level_setup', type=str, default='procgen', choices=["procgen", "oracle"]) parser.add_argument('--mix_mode', type=str, default='nomix', choices=['nomix', 'mixreg']) parser.add_argument('--mix_alpha', type=float, default=0.2) parser.add_argument('--use_l2reg', action='store_true') parser.add_argument('--data_aug', type=str, default='no_aug', choices=['no_aug', 'cutout_color', 'crop']) parser.add_argument('--PER', type=lambda x: bool(strtobool(x)), default=True, help='Whether to use PER') parser.add_argument('--num_envs', type=int, default=64) args = parser.parse_args() # Setup test worker comm = MPI.COMM_WORLD rank = comm.Get_rank() test_worker_interval = args.test_worker_interval is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 num_envs = args.num_envs # Setup env specs if args.level_setup == "procgen": env_name = args.env_name num_levels = 0 if is_test_worker else args.num_levels start_level = args.start_level elif args.level_setup == "oracle": env_name = args.env_name num_levels = 0 start_level = args.start_level # Setup logger log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure( dir=LOG_DIR + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}', format_strs=format_strs) # Create env logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) # Setup Tensorflow logger.info("creating tf session") if args.gpus_id: gpus_id = [x.strip() for x in args.gpus_id.split(',')] os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)] setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() # Setup Rainbow models logger.info("building models") online_net, target_net = rainbow_models( sess, venv.action_space.n, gym_space_vectorizer(venv.observation_space), min_val=REWARD_RANGE_FOR_C51[env_name][0], max_val=REWARD_RANGE_FOR_C51[env_name][1]) dqn = MpiDQN(online_net, target_net, discount=gamma, comm=comm, mpi_rank_weight=mpi_rank_weight, mix_mode=args.mix_mode, mix_alpha=args.mix_alpha, use_l2reg=args.use_l2reg, data_aug=args.data_aug) player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return) optimize = dqn.optimize(learning_rate=learning_rate) # Initialize and sync variables 
sess.run(tf.global_variables_initializer()) global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if comm.Get_size() > 1: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E110 # Training logger.info("training") if args.PER: dqn.train(num_steps=timesteps_per_proc, player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=train_interval, target_interval=target_interval, batch_size=batch_size, min_buffer_size=min_buffer_size) else: #set alpha and beta equal to 0 for uniform prioritization and no importance sampling dqn.train(num_steps=timesteps_per_proc, player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0, 0, epsilon=0.1), optimize_op=optimize, train_interval=train_interval, target_interval=target_interval, batch_size=batch_size, min_buffer_size=min_buffer_size)
def main(): num_envs = 64 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 total_timesteps = 1_000_000 ## From random_ppo.py max_grad_norm = 0.5 vf_coef = 0.5 parser = argparse.ArgumentParser( description='Process procgen testing arguments.') parser.add_argument('--env_name', type=str, default='fruitbot') parser.add_argument( '--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=1000) parser.add_argument('--start_level', type=int, default=50) parser.add_argument('--run_id', '-id', type=int, default=0) parser.add_argument('--load_id', type=int, default=0) parser.add_argument('--nrollouts', '-nroll', type=int, default=50) parser.add_argument('--use', type=str, default="randcrop") parser.add_argument('--arch', type=str, default="impala") parser.add_argument('--no_bn', dest='use_batch_norm', action='store_false') parser.add_argument('--netrand', dest='netrand', action='store_true') parser.set_defaults(use_batch_norm=True) args = parser.parse_args() args.total_timesteps = total_timesteps arch = args.arch use_batch_norm = args.use_batch_norm if args.nrollouts: total_timesteps = int(args.nrollouts * num_envs * nsteps) run_ID = 'run_' + str(args.run_id).zfill(2) run_ID += '_load{}'.format(args.load_id) print(args.use) LOG_DIR = 'log/{}/test'.format(args.use) if not args.netrand: policy = CnnPolicy else: policy = RandomCnnPolicy load_model = "log/{}/saved_{}_v{}.tar".format(args.use, args.use, args.load_id) comm = MPI.COMM_WORLD num_levels = args.num_levels log_comm = comm.Split(0, 0) format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else [] logpath = join(LOG_DIR, run_ID) if not os.path.exists(logpath): os.system("mkdir -p %s" % logpath) fpath = join(logpath, 'args_{}.json'.format(run_ID)) with open(fpath, 'w') as fh: json.dump(vars(args), fh, indent=4, sort_keys=True) print("\nSaved args at:\n\t{}\n".format(fpath)) logger.configure(dir=logpath, format_strs=format_strs) logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True sess = tf.compat.v1.Session(config=config) sess.__enter__() logger.info("Testing") env = venv nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches nrollouts = total_timesteps // nbatch model = Model(sess=sess, policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, arch=arch, use_batch_norm=use_batch_norm, dropout=0) model.load(load_model) logger.info("Model pramas loaded from saved model: ", load_model) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, aug_func=None) epinfobuf10 = deque(maxlen=10) epinfobuf100 = deque(maxlen=100) mean_rewards = [] datapoints = [] for rollout in range(1, nrollouts + 1): logger.info('collecting rollouts {}...'.format(rollout)) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) epinfobuf10.extend(epinfos) epinfobuf100.extend(epinfos) 
rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10]) rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100]) ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10]) ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100]) logger.info('\n----', rollout) mean_rewards.append(rew_mean_10) logger.logkv('eprew10', rew_mean_10) logger.logkv('eprew100', rew_mean_100) logger.logkv('eplenmean10', ep_len_mean_10) logger.logkv('eplenmean100', ep_len_mean_100) logger.logkv("misc/total_timesteps", rollout * nbatch) logger.info('----\n') logger.dumpkvs() env.close() print("Rewards history: ", mean_rewards) return mean_rewards
def main(): # Hyperparameters num_envs = 128 learning_rate = 5e-4 ent_coef = .01 vf_coef = 0.5 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 max_grad_norm = 0.5 timesteps_per_proc = 100_000_000 use_vf_clipping = True # Parse arguments parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='coinrun') parser.add_argument('--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=0) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', type=int, default=1) parser.add_argument('--gpus_id', type=str, default='') parser.add_argument('--use_bn', action='store_true') parser.add_argument('--use_l2reg', action='store_true') parser.add_argument('--l2reg_coeff', type=float, default=1e-4) parser.add_argument('--data_aug', type=str, default='no_aug', choices=["no_aug", "cutout_color", "crop"]) parser.add_argument('--use_rand_conv', action='store_true') parser.add_argument('--model_width', type=str, default='1x', choices=["1x", "2x", "4x"]) parser.add_argument('--level_setup', type=str, default='procgen', choices=["procgen", "oracle"]) parser.add_argument('--mix_mode', type=str, default='nomix', choices=['nomix', 'mixreg', 'mixobs']) parser.add_argument('--mix_alpha', type=float, default=0.2) # JAG: Add second parameter beta to the beta distribution parser.add_argument('--mix_beta', type=float, default=0.2) # JAG: Parameters for adversarial RL # 1. The ending condition for adversarial gradient descent parser.add_argument('--adv_epsilon', type=float, default=5e-6) # 2. Learning rate for adversarial gradient descent parser.add_argument('--adv_lr', type=float, default=10) # 3. Adversarial penalty for observation euclidean distance parser.add_argument('--adv_gamma', type=float, default=0.01) # 4. We use adversarial after #threshold epochs of PPO training parser.add_argument('--adv_thresh', type=int, default=50) # 5. If we use evaluation environment parser.add_argument('--eval_env', type=bool, default=True) parser.add_argument('--eval_levels', type=int, default=0) # 6. The ratio of adversarial augmented data # adv = 1 means we replace original data with adversarial data # adv = 0 means we do not use adversarial parser.add_argument('--adv_adv', type=float, default=0.5) # 7. The ratio of mixup original data with augmented data # adv = 1 means we use augmented obs and value # adv = 0 means we use original obs and value parser.add_argument('--adv_obs', type=float, default=1) parser.add_argument('--adv_value', type=float, default=1) # Determine what percentage of environments we use (For generalization) # nenv = 1 means we use all the environments parser.add_argument('--adv_nenv', type=float, default=1) # 9. 
We test the first 500 epochs parser.add_argument('--adv_epochs', type=int, default=500) args = parser.parse_args() # Setup test worker comm = MPI.COMM_WORLD rank = comm.Get_rank() test_worker_interval = args.test_worker_interval is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 # Setup env specs if args.level_setup == "procgen": env_name = args.env_name num_levels = 0 if is_test_worker else args.num_levels start_level = args.start_level elif args.level_setup == "oracle": env_name = args.env_name num_levels = 0 start_level = args.start_level # Setup logger log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure( dir=LOG_DIR + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}', format_strs=format_strs ) # Create env logger.info("creating environment") # JAG: Limit the maximum training levels train_levels = int(num_levels * args.adv_nenv) venv = ProcgenEnv( num_envs=num_envs, env_name=env_name, num_levels=train_levels, start_level=start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) # JAG: If we use eval_env if args.eval_env: eval_env = ProcgenEnv( num_envs=num_envs, env_name=env_name, num_levels=args.eval_levels, start_level=start_level, distribution_mode=args.distribution_mode) eval_env = VecExtractDictObs(eval_env, "rgb") eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100) eval_env = VecNormalize(venv=eval_env, ob=False) else: eval_env = None # Feed parameters to a dictionary adv_ratio={ 'adv': args.adv_adv, 'obs': args.adv_obs, 'value': args.adv_value, #'nenv': args.adv_nenv, } # Setup Tensorflow logger.info("creating tf session") if args.gpus_id: gpus_id = [x.strip() for x in args.gpus_id.split(',')] os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank] setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() # Setup model if args.model_width == '1x': depths = [16, 32, 32] elif args.model_width == '2x': depths = [32, 64, 64] elif args.model_width == '4x': depths = [64, 128, 128] conv_fn = lambda x: build_impala_cnn( x, depths=depths, use_bn=args.use_bn, randcnn=args.use_rand_conv and not is_test_worker) # Training logger.info("training") ppo2.learn = learn # use customized "learn" function model = ppo2.learn( env=venv, network=conv_fn, total_timesteps=timesteps_per_proc, save_interval=0, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=vf_coef, max_grad_norm=max_grad_norm, data_aug=args.data_aug, use_rand_conv=args.use_rand_conv, model_fn=get_mixreg_model( mix_mode=args.mix_mode, mix_alpha=args.mix_alpha, mix_beta=args.mix_beta, use_l2reg=args.use_l2reg, l2reg_coeff=args.l2reg_coeff), # JAG: Pass adversarial parameters adv_epsilon=args.adv_epsilon, adv_lr=args.adv_lr, adv_gamma=args.adv_gamma, adv_thresh=args.adv_thresh, adv_ratio=adv_ratio, eval_env=eval_env, adv_epochs=args.adv_epochs, ) # Saving logger.info("saving final model") if rank == 0: checkdir = os.path.join(logger.get_dir(), 
'checkpoints') os.makedirs(checkdir, exist_ok=True) model.save(os.path.join(checkdir, 'final_model.ckpt'))
def rollout_fn(num_steps, env_name, num_envs, distribution_mode, num_levels, start_level, timesteps_per_proc, is_test_worker=False, log_dir='/tmp/procgen', comm=None, load_path=None): learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 use_vf_clipping = True mpi_rank_weight = 0 if is_test_worker else 1 num_levels = 0 if is_test_worker else num_levels if log_dir is not None: log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs, filename="rollout") logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256) logger.info("training") ppo2.rollout( env=venv, network=conv_fn, total_timesteps=timesteps_per_proc, save_interval=0, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=0.5, max_grad_norm=0.5, load_path = load_path, num_steps=num_steps, num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode )
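# Usage sketch for rollout_fn above (checkpoint path and rollout length are
# placeholders; num_levels=0 samples from the full Procgen level distribution):
def _example_rollout_fn():
    from mpi4py import MPI
    rollout_fn(num_steps=1000, env_name='coinrun', num_envs=8,
               distribution_mode='hard', num_levels=0, start_level=0,
               timesteps_per_proc=1_000_000, log_dir='/tmp/procgen',
               comm=MPI.COMM_WORLD, load_path='checkpoints/final_model')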
def main(env_name, paint_vel_info, distribution_mode, num_levels, start_level, log_interval, iter_loss, arch, eval, num_envs, learning_rate, lr_schedule, ent_coef, gamma, lam, nsteps, nminibatches, ppo_epochs, clip_range, timesteps_per_proc, use_vf_clipping, _run, is_test_worker, timestep_factor): comm = MPI.COMM_WORLD log_comm = comm.Split(1 if is_test_worker else 0, 0) logger._run = _run # Configure logger format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure(dir="{}/id_{}".format(LOG_DIR, _run._id), format_strs=format_strs) # Add sacred logger: if log_comm.Get_rank() == 0: logger.get_current().output_formats.append( SacredOutputFormat(_run, timestep_factor)) num_levels = 0 if is_test_worker else num_levels mpi_rank_weight = 0 if is_test_worker else 1 logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, paint_vel_info=paint_vel_info, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn_with_ibac( x, iter_loss=iter_loss, arch=arch, depths=[16, 32, 32], emb_size=256) logger.info("training") ppo_iter.learn( env=venv, network=conv_fn, total_timesteps=timesteps_per_proc, ## Iter iter_loss=iter_loss, arch=arch, _run=_run, ## Rest nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=log_interval, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, learning_rate=learning_rate, lr_schedule=lr_schedule, cliprange=clip_range, vf_coef=0.5, max_grad_norm=0.5, eval=eval, )
return eval_episode_rewards if __name__ == "__main__": args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() device = torch.device("cuda:0" if args.cuda else "cpu") args.num_processes = 1 venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \ num_levels=args.num_levels, start_level=args.start_level, \ distribution_mode=args.distribution_mode, render_mode="rgb_array") venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) envs = VecPyTorchProcgen(venv, device) obs_shape = envs.observation_space.shape actor_critic = Policy(obs_shape, envs.action_space.n, base_kwargs={ 'recurrent': False, 'hidden_size': args.hidden_size }) actor_critic.to(device) aug_id = data_augs.Identity
def main(): num_envs = 64 learning_rate = 5e-4 ent_coef = .01 ##new defined vf_coef = 0.5 max_grad_norm = 0.5 ########### gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 # timesteps_per_proc = 50_000_000 use_vf_clipping = True parser = argparse.ArgumentParser(description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='coinrun') parser.add_argument('--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=0) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--total_timesteps', type=int, default=0) args = parser.parse_args() test_worker_interval = args.test_worker_interval comm = MPI.COMM_WORLD rank = comm.Get_rank() is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 num_levels = 0 if is_test_worker else args.num_levels log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure(dir=LOG_DIR, format_strs=format_strs, log_suffix="_total_timesteps_{}_num_levels_{}".format(args.total_timesteps, num_levels)) '''logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False)''' logger.info("Creating dropout evaluation environment") eval_venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=100, start_level=2000, distribution_mode=args.distribution_mode) eval_venv = VecExtractDictObs(eval_venv, "rgb") eval_venv = VecMonitor( venv=eval_venv, filename=None, keep_buf=100, ) eval_venv = VecNormalize(venv=eval_venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn(x, is_train=False, depths=[16,32,32], emb_size=256) logger.info("testing dropout") policy = build_policy(eval_venv,conv_fn) nenvs = eval_venv.num_envs ob_space = eval_venv.observation_space ac_space = eval_venv.action_space nbatch = nenvs * nsteps nbatch_train = nbatch//nminibatches # Instantiate the model object (that creates act_model and train_model) from baselines.ppo2.model import Model model_fn = Model #modified from baseline ppo2 learn model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) model.load(MODEL_PATH) eval_runner = Runner(env=eval_venv, model=model, nsteps=nsteps, gamma=.999, lam=.95) eval_epinfobuf = deque(maxlen=100) nupdates = args.total_timesteps//nbatch log_interval = 1 for update in range(1, nupdates+1): #single upate to test eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() eval_epinfobuf.extend(eval_epinfos) if update % log_interval == 0 or update == 1: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for 
epinfo in eval_epinfobuf]) ) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) logger.logkv('misc/total_timesteps',update*nbatch) logger.dumpkvs() eval_venv.close()
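# --- Hedged CLI sketch (not part of the original source) ---
# The dropout-evaluation main() above is driven by the argparse flags it defines; a typical
# invocation might look like the following (the script name is hypothetical):
#   python eval_dropout.py --env_name coinrun --distribution_mode hard \
#       --num_levels 0 --start_level 0 --total_timesteps 1000000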
def train_fn(env_name, num_envs, distribution_mode, num_levels, start_level, timesteps_per_proc, level_sampler_strategy, score_transform, model_name, is_test_worker=False, save_dir='./', comm=None): learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 use_vf_clipping = True mpi_rank_weight = 0 if is_test_worker else 1 num_levels = 0 if is_test_worker else num_levels log_dir = save_dir + 'logs/' + model_name if log_dir is not None: log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout', 'tensorboard' ] if log_comm.Get_rank() == 0 else [] logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs) logger.info("creating environment") eval_env = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=500, start_level=0, distribution_mode=distribution_mode) eval_env = VecExtractDictObs(eval_env, "rgb") eval_env = VecMonitor( venv=eval_env, filename=None, keep_buf=100, ) eval_env = VecNormalize(venv=eval_env, ob=False, ret=True) logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32]) logger.info("training") model = ppo2.learn(network=conv_fn, total_timesteps=timesteps_per_proc, num_levels=num_levels, eval_env=eval_env, save_interval=0, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=0.5, max_grad_norm=0.5, level_sampler_strategy=level_sampler_strategy, score_transform=score_transform) model.save(save_dir + 'models/' + model_name)
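# --- Hedged usage sketch (not part of the original source) ---
# Shows how train_fn above might be called to train with a prioritized level sampler and
# save the resulting model under save_dir; all values (including the model name) are illustrative.
from mpi4py import MPI

train_fn(env_name='fruitbot',
         num_envs=64,
         distribution_mode='easy',
         num_levels=50,
         start_level=0,
         timesteps_per_proc=20_000_000,
         level_sampler_strategy='value_l1',
         score_transform='rank',
         model_name='plr_run_0',   # hypothetical model name
         is_test_worker=False,
         save_dir='./',
         comm=MPI.COMM_WORLD)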
def train(args): args.cuda = not args.no_cuda and torch.cuda.is_available() print('Using CUDA: {}'.format(args.cuda)) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) log_dir = args.log_dir if not log_dir.startswith('gs://'): log_dir = os.path.expanduser(args.log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if not args.preempt: utils.cleanup_log_dir(log_dir) try: gfile.makedirs(log_dir) except: pass log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name, args.seed) save_dir = os.path.join(log_dir, 'checkpoints', log_file) gfile.makedirs(save_dir) venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \ num_levels=args.num_levels, start_level=args.start_level, \ distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) envs = VecPyTorchProcgen(venv, device) obs_shape = envs.observation_space.shape actor_critic = Policy(obs_shape, envs.action_space.n, base_kwargs={ 'recurrent': False, 'hidden_size': args.hidden_size }) actor_critic.to(device) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, aug_type=args.aug_type, split_ratio=args.split_ratio, store_policy=args.use_pse) batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch) if args.use_ucb: aug_id = data_augs.Identity aug_list = [ aug_to_func[t](batch_size=batch_size) for t in list(aug_to_func.keys()) ] agent = algo.UCBDrAC(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_list=aug_list, aug_id=aug_id, aug_coef=args.aug_coef, num_aug_types=len(list(aug_to_func.keys())), ucb_exploration_coef=args.ucb_exploration_coef, ucb_window_length=args.ucb_window_length) elif args.use_meta_learning: aug_id = data_augs.Identity aug_list = [aug_to_func[t](batch_size=batch_size) \ for t in list(aug_to_func.keys())] aug_model = AugCNN() aug_model.to(device) agent = algo.MetaDrAC(actor_critic, aug_model, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, meta_grad_clip=args.meta_grad_clip, meta_num_train_steps=args.meta_num_train_steps, meta_num_test_steps=args.meta_num_test_steps, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_coef=args.aug_coef) elif args.use_rl2: aug_id = data_augs.Identity aug_list = [ aug_to_func[t](batch_size=batch_size) for t in list(aug_to_func.keys()) ] rl2_obs_shape = [envs.action_space.n + 1] rl2_learner = Policy(rl2_obs_shape, len(list(aug_to_func.keys())), base_kwargs={ 'recurrent': True, 'hidden_size': args.rl2_hidden_size }) rl2_learner.to(device) agent = algo.RL2DrAC(actor_critic, rl2_learner, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.rl2_entropy_coef, lr=args.lr, eps=args.eps, rl2_lr=args.rl2_lr, rl2_eps=args.rl2_eps, max_grad_norm=args.max_grad_norm, aug_list=aug_list, aug_id=aug_id, aug_coef=args.aug_coef, num_aug_types=len(list(aug_to_func.keys())), recurrent_hidden_size=args.rl2_hidden_size, num_actions=envs.action_space.n, device=device) elif args.use_rad: aug_id = data_augs.Identity aug_func = aug_to_func[args.aug_type](batch_size=batch_size) pse_coef = args.pse_coef if args.use_pse: assert args.pse_coef > 0, "Please pass a non-zero 
pse_coef" else: pse_coef = 0.0 print("Running RAD ..") print( "PSE: {}, Coef: {}, Gamma: {}, Temp: {}, Coupling Temp: {}".format( args.use_pse, pse_coef, args.pse_gamma, args.pse_temperature, args.pse_coupling_temperature)) print('use_augmentation: {}'.format(args.use_augmentation)) agent = algo.RAD( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_func=aug_func, env_name=args.env_name, use_augmentation=args.use_augmentation, pse_gamma=args.pse_gamma, pse_coef=pse_coef, pse_temperature=args.pse_temperature, pse_coupling_temperature=args.pse_coupling_temperature) else: aug_id = data_augs.Identity aug_func = aug_to_func[args.aug_type](batch_size=batch_size) pse_coef = args.pse_coef if args.use_pse: assert args.pse_coef > 0, "Please pass a non-zero pse_coef" else: pse_coef = 0.0 print("Running DraC ..") print("PSE: {}, Coef: {}, Gamma: {}, Temp: {}".format( args.use_pse, pse_coef, args.pse_gamma, args.pse_temperature)) agent = algo.DrAC(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_func=aug_func, aug_coef=args.aug_coef, env_name=args.env_name, pse_gamma=args.pse_gamma, pse_coef=pse_coef, pse_temperature=args.pse_temperature) checkpoint_path = os.path.join(save_dir, "agent" + log_file + ".pt") if gfile.exists(checkpoint_path) and args.preempt: with gfile.GFile(checkpoint_path, 'rb') as f: inbuffer = io.BytesIO(f.read()) checkpoint = torch.load(inbuffer) agent.actor_critic.load_state_dict(checkpoint['model_state_dict']) agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) init_epoch = checkpoint['epoch'] + 1 print('Loaded ckpt from epoch {}'.format(init_epoch - 1)) logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout', 'tensorboard'], log_suffix=log_file, init_step=init_epoch) else: init_epoch = 0 logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout', 'tensorboard'], log_suffix=log_file, init_step=init_epoch) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(init_epoch, num_updates): actor_critic.train() for step in range(args.num_steps): # Sample actions with torch.no_grad(): obs_id = aug_id(rollouts.obs[step]) value, action, action_log_prob, recurrent_hidden_states, pi = actor_critic.act( obs_id, rollouts.recurrent_hidden_states[step], rollouts.masks[step], policy=True) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks, pi=pi) with torch.no_grad(): obs_id = aug_id(rollouts.obs[-1]) next_value = actor_critic.get_value( obs_id, rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.gae_lambda) if args.use_ucb and j > 0: agent.update_ucb_values(rollouts) value_loss, action_loss, dist_entropy, pse_loss = agent.update( rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "\nUpdate {}, step {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}" .format(j, total_num_steps, len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), dist_entropy, value_loss, action_loss)) logger.logkv("train/nupdates", j) logger.logkv("train/total_num_steps", total_num_steps) logger.logkv("losses/dist_entropy", dist_entropy) logger.logkv("losses/value_loss", value_loss) logger.logkv("losses/action_loss", action_loss) if args.use_pse: logger.logkv("losses/pse_loss", pse_loss) logger.logkv("train/mean_episode_reward", np.mean(episode_rewards)) logger.logkv("train/median_episode_reward", np.median(episode_rewards)) ### Eval on the Full Distribution of Levels ### eval_episode_rewards = evaluate(args, actor_critic, device, aug_id=aug_id) logger.logkv("test/mean_episode_reward", np.mean(eval_episode_rewards)) logger.logkv("test/median_episode_reward", np.median(eval_episode_rewards)) logger.dumpkvs() # Save Model if (j > 0 and j % args.save_interval == 0 or j == num_updates - 1) and save_dir != "": try: gfile.makedirs(save_dir) except OSError: pass ckpt_file = os.path.join(save_dir, "agent" + log_file + ".pt") outbuffer = io.BytesIO() torch.save( { 'epoch': j, 'model_state_dict': agent.actor_critic.state_dict(), 'optimizer_state_dict': agent.optimizer.state_dict() }, outbuffer) with gfile.GFile(ckpt_file, 'wb') as fout: fout.write(outbuffer.getvalue()) save_num_steps = (j + 1) * args.num_processes * args.num_steps print("\nUpdate {}, step {}, Saved {}.".format( j, save_num_steps, ckpt_file))
def main(): num_envs = 64 learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 # nsteps = (128 // 8) nsteps = (128 // 8) nminibatches = 8 ppo_epochs = 3 clip_range = .2 timesteps_per_proc = 1_000_000 use_vf_clipping = True dist_mode = "easy" env_name = "visual-cartpole" num_levels = 100 # disc_coeff = None disc_coeff = 0. if disc_coeff is None: LOG_DIR = "/home/josh/" + env_name + "/" + env_name + "_disc_coeff_ramping2_num_levels_" + str(num_levels) + "_nsteps_" + str(nsteps) else: LOG_DIR = "/home/josh/" + env_name + "_easy_vae/" + env_name + "_disc_coeff_" + str(disc_coeff) + "_num_levels_" + str(num_levels) + "_nsteps_" + str(nsteps) test_worker_interval = 0 comm = MPI.COMM_WORLD rank = comm.Get_rank() is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout', 'tensorboard'] if log_comm.Get_rank() == 0 else [] logger.configure(dir=LOG_DIR, format_strs=format_strs) logger.info("creating environment") if env_name == "visual-cartpole": venv = gym.vector.make('cartpole-visual-v1', num_envs=num_envs, num_levels=num_levels) venv.observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8) venv.action_space = gym.spaces.Discrete(2) else: venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=0, distribution_mode=dist_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) if env_name == "visual-cartpole": test_venv = gym.vector.make('cartpole-visual-v1', num_envs=num_envs, num_levels=0) test_venv.observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8) test_venv.action_space = gym.spaces.Discrete(2) else: test_venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=0, start_level=1000, distribution_mode=dist_mode) test_venv = VecExtractDictObs(test_venv, "rgb") test_venv = VecMonitor( venv=test_venv, filename=None, keep_buf=100, ) # test_venv = VecExtractDictObs(test_venv, "rgb") test_venv = VecNormalize(venv=test_venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 config.gpu_options.per_process_gpu_memory_fraction = 0.9 sess = tf.Session(config=config) sess.__enter__() # conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256) # conv_fn = lambda x: nature_cnn(x) conv_fn = lambda x: build_darla_vae(x, emb_size=256) logger.info("training") ppo2.learn( env=venv, network=conv_fn, total_timesteps=timesteps_per_proc, save_interval=0, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=0.5, max_grad_norm=0.5, eval_env=test_venv, num_levels=num_levels, disc_coeff=disc_coeff, )
def main(): # Hyperparameters num_envs = 128 learning_rate = 5e-4 ent_coef = .01 vf_coef = 0.5 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 max_grad_norm = 0.5 timesteps_per_proc = 100_000_000 use_vf_clipping = True # Parse arguments parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='coinrun') parser.add_argument( '--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=0) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', type=int, default=1) parser.add_argument('--gpus_id', type=str, default='') args = parser.parse_args() # Setup test worker comm = MPI.COMM_WORLD rank = comm.Get_rank() test_worker_interval = args.test_worker_interval is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 # Setup env specs env_name = args.env_name num_levels = 0 if is_test_worker else args.num_levels start_level = args.start_level # Setup logger log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure(dir=LOG_DIR + f'/{args.env_name}/run_{args.run_id}', format_strs=format_strs) # Create env logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) # Setup Tensorflow logger.info("creating tf session") if args.gpus_id: gpus_id = [x.strip() for x in args.gpus_id.split(',')] os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank] setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() # Setup model conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32]) # Training logger.info("training") ppo2.Runner = NetRandRunner ppo2.build_policy = build_policy model = ppo2.learn( env=venv, network=conv_fn, total_timesteps=timesteps_per_proc, save_interval=0, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=vf_coef, max_grad_norm=max_grad_norm, model_fn=NetRandModel, ) # Saving logger.info("saving final model") if rank == 0: checkdir = os.path.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) model.save(os.path.join(checkdir, 'final_model.ckpt'))
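# --- Hedged CLI sketch (not part of the original source) ---
# main() above is MPI-aware and parses the flags defined in its argparse parser; a typical
# multi-worker launch might look like this (the script name is hypothetical):
#   mpiexec -np 4 python train_netrand.py --env_name coinrun --distribution_mode hard \
#       --num_levels 200 --start_level 0 --test_worker_interval 4 --run_id 1 --gpus_id 0,1,2,3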
def main(): parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='fruitbot') parser.add_argument( '--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=50) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', '-id', type=int, default=99) parser.add_argument('--nupdates', type=int, default=0) parser.add_argument('--total_tsteps', type=int, default=0) parser.add_argument('--log_interval', type=int, default=5) parser.add_argument('--load_id', type=int, default=int(-1)) parser.add_argument('--nrollouts', '-nroll', type=int, default=0) parser.add_argument('--test', default=False, action="store_true") parser.add_argument('--use_model', type=int, default=1, help="either model #1 or #2") parser.add_argument('--train_level', type=int, default=50) args = parser.parse_args() #timesteps_per_proc if args.nupdates: timesteps_per_proc = int(args.nupdates * num_envs * nsteps) if not args.total_tsteps: args.total_tsteps = TIMESTEPS_PER_PROC ## use global 20_000_000 if not specified in args! if args.nrollouts: total_timesteps = int(args.nrollouts * num_envs * nsteps) run_ID = 'run_' + str(args.run_id).zfill(2) if args.test: args.log_interval = 1 args.total_tsteps = 1_000_000 run_ID += '_test{}_model{}'.format(args.load_id, args.use_model) load_path = None if args.load_id > -1: load_path = join(SAVE_PATH, args.env_name, 'saved_ensemble2_v{}.tar'.format(args.load_id)) test_worker_interval = args.test_worker_interval comm = MPI.COMM_WORLD rank = comm.Get_rank() is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 num_levels = 0 if is_test_worker else args.num_levels log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else [] if args.test: logpath = join('log2/ensemble2', args.env_name, 'test', run_ID) else: logpath = join('log2/ensemble2', args.env_name, 'train', run_ID) save_path = join(SAVE_PATH, args.env_name, 'saved_ensemble2_v{}.tar'.format(args.run_id)) logger.info("\n Model will be saved to file {}".format(save_path)) if not os.path.exists(logpath): os.system("mkdir -p %s" % logpath) logger.configure(dir=logpath, format_strs=format_strs) fpath = join(logpath, 'args_{}.json'.format(run_ID)) with open(fpath, 'w') as fh: json.dump(vars(args), fh, indent=4, sort_keys=True) print("\nSaved args at:\n\t{}\n".format(fpath)) logger.info("creating tf session") setup_mpi_gpus() if not args.test: config = tf.compat.v1.ConfigProto(\ allow_soft_placement=True, log_device_placement=True)# device_count={'GPU':0}) config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.compat.v1.Session(config=config) logger.info("creating 2 environments") n_levels = int(args.num_levels / 2) env1 = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=n_levels, start_level=0, distribution_mode=args.distribution_mode) env1 = VecExtractDictObs(env1, "rgb") env1 = VecMonitor( venv=env1, filename=None, keep_buf=100, ) env1 = VecNormalize(venv=env1, ob=False) env2 = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=n_levels, start_level=n_levels, distribution_mode=args.distribution_mode) env2 = 
VecExtractDictObs(env2, "rgb") env2 = VecMonitor( venv=env2, filename=None, keep_buf=100, ) env2 = VecNormalize(venv=env2, ob=False) train(run_ID, save_path, load_path, env1, env2, sess, logger, args) else: use_model = args.use_model ## 1 or 2 alt_flag = use_model - 1 test_all(alt_flag, load_path, logger, args)
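# --- Hedged CLI sketch (not part of the original source) ---
# The ensemble script above supports a training mode and a test mode via its argparse flags;
# illustrative invocations (the script name is hypothetical):
#   train: python train_ensemble.py --env_name fruitbot --num_levels 50 --run_id 1
#   test:  python train_ensemble.py --env_name fruitbot --test --load_id 1 --use_model 2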
def learn(*, network, total_timesteps, num_levels=50, start_level=500, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, num_processes=64, num_steps=256, level_replay_temperature=0.1, level_replay_rho=1.0, level_replay_nu=0.5, level_replay_alpha=1.0, staleness_coef=0.1, staleness_temperature=1.0, level_sampler_strategy='value_l1', score_transform='rank', **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) level_sampler_args = dict(num_actors=num_processes, strategy=level_sampler_strategy, replay_schedule='proportionate', score_transform=score_transform, temperature=level_replay_temperature, rho=level_replay_rho, nu=level_replay_nu, alpha=level_replay_alpha, staleness_coef=staleness_coef, staleness_transform='power', staleness_temperature=staleness_temperature) env = ProcgenEnv(num_envs=num_processes, env_name='fruitbot', \ num_levels=1, start_level=start_level, \ distribution_mode='easy', paint_vel_info=False) env = VecExtractDictObs(env, "rgb") env = VecMonitor(venv=env, filename=None, keep_buf=100) env = VecNormalize(venv=env, ob=False, ret=True) seeds = [start_level + i for i in range(num_levels)] level_sampler = LevelSampler(seeds, env.observation_space, env.action_space, **level_sampler_args) env = VecProcgen(env, level_sampler=level_sampler) rollouts = RolloutStorage(num_steps, num_processes, env.observation_space.shape, env.action_space) level_seeds = np.zeros(num_processes) obs, level_seeds = env.reset() level_seeds = level_seeds.reshape(-1, 1) rollouts.obs[0] = obs policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, rollouts=rollouts) if eval_env is not None: eval_runner = EvalRunner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos, = runner.run( level_seeds=level_seeds) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos, = eval_runner.run( ) #pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Update level sampler level_sampler.update_with_rollouts(rollouts) rollouts.after_update() level_sampler.after_update() # Here what we're going to do is for each minibatch 
calculate the loss and append it. mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) np.save('gdrive/MyDrive/182 Project/sampled_levels.npy', level_sampler.sampled_levels) return model
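# --- Hedged usage sketch (not part of the original source) ---
# Illustrates how the level-replay learn() above might be driven; the IMPALA CNN builder
# import and all hyperparameter values are illustrative, and eval_env is omitted for brevity.
from baselines.common.models import build_impala_cnn

conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])
model = learn(network=conv_fn,
              total_timesteps=25_000_000,
              num_levels=50,
              start_level=500,
              nsteps=256,
              nminibatches=8,
              noptepochs=3,
              ent_coef=0.01,
              lr=5e-4,
              gamma=0.999,
              lam=0.95,
              cliprange=0.2,
              level_sampler_strategy='value_l1',
              score_transform='rank')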
def train(args): args.cuda = not args.no_cuda and torch.cuda.is_available() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) log_dir = os.path.expanduser(args.log_dir) utils.cleanup_log_dir(log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name, args.seed) venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \ num_levels=args.num_levels, start_level=args.start_level, \ distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) envs = VecPyTorchProcgen(venv, device) obs_shape = envs.observation_space.shape ################################ actor_critic = Policy(obs_shape, envs.action_space.n, base_kwargs={ 'recurrent': False, 'hidden_size': args.hidden_size }) actor_critic.to(device) ################################ rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, aug_type=args.aug_type, split_ratio=args.split_ratio) batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch) ################################ if args.use_ucb: aug_id = data_augs.Identity aug_list = [ aug_to_func[t](batch_size=batch_size) for t in list(aug_to_func.keys()) ] agent = algo.UCBDrAC(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_list=aug_list, aug_id=aug_id, aug_coef=args.aug_coef, num_aug_types=len(list(aug_to_func.keys())), ucb_exploration_coef=args.ucb_exploration_coef, ucb_window_length=args.ucb_window_length) elif args.use_meta_learning: aug_id = data_augs.Identity aug_list = [aug_to_func[t](batch_size=batch_size) \ for t in list(aug_to_func.keys())] aug_model = AugCNN() aug_model.to(device) agent = algo.MetaDrAC(actor_critic, aug_model, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, meta_grad_clip=args.meta_grad_clip, meta_num_train_steps=args.meta_num_train_steps, meta_num_test_steps=args.meta_num_test_steps, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_coef=args.aug_coef) elif args.use_rl2: aug_id = data_augs.Identity aug_list = [ aug_to_func[t](batch_size=batch_size) for t in list(aug_to_func.keys()) ] rl2_obs_shape = [envs.action_space.n + 1] rl2_learner = Policy(rl2_obs_shape, len(list(aug_to_func.keys())), base_kwargs={ 'recurrent': True, 'hidden_size': args.rl2_hidden_size }) rl2_learner.to(device) agent = algo.RL2DrAC(actor_critic, rl2_learner, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.rl2_entropy_coef, lr=args.lr, eps=args.eps, rl2_lr=args.rl2_lr, rl2_eps=args.rl2_eps, max_grad_norm=args.max_grad_norm, aug_list=aug_list, aug_id=aug_id, aug_coef=args.aug_coef, num_aug_types=len(list(aug_to_func.keys())), recurrent_hidden_size=args.rl2_hidden_size, num_actions=envs.action_space.n, device=device) else: aug_id = data_augs.Identity aug_func = aug_to_func[args.aug_type](batch_size=batch_size) agent = algo.DrAC(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, aug_id=aug_id, aug_func=aug_func, aug_coef=args.aug_coef, env_name=args.env_name) checkpoint_path 
= os.path.join(args.save_dir, "agent" + log_file + ".pt") if os.path.exists(checkpoint_path) and args.preempt: checkpoint = torch.load(checkpoint_path) agent.actor_critic.load_state_dict(checkpoint['model_state_dict']) agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) init_epoch = checkpoint['epoch'] + 1 logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout'], log_suffix=log_file + "-e%s" % init_epoch) else: init_epoch = 0 logger.configure(dir=args.log_dir, format_strs=['csv', 'stdout'], log_suffix=log_file) obs = envs.reset() # reset the vectorized envs rollouts.obs[0].copy_(obs) # store the initial obs rollouts.to(device) episode_rewards = deque(maxlen=10) # args.num_steps -> 256, number of forward steps in A2C # args.num_env_steps -> 25e6, number of environment steps to train num_updates = int( args.num_env_steps) // args.num_processes // args.num_steps # todo : treating this as an epoch... but the step at which each episode terminates will differ for j in range(init_epoch, num_updates): actor_critic.train() for step in range(args.num_steps): # Sample actions with torch.no_grad(): obs_id = aug_id(rollouts.obs[step]) value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( obs_id, rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Observe reward and next obs # todo : check the shapes of obs, reward, done, infos obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): # todo : difference between reward and info['episode']['r'] episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): obs_id = aug_id(rollouts.obs[-1]) # todo : what is next_value for? next_value = actor_critic.get_value( obs_id, rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.gae_lambda) if args.use_ucb and j > 0: # from second epoch agent.update_ucb_values(rollouts) # update ucb # todo : this part is quite involved value_loss, action_loss, dist_entropy = agent.update(rollouts) # this part is clear now
rollouts.after_update() # save for every interval-th episode or for the last epoch total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "\nUpdate {}, step {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}" .format(j, total_num_steps, len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), dist_entropy, value_loss, action_loss)) logger.logkv("train/nupdates", j) logger.logkv("train/total_num_steps", total_num_steps) logger.logkv("losses/dist_entropy", dist_entropy) logger.logkv("losses/value_loss", value_loss) logger.logkv("losses/action_loss", action_loss) logger.logkv("train/mean_episode_reward", np.mean(episode_rewards)) logger.logkv("train/median_episode_reward", np.median(episode_rewards)) ### Eval on the Full Distribution of Levels ### eval_episode_rewards = evaluate(args, actor_critic, device, aug_id=aug_id) logger.logkv("test/mean_episode_reward", np.mean(eval_episode_rewards)) logger.logkv("test/median_episode_reward", np.median(eval_episode_rewards)) logger.dumpkvs() # Save Model if (j > 0 and j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": try: os.makedirs(args.save_dir) except OSError: pass torch.save( { 'epoch': j, 'model_state_dict': agent.actor_critic.state_dict(), 'optimizer_state_dict': agent.optimizer.state_dict(), }, os.path.join(args.save_dir, "agent" + log_file + ".pt"))
def make_vec_envs( env_name, start_level, num_levels, distribution_mode, paint_vel_info, num_processes, num_frame_stack, device, ): """ Make vector of environments. Parameters: ----------- env_name : `str` Name of environment to train on. start_level : `int` The point in the list of levels available to the environment at which to index into. num_levels : `int` The number of unique levels that can be generated. Set to 0 to use unlimited levels. distribution_mode : `str` What variant of the levels to use {easy, hard, extreme, memory, exploration}. paint_vel_info : `Boolean` Paint player velocity info in the top left corner. Only supported by certain games. num_processes : `int` How many training CPU processes to use (default: 64). This will give the number of environments to make. num_frame_stack : `int` Number of frames to stack for VecFrameStack wrapper (default: 0). device : `torch.device` CPU or GPU. Returns: -------- env : Vector of environments. """ envs = ProcgenEnv(num_envs=num_processes, env_name=env_name, start_level=start_level, num_levels=num_levels, distribution_mode=distribution_mode, paint_vel_info=paint_vel_info) # extract image from dict envs = VecExtractDictObs(envs, "rgb") # re-order channels, (H,W,C) => (C,H,W). # required for PyTorch convolution layers. envs = VecTransposeImage(envs) # records: # 1. episode reward, # 2. episode length # 3. episode time taken envs = VecMonitor(venv=envs, keep_buf=100) # normalise the rewards envs = VecNormalize(envs, ob=False) # wrapper to convert observation arrays to torch.Tensors # normalise observations / 255. envs = VecPyTorch(envs, device) # Frame stacking wrapper for vectorized environment if num_frame_stack != 0: envs = VecPyTorchFrameStack(envs, num_frame_stack, device) return envs
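# --- Hedged usage sketch (not part of the original source) ---
# Shows how make_vec_envs above might be used to build a PyTorch-ready Procgen vector env;
# the environment name and the counts below are illustrative.
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
envs = make_vec_envs(env_name='coinrun',
                     start_level=0,
                     num_levels=200,       # 0 would mean unlimited levels
                     distribution_mode='easy',
                     paint_vel_info=False,
                     num_processes=64,
                     num_frame_stack=0,    # 0 skips the VecPyTorchFrameStack wrapper
                     device=device)
obs = envs.reset()  # torch.Tensor with shape (num_processes, C, H, W)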