Code Example #1
def test_one_env(alt_flag,
                 model,
                 start_level,
                 num_levels,
                 logger,
                 args,
                 env=None):
    ## Adapted from random_ppo.learn
    if not env:
        venv = ProcgenEnv(num_envs=num_envs,
                          env_name=args.env_name,
                          num_levels=num_levels,
                          start_level=start_level,
                          distribution_mode=args.distribution_mode)
        venv = VecExtractDictObs(venv, "rgb")
        venv = VecMonitor(
            venv=venv,
            filename=None,
            keep_buf=100,
        )
        venv = VecNormalize(venv=venv, ob=False)
        env = venv

    runner = TestRunner(env=env,
                        model=model,
                        nsteps=nsteps,
                        gamma=gamma,
                        lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    mean_rewards = []
    datapoints = []
    for rollout in range(1, args.nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            alt_flag)
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('start_level', start_level)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * args.nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()
    logger.info("Average reward on levels {} ~ {}: {} ".format(
        start_level, start_level + num_levels, mean_rewards))
    return np.mean(mean_rewards)
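Nearly every snippet below builds its environment with the same wrapper chain (ProcgenEnv → VecExtractDictObs → VecMonitor → VecNormalize), and several of them call a `safemean` helper that is defined in the surrounding modules rather than shown here. The following is a minimal, self-contained sketch of both, assuming the `procgen` and OpenAI `baselines` packages are installed; the helper name `make_procgen_venv` is ours and does not appear in the examples.

import numpy as np
from procgen import ProcgenEnv
from baselines.common.vec_env import VecExtractDictObs, VecMonitor, VecNormalize


def make_procgen_venv(num_envs, env_name, num_levels, start_level, distribution_mode):
    # Raw vectorized Procgen environment; observations come back as a dict.
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                      num_levels=num_levels, start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")                       # keep only the "rgb" image
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)   # track episode returns/lengths
    venv = VecNormalize(venv=venv, ob=False)                    # normalize returns, not observations
    return venv


def safemean(xs):
    # Return NaN instead of failing when the episode buffer is still empty.
    return np.nan if len(xs) == 0 else np.mean(xs)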
Code Example #2
File: ppo2_pse.py  Project: ahmeda14960/IBAC-SNI
def generate_level_replay(ppo,mdp_id,wandb_save_dir,nbatch_train, nsteps, max_grad_norm, ob_space, ac_space, nsteps_rollout=782):
	ppo_graph = tf.Graph()
	print('Created graph')
	observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
	action_space = DiscreteG(15)

	gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_id), 
									 paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
	venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
	venv_eval = VecExtractDictObs(venv_eval, "rgb")
	venv_eval = VecMonitor(
		venv=venv_eval, filename=None, keep_buf=100,
	)
	venv_eval = VecNormalize(venv=venv_eval, ob=False)
	venv_eval = wrappers.add_final_wrappers(venv_eval)
	print('Created env')
	graph_one_vars = ppo_graph.get_all_collection_keys()

	model_path = wandb_save_dir+'/%d/ppo-1'%mdp_id

	with tf.compat.v1.Session(graph=ppo_graph,config=tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)) as sess_1:
		with tf.compat.v1.variable_scope("model_%d"%np.random.randint(0,100000,1).item()):
			ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
			initialize = tf.compat.v1.global_variables_initializer()
			sess_1.run(initialize)
			print('Inited session')
		model_saver = tf.train.import_meta_graph(model_path+'.meta')
		model_saver.restore(sess_1, save_path=model_path)
		print('Restored PPO')
		mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(ppo_model_1,venv_eval,nsteps=nsteps_rollout, param_vals='pretrained')
		print('Collected level data')

	venv_eval.close()

	return mb_obs_1, mb_actions_1, mb_rewards_1
Code Example #3
def make_env(steps_per_env):
	observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
	action_space = DiscreteG(15)
	if Config.FIRST_PHASE == 'exploration':
		# baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE, start_level=Config.START_LEVEL)
	else:
		# baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE, start_level=Config.START_LEVEL)
	if Config.SECOND_PHASE == 'exploration':
		# baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,  paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
		gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,  paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE, start_level=Config.START_LEVEL)
	elif Config.SECOND_PHASE != "None":
		# baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
		gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE, start_level=Config.START_LEVEL)
	else:
		baseline_vec_adapt = gym3_env_adapt = None
	
	venv_train = FakeEnv(gym3_env_train, observation_space, action_space)
	venv_train = VecExtractDictObs(venv_train, "rgb")
	if Config.SECOND_PHASE != "None":
		venv_adapt = FakeEnv(gym3_env_adapt, observation_space, action_space)   
		venv_adapt = VecExtractDictObs(venv_adapt, "rgb")
	venv_train = VecMonitor(
		venv=venv_train, filename=None, keep_buf=100,
	)
	if Config.SECOND_PHASE != "None":
		venv_adapt = VecMonitor(
			venv=venv_adapt, filename=None, keep_buf=100,
		)

	venv_train = VecNormalize(venv=venv_train, ob=False)
	venv_train = wrappers.add_final_wrappers(venv_train)
	if Config.SECOND_PHASE != "None":
		venv_adapt = VecNormalize(venv=venv_adapt, ob=False)
		venv_adapt = wrappers.add_final_wrappers(venv_adapt)

		venv = wrappers.DistributionShiftWrapperVec(env_list=[venv_train, venv_adapt], steps_per_env=steps_per_env) 
	else:
		venv = venv_train
		venv_adapt = venv_train = None
		venv.current_env_steps_left = steps_per_env

	return venv, venv_train, venv_adapt
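As a usage note that is not part of the snippet above: `make_env` returns a three-tuple, and when `Config.SECOND_PHASE` is disabled the last two entries are None. A hypothetical call is sketched below; the step budget is illustrative and the `Config` globals are assumed to be populated elsewhere in the project.

# Hypothetical usage sketch: switch between the train and adapt phases
# every one million environment steps (value is illustrative).
venv, venv_train, venv_adapt = make_env(steps_per_env=1_000_000)
# With Config.SECOND_PHASE == "None", venv_train and venv_adapt are None and
# venv is simply the wrapped training environment.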
Code Example #4
def test_all(alt_flag, load_path, logger, args):
    train_end = int(args.train_level)
    config = tf.compat.v1.ConfigProto(
        log_device_placement=True)  #device_count={'GPU':0})
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)

    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=train_end,
                      start_level=0,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = args.total_tsteps // nbatch
    args.nrollouts = nrollouts
    args.nbatch = nbatch

    model = Model(sess=sess,
                  policy=EnsembleCnnPolicy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=0.5,
                  max_grad_norm=0.5)
    model.load(load_path)
    logger.info("Model pramas loaded from saved model: ", load_path)

    mean_rewards = []
    ## first, test train performance

    mean_rewards.append(
        test_one_env(alt_flag, model, 0, train_end, logger, args, env=env))

    ## then, test on sampled intervals
    for l in TEST_START_LEVELS:
        mean_rewards.append(
            test_one_env(alt_flag, model, l, 100, logger, args, env=None))

    logger.info("All tests finished, mean reward history: ", mean_rewards)
    return
Code Example #5
File: envs.py  Project: dibyaghosh/level-replay
def make_lr_venv(num_envs, env_name, seeds, device, **kwargs):
    level_sampler = kwargs.get('level_sampler')
    level_sampler_args = kwargs.get('level_sampler_args')

    ret_normalization = not kwargs.get('no_ret_normalization', False)

    if env_name in PROCGEN_ENVS:
        num_levels = kwargs.get('num_levels', 1)
        start_level = kwargs.get('start_level', 0)
        distribution_mode = kwargs.get('distribution_mode', 'easy')
        paint_vel_info = kwargs.get('paint_vel_info', False)

        venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, \
            num_levels=num_levels, start_level=start_level, \
            distribution_mode=distribution_mode,
            paint_vel_info=paint_vel_info)
        venv = VecExtractDictObs(venv, "rgb")
        venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
        venv = VecNormalize(venv=venv, ob=False, ret=ret_normalization)

        if level_sampler_args:
            level_sampler = LevelSampler(
                seeds, 
                venv.observation_space, venv.action_space,
                **level_sampler_args)

        envs = VecPyTorchProcgen(venv, device, level_sampler=level_sampler)

    elif env_name.startswith('MiniGrid'):
        venv = VecMinigrid(num_envs=num_envs, env_name=env_name, seeds=seeds)
        venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
        venv = VecNormalize(venv=venv, ob=False, ret=ret_normalization)

        if level_sampler_args:
            level_sampler = LevelSampler(
                seeds, 
                venv.observation_space, venv.action_space,
                **level_sampler_args)

        elif seeds:
            level_sampler = LevelSampler(
                seeds,
                venv.observation_space, venv.action_space,
                strategy='random',
            )

        envs = VecPyTorchMinigrid(venv, device, level_sampler=level_sampler)

    else:
        raise ValueError(f'Unsupported env {env_name}')

    return envs, level_sampler
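As a usage note (not taken from the repository), the keyword arguments above suggest a call along the lines of the sketch below for a Procgen game. Every value is illustrative, and the 'random' strategy is the same fallback used in the MiniGrid branch above.

import torch

# Illustrative only: assumes make_lr_venv and its LevelSampler are importable
# from the surrounding module.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
envs, level_sampler = make_lr_venv(
    num_envs=64,
    env_name='bigfish',                  # must appear in PROCGEN_ENVS
    seeds=list(range(200)),              # candidate level seeds for the sampler
    device=device,
    num_levels=1,                        # the sampler decides which level runs
    start_level=0,
    distribution_mode='easy',
    paint_vel_info=False,
    level_sampler_args=dict(strategy='random'),
)
obs = envs.reset()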
Code Example #6
    def __init__(self, model, config_dir: pathlib.Path, n_trajectories: int, tunable_params: List[EnvironmentParameter]):
        self._model = model
        self._n_trajectories = n_trajectories

        # Initialize the environment
        easy_config_path = config_dir / 'test_easy_config.json'
        easy_config = copy.copy(BossfightEasyConfig)
        easy_config.to_json(easy_config_path)
        easy_env = ProcgenEnv(num_envs=1, env_name=str(easy_config.game), domain_config_path=str(easy_config_path))
        easy_env = VecExtractDictObs(easy_env, "rgb")
        easy_env = VecMonitor(venv=easy_env, filename=None, keep_buf=100)
        self.easy_env = VecNormalize(venv=easy_env, ob=False)

        hard_config_path = config_dir / 'test_hard_config.json'
        hard_config = copy.copy(BossfightHardConfig)
        hard_config.to_json(hard_config_path)
        hard_env = ProcgenEnv(num_envs=1, env_name=str(hard_config.game), domain_config_path=str(hard_config_path))
        hard_env = VecExtractDictObs(hard_env, "rgb")
        hard_env = VecMonitor(venv=hard_env, filename=None, keep_buf=100)
        self.hard_env = VecNormalize(venv=hard_env, ob=False)

        # Make a default config for bossfight...
        test_domain_config_path = config_dir / 'test_full_config.json'
        test_domain_config = DEFAULT_DOMAIN_CONFIGS['dc_bossfight']
        test_domain_config.to_json(test_domain_config_path)

        params = {}
        for param in tunable_params:
            params['min_' + param.name] = param.clip_lower_bound
            params['max_' + param.name] = param.clip_upper_bound
        test_domain_config.update_parameters(params, cache=False)

        full_env = ProcgenEnv(num_envs=1, env_name=str(test_domain_config.game), domain_config_path=str(test_domain_config_path))
        full_env = VecExtractDictObs(full_env, "rgb")
        full_env = VecMonitor(venv=full_env, filename=None, keep_buf=100)
        self.full_env = VecNormalize(venv=full_env, ob=False)
Code Example #7
def evaluate(args, actor_critic, device, num_processes=1, aug_id=None):
    actor_critic.eval()

    # Sample Levels From the Full Distribution
    venv = ProcgenEnv(num_envs=num_processes, env_name=args.env_name, \
        num_levels=0, start_level=0, \
        distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    eval_envs = VecPyTorchProcgen(venv, device)

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.ones(num_processes, 1, device=device)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            if aug_id:
                obs = aug_id(obs)
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs,
                eval_recurrent_hidden_states,
                eval_masks,
                deterministic=False)

        obs, _, done, infos = eval_envs.step(action)

        eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                   for done_ in done],
                                  dtype=torch.float32,
                                  device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()

    print("Last {} test episodes: mean/median reward {:.1f}/{:.1f}\n"\
        .format(len(eval_episode_rewards), \
        np.mean(eval_episode_rewards), np.median(eval_episode_rewards)))

    return eval_episode_rewards
Code Example #8
    def __init__(self, model, train_config_path: Union[str, pathlib.Path],
                 env_parameter: EnvironmentParameter, adr_config: ADRConfig):

        self._model = model  # Model being evaluated
        self._gamma = adr_config.gamma  # Discount rate
        self._lambda = adr_config.lmbda  # Lambda used in GAE (General Advantage Estimation)

        self._env_parameter = env_parameter
        self._param_name = self._env_parameter.name

        self._max_buffer_size = adr_config.max_buffer_size
        self._n_trajectories = adr_config.n_eval_trajectories
        self._upper_sample_prob = adr_config.upper_sample_prob

        self._train_config_path = pathlib.Path(train_config_path)
        config_dir = self._train_config_path.parent
        config_name = self._param_name + '_adr_eval_config.json'

        # Initialize the config for the evaluation environment
        # This config will be updated regularly throughout training. When we boundary sample this environment's
        # parameter, the config will be modified to set the parameter to the selected boundary before running a number
        # of trajectories.
        self._boundary_config = DomainConfig.from_json(self._train_config_path)
        self._boundary_config_path = config_dir / config_name
        self._boundary_config.to_json(self._boundary_config_path)

        # Initialize the environment
        env = ProcgenEnv(num_envs=1,
                         env_name=str(self._boundary_config.game),
                         domain_config_path=str(self._boundary_config_path))
        env = VecExtractDictObs(env, "rgb")
        env = VecMonitor(venv=env, filename=None, keep_buf=100)
        self._env = VecNormalize(venv=env, ob=False)

        # Initialize the performance buffers
        self._upper_performance_buffer, self._lower_performance_buffer = PerformanceBuffer(
        ), PerformanceBuffer()

        self._states = {
            'lower': model.adr_initial_state,
            'upper': model.adr_initial_state
        }
        self._obs = self._env.reset()
        self._dones = [False]
Code Example #9
def test_fn(env_name, num_envs, config_path, load_path):
    test_config_path = os.path.join(os.getcwd(), "procgen-adr", config_path)
    test_env = ProcgenEnv(num_envs=num_envs, env_name=env_name, domain_config_path=test_config_path, render_mode="rgb_array")
    test_env = VecExtractDictObs(test_env, "rgb")
    test_env = VecMonitor(venv=test_env, filename=None, keep_buf=100)
    test_env = VecNormalize(venv=test_env, ob=False)

    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    recur = True
    if recur:
        logger.info("Using CNN LSTM")
        conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)

    mean, std = test(conv_fn, test_env, load_path=load_path)
    sess.close()
    return mean, std
Code Example #10
File: test.py  Project: verystrongjoe/auto-drac
    return eval_episode_rewards


if __name__ == "__main__":

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")

    args.num_processes = 1

    venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \
        num_levels=args.num_levels, start_level=args.start_level, \
        distribution_mode=args.distribution_mode, render_mode="rgb_array")
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    envs = VecPyTorchProcgen(venv, device)

    obs_shape = envs.observation_space.shape

    actor_critic = Policy(obs_shape,
                          envs.action_space.n,
                          base_kwargs={
                              'recurrent': False,
                              'hidden_size': args.hidden_size
                          })
    actor_critic.to(device)

    aug_id = data_augs.Identity
Code Example #11
def main(env_name, paint_vel_info, distribution_mode, num_levels, start_level,
         log_interval, iter_loss, arch, eval, num_envs, learning_rate,
         lr_schedule, ent_coef, gamma, lam, nsteps, nminibatches, ppo_epochs,
         clip_range, timesteps_per_proc, use_vf_clipping, _run, is_test_worker,
         timestep_factor):

    comm = MPI.COMM_WORLD
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    logger._run = _run

    # Configure logger
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir="{}/id_{}".format(LOG_DIR, _run._id),
                     format_strs=format_strs)

    # Add sacred logger:
    if log_comm.Get_rank() == 0:
        logger.get_current().output_formats.append(
            SacredOutputFormat(_run, timestep_factor))

    num_levels = 0 if is_test_worker else num_levels
    mpi_rank_weight = 0 if is_test_worker else 1
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      paint_vel_info=paint_vel_info,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn_with_ibac(
        x, iter_loss=iter_loss, arch=arch, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo_iter.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        ## Iter
        iter_loss=iter_loss,
        arch=arch,
        _run=_run,
        ## Rest
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=log_interval,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        learning_rate=learning_rate,
        lr_schedule=lr_schedule,
        cliprange=clip_range,
        vf_coef=0.5,
        max_grad_norm=0.5,
        eval=eval,
    )
Code Example #12
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 1_000_000  ## now this counts steps in testing runs
    use_vf_clipping = True

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    ## default start_level set beyond the training levels to test on unseen levels
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)

    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    ## Adapted from random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time() ## Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  ## different from random_ppo!
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
Code Example #13
def rollout_fn(num_steps, env_name, num_envs, distribution_mode, num_levels, start_level, timesteps_per_proc, is_test_worker=False, log_dir='/tmp/procgen', comm=None, load_path=None):
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs, filename="rollout")

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    logger.info("training")
    ppo2.rollout(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path = load_path,
        num_steps=num_steps,
        num_envs=num_envs, 
        env_name=env_name,
        num_levels=num_levels, 
        start_level=start_level, 
        distribution_mode=distribution_mode
    )
Code Example #14
def main():

    args = parse_config()
    run_dir = log_this(args, args.log_dir,
                       args.log_name + '_' + args.env_name + '_' + args.rm_id)

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=run_dir, format_strs=format_strs)

    logger.info("creating environment")

    venv = ProcgenEnv(num_envs=args.num_envs,
                      env_name=args.env_name,
                      num_levels=args.num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode,
                      use_sequential_levels=args.use_sequential_levels)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if args.rm_id:
        # load pretrained network
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        net = RewardNet().to(device)
        rm_path = glob.glob('./**/' + args.rm_id + '.rm', recursive=True)[0]
        net.load_state_dict(
            torch.load(rm_path, map_location=torch.device(device)))

        # use batch reward prediction function instead of the ground truth reward function
        # pass though sigmoid if needed
        if args.use_sigmoid:
            rew_func = lambda x: 1 / (1 + np.exp(-net.predict_batch_rewards(x))
                                      )
        else:
            rew_func = lambda x: net.predict_batch_rewards(x)

        ## Uncomment the line below to train an agent that only tries to live long
        # rew_func = lambda x: x.shape[0] * [1]

        venv = ProxyRewardWrapper(venv, rew_func)
    else:
        # true environment rewards will be used
        pass

    venv = VecNormalize(venv=venv, ob=False, use_tf=False)

    # do the rest of the training as normal
    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)

    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")

    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=args.timesteps_per_proc,
        save_interval=args.save_interval,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        lam=args.lam,
        gamma=args.gamma,
        noptepochs=args.ppo_epochs,
        log_interval=args.log_interval,
        ent_coef=args.ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=args.use_vf_clipping,
        comm=comm,
        lr=args.learning_rate,
        cliprange=args.clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path=args.load_path,
    )

    model.save(os.path.join(run_dir, 'final_model.parameters'))
Code Example #15
def train_fn(env_name: str,
             num_train_envs: int,
             n_training_steps: int,
             adr_config: ADRConfig = None,
             experiment_dir: str = None,
             tunable_params_config_path: str = None,
             log_dir: str = None,
             is_test_worker: bool = False,
             comm=None,
             save_interval: int = 1000,
             log_interval: int = 20,
             recur: bool = True):

    # Get the default ADR config if none is provided
    adr_config = ADRConfig() if adr_config is None else adr_config

    # Set up the experiment directory for this run. This will contain everything, from the domain configs for the
    # training environment and ADR evaluation environments to the logs. If the directory path is not provided, then
    # we'll make one and use the date-time name to make it unique
    if experiment_dir is None:
        experiment_dir = pathlib.Path().absolute() / 'adr_experiments' / (
            'experiment-' + datetime_name())
        experiment_dir.mkdir(parents=True, exist_ok=False)
    else:
        experiment_dir = pathlib.Path(experiment_dir)

    # Make a config directory within the experiment directory to hold the domain configs
    config_dir = experiment_dir / 'domain_configs'
    config_dir.mkdir(parents=True, exist_ok=False)

    # Load the tunable parameters from a config file if it is provided, otherwise get the default for the given game.
    if tunable_params_config_path is None:
        try:
            tunable_params = DEFAULT_TUNABLE_PARAMS[env_name]
        except KeyError:
            raise KeyError(
                f'No default tunable parameters exist for {env_name}')
    else:
        raise NotImplementedError(
            'Currently no way to load tunable parameters from a configuration file'
        )

    # Make a default config for the given game...
    train_domain_config_path = config_dir / 'train_config.json'
    try:
        train_domain_config = DEFAULT_DOMAIN_CONFIGS[env_name]
        train_domain_config.to_json(train_domain_config_path)
    except KeyError:
        raise KeyError(f'No default config exists for {env_name}')

    # ...then load the initial bounds for the tunable parameters into the config.
    params = {}
    for param in tunable_params:
        params['min_' + param.name] = param.lower_bound
        params['max_' + param.name] = param.upper_bound
    train_domain_config.update_parameters(params, cache=False)

    # Configure the logger if we are given a log directory
    if log_dir is not None:
        log_dir = experiment_dir / log_dir
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm,
                         dir=str(log_dir),
                         format_strs=format_strs)

    logger.info(f'env_name: {env_name}')
    logger.info(f'num_train_envs: {num_train_envs}')
    logger.info(f'n_training_steps: {n_training_steps}')
    logger.info(f'experiment_dir: {experiment_dir}')
    logger.info(f'tunable_params_config_path: {tunable_params_config_path}')
    logger.info(f'log_dir: {log_dir}')
    logger.info(f'save_interval: {save_interval}')

    n_steps = 256
    ent_coef = .01
    lr = 5e-4
    vf_coef = .5
    max_grad_norm = .5
    gamma = .999
    lmbda = .95
    n_minibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1

    logger.info('creating environment')
    training_env = ProcgenEnv(num_envs=num_train_envs,
                              env_name=env_name,
                              domain_config_path=str(train_domain_config_path))
    training_env = VecExtractDictObs(training_env, "rgb")
    training_env = VecMonitor(venv=training_env, filename=None, keep_buf=100)
    training_env = VecNormalize(venv=training_env, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.__enter__()

    def conv_fn(x):
        return build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    if recur:
        logger.info("Using CNN LSTM")
        conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)

    logger.info('training')
    ppo2_adr.learn(conv_fn,
                   training_env,
                   n_training_steps,
                   config_dir,
                   adr_config,
                   train_domain_config,
                   tunable_params,
                   n_steps=n_steps,
                   ent_coef=ent_coef,
                   lr=lr,
                   vf_coef=vf_coef,
                   max_grad_norm=max_grad_norm,
                   gamma=gamma,
                   lmbda=lmbda,
                   log_interval=log_interval,
                   save_interval=save_interval,
                   n_minibatches=n_minibatches,
                   n_optepochs=ppo_epochs,
                   clip_range=clip_range,
                   mpi_rank_weight=mpi_rank_weight,
                   clip_vf=use_vf_clipping)
Code Example #16
File: dropout_test.py  Project: Rmao99/train-procgen
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    ##new defined
    vf_coef = 0.5
    max_grad_norm = 0.5
    ###########
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    # timesteps_per_proc = 50_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--total_timesteps', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, 
                     format_strs=format_strs,
                     log_suffix="_total_timesteps_{}_num_levels_{}".format(args.total_timesteps,
                                                                           num_levels))

    '''logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)'''

    logger.info("Creating dropout evaluation environment")
    eval_venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=100, start_level=2000, distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")

    eval_venv = VecMonitor(
        venv=eval_venv, filename=None, keep_buf=100,
    )

    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, is_train=False, depths=[16,32,32], emb_size=256)

    logger.info("testing dropout")
    

    
    policy = build_policy(eval_venv,conv_fn)

    nenvs = eval_venv.num_envs
    ob_space = eval_venv.observation_space
    ac_space = eval_venv.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch//nminibatches
    
    # Instantiate the model object (that creates act_model and train_model)
    
    from baselines.ppo2.model import Model
    model_fn = Model    # modified from baselines ppo2 learn

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)
    model.load(MODEL_PATH)
    eval_runner = Runner(env=eval_venv, model=model, nsteps=nsteps, gamma=.999, lam=.95)

    eval_epinfobuf = deque(maxlen=100)
    nupdates = args.total_timesteps//nbatch

    log_interval = 1
    for update in range(1, nupdates+1):
    # single update to test
        eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()
        eval_epinfobuf.extend(eval_epinfos)
        if update % log_interval == 0 or update == 1:
            logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
            logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('misc/total_timesteps',update*nbatch)
            logger.dumpkvs()
    eval_venv.close()
Code Example #17
File: train_load.py  Project: chenziku/train-procgen
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    last_step = 4587520  # where we left off in training
    timesteps_per_proc = 25_000_000 - last_step
    use_vf_clipping = True
    model_path = '../train-procgen/saved_model/policy_bossfight_vae560'
    vae_path = '../train-procgen/saved_model/bossfight_vae560'

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2_cvae.learn(env=venv,
                    network=conv_fn,
                    total_timesteps=timesteps_per_proc,
                    save_interval=10,
                    nsteps=nsteps,
                    nminibatches=nminibatches,
                    lam=lam,
                    gamma=gamma,
                    noptepochs=ppo_epochs,
                    log_interval=1,
                    ent_coef=ent_coef,
                    mpi_rank_weight=mpi_rank_weight,
                    clip_vf=use_vf_clipping,
                    comm=comm,
                    lr=learning_rate,
                    cliprange=clip_range,
                    update_fn=None,
                    init_fn=None,
                    vf_coef=0.5,
                    max_grad_norm=0.5,
                    load_path=model_path,
                    vae_path=vae_path)
Code Example #18
File: train_dqn.py  Project: andyehrenberg/mixreg
def main():
    """Run DQN until the environment throws an exception."""
    # Hyperparameters
    learning_rate = 2.5e-4
    gamma = 0.99
    nstep_return = 3
    timesteps_per_proc = 50_000_000
    train_interval = 4
    target_interval = 8192
    batch_size = 512
    min_buffer_size = 20000

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='starpilot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=['no_aug', 'cutout_color', 'crop'])
    parser.add_argument('--PER',
                        type=lambda x: bool(strtobool(x)),
                        default=True,
                        help='Whether to use PER')
    parser.add_argument('--num_envs', type=int, default=64)
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    num_envs = args.num_envs

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup Rainbow models
    logger.info("building models")
    online_net, target_net = rainbow_models(
        sess,
        venv.action_space.n,
        gym_space_vectorizer(venv.observation_space),
        min_val=REWARD_RANGE_FOR_C51[env_name][0],
        max_val=REWARD_RANGE_FOR_C51[env_name][1])
    dqn = MpiDQN(online_net,
                 target_net,
                 discount=gamma,
                 comm=comm,
                 mpi_rank_weight=mpi_rank_weight,
                 mix_mode=args.mix_mode,
                 mix_alpha=args.mix_alpha,
                 use_l2reg=args.use_l2reg,
                 data_aug=args.data_aug)
    player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return)
    optimize = dqn.optimize(learning_rate=learning_rate)

    # Initialize and sync variables
    sess.run(tf.global_variables_initializer())
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="")
    if comm.Get_size() > 1:
        sync_from_root(sess, global_variables, comm=comm)  #pylint: disable=E110

    # Training
    logger.info("training")
    if args.PER:
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0.5,
                                                        0.4,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
    else:
        #set alpha and beta equal to 0 for uniform prioritization and no importance sampling
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0,
                                                        0,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
Code Example #19
File: test.py  Project: yanlai00/train-procgen
def main():
    num_envs = 64
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    total_timesteps = 1_000_000

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    parser.add_argument('--start_level', type=int, default=50)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=50)
    parser.add_argument('--use', type=str, default="randcrop")
    parser.add_argument('--arch', type=str, default="impala")
    parser.add_argument('--no_bn', dest='use_batch_norm', action='store_false')
    parser.add_argument('--netrand', dest='netrand', action='store_true')
    parser.set_defaults(use_batch_norm=True)

    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    arch = args.arch
    use_batch_norm = args.use_batch_norm
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)
    print(args.use)
    LOG_DIR = 'log/{}/test'.format(args.use)
    if not args.netrand:
        policy = CnnPolicy
    else:
        policy = RandomCnnPolicy
    load_model = "log/{}/saved_{}_v{}.tar".format(args.use, args.use,
                                                  args.load_id)

    comm = MPI.COMM_WORLD
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    model = Model(sess=sess,
                  policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  arch=arch,
                  use_batch_norm=use_batch_norm,
                  dropout=0)

    model.load(load_model)
    logger.info("Model pramas loaded from saved model: ", load_model)
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    aug_func=None)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
Code Example #20
def main():
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard',
            choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug', type=str, default='no_aug', 
            choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width', type=str, default='1x',
            choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup', type=str, default='procgen',
            choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode', type=str, default='nomix',
            choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    # JAG: Add second parameter beta to the beta distribution
    parser.add_argument('--mix_beta', type=float, default=0.2)

    # JAG: Parameters for adversarial RL
    # 1. The ending condition for adversarial gradient descent
    parser.add_argument('--adv_epsilon', type=float, default=5e-6)
    # 2. Learning rate for adversarial gradient descent
    parser.add_argument('--adv_lr', type=float, default=10)
    # 3. Adversarial penalty for observation euclidean distance
    parser.add_argument('--adv_gamma', type=float, default=0.01)
    # 4. Apply adversarial training only after 'adv_thresh' epochs of PPO training
    parser.add_argument('--adv_thresh', type=int, default=50)
    # 5. Whether to use a separate evaluation environment
    parser.add_argument('--eval_env', type=bool, default=True)
    parser.add_argument('--eval_levels', type=int, default=0)
    # 6. The ratio of adversarially augmented data:
    #    1 means the original data is fully replaced with adversarial data,
    #    0 means no adversarial data is used
    parser.add_argument('--adv_adv', type=float, default=0.5)
    # 7. The ratio for mixing original data with augmented data:
    #    1 means the augmented obs and value are used,
    #    0 means the original obs and value are used
    parser.add_argument('--adv_obs', type=float, default=1)
    parser.add_argument('--adv_value', type=float, default=1)
    # 8. The fraction of training levels to use (for generalization);
    #    1 means all available levels are used
    parser.add_argument('--adv_nenv', type=float, default=1)
    # 9. We test the first 500 epochs
    parser.add_argument('--adv_epochs', type=int, default=500)
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
                test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs
    )

    # Create env
    logger.info("creating environment")

    # JAG: Limit the maximum training levels
    train_levels = int(num_levels * args.adv_nenv)
    venv = ProcgenEnv(
            num_envs=num_envs, env_name=env_name, num_levels=train_levels,
            start_level=start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)


    # JAG: If we use eval_env
    if args.eval_env:
        eval_env = ProcgenEnv(
                num_envs=num_envs, env_name=env_name,
                num_levels=args.eval_levels, start_level=start_level,
                distribution_mode=args.distribution_mode)
        eval_env = VecExtractDictObs(eval_env, "rgb")
        eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100)
        eval_env = VecNormalize(venv=eval_env, ob=False)
    else:
        eval_env = None

    # Feed parameters to a dictionary
    adv_ratio={
            'adv': args.adv_adv,
            'obs': args.adv_obs,
            'value': args.adv_value,
            #'nenv': args.adv_nenv,
    }

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    conv_fn = lambda x: build_impala_cnn(
            x, depths=depths, use_bn=args.use_bn,
            randcnn=args.use_rand_conv and not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        use_rand_conv=args.use_rand_conv,
        model_fn=get_mixreg_model(
            mix_mode=args.mix_mode,
            mix_alpha=args.mix_alpha,
            mix_beta=args.mix_beta,
            use_l2reg=args.use_l2reg,
            l2reg_coeff=args.l2reg_coeff),
        # JAG: Pass adversarial parameters
        adv_epsilon=args.adv_epsilon,
        adv_lr=args.adv_lr,
        adv_gamma=args.adv_gamma,
        adv_thresh=args.adv_thresh,
        adv_ratio=adv_ratio,
        eval_env=eval_env,
        adv_epochs=args.adv_epochs,
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
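The ProcgenEnv -> VecExtractDictObs -> VecMonitor -> VecNormalize stack above reappears in almost every example in this listing. A minimal helper that factors it out; make_procgen_venv is hypothetical and not part of any of the listed projects, but it uses only the procgen and baselines imports those scripts already rely on:

from procgen import ProcgenEnv
from baselines.common.vec_env import VecExtractDictObs, VecMonitor, VecNormalize


def make_procgen_venv(num_envs, env_name, num_levels, start_level,
                      distribution_mode, normalize_returns=True):
    # Build the vectorized Procgen environment and apply the standard wrapper
    # stack used by the training and testing scripts in this listing.
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name,
                      num_levels=num_levels, start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")                       # dict obs -> RGB array
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)   # episode reward/length stats
    venv = VecNormalize(venv=venv, ob=False, ret=normalize_returns)
    return venv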
Code example #21
def main():
    # get model path
    parser = argparse.ArgumentParser(description="Parse testing arguments")
    parser.add_argument('--model_path',
                        type=str,
                        default=None,
                        help='Path to model checkpoint.')
    parser.add_argument('--config',
                        type=str,
                        default='configurations/ppo_baseline_cuda.yaml',
                        help='Path to configuration file.')
    args = parser.parse_args()
    if args.model_path is None or not os.path.exists(args.model_path):
        raise OSError("Invalid model file supplied")

    # create configuration
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config)

    # create save directory
    model_file_path = args.model_path
    exp_creation_time = os.path.normpath(model_file_path).split(os.sep)[-3]
    print(exp_creation_time)
    exp_dir = f"runs/{cfg.EXPERIMENT_NAME}/{exp_creation_time}_test/"
    os.makedirs(exp_dir, exist_ok=True)

    # create logger
    format_strs = ['csv', 'stdout']
    logger.configure(dir=exp_dir,
                     format_strs=format_strs,
                     log_suffix=datetime.now().strftime('%Y-%m-%d-%H-%M'))

    # create (vectorized) procgen environment
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=cfg.TEST.NUM_ENVS,
                      env_name="fruitbot",
                      num_levels=cfg.TEST.NUM_LEVELS,
                      start_level=cfg.TEST.LEVEL_SEED,
                      distribution_mode="easy")
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    # create tensorflow session
    logger.info("creating tf session")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # create cnn todo: make this less ugly
    conv_fn = None
    logger.info("building cnn")
    if cfg.TRAIN.NETWORK == "NATURE_CNN":
        conv_fn = lambda x: nature_cnn(x)
    elif cfg.TRAIN.NETWORK == "IMPALA_CNN":
        conv_fn = lambda x: build_impala_cnn(
            x, depths=[16, 32, 32], emb_size=256)

    # testing
    logger.info("testing")
    ppo2.learn(env=venv,
               network=conv_fn,
               total_timesteps=cfg.TEST.TIMESTEPS,
               save_interval=0,
               nsteps=cfg.TEST.BATCH_SIZE,
               nminibatches=cfg.TRAIN.MINIBATCHES,
               lam=cfg.TRAIN.LAM,
               gamma=cfg.TRAIN.GAMMA,
               noptepochs=cfg.TRAIN.NUM_EPOCHS,
               log_interval=1,
               clip_vf=cfg.TRAIN.USE_VF_CLIPPING,
               lr=cfg.TRAIN.LR,
               cliprange=cfg.TRAIN.CLIP_RANGE,
               update_fn=None,
               init_fn=None,
               vf_coef=0.5,
               max_grad_norm=0.5,
               test=True,
               load_path=model_file_path)
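The script above reads its hyperparameters through a yacs-style config (get_cfg_defaults plus cfg.merge_from_file). A minimal sketch of what such a defaults module could look like, limited to the keys the script actually reads; the key names come from the code above, while the default values here are only assumptions:

from yacs.config import CfgNode as CN

_C = CN()
_C.EXPERIMENT_NAME = "ppo_baseline"

_C.TRAIN = CN()
_C.TRAIN.NETWORK = "IMPALA_CNN"      # or "NATURE_CNN"
_C.TRAIN.MINIBATCHES = 8
_C.TRAIN.LAM = 0.95
_C.TRAIN.GAMMA = 0.999
_C.TRAIN.NUM_EPOCHS = 3
_C.TRAIN.USE_VF_CLIPPING = True
_C.TRAIN.LR = 5e-4
_C.TRAIN.CLIP_RANGE = 0.2

_C.TEST = CN()
_C.TEST.NUM_ENVS = 64
_C.TEST.NUM_LEVELS = 500
_C.TEST.LEVEL_SEED = 0
_C.TEST.TIMESTEPS = 1_000_000
_C.TEST.BATCH_SIZE = 256


def get_cfg_defaults():
    # Return a clone so merge_from_file() never mutates the module-level defaults.
    return _C.clone()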
Code example #22
File: make_envs.py  Project: udeepam/aldm
def make_vec_envs(
    env_name,
    start_level,
    num_levels,
    distribution_mode,
    paint_vel_info,
    num_processes,
    num_frame_stack,
    device,
):
    """
    Make vector of environments.

    Parameters:
    -----------
    env_name : `str`
        Name of environment to train on.
    start_level : `int`
        The point in the list of levels available to the environment at which to index into.
    num_levels : `int`
        The number of unique levels that can be generated. Set to 0 to use unlimited levels.
    distribution_mode : `str`
        What variant of the levels to use {easy, hard, extreme, memory, exploration}.
    paint_vel_info : `Boolean`
        Paint player velocity info in the top left corner. Only supported by certain games.
    num_processes : `int`
        How many training CPU processes to use (default: 64).
        This will give the number of environments to make.
    num_frame_stack : `int`
        Number of frames to stack for VecFrameStack wrapper (default: 0).
    device : `torch.device`
        CPU or GPU.

    Returns:
    --------
    env :
        Vector of environments.
    """
    envs = ProcgenEnv(num_envs=num_processes,
                      env_name=env_name,
                      start_level=start_level,
                      num_levels=num_levels,
                      distribution_mode=distribution_mode,
                      paint_vel_info=paint_vel_info)

    # extract image from dict
    envs = VecExtractDictObs(envs, "rgb")

    # re-order channels, (H,W,C) => (C,H,W).
    # required for PyTorch convolution layers.
    envs = VecTransposeImage(envs)

    # records:
    #  1. episode reward,
    #  2. episode length
    #  3. episode time taken
    envs = VecMonitor(venv=envs, keep_buf=100)

    # normalise the rewards
    envs = VecNormalize(envs, ob=False)

    # wrapper to convert observation arrays to torch.Tensors
    # normalise observations / 255.
    envs = VecPyTorch(envs, device)

    # Frame stacking wrapper for vectorized environment
    if num_frame_stack != 0:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)

    return envs
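A minimal usage sketch for the factory above, assuming make_envs is importable as a module and PyTorch is available; the argument values are illustrative only:

import torch
from make_envs import make_vec_envs

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
envs = make_vec_envs(env_name="coinrun",
                     start_level=0,
                     num_levels=200,
                     distribution_mode="easy",
                     paint_vel_info=False,
                     num_processes=64,
                     num_frame_stack=0,   # 0 skips the VecPyTorchFrameStack wrapper
                     device=device)
obs = envs.reset()                        # torch tensor shaped (num_processes, C, H, W)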
Code example #23
def train_fn(env_name,
             num_envs,
             distribution_mode,
             num_levels,
             start_level,
             timesteps_per_proc,
             level_sampler_strategy,
             score_transform,
             model_name,
             is_test_worker=False,
             save_dir='./',
             comm=None):
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    log_dir = save_dir + 'logs/' + model_name

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout', 'tensorboard'
                       ] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(
        venv=eval_env,
        filename=None,
        keep_buf=100,
    )
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    logger.info("training")
    model = ppo2.learn(network=conv_fn,
                       total_timesteps=timesteps_per_proc,
                       num_levels=num_levels,
                       eval_env=eval_env,
                       save_interval=0,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       level_sampler_strategy=level_sampler_strategy,
                       score_transform=score_transform)
    model.save(save_dir + 'models/' + model_name)
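A sketch of how train_fn above might be driven; the concrete values are illustrative and not taken from the original project (a non-None comm is required because the function splits it for logging):

from mpi4py import MPI

comm = MPI.COMM_WORLD
train_fn(env_name='coinrun',
         num_envs=64,
         distribution_mode='easy',
         num_levels=200,
         start_level=0,
         timesteps_per_proc=25_000_000,
         level_sampler_strategy='value_l1',
         score_transform='rank',
         model_name='plr_run_01',
         is_test_worker=False,
         save_dir='./',
         comm=comm)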
Code example #24
def main():
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    env_name = args.env_name
    num_levels = 0 if is_test_worker else args.num_levels
    start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR + f'/{args.env_name}/run_{args.run_id}',
                     format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    # Training
    logger.info("training")
    ppo2.Runner = NetRandRunner
    ppo2.build_policy = build_policy
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        model_fn=NetRandModel,
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
Code example #25
def train(args):
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print('Using CUDA: {}'.format(args.cuda))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = args.log_dir
    if not log_dir.startswith('gs://'):
        log_dir = os.path.expanduser(args.log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if not args.preempt:
        utils.cleanup_log_dir(log_dir)
    try:
        gfile.makedirs(log_dir)
    except:
        pass

    log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name,
                                             args.seed)
    save_dir = os.path.join(log_dir, 'checkpoints', log_file)
    gfile.makedirs(save_dir)

    venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \
        num_levels=args.num_levels, start_level=args.start_level, \
        distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    envs = VecPyTorchProcgen(venv, device)

    obs_shape = envs.observation_space.shape
    actor_critic = Policy(obs_shape,
                          envs.action_space.n,
                          base_kwargs={
                              'recurrent': False,
                              'hidden_size': args.hidden_size
                          })
    actor_critic.to(device)

    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              aug_type=args.aug_type,
                              split_ratio=args.split_ratio,
                              store_policy=args.use_pse)

    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)

    if args.use_ucb:
        aug_id = data_augs.Identity
        aug_list = [
            aug_to_func[t](batch_size=batch_size)
            for t in list(aug_to_func.keys())
        ]

        agent = algo.UCBDrAC(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm,
                             aug_list=aug_list,
                             aug_id=aug_id,
                             aug_coef=args.aug_coef,
                             num_aug_types=len(list(aug_to_func.keys())),
                             ucb_exploration_coef=args.ucb_exploration_coef,
                             ucb_window_length=args.ucb_window_length)

    elif args.use_meta_learning:
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size) \
            for t in list(aug_to_func.keys())]

        aug_model = AugCNN()
        aug_model.to(device)

        agent = algo.MetaDrAC(actor_critic,
                              aug_model,
                              args.clip_param,
                              args.ppo_epoch,
                              args.num_mini_batch,
                              args.value_loss_coef,
                              args.entropy_coef,
                              meta_grad_clip=args.meta_grad_clip,
                              meta_num_train_steps=args.meta_num_train_steps,
                              meta_num_test_steps=args.meta_num_test_steps,
                              lr=args.lr,
                              eps=args.eps,
                              max_grad_norm=args.max_grad_norm,
                              aug_id=aug_id,
                              aug_coef=args.aug_coef)

    elif args.use_rl2:
        aug_id = data_augs.Identity
        aug_list = [
            aug_to_func[t](batch_size=batch_size)
            for t in list(aug_to_func.keys())
        ]

        rl2_obs_shape = [envs.action_space.n + 1]
        rl2_learner = Policy(rl2_obs_shape,
                             len(list(aug_to_func.keys())),
                             base_kwargs={
                                 'recurrent': True,
                                 'hidden_size': args.rl2_hidden_size
                             })
        rl2_learner.to(device)

        agent = algo.RL2DrAC(actor_critic,
                             rl2_learner,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             args.rl2_entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             rl2_lr=args.rl2_lr,
                             rl2_eps=args.rl2_eps,
                             max_grad_norm=args.max_grad_norm,
                             aug_list=aug_list,
                             aug_id=aug_id,
                             aug_coef=args.aug_coef,
                             num_aug_types=len(list(aug_to_func.keys())),
                             recurrent_hidden_size=args.rl2_hidden_size,
                             num_actions=envs.action_space.n,
                             device=device)

    elif args.use_rad:
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        pse_coef = args.pse_coef
        if args.use_pse:
            assert args.pse_coef > 0, "Please pass a non-zero pse_coef"
        else:
            pse_coef = 0.0
        print("Running RAD ..")
        print(
            "PSE: {}, Coef: {}, Gamma: {}, Temp: {}, Coupling Temp: {}".format(
                args.use_pse, pse_coef, args.pse_gamma, args.pse_temperature,
                args.pse_coupling_temperature))
        print('use_augmentation: {}'.format(args.use_augmentation))

        agent = algo.RAD(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_id=aug_id,
            aug_func=aug_func,
            env_name=args.env_name,
            use_augmentation=args.use_augmentation,
            pse_gamma=args.pse_gamma,
            pse_coef=pse_coef,
            pse_temperature=args.pse_temperature,
            pse_coupling_temperature=args.pse_coupling_temperature)
    else:
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        pse_coef = args.pse_coef
        if args.use_pse:
            assert args.pse_coef > 0, "Please pass a non-zero pse_coef"
        else:
            pse_coef = 0.0
        print("Running DraC ..")
        print("PSE: {}, Coef: {}, Gamma: {}, Temp: {}".format(
            args.use_pse, pse_coef, args.pse_gamma, args.pse_temperature))

        agent = algo.DrAC(actor_critic,
                          args.clip_param,
                          args.ppo_epoch,
                          args.num_mini_batch,
                          args.value_loss_coef,
                          args.entropy_coef,
                          lr=args.lr,
                          eps=args.eps,
                          max_grad_norm=args.max_grad_norm,
                          aug_id=aug_id,
                          aug_func=aug_func,
                          aug_coef=args.aug_coef,
                          env_name=args.env_name,
                          pse_gamma=args.pse_gamma,
                          pse_coef=pse_coef,
                          pse_temperature=args.pse_temperature)

    checkpoint_path = os.path.join(save_dir, "agent" + log_file + ".pt")
    if gfile.exists(checkpoint_path) and args.preempt:
        with gfile.GFile(checkpoint_path, 'rb') as f:
            inbuffer = io.BytesIO(f.read())
            checkpoint = torch.load(inbuffer)
        agent.actor_critic.load_state_dict(checkpoint['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        init_epoch = checkpoint['epoch'] + 1
        print('Loaded ckpt from epoch {}'.format(init_epoch - 1))
        logger.configure(dir=args.log_dir,
                         format_strs=['csv', 'stdout', 'tensorboard'],
                         log_suffix=log_file,
                         init_step=init_epoch)
    else:
        init_epoch = 0
        logger.configure(dir=args.log_dir,
                         format_strs=['csv', 'stdout', 'tensorboard'],
                         log_suffix=log_file,
                         init_step=init_epoch)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(init_epoch, num_updates):
        actor_critic.train()
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = aug_id(rollouts.obs[step])
                value, action, action_log_prob, recurrent_hidden_states, pi = actor_critic.act(
                    obs_id,
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    policy=True)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs,
                            recurrent_hidden_states,
                            action,
                            action_log_prob,
                            value,
                            reward,
                            masks,
                            bad_masks,
                            pi=pi)

        with torch.no_grad():
            obs_id = aug_id(rollouts.obs[-1])
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)

        if args.use_ucb and j > 0:
            agent.update_ucb_values(rollouts)
        value_loss, action_loss, dist_entropy, pse_loss = agent.update(
            rollouts)
        rollouts.after_update()

        # Log every log_interval-th update, once at least one episode has finished
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print(
                "\nUpdate {}, step {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, dist entropy {:.2f}, "
                "value loss {:.2f}, action loss {:.2f}"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        dist_entropy, value_loss, action_loss))

            logger.logkv("train/nupdates", j)
            logger.logkv("train/total_num_steps", total_num_steps)

            logger.logkv("losses/dist_entropy", dist_entropy)
            logger.logkv("losses/value_loss", value_loss)
            logger.logkv("losses/action_loss", action_loss)
            if args.use_pse:
                logger.logkv("losses/pse_loss", pse_loss)

            logger.logkv("train/mean_episode_reward", np.mean(episode_rewards))
            logger.logkv("train/median_episode_reward",
                         np.median(episode_rewards))

            ### Eval on the Full Distribution of Levels ###
            eval_episode_rewards = evaluate(args,
                                            actor_critic,
                                            device,
                                            aug_id=aug_id)

            logger.logkv("test/mean_episode_reward",
                         np.mean(eval_episode_rewards))
            logger.logkv("test/median_episode_reward",
                         np.median(eval_episode_rewards))

            logger.dumpkvs()

        # Save Model
        if (j > 0 and j % args.save_interval == 0
                or j == num_updates - 1) and save_dir != "":
            try:
                gfile.makedirs(save_dir)
            except OSError:
                pass

            ckpt_file = os.path.join(save_dir, "agent" + log_file + ".pt")
            outbuffer = io.BytesIO()
            torch.save(
                {
                    'epoch': j,
                    'model_state_dict': agent.actor_critic.state_dict(),
                    'optimizer_state_dict': agent.optimizer.state_dict()
                }, outbuffer)
            with gfile.GFile(ckpt_file, 'wb') as fout:
                fout.write(outbuffer.getvalue())
            save_num_steps = (j + 1) * args.num_processes * args.num_steps

            print("\nUpdate {}, step {}, Saved {}.".format(
                j, save_num_steps, ckpt_file))
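This training script (like the DrAC variant later in the listing) resolves augmentations through an aug_to_func registry that maps the CLI string args.aug_type to a batch-augmentation class in the project's data_augs module. A minimal sketch of that pattern; the specific entries below are assumptions about data_augs, and the real registry covers more augmentation types:

import data_augs  # the project's augmentation module (assumed importable)

# Hypothetical registry: augmentation name -> class constructed with batch_size.
aug_to_func = {
    'crop': data_augs.Crop,
    'cutout_color': data_augs.CutoutColor,
}

batch_size = 64 * 256 // 8                       # num_processes * num_steps / num_mini_batch
aug_func = aug_to_func['crop'](batch_size=batch_size)
aug_id = data_augs.Identity                      # identity transform applied during rollouts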
Code example #26
def learn(*,
          network,
          total_timesteps,
          num_levels=50,
          start_level=500,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          num_processes=64,
          num_steps=256,
          level_replay_temperature=0.1,
          level_replay_rho=1.0,
          level_replay_nu=0.5,
          level_replay_alpha=1.0,
          staleness_coef=0.1,
          staleness_temperature=1.0,
          level_sampler_strategy='value_l1',
          score_transform='rank',
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env:                              not passed to this variant; the vectorized Procgen training
                                      environment is constructed internally from num_levels/start_level


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    level_sampler_args = dict(num_actors=num_processes,
                              strategy=level_sampler_strategy,
                              replay_schedule='proportionate',
                              score_transform=score_transform,
                              temperature=level_replay_temperature,
                              rho=level_replay_rho,
                              nu=level_replay_nu,
                              alpha=level_replay_alpha,
                              staleness_coef=staleness_coef,
                              staleness_transform='power',
                              staleness_temperature=staleness_temperature)

    env = ProcgenEnv(num_envs=num_processes, env_name='fruitbot', \
        num_levels=1, start_level=start_level, \
        distribution_mode='easy',
        paint_vel_info=False)
    env = VecExtractDictObs(env, "rgb")
    env = VecMonitor(venv=env, filename=None, keep_buf=100)
    env = VecNormalize(venv=env, ob=False, ret=True)

    seeds = [start_level + i for i in range(num_levels)]

    level_sampler = LevelSampler(seeds, env.observation_space,
                                 env.action_space, **level_sampler_args)

    env = VecProcgen(env, level_sampler=level_sampler)

    rollouts = RolloutStorage(num_steps, num_processes,
                              env.observation_space.shape, env.action_space)

    level_seeds = np.zeros(num_processes)
    obs, level_seeds = env.reset()
    level_seeds = level_seeds.reshape(-1, 1)
    rollouts.obs[0] = obs

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    rollouts=rollouts)
    if eval_env is not None:
        eval_runner = EvalRunner(env=eval_env,
                                 model=model,
                                 nsteps=nsteps,
                                 gamma=gamma,
                                 lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos, = runner.run(
            level_seeds=level_seeds)  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos, = eval_runner.run(
            )  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Update level sampler
        level_sampler.update_with_rollouts(rollouts)

        rollouts.after_update()
        level_sampler.after_update()

        # For each minibatch, compute the losses and append them to mblossvals.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Create an index array over the flattened batch
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns
            # (ev close to 1) or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    np.save('gdrive/MyDrive/182 Project/sampled_levels.npy',
            level_sampler.sampled_levels)

    return model
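The level_sampler_strategy='value_l1' and score_transform='rank' arguments follow the prioritized level replay recipe: levels are scored by the mean absolute GAE (an L1 value-error proxy) and then sampled with rank-based weights sharpened by a temperature. A small sketch of the rank transform under those assumptions; the project's LevelSampler may differ in its details:

import numpy as np

def rank_transform_weights(scores, temperature=0.1):
    # Higher score -> rank 1 -> largest weight; weight is proportional to
    # rank**(-1/temperature), then normalized into a sampling distribution.
    scores = np.asarray(scores, dtype=np.float64)
    ranks = np.empty_like(scores)
    ranks[np.argsort(-scores)] = np.arange(1, len(scores) + 1)
    weights = ranks ** (-1.0 / temperature)
    return weights / weights.sum()

# e.g. rank_transform_weights([0.5, 2.0, 1.0]) puts most mass on the level scored 2.0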
Code example #27
def main():

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=99)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=5)
    parser.add_argument('--load_id', type=int, default=int(-1))
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    parser.add_argument('--test', default=False, action="store_true")
    parser.add_argument('--use_model',
                        type=int,
                        default=1,
                        help="either model #1 or #2")
    parser.add_argument('--train_level', type=int, default=50)

    args = parser.parse_args()
    # Derive the timestep budget from nupdates / nrollouts when they are given
    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = TIMESTEPS_PER_PROC  ## use global 20_000_000 if not specified in args!
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)

    run_ID = 'run_' + str(args.run_id).zfill(2)
    if args.test:
        args.log_interval = 1
        args.total_tsteps = 1_000_000
        run_ID += '_test{}_model{}'.format(args.load_id, args.use_model)

    load_path = None
    if args.load_id > -1:
        load_path = join(SAVE_PATH, args.env_name,
                         'saved_ensemble2_v{}.tar'.format(args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    if args.test:
        logpath = join('log2/ensemble2', args.env_name, 'test', run_ID)
    else:
        logpath = join('log2/ensemble2', args.env_name, 'train', run_ID)
        save_path = join(SAVE_PATH, args.env_name,
                         'saved_ensemble2_v{}.tar'.format(args.run_id))
        logger.info("\n Model will be saved to file {}".format(save_path))

    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("creating tf session")
    setup_mpi_gpus()

    if not args.test:
        config = tf.compat.v1.ConfigProto(\
            allow_soft_placement=True,
            log_device_placement=True)# device_count={'GPU':0})
        config.gpu_options.allow_growth = True  #pylint: disable=E1101
        sess = tf.compat.v1.Session(config=config)
        logger.info("creating 2 environments")
        n_levels = int(args.num_levels / 2)
        env1 = ProcgenEnv(num_envs=num_envs,
                          env_name=args.env_name,
                          num_levels=n_levels,
                          start_level=0,
                          distribution_mode=args.distribution_mode)
        env1 = VecExtractDictObs(env1, "rgb")
        env1 = VecMonitor(
            venv=env1,
            filename=None,
            keep_buf=100,
        )
        env1 = VecNormalize(venv=env1, ob=False)

        env2 = ProcgenEnv(num_envs=num_envs,
                          env_name=args.env_name,
                          num_levels=n_levels,
                          start_level=n_levels,
                          distribution_mode=args.distribution_mode)
        env2 = VecExtractDictObs(env2, "rgb")
        env2 = VecMonitor(
            venv=env2,
            filename=None,
            keep_buf=100,
        )
        env2 = VecNormalize(venv=env2, ob=False)

        train(run_ID, save_path, load_path, env1, env2, sess, logger, args)
    else:
        use_model = args.use_model  ## 1 or 2
        alt_flag = use_model - 1
        test_all(alt_flag, load_path, logger, args)
Code example #28
File: train.py  Project: joshnroy/auto-drac
def train(args):
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name,
                                             args.seed)
    logger.configure(dir=args.log_dir,
                     format_strs=['csv', 'stdout'],
                     log_suffix=log_file)

    venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \
        num_levels=args.num_levels, start_level=args.start_level, \
        distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    envs = VecPyTorchProcgen(venv, device)

    obs_shape = envs.observation_space.shape
    actor_critic = Policy(obs_shape,
                          envs.action_space.n,
                          base_kwargs={
                              'recurrent': False,
                              'hidden_size': args.hidden_size
                          })
    actor_critic.to(device)

    if modelbased:
        rollouts = BiggerRolloutStorage(
            args.num_steps,
            args.num_processes,
            envs.observation_space.shape,
            envs.action_space,
            actor_critic.recurrent_hidden_state_size,
            aug_type=args.aug_type,
            split_ratio=args.split_ratio)
    else:
        rollouts = RolloutStorage(args.num_steps,
                                  args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size,
                                  aug_type=args.aug_type,
                                  split_ratio=args.split_ratio)

    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)

    if args.use_ucb:
        aug_id = data_augs.Identity
        aug_list = [
            aug_to_func[t](batch_size=batch_size)
            for t in list(aug_to_func.keys())
        ]

        agent = algo.UCBDrAC(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm,
                             aug_list=aug_list,
                             aug_id=aug_id,
                             aug_coef=args.aug_coef,
                             num_aug_types=len(list(aug_to_func.keys())),
                             ucb_exploration_coef=args.ucb_exploration_coef,
                             ucb_window_length=args.ucb_window_length)

    elif args.use_meta_learning:
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size) \
            for t in list(aug_to_func.keys())]

        aug_model = AugCNN()
        aug_model.to(device)

        agent = algo.MetaDrAC(actor_critic,
                              aug_model,
                              args.clip_param,
                              args.ppo_epoch,
                              args.num_mini_batch,
                              args.value_loss_coef,
                              args.entropy_coef,
                              meta_grad_clip=args.meta_grad_clip,
                              meta_num_train_steps=args.meta_num_train_steps,
                              meta_num_test_steps=args.meta_num_test_steps,
                              lr=args.lr,
                              eps=args.eps,
                              max_grad_norm=args.max_grad_norm,
                              aug_id=aug_id,
                              aug_coef=args.aug_coef)

    elif args.use_rl2:
        aug_id = data_augs.Identity
        aug_list = [
            aug_to_func[t](batch_size=batch_size)
            for t in list(aug_to_func.keys())
        ]

        rl2_obs_shape = [envs.action_space.n + 1]
        rl2_learner = Policy(rl2_obs_shape,
                             len(list(aug_to_func.keys())),
                             base_kwargs={
                                 'recurrent': True,
                                 'hidden_size': args.rl2_hidden_size
                             })
        rl2_learner.to(device)

        agent = algo.RL2DrAC(actor_critic,
                             rl2_learner,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             args.rl2_entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             rl2_lr=args.rl2_lr,
                             rl2_eps=args.rl2_eps,
                             max_grad_norm=args.max_grad_norm,
                             aug_list=aug_list,
                             aug_id=aug_id,
                             aug_coef=args.aug_coef,
                             num_aug_types=len(list(aug_to_func.keys())),
                             recurrent_hidden_size=args.rl2_hidden_size,
                             num_actions=envs.action_space.n,
                             device=device)

    elif False:  # Regular Drac
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        agent = algo.DrAC(actor_critic,
                          args.clip_param,
                          args.ppo_epoch,
                          args.num_mini_batch,
                          args.value_loss_coef,
                          args.entropy_coef,
                          lr=args.lr,
                          eps=args.eps,
                          max_grad_norm=args.max_grad_norm,
                          aug_id=aug_id,
                          aug_func=aug_func,
                          aug_coef=args.aug_coef,
                          env_name=args.env_name)
    elif False:  # Model Free Planning Drac
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        actor_critic = PlanningPolicy(obs_shape,
                                      envs.action_space.n,
                                      base_kwargs={
                                          'recurrent': False,
                                          'hidden_size': args.hidden_size
                                      })
        actor_critic.to(device)

        agent = algo.DrAC(actor_critic,
                          args.clip_param,
                          args.ppo_epoch,
                          args.num_mini_batch,
                          args.value_loss_coef,
                          args.entropy_coef,
                          lr=args.lr,
                          eps=args.eps,
                          max_grad_norm=args.max_grad_norm,
                          aug_id=aug_id,
                          aug_func=aug_func,
                          aug_coef=args.aug_coef,
                          env_name=args.env_name)
    else:  # Model-based DrAC
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        actor_critic = ModelBasedPolicy(obs_shape,
                                        envs.action_space.n,
                                        base_kwargs={
                                            'recurrent': False,
                                            'hidden_size': args.hidden_size
                                        })
        actor_critic.to(device)

        agent = algo.ConvDrAC(actor_critic,
                              args.clip_param,
                              args.ppo_epoch,
                              args.num_mini_batch,
                              args.value_loss_coef,
                              args.entropy_coef,
                              lr=args.lr,
                              eps=args.eps,
                              max_grad_norm=args.max_grad_norm,
                              aug_id=aug_id,
                              aug_func=aug_func,
                              aug_coef=args.aug_coef,
                              env_name=args.env_name)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    if modelbased:
        rollouts.next_obs[0].copy_(obs)  # TODO: is this right?
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in trange(num_updates):
        actor_critic.train()
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = aug_id(rollouts.obs[step])
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    obs_id, rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
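            # bad_masks (below) flags transitions reported as 'bad_transition'
            # (e.g. time-limit truncations) so return computation can, if it
            # chooses, avoid treating them as genuine terminal states.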
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            obs_id = aug_id(rollouts.obs[-1])
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)
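        # compute_returns uses next_value to bootstrap the last (unfinished)
        # step of the rollout and fills in GAE-style targets, roughly
        # A_t = sum_l (gamma * gae_lambda)^l * delta_{t+l}.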

        if args.use_ucb and j > 0:
            agent.update_ucb_values(rollouts)
        if isinstance(agent, algo.ConvDrAC):
            value_loss, action_loss, dist_entropy, transition_model_loss, reward_model_loss = agent.update(
                rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # Log every log_interval-th update
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print(
                "\nUpdate {}, step {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                "dist_entropy {:.3f}, value_loss {:.3f}, action_loss {:.3f}"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        dist_entropy, value_loss, action_loss))

            logger.logkv("train/nupdates", j)
            logger.logkv("train/total_num_steps", total_num_steps)

            logger.logkv("losses/dist_entropy", dist_entropy)
            logger.logkv("losses/value_loss", value_loss)
            logger.logkv("losses/action_loss", action_loss)
            if isinstance(agent, algo.ConvDrAC):
                logger.logkv("losses/transition_model_loss",
                             transition_model_loss)
                logger.logkv("losses/reward_model_loss", reward_model_loss)

            logger.logkv("train/mean_episode_reward", np.mean(episode_rewards))
            logger.logkv("train/median_episode_reward",
                         np.median(episode_rewards))

            ### Eval on the Full Distribution of Levels ###
            eval_episode_rewards = evaluate(args,
                                            actor_critic,
                                            device,
                                            aug_id=aug_id)

            logger.logkv("test/mean_episode_reward",
                         np.mean(eval_episode_rewards))
            logger.logkv("test/median_episode_reward",
                         np.median(eval_episode_rewards))

            logger.dumpkvs()
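
Both training loops above and below assume an `aug_to_func` registry, defined earlier in the file, that maps augmentation names to classes constructed with `batch_size`, alongside `data_augs.Identity` for un-augmented rollouts. The following self-contained sketch mimics that pattern with two toy augmentations; the class and method names here are illustrative stand-ins, not the actual `data_augs` API:

import torch
import torch.nn.functional as F


class ToyIdentity:
    """No-op augmentation, mirroring the role of data_augs.Identity."""

    def __init__(self, batch_size):
        self.batch_size = batch_size

    def do_augmentation(self, obs):  # illustrative method name
        return obs


class ToyRandomShift:
    """Pad-and-crop augmentation in the spirit of a random crop/shift."""

    def __init__(self, batch_size, pad=4):
        self.batch_size = batch_size
        self.pad = pad

    def do_augmentation(self, obs):  # obs: (N, C, H, W) float tensor
        n, c, h, w = obs.shape
        padded = F.pad(obs, [self.pad] * 4, mode='replicate')
        out = torch.empty_like(obs)
        for i in range(n):  # independent random offset per sample
            top = torch.randint(0, 2 * self.pad + 1, (1,)).item()
            left = torch.randint(0, 2 * self.pad + 1, (1,)).item()
            out[i] = padded[i, :, top:top + h, left:left + w]
        return out


# Construction mirrors the snippets: aug_to_func[name](batch_size=batch_size)
toy_aug_to_func = {'identity': ToyIdentity, 'random_shift': ToyRandomShift}
aug_func = toy_aug_to_func['random_shift'](batch_size=2048)

The real classes are presumably constructed the same way and applied to observation batches inside the DrAC update.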
コード例 #29
0
def train(args):
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name,
                                             args.seed)

    venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \
        num_levels=args.num_levels, start_level=args.start_level, \
        distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    envs = VecPyTorchProcgen(venv, device)

    obs_shape = envs.observation_space.shape

    ################################
    actor_critic = Policy(obs_shape,
                          envs.action_space.n,
                          base_kwargs={
                              'recurrent': False,
                              'hidden_size': args.hidden_size
                          })
    actor_critic.to(device)

    ################################
    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              aug_type=args.aug_type,
                              split_ratio=args.split_ratio)

    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)
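    # e.g. with 64 processes, 256 steps and 8 mini-batches (typical defaults for
    # this kind of setup, not fixed by this snippet): 64 * 256 / 8 = 2048.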

    ################################
    if args.use_ucb:
        aug_id = data_augs.Identity
        aug_list = [
            aug_to_func[t](batch_size=batch_size)
            for t in list(aug_to_func.keys())
        ]

        agent = algo.UCBDrAC(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm,
                             aug_list=aug_list,
                             aug_id=aug_id,
                             aug_coef=args.aug_coef,
                             num_aug_types=len(list(aug_to_func.keys())),
                             ucb_exploration_coef=args.ucb_exploration_coef,
                             ucb_window_length=args.ucb_window_length)

    elif args.use_meta_learning:
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size) \
            for t in list(aug_to_func.keys())]

        aug_model = AugCNN()
        aug_model.to(device)

        agent = algo.MetaDrAC(actor_critic,
                              aug_model,
                              args.clip_param,
                              args.ppo_epoch,
                              args.num_mini_batch,
                              args.value_loss_coef,
                              args.entropy_coef,
                              meta_grad_clip=args.meta_grad_clip,
                              meta_num_train_steps=args.meta_num_train_steps,
                              meta_num_test_steps=args.meta_num_test_steps,
                              lr=args.lr,
                              eps=args.eps,
                              max_grad_norm=args.max_grad_norm,
                              aug_id=aug_id,
                              aug_coef=args.aug_coef)

    elif args.use_rl2:
        aug_id = data_augs.Identity
        aug_list = [
            aug_to_func[t](batch_size=batch_size)
            for t in list(aug_to_func.keys())
        ]

        rl2_obs_shape = [envs.action_space.n + 1]
        rl2_learner = Policy(rl2_obs_shape,
                             len(list(aug_to_func.keys())),
                             base_kwargs={
                                 'recurrent': True,
                                 'hidden_size': args.rl2_hidden_size
                             })
        rl2_learner.to(device)

        agent = algo.RL2DrAC(actor_critic,
                             rl2_learner,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             args.rl2_entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             rl2_lr=args.rl2_lr,
                             rl2_eps=args.rl2_eps,
                             max_grad_norm=args.max_grad_norm,
                             aug_list=aug_list,
                             aug_id=aug_id,
                             aug_coef=args.aug_coef,
                             num_aug_types=len(list(aug_to_func.keys())),
                             recurrent_hidden_size=args.rl2_hidden_size,
                             num_actions=envs.action_space.n,
                             device=device)

    else:
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        agent = algo.DrAC(actor_critic,
                          args.clip_param,
                          args.ppo_epoch,
                          args.num_mini_batch,
                          args.value_loss_coef,
                          args.entropy_coef,
                          lr=args.lr,
                          eps=args.eps,
                          max_grad_norm=args.max_grad_norm,
                          aug_id=aug_id,
                          aug_func=aug_func,
                          aug_coef=args.aug_coef,
                          env_name=args.env_name)

    checkpoint_path = os.path.join(args.save_dir, "agent" + log_file + ".pt")
    if os.path.exists(checkpoint_path) and args.preempt:
        checkpoint = torch.load(checkpoint_path)
        agent.actor_critic.load_state_dict(checkpoint['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        init_epoch = checkpoint['epoch'] + 1
        logger.configure(dir=args.log_dir,
                         format_strs=['csv', 'stdout'],
                         log_suffix=log_file + "-e%s" % init_epoch)
    else:
        init_epoch = 0
        logger.configure(dir=args.log_dir,
                         format_strs=['csv', 'stdout'],
                         log_suffix=log_file)

    obs = envs.reset()  # reset the vectorized envs
    rollouts.obs[0].copy_(obs)  # store the initial observation
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    # args.num_steps -> 256 (number of forward steps per rollout)
    # args.num_env_steps -> 25e6 (number of environment steps to train)
    num_updates = int(
        args.num_env_steps) // args.num_processes // args.num_steps
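    # e.g. with the 25e6 env steps noted above, 64 processes (an assumed
    # default) and 256 steps per rollout: 25_000_000 // 64 // 256 = 1525 updates.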

    # todo: calling these "epochs"... but each episode terminates at a different step...
    for j in range(init_epoch, num_updates):
        actor_critic.train()
        for step in range(args.num_steps):

            # Sample actions
            with torch.no_grad():
                obs_id = aug_id(rollouts.obs[step])
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    obs_id, rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            # todo : check the shapes of obs, reward, done, infos
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    # todo : difference between reward and info['episode']['r']
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            obs_id = aug_id(rollouts.obs[-1])
            # todo : what is next_value for?
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)

        if args.use_ucb and j > 0:  # from second epoch
            agent.update_ucb_values(rollouts)  # update ucb

        # todo: wow, this update step is intense (the heavy lifting happens here)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        # something gets cleared here (the rollout storage rolls over for the next update)
        rollouts.after_update()

        # Log every log_interval-th update; the model itself is saved further below
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print(
                "\nUpdate {}, step {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                "dist_entropy {:.3f}, value_loss {:.3f}, action_loss {:.3f}"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        dist_entropy, value_loss, action_loss))

            logger.logkv("train/nupdates", j)
            logger.logkv("train/total_num_steps", total_num_steps)

            logger.logkv("losses/dist_entropy", dist_entropy)
            logger.logkv("losses/value_loss", value_loss)
            logger.logkv("losses/action_loss", action_loss)

            logger.logkv("train/mean_episode_reward", np.mean(episode_rewards))
            logger.logkv("train/median_episode_reward",
                         np.median(episode_rewards))

            ### Eval on the Full Distribution of Levels ###
            eval_episode_rewards = evaluate(args,
                                            actor_critic,
                                            device,
                                            aug_id=aug_id)

            logger.logkv("test/mean_episode_reward",
                         np.mean(eval_episode_rewards))
            logger.logkv("test/median_episode_reward",
                         np.median(eval_episode_rewards))

            logger.dumpkvs()

        # Save Model
        if (j > 0 and j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(args.save_dir)
            except OSError:
                pass

            torch.save(
                {
                    'epoch': j,
                    'model_state_dict': agent.actor_critic.state_dict(),
                    'optimizer_state_dict': agent.optimizer.state_dict(),
                }, os.path.join(args.save_dir, "agent" + log_file + ".pt"))
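
The `use_ucb` branch above delegates augmentation selection to `algo.UCBDrAC` (note the `ucb_exploration_coef` and `ucb_window_length` arguments and the per-update call to `agent.update_ucb_values(rollouts)`). The selection rule itself is not shown here; the following is only a generic UCB-style sketch of the idea, with invented bookkeeping, not the repository's implementation:

import numpy as np
from collections import deque


class UcbAugSelector:
    """Generic UCB-style augmentation selector (illustrative sketch only)."""

    def __init__(self, num_aug_types, exploration_coef=0.1, window_length=10):
        self.exploration_coef = exploration_coef
        self.num_selected = np.zeros(num_aug_types)  # times each aug was picked
        self.returns = [deque(maxlen=window_length) for _ in range(num_aug_types)]
        self.total_selected = 0

    def select(self):
        scores = []
        for k in range(len(self.returns)):
            if self.num_selected[k] == 0:
                return k  # try every augmentation at least once
            mean_return = float(np.mean(self.returns[k]))
            bonus = self.exploration_coef * np.sqrt(
                np.log(self.total_selected) / self.num_selected[k])
            scores.append(mean_return + bonus)
        return int(np.argmax(scores))

    def update(self, aug_index, mean_episode_return):
        # Called once per PPO update with the mean return achieved under aug_index.
        self.num_selected[aug_index] += 1
        self.total_selected += 1
        self.returns[aug_index].append(mean_episode_return)

Each augmentation keeps a window of recent returns; the one with the highest mean-plus-exploration-bonus score would be used for the next update.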
コード例 #30
0
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 30_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--use', type=str, default="randcrop")
    parser.add_argument('--log_interval', type=int, default=20)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=-1)

    args = parser.parse_args()

    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = timesteps_per_proc  ## fall back to the module-level timesteps_per_proc default above if not specified in args

    run_ID = 'run_' + str(args.run_id).zfill(2)
    ## select which ppo to use:
    agent_str = args.use
    LOG_DIR = join("log", agent_str, "train")
    save_model = join("log", agent_str,
                      "saved_{}_v{}.tar".format(agent_str, args.run_id))
    ppo_func = PPO_FUNCs[agent_str]
    load_path = None
    if args.load_id > -1:
        load_path = join("log", agent_str,
                         "saved_{}_v{}.tar".format(agent_str, args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels
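    # Every test_worker_interval-th MPI rank acts as a test worker: it sees the
    # full level distribution (num_levels=0) and, with mpi_rank_weight=0, its
    # gradients presumably carry zero weight in the MPI-averaged update.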

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    os.makedirs(logpath, exist_ok=True)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("\n Saving model to file {}".format(save_model))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto(
        log_device_placement=True)  #device_count={'GPU':0, 'XLA_GPU':0})
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    #sess.__enter__()

    logger.info(venv.observation_space)
    logger.info("training")
    with sess.as_default():
        model = ppo_func.learn(
            sess=sess,
            env=venv,
            network=None,
            total_timesteps=args.total_tsteps,
            save_interval=1000,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=args.log_interval,
            ent_coef=ent_coef,
            # clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            # update_fn=None,
            # init_fn=None,
            save_path=save_model,
            load_path=load_path,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
        model.save(save_model)