Ejemplo n.º 1
0
def get_arch(
    *,
    library="baselines",
    cnn="clear",
    use_lstm=0,
    stack_channels="16_32_32",
    emb_size=256,
    **kwargs,
):
    """Build a policy network architecture: a conv trunk, optionally + LSTM.

    Parameters
    ----------
    library : str
        Backend providing the model builders; only ``"baselines"`` is
        supported.
    cnn : str
        Convolutional trunk to use: ``"impala"``, ``"nature"``, or
        ``"clear"``.
    use_lstm : int
        Truthy to wrap the conv trunk in a 256-unit LSTM
        (``baselines`` ``cnn_lstm``).
    stack_channels : str
        Underscore-separated channel depths for the IMPALA conv stacks,
        e.g. ``"16_32_32"``. Only used by the ``"impala"`` trunk.
    emb_size : int
        Embedding size of the IMPALA CNN output. Only used by ``"impala"``.
    **kwargs
        Accepted and deliberately ignored so callers can pass a shared
        config dict through without filtering.

    Returns
    -------
    callable
        Either the conv function itself or the ``cnn_lstm`` wrapper around it.

    Raises
    ------
    ValueError
        If ``library`` or ``cnn`` is not recognized.
    """
    # Parse "16_32_32" -> [16, 32, 32] up front so a malformed string fails
    # early regardless of which trunk is selected.
    depths = [int(x) for x in stack_channels.split("_")]

    # Guard clause: fail fast on an unknown backend instead of nesting
    # everything under an if/else pyramid.
    if library != "baselines":
        raise ValueError(f"Unsupported library: {library}")

    if cnn == "impala":
        from baselines.common.models import build_impala_cnn

        # Named closure instead of a lambda assignment (PEP 8 / E731).
        def conv_fn(x):
            return build_impala_cnn(x, depths=depths, emb_size=emb_size)
    elif cnn == "nature":
        from baselines.common.models import nature_cnn

        conv_fn = nature_cnn
    elif cnn == "clear":
        from lucid.scratch.rl_util.arch import clear_cnn

        conv_fn = clear_cnn
    else:
        raise ValueError(f"Unsupported cnn: {cnn}")

    if use_lstm:
        from baselines.common.models import cnn_lstm

        return cnn_lstm(nlstm=256, conv_fn=conv_fn)
    return conv_fn
Ejemplo n.º 2
0
def test_fn(env_name, num_envs, config_path, load_path):
    """Evaluate a saved model on a Procgen env and return (mean, std) reward.

    Builds a vectorized evaluation environment from the domain config under
    ``procgen-adr/``, always wraps the IMPALA CNN in an LSTM, loads the
    checkpoint at ``load_path`` via ``test``, and returns its statistics.
    """
    # Domain config is resolved relative to the current working directory.
    domain_cfg = os.path.join(os.getcwd(), "procgen-adr", config_path)

    # Vectorized evaluation environment pipeline.
    venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=env_name,
        domain_config_path=domain_cfg,
        render_mode="rgb_array",
    )
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # TF session with incremental GPU memory allocation.
    setup_mpi_gpus()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    def conv_fn(x):
        return build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    network = conv_fn
    recur = True  # flag kept from the original: LSTM wrapping is always on
    if recur:
        logger.info("Using CNN LSTM")
        network = cnn_lstm(nlstm=256, conv_fn=conv_fn)

    mean, std = test(network, venv, load_path=load_path)
    sess.close()
    return mean, std
Ejemplo n.º 3
0
def train_fn(env_name,
             num_envs,
             distribution_mode,
             num_levels,
             start_level,
             timesteps_per_proc,
             level_sampler_strategy,
             score_transform,
             model_name,
             is_test_worker=False,
             save_dir='./',
             comm=None):
    """Train a PPO agent on a Procgen env with level sampling; save the model.

    Trains via ``ppo2.learn`` while evaluating on a fixed 500-level set,
    then writes the final model to ``<save_dir>models/<model_name>``.
    Rank-0 logs (csv/stdout/tensorboard) go to ``<save_dir>logs/<model_name>``.

    NOTE(review): ``start_level`` is accepted but never used in this body.
    NOTE(review): ``comm`` defaults to None, yet ``comm.Split`` is called
    unconditionally below — a None comm would crash; confirm callers always
    pass an MPI communicator.
    """
    # Fixed PPO hyperparameters (Procgen baselines defaults).
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    # Test workers contribute no gradient weight and use unlimited levels.
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    log_dir = save_dir + 'logs/' + model_name

    # NOTE(review): log_dir is always a string here, so this check is always
    # true; looks like a leftover from a variant where logging was optional.
    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout', 'tensorboard'
                       ] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    # Evaluation env: fixed pool of 500 levels starting at level 0.
    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(
        venv=eval_env,
        filename=None,
        keep_buf=100,
    )
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    # Grow GPU memory on demand instead of grabbing it all up front.
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # IMPALA CNN policy trunk.
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    logger.info("training")
    model = ppo2.learn(network=conv_fn,
                       total_timesteps=timesteps_per_proc,
                       num_levels=num_levels,
                       eval_env=eval_env,
                       save_interval=0,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       level_sampler_strategy=level_sampler_strategy,
                       score_transform=score_transform)
    model.save(save_dir + 'models/' + model_name)
Ejemplo n.º 4
0
def main():
    """Test a saved PPO checkpoint on the fruitbot Procgen environment.

    Parses a checkpoint path and a YAML config, builds an evaluation env,
    and runs ``ppo2.learn`` in test mode (``test=True``) with the checkpoint
    loaded via ``load_path``. Results are logged under
    ``runs/<EXPERIMENT_NAME>/<creation_time>_test/``.
    """
    # get model path
    parser = argparse.ArgumentParser(description="Parse testing arguments")
    parser.add_argument('--model_path',
                        type=str,
                        default=None,
                        help='Path to model checkpoint.')
    parser.add_argument('--config',
                        type=str,
                        default='configurations/ppo_baseline_cuda.yaml',
                        help='Path to configuration file.')
    args = parser.parse_args()
    if args.model_path is None or not os.path.exists(args.model_path):
        raise OSError("Invalid model file supplied")

    # create configuration
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config)

    # create save directory
    # The experiment timestamp is the third-from-last path component of the
    # checkpoint path — assumes a .../<creation_time>/<subdir>/<file> layout.
    model_file_path = args.model_path
    exp_creation_time = os.path.normpath(model_file_path).split(os.sep)[-3]
    print(exp_creation_time)
    exp_dir = f"runs/{cfg.EXPERIMENT_NAME}/{exp_creation_time}_test/"
    os.makedirs(exp_dir, exist_ok=True)

    # create logger
    format_strs = ['csv', 'stdout']
    logger.configure(dir=exp_dir,
                     format_strs=format_strs,
                     log_suffix=datetime.now().strftime('%Y-%m-%d-%H-%M'))

    # create (vectorized) procgen environment
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=cfg.TEST.NUM_ENVS,
                      env_name="fruitbot",
                      num_levels=cfg.TEST.NUM_LEVELS,
                      start_level=cfg.TEST.LEVEL_SEED,
                      distribution_mode="easy")
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    # create tensorflow session
    logger.info("creating tf session")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # create cnn todo: make this less ugly
    # NOTE(review): conv_fn stays None if cfg.TRAIN.NETWORK is neither
    # recognized value — ppo2.learn would then get network=None; confirm the
    # config schema restricts NETWORK to these two options.
    conv_fn = None
    logger.info("building cnn")
    if cfg.TRAIN.NETWORK == "NATURE_CNN":
        conv_fn = lambda x: nature_cnn(x)
    elif cfg.TRAIN.NETWORK == "IMPALA_CNN":
        conv_fn = lambda x: build_impala_cnn(
            x, depths=[16, 32, 32], emb_size=256)

    # training
    logger.info("testing")
    ppo2.learn(env=venv,
               network=conv_fn,
               total_timesteps=cfg.TEST.TIMESTEPS,
               save_interval=0,
               nsteps=cfg.TEST.BATCH_SIZE,
               nminibatches=cfg.TRAIN.MINIBATCHES,
               lam=cfg.TRAIN.LAM,
               gamma=cfg.TRAIN.GAMMA,
               noptepochs=cfg.TRAIN.NUM_EPOCHS,
               log_interval=1,
               clip_vf=cfg.TRAIN.USE_VF_CLIPPING,
               lr=cfg.TRAIN.LR,
               cliprange=cfg.TRAIN.CLIP_RANGE,
               update_fn=None,
               init_fn=None,
               vf_coef=0.5,
               max_grad_norm=0.5,
               test=True,
               load_path=model_file_path)
Ejemplo n.º 5
0
def main():
    """Roll out a saved "vanilla" PPO model on procgen and log episode stats.

    Loads checkpoint ``log/vanilla/saved_vanilla_v<load_id>.tar``, collects
    ``nrollouts`` batches of experience with a Runner, and logs windowed
    mean episode rewards/lengths (10- and 100-episode buffers).

    Returns:
        list: the 10-episode mean reward recorded after each rollout.
    """
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 1_000_000  ## now this counts steps in testing runs
    use_vf_clipping = True

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    ## default starting_level set to 50 to test on unseen levels!
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)

    args = parser.parse_args()
    # NOTE(review): recorded before the nrollouts override below, so the JSON
    # dump keeps the default 1_000_000 even if total_timesteps is recomputed —
    # confirm this is intended.
    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    # Save the run arguments alongside the logs for reproducibility.
    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    ## Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    # Rolling buffers of recent episode info dicts for windowed statistics.
    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time() ## Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  ## differnent from random_ppo!
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        # 'r'/'l' are episode reward and length as produced by VecMonitor.
        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
Ejemplo n.º 6
0
def main():
    """Train PPO with network randomization (NetRand) on a procgen env.

    Monkey-patches ``ppo2``'s Runner and build_policy with NetRand variants,
    trains with ``ppo2.learn`` using ``NetRandModel``, and saves the final
    model from rank 0 into ``<logdir>/checkpoints/final_model.ckpt``.
    """
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    args = parser.parse_args()

    # Setup test worker
    # Every test_worker_interval-th rank becomes an evaluation-only worker.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    # Test workers run with unlimited levels (num_levels=0).
    env_name = args.env_name
    num_levels = 0 if is_test_worker else args.num_levels
    start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR + f'/{args.env_name}/run_{args.run_id}',
                     format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # Setup Tensorflow
    logger.info("creating tf session")
    # Optionally pin this rank to an explicitly listed GPU.
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    # Training
    logger.info("training")
    # Swap in NetRand components before calling the stock learn() entrypoint.
    ppo2.Runner = NetRandRunner
    ppo2.build_policy = build_policy
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        model_fn=NetRandModel,
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
Ejemplo n.º 7
0
envs = [env_fn for x in range(64)]
venv = DummyVecEnv(envs)

venv = VecMonitor(
    venv=venv,
    filename=None,
    keep_buf=100,
)

logger.info("creating tf session")
config = tf.ConfigProto()
sess = tf.Session(config=config)
sess.__enter__()

conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

logger.info("training")
final_model = ppo2.learn(
    env=venv,
    network=conv_fn,
    total_timesteps=total_time,
    save_interval=0,
    nsteps=nsteps,
    nminibatches=nminibatches,
    lam=lam,
    gamma=gamma,
    noptepochs=ppo_epochs,
    log_interval=1,
    ent_coef=ent_coef,
    mpi_rank_weight=0,
Ejemplo n.º 8
0
def main():
    """Resume PPO+CVAE training on a procgen env from saved checkpoints.

    Continues training from ``last_step`` using a previously saved policy
    and VAE (``model_path`` / ``vae_path``), via ``ppo2_cvae.learn``.
    """
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    last_step = 4587520  # where we have left off in training
    # Remaining budget out of the full 25M-step run.
    timesteps_per_proc = 25_000_000 - last_step
    use_vf_clipping = True
    model_path = '../train-procgen/saved_model/policy_bossfight_vae560'
    vae_path = '../train-procgen/saved_model/bossfight_vae560'

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    # Every test_worker_interval-th rank becomes an evaluation-only worker.
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2_cvae.learn(env=venv,
                    network=conv_fn,
                    total_timesteps=timesteps_per_proc,
                    save_interval=10,
                    nsteps=nsteps,
                    nminibatches=nminibatches,
                    lam=lam,
                    gamma=gamma,
                    noptepochs=ppo_epochs,
                    log_interval=1,
                    ent_coef=ent_coef,
                    mpi_rank_weight=mpi_rank_weight,
                    clip_vf=use_vf_clipping,
                    comm=comm,
                    lr=learning_rate,
                    cliprange=clip_range,
                    update_fn=None,
                    init_fn=None,
                    vf_coef=0.5,
                    max_grad_norm=0.5,
                    load_path=model_path,
                    vae_path=vae_path)
Ejemplo n.º 9
0
def main():
    """Train PPO on procgen with mixreg/mixobs, data augmentation options.

    Uses a customized ``learn`` function and a mixreg model factory; trains
    with level sampling, evaluates on a fixed 500-level pool, and saves the
    final model from rank 0 into ``<logdir>/checkpoints/final_model.ckpt``.
    """
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=500)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width',
                        type=str,
                        default='1x',
                        choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--timesteps_per_proc', type=int, default=1_000_000)
    parser.add_argument('--level_sampler_strategy',
                        type=str,
                        default='value_l1')
    parser.add_argument('--score_transform', type=str, default='rank')
    parser.add_argument('--save_dir',
                        type=str,
                        default='gdrive/MyDrive/182 Project/')
    args = parser.parse_args()

    timesteps_per_proc = args.timesteps_per_proc
    log_dir = args.save_dir

    # Setup test worker
    # Every test_worker_interval-th rank becomes an evaluation-only worker.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    # "oracle" trains on unlimited levels (num_levels=0) regardless of args.
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'tensorboard'
                   ] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=log_dir +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    # Evaluation env: fixed pool of 500 levels starting at level 0.
    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=args.distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(
        venv=eval_env,
        filename=None,
        keep_buf=100,
    )
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    # Setup Tensorflow
    logger.info("creating tf session")
    # Optionally pin this rank to an explicitly listed GPU.
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    # Channel depths scale with the requested model width.
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    # Random conv augmentation is disabled on test workers.
    conv_fn = lambda x: build_impala_cnn(x,
                                         depths=depths,
                                         use_bn=args.use_bn,
                                         randcnn=args.use_rand_conv and
                                         not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        num_levels=num_levels,
        start_level=start_level,
        eval_env=eval_env,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        level_sampler_strategy=args.level_sampler_strategy,
        score_transform=args.score_transform,
        model_fn=get_mixreg_model(mix_mode=args.mix_mode,
                                  mix_alpha=args.mix_alpha,
                                  use_l2reg=args.use_l2reg,
                                  l2reg_coeff=args.l2reg_coeff),
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
Ejemplo n.º 10
0
def rollout_fn(num_steps, env_name, num_envs, distribution_mode, num_levels, start_level, timesteps_per_proc, is_test_worker=False, log_dir='/tmp/procgen', comm=None, load_path=None):
    """Roll out a trained PPO model on a Procgen env via ``ppo2.rollout``.

    Builds the env pipeline and TF session, then delegates to
    ``ppo2.rollout`` with the checkpoint at ``load_path``.

    NOTE(review): ``comm`` defaults to None, yet ``comm.Split`` is called
    whenever log_dir is set (it is by default) — confirm callers always
    pass an MPI communicator.
    """
    # Fixed PPO hyperparameters (Procgen baselines defaults).
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    # Test workers get zero gradient weight and unlimited levels.
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs, filename="rollout")

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # IMPALA CNN policy trunk.
    conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    logger.info("training")
    ppo2.rollout(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path = load_path,
        num_steps=num_steps,
        num_envs=num_envs, 
        env_name=env_name,
        num_levels=num_levels, 
        start_level=start_level, 
        distribution_mode=distribution_mode
    )
Ejemplo n.º 11
0
def main():
    """Train PPO on procgen with a data-augmentation option and eval env.

    Trains via ``ppo2.learn`` with a separate unlimited-level evaluation
    environment starting at ``--test_start_level``. Logs are written under
    ``./log/<env>/nlev_<n>_mode_<mode>/<data_aug>/<exp_name>``.
    """
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000  # 200_000_000: hard 25_000_000: easy
    use_vf_clipping = True
    LOG_DIR = './log/'

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--data_aug', type=str, default='normal')
    parser.add_argument('--exp_name', type=str, default='try1')
    parser.add_argument('--test_start_level', type=int,
                        default=200)  # 500 for hard / 200 for easy

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    #if args.num_levels < 50:
    #    timesteps_per_proc = 5_000_000

    # Every test_worker_interval-th rank becomes an evaluation-only worker.
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    # Log directory encodes env, level count, mode, augmentation and run name.
    LOG_DIR += args.env_name + '/nlev_' + str(args.num_levels) + '_mode_'
    LOG_DIR += args.distribution_mode + '/' + args.data_aug + '/' + args.exp_name

    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    # eval env, unlimited levels
    eval_venv = ProcgenEnv(num_envs=num_envs,
                           env_name=args.env_name,
                           num_levels=0,
                           start_level=args.test_start_level,
                           distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")

    eval_venv = VecMonitor(
        venv=eval_venv,
        filename=None,
        keep_buf=100,
    )

    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # IMPALA CNN policy trunk.
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        eval_env=eval_venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=62,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        data_aug=args.data_aug,
    )
Ejemplo n.º 12
0
def main():
    """Train a vanilla PPO2 agent on a procgen environment.

    Parses command-line arguments, configures MPI-aware logging, builds the
    vectorized training environment, runs baselines ppo2 with an IMPALA CNN
    policy, and saves the final model to ``log/vanilla/``.
    """
    # Fixed hyper-parameters for the vanilla baseline run.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=1)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=-1)

    args = parser.parse_args()
    if args.nupdates:
        # Derive the per-process timestep budget from an explicit update count.
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = timesteps_per_proc  # use global 20_000_000 if not specified in args!

    run_ID = 'run_' + str(args.run_id).zfill(2)
    SAVE_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.run_id)
    load_path = None
    if args.load_id > -1:
        # Resume from a previously saved checkpoint with the given id.
        load_path = 'log/vanilla/saved_vanilla_v{}.tar'.format(args.load_id)
    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD

    is_test_worker = False
    if test_worker_interval > 0:
        # Every test_worker_interval-th rank is designated a test worker.
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    # Test workers train on the unlimited level distribution.
    num_levels = 0 if is_test_worker else args.num_levels

    # Only rank 0 of the split communicator writes log files.
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    # Portable, shell-free replacement for os.system("mkdir -p ...").
    os.makedirs(logpath, exist_ok=True)
    logger.configure(dir=logpath, format_strs=format_strs)

    # Persist the exact arguments of this run for reproducibility.
    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("saving to filename {}".format(SAVE_PATH))
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto(log_device_placement=True)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=args.total_tsteps,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=args.log_interval,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        load_path=load_path,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        save_interval=300
    )
    model.save(SAVE_PATH)
Ejemplo n.º 13
0
def main():
    """Train a fruitbot agent from a yacs YAML configuration.

    Reads the config file given by ``--config``, builds train/test procgen
    environments, and dispatches to the trainer selected by
    ``cfg.TRAIN.POLICY`` (A2C, ACKTR, or PPO).
    """
    parser = argparse.ArgumentParser(description="Process training arguments.")
    parser.add_argument('--config',
                        type=str,
                        default="configurations/ppo_baseline_cuda.yaml",
                        help="config file name (located in config dir)")
    args = parser.parse_args()

    # create configuration: defaults merged with the chosen YAML file
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config)

    print(cfg.TRAIN.TOTAL_TIMESTEPS)

    # create experiment directory (timestamped so runs never collide)
    exp_dir = f"runs/{cfg.EXPERIMENT_NAME}/{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    os.makedirs(exp_dir, exist_ok=True)

    # create logger
    format_strs = ['csv', 'stdout']
    logger.configure(dir=exp_dir,
                     format_strs=format_strs,
                     log_suffix=datetime.now().strftime('%Y-%m-%d-%H-%M'))

    # create (vectorized) procgen environment for training
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=cfg.TRAIN.NUM_ENVS,
                      env_name="fruitbot",
                      num_levels=cfg.TRAIN.NUM_LEVELS,
                      start_level=cfg.TRAIN.LEVEL_SEED,
                      distribution_mode="easy")
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    # held-out environment used for evaluation during training
    test_venv = ProcgenEnv(num_envs=cfg.TEST.NUM_ENVS,
                           env_name="fruitbot",
                           num_levels=cfg.TEST.NUM_LEVELS,
                           start_level=cfg.TEST.LEVEL_SEED,
                           distribution_mode="easy")
    test_venv = VecExtractDictObs(test_venv, "rgb")
    test_venv = VecMonitor(
        venv=test_venv,
        filename=None,
        keep_buf=100,
    )
    test_venv = VecNormalize(venv=test_venv, ob=False)

    # create tensorflow session
    logger.info("creating tf session")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # select the CNN feature extractor
    logger.info("building cnn")
    if cfg.TRAIN.NETWORK == "NATURE_CNN":
        conv_fn = nature_cnn
    elif cfg.TRAIN.NETWORK == "IMPALA_CNN":
        conv_fn = lambda x: build_impala_cnn(
            x, depths=[16, 32, 32], emb_size=256)
    else:
        # Previously conv_fn stayed None and was passed straight to the
        # trainer; fail fast with an actionable message instead.
        raise ValueError(f"Unsupported TRAIN.NETWORK: {cfg.TRAIN.NETWORK!r}")

    # training
    logger.info("training")
    if cfg.TRAIN.POLICY == "A2C":
        a2c.learn(env=venv,
                  network=conv_fn,
                  total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                  nsteps=cfg.TRAIN.BATCH_SIZE,
                  log_interval=1,
                  eval_env=test_venv,
                  augment=cfg.TRAIN.AUGMENT)
    elif cfg.TRAIN.POLICY == "ACKTR":
        acktr.learn(env=venv,
                    network=conv_fn,
                    total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                    nsteps=cfg.TRAIN.BATCH_SIZE,
                    log_interval=1,
                    eval_env=test_venv,
                    augment=cfg.TRAIN.AUGMENT,
                    seed=None)
    elif cfg.TRAIN.POLICY == "PPO":
        ppo2.learn(env=venv,
                   eval_env=test_venv,
                   network=conv_fn,
                   total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                   save_interval=5,
                   nsteps=cfg.TRAIN.BATCH_SIZE,
                   nminibatches=cfg.TRAIN.MINIBATCHES,
                   lam=cfg.TRAIN.LAM,
                   gamma=cfg.TRAIN.GAMMA,
                   noptepochs=cfg.TRAIN.NUM_EPOCHS,
                   log_interval=1,
                   clip_vf=cfg.TRAIN.USE_VF_CLIPPING,
                   lr=cfg.TRAIN.LR,
                   cliprange=cfg.TRAIN.CLIP_RANGE,
                   update_fn=None,
                   init_fn=None,
                   vf_coef=0.5,
                   max_grad_norm=0.5,
                   augment=cfg.TRAIN.AUGMENT,
                   load_path=cfg.TRAIN.PRETRAINED)
    else:
        # Previously an unrecognized policy silently trained nothing.
        raise ValueError(f"Unsupported TRAIN.POLICY: {cfg.TRAIN.POLICY!r}")
Ejemplo n.º 14
0
def main():
    """Train PPO2 on a procgen environment with a selectable observation type.

    ``--obs`` chooses between raw 'rgb' frames, segmentation labels ('lbl'),
    and one-hot encoded labels ('onehot_lbl').  Training uses a fixed level
    set; evaluation runs on the full level distribution (num_levels=0).
    """
    # Fixed hyper-parameters for this run.
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 5_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env-name', type=str, default='bigfish')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num-levels', type=int, default=200)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--obs',
                        choices=['rgb', 'lbl', 'onehot_lbl'],
                        default='rgb')

    args = parser.parse_args()

    LOG_DIR = f'/raid0/dian/procgen_baseline/{args.env_name}/ppo_{args.obs}_{args.num_levels}_{SEED}'

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD

    is_test_worker = False

    if test_worker_interval > 0:
        # Every test_worker_interval-th rank is designated a test worker.
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    # NOTE(review): unlike the sibling scripts, test workers are NOT given
    # num_levels=0 here -- confirm this is intentional.
    num_levels = args.num_levels

    format_strs = ['csv', 'stdout']
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=args.env_name,
        num_levels=num_levels,
        start_level=args.start_level,
        distribution_mode=args.distribution_mode,
        rand_seed=SEED,
    )
    # Evaluation env samples from the unlimited level distribution.
    test_venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=args.env_name,
        num_levels=0,
        start_level=args.start_level,
        distribution_mode=args.distribution_mode,
        rand_seed=SEED,
    )

    def _monitor(env):
        # Shared episode-statistics wrapper (buffer of last 100 episodes).
        return VecMonitor(venv=env, filename=None, keep_buf=100)

    if args.obs == 'onehot_lbl':
        # One-hot label observations are used as-is (no reward normalization).
        venv = _monitor(VecExtractDictObsOnehot(venv, args.env_name))
        test_venv = _monitor(VecExtractDictObsOnehot(test_venv, args.env_name))
    else:
        venv = _monitor(VecExtractDictObs(venv, args.obs))
        venv = VecNormalize(venv=venv, ob=False)

        test_venv = _monitor(VecExtractDictObs(test_venv, args.obs))
        test_venv = VecNormalize(venv=test_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        eval_env=test_venv,
        save_interval=100,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
Ejemplo n.º 15
0
def eval_fn(load_path, args, env_name='fruitbot', distribution_mode='easy', num_levels=500, start_level=500, log_dir='./tmp/procgen', comm=None, num_trials=3, gui=False):
    """Evaluate a saved PPO2 model on a procgen environment.

    ``load_path`` may be a single checkpoint file or a directory of
    checkpoints; in the directory case every file is evaluated in turn,
    each one logging to its own subdirectory of ``log_dir``.

    NOTE(review): ``comm`` defaults to None but ``comm.Split`` is called
    unconditionally -- callers apparently must always pass an MPI
    communicator; confirm before relying on the default.
    """
    # Hyper-parameters mirroring those used during training.
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True
    vf_coef = 0.5
    max_grad_norm = 0.5

    mpi_rank_weight = 1
    log_interval = 1
    seed = None

    # Only rank 0 of the split communicator writes log files.
    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=1, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("evaluating")

    set_global_seeds(seed)

    policy = build_policy(venv, conv_fn)

    # Environment / batch geometry needed to rebuild the model.
    nenvs = venv.num_envs
    ob_space = venv.observation_space
    ac_space = venv.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    from .alternate_ppo2.model import Model
    model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)

    # Every alt_ppo2.eval invocation below shares these keyword arguments;
    # building the dict once removes a ~30-line duplicated call.
    eval_kwargs = dict(
        network=conv_fn,
        nsteps=nsteps,
        ent_coef=ent_coef,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        gamma=gamma,
        lam=lam,
        log_interval=log_interval,
        nminibatches=nminibatches,
        noptepochs=ppo_epochs,
        mpi_rank_weight=mpi_rank_weight,
        comm=comm,
        clip_vf=use_vf_clipping,
        lr=learning_rate,
        cliprange=clip_range,
        policy=policy,
        nenvs=nenvs,
        ob_space=ob_space,
        ac_space=ac_space,
        nbatch=nbatch,
        nbatch_train=nbatch_train,
        model_fn=model_fn,
        model=model,
        num_trials=num_trials,
        num_levels=num_levels,
        start_level=start_level,
        gui=gui,
        args=args,
    )

    if os.path.isfile(load_path):
        alt_ppo2.eval(load_path=load_path, **eval_kwargs)
    elif os.path.isdir(load_path):
        # Evaluate every checkpoint in the directory, each with its own logger.
        for file in os.listdir(load_path):
            log_comm = comm.Split(0, 0)
            format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
            logger.configure(comm=log_comm, dir=log_dir + '/' + file, format_strs=format_strs)
            alt_ppo2.eval(load_path=load_path + '/' + file, **eval_kwargs)
    else:
        print('Model path does not exist.')
    return
Ejemplo n.º 16
0
 def conv_fn(x):
     """Build the IMPALA CNN feature extractor (depths 16/32/32, 256-d embedding)."""
     return build_impala_cnn(x, emb_size=256, depths=[16, 32, 32])
def main():
    """Train PPO2 on procgen, optionally substituting a learned reward model.

    When ``args.rm_id`` is set, a pretrained RewardNet checkpoint
    (``<rm_id>.rm``, searched recursively from the current directory)
    replaces the ground-truth environment reward via ProxyRewardWrapper;
    otherwise the true environment rewards are used.
    """
    args = parse_config()
    run_dir = log_this(args, args.log_dir,
                       args.log_name + '_' + args.env_name + '_' + args.rm_id)

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD

    is_test_worker = False

    if test_worker_interval > 0:
        # Every test_worker_interval-th rank is designated a test worker.
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1

    # Only rank 0 of the split communicator writes log files.
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=run_dir, format_strs=format_strs)

    logger.info("creating environment")

    venv = ProcgenEnv(num_envs=args.num_envs,
                      env_name=args.env_name,
                      num_levels=args.num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode,
                      use_sequential_levels=args.use_sequential_levels)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if args.rm_id:
        # load the pretrained reward network
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        net = RewardNet().to(device)
        matches = glob.glob('./**/' + args.rm_id + '.rm', recursive=True)
        if not matches:
            # Fail fast with a clear message instead of an opaque IndexError.
            raise FileNotFoundError(
                "no reward model checkpoint '{}.rm' found under the current "
                "directory".format(args.rm_id))
        rm_path = matches[0]
        net.load_state_dict(
            torch.load(rm_path, map_location=torch.device(device)))

        # Use batched reward predictions instead of the ground-truth reward,
        # optionally squashed through a sigmoid.
        if args.use_sigmoid:
            rew_func = lambda x: 1 / (1 + np.exp(-net.predict_batch_rewards(x)))
        else:
            rew_func = net.predict_batch_rewards

        ## Uncomment the line below to train a live-long agent
        # rew_func = lambda x: x.shape[0] * [1]

        venv = ProxyRewardWrapper(venv, rew_func)
    else:
        # true environment rewards will be used
        pass

    venv = VecNormalize(venv=venv, ob=False, use_tf=False)

    # do the rest of the training as normal
    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)

    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")

    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=args.timesteps_per_proc,
        save_interval=args.save_interval,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        lam=args.lam,
        gamma=args.gamma,
        noptepochs=args.ppo_epochs,
        log_interval=args.log_interval,
        ent_coef=args.ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=args.use_vf_clipping,
        comm=comm,
        lr=args.learning_rate,
        cliprange=args.clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path=args.load_path,
    )

    model.save(os.path.join(run_dir, 'final_model.parameters'))
Ejemplo n.º 18
0
def main():
    """Parse procgen training/evaluation arguments and run PPO2 ``learn``."""

    def _str2bool(v):
        # argparse's `type=bool` is a well-known trap: bool('False') is True,
        # so ANY value passed to --eval used to enable it.  Parse the usual
        # truthy spellings explicitly; the `--eval True/False` CLI is kept.
        return str(v).strip().lower() in ('true', '1', 'yes')

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot', help='env to run on from procgen')
    parser.add_argument('--num_envs', type=int, default=64, help='number of environments run simultaneously')
    parser.add_argument('--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"], help='level difficulty')
    parser.add_argument('--num_levels', type=int, default=0, help='number of levels to train/test on')
    parser.add_argument('--start_level', type=int, default=0, help='start level (used to avoid testing on seen levels)')
    parser.add_argument('--num_timesteps', type=int, default=0, help='number of timesteps total to train/test on')
    parser.add_argument('--save_frequency', type=int, default=0, help='checkpoint frequency')
    parser.add_argument('--model_loc', type=str, default=None, help='location of pretrained model')
    parser.add_argument('--results_loc', type=str, default=None, help='location of where to save current model/logs')

    parser.add_argument('--eval', type=_str2bool, default=False, help='if true, does not update model')
    parser.add_argument('--data_aug', type=str, default='normal', help='whether to apply data augmentation')
    parser.add_argument('--gray_p', type=float, default=0.8, help='p value for grayscale data augmentation')

    parser.add_argument('--value_fn', type=str, default='fc', choices=['fc', 'gmm', 'lbmdp'], help='value function for ppo2 critic')
    parser.add_argument('--cnn_fn', type=str, default='impala_cnn', choices=['impala_cnn', 'nature_cnn', 'impala_cnn_lstm', 'lstm'], help='cnn for featurization')
    parser.add_argument('--entropy_fn', type=str, default='constant', choices=['constant', 'scaled'], help='function for entropy loss coefficient')

    parser.add_argument('--ent_coef', type=float, default=0.01, help='coefficient applied to entropy loss')
    parser.add_argument('--ent_scalar', type=float, default=1, help='coefficient applied within sigmoid to scaled entropy coefficient')
    parser.add_argument('--seed', type=int, default=None, help='seed for tensorflow')
    parser.add_argument('--gamma', type=float, default=0.999, help='discount factor')
    parser.add_argument('--lam', type=float, default=0.95, help='advantage discount factor')
    parser.add_argument('--lr', type=float, default=5e-4, help='learning rate for Adam')
    # CNN channel depths must be integers; `type=float` would feed fractional
    # channel counts into the conv layers.
    parser.add_argument('--imp_h1', type=int, default=16, help='impala cnn first hidden state')
    parser.add_argument('--imp_h2', type=int, default=64, help='impala cnn second hidden state')
    parser.add_argument('--imp_h3', type=int, default=64, help='impala cnn third hidden state')

    args = parser.parse_args()

    logger.configure(dir=args.results_loc, format_strs=['csv', 'stdout'])
    logger.info("Creating Environment")
    venv = ProcgenEnv(num_envs=args.num_envs, env_name=args.env_name, num_levels=args.num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, 'rgb')
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("Creating Tensorflow Session")
    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    sess.__enter__()

    # Select the feature-extraction network.
    if args.cnn_fn == 'impala_cnn':
        conv_fn = lambda x: build_impala_cnn(x, depths=[args.imp_h1, args.imp_h2, args.imp_h3], emb_size=256)
    elif args.cnn_fn == 'nature_cnn':
        conv_fn = nature_cnn
    elif args.cnn_fn == 'impala_cnn_lstm':
        conv_fn = impala_cnn_lstm()
    elif args.cnn_fn == 'lstm':
        conv_fn = lstm()
    else:
        conv_fn = mlp()

    logger.info("Training")
    learn(
        network=conv_fn,
        env=venv,
        total_timesteps=args.num_timesteps,
        eval_env=None,
        seed=args.seed,
        nsteps=256,
        ent_coef=args.ent_coef,
        lr=args.lr,
        vf_coef=0.5,
        max_grad_norm=0.5,
        gamma=args.gamma,
        lam=args.lam,
        log_interval=args.save_frequency,
        nminibatches=4,
        noptepochs=3,
        cliprange=0.2,
        save_interval=0,
        load_path=args.model_loc,
        data_aug=args.data_aug,
        args=args,
    )