Example #1
def test_fn(env_name, num_envs, config_path, load_path):
    test_config_path = os.path.join(os.getcwd(), "procgen-adr", config_path)
    test_env = ProcgenEnv(num_envs=num_envs, env_name=env_name, domain_config_path=test_config_path, render_mode="rgb_array")
    test_env = VecExtractDictObs(test_env, "rgb")
    test_env = VecMonitor(venv=test_env, filename=None, keep_buf=100)
    test_env = VecNormalize(venv=test_env, ob=False)

    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    recur = True
    if recur:
        logger.info("Using CNN LSTM")
        conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)

    mean, std = test(conv_fn, test_env, load_path=load_path)
    sess.close()
    return mean, std
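A minimal sketch of how test_fn might be invoked; the domain config (resolved relative to ./procgen-adr/) and checkpoint path below are hypothetical placeholders, not files shipped with this example:

# All values here are illustrative; substitute your own ADR domain config and saved model.
mean, std = test_fn(env_name="coinrun",
                    num_envs=16,
                    config_path="configs/coinrun_test.json",
                    load_path="checkpoints/coinrun_final")
print("mean episode reward: {:.2f} (std {:.2f})".format(mean, std))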
Example #2
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    ## newly defined
    vf_coef = 0.5
    max_grad_norm = 0.5
    ###########
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    # timesteps_per_proc = 50_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--total_timesteps', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, 
                     format_strs=format_strs,
                     log_suffix="_total_timesteps_{}_num_levels_{}".format(args.total_timesteps,
                                                                           num_levels))

    '''logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)'''

    logger.info("Creating dropout evaluation environment")
    eval_venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=100, start_level=2000, distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")

    eval_venv = VecMonitor(
        venv=eval_venv, filename=None, keep_buf=100,
    )

    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, is_train=False, depths=[16,32,32], emb_size=256)

    logger.info("testing dropout")
    

    
    policy = build_policy(eval_venv,conv_fn)

    nenvs = eval_venv.num_envs
    ob_space = eval_venv.observation_space
    ac_space = eval_venv.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch//nminibatches
    
    # Instantiate the model object (that creates act_model and train_model)
    
    from baselines.ppo2.model import Model
    model_fn = Model    #modified from baseline ppo2 learn

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)
    model.load(MODEL_PATH)
    eval_runner = Runner(env=eval_venv, model=model, nsteps=nsteps, gamma=.999, lam=.95)

    eval_epinfobuf = deque(maxlen=100)
    nupdates = args.total_timesteps//nbatch

    log_interval = 1
    for update in range(1, nupdates+1):
        # single update to test
        eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()
        eval_epinfobuf.extend(eval_epinfos)
        if update % log_interval == 0 or update == 1:
            logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
            logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/total_timesteps', update * nbatch)
            logger.dumpkvs()
    eval_venv.close()
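safemean is not defined in this snippet; in baselines it is a small helper that tolerates an empty episode buffer. A minimal equivalent (assuming numpy is imported as np):

def safemean(xs):
    # Mean that returns nan on an empty buffer instead of raising a warning.
    return np.nan if len(xs) == 0 else np.mean(xs)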
Example #3
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    last_step = 4587520  # where the previous training run left off
    timesteps_per_proc = 25_000_000 - last_step
    use_vf_clipping = True
    model_path = '../train-procgen/saved_model/policy_bossfight_vae560'
    vae_path = '../train-procgen/saved_model/bossfight_vae560'

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2_cvae.learn(env=venv,
                    network=conv_fn,
                    total_timesteps=timesteps_per_proc,
                    save_interval=10,
                    nsteps=nsteps,
                    nminibatches=nminibatches,
                    lam=lam,
                    gamma=gamma,
                    noptepochs=ppo_epochs,
                    log_interval=1,
                    ent_coef=ent_coef,
                    mpi_rank_weight=mpi_rank_weight,
                    clip_vf=use_vf_clipping,
                    comm=comm,
                    lr=learning_rate,
                    cliprange=clip_range,
                    update_fn=None,
                    init_fn=None,
                    vf_coef=0.5,
                    max_grad_norm=0.5,
                    load_path=model_path,
                    vae_path=vae_path)
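Note that with last_step = 4587520, the resumed budget passed to ppo2_cvae.learn works out to 25_000_000 - 4_587_520 = 20_412_480 timesteps per process.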
Example #4
def main():
    """Run DQN until the environment throws an exception."""
    # Hyperparameters
    learning_rate = 2.5e-4
    gamma = 0.99
    nstep_return = 3
    timesteps_per_proc = 50_000_000
    train_interval = 4
    target_interval = 8192
    batch_size = 512
    min_buffer_size = 20000

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='starpilot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=['no_aug', 'cutout_color', 'crop'])
    parser.add_argument('--PER',
                        type=lambda x: bool(strtobool(x)),
                        default=True,
                        help='Whether to use PER')
    parser.add_argument('--num_envs', type=int, default=64)
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    num_envs = args.num_envs

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup Rainbow models
    logger.info("building models")
    online_net, target_net = rainbow_models(
        sess,
        venv.action_space.n,
        gym_space_vectorizer(venv.observation_space),
        min_val=REWARD_RANGE_FOR_C51[env_name][0],
        max_val=REWARD_RANGE_FOR_C51[env_name][1])
    dqn = MpiDQN(online_net,
                 target_net,
                 discount=gamma,
                 comm=comm,
                 mpi_rank_weight=mpi_rank_weight,
                 mix_mode=args.mix_mode,
                 mix_alpha=args.mix_alpha,
                 use_l2reg=args.use_l2reg,
                 data_aug=args.data_aug)
    player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return)
    optimize = dqn.optimize(learning_rate=learning_rate)

    # Initialize and sync variables
    sess.run(tf.global_variables_initializer())
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="")
    if comm.Get_size() > 1:
        sync_from_root(sess, global_variables, comm=comm)  #pylint: disable=E1101

    # Training
    logger.info("training")
    if args.PER:
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0.5,
                                                        0.4,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
    else:
        #set alpha and beta equal to 0 for uniform prioritization and no importance sampling
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0,
                                                        0,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
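The two dqn.train calls above differ only in the prioritization exponents; the same choice can be written more compactly, as in this sketch (reusing the positional capacity/alpha/beta signature of PrioritizedReplayBuffer seen above):

# alpha=0.5, beta=0.4 -> proportional prioritization with importance-sampling correction
# alpha=0,   beta=0   -> uniform sampling, no importance weighting
alpha, beta = (0.5, 0.4) if args.PER else (0, 0)
dqn.train(num_steps=timesteps_per_proc,
          player=player,
          replay_buffer=PrioritizedReplayBuffer(500000, alpha, beta, epsilon=0.1),
          optimize_op=optimize,
          train_interval=train_interval,
          target_interval=target_interval,
          batch_size=batch_size,
          min_buffer_size=min_buffer_size)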
Example #5
def main():
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=500)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width',
                        type=str,
                        default='1x',
                        choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--timesteps_per_proc', type=int, default=1_000_000)
    parser.add_argument('--level_sampler_strategy',
                        type=str,
                        default='value_l1')
    parser.add_argument('--score_transform', type=str, default='rank')
    parser.add_argument('--save_dir',
                        type=str,
                        default='gdrive/MyDrive/182 Project/')
    args = parser.parse_args()

    timesteps_per_proc = args.timesteps_per_proc
    log_dir = args.save_dir

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'tensorboard'
                   ] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=log_dir +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=args.distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(
        venv=eval_env,
        filename=None,
        keep_buf=100,
    )
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    conv_fn = lambda x: build_impala_cnn(x,
                                         depths=depths,
                                         use_bn=args.use_bn,
                                         randcnn=args.use_rand_conv and
                                         not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        num_levels=num_levels,
        start_level=start_level,
        eval_env=eval_env,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        level_sampler_strategy=args.level_sampler_strategy,
        score_transform=args.score_transform,
        model_fn=get_mixreg_model(mix_mode=args.mix_mode,
                                  mix_alpha=args.mix_alpha,
                                  use_l2reg=args.use_l2reg,
                                  l2reg_coeff=args.l2reg_coeff),
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
Example #6
def main():

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=99)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--log_interval', type=int, default=5)
    parser.add_argument('--load_id', type=int, default=int(-1))
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    parser.add_argument('--test', default=False, action="store_true")
    parser.add_argument('--use_model',
                        type=int,
                        default=1,
                        help="either model #1 or #2")
    parser.add_argument('--train_level', type=int, default=50)

    args = parser.parse_args()
    #timesteps_per_proc
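    # NOTE: num_envs, nsteps, TIMESTEPS_PER_PROC, and SAVE_PATH are assumed to be
    # module-level constants defined elsewhere in this script.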
    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = TIMESTEPS_PER_PROC  ## use global 20_000_000 if not specified in args!
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)

    run_ID = 'run_' + str(args.run_id).zfill(2)
    if args.test:
        args.log_interval = 1
        args.total_tsteps = 1_000_000
        run_ID += '_test{}_model{}'.format(args.load_id, args.use_model)

    load_path = None
    if args.load_id > -1:
        load_path = join(SAVE_PATH, args.env_name,
                         'saved_ensemble2_v{}.tar'.format(args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    if args.test:
        logpath = join('log2/ensemble2', args.env_name, 'test', run_ID)
    else:
        logpath = join('log2/ensemble2', args.env_name, 'train', run_ID)
        save_path = join(SAVE_PATH, args.env_name,
                         'saved_ensemble2_v{}.tar'.format(args.run_id))
        logger.info("\n Model will be saved to file {}".format(save_path))

    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("creating tf session")
    setup_mpi_gpus()

    if not args.test:
        config = tf.compat.v1.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True)  # device_count={'GPU': 0}
        config.gpu_options.allow_growth = True  #pylint: disable=E1101
        sess = tf.compat.v1.Session(config=config)
        logger.info("creating 2 environments")
        n_levels = int(args.num_levels / 2)
        env1 = ProcgenEnv(num_envs=num_envs,
                          env_name=args.env_name,
                          num_levels=n_levels,
                          start_level=0,
                          distribution_mode=args.distribution_mode)
        env1 = VecExtractDictObs(env1, "rgb")
        env1 = VecMonitor(
            venv=env1,
            filename=None,
            keep_buf=100,
        )
        env1 = VecNormalize(venv=env1, ob=False)

        env2 = ProcgenEnv(num_envs=num_envs,
                          env_name=args.env_name,
                          num_levels=n_levels,
                          start_level=n_levels,
                          distribution_mode=args.distribution_mode)
        env2 = VecExtractDictObs(env2, "rgb")
        env2 = VecMonitor(
            venv=env2,
            filename=None,
            keep_buf=100,
        )
        env2 = VecNormalize(venv=env2, ob=False)

        train(run_ID, save_path, load_path, env1, env2, sess, logger, args)
    else:
        use_model = args.use_model  ## 1 or 2
        alt_flag = use_model - 1
        test_all(alt_flag, load_path, logger, args)
Example #7
def train_fn(env_name,
             num_envs,
             distribution_mode,
             num_levels,
             start_level,
             timesteps_per_proc,
             level_sampler_strategy,
             score_transform,
             model_name,
             is_test_worker=False,
             save_dir='./',
             comm=None):
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    log_dir = save_dir + 'logs/' + model_name

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout', 'tensorboard'
                       ] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(
        venv=eval_env,
        filename=None,
        keep_buf=100,
    )
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    logger.info("training")
    model = ppo2.learn(network=conv_fn,
                       total_timesteps=timesteps_per_proc,
                       num_levels=num_levels,
                       eval_env=eval_env,
                       save_interval=0,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       level_sampler_strategy=level_sampler_strategy,
                       score_transform=score_transform)
    model.save(save_dir + 'models/' + model_name)
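A hypothetical call to train_fn; the hyperparameter values and model name below are illustrative only, not prescribed by this example:

from mpi4py import MPI

train_fn(env_name="coinrun",
         num_envs=64,
         distribution_mode="hard",
         num_levels=200,
         start_level=0,
         timesteps_per_proc=25_000_000,
         level_sampler_strategy="value_l1",
         score_transform="rank",
         model_name="coinrun_plr_run1",
         comm=MPI.COMM_WORLD)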
Example #8
def run(checkpoint_path,
        *,
        output_dir,
        load_kwargs={},
        trajectories_kwargs={},
        observations_kwargs={},
        **generate_kwargs):
    """Generate an interface from a checkpoint file.

    Arguments:
        checkpoint_path: path to checkpoint file, a joblib file containing a
                         dictionary with these keys
          - params: saved model parameters as a dictionary mapping tensor names
                    to numpy arrays
          - args: dictionary of metadata with these keys
              - env_name:     name of the Procgen environment
                              required if env_kind is 'procgen'
              - env_id:       lowercase id of the Atari environment
                              required if env_kind is 'atari'
              - env_kind:     either 'procgen' or 'atari'
                              defaults to 'procgen'
              - gamma:        GAE hyperparameter gamma used to train the model
                              defaults to None
              - lambda:       GAE hyperparameter lambda used to train the model
                              defaults to None
              - cnn:          model architecture, one of 'clear', 'impala' or
                              'nature'
                              defaults to 'clear'
              - any other optional arguments used to create the environment or
                get the architecture
        output_dir:            path to directory where interface is to be saved
                               required
        load_kwargs: dictionary with keys for any of the following
          - resample:          whether to process the checkpoint file from
                               scratch, rather than reusing samples previously
                               saved to a non-temporary location
                               defaults to True
          - model_path:        lucid model save location
          - metadata_path:     metadata dictionary save location
          - trajectories_path: trajectories save location
          - observations_path: additional observations save location
          - full_resolution:   whether to also save observations in human-scale
                               resolution (significant performance cost)
                               defaults to False
          - temp_files:        if any of the above paths is not specified,
                               whether to default to a temporary location
                               rather than a subdirectory of the checkpoint
                               file's directory
                               defaults to False
        trajectories_kwargs: dictionary with keys for any of the following
                             only used if resampling
          - num_envs:  number of trajectories to collect
                       defaults to 8
          - num_steps: length of each trajectory
                       defaults to 512
        observations_kwargs: dictionary with keys for any of the following
                             only used if resampling
          - num_envs:  number of environments to collect additional
                       observations from in parallel
                       defaults to 32
          - num_obs:   number of additional observations to collect from
                       each parallel environment
                       defaults to 128
          - obs_every: number of steps to wait between each observation
                       defaults to 128
        model_bytes:          lucid model, represented as a save file's bytes
                              defaults to being extracted automatically
        observations:         numpy array of additional observations used for
                              feature visualization
                              defaults to being extracted automatically
        observations_full:    numpy array of the additional observations in
                              human-scale resolution, or None to only use
                              observations at the resolution seen by the model
                              defaults to being extracted automatically, or None
                              if human-scale resolution observations were not
                              saved
        trajectories:         dictionary of trajectories with keys
                              'observations', 'actions', 'rewards', either
                              'firsts' or 'dones', and optionally
                              'observations_full', each value being a numpy
                              array with first two dimensions batch and timestep
                              defaults to being extracted automatically
        policy_logits_name:   name of tensor of policy logits
                              defaults to being extracted automatically
        value_function_name:  name of tensor of value function
                              defaults to being extracted automatically
        env_name:             Procgen environment name, used to help infer
                              action_combos if that is not provided
                              defaults to being extracted automatically, or
                              'unknown' if that fails
        numpy_precision:      number of significant figures to round numpy
                              arrays in the HTML file to
                              defaults to 6
        inline_js:            whether to include the JavaScript in the HTML file
                              inline, rather than referencing a separate file
                              defaults to True (to avoid ad-blocker issues)
        inline_large_json:    whether to include large amounts of JSON data in
                              the HTML file inline, rather than referencing
                              separate files
                              defaults to whether output_dir does not contain
                              '://'
        batch_size:           size of minibatch of observations to pass through
                              model
                              defaults to 512
        action_combos:        list of tuples of strings describing the
                              combinations of buttons triggered by each action
                              defaults to being extracted automatically, or
                              [('0',), ..., ('<num_actions - 1>',)] if that fails
        action_group_fns:     list of function filters for grouping the action
                              combos in different ways
                              defaults to [
                                  lambda combo: 'RIGHT' in combo,
                                  lambda combo: 'LEFT' in combo,
                                  lambda combo: 'UP' in combo,
                                  lambda combo: 'DOWN' in combo,
                                  lambda combo: 'RIGHT' not in combo
                                                 and 'LEFT' not in combo
                                                 and 'UP' not in combo
                                                 and 'DOWN' not in combo
                              ]
        layer_kwargs: dictionary of options for choosing layers, with keys for
                      any of the following
          - name_contains_one_of: list of strings each layer name must contain
                                  one of, or None to not filter by name
                                  defaults to None
          - op_is_one_of:         list of strings each layer op must be one of
                                  defaults to ['relu']
          - bottleneck_only:      whether to only include layers such that every
                                  path to an earlier convolutional layer passes
                                  through a bottleneck of the network
                                  defaults to True
          - discard_first_n:      number of first layers to discard
                                  defaults to 0
        input_layer_include:  whether to additionally calculate gradients with
                              respect to the input layer
                              defaults to False
        input_layer_name:     display name of the input layer
                              defaults to 'input'
        gae_gamma:            gamma for computing advantages using GAE
                              defaults to being extracted automatically, or
                              0.999 if that fails
        gae_lambda:           lambda for computing advantages using GAE
                              defaults to being extracted automatically, or
                              0.95 if that fails
        trajectory_bookmarks: number of links to display to highest advantage
                              episodes and to lowest advantage episodes
                              defaults to 16
        nmf_features:         number of dimensions for NMF dimensionality
                              reduction
                              defaults to 8
        nmf_attr_opts:        dictionary of options for computing attribution
                              for NMF dimensionality reduction, the main one
                              being integrate_steps (explained below, see
                              attr_integrate_steps)
                              defaults to {'integrate_steps': 10}, though if a
                              dictionary is provided without an
                              'integrate_steps' key, then integrate_steps
                              defaults to 1
        vis_subdiv_mults:     list of values of subdiv_mult, the spatial
                              resolution of the grid of dataset examples used
                              for feature visualization, as a multiple of the
                              resolution of the layer's activations
                              defaults to [0.25, 0.5, 1, 2]
        vis_subdiv_mult_default: default value of subdiv_mult (explained above)
                              defaults to 1
        vis_expand_mults:     list of values of expand_mult, the height and
                              width of each patch used for feature
                              visualization, as a multiple of the number of
                              pixels if the layer were overlaid on the
                              observation
                              defaults to [1, 2, 4, 8]
        vis_expand_mult_default: default value of expand_mult (explained above)
                              defaults to 4
        vis_thumbnail_num_mult: spatial resolution of the grid of dataset
                              examples used for feature visualization thumbnails
                              defaults to 4
        vis_thumbnail_expand_mult: the height and width of each patch used for
                              feature visualization thumbnails, as a multiple of
                              the number of pixels if the layer were overlaid on
                              the observation
                              defaults to 4
        scrub_range:          horizontal interval of observations and attribution
                              used to construct scrubs
                              defaults to (42 / 64, 44 / 64)
        attr_integrate_steps: number of points on the path used for numerical
                              integration for computing attribution
                              defaults to 10
        attr_max_paths:       maximum number of paths for multi-path
                              attribution, or None to use single-path
                              attribution
                              defaults to None
        attr_policy:          whether to compute attribution for the policy
                              defaults to False
        attr_single_channels: whether to allow attribution for single channels
                              to be displayed
                              defaults to True
        observations_subdir:  name of subdirectory containing additional
                              observations
                              defaults to 'observations/'
        trajectories_subdir:  name of subdirectory containing trajectory
                              observations
                              defaults to 'trajectories/'
        trajectories_scrub_subdir: name of subdirectory containing scrubs of
                              trajectory observations
                              defaults to 'trajectories_scrub/'
        features_subdir:      name of subdirectory containing feature
                              visualizations
                              defaults to 'features/'
        thumbnails_subdir:    name of subdirectory containing feature thumbnails
                              defaults to 'thumbnails/'
        attribution_subdir:   name of subdirectory containing attribution
                              defaults to 'attribution/'
        attribution_scrub_subdir: name of subdirectory containing scrubs of
                              attribution
                              defaults to 'attribution_scrub/'
        video_height:         css height of each video screen
                              defaults to '16em'
        video_width:          css width of each video screen
                              defaults to '16em'
        video_speed:          speed of videos in frames per second
                              defaults to 12
        policy_display_height: css height of bar displaying policy
                              defaults to '2em'
        policy_display_width: css width of bar displaying policy
                              defaults to '40em'
        navigator_width:      css width of navigator bar
                              defaults to '24em'
        scrubber_height:      css height of each scrubber
                              defaults to '4em'
        scrubber_width:       css width of each scrubber
                              defaults to '48em'
        scrubber_visible_duration: number of frames visible in each scrubber
                              defaults to 256
        legend_item_height:   css height of each legend item
                              defaults to '6em'
        legend_item_width:    css width of each legend item
                              defaults to '6em'
        feature_viewer_height: css height of feature visualizations in the popup
                              defaults to '40em'
        feature_viewer_width: css width of feature visualizations in the popup
                              defaults to '40em'
        attribution_weight:   css opacity of attribution when overlaid on
                              observations (taking into account the fact that
                              attribution is mostly transparent)
                              defaults to 0.9
        graph_colors:         dictionary specifying css colors of graphs of each
                              type
                              defaults to {
                                  'v': 'green',
                                  'action': 'red',
                                  'action_group': 'orange',
                                  'advantage': 'blue'
                              }
        trajectory_color:     css color of text displaying trajectory
                              information such as actions and rewards
                              defaults to 'blue'
    """
    import tensorflow as tf
    from mpi4py import MPI
    from baselines.common.mpi_util import setup_mpi_gpus

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    setup_mpi_gpus()

    exn = None
    if rank == 0 and load_kwargs.get("resample", True):
        kwargs = load(checkpoint_path,
                      trajectories_kwargs=trajectories_kwargs,
                      observations_kwargs=observations_kwargs,
                      **load_kwargs)
        comm.barrier()
    else:
        comm.barrier()
        load_kwargs["resample"] = False
        try:
            kwargs = load(checkpoint_path,
                          trajectories_kwargs=trajectories_kwargs,
                          observations_kwargs=observations_kwargs,
                          **load_kwargs)
        except tf.errors.NotFoundError as e:
            exn = e
            kwargs = None
    errors = comm.allreduce(0 if exn is None else 1, op=MPI.SUM)
    if errors == size:
        raise FileNotFoundError from exn
    elif errors > 0:
        kwargs = comm.bcast(kwargs, root=0)
    kwargs["output_dir"] = output_dir
    kwargs.update(generate_kwargs)

    generate(**kwargs)
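For reference, a minimal hypothetical invocation of run; the checkpoint path and output directory are placeholders:

run("checkpoints/coinrun.jd",
    output_dir="interfaces/coinrun",
    trajectories_kwargs={"num_envs": 8, "num_steps": 512},
    observations_kwargs={"num_envs": 32, "num_obs": 128, "obs_every": 128},
    nmf_features=8)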
Example #9
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 30_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--use', type=str, default="randcrop")
    parser.add_argument('--log_interval', type=int, default=20)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=int(-1))

    args = parser.parse_args()

    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    if not args.total_tsteps:
        args.total_tsteps = timesteps_per_proc  ## fall back to the default timesteps_per_proc (30_000_000) if not specified in args

    run_ID = 'run_' + str(args.run_id).zfill(2)
    ## select which ppo to use:
    agent_str = args.use
    LOG_DIR = join("log", agent_str, "train")
    save_model = join("log", agent_str,
                      "saved_{}_v{}.tar".format(agent_str, args.run_id))
    ppo_func = PPO_FUNCs[agent_str]
    load_path = None
    if args.load_id > -1:
        load_path = join("log", agent_str,
                         "saved_{}_v{}.tar".format(agent_str, args.load_id))

    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(LOG_DIR, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.info("\n Saving model to file {}".format(save_model))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto(
        log_device_placement=True)  #device_count={'GPU':0, 'XLA_GPU':0})
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    #sess.__enter__()

    logger.info(venv.observation_space)
    logger.info("training")
    with sess.as_default():
        model = ppo_func.learn(
            sess=sess,
            env=venv,
            network=None,
            total_timesteps=args.total_tsteps,
            save_interval=1000,
            nsteps=nsteps,
            nminibatches=nminibatches,
            lam=lam,
            gamma=gamma,
            noptepochs=ppo_epochs,
            log_interval=args.log_interval,
            ent_coef=ent_coef,
            # clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            # update_fn=None,
            # init_fn=None,
            save_path=save_model,
            load_path=load_path,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
        model.save(save_model)
Example #10
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = (128 // 8)
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 1_000_000
    use_vf_clipping = True
    dist_mode = "easy"
    env_name = "visual-cartpole"


    num_levels = 100
    # disc_coeff = None
    disc_coeff = 0.
    if disc_coeff is None:
        LOG_DIR = "/home/josh/" + env_name + "/" + env_name + "_disc_coeff_ramping2_num_levels_" + str(num_levels) + "_nsteps_" + str(nsteps)
    else:
        LOG_DIR = "/home/josh/" + env_name + "_easy_vae/" + env_name + "_disc_coeff_" + str(disc_coeff) + "_num_levels_" + str(num_levels) + "_nsteps_" + str(nsteps)

    test_worker_interval = 0

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'tensorboard'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")

    if env_name == "visual-cartpole":
        venv = gym.vector.make('cartpole-visual-v1', num_envs=num_envs, num_levels=num_levels)
        venv.observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
        venv.action_space = gym.spaces.Discrete(2)
    else:
        venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=0, distribution_mode=dist_mode)
        venv = VecExtractDictObs(venv, "rgb")


    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    if env_name == "visual-cartpole":
        test_venv = gym.vector.make('cartpole-visual-v1', num_envs=num_envs, num_levels=0)
        test_venv.observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
        test_venv.action_space = gym.spaces.Discrete(2)
    else:
        test_venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=0, start_level=1000, distribution_mode=dist_mode)
        test_venv = VecExtractDictObs(test_venv, "rgb")

    test_venv = VecMonitor(
        venv=test_venv, filename=None, keep_buf=100,
    )
    # test_venv = VecExtractDictObs(test_venv, "rgb")

    test_venv = VecNormalize(venv=test_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    sess = tf.Session(config=config)
    sess.__enter__()

    # conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)
    # conv_fn = lambda x: nature_cnn(x)

    conv_fn = lambda x: build_darla_vae(x, emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        eval_env=test_venv,
        num_levels=num_levels,
        disc_coeff=disc_coeff,
    )
Example #11
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    # timesteps_per_proc = 50_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--total_timesteps', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR,
                     format_strs=format_strs,
                     log_suffix="_total_timesteps_{}_num_levels_{}".format(
                         args.total_timesteps, num_levels))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating evaluation environment")
    eval_venv = ProcgenEnv(num_envs=num_envs,
                           env_name=args.env_name,
                           num_levels=100,
                           start_level=2000,
                           distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")

    eval_venv = VecMonitor(
        venv=eval_venv,
        filename=None,
        keep_buf=100,
    )

    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(
        x, is_train=False, depths=[16, 32, 32], emb_size=256)
    # change conv_fn so that it is set to testing mode (is_train=False)
    logger.info("testing")
    model = ppo2.learn(env=venv,
                       eval_env=eval_venv,
                       network=conv_fn,
                       total_timesteps=args.total_timesteps,
                       save_interval=0,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       load_path=MODEL_PATH)

    # Save the model
    model.save(
        "test_dropout_model/model_total_timesteps_{}_num_levels_{}".format(
            args.total_timesteps, num_levels))
Example #12
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 20_000_000  # 200_000_000 for hard, 25_000_000 for easy
    use_vf_clipping = True
    LOG_DIR = './log/'

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--data_aug', type=str, default='normal')
    parser.add_argument('--exp_name', type=str, default='try1')
    parser.add_argument('--test_start_level', type=int,
                        default=200)  # 500 for hard / 200 for easy

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    #if args.num_levels < 50:
    #    timesteps_per_proc = 5_000_000

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    LOG_DIR += args.env_name + '/nlev_' + str(args.num_levels) + '_mode_'
    LOG_DIR += args.distribution_mode + '/' + args.data_aug + '/' + args.exp_name

    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    # eval env, unlimited levels
    eval_venv = ProcgenEnv(num_envs=num_envs,
                           env_name=args.env_name,
                           num_levels=0,
                           start_level=args.test_start_level,
                           distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")

    eval_venv = VecMonitor(
        venv=eval_venv,
        filename=None,
        keep_buf=100,
    )

    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        eval_env=eval_venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=62,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        data_aug=args.data_aug,
    )
Example #13
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 5_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env-name', type=str, default='bigfish')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num-levels', type=int, default=200)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--obs',
                        choices=['rgb', 'lbl', 'onehot_lbl'],
                        default='rgb')

    args = parser.parse_args()

    LOG_DIR = f'/raid0/dian/procgen_baseline/{args.env_name}/ppo_{args.obs}_{args.num_levels}_{SEED}'

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = args.num_levels

    # log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout']  # if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=args.env_name,
        num_levels=num_levels,
        start_level=args.start_level,
        distribution_mode=args.distribution_mode,
        rand_seed=SEED,
    )
    test_venv = ProcgenEnv(
        num_envs=num_envs,
        env_name=args.env_name,
        num_levels=0,
        start_level=args.start_level,
        distribution_mode=args.distribution_mode,
        rand_seed=SEED,
    )
    if args.obs == 'onehot_lbl':
        venv = VecExtractDictObsOnehot(venv, args.env_name)
        venv = VecMonitor(
            venv=venv,
            filename=None,
            keep_buf=100,
        )
        test_venv = VecExtractDictObsOnehot(test_venv, args.env_name)
        test_venv = VecMonitor(
            venv=test_venv,
            filename=None,
            keep_buf=100,
        )
    else:
        venv = VecExtractDictObs(venv, args.obs)
        venv = VecMonitor(
            venv=venv,
            filename=None,
            keep_buf=100,
        )
        venv = VecNormalize(venv=venv, ob=False)

        test_venv = VecExtractDictObs(test_venv, args.obs)
        test_venv = VecMonitor(
            venv=test_venv,
            filename=None,
            keep_buf=100,
        )
        test_venv = VecNormalize(venv=test_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        eval_env=test_venv,
        save_interval=100,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
Example #14
def eval_fn(load_path, args, env_name='fruitbot', distribution_mode='easy', num_levels=500, start_level=500, log_dir='./tmp/procgen', comm=None, num_trials=3, gui=False):

    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True
    vf_coef = 0.5
    max_grad_norm = 0.5

    mpi_rank_weight = 1
    log_interval = 1
    seed=None

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=1, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    logger.info(f"evaluating")

    set_global_seeds(seed)

    policy = build_policy(venv, conv_fn)

    # Get the nb of env
    nenvs = venv.num_envs
    # Get state_space and action_space
    ob_space = venv.observation_space
    ac_space = venv.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    from .alternate_ppo2.model import Model
    model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)

    if os.path.isfile(load_path):
        alt_ppo2.eval(
            network=conv_fn,
            nsteps=nsteps,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            gamma=gamma,
            lam=lam,
            log_interval=log_interval,
            nminibatches=nminibatches,
            noptepochs=ppo_epochs,
            load_path=load_path,
            mpi_rank_weight=mpi_rank_weight,
            comm=comm,
            clip_vf=use_vf_clipping,
            lr=learning_rate,
            cliprange=clip_range,
            policy=policy,
            nenvs=nenvs,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch=nbatch,
            nbatch_train=nbatch_train,
            model_fn=model_fn,
            model=model,
            num_trials=num_trials,
            num_levels=num_levels,
            start_level=start_level,
            gui=gui,
            args=args
        )
    elif os.path.isdir(load_path):
        for file in os.listdir(load_path):
            log_comm = comm.Split(0, 0)
            format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
            logger.configure(comm=log_comm, dir=log_dir+'/'+file, format_strs=format_strs)
            alt_ppo2.eval(
                network=conv_fn,
                nsteps=nsteps,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                gamma=gamma,
                lam=lam,
                log_interval=log_interval,
                nminibatches=nminibatches,
                noptepochs=ppo_epochs,
                load_path=load_path+'/'+file,
                mpi_rank_weight=mpi_rank_weight,
                comm=comm,
                clip_vf=use_vf_clipping,
                lr=learning_rate,
                cliprange=clip_range,
                policy=policy,
                nenvs=nenvs,
                ob_space=ob_space,
                ac_space=ac_space,
                nbatch=nbatch,
                nbatch_train=nbatch_train,
                model_fn=model_fn,
                model=model,
                num_trials=num_trials,
                num_levels=num_levels,
                start_level=start_level,
                gui=gui,
                args=args
            )
    else:
        print('Model path does not exist.')
    return
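
# Hedged usage sketch (not from the source): eval_fn accepts either a single
# checkpoint file or a directory of checkpoints; the paths and the args
# namespace below are illustrative placeholders.
if __name__ == '__main__':
    mpi_comm = MPI.COMM_WORLD
    eval_fn('./checkpoints/final_model',   # checkpoint file or directory of checkpoints
            args=argparse.Namespace(),     # placeholder for whatever alt_ppo2.eval expects
            env_name='fruitbot',
            distribution_mode='easy',
            num_levels=500,
            start_level=500,
            log_dir='./tmp/procgen_eval',
            comm=mpi_comm,
            num_trials=3,
            gui=False)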
Example #15
def train(comm=None, *, save_dir=None, **kwargs):
    """
    Train a model using Baselines' PPO2 and save a checkpoint file in the
    required format.

    There is one required kwarg: either env_name (for env_kind="procgen") or
    env_id (for env_kind="atari").

    Models for the paper were trained with 16 parallel MPI workers.

    Note: this code has not been well-tested.
    """
    kwargs.setdefault("env_kind", "procgen")
    kwargs.setdefault("num_envs", 64)
    kwargs.setdefault("learning_rate", 5e-4)
    kwargs.setdefault("entropy_coeff", 0.01)
    kwargs.setdefault("gamma", 0.999)
    kwargs.setdefault("lambda", 0.95)
    kwargs.setdefault("num_steps", 256)
    kwargs.setdefault("num_minibatches", 8)
    kwargs.setdefault("library", "baselines")
    kwargs.setdefault("save_all", False)
    kwargs.setdefault("ppo_epochs", 3)
    kwargs.setdefault("clip_range", 0.2)
    kwargs.setdefault("timesteps_per_proc", 1_000_000_000)
    kwargs.setdefault("cnn", "clear")
    kwargs.setdefault("use_lstm", 0)
    kwargs.setdefault("stack_channels", "16_32_32")
    kwargs.setdefault("emb_size", 256)
    kwargs.setdefault("epsilon_greedy", 0.0)
    kwargs.setdefault("reward_scale", 1.0)
    kwargs.setdefault("frame_stack", 1)
    kwargs.setdefault("use_sticky_actions", 0)
    kwargs.setdefault("clip_vf", 1)
    kwargs.setdefault("reward_processing", "none")
    kwargs.setdefault("save_interval", 10)

    if comm is None:
        comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    setup_mpi_gpus()

    if save_dir is None:
        save_dir = tempfile.mkdtemp(prefix="rl_clarity_train_")

    create_env_kwargs = kwargs.copy()
    num_envs = create_env_kwargs.pop("num_envs")
    venv = create_env(num_envs, **create_env_kwargs)

    library = kwargs["library"]
    if library == "baselines":
        reward_processing = kwargs["reward_processing"]
        if reward_processing == "none":
            pass
        elif reward_processing == "clip":
            venv = VecClipReward(venv=venv)
        elif reward_processing == "normalize":
            venv = VecNormalize(venv=venv, ob=False, per_env=False)
        else:
            raise ValueError(f"Unsupported reward processing: {reward_processing}")

        scope = "ppo2_model"

        def update_fn(update, params=None):
            if rank == 0:
                save_interval = kwargs["save_interval"]
                if save_interval > 0 and update % save_interval == 0:
                    print("Saving...")
                    params = get_tf_params(scope)
                    save_path = save_data(
                        save_dir=save_dir,
                        args_dict=kwargs,
                        params=params,
                        step=(update if kwargs["save_all"] else None),
                    )
                    print(f"Saved to: {save_path}")

        sess = create_tf_session()
        sess.__enter__()

        if kwargs["use_lstm"]:
            raise ValueError("Recurrent networks not yet supported.")
        arch = get_arch(**kwargs)

        from baselines.ppo2 import ppo2

        ppo2.learn(
            env=venv,
            network=arch,
            total_timesteps=kwargs["timesteps_per_proc"],
            save_interval=0,
            nsteps=kwargs["num_steps"],
            nminibatches=kwargs["num_minibatches"],
            lam=kwargs["lambda"],
            gamma=kwargs["gamma"],
            noptepochs=kwargs["ppo_epochs"],
            log_interval=1,
            ent_coef=kwargs["entropy_coeff"],
            mpi_rank_weight=1.0,
            clip_vf=bool(kwargs["clip_vf"]),
            comm=comm,
            lr=kwargs["learning_rate"],
            cliprange=kwargs["clip_range"],
            update_fn=update_fn,
            init_fn=None,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )
    else:
        raise ValueError(f"Unsupported library: {library}")

    return save_dir
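
# Hedged usage sketch (not from the source): with the default env_kind="procgen",
# env_name is the one required kwarg; the save_dir below is an illustrative path.
if __name__ == "__main__":
    ckpt_dir = train(
        comm=None,                      # train() falls back to MPI.COMM_WORLD
        save_dir="./rl_clarity_ckpts",  # omit to write to a temporary directory
        env_name="coinrun",
        timesteps_per_proc=25_000_000,  # shorter run than the 1_000_000_000 default
    )
    print("checkpoints saved to", ckpt_dir)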
Example #16
def main():
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
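    # Test workers get zero weight, so they do not contribute to the
    # MPI-averaged gradient updates; only training workers drive learning.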
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    env_name = args.env_name
    num_levels = 0 if is_test_worker else args.num_levels
    start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR + f'/{args.env_name}/run_{args.run_id}',
                     format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32])

    # Training
    logger.info("training")
    ppo2.Runner = NetRandRunner
    ppo2.build_policy = build_policy
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        model_fn=NetRandModel,
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
Example #17
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 1_000_000  ## now this counts steps in testing runs
    use_vf_clipping = True

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    ## default start_level is set past the training levels to test on unseen levels
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)

    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    ## Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch
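    # One rollout gathers nsteps transitions from each of the nenvs envs
    # (nbatch timesteps), so nrollouts rollouts cover roughly total_timesteps steps.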

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time() ## Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  ## different from random_ppo!
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
Example #18
def rollout_fn(num_steps, env_name, num_envs, distribution_mode, num_levels, start_level, timesteps_per_proc, is_test_worker=False, log_dir='/tmp/procgen', comm=None, load_path=None):
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs, filename="rollout")

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)

    logger.info("training")
    ppo2.rollout(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path=load_path,
        num_steps=num_steps,
        num_envs=num_envs, 
        env_name=env_name,
        num_levels=num_levels, 
        start_level=start_level, 
        distribution_mode=distribution_mode
    )
Example #19
def main(env_name, paint_vel_info, distribution_mode, num_levels, start_level,
         log_interval, iter_loss, arch, eval, num_envs, learning_rate,
         lr_schedule, ent_coef, gamma, lam, nsteps, nminibatches, ppo_epochs,
         clip_range, timesteps_per_proc, use_vf_clipping, _run, is_test_worker,
         timestep_factor):

    comm = MPI.COMM_WORLD
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    logger._run = _run

    # Configure logger
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir="{}/id_{}".format(LOG_DIR, _run._id),
                     format_strs=format_strs)

    # Add sacred logger:
    if log_comm.Get_rank() == 0:
        logger.get_current().output_formats.append(
            SacredOutputFormat(_run, timestep_factor))

    num_levels = 0 if is_test_worker else num_levels
    mpi_rank_weight = 0 if is_test_worker else 1
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      paint_vel_info=paint_vel_info,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn_with_ibac(
        x, iter_loss=iter_loss, arch=arch, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    ppo_iter.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        ## Iter
        iter_loss=iter_loss,
        arch=arch,
        _run=_run,
        ## Rest
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=log_interval,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        learning_rate=learning_rate,
        lr_schedule=lr_schedule,
        cliprange=clip_range,
        vf_coef=0.5,
        max_grad_norm=0.5,
        eval=eval,
    )
Example #20
def main():
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard',
            choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug', type=str, default='no_aug', 
            choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width', type=str, default='1x',
            choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup', type=str, default='procgen',
            choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode', type=str, default='nomix',
            choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    # JAG: Add second parameter beta to the beta distribution
    parser.add_argument('--mix_beta', type=float, default=0.2)

    # JAG: Parameters for adversarial RL
    # 1. The ending condition for adversarial gradient descent
    parser.add_argument('--adv_epsilon', type=float, default=5e-6)
    # 2. Learning rate for adversarial gradient descent
    parser.add_argument('--adv_lr', type=float, default=10)
    # 3. Adversarial penalty for observation euclidean distance
    parser.add_argument('--adv_gamma', type=float, default=0.01)
    # 4. We use adversarial training after adv_thresh epochs of PPO training
    parser.add_argument('--adv_thresh', type=int, default=50)
    # 5. If we use evaluation environment
    parser.add_argument('--eval_env', type=bool, default=True)
    parser.add_argument('--eval_levels', type=int, default=0)
    # 6. The ratio of adversarial augmented data
    # adv = 1 means we replace original data with adversarial data
    # adv = 0 means we do not use adversarial
    parser.add_argument('--adv_adv', type=float, default=0.5)
    # 7. The ratio of mixup original data with augmented data
    # adv = 1 means we use augmented obs and value
    # adv = 0 means we use original obs and value
    parser.add_argument('--adv_obs', type=float, default=1)
    parser.add_argument('--adv_value', type=float, default=1)
    # 8. Determine what percentage of environments we use (for generalization)
    # nenv = 1 means we use all the environments
    parser.add_argument('--adv_nenv', type=float, default=1)
    # 9. We test the first 500 epochs
    parser.add_argument('--adv_epochs', type=int, default=500)
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
                test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs
    )

    # Create env
    logger.info("creating environment")

    # JAG: Limit the maximum training levels
    train_levels = int(num_levels * args.adv_nenv)
    venv = ProcgenEnv(
            num_envs=num_envs, env_name=env_name, num_levels=train_levels,
            start_level=start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)


    # JAG: If we use eval_env
    if args.eval_env:
        eval_env = ProcgenEnv(
                num_envs=num_envs, env_name=env_name,
                num_levels=args.eval_levels, start_level=start_level,
                distribution_mode=args.distribution_mode)
        eval_env = VecExtractDictObs(eval_env, "rgb")
        eval_env = VecMonitor(venv=eval_env, filename=None, keep_buf=100)
        eval_env = VecNormalize(venv=eval_env, ob=False)
    else:
        eval_env = None

    # Feed parameters to a dictionary
    adv_ratio = {
            'adv': args.adv_adv,
            'obs': args.adv_obs,
            'value': args.adv_value,
            #'nenv': args.adv_nenv,
    }

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    conv_fn = lambda x: build_impala_cnn(
            x, depths=depths, use_bn=args.use_bn,
            randcnn=args.use_rand_conv and not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        use_rand_conv=args.use_rand_conv,
        model_fn=get_mixreg_model(
            mix_mode=args.mix_mode,
            mix_alpha=args.mix_alpha,
            mix_beta=args.mix_beta,
            use_l2reg=args.use_l2reg,
            l2reg_coeff=args.l2reg_coeff),
        # JAG: Pass adversarial parameters
        adv_epsilon=args.adv_epsilon,
        adv_lr=args.adv_lr,
        adv_gamma=args.adv_gamma,
        adv_thresh=args.adv_thresh,
        adv_ratio=adv_ratio,
        eval_env=eval_env,
        adv_epochs=args.adv_epochs,
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
Example #21
def train_fn(env_name: str,
             num_train_envs: int,
             n_training_steps: int,
             adr_config: ADRConfig = None,
             experiment_dir: str = None,
             tunable_params_config_path: str = None,
             log_dir: str = None,
             is_test_worker: bool = False,
             comm=None,
             save_interval: int = 1000,
             log_interval: int = 20,
             recur: bool = True):

    # Get the default ADR config if none is provided
    adr_config = ADRConfig() if adr_config is None else adr_config

    # Set up the experiment directory for this run. This will contain everything, from the domain configs for the
    # training environment and ADR evaluation environments to the logs. If the directory path is not provided, then
    # we'll make one and use the date-time name to make it unique.
    if experiment_dir is None:
        experiment_dir = pathlib.Path().absolute() / 'adr_experiments' / (
            'experiment-' + datetime_name())
        experiment_dir.mkdir(parents=True, exist_ok=False)
    else:
        experiment_dir = pathlib.Path(experiment_dir)

    # Make a config directory within the experiment directory to hold the domain configs
    config_dir = experiment_dir / 'domain_configs'
    config_dir.mkdir(parents=True, exist_ok=False)

    # Load the tunable parameters from a config file if it is provided, otherwise get the default for the given game.
    if tunable_params_config_path is None:
        try:
            tunable_params = DEFAULT_TUNABLE_PARAMS[env_name]
        except KeyError:
            raise KeyError(
                f'No default tunable parameters exist for {env_name}')
    else:
        raise NotImplementedError(
            'Currently no way to load tunable parameters from a configuration file'
        )

    # Make a default config for the given game...
    train_domain_config_path = config_dir / 'train_config.json'
    try:
        train_domain_config = DEFAULT_DOMAIN_CONFIGS[env_name]
        train_domain_config.to_json(train_domain_config_path)
    except KeyError:
        raise KeyError(f'No default config exists for {env_name}')

    # ...then load the initial bounds for the tunable parameters into the config.
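    # (e.g. a tunable parameter named 'num_enemies' -- an illustrative name --
    #  would contribute 'min_num_enemies' and 'max_num_enemies' entries)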
    params = {}
    for param in tunable_params:
        params['min_' + param.name] = param.lower_bound
        params['max_' + param.name] = param.upper_bound
    train_domain_config.update_parameters(params, cache=False)

    # Configure the logger if we are given a log directory
    if log_dir is not None:
        log_dir = experiment_dir / log_dir
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm,
                         dir=str(log_dir),
                         format_strs=format_strs)

    logger.info(f'env_name: {env_name}')
    logger.info(f'num_train_envs: {num_train_envs}')
    logger.info(f'n_training_steps: {n_training_steps}')
    logger.info(f'experiment_dir: {experiment_dir}')
    logger.info(f'tunable_params_config_path: {tunable_params_config_path}')
    logger.info(f'log_dir: {log_dir}')
    logger.info(f'save_interval: {save_interval}')

    n_steps = 256
    ent_coef = .01
    lr = 5e-4
    vf_coef = .5
    max_grad_norm = .5
    gamma = .999
    lmbda = .95
    n_minibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1

    logger.info('creating environment')
    training_env = ProcgenEnv(num_envs=num_train_envs,
                              env_name=env_name,
                              domain_config_path=str(train_domain_config_path))
    training_env = VecExtractDictObs(training_env, "rgb")
    training_env = VecMonitor(venv=training_env, filename=None, keep_buf=100)
    training_env = VecNormalize(venv=training_env, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.__enter__()

    def conv_fn(x):
        return build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    if recur:
        logger.info("Using CNN LSTM")
        conv_fn = cnn_lstm(nlstm=256, conv_fn=conv_fn)

    logger.info('training')
    ppo2_adr.learn(conv_fn,
                   training_env,
                   n_training_steps,
                   config_dir,
                   adr_config,
                   train_domain_config,
                   tunable_params,
                   n_steps=n_steps,
                   ent_coef=ent_coef,
                   lr=lr,
                   vf_coef=vf_coef,
                   max_grad_norm=max_grad_norm,
                   gamma=gamma,
                   lmbda=lmbda,
                   log_interval=log_interval,
                   save_interval=save_interval,
                   n_minibatches=n_minibatches,
                   n_optepochs=ppo_epochs,
                   clip_range=clip_range,
                   mpi_rank_weight=mpi_rank_weight,
                   clip_vf=use_vf_clipping)
Example #22
def main():
    num_envs = 64
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    total_timesteps = 1_000_000

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    parser.add_argument('--start_level', type=int, default=50)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=50)
    parser.add_argument('--use', type=str, default="randcrop")
    parser.add_argument('--arch', type=str, default="impala")
    parser.add_argument('--no_bn', dest='use_batch_norm', action='store_false')
    parser.add_argument('--netrand', dest='netrand', action='store_true')
    parser.set_defaults(use_batch_norm=True)

    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    arch = args.arch
    use_batch_norm = args.use_batch_norm
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)
    print(args.use)
    LOG_DIR = 'log/{}/test'.format(args.use)
    if not args.netrand:
        policy = CnnPolicy
    else:
        policy = RandomCnnPolicy
    load_model = "log/{}/saved_{}_v{}.tar".format(args.use, args.use,
                                                  args.load_id)

    comm = MPI.COMM_WORLD
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    model = Model(sess=sess,
                  policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  arch=arch,
                  use_batch_norm=use_batch_norm,
                  dropout=0)

    model.load(load_model)
    logger.info("Model pramas loaded from saved model: ", load_model)
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    aug_func=None)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
Example #23
def main():

    args = parse_config()
    run_dir = log_this(args, args.log_dir,
                       args.log_name + '_' + args.env_name + '_' + args.rm_id)

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=run_dir, format_strs=format_strs)

    logger.info("creating environment")

    venv = ProcgenEnv(num_envs=args.num_envs,
                      env_name=args.env_name,
                      num_levels=args.num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode,
                      use_sequential_levels=args.use_sequential_levels)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if args.rm_id:
        # load pretrained network
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        net = RewardNet().to(device)
        rm_path = glob.glob('./**/' + args.rm_id + '.rm', recursive=True)[0]
        net.load_state_dict(
            torch.load(rm_path, map_location=torch.device(device)))

        # use batch reward prediction function instead of the ground truth reward function
        # pass though sigmoid if needed
        if args.use_sigmoid:
            rew_func = lambda x: 1 / (1 + np.exp(-net.predict_batch_rewards(x)))
        else:
            rew_func = lambda x: net.predict_batch_rewards(x)

        ## Uncomment the line below to train an agent rewarded +1 per step (i.e. just for staying alive)
        # rew_func = lambda x: x.shape[0] * [1]

        venv = ProxyRewardWrapper(venv, rew_func)
    else:
        # true environment rewards will be used
        pass

    venv = VecNormalize(venv=venv, ob=False, use_tf=False)

    # do the rest of the training as normal
    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)

    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")

    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=args.timesteps_per_proc,
        save_interval=args.save_interval,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        lam=args.lam,
        gamma=args.gamma,
        noptepochs=args.ppo_epochs,
        log_interval=args.log_interval,
        ent_coef=args.ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=args.use_vf_clipping,
        comm=comm,
        lr=args.learning_rate,
        cliprange=args.clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path=args.load_path,
    )

    model.save(os.path.join(run_dir, 'final_model.parameters'))
Example #24
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    timesteps_per_proc = 5_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=0)
    parser.add_argument('--nupdates', type=int, default=0)
    parser.add_argument('--debug', default=False, action="store_true")
    parser.add_argument('--total_tsteps', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=int(-1))

    args = parser.parse_args()
    if not args.total_tsteps:
        args.total_tsteps = timesteps_per_proc  ## use the default timesteps_per_proc if not specified in args!

    if args.nupdates:
        timesteps_per_proc = int(args.nupdates * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    if args.debug:
        LOG_DIR = 'log/random/debug'
        SAVE_PATH = 'log/random/debug_random_v{}.tar'.format(args.run_id)
    else:
        LOG_DIR = 'log/random/train'
        SAVE_PATH = 'log/random/random_v{}.tar'.format(args.run_id)

    load_path = None
    if args.load_id > -1:
        load_path = 'log/random/random_v{}.tar'.format(args.load_id)
    test_worker_interval = args.test_worker_interval
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []
    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)
    logger.configure(dir=logpath, format_strs=format_strs)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    logger.info("\n Saving to file {}".format(SAVE_PATH))
    logger.info("\nSaved args at:\n\t{}\n".format(fpath))

    #logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    model = random_ppo.learn(
        env=venv,
        network=None,
        total_timesteps=args.total_tsteps,
        save_interval=2,  ## doesn't matter, only saving at the end
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        # clip_vf=use_vf_clipping,
        lr=learning_rate,
        cliprange=clip_range,
        #cliprange=lambda f : f * 0.2,
        # update_fn=None,
        # init_fn=None,
        save_path=SAVE_PATH,
        load_path=load_path,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
    model.save(SAVE_PATH)