Example #1
def scoped_init_procgen():
    # Create a ProcgenEnv inside a local scope, record how many file
    # descriptors are open right after initialization, then close the env.
    from procgen import ProcgenEnv
    env = ProcgenEnv(num_envs=2,
                     env_name="coinrun",
                     num_levels=12,
                     start_level=34)
    after_init = file_descriptor_count()

    env.close()
    return after_init
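
The helper file_descriptor_count() used above is not part of this excerpt; it comes from the surrounding test module. A minimal, Linux-only sketch of such a helper (an assumption for illustration, not the procgen test suite's actual implementation):

import os

def file_descriptor_count():
    # Count the open file descriptors of the current process by listing
    # /proc/self/fd (Linux-specific; assumed helper, for illustration only).
    return len(os.listdir("/proc/self/fd"))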
Example #2
def test_multi_speed(env_name, num_envs, benchmark):
    # Benchmark how quickly the vectorized environment steps when every
    # sub-environment repeatedly takes action 0.
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name)

    venv.reset()
    actions = np.zeros([venv.num_envs])

    def rollout(max_steps):
        step_count = 0
        while step_count < max_steps:
            _obs, _rews, _dones, _infos = venv.step(actions)
            step_count += 1

    benchmark(lambda: rollout(1000))

    venv.close()
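
The benchmark argument is the pytest-benchmark fixture, so this test is meant to be collected and run by pytest rather than called directly. A sketch of the imports and parametrization the surrounding module presumably supplies (the environment names and env counts below are illustrative assumptions, not taken from the source):

import numpy as np
import pytest
from procgen import ProcgenEnv

# Hypothetical parametrization; the real test module may use other values.
@pytest.mark.parametrize("env_name", ["coinrun", "starpilot"])
@pytest.mark.parametrize("num_envs", [1, 16])
def test_multi_speed(env_name, num_envs, benchmark):
    ...  # body as in Example #2 above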
Example #3
                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            if args.target_kl is not None:
                # Early-stop the update epochs once the approximate KL
                # divergence exceeds the configured threshold.
                if approx_kl > args.target_kl:
                    break

        y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
        writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
        writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
        writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
        writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
        writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
        writer.add_scalar("losses/explained_variance", explained_var, global_step)
        print("SPS:", int(global_step / (time.time() - start_time)))
        writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

    envs.close()
    writer.close()
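
The logged quantities approx_kl, old_approx_kl, and clipfracs are computed earlier in the minibatch loop, outside this excerpt. A sketch of the usual computation in this style of PPO update, assuming torch is imported and newlogprob, b_logprobs, mb_inds, clipfracs, and args.clip_coef are defined by the surrounding script:

logratio = newlogprob - b_logprobs[mb_inds]
ratio = logratio.exp()

with torch.no_grad():
    # Two estimators of the KL divergence between the old and new policies,
    # plus the fraction of samples whose probability ratio was clipped.
    old_approx_kl = (-logratio).mean()
    approx_kl = ((ratio - 1) - logratio).mean()
    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]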
Example #4
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    # newly defined
    vf_coef = 0.5
    max_grad_norm = 0.5
    ###########
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    # timesteps_per_proc = 50_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--total_timesteps', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, 
                     format_strs=format_strs,
                     log_suffix="_total_timesteps_{}_num_levels_{}".format(args.total_timesteps,
                                                                           num_levels))

    '''logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)'''

    logger.info("Creating dropout evaluation environment")
    eval_venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=100, start_level=2000, distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")

    eval_venv = VecMonitor(
        venv=eval_venv, filename=None, keep_buf=100,
    )

    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, is_train=False, depths=[16,32,32], emb_size=256)

    logger.info("testing dropout")
    

    
    policy = build_policy(eval_venv,conv_fn)

    nenvs = eval_venv.num_envs
    ob_space = eval_venv.observation_space
    ac_space = eval_venv.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    from baselines.ppo2.model import Model
    model_fn = Model  # modified from the baselines ppo2 learn() implementation

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)
    model.load(MODEL_PATH)
    eval_runner = Runner(env=eval_venv, model=model, nsteps=nsteps, gamma=.999, lam=.95)

    eval_epinfobuf = deque(maxlen=100)
    nupdates = args.total_timesteps//nbatch

    log_interval = 1
    for update in range(1, nupdates + 1):
        # single update to test
        eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()
        eval_epinfobuf.extend(eval_epinfos)
        if update % log_interval == 0 or update == 1:
            logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
            logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/total_timesteps', update * nbatch)
            logger.dumpkvs()
    eval_venv.close()
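
safemean is not defined in this excerpt; in OpenAI Baselines' ppo2 it is a small helper that returns NaN instead of warning when the episode-info buffer is still empty. A minimal equivalent:

import numpy as np

def safemean(xs):
    # Mean that degrades to NaN on an empty buffer instead of warning.
    return np.nan if len(xs) == 0 else np.mean(xs)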
Example #5
                 # Continuation of a baselines ppo2 Model(...) constructor call;
                 # the leading arguments fall outside this excerpt.
                 vf_coef=vf_coef,
                 max_grad_norm=max_grad_norm,
                 comm=comm,
                 mpi_rank_weight=mpi_rank_weight)

load_path = '000002400.ckpt'
if load_path is not None:
    model.load(load_path)

obs = training_env.reset()
dones = [False]
states = model.initial_state
import numpy as np
step = 0

rew = []
for _ in range(video_interval + video_length + 1):
    # model.step returns actions, value estimates, recurrent states, and neglogp
    actions, values, states, _ = model.step(obs, S=states, M=dones)
    obs[:], rewards, dones, infos = training_env.step(actions)
    rew.append(rewards)
    step += 1
    print(f"Steps: {step}")
    training_env.render()
    if dones[0]:
        break

print(np.mean(rew))
training_env.close()

# recorded_video = glob.glob(os.path.join('./recordings', "*.mp4"))