Example No. 1
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)
    env = make_atari(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
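
The entry point that calls this train() is not shown; a hypothetical driver, assuming baselines' atari_arg_parser from baselines.common.cmd_util, might look like this:

# Hypothetical driver for the train() above -- not part of the original example.
# Assumes atari_arg_parser provides --env, --num-timesteps and --seed.
from baselines.common.cmd_util import atari_arg_parser

def main():
    args = atari_arg_parser().parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
    main()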
Example No. 2
def main():
    logger.configure()
    env = make_atari('PongNoFrameskip-v4')
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        lr=1e-4,
        total_timesteps=int(1e7),
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )

    model.save('pong_model.pkl')
    env.close()
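
To reuse the saved pong_model.pkl, a hedged evaluation sketch could look like the following; it assumes this baselines version of deepq.learn accepts total_timesteps=0 together with a load_path argument and returns a callable act function that maps a batch of observations to a batch of actions:

# Hedged sketch, not part of the original example; the load_path /
# total_timesteps=0 combination and the model(obs[None])[0] call pattern are
# assumptions based on the baselines deepq "enjoy" scripts.
def enjoy():
    env = make_atari('PongNoFrameskip-v4')
    env = deepq.wrap_atari_dqn(env)
    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        total_timesteps=0,
        load_path='pong_model.pkl',
    )
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(model(obs[None])[0])
            episode_rew += rew
        print('Episode reward', episode_rew)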
Example No. 3
def main():
    parser = atari_arg_parser()
    parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn')
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
        policy=args.policy)
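
The train() helper called here is not part of the example; a minimal sketch consistent with these arguments, modeled on baselines' ppo2 Atari runner (every hyperparameter below is an assumption), might be:

# Hypothetical train() matching the call above; all settings are assumptions.
def train(env_id, num_timesteps, seed, policy):
    from baselines.common.cmd_util import make_atari_env
    from baselines.common.vec_env.vec_frame_stack import VecFrameStack
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy_cls = {'cnn': CnnPolicy, 'lstm': LstmPolicy,
                  'lnlstm': LnLstmPolicy, 'mlp': MlpPolicy}[policy]
    ppo2.learn(policy=policy_cls, env=env, nsteps=128, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
               ent_coef=0.01, lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
    env.close()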
Example No. 4
def main():
    parser = atari_arg_parser()
    parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
    parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
        policy=args.policy, lrschedule=args.lrschedule, num_env=16)
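
Likewise, the train() used here is not shown; a hedged sketch along the lines of baselines' a2c Atari runner could be (import paths and hyperparameters are assumptions and have moved between baselines versions):

# Hypothetical train() matching the call above; the policy classes may live in
# baselines.a2c.policies or baselines.ppo2.policies depending on the version.
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    from baselines.a2c.a2c import learn
    from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
    from baselines.common.cmd_util import make_atari_env
    from baselines.common.vec_env.vec_frame_stack import VecFrameStack

    policy_fn = {'cnn': CnnPolicy, 'lstm': LstmPolicy, 'lnlstm': LnLstmPolicy}[policy]
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()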
Example No. 5
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    if args.play:
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            actions = model.step(obs)[0]
            obs[:]  = env.step(actions)[0]
            env.render()
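
The train() that returns (model, env) is not included; a minimal sketch assuming baselines' ppo2 MuJoCo runner (all settings below are assumptions) might look like:

# Hypothetical train() returning (model, env) as used above; every detail is an
# assumption modeled on the baselines ppo2 MuJoCo runner.
def train(env_id, num_timesteps, seed):
    import gym
    import tensorflow as tf
    from baselines import bench, logger
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    tf.Session(config=config).__enter__()

    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    model = ppo2.learn(policy=MlpPolicy, env=env, nsteps=2048, nminibatches=32,
                       lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
                       ent_coef=0.0, lr=3e-4, cliprange=0.2,
                       total_timesteps=num_timesteps)
    return model, env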
Example No. 6
def main(args):
    # configure logger, disable logging in child MPI processes (with rank > 0)

    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)
    extra_args = parse_cmdline_kwargs(unknown_args)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        obs = env.reset()

        state = model.initial_state if hasattr(model, 'initial_state') else None
        dones = np.zeros((1,))

        episode_rew = 0
        while True:
            if state is not None:
                actions, _, state, _ = model.step(obs, S=state, M=dones)
            else:
                actions, _, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(actions)
            episode_rew += rew[0] if isinstance(env, VecEnv) else rew
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done
            if done:
                print('episode_rew={}'.format(episode_rew))
                episode_rew = 0
                obs = env.reset()

    env.close()

    return model
Example No. 7
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
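
A hypothetical entry point for this train(), assuming baselines' mujoco_arg_parser from baselines.common.cmd_util, could be:

# Hypothetical driver, not part of the original example.
from baselines.common.cmd_util import mujoco_arg_parser

def main():
    args = mujoco_arg_parser().parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
    main()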
Example No. 8
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()
Example No. 9
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))
   
    args = parser.parse_args()
    
    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:       
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()        
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()
Example No. 10
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, save_path=None):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)

    logger.configure(dir=save_path)     # configure the logger output directory
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if load_path is not None:
        print('load model from ' + load_path)
        model.load(load_path)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps//nbatch
    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos, max_rewards, mean_rewards, median_rewards = runner.run() #pylint: disable=E0632
        '''
        # array to img
        from PIL import Image
        i = 0
        for ob in obs:
            for j in range(4):
                o = ob[:, :, j*3:j*3+3]
                img = Image.fromarray(o)
                img.save('input_img_' + str(i) + '.png')
                i += 1
        '''
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None: # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('max_rewards', max_rewards)
            logger.logkv('mean_rewards', mean_rewards)
            logger.logkv('median_rewards', median_rewards)
            # logger.logkv('env_stage', env.envs[0].env.env.env.env.env.env.statename)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
    return model
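
A hedged usage sketch for this modified learn(): it assumes an Atari VecEnv, the CnnPolicy from baselines.ppo2.policies, and the custom Runner this module depends on (its run() must also report max/mean/median rewards); every value below is a placeholder.

# Hypothetical invocation of the learn() above; environment setup and
# hyperparameters are assumptions, not part of the original example.
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.ppo2.policies import CnnPolicy

venv = VecFrameStack(make_atari_env('BreakoutNoFrameskip-v4', 8, 0), 4)
model = learn(policy=CnnPolicy, env=venv, nsteps=128, total_timesteps=int(1e7),
              ent_coef=0.01, lr=2.5e-4, cliprange=0.1,
              save_interval=50, save_path='/tmp/ppo2_custom_logs')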
Example No. 11
    env = gym.make(alg_kwargs['env_name'])
    env.set_episode_size(alg_kwargs['nsteps'])
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir()), allow_early_resets=True)

    return env

# Get dictionary from baselines/acktr/defaults
alg_kwargs = defaults.mara_mlp()

# Create needed folders
timedate = datetime.now().strftime('%Y-%m-%d_%Hh%Mmin')
logdir = '/tmp/ros2learn/' + alg_kwargs['env_name'] + '/acktr/' + timedate

# Generate tensorboard file
format_strs = os.getenv('MARA_LOG_FORMAT', 'stdout,log,csv,tensorboard').split(',')
logger.configure(os.path.abspath(logdir), format_strs)

with open(logger.get_dir() + "/parameters.txt", 'w') as out:
    out.write(
        'num_layers = ' + str(alg_kwargs['num_layers']) + '\n'
        + 'num_hidden = ' + str(alg_kwargs['num_hidden']) + '\n'
        + 'layer_norm = ' + str(alg_kwargs['layer_norm']) + '\n'
        + 'nsteps = ' + str(alg_kwargs['nsteps']) + '\n'
        + 'nprocs = ' + str(alg_kwargs['nprocs']) + '\n'
        + 'gamma = ' + str(alg_kwargs['gamma']) + '\n'
        + 'lam = ' + str(alg_kwargs['lam']) + '\n'
        + 'ent_coef = ' + str(alg_kwargs['ent_coef']) + '\n'
        + 'vf_coef = ' + str(alg_kwargs['vf_coef']) + '\n'
        + 'vf_fisher_coef = ' + str(alg_kwargs['vf_fisher_coef']) + '\n'
        + 'lr = ' + str(alg_kwargs['lr']) + '\n'
        + 'max_grad_norm = ' + str(alg_kwargs['max_grad_norm']) + '\n'
Example No. 12
    gamma = 0.995

    env = RemoteVecEnv([make_env] * num_cpus)
    env = VecNormalize(env, ret=True, gamma=gamma)

    set_global_seeds(seed)
    policy = policies.MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=num_cpus-num_casks,
               lam=0.95,
               gamma=gamma,
               noptepochs=4,
               log_interval=1,
               vf_coef=0.5,
               ent_coef=0.0,
               lr=3e-4,
               cliprange=0.2,
               save_interval=2,
               load_path="./logs/course_6/00244",
               total_timesteps=num_timesteps,
               num_casks=num_casks)


if __name__ == "__main__":
    ray.init()
    configure(dir="./logs")
    train(int(1e6), 60730)
Example No. 13
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' + 
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
Example No. 14
def train(env_id, num_timesteps, seed, render):
    env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)
    env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf,
            gamma=0.99, lam=0.97, timesteps_per_batch=8000,
            desired_kl=0.0002,
            num_timesteps=num_timesteps,
            animate=False)

        env.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
    parser.add_argument('--render', help='Choose whether to render', type=bool, default=False)
    args = parser.parse_args()
    logger.configure(dir=DIRECTORY)
    train(args.env, num_timesteps=5e7, seed=args.seed, render=args.render)
Example No. 15
def main():
    expdir = os.path.join("/home/wulfebw/experiments", "ssb64_005", "run_003")
    os.makedirs(expdir, exist_ok=True)
    monitor_filepath = os.path.join(expdir, "monitor.csv")
    movie_dir = os.path.join(expdir, "movies")
    os.makedirs(movie_dir, exist_ok=True)
    load_filepath = None
    # load_filepath = "/home/wulfebw/experiments/ssb64_004/run_006/checkpoints/00100"

    # This configures baselines logging.
    configure(dir=expdir)
    # Creating the session here prevents tf from claiming all of the gpu memory,
    # which otherwise causes a failure in the emulator (the exact reason is
    # unclear, since the emulator running with angrylion does not appear to use
    # gpu memory itself).
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            intra_op_parallelism_threads=1,
                                            inter_op_parallelism_threads=1,
                                            gpu_options=gpu_options))

    def make_env(rank, grayscale=True):
        retro.data.add_custom_integration("custom")
        env = retro.n64_env.N64Env(game="SuperSmashBros-N64",
                                   use_restricted_actions=retro.Actions.MULTI_DISCRETE,
                                   inttype=retro.data.Integrations.CUSTOM,
                                   obs_type=retro.Observations.IMAGE)
        env = wrap_n64(env, grayscale=grayscale)
        env = wrap_monitoring_n64(env, monitor_filepath=monitor_filepath, movie_dir=movie_dir)
        return env

    def make_vec_env(nenvs=4, recurrent=False, grayscale=True, frame_stack=4, frame_diff=False):
        venv = SubprocVecEnv([lambda: make_env(rank, grayscale=grayscale) for rank in range(nenvs)])
        # Uncomment this line in place of the one above for debugging.
        # venv = DummyVecEnv([lambda: make_env(0)])

        if not recurrent:
            if frame_diff:
                venv = VecFrameDiff(venv)
            else:
                # Perform the frame stack at the vectorized-environment level rather
                # than in each individual environment, which should mean fewer images
                # are communicated across processes.
                venv = VecFrameStack(venv, frame_stack)
        return venv

    network_name = "impala_cnn"
    recurrent = "lstm" in network_name
    grayscale = False
    frame_stack = 2
    frame_diff = False
    venv = make_vec_env(nenvs=16,
                        recurrent=recurrent,
                        grayscale=grayscale,
                        frame_stack=frame_stack,
                        frame_diff=frame_diff)
    ppo2.learn(network=network_name,
               env=venv,
               total_timesteps=int(10e6),
               nsteps=256,
               nminibatches=8,
               lam=0.95,
               gamma=0.999,
               noptepochs=3,
               log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 5e-4,
               cliprange=0.2,
               save_interval=10,
               load_path=load_filepath)
Example No. 16
def launch(n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True):
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    logdir = osp.join(osp.dirname(__file__),
                      'log/reach_her_%i_%s' % (seed, timestamp))
    print("Logging to %s." % logdir)

    env = "SawyerPickAndPlace-v1"

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        # try:
        #     whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        # except CalledProcessError:
        # fancy version of mpi call failed, try simple version
        whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
Example No. 17
    )  # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--num-timesteps', type=int, default=None)
    boolean_flag(parser, 'evaluation', default=False)

    # add reward_param
    parser.add_argument('--reward_param_scaling', type=float, default=0.5)
    parser.add_argument('--reward_param_thr', type=float, default=50.)
    parser.add_argument('--reward_param_type', type=str, default='const')
    boolean_flag(parser, 'my_render', default=True)

    args = parser.parse_args()

    if args.num_timesteps is not None:
        assert (args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles *
                args.nb_rollout_steps)

    dict_args = vars(args)

    del dict_args['num_timesteps']

    return dict_args


if __name__ == '__main__':
    args = parse_args()

    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()

    run(**args)
Example No. 18
def run_task(vv, log_dir=None, exp_name=None):
    override_params = {}
    # Fork for multi-CPU MPI implementation.
    if vv['num_cpu'] > 1:
        whoami = mpi_fork(vv['num_cpu'])
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    log_dir = '/media/part/cmu_ri/deep/deep_RL/data/local/square2d-debug/square2d_debug_2018_06_17/'  #hack for now, fix later

    # Configure logging
    if rank == 0:
        if log_dir or logger.get_dir() is None:
            from pathlib import Path
            logger.configure(dir=log_dir, exp_name=exp_name)
    else:
        if log_dir or logger.get_dir() is None:
            from pathlib import Path
            logger.configure(dir=log_dir, exp_name=exp_name)

    logdir = logger.get_dir()
    #logdir = ''# a quick hack, fix later
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = vv['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = vv['env_name']
    params['replay_strategy'] = vv['replay_strategy']
    params['replay_sample_strategy'] = vv['replay_sample_strategy']
    params['reward_type'] = vv['reward_type']
    params['replay_k'] = vv['replay_k']
    if vv['network'] == 'fc':
        params['network_class'] = 'baselines.her.actor_critic:ActorCritic'
    elif vv['network'] == 'cnn_fc':
        params['network_class'] = 'baselines.her.cnn_actor_critic:CNNActorCritic'

    if vv['env_name'] in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[
            vv['env_name']])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'variant.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    shapes = config.configure_shapes(params)
    dims = shapes_to_dims(shapes)
    policy = config.configure_ddpg(dims=dims,
                                   shapes=shapes,
                                   params=params,
                                   clip_return=vv['clip_return'])

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=vv['n_epochs'],
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=vv['policy_save_interval'],
          save_policies=vv['save_policies'])
Example No. 19
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='SunblazeCartPoleRandomNormal-v0')
    parser.add_argument('--seed',
                        type=int,
                        help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    parser.add_argument('--processes', default=1, help='int or "max" for all')

    # EPOpt specific
    parser.add_argument('--epsilon', type=float, default=1.0)
    # The EPOpt paper kept epsilon=1 until iters>100 (max 200 iters)
    parser.add_argument('--activate',
                        type=int,
                        default=100,
                        help='How long to fix epsilon to 1.0 before e')
    parser.add_argument(
        '--paths',
        type=int,
        default=100,
        help='number of trajectories to sample from each iteration')
    parser.add_argument('--algorithm',
                        type=str,
                        choices=['ppo2', 'a2c'],
                        default='ppo2',
                        help='Inner batch policy optimization algorithm')
    parser.add_argument('--policy',
                        choices=['mlp', 'lstm'],
                        default='mlp',
                        help='Policy architecture')

    # Episode-modification specific:
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--total-episodes', type=int, default=5e4)

    # RL algorithm hyperparameters
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--nsteps', type=int, default=2048)
    parser.add_argument('--ent-coef',
                        type=float,
                        default=1e-2,
                        help='Only relevant for A2C')
    parser.add_argument('--nminibatches',
                        type=int,
                        default=32,
                        help='Only relevant for PPO2')

    args = parser.parse_args()

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin': ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train_epopt(
        args.env,
        total_episodes=args.total_episodes,
        seed=seed,
        lr=args.lr,
        epsilon=args.epsilon,
        activate_at=args.activate,
        paths=args.paths,
        algorithm=args.algorithm,
        policy=args.policy,
        ncpu=ncpu,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        ent_coef=args.ent_coef,  # default 0.01 in baselines, 0.0001 in chainer A3C
    )
Example No. 20
def main():
    import neptune

    parser = argparse.ArgumentParser(argument_default=None)
    parser.add_argument('--config', action='append', help='Gin config files.')
    parser.add_argument('--debug', action='store_true', default=False)
    cmd_args, unknown = parser.parse_known_args()
    debug = cmd_args.debug
    spec_path = cmd_args.config[0]

    if not debug:
        try:
            with open(spec_path, 'rb') as f:
                import cloudpickle
                specification = cloudpickle.load(f)
        except pickle.UnpicklingError:
            with open(spec_path) as f:
                vars_ = {'script': os.path.basename(spec_path)}
                exec(f.read(), vars_)  # pylint: disable=exec-used
                specification = vars_['experiments_list'][0].to_dict()
                print(
                    'NOTE: Only the first experiment from the list will be run!'
                )
        parameters = specification['parameters']
    else:
        print("debug run")
        parameters = dict(env_id="toy_mr", env_size=None)

    class MockArgs(object):
        def add(self, key, value):
            setattr(self, key, value)

    args = MockArgs()

    args.add('env', parameters["env_id"])  # 'chain_env' 'toy_mr'
    args.add('env_size', parameters["env_size"])
    args.add('seed', 0)
    args.add('max_episode_steps', 300)

    args.add('num_timesteps', int(1e12))
    args.add('num_env', 32)
    args.add('use_news', 0)
    args.add('gamma', 0.99)
    args.add('gamma_ext', 0.999)
    args.add('lam', 0.95)
    args.add('update_ob_stats_every_step', 0)
    args.add('update_ob_stats_independently_per_gpu', 0)
    args.add('update_ob_stats_from_random_agent', 1)
    args.add('proportion_of_exp_used_for_predictor_update', 1.)
    args.add('tag', '')
    args.add(
        'policy',
        'cnn',
    )
    args.add('int_coeff', 1.)
    args.add('ext_coeff', 2.)
    args.add('dynamics_bonus', 0)

    if not debug:
        # TODO read more from specification
        print("running with neptune")
        neptune.init(
            project_qualified_name="pmtest/planning-with-learned-models")
        neptune.create_experiment(
            name=specification['name'],
            tags=specification['tags'],
            params=specification['parameters'],
            upload_stdout=False,
            upload_stderr=False,
        )
        neptune.send_metric("test", 777)
        baselines_format_strs = ['log', 'csv']
    else:
        print("running without neptune")
        baselines_format_strs = ['stdout', 'log', 'csv']

    logger.configure(dir="out", format_strs=baselines_format_strs)

    seed = 10000 * args.seed  # + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               env_size=args.env_size,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          use_neptune=(not debug))
Example No. 21
parser.add_argument('algo_type', type=str)
parser.add_argument('start', type=int)
args = parser.parse_args()

rs = 100 * (args.start + 1)
np.random.seed(rs)

with open('experiment_params.yaml', 'r') as stream:
    params = yaml.load(stream)
env_type = 'mujoco'
env_id = params[args.mjenv]
params['algo_type'] = args.algo_type
params['rseed'] = rs

log_path = os.path.join('results_NPG', env_id, args.algo_type, 'rs_' + str(rs))
logger.configure(dir=os.path.join(os.getcwd(), log_path))

# dump the params once in the folder
with open(os.path.join(log_path, 'params.yaml'), 'w') as outfile:
    yaml.dump(params, outfile, default_flow_style=False)

env = make_vec_env(env_id, env_type, 1, rs)

if args.algo_type == 'HOOF_All':
    learnt_model = learn_hoof_all(
        env,
        env_type,
        timesteps_per_batch=params['batch_size'],
        total_timesteps=params['total_ts'],
        kl_range=params['kl_bound'],
        gamma_range=params['discount_bound'],
Example No. 22
                                    inter_op_parallelism_threads=1,
                                    intra_op_parallelism_threads=1))

U.initialize()

# Get dictionary from baselines/acktr/defaults
defaults = defaults.mara_mlp()

# Create needed folders
try:
    logdir = defaults['trained_path'].split('checkpoints')[
        0] + 'results' + defaults['trained_path'].split('checkpoints')[1]
except:
    logdir = '/tmp/ros2learn/' + defaults['env_name'] + '/acktr_results/'
finally:
    logger.configure(os.path.abspath(logdir))
    csvdir = logdir + "csv/"

csv_files = [
    csvdir + "det_obs.csv", csvdir + "det_acs.csv", csvdir + "det_rew.csv"
]
if not os.path.exists(csvdir):
    os.makedirs(csvdir)
else:
    for f in csv_files:
        if os.path.isfile(f):
            os.remove(f)


def make_env():
    env = gym.make(defaults['env_name'])
Example No. 23
    parser.add_argument('--actor-lr', type=float, default=0.01)
    parser.add_argument('--critic-lr', type=float, default=0.005)
    boolean_flag(parser, 'popart', default=False)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--reward-scale', type=float, default=1.)
    parser.add_argument('--clip-norm', type=float, default=None)
    parser.add_argument('--nb-epochs', type=int, default=1)  # with default settings, perform 1M steps total
    parser.add_argument('--nb-epoch-cycles', type=int, default=1000)
    parser.add_argument('--nb-train-steps', type=int, default=1)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-eval-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-rollout-steps', type=int, default=1)  # per epoch cycle and MPI worker
    parser.add_argument('--noise-type', type=str, default='ou_0.2')  # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--num-timesteps', type=int, default=None)
    boolean_flag(parser, 'evaluation', default=False)
    args = parser.parse_args()
    # we don't directly specify timesteps for this script, so make sure that if we do specify them
    # they agree with the other parameters
    if args.num_timesteps is not None:
        assert(args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps)
    dict_args = vars(args)
    del dict_args['num_timesteps']
    return dict_args


if __name__ == '__main__':
    args = parse_args()
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure(dir=os.path.join(get_default_data_directory("dppg_baselines_main_editted")))
    # Run actual script.
    run(**args)
Example No. 24
def main(args):
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    if args.env_id == "MsPacman-v0":
        from gym import wrappers
        env = wrappers.Monitor(
            env,
            "/tmp",
            force=True,
            video_callable=False,
        )
        from baselines.common.wrappers import wrap_deepmind
        env = wrap_deepmind(env)
    if args.encode_1d_obs:
        from vae.AE import Autoencoder
        from baselines.common.vae_encoding_wrapper import VAEEncodingWrapper
        ae = Autoencoder((84, 84, 4), [(64, 8, 2), (128, 6, 3), (128, 4, 2),
                                       (128, 3, 1)], [1000, 500], 100)
        ae.load_model("../../vae/runs/5e-4decay/model")
        U.update_initialized_parameters()
        env = VAEEncodingWrapper(env, ae)
    env.seed(args.seed)

    # env = bench.Monitor(env, logger.get_dir() and
    #                     osp.join(logger.get_dir(), "monitor.json"))

    gym.logger.setLevel(logging.WARN)

    if args.log_dir != Log_dir:
        log_dir = osp.join(Log_dir, args.log_dir)
        save_dir = osp.join(Checkpoint_dir, args.log_dir)
    else:
        log_dir = Log_dir
        save_dir = Checkpoint_dir

    args, rnd_iter, dyn_norm = modify_args(args)

    logger.log(f"rnd_cnn_type: {args.rnd_cnn_type}")
    logger.log(f"policy_cnn_type: {args.policy_cnn_type}")
    logger.log(f"rnd_critic_scale: {args.rnd_critic_scale}")
    logger.log(f"policy_hidden_size: {args.policy_hidden_size}")

    def policy_fn(name, ob_space, ac_space):
        # return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
        #                             hid_size=args.policy_hidden_size, num_hid_layers=2, popart=args.popart, gaussian_fixed_var=args.fixed_var)
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=[150, 50],
                                    num_hid_layers=2,
                                    popart=args.popart,
                                    gaussian_fixed_var=args.fixed_var,
                                    activation="relu")

    def policy_fn_cnn(name, ob_space, ac_space):
        return cnn_policy.CNNPolicy(name=name,
                                    policy_cnn_type=args.policy_cnn_type,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2,
                                    popart=args.popart,
                                    gaussian_fixed_var=args.fixed_var)

    if args.task == 'train':

        if args.env_id == "MsPacman-v0":
            if args.encode_1d_obs:
                exp_data = get_exp_data_atari(DATASET_PATH, ae)
            else:
                exp_data = get_exp_data_atari(DATASET_PATH)
        else:
            exp_data = get_exp_data(
                osp.join(osp.dirname(osp.realpath(__file__)),
                         "../../data/%s.pkl" % args.env_id))

        task_name = get_task_name(args)

        task_name += "_rndcnn" + str(args.rnd_cnn_type) + "_mlpcnn" + str(
            args.policy_cnn_type)

        logger.configure(dir=log_dir,
                         log_suffix=task_name,
                         format_strs=["log", "stdout"])
        if args.reward == 0:
            if args.env_id == "Humanoid-v2":
                critic = make_critic(env,
                                     exp_data,
                                     reward_type=args.reward,
                                     scale=2500)
            elif args.env_id == "Reacher-v2":
                critic = make_critic(env,
                                     exp_data,
                                     rnd_hid_size=20,
                                     hid_size=20,
                                     reward_type=args.reward,
                                     scale=2500)
            elif args.env_id == "HalfCheetah-v2":
                critic = make_critic(env,
                                     exp_data,
                                     rnd_hid_size=20,
                                     hid_size=20,
                                     reward_type=args.reward,
                                     scale=25000)
            elif args.env_id == "Ant-v2":
                critic = make_critic(env, exp_data, reward_type=args.reward)
            elif args.env_id == "MsPacman-v0":
                critic = make_critic(env,
                                     exp_data,
                                     hid_size=128,
                                     reward_type=args.reward,
                                     scale=args.rnd_critic_scale,
                                     CNN_critic=args.use_cnn,
                                     rnd_cnn_type=args.rnd_cnn_type)
            else:
                critic = make_critic(env, exp_data, reward_type=args.reward)
        else:
            if args.env_id == "Reacher-v2":
                critic = make_critic(env,
                                     exp_data,
                                     hid_size=100,
                                     reward_type=args.reward,
                                     scale=1000)
            if args.env_id == "Walker2d-v2":
                critic = make_critic(env,
                                     exp_data,
                                     hid_size=30,
                                     reward_type=args.reward,
                                     scale=100)
            if args.env_id == "HalfCheetah-v2":
                critic = make_critic(env,
                                     exp_data,
                                     hid_size=30,
                                     reward_type=args.reward,
                                     scale=1000)
            if args.env_id == "Hopper-v2":
                critic = make_critic(env,
                                     exp_data,
                                     hid_size=30,
                                     reward_type=args.reward,
                                     scale=1000)
            if args.env_id == "Ant-v2":
                critic = make_critic(env,
                                     exp_data,
                                     hid_size=128,
                                     reward_type=args.reward,
                                     scale=100)
            if args.env_id == "MsPacman-v0":
                critic = make_critic(env,
                                     exp_data,
                                     hid_size=128,
                                     reward_type=args.reward,
                                     scale=args.rnd_critic_scale,
                                     rnd_cnn_type=args.rnd_cnn_type)

        if args.use_cnn:
            policy = policy_fn_cnn
        else:
            policy = policy_fn
        train(env, args.seed, policy, critic, exp_data, args.g_step,
              args.d_step, args.policy_entcoeff, args.num_timesteps, save_dir,
              args.pretrained, args.BC_max_iter, args.gamma, rnd_iter,
              dyn_norm, task_name, args.use_cnn)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example No. 25
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
Example No. 26
def main():
    args = pybullet_arg_parser().parse_args()
    logger.configure(format_strs=['stdout', 'log', 'csv'],
                     log_suffix="PPO_NAC_Advantage-" + args.env)
    logger.log("Algorithm: PPO_NAC_Advantage-" + args.env)
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
Example No. 27
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument(
        '--init_policy',
        help='Initial Policy',
        default=
        'data/ppo_DartHumanWalker-v1241_energy15_vel5_5s_pdscale1_mirror4_up03fwd03ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_2s_dcon1_asinput_damping2kneethigh_thigh160knee100_curriculum_1xjoint_shoulder100_dqpen0_2kassist/policy_params.pkl'
    )
    parser.add_argument('--init_curriculum',
                        help='Initial Curriculum',
                        nargs='+',
                        default=[2000.0, 2000])
    parser.add_argument(
        '--ref_policy',
        help='Reference Policy',
        default=
        'data/ppo_DartHumanWalker-v1241_energy15_vel5_5s_pdscale1_mirror4_up03fwd03ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_2s_dcon1_asinput_damping2kneethigh_thigh160knee100_curriculum_1xjoint_shoulder100_dqpen0_2kassist/policy_params.pkl'
    )
    parser.add_argument('--ref_curriculum',
                        help='Reference Curriculum',
                        nargs='+',
                        default=[2000.0, 2000])
    parser.add_argument('--anc_thres',
                        help='Anchor Threshold',
                        type=float,
                        default=0.75)
    parser.add_argument('--prog_thres',
                        help='Progress Threshold',
                        type=float,
                        default=0.6)
    parser.add_argument('--batch_size',
                        help='Batch Size',
                        type=int,
                        default=2500)
    parser.add_argument('--max_iter',
                        help='Maximum Iteration',
                        type=int,
                        default=2000)
    parser.add_argument('--use_reftraj',
                        help='Use reference trajectory',
                        type=int,
                        default=0)
    args = parser.parse_args()
    logger.reset()

    logger.configure(
        'data/ppo_curriculum_150eachit_vel15_tvel1scale_up03fwd03ltl15_spinepen1_thighyawpen001_mirror4_runningavg1p5_2s_stride15_e1_'
        + args.env + '_' + str(args.seed) + '_' + str(args.anc_thres) + '_' +
        str(args.prog_thres) + '_' + str(args.batch_size))
    sess = U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    ob_space = env.observation_space
    ac_space = env.action_space

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8,
                9, 10, -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23, 28,
                29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36,
                37, 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52,
                58, 57, 59
            ]),
            action_permutation=np.array([
                -6, 7, -8, 9, 10, 11, -0.001, 1, -2, 3, 4, 5, -12, 13, -14,
                -19, 20, -21, 22, -15, 16, -17, 18
            ]))
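    # Note on the permutation arrays above (assumed convention): abs(entry) is the
    # source index, the sign marks a reflection, and the near-zero entries
    # 0.0001 / -0.001 stand in for a signed index 0. A hypothetical helper (not
    # part of mlp_mirror_policy) sketching how such an array becomes a mirroring matrix:
    def _permutation_matrix(perm):
        mat = np.zeros((len(perm), len(perm)))
        for row, v in enumerate(perm):
            mat[row, int(abs(v))] = np.sign(v)
        return mat  # e.g. mirrored_obs = _permutation_matrix(obs_perm).dot(obs)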

    policy = policy_fn('policy', ob_space, ac_space)
    init_curriculum = np.array(args.init_curriculum)
    ref_policy = policy_fn('ref_policy', ob_space, ac_space)
    ref_curriculum = np.array(args.ref_curriculum)

    policy_params = joblib.load(args.init_policy)
    ref_policy_params = joblib.load(args.ref_policy)
    U.initialize()
    cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0].
                                               name.find('/')]
    orig_scope = list(
        policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')]
    ref_scope = list(ref_policy_params.keys())[0][0:list(ref_policy_params.
                                                         keys())[0].find('/')]
    for i in range(len(policy.get_variables())):
        assign_op = policy.get_variables()[i].assign(
            policy_params[policy.get_variables()[i].name.replace(
                cur_scope, orig_scope, 1)])
        sess.run(assign_op)
        assign_op = ref_policy.get_variables()[i].assign(
            ref_policy_params[ref_policy.get_variables()[i].name.replace(
                'ref_' + cur_scope, ref_scope, 1)])
        sess.run(assign_op)

    anchor_threshold = args.anc_thres
    progress_threshold = args.prog_thres

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    curriculum_evolution = []

    env.env.env.anchor_kp = ref_curriculum
    ref_score = None
    ref_max_score = None
    reference_trajectory = None
    #if MPI.COMM_WORLD.Get_rank() == 0:
    if args.use_reftraj == 1:
        reference_trajectory = gen_reftraj(env, ref_policy, 299)
        env.env.reference_trajectory = reference_trajectory
    ref_score, ref_max_score = evaluate_policy(env, ref_policy, 24)
    ref_score = MPI.COMM_WORLD.bcast(ref_score, root=0)
    ref_max_score = MPI.COMM_WORLD.bcast(ref_max_score, root=0)
    reference_score = ref_score * progress_threshold
    reference_anchor_score = ref_score * anchor_threshold
    reference_max_score = ref_max_score * 0.9
    env.env.env.anchor_kp = init_curriculum
    reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
    env.env.reference_trajectory = reference_trajectory

    current_curriculum = np.copy(init_curriculum)
    print('reference scores: ', reference_score, reference_anchor_score,
          reference_max_score)

    previous_params = policy_params
    for iter in range(args.max_iter):
        print('curriculum iter ', iter)
        print('ref score: ', reference_anchor_score)

        opt_pi, final_rew = pposgd_mirror.learn(
            env,
            policy_fn,
            max_timesteps=args.batch_size * MPI.COMM_WORLD.Get_size() * 150,
            timesteps_per_batch=int(args.batch_size),
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            sym_loss_weight=4.0,
            return_threshold=reference_anchor_score,
            init_policy_params=previous_params,
            policy_scope='pi' + str(iter),
            min_iters=0,
            reward_drop_bound=True,
            #max_threshold = reference_max_score,
        )
        print('one learning iteration done')
        if np.linalg.norm(current_curriculum) >= 0.0001:
            # re-compute reference trajectory
            if MPI.COMM_WORLD.Get_rank() == 0 and args.use_reftraj == 1:
                print('recompute reference traj')
                reference_trajectory = gen_reftraj(env, opt_pi, 299)
            reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory,
                                                        root=0)
            env.env.reference_trajectory = reference_trajectory

            if final_rew < reference_anchor_score * 0.95:
                print('update reference scores')
                reference_score = reference_score / reference_anchor_score * final_rew
                reference_anchor_score = final_rew

            closest_candidate = None
            #if MPI.COMM_WORLD.Get_rank() == 0:
            directions = [
                np.array([-1, 0]),
                np.array([0, -1]),
                -current_curriculum / np.linalg.norm(current_curriculum)
            ]
            int_d1 = directions[0] + directions[2]
            int_d2 = directions[1] + directions[2]
            directions.append(int_d1 / np.linalg.norm(int_d1))
            directions.append(int_d2 / np.linalg.norm(int_d2))
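            # The five search directions above (all unit vectors): reduce each
            # assistance gain alone, head straight toward zero assistance, and
            # the two normalized bisectors in between.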

            #directions = [np.array([0.0, -1.0])] # only search in one direction
            candidate_next_anchors = []
            for direction in directions:
                found_point, perf = binary_search_curriculum(
                    env, opt_pi, current_curriculum, direction,
                    reference_score, reference_max_score, 6)
                print(direction, found_point, perf)
                candidate_next_anchors.append(found_point)
                if closest_candidate is None:
                    closest_candidate = np.copy(found_point)
                elif np.linalg.norm(closest_candidate) > np.linalg.norm(
                        found_point):
                    closest_candidate = np.copy(found_point)
            if np.linalg.norm(closest_candidate) < 0.5:
                closest_candidate = np.array([0, 0])
            if np.abs(closest_candidate[0]) < 0.1:
                closest_candidate[0] = 0.0
            if np.abs(closest_candidate[1]) < 0.1:
                closest_candidate[1] = 0.0
            #closest_candidate = MPI.COMM_WORLD.bcast(closest_candidate, root=0)

            current_curriculum = np.copy(closest_candidate)
        env.env.env.anchor_kp = current_curriculum
        '''print('Update Init Pose Distributions')
        update_init_poses(env, opt_pi)
        if MPI.COMM_WORLD.Get_rank() == 0:
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir()+'/init_poses_'+np.array2string(current_curriculum, separator=',')+'.pkl', compress=True)
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir() + '/init_poses.pkl', compress=True)'''

        curriculum_evolution.append(current_curriculum)
        print('Current curriculum: ', current_curriculum)
        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        if np.linalg.norm(current_curriculum) < 0.0001:
            if reference_anchor_score < ref_score:
                reference_anchor_score = ref_score
            else:
                break

    env.close()
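
binary_search_curriculum is called in the loop above but not defined in this snippet. A minimal sketch of what such a routine might do, assuming it bisects along `direction` for the farthest curriculum whose evaluated return still clears the reference score (the names and the evaluate_policy call mirror the script above; max_threshold is unused in this sketch):

def binary_search_curriculum(env, policy, anchor, direction, threshold, max_threshold, iters):
    low, high = 0.0, np.linalg.norm(anchor)
    best_point, best_perf = np.copy(anchor), None
    for _ in range(iters):
        mid = 0.5 * (low + high)
        candidate = np.clip(anchor + direction * mid, 0.0, None)
        env.env.env.anchor_kp = candidate
        perf, _ = evaluate_policy(env, policy, 4)
        if perf >= threshold:   # still good enough: try a larger step
            best_point, best_perf, low = np.copy(candidate), perf, mid
        else:                   # too hard: back off
            high = mid
    return best_point, best_perf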
Ejemplo n.º 28
0
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))
    parser.add_argument('--num_env', type=int, default=16)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=0)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update',
                        type=float,
                        default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='rnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0)
    parser.add_argument('--dynamics_bonus', type=int, default=0)

    parser.add_argument('--clear-run',
                        action='store_true',
                        default=False,
                        help='clear the save folder before training')
    parser.add_argument('--mega-wrapper',
                        type=int,
                        default=0,
                        help='use the same wrapper as mega')

    args = parser.parse_args()
    args.save_dir = '../rnd_results/'
    args.save_dir = os.path.join(args.save_dir, 'e_n-{}/'.format(args.env))
    args.save_dir = os.path.join(
        args.save_dir, 'mega_wrapper-{}'.format(str(args.mega_wrapper)))
    args.save_dir = os.path.join(args.save_dir,
                                 'num_env-{}'.format(str(args.num_env)))
    args.save_dir = os.path.join(args.save_dir,
                                 'int_coeff-{}'.format(str(args.int_coeff)))

    if args.clear_run:
        '''if clear_run is set, remove the save folder before recreating it'''
        input('You have set clear_run, is that what you want?')
        subprocess.call(["rm", "-r", args.save_dir])

    try:
        os.makedirs(args.save_dir)
    except Exception as e:
        print('file exists')

    try:
        os.makedirs('../rnd_log_results/' + args.env + '/')
    except Exception as e:
        print('log file exists')

    args.summary_writer = tf.summary.FileWriter(args.save_dir)

    logger.configure(dir='../rnd_log_results/' + args.env + '/',
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.
               update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.
               update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.
               proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          args=args)
Ejemplo n.º 29
0
def make_env():
    env = gym.make(env_name)
    env.set_episode_size(alg_kwargs['nsteps'])
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir()), allow_early_resets=True)

    return env

# Get dictionary from baselines/acktr/defaults
alg_kwargs = defaults.mara_mlp()
env_name = alg_kwargs['env_name']
alg_kwargs['total_timesteps'] = alg_kwargs['nsteps']

# Generate tensorboard file
format_strs = os.getenv('MARA_LOG_FORMAT', 'stdout,log,csv,tensorboard').split(',')
logger.configure(os.path.abspath('/tmp/acktr'), format_strs)

env = DummyVecEnv([make_env])

# Remove unused parameters for training
alg_kwargs.pop('env_name')
alg_kwargs.pop('trained_path')
alg_kwargs.pop('transfer_path')

network = mlp(num_layers=alg_kwargs['num_layers'], num_hidden=alg_kwargs['num_hidden'], layer_norm=alg_kwargs['layer_norm'])

with tf.Session(config=config) as train_sess:
    _ = acktr.learn(env=env, network=network, **alg_kwargs)

tf.reset_default_graph()
Ejemplo n.º 30
0
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    parameters = algorithm_parameters()
    train(args.env, parameters=parameters, seed=args.seed)
Ejemplo n.º 31
0
def main():
    args = gym_ctrl_arg_parser().parse_args()
    logger.configure(format_strs=['stdout', 'log', 'csv'], log_suffix = "ACKTR-"+args.env)
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
Ejemplo n.º 32
0
def main(args=None):
    # configure logger, disable logging in child MPI processes (with rank > 0)

    if args is None:
        from thesis_galljamov18.python.training.guro_train import LOAD_MODEL
        arg_parser = common_arg_parser()
        args, unknown_args = arg_parser.parse_known_args()
        extra_args = {
            k: parse(v)
            for k, v in parse_unknown_args(unknown_args).items()
        }
        """All args: {'nsteps': 2048, 'nminibatches': 32, 'lam': 0.95, 'gamma': 0.99, 'noptepochs': 10, 'log_interval': 1, 
        'ent_coef': 0.0, 'lr': <function mujoco.<locals>.<lambda> at 0x7f8f5af49f28>, 'cliprange': 0.2, 'value_network': 'copy'}"""

        # train my environment instead default one
        args.env = "Guro-v0"
        args.num_timesteps = 0 if LOAD_MODEL else 10e6 + 1e5
        args.play = LOAD_MODEL
        args.alg = 'ppo2'
        args.network = 'mlp'

        # change further arguments
        # nsteps = 2048
        # nminibatches = 32
        # gamma = 0.95
        # lr = 0.001
        # cliprange = 0.2

        # extra_args.update({'nsteps': nsteps, 'nminibatches': nminibatches, 'gamma': gamma, 'cliprange': cliprange})
        # extra_args.update({'lr': 1e-10})
    else:
        extra_args = {}

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, _ = train(args, extra_args)

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("\n------------\nRunning trained model\n------------\n")

        def say(text):
            os.system(
                'spd-say "{}" --volume -1 --voice-type male2'.format(text))

        # say("Attention please! Running trained model in 10 seconds!")
        # import time
        # time.sleep(10)
        env = build_env(args)
        obs = env.reset()
        #env.ob_rms.mean = [0,0,0,0,0,0] #[0., 0.39362465587763634, 0., -0.11370739423088674, 0.01929697539211253, 0.5066570016460371]
        # [ 0,         0.46073392,  0,          0.20411958, -0.05412459,  0.49079091]
        #         print("\n----------\nOBSERV_MEANS of loaded model: " + str(env.ob_rms.mean) + "\n----------\n")
        # exit(33)
        while True:
            actions = model.step(obs)[0]
            obs, _, done, _ = env.step(actions)
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done

            if done:
                obs = env.reset()
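
The parse() helper used to build extra_args above is not shown here; in baselines-style run scripts it typically tries to evaluate the string and falls back to the raw value. A minimal sketch under that assumption:

def parse(v):
    # Convert a command-line string to a Python value where possible
    # (ints, floats, tuples, ...); otherwise keep it as a string.
    assert isinstance(v, str)
    try:
        return eval(v)
    except (NameError, SyntaxError):
        return v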
Ejemplo n.º 33
0
def main(args):

    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    extra_args = parse_cmdline_kwargs(unknown_args)
    if 'gpu-id' in extra_args:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(extra_args['gpu-id'])
        extra_args.pop('gpu-id')
    if 'num_trials' in extra_args:
        num_trials = extra_args.pop('num_trials')
    else:
        num_trials = 1000

    if 'mle' in extra_args:
        if extra_args['mle']:
            args.use_mle = True
        extra_args.pop('mle')
    else:
        args.use_mle = False

    print("mle", args.use_mle)
    if 'residual_weight' not in extra_args and (args.alg == 'bppo2_expert'
                                                or args.alg == 'bppo2'):
        print("residual_weight not in extra_args, set it to 0.1")
        extra_args['residual_weight'] = 0.1
    if 'residual_weight' in extra_args:
        print("Residual weight", extra_args["residual_weight"])

    if 'render' in extra_args:
        render = True
        del extra_args['render']
    else:
        render = False
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)
    env.close()

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        env = build_env(args)
        obs = env.reset()

        def initialize_placeholders(nlstm=128, **kwargs):
            return np.zeros((args.num_env or 1, 2 * nlstm)), np.zeros((1))
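        # i.e. a zero recurrent state (2 * nlstm values per env) and a one-element
        # all-zeros done mask (only needed for recurrent policies)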

        state, dones = initialize_placeholders(**extra_args)

        # GL: Get mean, std
        from baselines.common.math_util import discount
        all_rewards = []

        if 'tiger' in args.env:
            from brl_gym.envs.tiger import ACTION_NAME, OBS_NAME
            for _ in range(10):
                obs = env.reset()
                rewards = []
                for t in range(100):
                    tiger_loc = env.envs[0].env.env.env.tiger
                    tiger = "LEFT" if tiger_loc == 0 else "RIGHT"
                    actions, _, state, _ = model.step(obs[0], S=state, M=dones)
                    obs, r, done, _ = env.step(actions[0])
                    obs_name = OBS_NAME[np.argmax(obs[0, :3])]
                    print("Reward: {}\tAction: {}\tObs: {}\tHidden: {}".format(
                        r, ACTION_NAME[actions[0]], obs_name, tiger))
                    done = done.any() if isinstance(done, np.ndarray) else done
                    rewards += [r]
                    if done:
                        print("=========== RESET ========== ")
                all_rewards += [discount(np.array(rewards).ravel(), 0.95)[0]]
        elif 'rocksample' in args.env:
            if 'gamma' not in extra_args:
                extra_args['gamma'] = 1.0
            if 'fixed' in args.alg:
                from brl_gym.qmdps.rocksample_qmdp import RockSampleQMDPQFunction as QMDPQFunction
                q_func = QMDPQFunction(num_rocks=8, num_envs=args.num_env)
            else:
                qval = None
            for _ in range(num_trials):
                obs = env.reset()
                obs, bel = obs[:, :148], obs[:, 148:]
                qval = q_func(obs, bel)
                done = False
                rewards = []
                while not done:
                    action = model.step(obs,
                                        belief=bel,
                                        S=state,
                                        M=dones,
                                        expert_qval=qval,
                                        update_eps=0)[0][0]
                    obs, r, done, _ = env.step(action)
                    obs, bel = obs[:, :148], obs[:, 148:]
                    qval = q_func(obs, bel)
                    # env.render()
                    # print(action, r)
                    done = done.any() if isinstance(done, np.ndarray) else done
                    rewards += [r]

                all_rewards += [
                    discount(np.array(rewards).ravel(), extra_args['gamma'])[0]
                ]
        elif args.alg == 'bddpg_fe':
            if 'gamma' not in extra_args:
                extra_args['gamma'] = 1.0
            for _ in range(num_trials):
                obs = env.reset()

                done = False
                rewards = []
                t = 0
                #from brl_gym.wrapper_envs.wrapper_pusher import get_qmdp_expert
                obs_dim = 22
                from brl_gym.wrapper_envs.wrapper_pusher import qmdp_expert, simple_combined_expert
                while not done:
                    # action = model.step(obs,S=state, M=dones)[0][0]
                    # print(action[0], r[0], done[0], q[0])
                    obs = obs.reshape(1, -1)
                    qval = qmdp_expert(obs[:, :obs_dim], obs[:, obs_dim:])
                    action = model.step(obs, qval, apply_noise=False)[0][0]
                    action = 0.1 * action + simple_combined_expert(
                        obs[:, :obs_dim], obs[:, obs_dim:])
                    obs, r, done, _ = env.step(action)
                    env.render()
                    done = done.any() if isinstance(done, np.ndarray) else done
                    rewards += [r]
                    t += 1
                    #if t >=800:
                    #    break
                print("T: ", t)

                all_rewards += [
                    discount(np.array(rewards).ravel(), extra_args['gamma'])[0]
                ]
                print(all_rewards)
        else:
            if 'gamma' not in extra_args:
                extra_args['gamma'] = 0.99
            if 'Maze' in args.env:
                from brl_gym.wrapper_envs.wrapper_maze import Expert
                maze_type = 10 if 'Maze10' in args.env else 4
                expert = Expert(nenv=1, maze_type=maze_type)
            else:
                from brl_gym.experts.util import get_expert
                expert = get_expert(
                    args.env,
                    use_mle=args.use_mle,
                    num_env=args.num_env,
                )

            undiscounted_sum = []

            #import cProfile

            with open(args.output, 'w') as f:
                #if 'Maze' in args.env :
                #    f.write('target\treward\tnum-sensing\tlength\n')
                #else:
                #    f.write('reward\tnum-sensing\tnum-collision\tlength\n')
                actual_env = env.envs[0].env.env.env
                #env = env.envs[0].env.env
                no_collision = 0
                lengths = []

                for k in range(num_trials):
                    #profile = cProfile.Profile()
                    #profile.enable()
                    print('-------------------------')
                    # env.envs[0].env.env.env.env.target = 3
                    # env.envs[0].env.env.env.reset_params=False
                    obs = env.reset()
                    #_env = env.envs[0]
                    #while hasattr(_env, "env"):
                    #    _env = _env.env

                    #if 'Maze' in args.env:
                    #   target = _env.target

                    done = False
                    rewards = []
                    residual_actions = []
                    obses = []
                    info = []
                    t = 0
                    expert_actions = []
                    agent_pos = []
                    observations = []
                    w = extra_args['residual_weight']
                    print("Weight", w)

                    while not done and t < 500:
                        #print("obs :", np.around(obs, 1))

                        if args.alg == 'bppo2_expert':
                            expert_action = expert.action(obs, info)
                            obs = np.concatenate([obs, expert_action], axis=1)
                            expert_action = expert_action.ravel()

                        observations += [obs.copy()]
                        action = model.step(obs)[0][0].numpy()
                        residual_actions += [action]

                        if args.alg == 'bppo2_expert':
                            agent_pos += [obs.ravel()[:2]]
                            expert_actions += [expert_action.copy()]

                            #print("action", action, "expert",  expert_action,)
                            if 'cartpole' in args.env.lower():
                                expert_action = expert_action + action * w
                            else:
                                expert_action = (
                                    1.0 - w) * expert_action + action * w
                            action = expert_action
                        action = np.clip(action, env.action_space.low,
                                         env.action_space.high)
                        #print("final action", action)
                        obs, r, done, info = env.step(action)
                        #print('reward:', r)
                        #print('done  :', done)

                        if render:
                            os.makedirs('imgs/trial{}'.format(k),
                                        exist_ok=True)
                            actual_env._visualize(
                                filename="imgs/trial{}/crosswalk_{}.png".
                                format(k, t))
                        t += 1

                        done = done.any() if isinstance(done,
                                                        np.ndarray) else done
                        rewards += [r]
                        obses += [obs]
                        # actions += [action]

                    #profile.disable()
                    #profile.print_stats()
                    #import IPython; IPython.embed(); import sys ;sys.exit(0)
                    lengths += [t]
                    rewards = np.array(rewards).ravel()
                    if rewards[-1] > 0:
                        no_collision += 1
                    print(np.sum(rewards), no_collision)

                    #residual_actions = np.array(residual_actions).squeeze()
                    #observations = np.array(observations).squeeze()
                    #data = {"r":rewards, "action":residual_actions, "obs":observations}
                    #os.makedirs('trials', exist_ok=True)
                    #data_file = open("trials/trial_{}.pkl".format(k), 'wb+')
                    #pickle.dump(data, data_file)
                    #print("Wrote to trial_{}.pkl".format(k))

                    all_rewards += [np.sum(rewards)]

        env.close()
        mean = np.mean(all_rewards)
        ste = np.std(all_rewards) / np.sqrt(len(all_rewards))
        print(all_rewards)
        print("Reward stat: ", mean, "+/-", ste)
        print("No collision", no_collision / num_trials)
        print("Length", np.mean(lengths))

    return model
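
The discount helper imported from baselines.common.math_util above turns a reward sequence into its discounted return via discount(rewards, gamma)[0]. A minimal equivalent sketch, assuming it computes the discounted cumulative sum along axis 0:

import scipy.signal

def discount(x, gamma):
    # y[t] = x[t] + gamma * y[t+1], computed with a reversed linear filter
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]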
Ejemplo n.º 34
0
def learn(env,
          eval_env,
          policy_func,
          reward_giver,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          reward_coeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          task_name,
          gamma,
          lam,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=3e-4,
          vf_iters=3,
          max_timesteps=0,
          max_episodes=0,
          max_iters=0,
          num_epochs=1000,
          eval_interval=10,
          callback=None):

    # Configure log
    logger.configure(dir=log_dir)

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list
        if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")
    ]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)
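    # fvp(tangent) = grad_theta( grad_theta(meankl) . tangent ): the KL/Fisher
    # Hessian-vector product via double backprop, consumed by the CG solver below.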

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    # seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True)
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     reward_coeff,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    for epoch in range(num_epochs):
        logger.log("********** Epoch %i ************" % epoch)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
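                # Scale the CG direction so the quadratic KL model at the full
                # step equals max_kl: 0.5 * (s/lm)^T H (s/lm) = shs / lm**2 = max_kl.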
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        # evaluate current policy
        if (epoch + 1) % eval_interval == 0:
            total_samples = (epoch + 1) * timesteps_per_batch * g_step
            evaluate_policy(pi, reward_giver, eval_env, total_samples, tstart)

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        batch_size = len(ob) // d_step
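        # split the freshly collected policy batch into d_step discriminator minibatches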
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
            (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for reward_giver
            if hasattr(reward_giver, "obs_rms"):
                reward_giver.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch,
                                                     ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
Ejemplo n.º 35
0
def main():
    args = atari_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)
Ejemplo n.º 36
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    #parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--dueling', type=int, default=0)
    #parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default='/.')

    args = parser.parse_args()
    # TODO change logging dir for tensorboard
    #logger.configure(dir=None, format_strs='stdout,log,csv,json,tensorboard')
    #logger.configure(dir=None, format_strs=['stdout', 'log', 'csv', 'json', 'tensorboard'])
    timestart = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    logger.configure(
        dir=PROJ_DIR + "/../tensorboard/" + str(timestart),
        format_strs=['stdout', 'log', 'csv', 'json', 'tensorboard'])
    logger.set_level(logger.INFO)
    set_global_seeds(args.seed)

    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    #wrap environment
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    #record videos of an episode
    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)
    #the agent has only one trial
    env = EpisodicLifeEnv(env)

    # nes_py
    #preprocess the input frame
    env = DownsampleEnv(env, (84, 84))
    #set death penalty
    env = PenalizeDeathEnv(env, penalty=-25)
    #Stack 4 Framse as input
    env = FrameStackEnv(env, 4)

    #print tensorboard log information
    print("logger.get_dir():", logger.get_dir())
    print("PROJ_DIR:", PROJ_DIR)

    act = None
    #enable output in the terminal
    env = bench.Monitor(env, logger.get_dir())

    modelname = datetime.datetime.now().isoformat()

    #define callback function for the training process
    def render_callback(lcl, _glb):
        # print(lcl['episode_rewards'])
        total_steps = lcl['env'].total_steps
        #if total_steps % 2000 == 0:

        env.render()
        # pass


# different models with different parameters, commented out
# CNN built with deepq.models.cnn_to_mlp(params)
# trained with deepq.learn(params)

#2018-08-12-10:25:50 model 4, 100k, lr 0.0005, alpha 0.6, gamma 0.99, 8 frames v1
#2018-08-12-11:31:59 model 4, 100k, lr 0.0005, alpha 0.8, gamma 0.99, 6 frames v1

# model 04
# nature human paper + Improvements
# Dueling Double DQN, Prioritized Experience Replay, and fixed Q-targets

    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2),
               (64, 3, 1)],  # (num_outputs, kernel_size, stride)
        hiddens=[512],  # 512
        dueling=bool(1),
    )

    act = deepq.learn(
        env,
        q_func=model,
        lr=0.0001,  # 0.00025 1e-4
        max_timesteps=int(100000),  # 100k -> 3h
        buffer_size=50000,  # 5000, #10000
        exploration_fraction=0.3,  # 0.1,
        exploration_final_eps=0.1,  # 0.01
        train_freq=4,  # 4
        learning_starts=25000,  # 10000
        target_network_update_freq=1000,
        gamma=0.5,  #0.99,
        prioritized_replay=bool(1),
        prioritized_replay_alpha=0.2,
        checkpoint_freq=args.checkpoint_freq,
        #        checkpoint_path=args.checkpoint_path,
        callback=render_callback,
        print_freq=1)

    print("Saving model to mario_model.pkl " + timestart)
    act.save("../models/mario_model_{}.pkl".format(timestart))

    env.close()
Ejemplo n.º 37
0
# parser.add_argument('--noise_type', type=str, default='adaptive-param_0.2')  # choices are adaptive-param_xx, ou_xx, normal_xx, none
parser.add_argument('--noise_type', type=str, default='ou_0.2')
boolean_flag(parser, 'evaluation', default=False)
args = parser.parse_args()

sess = U.single_threaded_session()
sess.__enter__()

# Configure things.
rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
    logger.set_level(logger.DISABLED)

# Create envs.
env = gym.make(str(args.environment))
logger.configure("/tmp/experiments/"+str(args.environment)+"/DDPG/")
env = bench.Monitor(env, logger.get_dir())
if args.evaluation and rank==0:
    eval_env = gym.make(env_id)
    eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
    env = bench.Monitor(env, None)
else:
    eval_env = None

# gym.logger.setLevel(logging.WARN)
# if evaluation and rank==0:
#     eval_env = gym.make(env_id)
#     eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
#     env = bench.Monitor(env, None)

# Parse noise_type
Ejemplo n.º 38
0
import gym
from baselines import logger
import numpy as np

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHexapod-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) + '_walk')

    env = gym.make(args.env)
    env.env.assist_timeout = 100.0
    env.env.target_vel = 2.0
    env.env.init_tv = 0.0
    env.env.final_tv = 2.0
    env.env.tv_endtime = 1.0
    env.env.energy_weight = 0.2
    env.env.alive_bonus = 4.0
    train_mirror_sig(env,
                     num_timesteps=int(5000000),
                     seed=args.seed,
                     obs_perm=np.array([
                         0.0001, -1, 2, -3, -4, 8, 9, 10, 5, 6, 7, 14, 15, 16,
                         11, 12, 13, 20, 21, 22, 17, 18, 19, 23, 24, -25, 26,
Ejemplo n.º 39
0
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
Ejemplo n.º 40
0
def main():
    """Run DQN until the environment throws an exception."""
    # Hyperparameters
    num_envs = 64
    learning_rate = 2.5e-4
    gamma = 0.99
    nstep_return = 3
    timesteps_per_proc = 25_000_000
    train_interval = 64
    target_interval = 8192
    batch_size = 512
    min_buffer_size = 20000

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--mix_beta', type=float, default=0.2)
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=['no_aug', 'cutout_color', 'crop'])
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1
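    # a weight of 0 presumably keeps test workers from contributing to the
    # MPI-averaged parameter updates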

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup Rainbow models
    logger.info("building models")
    online_net, target_net = rainbow_models(
        sess,
        venv.action_space.n,
        gym_space_vectorizer(venv.observation_space),
        min_val=REWARD_RANGE_FOR_C51[env_name][0],
        max_val=REWARD_RANGE_FOR_C51[env_name][1])
    dqn = MpiDQN(online_net,
                 target_net,
                 discount=gamma,
                 comm=comm,
                 mpi_rank_weight=mpi_rank_weight,
                 mix_mode=args.mix_mode,
                 mix_alpha=args.mix_alpha,
                 mix_beta=args.mix_beta,
                 use_l2reg=args.use_l2reg,
                 data_aug=args.data_aug)
    player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return)
    optimize = dqn.optimize(learning_rate=learning_rate)

    # Initialize and sync variables
    sess.run(tf.global_variables_initializer())
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="")
    if comm.Get_size() > 1:
        sync_from_root(sess, global_variables, comm=comm)  #pylint: disable=E110

    # Training
    logger.info("training")
    dqn.train(num_steps=timesteps_per_proc,
              player=player,
              replay_buffer=PrioritizedReplayBuffer(500000,
                                                    0.5,
                                                    0.4,
                                                    epsilon=0.1),
              optimize_op=optimize,
              train_interval=train_interval,
              target_interval=target_interval,
              batch_size=batch_size,
              min_buffer_size=min_buffer_size)
Ejemplo n.º 41
0
def train():
    rank = MPI.COMM_WORLD.Get_rank()
    sess = utils.make_gpu_session(args.num_gpu)
    sess.__enter__()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    if args.use_2D_env:
        config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'configs', 'husky_space7_ppo2_2D.yaml')
    else:
        config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'configs', 'husky_space7_ppo2.yaml')

    if args.use_2D_env:
        raw_env = Husky2DNavigateEnv(gpu_idx=args.gpu_idx,
                                     config=config_file,
                                     pos_interval=args.pos_interval)
    else:
        raw_env = Husky1DNavigateEnv(gpu_idx=args.gpu_idx,
                                     config=config_file,
                                     ob_space_range=[0.0, 40.0])

    env = Monitor(
        raw_env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    gym.logger.setLevel(logging.WARN)

    base_dirname = os.path.join(currentdir, "simulation_and_analysis_dqn",
                                "rslts")

    if not os.path.exists(base_dirname):
        os.makedirs(base_dirname)
    dir_name = "husky_dqn_"
    if args.use_feedback:
        dir_name += "hr"
    elif args.use_rich_reward:
        dir_name += "rl_rich"
    else:
        dir_name += "rl_sparse"
    dir_name = addDateTime(dir_name)
    dir_name = os.path.join(base_dirname, dir_name)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    hyperparams = {
        "seed": args.seed,
        # env
        "use_2D_env": args.use_2D_env,
        "use_rich_reward": args.use_rich_reward,
        "use_multiple_starts": args.use_multiple_starts,
        "total_timesteps": args.total_timesteps,
        "pos_interval": args.pos_interval,
        # hr
        "use_feedback": args.use_feedback,
        "use_real_feedback": args.use_real_feedback,
        "trans_by_interpolate": args.trans_by_interpolate,
        "only_use_hr_until": args.only_use_hr_until,
        "trans_to_rl_in": args.trans_to_rl_in,
        "good_feedback_acc": args.good_feedback_acc,
        "bad_feedback_acc": args.bad_feedback_acc,
        # dqn
        "exploration_fraction": args.exploration_fraction,
        "exploration_final_eps": args.exploration_final_eps,
        "lr": args.lr,
        "batch_size": args.batch_size,
        "dqn_epochs": args.dqn_epochs,
        "train_freq": args.train_freq,
        "target_network_update_freq": args.target_network_update_freq,
        "learning_starts": args.learning_starts,
        "param_noise": args.param_noise,
        "gamma": args.gamma,
        # hr training
        "feedback_lr": args.feedback_lr,
        "feedback_epochs": args.feedback_epochs,
        "feedback_batch_size": args.feedback_batch_size,
        "feedback_minibatch_size": args.feedback_minibatch_size,
        "min_feedback_buffer_size": args.min_feedback_buffer_size,
        "feedback_training_prop": args.feedback_training_prop,
        "feedback_training_new_prop": args.feedback_training_new_prop,
        # dqn replay buffer
        "buffer_size": args.buffer_size,
        "prioritized_replay": args.prioritized_replay,
        "prioritized_replay_alpha": args.prioritized_replay_alpha,
        "prioritized_replay_beta0": args.prioritized_replay_beta0,
        "prioritized_replay_beta_iters": args.prioritized_replay_beta_iters,
        "prioritized_replay_eps": args.prioritized_replay_eps,
        #
        "checkpoint_freq": args.checkpoint_freq,
        "use_embedding": raw_env._use_embedding,
        "use_raycast": raw_env._use_raycast,
        "offline": raw_env.config['offline']
    }

    print_freq = 5

    param_fname = os.path.join(dir_name, "param.json")
    with open(param_fname, "w") as f:
        json.dump(hyperparams, f, indent=4, sort_keys=True)

    video_name = os.path.join(dir_name, "video.mp4")
    p_logging = p.startStateLogging(p.STATE_LOGGING_VIDEO_MP4, video_name)

    act, performance = learn(  # env flags
        env,
        raw_env,
        use_2D_env=args.use_2D_env,
        use_multiple_starts=args.use_multiple_starts,
        use_rich_reward=args.use_rich_reward,
        total_timesteps=args.total_timesteps,
        # dqn
        exploration_fraction=args.exploration_fraction,
        exploration_final_eps=args.exploration_final_eps,
        # hr
        use_feedback=args.use_feedback,
        use_real_feedback=args.use_real_feedback,
        only_use_hr_until=args.only_use_hr_until,
        trans_to_rl_in=args.trans_to_rl_in,
        good_feedback_acc=args.good_feedback_acc,
        bad_feedback_acc=args.bad_feedback_acc,
        # dqn training
        lr=args.lr,
        batch_size=args.batch_size,
        dqn_epochs=args.dqn_epochs,
        train_freq=args.train_freq,
        target_network_update_freq=args.target_network_update_freq,
        learning_starts=args.learning_starts,
        param_noise=args.param_noise,
        gamma=args.gamma,
        # hr training
        feedback_lr=args.feedback_lr,
        feedback_epochs=args.feedback_epochs,
        feedback_batch_size=args.feedback_batch_size,
        feedback_minibatch_size=args.feedback_minibatch_size,
        min_feedback_buffer_size=args.min_feedback_buffer_size,
        feedback_training_prop=args.feedback_training_prop,
        feedback_training_new_prop=args.feedback_training_new_prop,
        # replay buffer
        buffer_size=args.buffer_size,
        prioritized_replay=args.prioritized_replay,
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        prioritized_replay_beta0=args.prioritized_replay_beta0,
        prioritized_replay_beta_iters=args.prioritized_replay_beta_iters,
        prioritized_replay_eps=args.prioritized_replay_eps,
        # rslts saving and others
        checkpoint_freq=args.checkpoint_freq,
        print_freq=print_freq,
        checkpoint_path=None,
        load_path=None,
        callback=None,
        seed=args.seed)

    p.stopStateLogging(p_logging)

    performance_fname = os.path.join(dir_name, "performance.p")
    with open(performance_fname, "wb") as f:
        pickle.dump(performance, f)
    act.save(os.path.join(dir_name, "cartpole_model.pkl"))
Ejemplo n.º 42
0
def train_fn(env_name,
             num_envs,
             distribution_mode,
             num_levels,
             start_level,
             timesteps_per_proc,
             args,
             is_test_worker=False,
             log_dir='./tmp/procgen',
             comm=None,
             alternate_ppo=False,
             do_eval=False,
             eval_num_envs=None,
             eval_env_name=None,
             eval_num_levels=None,
             eval_start_level=None,
             eval_distribution_mode=None,
             do_test=False,
             test_num_envs=None,
             test_env_name=None,
             test_num_levels=None,
             test_start_level=None,
             test_distribution_mode=None):
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    use_vf_clipping = True

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else num_levels

    if log_dir is not None:
        log_comm = comm.Split(1 if is_test_worker else 0, 0)
        format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
        logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)

    logger.info("creating environment")
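    # Wrapper order used in these examples: ProcgenEnv returns dict observations,
    # VecExtractDictObs keeps only the "rgb" image, VecMonitor records episode
    # returns/lengths for logging, and VecNormalize with ob=False leaves the
    # observations untouched and normalizes only the returns.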
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    eval_env = None
    if do_eval:
        eval_env = ProcgenEnv(num_envs=eval_num_envs,
                              env_name=eval_env_name,
                              num_levels=eval_num_levels,
                              start_level=eval_start_level,
                              distribution_mode=eval_distribution_mode)
        eval_env = VecExtractDictObs(eval_env, "rgb")

        eval_env = VecMonitor(
            venv=eval_env,
            filename=None,
            keep_buf=100,
        )

        eval_env = VecNormalize(venv=eval_env, ob=False)

    test_env = None
    if do_test:
        test_env = ProcgenEnv(num_envs=test_num_envs,
                              env_name=test_env_name,
                              num_levels=test_num_levels,
                              start_level=test_start_level,
                              distribution_mode=test_distribution_mode)
        test_env = VecExtractDictObs(test_env, "rgb")

        test_env = VecMonitor(
            venv=test_env,
            filename=None,
            keep_buf=100,
        )

        test_env = VecNormalize(venv=test_env, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    logger.info("training")
    if alternate_ppo:
        alt_ppo2.learn(env=venv,
                       eval_env=eval_env,
                       test_env=test_env,
                       network=conv_fn,
                       total_timesteps=timesteps_per_proc,
                       save_interval=1,
                       nsteps=nsteps,
                       nminibatches=nminibatches,
                       lam=lam,
                       gamma=gamma,
                       noptepochs=ppo_epochs,
                       log_interval=1,
                       ent_coef=ent_coef,
                       mpi_rank_weight=mpi_rank_weight,
                       clip_vf=use_vf_clipping,
                       comm=comm,
                       lr=learning_rate,
                       cliprange=clip_range,
                       update_fn=None,
                       init_fn=None,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       args=args,
                       load_path=args.resume_path)
    else:
        ppo2.learn(env=venv,
                   eval_env=eval_env,
                   network=conv_fn,
                   total_timesteps=timesteps_per_proc,
                   save_interval=1,
                   nsteps=nsteps,
                   nminibatches=nminibatches,
                   lam=lam,
                   gamma=gamma,
                   noptepochs=ppo_epochs,
                   log_interval=1,
                   ent_coef=ent_coef,
                   mpi_rank_weight=mpi_rank_weight,
                   clip_vf=use_vf_clipping,
                   comm=comm,
                   lr=learning_rate,
                   cliprange=clip_range,
                   update_fn=None,
                   init_fn=None,
                   vf_coef=0.5,
                   max_grad_norm=0.5,
                   args=args)
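
The example above only defines train_fn; a hypothetical command-line entry point (not part of the original snippet) could invoke it roughly as follows, assuming mpi4py is installed and this module defines train_fn as shown:

import argparse

from mpi4py import MPI


def main():
    # Hypothetical driver; argument names mirror train_fn's parameters.
    parser = argparse.ArgumentParser(description='Procgen PPO training (sketch).')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--num_envs', type=int, default=64)
    parser.add_argument('--distribution_mode', type=str, default='easy')
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--timesteps_per_proc', type=int, default=25_000_000)
    args = parser.parse_args()

    train_fn(args.env_name,
             args.num_envs,
             args.distribution_mode,
             args.num_levels,
             args.start_level,
             args.timesteps_per_proc,
             args,
             comm=MPI.COMM_WORLD)


if __name__ == '__main__':
    main()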
Ejemplo n.º 43
0
def main():
    # Hyperparameters
    num_envs = 128
    learning_rate = 5e-4
    ent_coef = .01
    vf_coef = 0.5
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    max_grad_norm = 0.5
    timesteps_per_proc = 100_000_000  # note: unused below; ppo2.learn receives args.timesteps_per_proc instead
    use_vf_clipping = True

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='hard',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--use_bn', action='store_true')
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--l2reg_coeff', type=float, default=1e-4)
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=["no_aug", "cutout_color", "crop"])
    parser.add_argument('--use_rand_conv', action='store_true')
    parser.add_argument('--model_width',
                        type=str,
                        default='1x',
                        choices=["1x", "2x", "4x"])
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg', 'mixobs'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--timesteps_per_proc', type=float, default=1_000_000)
    parser.add_argument('--save_dir',
                        type=str,
                        default='gdrive/MyDrive/182 Project/mixreg')
    args = parser.parse_args()

    log_dir = args.save_dir

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=f'{log_dir}/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    eval_env = ProcgenEnv(num_envs=num_envs,
                          env_name=env_name,
                          num_levels=500,
                          start_level=0,
                          distribution_mode=args.distribution_mode)
    eval_env = VecExtractDictObs(eval_env, "rgb")
    eval_env = VecMonitor(
        venv=eval_env,
        filename=None,
        keep_buf=100,
    )
    eval_env = VecNormalize(venv=eval_env, ob=False, ret=True)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup model
    if args.model_width == '1x':
        depths = [16, 32, 32]
    elif args.model_width == '2x':
        depths = [32, 64, 64]
    elif args.model_width == '4x':
        depths = [64, 128, 128]
    conv_fn = lambda x: build_impala_cnn(x,
                                         depths=depths,
                                         use_bn=args.use_bn,
                                         randcnn=args.use_rand_conv and
                                         not is_test_worker)

    # Training
    logger.info("training")
    ppo2.learn = learn  # use customized "learn" function
    model = ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=args.timesteps_per_proc,
        eval_env=eval_env,
        save_interval=0,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        data_aug=args.data_aug,
        use_rand_conv=args.use_rand_conv,
        model_fn=get_mixreg_model(mix_mode=args.mix_mode,
                                  mix_alpha=args.mix_alpha,
                                  use_l2reg=args.use_l2reg,
                                  l2reg_coeff=args.l2reg_coeff),
    )

    # Saving
    logger.info("saving final model")
    if rank == 0:
        checkdir = os.path.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        model.save(os.path.join(checkdir, 'final_model.ckpt'))
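
The mixing itself happens inside get_mixreg_model, which is not shown in this snippet. For intuition only, here is a numpy sketch of mixup-style observation/reward mixing in the spirit of mixreg; the function and variable names are hypothetical and are not taken from this codebase.

import numpy as np


def mix_batch(obs, rewards, alpha=0.2, rng=np.random):
    """Hypothetical mixup-style mixing of a batch (sketch, not the repo's code).

    obs:     float array of shape (batch, ...), e.g. stacked RGB frames
    rewards: float array of shape (batch,)
    """
    batch = obs.shape[0]
    # One mixing coefficient per sample, drawn from Beta(alpha, alpha).
    lam = rng.beta(alpha, alpha, size=batch)
    perm = rng.permutation(batch)
    lam_obs = lam.reshape((batch,) + (1,) * (obs.ndim - 1))
    mixed_obs = lam_obs * obs + (1.0 - lam_obs) * obs[perm]
    mixed_rewards = lam * rewards + (1.0 - lam) * rewards[perm]
    return mixed_obs, mixed_rewards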
Ejemplo n.º 44
0
def main():
    # configure logger, disable logging in child MPI processes (with rank > 0)

    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()

    if args.env == 'Humanoid-v1' or args.env == 'Humanoid(rllab)' or args.env == 'HumanoidStandup-v1':
        args.num_timesteps = 1e7
    if args.env == 'Ant-v1':
        args.num_timesteps = 5e6
    extra_args = parse_cmdline_kwargs(unknown_args)
    print("args")
    print(args)
    if args.num_repeat == 1:
        dir = args.log_dir + '/iter%d' % args.seed
        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            rank = 0
            logger.configure(dir=dir)
        else:
            logger.configure(dir=dir, format_strs=[])
            rank = MPI.COMM_WORLD.Get_rank()

        model, env, sess, evalenv = train(args, extra_args, args.seed)
        env.close()
        evalenv.close()
        tf.reset_default_graph()
        sess.close()
    else:
        for seed in range(args.num_repeat):
            dir = args.log_dir + '/iter%d' % seed
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                rank = 0
                logger.configure(dir=dir)
            else:
                logger.configure(dir=dir, format_strs=[])
                rank = MPI.COMM_WORLD.Get_rank()

            model, env, sess, evalenv = train(args, extra_args, seed)
            env.close()
            evalenv.close()
            tf.reset_default_graph()
            sess.close()

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        env = build_env(args)
        obs = env.reset()
        def initialize_placeholders(nlstm=128, **kwargs):
            return np.zeros((1, 2 * nlstm)), np.zeros((1,))
        state, dones = initialize_placeholders(**extra_args)
        while True:
            actions, _, state, _ = model.step(obs, S=state, M=dones)
            obs, _, done, _ = env.step(actions)
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done

            if done:
                obs = env.reset()

        env.close()
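
One detail worth flagging in the play loop above: dones is initialized once and never updated, so a recurrent policy would never see an episode boundary. A hedged sketch of a variant that feeds the mask back, reusing the model, env, extra_args, and np already in scope, could look like this:

# Fragment only (reuses model, env, extra_args and np from the snippet above):
state, dones = initialize_placeholders(**extra_args)
while True:
    actions, _, state, _ = model.step(obs, S=state, M=dones)
    obs, _, done, _ = env.step(actions)
    env.render()
    # Feed the termination signal back in as the next step's mask so a
    # recurrent policy can reset its hidden state at episode boundaries.
    dones = np.asarray(done, dtype=float).reshape(-1)
    if dones.any():
        obs = env.reset()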
Ejemplo n.º 45
0
def configure_logger(log_path, **kwargs):
    if log_path is not None:
        logger.configure(log_path)
    else:
        logger.configure(**kwargs)
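
Two hypothetical calls showing how this helper might be used; the directory is a placeholder:

# Log to an explicit directory (placeholder path):
configure_logger('/tmp/my_experiment/run_0')

# Or fall back to logger.configure's defaults, e.g. stdout-only output:
configure_logger(None, format_strs=['stdout'])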
Ejemplo n.º 46
0
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    boolean_flag(parser, 'popart', default=False)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--reward-scale', type=float, default=1.)
    parser.add_argument('--clip-norm', type=float, default=None)
    parser.add_argument('--nb-epochs', type=int, default=500)  # with default settings, perform 1M steps total
    parser.add_argument('--nb-epoch-cycles', type=int, default=20)
    parser.add_argument('--nb-train-steps', type=int, default=50)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-eval-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-rollout-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2')  # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--num-timesteps', type=int, default=None)
    boolean_flag(parser, 'evaluation', default=False)
    args = parser.parse_args()
    # this script does not consume num_timesteps directly, so if it is given,
    # check that it agrees with the epoch/cycle/rollout-step parameters below
    if args.num_timesteps is not None:
        assert args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps
    dict_args = vars(args)
    del dict_args['num_timesteps']
    return dict_args


if __name__ == '__main__':
    args = parse_args()
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()
    # Run actual script.
    run(**args)
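
As a quick sanity check, the default values above are consistent with the "perform 1M steps total" comment:

# With the defaults: 500 epochs * 20 cycles per epoch * 100 rollout steps per cycle
nb_epochs, nb_epoch_cycles, nb_rollout_steps = 500, 20, 100
assert nb_epochs * nb_epoch_cycles * nb_rollout_steps == 1_000_000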