Example #1
def train(env_id, num_episodes, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_halide_env(env_id, seed)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_episodes=num_episodes,
        timesteps_per_actorbatch=256,
        clip_param=0.2,
        entcoeff=0.03,
        optim_epochs=4,
        optim_stepsize=2.5e-3,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
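These snippets are excerpted from larger scripts and reference several module-level names (U, tf, gym, logger, bench, osp, MPI, logging, set_global_seeds) without showing their imports. A typical preamble is sketched below as an assumption; the exact imports vary from file to file.

import logging
import os.path as osp

import gym
import tensorflow as tf
from mpi4py import MPI

from baselines import bench, logger
from baselines.common import set_global_seeds
import baselines.common.tf_util as U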
Example #2
def train(env_id, num_timesteps, timesteps_per_actor_batch, seed,
          entropy_coeff, filepath):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=timesteps_per_actor_batch,
                        clip_param=0.2,
                        entcoeff=entropy_coeff,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()

    # Save policy etc.
    saver = tf.train.Saver()
    saver.save(sess, filepath + "_final")
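As a counterpart to the save above, a minimal restore sketch is shown below. It assumes the same policy graph has already been rebuilt in the current session's graph (for example by calling policy_fn("pi", env.observation_space, env.action_space) again), so that tf.train.Saver can match the checkpoint variables; the helper name restore_policy is hypothetical.

def restore_policy(sess, filepath):
    # Hypothetical helper: reload the variables written by train() above.
    # Assumes the policy network has already been constructed in sess's
    # graph, so the Saver finds variables with matching names.
    saver = tf.train.Saver()
    saver.restore(sess, filepath + "_final")
    return sess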
Example #3
def train_ppo1(env_id, num_timesteps, sfs, seed):
    from baselines.ppo1 import pposgd_simple
    sess = U.make_session(num_cpu=4)
    sess.__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    model_dir = get_model_dir(env_id, 'ppo')

    # monitor tensorboard
    log_dir = osp.join(logger.get_dir(), 'log_ppo')
    logger.log("log_dir: %s" % log_dir)
    env = bench.Monitor(env, log_dir)

    env = ModelSaverWrapper(env, model_dir, sfs)

    env.seed(seed)
    # env.render()
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,  # TODO 0.2
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',  # TODO linear
    )
    env.close()
Example #4
def train(env_id, num_frames, seed, max_ts, logdir):
    """Train agent."""
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.configure(osp.join(logdir, "%i.log.json" % rank))
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        """Given an obs, returns an act."""
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = atari_env.wrap_train(env)
    num_timesteps = max_ts or int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3,
                        optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear'
                       )
    env.close()
Example #5
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    logger.configure()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return cnn_policy.CnnPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space)

    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2,
                        entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-3,
                        optim_batchsize=64,
                        gamma=1,
                        lam=0.95,
                        schedule='linear')
    env.close()
    saver = tf.train.Saver()
    saver.save(sess, '/tmp/model')
Example #6
def train(env_id, num_timesteps, seed,
          save_interval, output_prefix):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=128, num_hid_layers=2)

    def callback_fn(local_vars, global_vars):
        iters = local_vars["iters_so_far"]
        if iters % save_interval == 0:
            # Build the Saver only when a checkpoint is actually written, so
            # duplicate save ops are not added to the graph on every iteration.
            saver = tf.train.Saver()
            saver.save(sess, output_prefix + str(iters))

    env = make_dart_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
            callback=callback_fn,
        )
    env.close()
Example #7
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple

    sess = U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make("FenceEscape-v0")

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
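A train function like the one above is usually driven from a small command-line entry point. The sketch below is an assumption about how such a script might be wired up; the argument names and defaults are illustrative, not taken from the original source.

import argparse

def main():
    # Hypothetical CLI wrapper around the train() function defined above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='FenceEscape-v0')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
    main()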
Example #8
def train(env, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    #from baselines.ppo1 import pposgd_simple, cnn_policy
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    set_global_seeds(seed)

    def policy_fn(name, ob_space, ac_space):
        #return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env.seed(seed)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=20,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=9,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        sess=sess,
                        old_model=False)
Example #9
def train(env_id, num_timesteps, seed, save_model, load_model, model_dir, timesteps_per_actorbatch,
        clip_param, ent_coeff, epochs, learning_rate, batch_size, gamma, lambd, exploration_rate, filename):
    from baselines.ppo1 import kick_policy, pposgd_simple, reward_scaler
    rank = MPI.COMM_WORLD.Get_rank()
    U.make_session(num_cpu=1).__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = SoccerEnv(rank)

    def policy_fn(name, ob_space, ac_space):
        return kick_policy.KickPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                      hid_size=64, num_hid_layers=2, exploration_rate=exploration_rate)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    rw_scaler = reward_scaler.RewardScaler("rw_scaler")
    pposgd_simple.learn(env, policy_fn, 
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=timesteps_per_actorbatch,
                        clip_param=clip_param, entcoeff=ent_coeff,
                        optim_epochs=epochs, optim_stepsize=learning_rate, optim_batchsize=batch_size,
                        gamma=gamma, lam=lambd, schedule='linear',
                        save_model=save_model, load_model=load_model, model_dir=model_dir, 
                        rw_scaler=rw_scaler, filename=filename
                        )
    env.close()
Example #10
def train(env_id, num_timesteps, seed, beta, theta, decay):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)
    print(env_id)
    env = make_atari(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear', beta=beta, theta=theta, decay=decay,
    )
    env.close()
Example #11
def main():
    # Create an asynchronous simulation of the InvertedDoublePendulum-v2 MuJoCo environment.
    env = DoubleInvertedPendulumEnv(
        agent_dt=0.005,
        sensor_dt=[0.01, 0.0033333],
    )
    # Start environment processes
    env.start()

    # Create baselines ppo policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Plotting process
    pp = Process(target=plot_returns,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines PPO learn

    kindred_callback = create_callback(shared_returns)

    # Train baselines PPO
    learn(
        env,
        policy_fn,
        max_timesteps=1e6,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.0001,
        optim_batchsize=64,
        gamma=0.995,
        lam=0.995,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shut down the plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
Example #12
def train(env_id, num_timesteps, history_len, seed, render):
    U.make_session(num_cpu=1).__enter__()

    # Make sure the seed is different for each MPI rank
    rank = MPI.COMM_WORLD.Get_rank()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    train_env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)
    train_env = StackedEnvWrapper(train_env, state_history_len=history_len)

    eval_env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)
    eval_env = StackedEnvWrapper(eval_env, state_history_len=history_len)
    eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), "monitor.json"))

    pposgd_simple.learn(train_env, eval_env, policy_fn,
            directory=DIRECTORY.format(history_len),
            max_timesteps=num_timesteps,
            timesteps_per_batch=1024*VAR_REDUCTION,
            clip_param=0.2,
            entcoeff=0.0001,
            optim_epochs=10,
            optim_stepsize=2e-4,
            optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
            render=render
        )
    train_env.close()
    eval_env.close()
Example #13
def train(env_id, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    env.seed(workerseed)

    task_name = "ppo." + args.env.split("-")[0] + "." + ("%.2f"%args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    pposgd_simple.learn(env, policy_fn, 
        max_timesteps=args.num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=args.entcoeff,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        sample_stochastic=args.sample_stochastic, task_name=task_name, save_per_iter=args.save_per_iter,
        ckpt_dir=args.checkpoint_dir, load_model_path=args.load_model_path, task=args.task)
    env.close()
Example #14
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for the Mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2,
                                    sess=sess,
                                    placeholders=placeholders)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
Example #15
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    #env = make_mujoco_env(env_id, seed)
    env = gym.make("CartPole-v0")
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
Example #16
def train(env_id, num_timesteps, seed, tb_dir=None):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = ContinuousGridworld('gridworld', visualize=False)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=4,
                        optim_stepsize=1e-3,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        tb_dir=tb_dir)
    env.close()
Example #17
def train(exp_name, env_id, max_iters, save_step, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
#    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
#    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    scriptpath = os.path.dirname(os.path.abspath( __file__ ))
    directory = os.path.join(scriptpath, exp_name)
    if not os.path.exists(directory):
        os.makedirs(directory)
    filepath = os.path.join(directory, "")

    pposgd_simple.learn(env, policy_fn,
            max_iters=max_iters,
            filepath=filepath,
            save_step=save_step,
            timesteps_per_actorbatch=4000,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95,
        )
    env.close()
Example #18
def train(num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=128,
                                    num_hid_layers=2)

    env = TwoDofArmEnv(ActiveMuscles='antagonistic',
                       actionParameterization=True,
                       sim_length=0.005,
                       traj_track=True,
                       exo=True,
                       exo_gain=70.,
                       delay=0.020)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=1048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
Example #19
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(seed)

    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Example #20
def train(env,
          num_timesteps,
          seed,
          ckpt_dir=None,
          render=False,
          ckpt_freq=0,
          restore_dir=None,
          optim_stepsize=3e-4,
          schedule="linear",
          gamma=0.99,
          optim_epochs=10,
          optim_batchsize=64,
          horizon=2048):

    from baselines.common.fc_learning_utils import FlightLog
    from mpi4py import MPI
    from baselines import logger
    from baselines.ppo1.mlp_policy import MlpPolicy
    from baselines.common import set_global_seeds
    from baselines.ppo1 import pposgd_simple
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    logger.set_level(logger.DISABLED)
    workerseed = seed + 1000000 * rank

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    if render:
        env.render()
    env.seed(workerseed)
    set_global_seeds(workerseed)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=horizon,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=optim_epochs,
                        optim_stepsize=optim_stepsize,
                        optim_batchsize=optim_batchsize,
                        gamma=gamma,  # pass the train() argument rather than a hard-coded 0.99
                        lam=0.95,
                        schedule=schedule,
                        flight_log=None,
                        ckpt_dir=ckpt_dir,
                        restore_dir=restore_dir,
                        save_timestep_period=ckpt_freq)
    env.close()
Example #21
def train(env_id, num_timesteps, seed, save_model, load_model, model_dir):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    rank = MPI.COMM_WORLD.Get_rank()
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = SoccerEnv(rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        save_model=save_model,
                        load_model=load_model,
                        model_dir=model_dir)
    env.close()
Example #22
def train(env_id,
          backend,
          num_timesteps,
          seed,
          stdev=0.,
          collision_detector='bullet'):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env_dist = dr.dist.Normal(env_id, backend, stdev=stdev)
    env_dist.seed(seed)
    set_global_seeds(seed)

    pposgd_simple.learn(
        env_dist,
        collision_detector,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
Example #23
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=256,
                                    num_hid_layers=3)

    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=5,
        optim_stepsize=3e-4,
        optim_batchsize=256,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
Example #24
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Example #25
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=3,
                                    gmm_comp=1)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=int(5000),
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        callback=callback)
    env.close()
Example #26
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Robotics environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    rank = MPI.COMM_WORLD.Get_rank()
    with mujoco_py.ignore_mujoco_warnings():
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=256, num_hid_layers=3,
                                        sess=sess, placeholders=placeholders)

        pposgd_simple.learn(env, policy_fn,
                            max_timesteps=num_timesteps,
                            timesteps_per_actorbatch=2048,
                            clip_param=0.2, entcoeff=0.0,
                            optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                            gamma=0.99, lam=0.95, schedule='linear')
        env.close()
Example #27
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)
    env = make_atari(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Example #28
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    from gym.envs.registration import register
    # Potential Pendulum Env
    if (env_id == 'Pendulumnf-v0'):
        register(
            id='Pendulumnf-v0',
            entry_point='nfunk.envs_nf.pendulum_nf:PendulumEnv',
            max_episode_steps=400,
            #kwargs = vars(args),
        )
        env = gym.make('Pendulumnf-v0')
    # Potential Scalar Env
    elif (env_id == 'Scalarnf-v0'):
        register(
            id='Scalarnf-v0',
            entry_point='nfunk.envs_nf.gym_scalar_nf:GymScalarEnv',
            max_episode_steps=400,
            #kwargs = vars(args),
        )
        env = gym.make('Scalarnf-v0')
    # Potential CartPole environment (custom, continuous-action version)
    elif (env_id == 'CartPole-v9'):
        register(
            id='CartPole-v9',
            entry_point='nfunk.envs_nf.cartpole:CartPoleEnv',
            max_episode_steps=200,
            #kwargs = vars(args),
        )
        env = gym.make('CartPole-v9')
    else:
        env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        seed=seed)
    env.close()
Example #29
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    from gym.envs.registration import register
    # Potential Pendulum Env
    if (env_id=='Pendulumnf-v0'):
        register(
            id='Pendulumnf-v0',
            entry_point='nfunk.envs_nf.pendulum_nf:PendulumEnv',
            max_episode_steps=400,
            #kwargs = vars(args),
        )
        env = gym.make('Pendulumnf-v0')
    # Potential Scalar Env
    elif (env_id=='Scalarnf-v0'):
        register(
            id='Scalarnf-v0',
            entry_point='nfunk.envs_nf.gym_scalar_nf:GymScalarEnv',
            max_episode_steps=400,
            #kwargs = vars(args),
        )
        env = gym.make('Scalarnf-v0')
    # Potential CartPole environment (custom, continuous-action version)
    elif (env_id=='CartPole-v9'):
        register(
            id='CartPole-v9',
            entry_point='nfunk.envs_nf.cartpole:CartPoleEnv',
            max_episode_steps=200,
            #kwargs = vars(args),
        )
        env = gym.make('CartPole-v9')
    else:
        env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2, num_options=num_options, dc=dc)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only two options or primitive actions are currently supported.")
        sys.exit()

    pposgd_simple.learn(env, policy_fn, 
            max_timesteps=num_timesteps,
            timesteps_per_batch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-5, optim_batchsize=optimsize,
            gamma=0.99, lam=0.95, schedule='constant', num_options=num_options,
            app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed, dc=dc
        )
    env.close()
Example #30
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    #env = make_atari(env_id)

    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    # env = gym_super_mario_bros.make('SuperMarioBrosNoFrameskip-v3')

    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    env = ProcessFrame84(env)

    env = FrameMemoryWrapper(env)

    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    #env = wrap_deepmind(env)
    env.seed(workerseed)

    def render_callback(lcl, _glb):
        # print(lcl['episode_rewards'])
        total_steps = lcl['env'].total_steps
        #if total_steps % 1000 == 0:
        #    print("Saving model to mario_model.pkl")
        #    act.save("../models/mario_model_{}.pkl".format(modelname))
        env.render()

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=2048,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4,
        optim_stepsize=1e-3, # 3e-4
        optim_batchsize=64, #256
        gamma=0.99, lam=0.95,
        schedule='linear',
        callback=render_callback
    )
    env.close()
Example #31
def train(num_timesteps,
          seed,
          save_model_with_prefix,
          restore_model_from_file,
          save_after,
          load_after_iters,
          viz=False,
          stochastic=True):

    from baselines.ppo1 import pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()

    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    g = tf.get_default_graph()
    with g.as_default():
        tf.set_random_seed(workerseed)

    env = ProstheticsEnv_R2_multiclip(visualize=viz)
    env_string = str(env).split('<')[1]

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=312,
                         num_hid_layers=2)

    env.seed(workerseed)

    pposgd_simple.learn(env,
                        workerseed,
                        policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=1536,
                        clip_param=0.2,
                        entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-3,
                        optim_batchsize=512,
                        gamma=0.999,
                        lam=0.9,
                        schedule='linear',
                        save_model_with_prefix=save_model_with_prefix,
                        save_prefix=env_string,
                        restore_model_from_file=restore_model_from_file,
                        load_after_iters=load_after_iters,
                        save_after=save_after,
                        stochastic=stochastic)
    env.close()
Example #32
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id')
    parser.add_argument('-l', '--load')
    parser.add_argument('-e', '--episodes-per-batch', type=int, default=2500)
    args = parser.parse_args()

    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()
    U.single_threaded_session().__enter__()

    log_dir = 'logs/{}_{}'.format(
        args.gym_id,
        datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
    logger.configure(dir=log_dir, format_strs=None if rank == 0 else [])

    env = bench.Monitor(gym.make(args.gym_id),
                        os.path.join(logger.get_dir(), str(rank)))

    episodes_per_actorbatch = args.episodes_per_batch // size
    timesteps_per_actorbatch = episodes_per_actorbatch * 200

    callbacks = []
    if rank == 0:
        video_episodes_monitor_callback = VideoEpisodesMonitorCallback()
        callbacks.append(video_episodes_monitor_callback)
        monitor_path = os.path.join(log_dir, 'monitor')
        env = VideoMonitor(
            env,
            monitor_path,
            video_callable=video_episodes_monitor_callback.should_monitor)

    callbacks += [
        ReloadCallback(model_path=args.load),
        # HardEnvCallback(env=env, switch_iterations=10000, linear_schedule=True),
        # VersusCallback(env=env, start_iterations=20, threshold_iterations=20, default_ai_weight=2,
        #                latest_models_proportion=0.5, load_first_model=False),
    ]
    if rank == 0:
        callbacks += [
            SaveCallback(log_dir=log_dir),
        ]
    pposgd_simple.learn(
        env,
        env.unwrapped.policy_class,
        max_iters=1000000,
        timesteps_per_actorbatch=timesteps_per_actorbatch,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=6,
        optim_stepsize=1e-3,
        optim_batchsize=4096,
        gamma=0.995,
        lam=0.95,
        schedule='constant',
        callback=lambda lv, gv: [cb(lv, gv) for cb in callbacks],
    )
    env.close()
Example #33
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    env.close()
Example #34
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)

    # The parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, though they are not
    # necessarily an absolute optimum.
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, 
            optim_stepsize=3e-4, 
            optim_batchsize=64, 
            gamma=0.99, 
            lam=0.95,
            schedule='linear',
        )
    env.close()
    if model_path:
        U.save_state(model_path)
        
    return pi
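Because this variant returns the trained policy object, a short evaluation rollout can follow training. The sketch below assumes the standard baselines ppo1 policy interface, where pi.act(stochastic, ob) returns an action and a value estimate, and env is a freshly created instance of the same environment; the evaluate helper is hypothetical.

def evaluate(pi, env, num_episodes=5, stochastic=False):
    # Roll out the trained policy for a few episodes and return the mean episodic reward.
    returns = []
    for _ in range(num_episodes):
        ob = env.reset()
        done, total = False, 0.0
        while not done:
            ac, _vpred = pi.act(stochastic, ob)
            ob, rew, done, _info = env.step(ac)
            total += rew
        returns.append(total)
    return sum(returns) / len(returns)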
Example #35
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn, 
            max_timesteps=num_timesteps,
            timesteps_per_batch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    env.close()
Example #36
import gym
from baselines.ppo1 import mlp_policy, pposgd_simple
import tensorflow as tf

env = gym.make("MountainCarContinuous-v0")

g = tf.Graph()
with g.as_default():
    # tf.reset_default_graph()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    pposgd_simple.learn(env,
            policy_fn,
            max_timesteps=10000,
            timesteps_per_actorbatch=2048, # timesteps per actor per update
            # timesteps_per_actorbatch=128, # timesteps per actor per update
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            save_model_with_prefix=str(env.__class__.__name__),  # typically the environment class name
            outdir="/tmp/experiments/continuous/PPO/" # path for the log files (tensorboard) and models
        )

    # act.save("models/mountaincar_continuous_model_PPO_"+str(m)+".pkl")