Example #1
def train(args, env):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)

    network = args.network
    logger.configure()

    if os.path.exists(args.load_path):
        model = ppo2.learn(network=network,
                           env=env,
                           load_path=args.load_path,
                           total_timesteps=args.total_timesteps,
                           nsteps=args.nsteps,
                           save_interval=args.save_interval,
                           lr=args.lr,
                           num_layers=args.num_layers)
    else:
        print('Warning: PATH ', args.load_path, ' does not exist.')
        model = ppo2.learn(network=network,
                           env=env,
                           total_timesteps=args.total_timesteps,
                           nsteps=args.nsteps,
                           save_interval=args.save_interval,
                           lr=args.lr,
                           num_layers=args.num_layers)

    model.save(args.save_path)
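
Example #1 pulls every hyperparameter off an args namespace. Below is a minimal sketch of argparse wiring that could feed it; the flag names mirror the attributes the snippet reads, but the defaults are placeholders, not values from the original repository.

import argparse

# Hypothetical CLI for the train(args, env) snippet above; defaults are placeholders.
parser = argparse.ArgumentParser()
parser.add_argument('--network', default='mlp')
parser.add_argument('--load_path', default='./checkpoints/latest')
parser.add_argument('--save_path', default='./checkpoints/latest')
parser.add_argument('--total_timesteps', type=int, default=int(1e6))
parser.add_argument('--nsteps', type=int, default=2048)
parser.add_argument('--save_interval', type=int, default=10)
parser.add_argument('--lr', type=float, default=3e-4)
parser.add_argument('--num_layers', type=int, default=2)
args = parser.parse_args()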
Example #2
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': LstmPolicy,
        'lnlstm': LnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    ppo2.learn(policy=policy,
               env=env,
               n_steps=128,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=4,
               log_interval=1,
               ent_coef=.01,
               learning_rate=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
Example #3
def train():
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=args.num_cpus,
                            inter_op_parallelism_threads=args.num_cpus)
    tf.Session(config=config).__enter__()

    env = RemoteVecEnv([create_env] * args.num_cpus)
    env = VecNormalize(env, ret=True, gamma=args.gamma)

    ppo2.learn(policy=policies.MlpPolicy,
               env=env,
               total_timesteps=args.num_timesteps,
               nminibatches=args.num_minibatches,
               nsteps=args.num_steps,
               noptepochs=args.num_epochs,
               lr=args.learning_rate,
               gamma=args.gamma,
               lam=args.lam,
               ent_coef=args.ent_coef,
               vf_coef=args.vf_coef,
               cliprange=args.clip_range,
               log_interval=args.log_interval,
               save_interval=args.save_interval,
               load_path=args.checkpoint_path,
               num_casks=args.num_casks)
Example #4
def train(num_timesteps, seed):
    num_cpus = 1
    num_casks = 1
    num_cpus += num_casks

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_cpus,
                            inter_op_parallelism_threads=num_cpus)
    tf.Session(config=config).__enter__()

    gamma = 0.995

    env = RemoteVecEnv([make_env] * num_cpus)
    env = VecNormalize(env, ret=True, gamma=gamma)

    set_global_seeds(seed)
    policy = policies.MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=num_cpus-num_casks,
               lam=0.95,
               gamma=gamma,
               noptepochs=4,
               log_interval=1,
               vf_coef=0.5,
               ent_coef=0.0,
               lr=3e-4,
               cliprange=0.2,
               save_interval=2,
               load_path="./logs/course_6/00244",
               total_timesteps=num_timesteps,
               num_casks=num_casks)
Example #5
def main():
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config):
        #env = make_env
        #env = lambda: make_training_env('SonicTheHedgehog-Genesis', 'GreenHillZone.Act1', stack=True, scale_rew=True)
        env = MultigameEnvWrapper
        #load_path = '/root/compo/trained_on_images_nature_cnn.joblib'
        load_path = './saved_weights.joblib'
        logger.configure(dir='./logs', format_strs=['stdout', 'tensorboard'])

        # Take more timesteps than we need to be sure that
        # we stop due to an exception.
        ppo2.learn(policy=CustomCnnPolicy,
                   env=DummyVecEnv([env]),
                   nsteps=4096,
                   nminibatches=8,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=3,
                   log_interval=1,
                   ent_coef=0.01,
                   lr=lambda _: 2e-4,
                   cliprange=lambda _: 0.1,
                   total_timesteps=int(1e8),
                   load_path=load_path,
                   save_interval=20)
Example #6
def train(env_id, num_timesteps, seed, render):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import DynamicLstmPolicy
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    tf.Session(config=config).__enter__()

    def make_env(rank):
        def env_fn():
            env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)
            env = bench.Monitor(env, logger.get_dir())
            return env
        return env_fn

    env = SubprocVecEnv([make_env(i) for i in range(ENVIRONMENTS)])

    set_global_seeds(seed)
    policy = DynamicLstmPolicy
    ppo2.learn(policy=policy, env=env, nsteps=1000, nminibatches=3,
        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
        save_interval=10,
        ent_coef=0.002,
        lr=1e-4,
        cliprange=0.2,
        total_timesteps=num_timesteps)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', default='Airstriker-Genesis')
    parser.add_argument('--state', default=retro.State.DEFAULT)
    parser.add_argument('--scenario', default=None)
    args = parser.parse_args()

    def make_env():
        env = make_retro(game=args.game,
                         state=args.state,
                         scenario=args.scenario)
        env = wrap_deepmind_retro(env)
        return env

    venv = SubprocVecEnv([make_env] * 8)
    ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(100e6),
        nsteps=128,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=4,
        log_interval=1,
        ent_coef=.01,
        lr=lambda f: f * 2.5e-4,
        cliprange=0.1,
    )
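
Note on the schedules: several of these examples pass callables for lr and cliprange. In baselines' ppo2, the callable is evaluated each update with the fraction of training remaining (roughly 1.0 at the start, approaching 0 at the end), so lr=lambda f: f * 2.5e-4 is a linear decay from 2.5e-4 toward 0. A standalone sketch of that behaviour, with no baselines dependency:

# Linear-decay schedule, as in lr=lambda f: f * 2.5e-4 above.
lr_schedule = lambda frac_remaining: frac_remaining * 2.5e-4

total_updates = 4  # toy number of PPO updates, for illustration only
for update in range(1, total_updates + 1):
    frac_remaining = 1.0 - (update - 1.0) / total_updates
    print(update, lr_schedule(frac_remaining))
# 1 0.00025
# 2 0.0001875
# 3 0.000125
# 4 6.25e-05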
Example #8
def run():
  """Runs a PPO agent on a given environment."""

  def _load_env():
    """Loads environment."""
    raw_env = rwrl.load(
        domain_name=FLAGS.domain_name,
        task_name=FLAGS.task_name,
        safety_spec=dict(enable=True),
        delay_spec=dict(enable=True, actions=20),
        log_output=os.path.join(FLAGS.save_path, 'log.npz'),
        environment_kwargs=dict(
            log_safety_vars=True, log_every=20, flat_observation=True))
    env = GymEnv(raw_env)
    env = bench.Monitor(env, FLAGS.save_path)
    return env

  env = dummy_vec_env.DummyVecEnv([_load_env])

  ppo2.learn(
      env=env,
      network=FLAGS.network,
      lr=FLAGS.learning_rate,
      total_timesteps=FLAGS.total_timesteps,  # make sure to run enough steps
      nsteps=FLAGS.nsteps,
      gamma=FLAGS.agent_discount,
  )
Example #9
def train(env_id, num_timesteps, seed, policy):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    gym.logger.setLevel(logging.WARN)
    tf.Session(config=config).__enter__()
    nenvs = 8
    def make_env(rank):
        def env_fn():
            print(rank)
            if nenvs == 1:
                env = MarioEnv(num_steering_dir=11, jump=True)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank, jump=True)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return env_fn

    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    set_global_seeds(seed)
    env = VecFrameStack(env, 4)
    policy = {'cont': ContCnnPolicy, 'cnn' : OurCNN2, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
        lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
        ent_coef=.01,
        lr=lambda f : f * 1e-3,
        cliprange=lambda f : f * 0.1,
        total_timesteps=int(num_timesteps * 1.1),
        save_interval=10)
Example #10
def train():
    """Trains a PPO2 policy."""
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    vec_env = SubprocVecEnv([(lambda _i=i: create_single_football_env(_i))
                             for i in range(FLAGS.num_envs)],
                            context=None)

    ppo2.learn(network=FLAGS.policy,
               total_timesteps=FLAGS.num_timesteps,
               env=vec_env,
               seed=FLAGS.seed,
               nsteps=FLAGS.nsteps,
               nminibatches=FLAGS.nminibatches,
               noptepochs=FLAGS.noptepochs,
               gamma=FLAGS.gamma,
               ent_coef=FLAGS.ent_coef,
               lr=FLAGS.lr,
               log_interval=1,
               save_interval=FLAGS.save_interval,
               cliprange=FLAGS.cliprange)
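
The environment constructors above use a default argument, (lambda _i=i: create_single_football_env(_i)), to freeze the loop index. A plain lambda: create_single_football_env(i) would capture i by reference, so every worker would end up constructing the last environment. A minimal, environment-free demonstration of the difference:

# Late-binding pitfall that the default-argument trick avoids.
late_bound = [lambda: i for i in range(3)]
frozen = [lambda _i=i: _i for i in range(3)]

print([f() for f in late_bound])  # [2, 2, 2] -- every closure sees the final i
print([f() for f in frozen])      # [0, 1, 2] -- each captured its own value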
Example #11
def train(angle, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    with tf.Session() as sess:

        def make_env():
            return ant_env(angle)
            # env = gym.make('Ant-v1')
            # return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
        # env = ant_env(angle)

        set_global_seeds(seed)
        policy = MlpPolicy
        ppo2.learn(policy=policy,
                   env=env,
                   nsteps=2048,
                   nminibatches=32,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=10,
                   log_interval=10,
                   ent_coef=0.0,
                   lr=3e-4,
                   cliprange=0.2,
                   total_timesteps=num_timesteps)
Example #12
def main(unused_argv):
    rs = FLAGS.random_seed
    if FLAGS.random_seed is None:
        rs = int((time.time() % 1) * 1000000)

    logger.configure(dir=FLAGS.train_log_dir, format_strs=['log'])

    players = []
    players.append(sc2_env.Agent(races[FLAGS.agent_race]))
    players.append(sc2_env.Agent(races[FLAGS.oppo_race]))

    screen_res = (int(FLAGS.screen_ratio * FLAGS.screen_resolution) // 4 * 4,
                  FLAGS.screen_resolution)
    if FLAGS.agent_interface_format == 'feature':
        agent_interface_format = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(
                screen=screen_res, minimap=FLAGS.minimap_resolution))
    elif FLAGS.agent_interface_format == 'rgb':
        agent_interface_format = sc2_env.AgentInterfaceFormat(
            rgb_dimensions=sc2_env.Dimensions(
                screen=screen_res, minimap=FLAGS.minimap_resolution))
    else:
        raise NotImplementedError

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()

    #flags.DEFINE_float('param_tstep', 100000, 'the parameter total step')
    param_lam = FLAGS.param_lam
    param_gamma = FLAGS.param_gamma
    param_concurrent = FLAGS.param_concurrent
    param_lr = FLAGS.param_lr
    param_cr = FLAGS.param_cr
    param_tstep = FLAGS.param_tstep
    print('params, lam={} gamma={} concurrent={} lr={} tstep={}'.format(
        param_lam, param_gamma, param_concurrent, param_lr, param_tstep))

    env = make_sc2_dis_env(num_env=param_concurrent,
                           seed=rs,
                           players=players,
                           agent_interface_format=agent_interface_format)

    ppo2.learn(policy=CnnPolicy,
               env=env,
               nsteps=128,
               nminibatches=1,
               lam=param_lam,
               gamma=param_gamma,
               noptepochs=4,
               log_interval=1,
               ent_coef=0.01,
               lr=lambda f: f * param_lr,
               cliprange=lambda f: f * param_cr,
               total_timesteps=param_tstep,
               save_interval=10)
Example #13
def train(_):
    """Trains a PPO2 policy."""
    vec_env = SubprocVecEnv([(lambda _i=i: create_single_football_env(_i))
                             for i in range(FLAGS.num_envs)],
                            context=None)

    # Import tensorflow after we create environments. TF is not fork-safe, and
    # we could be using TF as part of the environment if one of the players is
    # controlled by an already trained model.
    import tensorflow.compat.v1 as tf
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    ppo2.learn(network=FLAGS.policy,
               total_timesteps=FLAGS.num_timesteps,
               env=vec_env,
               seed=FLAGS.seed,
               nsteps=FLAGS.nsteps,
               nminibatches=FLAGS.nminibatches,
               noptepochs=FLAGS.noptepochs,
               max_grad_norm=FLAGS.max_grad_norm,
               gamma=FLAGS.gamma,
               ent_coef=FLAGS.ent_coef,
               lr=FLAGS.lr,
               log_interval=1,
               save_interval=FLAGS.save_interval,
               cliprange=FLAGS.cliprange,
               load_path=FLAGS.load_path)
Example #14
def train(env_id, num_timesteps, seed, policy):

    ncpu = multiprocessing.cpu_count()

    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    tf.Session(config=config).__enter__()

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': LstmPolicy,
        'lnlstm': LnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=4,
               log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
Example #15
def run(bsuite_id: Text) -> Text:
    """Runs a PPO agent on a given bsuite environment, logging to CSV."""
    def _load_env():
        raw_env = bsuite.load_and_record(
            bsuite_id=bsuite_id,
            save_path=FLAGS.save_path,
            logging_mode=FLAGS.logging_mode,
            overwrite=FLAGS.overwrite,
        )
        if FLAGS.verbose:
            raw_env = terminal_logging.wrap_environment(raw_env,
                                                        log_every=True)
        return gym_wrapper.GymFromDMEnv(raw_env)

    env = dummy_vec_env.DummyVecEnv([_load_env])

    ppo2.learn(
        env=env,
        network=FLAGS.network,
        lr=FLAGS.learning_rate,
        total_timesteps=FLAGS.total_timesteps,  # make sure to run enough steps
        nsteps=FLAGS.nsteps,
        gamma=FLAGS.agent_discount,
    )

    return bsuite_id
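
Since run() returns the bsuite_id, it is straightforward to sweep the agent over the whole benchmark. A hedged sketch follows; it assumes bsuite's sweep module and the same FLAGS setup as above, and bsuite's own baselines typically parallelize this step rather than looping sequentially.

# Hypothetical driver: run the PPO agent on every bsuite environment in sequence.
from bsuite import sweep

for bsuite_id in sweep.SWEEP:
    run(bsuite_id)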
Example #16
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()
    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env
    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
        ent_coef=0.0,
        lr=3e-4,
        cliprange=0.2,
        total_timesteps=num_timesteps)
Example #17
def main(policy, env, params):
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config):
        # Take more timesteps than we need to be sure that
        # we stop due to an exception.
        ppo2.learn(policy=policy,
                   env=env,
                   nsteps=params['n_steps'],
                   nminibatches=(params['n_steps'] * env.num_envs) //
                   params["batch_size"],
                   lam=params["lam"],
                   gamma=params['gamma'],
                   noptepochs=params["n_opt_epochs"],
                   log_interval=params["log_interval"],
                   ent_coef=params["ent_coef"],
                   vf_coef=params['vf_coef'],
                   lr=lambda _: params["lr"],
                   cliprange=lambda _: params['cliprange'],
                   max_grad_norm=params['max_grad_norm'],
                   total_timesteps=params["max_steps"],
                   save_interval=params["save_interval"],
                   weights_path=params["weights_path"],
                   adam_stats=params["adam_stats"],
                   nmixup=params["nmixup"],
                   weights_choose_eps=params["weights_choose_eps"],
                   cnn=params['cnn'])
Example #18
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()
    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env
    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
        ent_coef=0.0,
        lr=3e-4,
        cliprange=0.2,
        total_timesteps=num_timesteps)
Example #19
def train(env_id, num_timesteps, seed, d_targ, load, point):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import LstmMlpPolicy, MlpPolicy
    import gym
    # import roboschool
    import multiprocessing
    import tensorflow as tf
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return env

        return _thunk

    set_global_seeds(seed)

    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    nenvs = 32
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    env = VecNormalize(env)

    policy = MlpPolicy

    def adaptive_lr(lr, kl, d_targ):
        if kl < (d_targ / 1.5):
            lr *= 2.
        elif kl > (d_targ * 1.5):
            lr *= .5
        return lr

    ppo2.learn(policy=policy,
               env=env,
               nsteps=512,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=15,
               log_interval=1,
               ent_coef=0.00,
               lr=adaptive_lr,
               cliprange=0.2,
               total_timesteps=num_timesteps,
               load=load,
               point=point,
               init_targ=d_targ)
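
The adaptive_lr helper in this example rescales the learning rate from the observed KL divergence: well below the target, the step size doubles; well above it, the step size halves; inside the band it is left alone. A self-contained illustration with made-up numbers:

def adaptive_lr(lr, kl, d_targ):
    # Same rule as in Example #19 above.
    if kl < (d_targ / 1.5):
        lr *= 2.
    elif kl > (d_targ * 1.5):
        lr *= .5
    return lr

print(adaptive_lr(3e-4, kl=0.001, d_targ=0.01))  # 0.0006   (KL far below target -> double)
print(adaptive_lr(3e-4, kl=0.02, d_targ=0.01))   # 0.00015  (KL above target -> halve)
print(adaptive_lr(3e-4, kl=0.01, d_targ=0.01))   # 0.0003   (inside the band -> unchanged)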
Example #20
def main(environment, nr_episodes):
    env = make_unity_env(environment, 1, True)
    ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=nr_episodes,
        lr=1e-3,
    )
Example #21
def train(env_id, num_timesteps, seed, policy):
    from baselines.common import set_global_seeds
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.vec_frame_stack import VecFrameStack
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy
    import gym
    import logging
    import multiprocessing
    import os.path as osp
    import tensorflow as tf
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    gym.logger.setLevel(logging.WARN)
    tf.Session(config=config).__enter__()

    def make_env(rank):
        def env_fn():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            return env  # wrap_deepmind(env)

        return env_fn

    nenvs = 8
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    set_global_seeds(seed)
    env = VecFrameStack(env, 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': LstmPolicy,
        'lnlstm': LnLstmPolicy,
        'mlp': MlpPolicy,
        'capsules': CapsulesPolicy
    }[policy]
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=4,
               log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
Example #22
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
Example #23
def main():
    arenas_configurations = ArenaConfig(
        "configurations/arena_configurations/train_ml_agents_arenas.yml")
    env = make_aai_env("env/AnimalAI", 2, arenas_configurations)
    ppo2.learn(
        network="cnn",
        env=env,
        total_timesteps=100000,
        lr=1e-3,
    )
Example #24
def main():
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--num_env', default=1, type=int)
    parser.add_argument('--seed', default=None, type=int)
    parser.add_argument('--game', default='ContraIII-Snes')
    parser.add_argument(
        '--state',
        default='level1.1player.easy.100lives')  #state=retro.State.DEFAULT
    parser.add_argument('--scenario', default='scenario')
    parser.add_argument('--discrete_actions', default=0, type=int)
    parser.add_argument('--bk2dir', default='videos')
    parser.add_argument('--monitordir', default='logs')
    parser.add_argument('--sonic_discretizer', default=1, type=int)
    parser.add_argument('--clip_rewards', default=0, type=int)
    parser.add_argument('--stack', default=4, type=int)
    parser.add_argument('--time_limit', default=8000, type=int)
    parser.add_argument('--scale_reward', default=0.01, type=float)
    parser.add_argument('--warp_frame', default=1, type=int)
    parser.add_argument('--stochastic_frame_skip', default=4, type=int)
    parser.add_argument('--skip_prob', default=0.0, type=float)
    parser.add_argument('--network', default='cnn')
    parser.add_argument('--scenario_number', default=1, type=int)
    parser.add_argument('--load_path', default=None)
    args = parser.parse_args()
    time_int = int(time.time())

    env_vec = make_vec_env(args, time_int)
    logger.configure(dir='./log/{}'.format(time_int),
                     format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    with tf.Session(config=config):
        ppo2.learn(
            network='impala_cnn',  # args.network; alternatives: 'contra_net', 'cnn'
            env=env_vec,
            nsteps=1024,  #1024,
            nminibatches=128,  #16,256,512,64,128
            lam=0.95,
            gamma=0.997,  #0.99
            noptepochs=3,  #3,
            log_interval=100,
            ent_coef=0.003,  #0.003,#0.003, 0.001, 0.005 #many actions #0.01
            lr=lambda _: 5e-5,  #2e-4,1e-4,5e-5
            cliprange=0.1,
            save_interval=100,
            seed=args.seed,
            vf_coef=0.5,
            max_grad_norm=0.5,
            save_path='ppo_save/{}'.format(time_int),
            #load_path=args.load_path,
            total_timesteps=int(2e10))
Example #25
def train(env_id, num_timesteps, seed, nsteps, batch_size, epoch, method,
          net_size, i_trial, load_path, use_entr, ncpu):
    # rank = MPI.COMM_WORLD.Get_rank()
    # if rank != 0:
    #     logger.set_level(logger.DISABLED)

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True

    # workerseed = seed + 10000 * rank
    tf.reset_default_graph()
    set_global_seeds(seed)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 'train-{}.monitor.json'.format(rank)))
            return env

        return _thunk

    # def make_env():
    #     env = gym.make(env_id)
    #     env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
    #     return env

    env = SubprocVecEnv([make_env(i) for i in range(ncpu)])
    # env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    with tf.Session(config=config) as sess:
        policy = MlpPolicy
        ppo2.learn(policy=policy,
                   env=env,
                   nsteps=nsteps,
                   nminibatches=batch_size,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=epoch,
                   log_interval=1,
                   ent_coef=0.01,
                   lr=3e-4,
                   cliprange=0.2,
                   total_timesteps=num_timesteps,
                   useentr=use_entr,
                   net_size=net_size,
                   i_trial=i_trial,
                   load_path=load_path,
                   method=method)
Example #26
def main():
    print('Executable environment', OBS_TOWER_ENVPATH)
    env = make_unity_env(OBS_TOWER_ENVPATH, 1)

    model = ppo2.learn(
        network='mlp',
        env=env,
        total_timesteps=int(1e5),
        lr=1e-3,
    )
    model.save('obs_tower_chall_model.pkl')
Example #27
def main():
    if 1:
        env = gym.make("Pendulum-v0")
        env.num_envs = 1
        act = ppo2.learn(env=env,
                         network='mlp',
                         total_timesteps=0,
                         load_path="pendulum_model_ppo2.pkl")
    else:
        env_id = "pendulum-legacy-v0"
        env_type = "gym_poine"
        num_env = 1
        seed = 1234
        reward_scale = 1.
        flatten_dict_observations = False
        env = make_vec_env(env_id, env_type, num_env, seed, reward_scale,
                           flatten_dict_observations)

        act = ppo2.learn(
            env=env,
            network='mlp',
            total_timesteps=0,
            eval_env=None,
            seed=None,
            nsteps=2048,
            ent_coef=0.0,
            #lr=lambda f : f * 2.5e-4,
            lr=3e-4,
            vf_coef=0.5,
            max_grad_norm=0.5,
            gamma=0.9,  # default 0.99
            lam=0.95,
            log_interval=10,
            nminibatches=32,  # default 4
            noptepochs=10,
            cliprange=0.2,
            save_interval=0,
            load_path="pendulum_model_ppo2.pkl",
            model_fn=None,
            update_fn=None,
            init_fn=None,
            mpi_rank_weight=1,
            comm=None,
        )

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #28
def train(env_id, num_timesteps, seed, pol, cur, vis, model):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import HierPolicy, HierPolicy2, MlpPolicy, RandomWalkPolicy
    import gym
    import gym_program
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    hier = True if pol == 'hier1' or pol == 'hier2' else False

    def make_env():
        set_global_seeds(seed)
        env = gym.make(env_id)
        env.set_curiosity(cur, model)
        env.set_hier(hier)
        env.set_visualize(vis)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)

    if pol == 'hier1': policy = HierPolicy
    elif pol == 'hier2': policy = HierPolicy2
    elif pol == 'mlp': policy = MlpPolicy
    elif pol == 'random_walk':
        pol = RandomWalkPolicy
        pol(env)
        return

    ppo2.learn(policy=policy,
               env=env,
               pol=pol,
               nsteps=2048,
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               lr=1e-4,
               cliprange=0.2,
               total_timesteps=num_timesteps)
Example #29
def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
    ])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy

    nbatch = env.num_envs * hyper_parameters['batch_size']
    training_batch_number = nbatch // hyper_parameters['training_batch_size']

    # import pdb; pdb.set_trace()

    # use AdamOptimizer as optimizer and choose value function same with policy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=hyper_parameters['batch_size'],
               lam=hyper_parameters['gae_lambda'],
               gamma=hyper_parameters['discount'],
               ent_coef=hyper_parameters['policy_ent_coeff'],
               nminibatches=training_batch_number,
               noptepochs=hyper_parameters['training_epochs'],
               max_grad_norm=None,
               lr=hyper_parameters['learning_rate'],
               cliprange=hyper_parameters['lr_clip_range'],
               total_timesteps=hyper_parameters['batch_size'] * hyper_parameters['n_epochs'])  # yapf: disable  # noqa: E501

    return osp.join(log_dir, 'progress.csv')
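
This example derives nminibatches from the rollout size: each update collects nbatch = num_envs * batch_size transitions, and nbatch // training_batch_size minibatches yields SGD batches of roughly training_batch_size samples each. A quick check with hypothetical values (the real ones come from the hyper_parameters dict, which is not shown here):

# Hypothetical hyperparameter values, for illustration only.
num_envs = 1
batch_size = 2048            # plays the role of hyper_parameters['batch_size'] (nsteps)
training_batch_size = 64     # plays the role of hyper_parameters['training_batch_size']

nbatch = num_envs * batch_size                 # 2048 transitions per update
nminibatches = nbatch // training_batch_size   # 32 minibatches
print(nbatch, nminibatches, nbatch // nminibatches)  # 2048 32 64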
Example #30
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        if env_id == 'toy':
            #env = continuous_gridworld.ContinuousGridworld('', max_steps=1000,
            #                                           obstacle_mode=continuous_gridworld.NO_OBJECTS)
            from toy_environment import room_obstacle_list
            env = gridworld.Gridworld(
                obstacle_list_generator=room_obstacle_list.obstacle_list)
        elif env_id == 'navigate':
            env = NavigateEnv(use_camera=False,
                              continuous_actions=True,
                              neg_reward=True,
                              max_steps=500)
        elif env_id == 'arm2pos':
            #env = Arm2PosEnv(continuous=False, max_steps=500)
            pass
        else:
            env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=2048,
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               lr=3e-4,
               cliprange=0.2,
               total_timesteps=num_timesteps)
Example #31
def run_baselines(env, seed, log_dir):
    '''
    Create baselines model and training.

    Replace the ppo and its training with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to output csv file
    '''
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=ncpu,
        inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    def make_env():
        monitor = bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
        return monitor

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(
        policy=policy,
        env=env,
        nsteps=2048,
        nminibatches=32,
        lam=0.95,
        gamma=0.99,
        noptepochs=10,
        log_interval=1,
        ent_coef=0.0,
        lr=1e-3,
        vf_coef=0.5,
        max_grad_norm=None,
        cliprange=0.2,
        total_timesteps=int(1e6))

    return osp.join(log_dir, 'progress.csv')
Example #32
def train(args):
    logger.configure(args.main_path)

    if args.diff_frames:
        assert "stack_frames" in args

    seed = int.from_bytes(os.urandom(4), byteorder='big')
    set_global_seeds(seed)
    env = ImVecNormalize(
        make_multiple_mujoco_env(args.env_id, args.number_of_agents, seed))

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()
    with tf.device("/device:GPU:0"):
        if args.policy == "cnn":
            policy = MImVecPolicy
        elif args.policy == "lstm_cnn":
            policy = MImVecLstmPolicy
        elif args.policy == "lnlstm_cnn":
            policy = MImVecLnLstmPolicy
        else:
            raise ValueError

    ppo2.learn(policy=policy,
               env=env,
               nsteps=args.nsteps,
               nminibatches=args.nminibatches,
               lam=args.lam,
               gamma=args.gamma,
               noptepochs=args.noptepochs,
               log_interval=1,
               ent_coef=0.0,
               lr=args.learning_rate,
               cliprange=args.cliprange,
               total_timesteps=int(args.num_timesteps * 1.01),
               add_flownet=args.add_flownet,
               flownet_path=args.flownet_path,
               flow_key=args.flow_key,
               train_from_scratch=args.train_from_scratch,
               large_cnn=args.large_cnn,
               add_predicted_flow_to_vec=args.add_predicted_flow_to_vec,
               diff_frames=args.diff_frames)
Example #33
def train(env_id, num_timesteps, seed, policy):

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    tf.Session(config=config).__enter__()

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy]
    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
        lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
        ent_coef=.01,
        lr=lambda f : f * 2.5e-4,
        cliprange=lambda f : f * 0.1,
        total_timesteps=int(num_timesteps * 1.1))
Example #34
import pytest
import tensorflow as tf
import random
import numpy as np
from gym.spaces import np_random

from baselines.a2c import a2c
from baselines.ppo2 import ppo2
from baselines.common.identity_env import IdentityEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2.policies import MlpPolicy


learn_func_list = [
    lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000),
    lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01)
]


@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
    '''
    Test if the algorithm (with a given policy) 
    can learn an identity transformation (i.e. return observation as an action)
    '''
    np.random.seed(0)
    np_random.seed(0)
    random.seed(0)

    env = DummyVecEnv([lambda: IdentityEnv(10)])