Example #1
import argparse

import tensorflow as tf

# logger, set_global_seeds, make_atari, Monitor, wrap_deepmind, cnn_to_mlp and
# fit are assumed to be provided by the surrounding project (baselines-style
# helpers); their import paths are not shown in this snippet.


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = Monitor(env, logger.get_dir())
    env = wrap_deepmind(env)
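    # Q-network: three conv layers plus a single 256-unit hidden layer;
    # the dueling architecture is toggled by --dueling.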
    model = cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    fit(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()
    # Close the default TF session if one is still open; deleting the local
    # reference alone would not release it.
    sess = tf.get_default_session()
    if sess is not None:
        sess.close()
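A minimal invocation sketch for the script above: the entry-point guard and the example command line are assumptions (the script filename is illustrative), while the flag names come straight from the parser defined in main().

if __name__ == '__main__':
    # e.g. python run_atari_dqn.py --env BreakoutNoFrameskip-v4 --seed 0 \
    #          --prioritized 1 --dueling 1 --num-timesteps 10000000
    main()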
Example #2
import os

from mpi4py import MPI

# logger, set_global_seeds, Model, Monitor, make_atari, wrap_deepmind,
# PPO1Cnn, PPO1Mlp and PPOSGD are assumed to be provided by the surrounding
# project (baselines-style helpers).


def fit(environ, env_id, num_timesteps, seed, model_path=None):
    # Train a PPO1 agent on the selected platform ('atari', 'mujoco',
    # 'humanoid' or 'robotics') and return the trained policy.
    # atari
    if environ == 'atari':
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
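        # Only the rank-0 MPI worker writes full logs; other ranks stay quiet.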
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = (seed + 10000 * rank) if seed is not None else None
        set_global_seeds(workerseed)
        env = make_atari(env_id)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Cnn(name=name, ob_space=ob_space, ac_space=ac_space)

        env = Monitor(
            env,
            logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)

        env = wrap_deepmind(env)
        env.seed(workerseed)

        pi = PPOSGD(env,
                    policy_fn,
                    env.observation_space,
                    env.action_space,
                    timesteps_per_actorbatch=256,
                    clip_param=0.2,
                    entcoeff=0.01,
                    optim_epochs=4,
                    optim_stepsize=1e-3,
                    optim_batchsize=64,
                    gamma=0.99,
                    lam=0.95,
                    max_timesteps=int(num_timesteps * 1.1),
                    schedule='linear')

        env.close()
        sess.close()
        return pi

    # mujoco
    if environ == 'mujoco':
        from utils.cmd import make_mujoco_env

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=64,
                           num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        sess.close()
        return pi

    if environ == 'humanoid':
        import gym
        from utils.cmd import make_mujoco_env

        env_id = 'Humanoid-v2'

        class RewScale(gym.RewardWrapper):
            def __init__(self, env, scale):
                gym.RewardWrapper.__init__(self, env)
                self.scale = scale

            def reward(self, r):
                return r * self.scale

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=64,
                           num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)

        # The parameters below were the best found in a simple random search;
        # they are good enough to make the humanoid walk, but they are not
        # guaranteed to be optimal.
        env = RewScale(env, 0.1)
        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        if model_path:
            Model().save_state(model_path)

        sess.close()
        return pi

    if environ == 'robotics':
        import mujoco_py
        from utils.cmd import make_robotics_env
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
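        # Suppress MuJoCo warnings for the remainder of the run.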
        mujoco_py.ignore_mujoco_warnings().__enter__()
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=256,
                           num_hid_layers=3)

        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=5,
            optim_stepsize=3e-4,
            optim_batchsize=256,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        sess.close()
        return pi
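A usage sketch for the dispatcher above: the call shape follows the fit() signature, but the environment IDs and timestep counts are illustrative, and the 'atari' branch expects to run under MPI.

if __name__ == '__main__':
    # Train PPO1 on a MuJoCo task and keep the returned policy.
    pi = fit('mujoco', 'Hopper-v2', num_timesteps=1000000, seed=0)
    # Atari variant (typically launched with an MPI runner such as mpirun):
    # pi = fit('atari', 'PongNoFrameskip-v4', num_timesteps=10000000, seed=0)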
Example #3
import os

from mpi4py import MPI

# arg_parser, logger, set_global_seeds, TRPO and fit are assumed to be
# provided by the surrounding project (baselines-style helpers); platform-
# specific imports happen inside each branch below.


def main():
    parser = arg_parser()
    parser.add_argument('--platform',
                        help='environment choice',
                        choices=['atari', 'mujoco'],
                        default='atari')

    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform

    rank = MPI.COMM_WORLD.Get_rank()

    # atari
    if platform == 'atari':
        from bench import Monitor
        from utils.cmd import atari_arg_parser, make_atari, \
            wrap_deepmind
        from policies.nohashingcnn import CnnPolicy

        args = atari_arg_parser().parse_known_args()[0]
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])

        workerseed = args.seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_atari(args.env)

        env = Monitor(
            env,
            logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)

        env = wrap_deepmind(env)
        env.seed(workerseed)

        model = TRPO(CnnPolicy, env.observation_space, env.action_space)
        sess = model.single_threaded_session().__enter__()
        # model.reset_graph_and_vars()
        model.init_vars()

        fit(model,
            env,
            timesteps_per_batch=512,
            max_kl=0.001,
            cg_iters=10,
            cg_damping=1e-3,
            max_timesteps=int(args.num_timesteps * 1.1),
            gamma=0.98,
            lam=1.0,
            vf_iters=3,
            vf_stepsize=1e-4,
            entcoeff=0.00)
        sess.close()
        env.close()

    # mujoco
    if platform == 'mujoco':
        from policies.ppo1mlp import PPO1Mlp
        from utils.cmd import make_mujoco_env, mujoco_arg_parser
        args = mujoco_arg_parser().parse_known_args()[0]

        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)

        workerseed = args.seed + 10000 * rank

        env = make_mujoco_env(args.env, workerseed)

        def policy(name, observation_space, action_space):
            # Use the spaces passed in rather than closing over env.
            return PPO1Mlp(name,
                           observation_space,
                           action_space,
                           hid_size=32,
                           num_hid_layers=2)

        model = TRPO(policy, env.observation_space, env.action_space)
        sess = model.single_threaded_session().__enter__()
        model.init_vars()

        fit(model,
            env,
            timesteps_per_batch=1024,
            max_kl=0.01,
            cg_iters=10,
            cg_damping=0.1,
            max_timesteps=args.num_timesteps,
            gamma=0.99,
            lam=0.98,
            vf_iters=5,
            vf_stepsize=1e-3)
        sess.close()
        env.close()
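Both branches above key off the MPI rank, so the script is normally launched with an MPI runner; the entry-point guard and the command below are a sketch with an illustrative script name.

if __name__ == '__main__':
    # e.g. mpirun -np 4 python run_trpo.py --platform atari
    main()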