def test_lstm_example():
    import tensorflow as tf
    from common import policies, models, cmd_util
    from common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv(
        [lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1,
                                                               nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # set up the initial rollout state (observation, LSTM state, done mask)
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break

        assert step_counter > 5
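
# A follow-on sketch (same assumed policy/venv API as the test above): DummyVecEnv
# auto-resets when an episode ends, and feeding the done flags back in as the mask
# M clears the LSTM state at episode boundaries, so several episodes can be rolled
# out back to back without rebuilding anything.
def run_episodes(policy, venv, n_episodes=3):
    ob = venv.reset()
    state = policy.initial_state
    done = [False]
    finished = 0
    while finished < n_episodes:
        action, _, state, _ = policy.step(ob, S=state, M=done)
        ob, _, done, _ = venv.step(action)
        if done[0]:
            finished += 1
    return finished
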
def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn,
                       network='mlp',
                       nsteps=32,
                       total_timesteps=32,
                       seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test,
             model_fn=partial(MicrobatchedModel, microbatch_size=2))
    # learn_fn(env=env_test)
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)
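
# Illustrative sketch only (not baselines' MicrobatchedModel): the property the
# test above checks is that splitting a batch into microbatches, accumulating the
# averaged gradients, and applying a single update stays numerically close to one
# full-batch update. In TF1 terms, for a single variable:
import tensorflow as tf

def build_accumulated_update(loss, var, n_microbatches, lr=1e-2):
    grad = tf.gradients(loss, [var])[0]
    accum = tf.Variable(tf.zeros_like(var), trainable=False)
    zero_op = accum.assign(tf.zeros_like(accum))        # run before each new batch
    accum_op = accum.assign_add(grad / n_microbatches)  # run once per microbatch
    apply_op = tf.train.GradientDescentOptimizer(lr).apply_gradients([(accum, var)])
    return zero_op, accum_op, apply_op                  # zero -> accumulate -> apply
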
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    if network_fn.endswith('lstm') and learn_fn in [
            'acer', 'acktr', 'trpo_mpi', 'deepq'
    ]:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(
                v,
                variables_dict2[k],
                atol=0.01,
                err_msg='saved and loaded variable {} value mismatch'.format(
                    k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()
    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
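
# A minimal usage sketch for make_vec_env above (the env id and env_type are just
# examples; make_env only special-cases the 'atari' and 'retro' types):
venv = make_vec_env('CartPole-v0', 'classic_control', num_env=4, seed=0)
obs = venv.reset()
actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
obs, rewards, dones, infos = venv.step(actions)
venv.close()
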
Example #5
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 cloth_cfg_path=None,
                 render_path=None,
                 start_state_path=None):
    """Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.

    Daniel: the baselines docstring above seems out of date; all env types appear
    to go through here. We also add arguments for the cloth env: the config path,
    the render path, and the starting state path (the last one is optional).
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank,
                   cloth_cfg_path=None,
                   render_path=None,
                   start_state_path=None):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir,
            cloth_cfg_path=cloth_cfg_path,
            render_path=render_path,
            start_state_path=start_state_path,
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([
            make_thunk(
                i + start_index,
                cloth_cfg_path=cloth_cfg_path,
                render_path=None,  # Daniel: for now
                start_state_path=start_state_path) for i in range(num_env)
        ])
    else:
        return DummyVecEnv([
            make_thunk(start_index,
                       cloth_cfg_path=cloth_cfg_path,
                       render_path=render_path,
                       start_state_path=start_state_path)
        ])
Example #6
def make_mujoco_env(env_id, seed, normalize=False, training=True):
    def make_env():
        env = gym.make(env_id)
        env.seed(seed)
        return env

    env = DummyVecEnv([make_env])
    np.random.seed(seed)
    torch.manual_seed(seed)
    if normalize:
        env = VecNormalize(env, training=training)
    return env
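
# Usage sketch for make_mujoco_env above (env id illustrative): a normalized
# training env plus an eval env built with training=False, which in the usual
# VecNormalize implementations stops updating the running statistics.
train_env = make_mujoco_env('HalfCheetah-v2', seed=0, normalize=True)
eval_env = make_mujoco_env('HalfCheetah-v2', seed=1, normalize=True, training=False)
obs = train_env.reset()
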
Example #7
        def make_vec_envs(evaluation):
            def env_thunk(rank):
                return lambda: self.make_env(seed=int(seed),
                                             rank=rank,
                                             evaluation=evaluation,
                                             env_id=env_id)

            env_fns = [env_thunk(i) for i in range(num_processes)]
            use_dummy = (len(env_fns) == 1 or sys.platform == "darwin"
                         or synchronous)
            return VecPyTorch(
                DummyVecEnv(env_fns, render=render)
                if use_dummy else SubprocVecEnv(env_fns))
Example #8
File: train.py, Project: oidelima/ppo
    def make_vec_envs(
        self,
        num_processes,
        gamma,
        render,
        synchronous,
        env_id,
        add_timestep,
        seed,
        evaluation,
        time_limit,
        num_frame_stack=None,
        **env_args,
    ):
        envs = [
            functools.partial(  # thunk
                self.make_env,
                rank=i,
                env_id=env_id,
                add_timestep=add_timestep,
                seed=seed,
                evaluation=evaluation,
                time_limit=time_limit,
                evaluating=evaluation,
                **env_args,
            ) for i in range(num_processes)
        ]

        if len(envs) == 1 or sys.platform == "darwin" or synchronous:
            envs = DummyVecEnv(envs, render=render)
        else:
            envs = SubprocVecEnv(envs)

        # if (
        # envs.observation_space.shape
        # and len(envs.observation_space.shape) == 1
        # ):
        # if gamma is None:
        # envs = VecNormalize(envs, ret=False)
        # else:
        # envs = VecNormalize(envs, gamma=gamma)

        envs = VecPyTorch(envs)

        if num_frame_stack is not None:
            envs = VecPyTorchFrameStack(envs, num_frame_stack)
        # elif len(envs.observation_space.shape) == 3:
        #     envs = VecPyTorchFrameStack(envs, 4, device)

        return envs
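
# Hypothetical call sketch for make_vec_envs above: `trainer` stands in for an
# instance of the class that defines it (not shown here); the argument values are
# illustrative only.
def build_training_envs(trainer):
    return trainer.make_vec_envs(
        num_processes=8, gamma=0.99, render=False, synchronous=False,
        env_id='CartPole-v0', add_timestep=False, seed=0, evaluation=False,
        time_limit=None, num_frame_stack=4)
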
def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''

    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be useable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return

    if network_fn.endswith('lstm') and learn_fn in [
            'acktr', 'trpo_mpi', 'deepq'
    ]:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn,
                    env=env,
                    network=network_fn,
                    total_timesteps=0,
                    **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())
Example #10
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
            env.seed((seed + 10000 * mpi_rank + rank) if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(
                              logger.get_dir(), str(mpi_rank) + '.' + str(rank)),
                          allow_early_resets=True)

            if env_type == 'atari':
                return wrap_deepmind(env, **wrapper_kwargs)
            elif reward_scale != 1:
                return RewardScaler(env, reward_scale)
            else:
                return env

        return _thunk

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv(
            [make_env(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_env(start_index)])
Example #11
def main():
    parser = arg_parser()
    parser.add_argument('--platform', help='environment choice',
                        choices=['atari', 'mujoco', 'humanoid', 'robotics'],
                        default='atari')
    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform
    logger.configure()

    # atari
    if platform == 'atari':
        parser = atari_arg_parser()
        parser.add_argument('--policy', help='Policy architecture',
                            choices=['cnn', 'lstm', 'lnlstm', 'mlp'],
                            default='cnn')
        args = parser.parse_known_args()[0]

        # fit(
        #     args.env,
        #     num_timesteps=args.num_timesteps,
        #     seed=args.seed,
        #     policy=args.policy
        # )
        sess = Agent().init_session().__enter__()
        env = VecFrameStack(make_atari_env(args.env, 8, args.seed), 4)
        policy = {'cnn': Convnet,
                  'lstm': Lstm,
                  'lnlstm': LnLstm,
                  'mlp': Mlp}[args.policy]

        fit(
            policy=policy,
            env=env,
            nsteps=128,
            nminibatches=8,
            lam=0.95,
            gamma=0.99,
            noptepochs=4,
            log_interval=1,
            ent_coef=.01,
            lr=lambda f: f * 2.5e-4,
            cliprange=lambda f: f * 0.1,
            total_timesteps=int(args.num_timesteps * 1.1)
        )

        sess.close()
        env.close()
        del sess

    # mujoco
    if platform == 'mujoco':
        args = mujoco_arg_parser().parse_known_args()[0]

        sess = Agent().init_session().__enter__()
        from utils.monitor import Monitor

        def make_env():
            env = make_mujoco_env(args.env, args.seed)
            # env = gym.make(env_id)
            env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        model = fit(
            policy=Mlp,
            env=env,
            nsteps=2048,
            nminibatches=32,
            lam=0.95,
            gamma=0.99,
            noptepochs=10,
            log_interval=1,
            ent_coef=0.0,
            lr=3e-4,
            cliprange=0.2,
            total_timesteps=args.num_timesteps
        )

        # return model, env

        if args.play:
            logger.log("Running trained model")
            obs = np.zeros((env.num_envs,) + env.observation_space.shape)
            obs[:] = env.reset()
            while True:
                actions = model.step(obs)[0]
                obs[:] = env.step(actions)[0]
                env.render()

        sess.close()
        env.close()
        del sess
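
# If a recurrent policy (Lstm / LnLstm) were chosen, the play loop would also need
# to carry the hidden state and the done mask between steps. A sketch, assuming the
# returned model exposes the same step()/initial_state interface as the recurrent
# policy in test_lstm_example above:
def play_recurrent(model, env):
    obs = env.reset()
    state = model.initial_state
    dones = [False] * env.num_envs
    while True:
        actions, _, state, _ = model.step(obs, S=state, M=dones)
        obs, _, dones, _ = env.step(actions)
        env.render()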