Beispiel #1
0
def make_vec_env(make_env, num_env, init_seed):
    r"""Build a :class:`VecEnv` whose environments are each given a different seed.

    One seed per environment is drawn from a :class:`Seeder` created with
    ``init_seed``. Each environment constructor seeds the environment itself
    and then its observation space and action space with that same seed.

    Example::

        >>> import gym
        >>> make_vec_env(lambda: gym.make('CartPole-v1'), 3, 0)
        <VecEnv: 3, CartPole-v1>

    Args:
        make_env (function): argument-free function that creates an environment.
        num_env (int): number of environments to create.
        init_seed (int): initial seed handed to :class:`Seeder`.

    Returns:
        VecEnv: vectorized environment built from the seeded constructors.
    """
    def seeded_make_env(seed):
        # Create one environment and apply the seed to it and to both of its spaces.
        created = make_env()
        created.seed(seed)
        created.observation_space.seed(seed)
        created.action_space.seed(seed)
        return created

    # The seed is bound by keyword so that every constructor is argument-free
    # and the seed stays readable through the partial's ``keywords``.
    env_seeds = Seeder(init_seed=init_seed)(size=num_env)
    return VecEnv([partial(seeded_make_env, seed=env_seed) for env_seed in env_seeds])
Beispiel #2
0
def test_make_vec_env(env_id, num_env, init_seed):
    """Check that make_vec_env returns a VecEnv whose constructors carry the Seeder's seeds."""
    vec_env = make_vec_env(lambda: gym.make(env_id), num_env, init_seed)
    assert isinstance(vec_env, VecEnv)

    # Every constructor is read through its bound ``seed`` keyword argument.
    bound_seeds = [constructor.keywords['seed'] for constructor in vec_env.list_make_env]
    expected_seeds = Seeder(init_seed)(num_env)
    assert bound_seeds == expected_seeds
Beispiel #3
0
def test_episode_runner(vec_env, env_id):
    """Compare the batch produced by EpisodeRunner with three separately stepped environments.

    A StickyAgent repeating one fixed action is run for ``T`` steps on a
    3-environment vectorized env. The same action is then replayed on three
    environments created with the seeds drawn from ``Seeder(0)``, and every
    recorded quantity is compared step by step.
    """
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    # Fixed action used by both the agent and the replayed environments
    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]

    T = 30

    sticky_agent = StickyAgent(None, env_spec, sticky_action)
    D = EpisodeRunner(None, sticky_agent, env)(T)

    assert D.N == 3
    assert D.maxT == max(D.Ts)

    reference_envs = [make_gym_env(env_id, seed) for seed in Seeder(0)(3)]

    for n, reference in enumerate(reference_envs):
        obs = reference.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, _ = reference.step(sticky_action)

            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            mask = int(not done)
            assert mask == D.masks[n][t]
            assert mask == D.numpy_masks[n, t]

            if not done:
                assert np.allclose(obs, D.observations[n][t + 1])
                continue

            # Episode finished: the terminal observation is stored in the info
            # and everything after this step is checked to be padding.
            after = t + 1
            assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
            assert D.completes[n]
            assert np.allclose(0.0, D.numpy_observations[n, after + 1:, ...])
            assert np.allclose(0.0, D.numpy_actions[n, after:, ...])
            assert np.allclose(0.0, D.numpy_rewards[n, after:])
            assert np.allclose(True, D.numpy_dones[n, after:])
            assert np.allclose(0.0, D.numpy_masks[n, after:])
            break
Beispiel #4
0
def test_seeder():
    """Check a Seeder's RNG state and the number/shape of the seeds it returns."""
    seeder = Seeder(init_seed=0)

    # Inspect the seeder's own RNG state and compare it with NumPy's global RNG state
    assert seeder.rng.get_state()[1][0] == 0
    assert np.random.get_state()[1][20] != seeder.rng.get_state()[1][20]

    # Single list of seeds
    seeds = seeder(size=1)
    assert len(seeds) == 1
    seeds = seeder(size=5)
    assert len(seeds) == 5

    # Batched seeds: ``shape`` is a plain tuple, so it is compared directly.
    # (np.alltrue was removed in NumPy 2.0 and was never needed for a tuple comparison.)
    seeds = seeder(size=[1, 3])
    assert np.array(seeds).shape == (1, 3)
    seeds = seeder(size=[2, 3])
    assert np.array(seeds).shape == (2, 3)
Beispiel #5
0
    def test_make_envs(self):
        """Check that make_envs yields three distinct constructors carrying the Seeder's seeds."""
        constructors = make_envs(make_env=make_gym_env,
                                 env_id='Pendulum-v0',
                                 num_env=3,
                                 init_seed=1)
        assert len(constructors) == 3
        first, second, third = constructors
        assert first != second and first != third

        # Every constructor must carry the seed a fresh Seeder(init_seed=1) yields
        expected_seeds = Seeder(init_seed=1)(3)
        for constructor, expected in zip(constructors, expected_seeds):
            assert constructor.keywords['seed'] == expected

        # The first constructor must reset to the same observation as a raw
        # environment seeded with the first expected seed.
        seeded_env = first()
        raw_env = gym.make('Pendulum-v0')
        raw_env.seed(expected_seeds[0])
        assert np.allclose(seeded_env.reset(), raw_env.reset())
Beispiel #6
0
def test_rolling_segment_runner(vec_env, env_id):
    """Compare the batch produced by RollingSegmentRunner with three separately stepped environments.

    A StickyAgent repeating one fixed action is run for ``T`` steps on a
    3-environment vectorized env. The same action is replayed on three
    environments created with the seeds drawn from ``Seeder(0)``; when a
    replayed environment finishes, it is reset and the comparison continues.
    """
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    # Fixed action used by both the agent and the replayed environments
    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]

    T = 30

    sticky_agent = StickyAgent(None, env_spec, sticky_action)
    D = RollingSegmentRunner(None, sticky_agent, env)(T)

    assert D.N == 3
    assert D.T == T

    reference_envs = [make_gym_env(env_id, seed) for seed in Seeder(0)(3)]

    for n, reference in enumerate(reference_envs):
        obs = reference.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = reference.step(sticky_action)
            if done:
                # Keep the final observation in the info, then start a new episode
                info['terminal_observation'] = obs
                obs = reference.reset()

            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            mask = int(not done)
            assert mask == D.numpy_masks[n, t]

            if done:
                recorded_terminal = D.infos[n][t]['terminal_observation']
                assert np.allclose(info['terminal_observation'], recorded_terminal)
        def algorithm(config, seed, device):
            """Train an Agent on standardized CraftingEnv copies, saving logs and parameters.

            Args:
                config: mapping read for the keys 'log.dir', 'ID', 'env.count',
                    'train.iter' and 'log.interval'; also passed to the Agent
                    and the RollingSegmentRunner.
                seed: initial seed for the Seeder; also the last component of
                    the log directory path.
                device: passed through to the Agent.
            """
            logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

            # One CraftingEnv constructor per seed drawn from the Seeder
            env_seeds = Seeder(seed)(size=config['env.count'])
            vec_env = SerialVecEnv([partial(CraftingEnv, env_seed) for env_seed in env_seeds])
            env = VecStandardize(vec_env, clip_reward=100.0)
            env_spec = EnvSpec(env)

            agent = Agent(config, env_spec, device)
            engine = Engine(agent, RollingSegmentRunner(config, agent, env), env)

            for i in range(config['train.iter']):
                training_result = engine.train(i)
                print(f'Training iteration {i} complete.')
                if i % config['log.interval'] != 0:
                    continue

                # Logging iteration: dump the training logs and overwrite the saved parameters
                logs = engine.log_train(training_result)
                pickle_dump(obj=logs, f=logdir / f'iter_{i}_train_logs', ext='.pkl')
                torch.save(engine.agent.policy.state_dict(), logdir / 'trained_params')
Beispiel #8
0
def make_envs(make_env, env_id, num_env, init_seed, **kwargs):
    r"""Return a list of argument-free environment constructors, one per random seed.

    The seeds are drawn from a :class:`Seeder` created with ``init_seed``.
    Each constructor is a :func:`functools.partial` of ``make_env`` with
    ``env_id``, ``seed`` and any extra keyword arguments already bound.

    Example::

        >>> make_envs(make_env=make_gym_env, env_id='CartPole-v1', num_env=3, init_seed=0)
        [functools.partial(<function make_gym_env at 0x7f2127b5ce18>, env_id='CartPole-v1', seed=209652396),
         functools.partial(<function make_gym_env at 0x7f2127b5ce18>, env_id='CartPole-v1', seed=398764591),
         functools.partial(<function make_gym_env at 0x7f2127b5ce18>, env_id='CartPole-v1', seed=924231285)]

    Args:
        make_env (function): a function to create an environment.
        env_id (str): environment ID, e.g. 'Pendulum-v0', 'Ant-v2'.
        num_env (int): number of constructors to create.
        init_seed (int): initial seed handed to :class:`Seeder`.
        **kwargs: extra keyword arguments bound into every constructor.

    Returns:
        list: argument-free constructors, each bound to its own seed.
    """
    env_seeds = Seeder(init_seed=init_seed)(size=num_env)
    # The seed is bound by keyword so that it stays readable through ``keywords['seed']``.
    return [
        partial(make_env, env_id=env_id, seed=env_seed, **kwargs)
        for env_seed in env_seeds
    ]
Beispiel #9
0
import torch
import pickle

# Load the object saved at this hard-coded, run-specific path; it is passed to
# policy.load_state_dict() further down.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
params = torch.load('logs-3/0/1013845395/trained_params')

from dial_control_rl import agent
from dial_control_rl.env import CraftingEnv

from lagom.utils import Seeder
from lagom.envs.vec_env import SerialVecEnv, VecStandardize
from lagom.envs import EnvSpec
from functools import partial

# NOTE(review): this instance is rebound a few lines below before any visible use —
# confirm whether constructing it here is needed.
env = CraftingEnv()

# Draw a single seed from a Seeder initialized with 0 and bind it into a CraftingEnv constructor.
seeder = Seeder(0)
seeds = seeder(size=1)
env_constructors = []
for seed in seeds:
    env_constructors.append(partial(CraftingEnv, seed))
# Wrap the constructor list in a SerialVecEnv, then in VecStandardize with clip_reward=100.0.
env = VecStandardize(SerialVecEnv(env_constructors), clip_reward=100.0)
env_spec = EnvSpec(env)

# Build the policy on the CPU device, load the saved parameters into it, then call .double() on it.
policy = agent.Policy({'algo.rl': 0}, env_spec, torch.device('cpu'))
policy.load_state_dict(params)
policy = policy.double()


def V(x):
    """Call the module-level ``policy`` on ``torch.tensor(x)`` requesting ``'V'``.

    Returns the first element of the ``'V'`` entry of the policy's output.
    """
    return policy(torch.tensor(x), ['V'])['V'][0]
Beispiel #10
0
def test_batch_episode(vec_env, env_id):
    """Fill a BatchEpisode from a 3-environment vectorized env and verify its contents.

    The batch is filled by stepping the vectorized env 30 times with a fixed
    action. Its container types, dtypes and shapes are checked, and then its
    contents are compared step by step with three environments created from
    the seeds drawn by ``Seeder(0)``.

    Raises:
        ValueError: if ``env_id`` is not one of the two supported environments.
    """
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    D = BatchEpisode(env_spec)

    if env_id == 'CartPole-v1':
        sticky_action = 1
        action_shape = ()
        action_dtype = np.int32
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
        action_shape = env_spec.action_space.shape
        action_dtype = np.float32
    else:
        # Fail with a clear message instead of a NameError further down
        raise ValueError(f'unsupported env_id: {env_id!r}')

    obs = env.reset()
    D.add_observation(obs)
    for t in range(30):
        action = [sticky_action] * env.num_env
        obs, reward, done, info = env.step(action)
        D.add_observation(obs)
        D.add_action(action)
        D.add_reward(reward)
        D.add_done(done)
        D.add_info(info)
        D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]})
        # Mark every sub-environment that reported done at this step
        for n, d in enumerate(done):
            if d:
                D.set_completed(n)

    assert D.N == 3
    assert len(D.Ts) == 3
    assert D.maxT == max(D.Ts)

    assert all([
        isinstance(x, np.ndarray) for x in [
            D.numpy_observations, D.numpy_actions, D.numpy_rewards,
            D.numpy_dones, D.numpy_masks
        ]
    ])
    assert all([
        x.dtype == np.float32
        for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks]
    ])
    assert all([
        x.shape == (3, D.maxT)
        for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks]
    ])
    assert D.numpy_actions.dtype == action_dtype
    # np.bool_ is used because the np.bool alias is missing in NumPy 1.24-1.26
    assert D.numpy_dones.dtype == np.bool_
    assert D.numpy_observations.shape == (3, D.maxT +
                                          1) + env_spec.observation_space.shape
    assert D.numpy_actions.shape == (3, D.maxT) + action_shape
    assert isinstance(D.batch_infos, list) and len(D.batch_infos) == 30
    assert np.allclose([0.1 * (x + 1) for x in range(30)],
                       [info['V'][0] for info in D.batch_infos])
    assert np.allclose([1 * (x + 1) for x in range(30)],
                       [info['V'][1] for info in D.batch_infos])
    assert np.allclose([10 * (x + 1) for x in range(30)],
                       [info['V'][2] for info in D.batch_infos])

    # Replay the same action on three environments built with the Seeder's seeds
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)

    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(30):
            obs, reward, done, info = ev.step(sticky_action)

            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]

            if done:
                # Episode finished: everything after this step is checked to be padding
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:,
                                                             ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
Beispiel #11
0
def test_batch_segment(vec_env, env_id):
    """Fill a BatchSegment from a 3-environment vectorized env and verify its contents.

    The segment is filled by stepping the vectorized env ``T`` times with a
    fixed action. Its container types, dtypes and shapes are checked, and then
    its contents are compared step by step with three environments created
    from the seeds drawn by ``Seeder(0)``; a replayed environment is reset
    whenever it reports done.

    Raises:
        ValueError: if ``env_id`` is not one of the two supported environments.
    """
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    T = 30

    D = BatchSegment(env_spec, T)

    if env_id == 'CartPole-v1':
        sticky_action = 1
        action_shape = ()
        action_dtype = np.int32
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
        action_shape = env_spec.action_space.shape
        action_dtype = np.float32
    else:
        # Fail with a clear message instead of a NameError further down
        raise ValueError(f'unsupported env_id: {env_id!r}')

    obs = env.reset()
    D.add_observation(0, obs)
    for t in range(T):
        action = [sticky_action] * env.num_env
        obs, reward, done, info = env.step(action)
        D.add_observation(t + 1, obs)
        D.add_action(t, action)
        D.add_reward(t, reward)
        D.add_done(t, done)
        D.add_info(info)
        D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]})

    assert D.N == 3
    assert D.T == T
    assert all([
        isinstance(x, np.ndarray) for x in [
            D.numpy_observations, D.numpy_actions, D.numpy_rewards,
            D.numpy_dones, D.numpy_masks
        ]
    ])
    assert all([
        x.dtype == np.float32
        for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks]
    ])
    assert D.numpy_actions.dtype == action_dtype
    # np.bool_ is used because the np.bool alias is missing in NumPy 1.24-1.26
    assert D.numpy_dones.dtype == np.bool_
    assert D.numpy_observations.shape[:2] == (3, T + 1)
    assert D.numpy_actions.shape == (3, T) + action_shape
    assert all([
        x.shape == (3, T)
        for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks]
    ])
    assert isinstance(D.batch_infos, list) and len(D.batch_infos) == T
    assert np.allclose([0.1 * (x + 1) for x in range(T)],
                       [info['V'][0] for info in D.batch_infos])
    assert np.allclose([1 * (x + 1) for x in range(T)],
                       [info['V'][1] for info in D.batch_infos])
    assert np.allclose([10 * (x + 1) for x in range(T)],
                       [info['V'][2] for info in D.batch_infos])

    # Replay the same action on three environments built with the Seeder's seeds
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)

    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            if done:
                # Keep the final observation in the info, then start a new episode
                info['terminal_observation'] = obs
                obs = ev.reset()

            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.numpy_masks[n, t]

            if done:
                assert np.allclose(info['terminal_observation'],
                                   D.infos[n][t]['terminal_observation'])