Example #1
    def test_make_gym_env(self):
        env = make_gym_env(env_id='CartPole-v1', seed=0, monitor=False)
        assert isinstance(env, Env)
        assert not isinstance(env, gym.Env)
        assert isinstance(env, Wrapper)
        assert isinstance(env.observation_space, Box)
        assert isinstance(env.action_space, Discrete)
        env_spec = EnvSpec(env)
        assert env_spec.control_type == 'Discrete'
        assert env_spec.T == 500
        assert env_spec.max_episode_reward == 475.0
        assert env_spec.reward_range == (-float('inf'), float('inf'))
        assert not env_spec.is_vec_env

        with pytest.raises(TypeError):
            env_spec.num_env

        assert env.reset().shape == (4, )
        assert len(env.step(env.action_space.sample())) == 4

        del env
        del env_spec

        # Pendulum, continuous
        # do not test redundant part
        env = make_gym_env('Pendulum-v0', seed=0)
        assert isinstance(env, Env)
        env_spec = EnvSpec(env)
        assert isinstance(env_spec.action_space, Box)
        assert env_spec.T == 200
        assert env_spec.control_type == 'Continuous'

        assert env.reset().shape == (3, )
        assert len(env.step(env.action_space.sample())) == 4
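The assertions above lean on the classic Gym API (env.seed, reset() returning only the observation, step() returning a 4-tuple). Below is a minimal standalone sketch of those semantics using gym directly; this is an assumption about what make_gym_env wraps, not its actual implementation:

import gym
from gym.spaces import Box, Discrete

# Build and seed a raw Gym environment (old-style API with env.seed()).
env = gym.make('CartPole-v1')
env.seed(0)
env.action_space.seed(0)

# CartPole-v1: 4-dimensional Box observations, 2 discrete actions, 500-step time limit.
assert isinstance(env.observation_space, Box)
assert isinstance(env.action_space, Discrete)

# reset() returns only the observation; step() returns a 4-tuple.
obs = env.reset()
assert obs.shape == (4,)
obs, reward, done, info = env.step(env.action_space.sample())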
Example #2
def test_episode_runner(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]

    T = 30

    agent = StickyAgent(None, env_spec, sticky_action)
    runner = EpisodeRunner(None, agent, env)
    D = runner(T)

    assert D.N == 3
    assert D.maxT == max(D.Ts)

    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)

    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)

            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]

            if done:
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:,
                                                             ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
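The replay loop above re-creates each of the three sub-environments with the seeds drawn from Seeder and steps them with the same sticky action until done. A conceptual stand-in for that per-episode rollout, written against plain gym rather than lagom's EpisodeRunner:

import gym
import numpy as np

def rollout_episode(env_id, seed, action, max_T=30):
    # Roll one environment with a fixed action, stopping at the first done.
    env = gym.make(env_id)
    env.seed(seed)
    observations = [env.reset()]
    rewards, dones = [], []
    for _ in range(max_T):
        obs, reward, done, info = env.step(action)
        rewards.append(reward)
        dones.append(done)
        if done:
            break
        observations.append(obs)
    return np.asarray(observations), np.asarray(rewards), np.asarray(dones)

obs, rew, dones = rollout_episode('CartPole-v1', seed=0, action=1)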
Example #3
def test_frame_stack():
    env = make_gym_env(env_id='CartPole-v1', seed=1)
    env = FrameStack(env, num_stack=4)
    assert isinstance(env, FrameStack)
    assert isinstance(env, Env)
    assert env.num_stack == 4
    assert env.observation_space.shape == (4, 4)
    assert isinstance(env.stack_buffer, np.ndarray)
    assert env.stack_buffer.shape == (4, 4)
    assert np.all(env.stack_buffer == 0.0)
    assert env.stack_buffer.dtype == np.float32
    assert env.reset().shape == (4, 4)
    obs = env.step(0)[0]
    assert obs[:, 0].sum() != 0.0
    assert obs[:, 1].sum() != 0.0
    assert np.all(obs[:, 2:] == 0.0)
    assert np.any(obs[:, 0] != obs[:, 1])
    obs = env.step(1)[0]
    obs = env.step(1)[0]
    assert np.allclose(obs[:, -1],
                       [0.03073904, 0.00145001, -0.03088818, -0.03131252])
    assert np.allclose(obs[:, 2],
                       [0.03076804, -0.19321568, -0.03151444, 0.25146705])
    obs = env.step(1)[0]
    assert np.allclose(obs[:, -1],
                       [0.03076804, -0.19321568, -0.03151444, 0.25146705])
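The column checks above imply that the newest frame sits at index 0 of the stacked axis and older frames shift toward the last index each step. One way to get that behaviour with a plain numpy buffer, as a sketch rather than lagom's FrameStack implementation:

import numpy as np

class SimpleFrameStack:
    # Keep the num_stack most recent observations along the last axis:
    # newest at index 0, oldest at index -1.
    def __init__(self, obs_dim, num_stack):
        self.buffer = np.zeros((obs_dim, num_stack), dtype=np.float32)

    def push(self, obs):
        self.buffer = np.roll(self.buffer, shift=1, axis=-1)
        self.buffer[:, 0] = obs
        return self.buffer

stack = SimpleFrameStack(obs_dim=4, num_stack=4)
frame = stack.push(np.array([0.03, 0.0, -0.03, -0.03]))
assert np.all(frame[:, 1:] == 0.0)  # only the newest column is filled so far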
Example #4
def test_categorical_head():
    with pytest.raises(AssertionError):
        env = make_gym_env('Pendulum-v0', 0)
        env_spec = EnvSpec(env)
        CategoricalHead(None, None, 30, env_spec)

    env = make_gym_env('CartPole-v1', 0)
    env_spec = EnvSpec(env)
    head = CategoricalHead(None, None, 30, env_spec)
    assert head.feature_dim == 30
    assert isinstance(head.action_head, nn.Linear)
    assert head.action_head.in_features == 30 and head.action_head.out_features == 2
    dist = head(torch.randn(3, 30))
    assert isinstance(dist, Categorical)
    assert list(dist.batch_shape) == [3]
    assert list(dist.probs.shape) == [3, 2]
    action = dist.sample()
    assert action.shape == (3, )
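The head being tested maps a 30-dimensional feature vector to logits over CartPole's two actions and wraps them in torch.distributions.Categorical. A minimal sketch with plain PyTorch, not the lagom module itself:

import torch
import torch.nn as nn
from torch.distributions import Categorical

class TinyCategoricalHead(nn.Module):
    # Map features to a Categorical distribution over discrete actions.
    def __init__(self, feature_dim, num_actions):
        super().__init__()
        self.action_head = nn.Linear(feature_dim, num_actions)

    def forward(self, features):
        return Categorical(logits=self.action_head(features))

head = TinyCategoricalHead(feature_dim=30, num_actions=2)
dist = head(torch.randn(3, 30))
assert list(dist.batch_shape) == [3]
assert dist.sample().shape == (3,)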
Example #5
def test_rolling_segment_runner(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]

    T = 30

    agent = StickyAgent(None, env_spec, sticky_action)
    runner = RollingSegmentRunner(None, agent, env)
    D = runner(T)

    assert D.N == 3
    assert D.T == T

    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)

    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            if done:
                info['terminal_observation'] = obs
                obs = ev.reset()

            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.numpy_masks[n, t]

            if done:
                assert np.allclose(info['terminal_observation'],
                                   D.infos[n][t]['terminal_observation'])
Example #6
def test_diag_gaussian_head():
    with pytest.raises(AssertionError):
        env = make_gym_env('CartPole-v1', 0)
        env_spec = EnvSpec(env)
        DiagGaussianHead(None, None, 30, env_spec)

    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)
    head = DiagGaussianHead(None, None, 30, env_spec)
    assert head.feature_dim == 30
    assert isinstance(head.mean_head, nn.Linear)
    assert isinstance(head.logstd_head, nn.Parameter)
    assert head.mean_head.in_features == 30 and head.mean_head.out_features == 1
    assert list(head.logstd_head.shape) == [1]
    assert torch.eq(head.logstd_head, torch.tensor(-0.510825624))
    dist = head(torch.randn(3, 30))
    assert isinstance(dist, Independent) and isinstance(dist.base_dist, Normal)
    assert list(dist.batch_shape) == [3]
    action = dist.sample()
    assert list(action.shape) == [3, 1]

    head = DiagGaussianHead(None, None, 30, env_spec, std_style='softplus')
    dist = head(torch.randn(3, 30))
    action = dist.sample()
    assert list(action.shape) == [3, 1]
    assert torch.eq(head.logstd_head, torch.tensor(-0.19587036834631966))

    head = DiagGaussianHead(None, None, 30, env_spec, std_style='sigmoidal')
    assert torch.eq(head.logstd_head, torch.tensor(-0.871222446472449))

    head = DiagGaussianHead(None, None, 30, env_spec, std_state_dependent=True)
    dist = head(torch.randn(3, 30))
    action = dist.sample()
    assert list(action.shape) == [3, 1]

    head = DiagGaussianHead(None, None, 30, env_spec, constant_std=0.3)
    dist = head(torch.randn(3, 30))
    action = dist.sample()
    assert list(action.shape) == [3, 1]
    assert not head.logstd_head.requires_grad
    assert torch.eq(head.logstd_head, torch.tensor([-1.2039728]))
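The constants checked above are consistent with log-std initializations (log 0.6 ≈ -0.5108, log 0.3 ≈ -1.2040), i.e. the mean comes from a linear layer while the log standard deviation is a state-independent parameter. A minimal diagonal-Gaussian head sketch in plain PyTorch, assuming that structure rather than reproducing lagom's std_style options:

import math
import torch
import torch.nn as nn
from torch.distributions import Independent, Normal

class TinyDiagGaussianHead(nn.Module):
    # Diagonal Gaussian action head with a state-independent log-std parameter.
    def __init__(self, feature_dim, action_dim, init_std=0.6):
        super().__init__()
        self.mean_head = nn.Linear(feature_dim, action_dim)
        self.logstd_head = nn.Parameter(torch.full((action_dim,), math.log(init_std)))

    def forward(self, features):
        mean = self.mean_head(features)
        std = self.logstd_head.exp().expand_as(mean)
        # Independent(..., 1) sums log-probs over the action dimension.
        return Independent(Normal(mean, std), 1)

head = TinyDiagGaussianHead(feature_dim=30, action_dim=1)
dist = head(torch.randn(3, 30))
assert list(dist.batch_shape) == [3]
assert list(dist.sample().shape) == [3, 1]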
Example #7
def test_constraint_action():
    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)

    action = torch.tensor([1.5])
    assert torch.eq(constraint_action(env_spec, action), torch.tensor([1.5]))

    action = torch.tensor([3.0])
    assert torch.eq(constraint_action(env_spec, action), torch.tensor([2.0]))

    action = torch.tensor([-10.0])
    assert torch.eq(constraint_action(env_spec, action), torch.tensor([-2.0]))
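The expected values suggest the action is being clipped into Pendulum's torque bounds of [-2, 2], which can be read off the environment's Box action space. A minimal sketch of that behaviour (an assumption about constraint_action, not its actual code):

import gym
import torch

env = gym.make('Pendulum-v0')
low = float(env.action_space.low[0])    # -2.0
high = float(env.action_space.high[0])  # 2.0

def clamp_action(action):
    # Clamp an action tensor into the Box bounds of the action space.
    return torch.clamp(action, min=low, max=high)

assert torch.equal(clamp_action(torch.tensor([3.0])), torch.tensor([2.0]))
assert torch.equal(clamp_action(torch.tensor([-10.0])), torch.tensor([-2.0]))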
Example #8
    def test_resize_observation(self, env_id):
        env = make_gym_env(env_id, 0)
        new_env = ResizeObservation(env, 16)
        assert env.observation_space.shape[-1] == new_env.observation_space.shape[-1]
        assert new_env.observation_space.shape[:2] == (16, 16)

        obs = env.reset()
        assert obs.shape == env.observation_space.shape

        obs = new_env.reset()
        assert obs.shape[:2] == (16, 16)
        assert obs.shape == new_env.observation_space.shape
Example #9
def test_random_agent():
    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)
    agent = RandomAgent(None, env_spec)
    out = agent.choose_action(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and out['action'].shape == (1,)

    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(venv)
    agent = RandomAgent(None, env_spec)
    out = agent.choose_action(venv.reset())
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(out['action'][0], int)
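A random agent only needs the action space: it samples one action for a single environment and one per sub-environment for a vectorized spec, ignoring the observation. A standalone sketch of that behaviour (not lagom's RandomAgent):

import gym

class TinyRandomAgent:
    def __init__(self, action_space, num_env=None):
        self.action_space = action_space
        self.num_env = num_env

    def choose_action(self, obs):
        # The observation is ignored; sample uniformly from the action space.
        if self.num_env is None:
            return {'action': self.action_space.sample()}
        return {'action': [self.action_space.sample() for _ in range(self.num_env)]}

env = gym.make('Pendulum-v0')
agent = TinyRandomAgent(env.action_space)
out = agent.choose_action(env.reset())
assert out['action'].shape == (1,)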
Example #10
    def test_gray_scale_observation(self, env_id):
        env = make_gym_env(env_id, 0)
        new_env = GrayScaleObservation(env, keep_dim=True)
        assert env.observation_space.shape[:2] == new_env.observation_space.shape[:2]
        assert env.observation_space.shape[-1] == 3
        assert new_env.observation_space.shape[-1] == 1

        obs = env.reset()
        assert obs.shape == env.observation_space.shape

        obs = new_env.reset()
        assert obs.shape == new_env.observation_space.shape
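With keep_dim=True the wrapper keeps a trailing channel axis of size 1 while collapsing RGB to a single luminance value. A numpy-only sketch of that conversion, using the standard ITU-R 601 weights as an assumption about the exact coefficients:

import numpy as np

def to_grayscale(rgb, keep_dim=True):
    # Weighted sum over the RGB channel axis, then optionally keep a size-1 channel.
    gray = rgb @ np.array([0.299, 0.587, 0.114], dtype=np.float32)
    gray = gray.astype(np.uint8)
    return gray[..., None] if keep_dim else gray

frame = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)
assert to_grayscale(frame).shape == (64, 64, 1)
assert to_grayscale(frame, keep_dim=False).shape == (64, 64)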
Example #11
def test_random_policy():
    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)
    policy = RandomPolicy(None, env_spec)
    out = policy(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and out['action'].shape == (1, )

    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v0', 3, 0, False)
    env_spec = EnvSpec(venv)
    policy = RandomPolicy(None, env_spec)
    out = policy(venv.reset())
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(
        out['action'][0], int)
Example #12
def test_sticky_agent():
    sticky_action = 0
    
    env = make_gym_env('CartPole-v1', 0)
    env_spec = EnvSpec(env)
    agent = StickyAgent(None, env_spec, sticky_action)
    out = agent.choose_action(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and isinstance(out['action'], int)
    assert out['action'] == sticky_action
    
    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(venv)
    agent = StickyAgent(None, env_spec, sticky_action)
    out = agent.choose_action(venv.reset())
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(out['action'][0], int)
    assert np.allclose(out['action'], [0, 0, 0])
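A sticky agent always returns the same fixed action, broadcast across sub-environments when the spec is vectorized. A self-contained sketch of that idea (not lagom's StickyAgent):

class TinyStickyAgent:
    def __init__(self, sticky_action, num_env=None):
        self.sticky_action = sticky_action
        self.num_env = num_env

    def choose_action(self, obs):
        # The observation is ignored; always return the fixed action.
        if self.num_env is None:
            return {'action': self.sticky_action}
        return {'action': [self.sticky_action] * self.num_env}

agent = TinyStickyAgent(sticky_action=0, num_env=3)
assert agent.choose_action(None)['action'] == [0, 0, 0]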
Example #13
    def eval(self, n):
        # Set network as evaluation mode
        self.agent.policy.network.eval()

        # Create a new instance of the environment
        env = make_gym_env(env_id=self.config['env:id'],
                           seed=self.config['seed'],
                           monitor=False,
                           monitor_dir=None)
        # Create a TrajectoryRunner
        runner = TrajectoryRunner(agent=self.agent,
                                  env=env,
                                  gamma=self.config['algo:gamma'])
        # Evaluate the agent for a set of trajectories
        D = runner(N=self.config['eval:N'], T=self.config['eval:T'])

        # Return evaluation output
        eval_output = {}
        eval_output['D'] = D
        eval_output['n'] = n

        return eval_output
Example #14
def test_batch_episode(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    D = BatchEpisode(env_spec)

    if env_id == 'CartPole-v1':
        sticky_action = 1
        action_shape = ()
        action_dtype = np.int32
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
        action_shape = env_spec.action_space.shape
        action_dtype = np.float32

    obs = env.reset()
    D.add_observation(obs)
    for t in range(30):
        action = [sticky_action] * env.num_env
        obs, reward, done, info = env.step(action)
        D.add_observation(obs)
        D.add_action(action)
        D.add_reward(reward)
        D.add_done(done)
        D.add_info(info)
        D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]})
        [D.set_completed(n) for n, d in enumerate(done) if d]

    assert D.N == 3
    assert len(D.Ts) == 3
    assert D.maxT == max(D.Ts)

    assert all([
        isinstance(x, np.ndarray) for x in [
            D.numpy_observations, D.numpy_actions, D.numpy_rewards,
            D.numpy_dones, D.numpy_masks
        ]
    ])
    assert all([
        x.dtype == np.float32
        for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks]
    ])
    assert all([
        x.shape == (3, D.maxT)
        for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks]
    ])
    assert D.numpy_actions.dtype == action_dtype
    assert D.numpy_dones.dtype == np.bool_
    assert D.numpy_observations.shape == (3, D.maxT + 1) + env_spec.observation_space.shape
    assert D.numpy_actions.shape == (3, D.maxT) + action_shape
    assert isinstance(D.batch_infos, list) and len(D.batch_infos) == 30
    assert np.allclose([0.1 * (x + 1) for x in range(30)],
                       [info['V'][0] for info in D.batch_infos])
    assert np.allclose([1 * (x + 1) for x in range(30)],
                       [info['V'][1] for info in D.batch_infos])
    assert np.allclose([10 * (x + 1) for x in range(30)],
                       [info['V'][2] for info in D.batch_infos])

    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)

    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(30):
            obs, reward, done, info = ev.step(sticky_action)

            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]

            if done:
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:,
                                                             ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
Example #15
def test_batch_segment(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    T = 30

    D = BatchSegment(env_spec, T)

    if env_id == 'CartPole-v1':
        sticky_action = 1
        action_shape = ()
        action_dtype = np.int32
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
        action_shape = env_spec.action_space.shape
        action_dtype = np.float32

    obs = env.reset()
    D.add_observation(0, obs)
    for t in range(T):
        action = [sticky_action] * env.num_env
        obs, reward, done, info = env.step(action)
        D.add_observation(t + 1, obs)
        D.add_action(t, action)
        D.add_reward(t, reward)
        D.add_done(t, done)
        D.add_info(info)
        D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]})

    assert D.N == 3
    assert D.T == T
    assert all([
        isinstance(x, np.ndarray) for x in [
            D.numpy_observations, D.numpy_actions, D.numpy_rewards,
            D.numpy_dones, D.numpy_masks
        ]
    ])
    assert all([
        x.dtype == np.float32
        for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks]
    ])
    assert D.numpy_actions.dtype == action_dtype
    assert D.numpy_dones.dtype == np.bool_
    assert D.numpy_observations.shape[:2] == (3, T + 1)
    assert D.numpy_actions.shape == (3, T) + action_shape
    assert all([
        x.shape == (3, T)
        for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks]
    ])
    assert isinstance(D.batch_infos, list) and len(D.batch_infos) == T
    assert np.allclose([0.1 * (x + 1) for x in range(T)],
                       [info['V'][0] for info in D.batch_infos])
    assert np.allclose([1 * (x + 1) for x in range(T)],
                       [info['V'][1] for info in D.batch_infos])
    assert np.allclose([10 * (x + 1) for x in range(T)],
                       [info['V'][2] for info in D.batch_infos])

    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)

    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            if done:
                info['terminal_observation'] = obs
                obs = ev.reset()

            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.numpy_masks[n, t]

            if done:
                assert np.allclose(info['terminal_observation'],
                                   D.infos[n][t]['terminal_observation'])
Example #16
    def __call__(self, config):
        # Set random seeds: PyTorch, numpy.random, random
        set_global_seeds(seed=config['seed'])

        # Create an environment
        env = make_gym_env(env_id=config['env:id'],
                           seed=config['seed'],
                           monitor=False,
                           monitor_dir=None)
        # Create environment specification
        env_spec = EnvSpec(env)

        # Create device
        torch.cuda.set_device(config['cuda_id'])
        device = torch.device(
            f'cuda:{config["cuda_id"]}' if config['cuda'] else 'cpu')

        # Create policy
        network = MLP(config=config).to(device)
        policy = CategoricalPolicy(network=network, env_spec=env_spec)

        # Create optimizer
        optimizer = optim.Adam(policy.network.parameters(),
                               lr=config['algo:lr'])
        # Create learning rate scheduler
        if config['algo:use_lr_scheduler']:
            max_epoch = config['train:iter']  # maximum number of lr decay steps
            # Linearly decay the learning rate over the training iterations
            lambda_f = lambda epoch: 1 - epoch / max_epoch
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                       lr_lambda=lambda_f)

        # Create agent
        kwargs = {'device': device}
        if config['algo:use_lr_scheduler']:
            kwargs['lr_scheduler'] = lr_scheduler
        agent = ActorCriticAgent(policy=policy,
                                 optimizer=optimizer,
                                 config=config,
                                 **kwargs)

        # Create runner
        runner = TrajectoryRunner(agent=agent,
                                  env=env,
                                  gamma=config['algo:gamma'])

        # Create engine
        engine = Engine(agent=agent, runner=runner, config=config, logger=None)

        # Training and evaluation
        train_logs = []
        eval_logs = []
        for i in range(config['train:iter']):
            train_output = engine.train(i)

            # Logging and evaluation
            if i == 0 or (i + 1) % config['log:interval'] == 0:
                # Log training and record the loggings
                train_logger = engine.log_train(train_output)
                train_logs.append(train_logger.logs)
                # Log evaluation and record the loggings
                eval_output = engine.eval(i)
                eval_logger = engine.log_eval(eval_output)
                eval_logs.append(eval_logger.logs)

        # Save the loggings
        np.save(
            Path(config['log:dir']) / str(config['ID']) / 'train', train_logs)
        np.save(
            Path(config['log:dir']) / str(config['ID']) / 'eval', eval_logs)

        return None
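The learning-rate schedule above decays the rate linearly to zero over config['train:iter'] iterations via LambdaLR. A standalone sketch of just that piece, with a toy model and a hard-coded horizon standing in for the config value:

import torch.nn as nn
import torch.optim as optim

max_epoch = 100  # stands in for config['train:iter']
model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# LambdaLR multiplies the base lr by the returned factor after each scheduler.step().
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1 - epoch / max_epoch)

for epoch in range(3):
    optimizer.step()   # normally preceded by a backward pass
    scheduler.step()

# After 3 scheduler steps the lr has decayed to 1e-3 * (1 - 3/100).
assert abs(optimizer.param_groups[0]['lr'] - 1e-3 * 0.97) < 1e-12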
Example #17
def test_reward_scale():
    env = make_gym_env(env_id='CartPole-v1', seed=0)
    env = RewardScale(env, scale=0.02)
    env.reset()
    observation, reward, done, info = env.step(env.action_space.sample())
    assert reward == 0.02
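The assertion implies the wrapper multiplies CartPole's per-step reward of 1.0 by the scale factor. A minimal gym.RewardWrapper sketch of that behaviour, as a guess at what RewardScale does rather than its actual implementation:

import gym

class SimpleRewardScale(gym.RewardWrapper):
    def __init__(self, env, scale=1.0):
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        # Called on every step's reward by gym.RewardWrapper.
        return self.scale * reward

env = SimpleRewardScale(gym.make('CartPole-v1'), scale=0.02)
env.reset()
_, reward, _, _ = env.step(env.action_space.sample())
assert reward == 0.02  # CartPole's raw per-step reward is 1.0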
    def test_make_gym_env(self):
        env = make_gym_env(env_id='Pendulum-v0', seed=1)
        assert isinstance(env, Env)