def test_step_limit_with_single_env_dataset(env_name: str):
    # NOTE: `env_name` is unused here; the test runs on a DummyEnvironment
    # rather than on `gym.make(env_name)`.
    start = 0
    target = 10
    env = DummyEnvironment(start=start, target=target, max_value=target * 2)
    env = EnvDataset(env)

    max_steps = 5

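    # The ObservationLimit wrapper is expected to close the env once
    # `max_steps` observations have been yielded.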
    env = ObservationLimit(env, max_steps=max_steps)
    env.seed(123)
    values = []
    for i, obs in zip(range(100), env):
        values.append(obs)
        _ = env.send(1)
    assert values == list(range(start, start + max_steps))

    assert env.is_closed

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.step(env.action_space.sample())

    with pytest.raises(gym.error.ClosedEnvironmentError):
        for i, _ in zip(range(5), env):
            assert False


def test_iterating_with_send():
    env = DummyEnvironment(target=5)
    env = EnvDataset(env)
    env.seed(123)

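    # Hand-computed interaction trace for DummyEnvironment(target=5) with the
    # action sequence below: observations yielded by the iterator, rewards
    # returned by send(), and the done flag at each step.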
    actions = [0, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0]
    expected_obs = [0, 0, 1, 2, 1, 2, 3, 4, 5]
    expected_rewards = [5, 4, 3, 4, 3, 2, 1, 0]
    expected_dones = [False, False, False, False, False, False, False, True]

    reset_obs = 0
    # obs = env.reset()
    # assert obs == reset_obs
    n_calls = 0

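    # Iterating over the EnvDataset yields observations; env.send(action)
    # performs the step and returns the corresponding reward.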
    for i, observation in enumerate(env):
        print(f"Step {i}: batch: {observation}")
        assert observation == expected_obs[i]

        action = actions[i]
        reward = env.send(action)
        assert reward == expected_rewards[i]
    # TODO: The episode ends as soon as 'done' is encountered, which means we
    # are never given the 'final' observation. Here, the DummyEnvironment sets
    # done=True once the state reaches target = 5, so the last observation we
    # see is 4.
    assert observation == 4


def test_doesnt_raise_error_when_action_sent():
    env = DummyEnvironment()
    with EnvDataset(env) as env:
        env.seed(123)
        env.reset()

        for i, obs in zip(range(5), env):
            assert obs in env.observation_space
            reward = env.send(env.action_space.sample())

Example #4

def test_measure_RL_performance_iteration():
    env = DummyEnvironment(start=0, target=5, max_value=10)
    from gym.wrappers import TimeLimit
    from itertools import accumulate
    max_episode_steps = 50
    env = EnvDataset(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

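    # Each pass over `env` corresponds to one episode: the iterator stops when
    # the episode ends (either `done` or the TimeLimit is reached).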
    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)
            assert step <= max_episode_steps, "shouldn't be able to iterate longer than that."

        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

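    # The wrapper is expected to record one EpisodeMetrics entry per episode,
    # keyed by the cumulative number of steps taken when that episode ended.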
    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
            all_episode_steps, accumulate(all_episode_steps),
            all_episode_rewards):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )

    assert env.get_online_performance() == expected_metrics

Example #5

def test_measure_RL_performance_iteration():
    env = DummyEnvironment(start=0, target=5, max_value=10)
    env = EnvDataset(env)
    from sequoia.settings.active.continual.continual_rl_setting import (
        ContinualRLSetting,
    )

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)

        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)
    from itertools import accumulate

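    # As in the previous example, one EpisodeMetrics entry is expected per
    # episode, keyed by the cumulative step count at which that episode ended.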
    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
            all_episode_steps, accumulate(all_episode_steps),
            all_episode_rewards):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )

    assert env.get_online_performance() == expected_metrics