def env_fn():
    # NOTE: `env_name` and `max_steps_per_episode` are expected to come from
    # the enclosing scope.
    # FIXME: Using the DummyEnvironment for now since it's easier to debug with.
    # env = gym.make(env_name)
    env = DummyEnvironment()
    env = AddDoneToObservation(env)
    env = TimeLimit(env, max_episode_steps=max_steps_per_episode)
    return env
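# For readability, here is a minimal sketch of the behaviour the tests below
# assume from `DummyEnvironment`, inferred from their expected values. This is
# an illustration only (hence the hypothetical name), not the real class:
# action 0 is a no-op, action 1 increments the state, action 2 decrements it,
# the reward is the distance to `target`, and the episode ends when the state
# reaches `target` (or, presumably, `max_value`).
class _DummyEnvironmentSketch(gym.Env):
    """Hypothetical re-implementation of `DummyEnvironment`, for illustration only."""

    def __init__(self, start: int = 0, target: int = 5, max_value: int = 10):
        self.start = start
        self.target = target
        self.max_value = max_value
        self.observation_space = gym.spaces.Discrete(max_value + 1)
        self.action_space = gym.spaces.Discrete(3)  # no-op, increment, decrement
        self.state = start

    def reset(self):
        self.state = self.start
        return self.state

    def step(self, action: int):
        # Apply the action, then reward the agent based on distance to target.
        self.state += {0: 0, 1: 1, 2: -1}[action]
        reward = abs(self.target - self.state)
        done = self.state in (self.target, self.max_value)
        return self.state, reward, done, {}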
def test_doesnt_raise_error_when_action_sent():
    env = DummyEnvironment()
    with EnvDataset(env) as env:
        env.reset()
        env.seed(123)
        for i, obs in zip(range(5), env):
            assert obs in env.observation_space
            reward = env.send(env.action_space.sample())
def test_iterating_with_send():
    env = DummyEnvironment(target=5)
    env = EnvDataset(env)
    env.seed(123)
    actions = [0, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0]
    expected_obs = [0, 0, 1, 2, 1, 2, 3, 4, 5]
    expected_rewards = [5, 4, 3, 4, 3, 2, 1, 0]
    expected_dones = [False, False, False, False, False, False, False, True]

    for i, observation in enumerate(env):
        print(f"Step {i}: observation: {observation}")
        assert observation == expected_obs[i]
        action = actions[i]
        reward = env.send(action)
        assert reward == expected_rewards[i]

    # NOTE: The episode ends as soon as 'done' is encountered, which means that
    # we are never given the 'final' observation. The DummyEnvironment sets
    # done=True when the state reaches the target (5 in this case), so the last
    # yielded observation is 4.
    assert observation == 4
def test_raise_error_when_missing_action():
    env = DummyEnvironment()
    with EnvDataset(env) as env:
        env.reset()
        env.seed(123)
        # Iterating without sending an action back after each observation
        # should raise an error.
        with pytest.raises(RuntimeError):
            for i, observation in zip(range(5), env):
                pass
def test_measure_RL_performance_iteration():
    env = DummyEnvironment(start=0, target=5, max_value=10)
    from gym.wrappers import TimeLimit
    from itertools import accumulate
    max_episode_steps = 50
    env = EnvDataset(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations,
    #                           actions_type=ContinualRLSetting.Actions,
    #                           rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_rewards = []
    all_episode_steps = []
    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            assert step <= max_episode_steps, "shouldn't be able to iterate longer than that."
        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
        all_episode_steps, accumulate(all_episode_steps), all_episode_rewards
    ):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )
    assert env.get_online_performance() == expected_metrics
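# The performance tests above and below compare `env.get_online_performance()`
# against a dict of `EpisodeMetrics` keyed by cumulative step count. A rough
# sketch of the shape these tests assume (hypothetical name; the real class
# lives in the library and may carry more fields and logic):
from dataclasses import dataclass


@dataclass
class _EpisodeMetricsSketch:
    n_samples: int  # number of episodes aggregated into this entry (1 here)
    mean_episode_reward: float  # total reward of the episode
    mean_episode_length: int  # number of steps in the episode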
def test_iterating_with_policy():
    env = DummyEnvironment()
    env = PolicyEnv(env)
    env.seed(123)
    actions = [0, 1, 1, 2, 1, 1, 1, 1]
    expected_obs = [0, 0, 1, 2, 1, 2, 3, 4, 5]
    expected_rewards = [5, 4, 3, 4, 3, 2, 1, 0]
    expected_dones = [False, False, False, False, False, False, False, True]
    # Expect the transitions to be (observation, action, next_observation) triples.
    expected_transitions = list(zip(expected_obs, actions, expected_obs[1:]))

    n_calls = 0

    def custom_policy(observations, action_space):
        # Deterministic policy used for testing purposes.
        nonlocal n_calls
        action = actions[n_calls]
        n_calls += 1
        return action

    n_expected_transitions = len(actions)
    env.set_policy(custom_policy)

    actual_transitions: List[StateTransition] = []
    i = 0
    for i, batch in enumerate(env):
        print(f"Step {i}: batch: {batch}")
        state_transition, reward = batch
        actual_transitions.append(state_transition)
        observation, action, next_observation = state_transition.as_tuple()
        assert observation == expected_obs[i]
        assert next_observation == expected_obs[i + 1]
        assert action == actions[i]
        assert reward == expected_rewards[i]

    assert i == n_expected_transitions - 1
    assert len(actual_transitions) == n_expected_transitions
    assert [v.as_tuple() for v in actual_transitions] == expected_transitions
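# `PolicyEnv` above is assumed to yield (StateTransition, reward) pairs, where
# a StateTransition groups an observation, the action taken, and the resulting
# next observation. A minimal sketch of that assumed structure (hypothetical
# name; the real `StateTransition` may differ):
from typing import NamedTuple


class _StateTransitionSketch(NamedTuple):
    observation: int
    action: int
    next_observation: int

    def as_tuple(self):
        # Return the transition as a plain (obs, action, next_obs) tuple.
        return tuple(self)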
def test_measure_RL_performance_basics():
    env = DummyEnvironment(start=0, target=5, max_value=10)
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations,
    #                           actions_type=ContinualRLSetting.Actions,
    #                           rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_rewards = []
    all_episode_steps = []
    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        obs = env.reset()
        print(f"Episode {episode}, obs: {obs}")
        done = False
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            episode_steps += 1
        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    from itertools import accumulate
    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
        all_episode_steps, accumulate(all_episode_steps), all_episode_rewards
    ):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )
    assert env.get_online_performance() == expected_metrics
def test_step_limit_with_single_env_dataset(env_name: str):
    # FIXME: Using the DummyEnvironment for now since it's easier to debug with.
    # env = gym.make(env_name)
    start = 0
    target = 10
    env = DummyEnvironment(start=start, target=target, max_value=10 * 2)
    env = EnvDataset(env)
    max_steps = 5
    env = ObservationLimit(env, max_steps=max_steps)
    env.seed(123)

    values = []
    for i, obs in zip(range(100), env):
        values.append(obs)
        _ = env.send(1)  # Always increment, so the observations are 0, 1, 2, ...
    assert values == list(range(start, max_steps))

    # The wrapper should have closed the env once the observation limit was hit.
    assert env.is_closed
    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.step(env.action_space.sample())
    with pytest.raises(gym.error.ClosedEnvironmentError):
        for i, _ in zip(range(5), env):
            assert False
def test_step_normally_works_fine():
    env = DummyEnvironment()
    env = EnvDataset(env)
    env.seed(123)
    obs = env.reset()
    assert obs == 0
    # (action to take, expected obs, expected reward, expected done)
    expected_steps = [
        (0, 0, 5, False),
        (1, 1, 4, False),
        (1, 2, 3, False),
        (2, 1, 4, False),
        (1, 2, 3, False),
        (1, 3, 2, False),
        (1, 4, 1, False),
        (1, 5, 0, True),
    ]
    for action, expected_obs, expected_reward, expected_done in expected_steps:
        obs, reward, done, info = env.step(action)
        assert (obs, reward, done, info) == (expected_obs, expected_reward, expected_done, {})

    # Resetting and stepping again works as expected.
    env.reset()
    obs, reward, done, info = env.step(0)
    assert (obs, reward, done, info) == (0, 5, False, {})