def test_batched_state(env_name: str, batch_size: int):
    max_steps_per_episode = 10
    env = make_batched_env(env_name, batch_size=batch_size)
    dataset = EnvDataset(env, max_steps_per_episode=max_steps_per_episode)
    env: GymDataLoader = GymDataLoader(
        dataset,
        batch_size=batch_size,
    )
    with gym.make(env_name) as temp_env:
        state_shape = temp_env.observation_space.shape
        action_shape = temp_env.action_space.shape

    state_shape = (batch_size, *state_shape)
    action_shape = (batch_size, *action_shape)
    reward_shape = (batch_size,)

    state = env.reset()
    assert state.shape == state_shape
    env.seed(123)

    i = 0
    for obs_batch in take(env, 5):
        assert obs_batch.shape == state_shape

        random_actions = env.action_space.sample()
        assert torch.as_tensor(random_actions).shape == action_shape
        assert temp_env.action_space.contains(random_actions[0])

        reward = env.send(random_actions)
        assert reward.shape == reward_shape
        i += 1
    assert i == 5
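
# --- Editorial sketch (assumption): `take(env, n)` used in these tests is assumed
# to behave like `itertools.islice(env, n)`, yielding at most `n` batches from the
# iterable environment. The helper name below is hypothetical, for illustration only.
def _take_sketch(iterable, n: int):
    """Yield at most `n` items from `iterable` (sketch, not Sequoia's actual `take`)."""
    from itertools import islice
    return islice(iterable, n)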
def test_multiple_epochs_works(env_name: str, batch_size: int):
    epochs = 3
    max_steps_per_episode = 10
    env = make_batched_env(env_name, batch_size=batch_size)
    dataset = EnvDataset(env, max_steps_per_episode=max_steps_per_episode)
    env: GymDataLoader = GymDataLoader(dataset)

    all_rewards = []
    with env:
        env.reset()
        for epoch in range(epochs):
            for i, batch in enumerate(env):
                assert i < max_steps_per_episode, "Max steps per episode should have been respected."
                rewards = env.send(env.action_space.sample())
                if batch_size is None:
                    all_rewards.append(rewards)
                else:
                    all_rewards.extend(rewards)
            # Since in the VectorEnv, 'episodes' are infinite, we must have
            # reached the limit of the number of steps, while in a single
            # environment, the episode might have been shorter.
            assert i <= max_steps_per_episode - 1
        assert epoch == epochs - 1

    if batch_size is None:
        assert len(all_rewards) <= epochs * max_steps_per_episode
    else:
        assert len(all_rewards) == epochs * max_steps_per_episode * batch_size
def test_spaces(env_name: str, batch_size: int):
    dataset = EnvDataset(make_batched_env(env_name, batch_size=batch_size))

    batched_obs_space = dataset.observation_space
    # NOTE: the VectorEnv class creates the 'batched' action space by creating a
    # Tuple of the single action space, of length 'N', which seems a bit weird.
    # batched_action_space = vector_env.action_space
    batched_action_space = batch_space(dataset.single_action_space, batch_size)

    dataloader_env = GymDataLoader(dataset, batch_size=batch_size)
    assert dataloader_env.observation_space == batched_obs_space
    assert dataloader_env.action_space == batched_action_space

    dataloader_env.reset()
    for observation_batch in take(dataloader_env, 3):
        if isinstance(observation_batch, Tensor):
            observation_batch = observation_batch.cpu().numpy()
        assert observation_batch in batched_obs_space

        actions = dataloader_env.action_space.sample()
        assert len(actions) == batch_size
        assert actions in batched_action_space

        rewards = dataloader_env.send(actions)
        assert len(rewards) == batch_size
        assert rewards in dataloader_env.reward_space
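
# --- Editorial sketch: how `batch_space` (from gym.vector.utils) builds the batched
# space checked above. For CartPole, the single Discrete(2) action space becomes a
# MultiDiscrete over N envs, and Box spaces gain a leading batch dimension. Exact
# return types may vary slightly across gym versions; this is illustrative only.
def _batch_space_example(n: int = 3):
    import numpy as np
    from gym import spaces
    from gym.vector.utils import batch_space

    batched_actions = batch_space(spaces.Discrete(2), n)
    assert isinstance(batched_actions, spaces.MultiDiscrete)

    single_obs = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    batched_obs = batch_space(single_obs, n)
    assert batched_obs.shape == (n, 4)
    return batched_actions, batched_obs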
def test_max_steps_is_respected(env_name: str, batch_size: int):
    max_steps = 5
    env_name = "CartPole-v0"
    env = make_batched_env(env_name, batch_size=batch_size)
    dataset = EnvDataset(env, max_steps=max_steps)
    env: GymDataLoader = GymDataLoader(dataset)

    env.reset()
    for i, batch in enumerate(env):
        assert i < max_steps, f"Max steps should have been respected: {i}"
        env.send(env.action_space.sample())
    assert i == max_steps - 1
    env.close()
def test_done_is_sometimes_True_when_iterating_through_env(batch_size: int):
    """ Test that when *iterating* through the env, done is sometimes 'True'. """
    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=True)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)
    for i, obs in zip(range(100), env):
        print(i, obs)
        _ = env.send(env.action_space.sample())
        if any(obs["done"]):
            break
    else:
        assert False, "Never encountered done=True!"
def test_max_steps_is_respected(self, env_name: str, batch_size: int):
    max_steps = 5
    env_name = "CartPole-v0"
    env = make_batched_env(env_name, batch_size=batch_size)
    dataset = EnvDataset(env)

    from sequoia.common.gym_wrappers.action_limit import ActionLimit
    dataset = ActionLimit(dataset, max_steps=max_steps * (batch_size or 1))

    env: GymDataLoader = self.GymDataLoader(dataset)
    env.reset()
    i = 0
    for i, obs in enumerate(env):
        assert obs in env.observation_space
        assert i < max_steps, f"Max steps should have been respected: {i}"
        env.send(env.action_space.sample())
    assert i == max_steps - 1
    env.close()
def test_measure_RL_performance_batched_env():
    batch_size = 3
    start = [i for i in range(batch_size)]
    target = 5
    env = EnvDataset(
        SyncVectorEnv([
            partial(DummyEnvironment, start=start[i], target=target, max_value=target * 2)
            for i in range(batch_size)
        ])
    )
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for step, obs in enumerate(itertools.islice(env, 100)):
        print(f"step {step} obs: {obs}")
        action = np.ones(batch_size)  # always increment the counter
        reward = env.send(action)
        print(env.done_)
        # print(obs, reward, done, info)
    assert step == 99

    from collections import defaultdict
    from sequoia.common.metrics import Metrics

    expected_metrics = defaultdict(Metrics)
    for i in range(101):
        for env_index in range(batch_size):
            if i and i % target == 0:
                expected_metrics[i] += EpisodeMetrics(
                    n_samples=1,
                    mean_episode_reward=10.0,  # ? FIXME: Actually understand this condition
                    mean_episode_length=target,
                )
            # FIXME: This test is a bit too complicated, hard to follow. I'll keep the
            # batches synced-up for now.
            # if i > 0 and (i + env_index) % target == 0:
            #     expected_metrics[i] += EpisodeMetrics(
            #         n_samples=1,
            #         mean_episode_reward=sum(target - (i + env_index % target) for j in range(start[env_index], target)),
            #         mean_episode_length=target - start[env_index] - 1,
            #     )

    assert env.get_online_performance() == expected_metrics
def test_reward_isnt_always_one(self, env_name: str, batch_size: int):
    epochs = 3
    max_steps_per_episode = 100
    env = make_batched_env(env_name, batch_size=batch_size)
    dataset = EnvDataset(env, max_steps_per_episode=max_steps_per_episode)
    env: GymDataLoader = self.GymDataLoader(env=dataset)

    all_rewards = []
    with env:
        env.reset()
        for epoch in range(epochs):
            for i, batch in enumerate(env):
                rewards = env.send(env.action_space.sample())
                all_rewards.extend(rewards)

    assert all_rewards != np.ones(len(all_rewards)).tolist()
def test_batched_pixels(self, env_name: str, batch_size: int):
    max_steps_per_episode = 10
    wrappers = [PixelObservationWrapper]
    env = make_batched_env(env_name, wrappers=wrappers, batch_size=batch_size)
    dataset = EnvDataset(env, max_steps_per_episode=max_steps_per_episode)

    with gym.make(env_name) as temp_env:
        for wrapper in wrappers:
            temp_env = wrapper(temp_env)
        state_shape = temp_env.observation_space.shape
        action_shape = temp_env.action_space.shape

    state_shape = (batch_size, *state_shape)
    action_shape = (batch_size, *action_shape)
    reward_shape = (batch_size,)

    env = self.GymDataLoader(
        dataset,
        batch_size=batch_size,
    )
    assert isinstance(env.observation_space, spaces.Box)
    assert len(env.observation_space.shape) == 4
    assert env.observation_space.shape[0] == batch_size

    env.seed(1234)
    for i, batch in enumerate(env):
        assert len(batch) == batch_size
        if isinstance(batch, Tensor):
            batch = batch.cpu().numpy()
        assert batch in env.observation_space

        random_actions = env.action_space.sample()
        assert torch.as_tensor(random_actions).shape == action_shape
        assert temp_env.action_space.contains(random_actions[0])

        reward = env.send(random_actions)
        assert reward.shape == reward_shape
def test_measure_RL_performance_iteration():
    env = DummyEnvironment(start=0, target=5, max_value=10)

    from gym.wrappers import TimeLimit
    max_episode_steps = 50
    env = EnvDataset(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)
            assert step <= max_episode_steps, "shouldn't be able to iterate longer than that."
        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    from itertools import accumulate
    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
        all_episode_steps, accumulate(all_episode_steps), all_episode_rewards
    ):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )
    assert env.get_online_performance() == expected_metrics
def test_measure_RL_performance_iteration():
    env = DummyEnvironment(start=0, target=5, max_value=10)
    env = EnvDataset(env)

    from sequoia.settings.active.continual.continual_rl_setting import ContinualRLSetting
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)
        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    from itertools import accumulate
    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
        all_episode_steps, accumulate(all_episode_steps), all_episode_rewards
    ):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )
    assert env.get_online_performance() == expected_metrics
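
# --- Editorial sketch: `expected_metrics` in the two tests above is keyed by the
# *cumulative* step at which each episode ends; `itertools.accumulate` turns the
# per-episode lengths into those cumulative step counts. Values below are made up.
def _accumulate_example():
    from itertools import accumulate
    episode_lengths = [3, 5, 2]            # hypothetical per-episode step counts
    cumulative_steps = list(accumulate(episode_lengths))
    assert cumulative_steps == [3, 8, 10]  # the keys used for the recorded metrics
    return cumulative_steps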
def test_with_controllable_episode_lengths(batch_size: int, monkeypatch):
    """ TODO: Test out the PolicyHead in a very controlled environment, where we
    know exactly the lengths of each episode.
    """
    env = FakeEnvironment(
        partial(gym.make, "CartPole-v0"),
        batch_size=batch_size,
        episode_lengths=[5, *(10 for _ in range(batch_size - 1))],
        new_episode_length=lambda env_index: 10,
    )
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    obs_space = env.single_observation_space
    x_dim = flatdim(obs_space["x"])
    # Create some dummy encoder.
    encoder = nn.Linear(x_dim, x_dim)
    representation_space = obs_space["x"]

    output_head = PolicyHead(
        input_space=representation_space,
        action_space=env.single_action_space,
        reward_space=env.single_reward_space,
        hparams=PolicyHead.HParams(
            max_episode_window_length=100,
            min_episodes_before_update=1,
            accumulate_losses_before_backward=False,
        ),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(
        output_head.parameters(), lr=1e-3
    )

    # Simplify the loss function so we know exactly what the loss should be at
    # each step.
    def mock_policy_gradient(
        rewards: Sequence[float], log_probs: Sequence[float], gamma: float = 0.95
    ) -> Optional[Loss]:
        log_probs = (log_probs - log_probs.clone()) + 1
        # Return the length of the episode, but with a "gradient" flowing back into log_probs.
        return len(rewards) * log_probs.mean()

    monkeypatch.setattr(output_head, "policy_gradient", mock_policy_gradient)

    batch_size = env.batch_size

    obs = env.reset()
    step_done = np.zeros(batch_size, dtype=bool)

    for step in range(200):
        x, obs_done = obs["x"], obs["done"]

        # The done from the obs should always be the same as the 'done' from the 'step' function.
        assert np.array_equal(obs_done, step_done)

        representations = encoder(x)
        observations = ContinualRLSetting.Observations(
            x=x,
            done=obs_done,
        )

        actions_obj = output_head(observations, representations)
        actions = actions_obj.y_pred

        # TODO: kinda useless to wrap a single tensor in an object..
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=actions,
        )
        obs, rewards, step_done, info = env.step(actions)

        rewards_obj = ContinualRLSetting.Rewards(y=rewards)
        loss = output_head.get_loss(
            forward_pass=forward_pass,
            actions=actions_obj,
            rewards=rewards_obj,
        )
        print(f"Step {step}")
        print(f"num episodes since update: {output_head.num_episodes_since_update}")
        print(f"steps left in episode: {env.steps_left_in_episode}")
        print(f"Loss for that step: {loss}")

        if any(obs_done):
            assert loss != 0.0

        if step == 5:
            # Env 0 first episode from steps 0 -> 5
            assert loss.loss == 5.0
            assert loss.metrics["gradient_usage"].used_gradients == 5.0
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 10:
            # Envs[1:batch_size], first episode, from steps 0 -> 10
            # NOTE: At this point, both envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 15:
            # Env 0 second episode from steps 5 -> 15
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6
        elif step == 20:
            # Envs[1:batch_size]: second episode, from steps 0 -> 10
            # NOTE: At this point, both envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (batch_size - 1)
        elif step == 25:
            # Env 0 third episode from steps 5 -> 15
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6
        elif step > 0 and step % 10 == 0:
            # Same pattern as step 20 above
            assert loss.loss == 10.0 * (batch_size - 1), step
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (batch_size - 1)
        elif step > 0 and step % 5 == 0:
            # Same pattern as step 25 above
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6
        else:
            assert loss.loss == 0.0, step
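
# --- Editorial sketch: with the mocked `policy_gradient` above, each finished
# episode contributes a loss equal to its length: `(log_probs - log_probs.clone()) + 1`
# has value 1 everywhere while keeping the result connected to `log_probs` in the
# autograd graph, and the mock returns `len(rewards) * log_probs.mean()`. A minimal
# standalone check of that trick (values here are made up):
def _mock_policy_gradient_example():
    import torch
    log_probs = torch.tensor([0.3, -1.2, 0.7], requires_grad=True)
    rewards = [1.0] * 5                      # a 5-step episode
    ones_like = (log_probs - log_probs.clone()) + 1
    loss = len(rewards) * ones_like.mean()
    assert loss.item() == 5.0                # loss equals the episode length
    assert loss.requires_grad                # still part of the autograd graph
    return loss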
def test_loss_is_nonzero_at_episode_end_iterate(batch_size: int):
    """ Test that when *iterating* through the env (active-dataloader style),
    when the episode ends, a non-zero loss is returned by the output head.
    """
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        # observation_space=obs_space,
        input_space=obs_space["x"],
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )

    env.seed(123)
    non_zero_losses = 0

    for i, obs in zip(range(100), env):
        print(i, obs)
        x = obs["x"]
        done = obs["done"]
        representations = x
        assert isinstance(x, Tensor)
        assert isinstance(done, Tensor)

        observations = ContinualRLSetting.Observations(
            x=x,
            done=done,
            # info=info,
        )

        head_output = head.forward(observations, representations=representations)

        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        rewards = env.send(actions)
        # print(f"Step {i}, obs: {obs}, done: {done}")
        assert isinstance(representations, Tensor)
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        rewards = ContinualRLSetting.Rewards(rewards)
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.total_loss != 0.0
                non_zero_losses += 1
                break
        else:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss.total_loss == 0.0

    assert non_zero_losses > 0
def test_loss_is_nonzero_at_episode_end(batch_size: int):
    """ Test that when stepping through the env, when the episode ends, a
    non-zero loss is returned by the output head.
    """
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        input_space=obs_space.x,
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(head.parameters(), lr=1e-3)

    head.train()

    env.seed(123)
    obs = env.reset()
    # obs = torch.as_tensor(obs, dtype=torch.float32)
    done = torch.zeros(batch_size, dtype=bool)
    info = np.array([{} for _ in range(batch_size)])
    loss = None

    non_zero_losses = 0

    encoder = nn.Linear(4, 4)
    encoder.train()

    for i in range(100):
        representations = encoder(obs["x"])

        observations = ContinualRLSetting.Observations(
            x=obs["x"],
            done=done,
            # info=info,
        )

        head_output = head.forward(observations, representations=representations)

        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        obs, rewards, done, info = env.step(actions)
        done = torch.as_tensor(done, dtype=bool)
        rewards = ContinualRLSetting.Rewards(rewards)
        assert len(info) == batch_size

        print(f"Step {i}, obs: {obs}, done: {done}, info: {info}")

        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        assert observations.done is not None
        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.loss != 0.0
                non_zero_losses += 1
                break
        else:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss is None or loss.loss == 0.0

    assert non_zero_losses > 0
def test_multiple_epochs_works(self, batch_size: Optional[int], seed: Optional[int]):
    epochs = 3
    max_steps_per_episode = 10

    from gym.wrappers import TimeLimit
    from sequoia.conftest import DummyEnvironment
    from sequoia.common.gym_wrappers import AddDoneToObservation

    def env_fn():
        # FIXME: Using the DummyEnvironment for now since it's easier to debug with.
        # env = gym.make(env_name)
        env = DummyEnvironment()
        env = AddDoneToObservation(env)
        env = TimeLimit(env, max_episode_steps=max_steps_per_episode)
        return env

    # assert False, [env_fn(i).unwrapped for i in range(4)]
    # env = gym.vector.make(env_name, num_envs=(batch_size or 1))
    env = make_batched_env(env_fn, batch_size=batch_size)
    batched_env = env
    # from sequoia.common.gym_wrappers.episode_limit import EpisodeLimit
    # env = EpisodeLimit(env, max_episodes=epochs)

    from sequoia.common.gym_wrappers.convert_tensors import ConvertToFromTensors
    env = ConvertToFromTensors(env)
    env = EnvDataset(env, max_steps_per_episode=max_steps_per_episode)
    env: GymDataLoader = self.GymDataLoader(env)
    # BUG: Seems to be a little bug in the shape of the items yielded by the env due
    # to the concat_fn of the DataLoader.
    # if batch_size and batch_size >= 1:
    #     assert False, (env.reset().shape, env.observation_space, next(iter(env)).shape)
    env.seed(seed)

    all_rewards = []
    with env:
        for epoch in range(epochs):
            for step, obs in enumerate(env):
                print(f"'epoch' {epoch}, step {step}: obs: {obs}")
                assert obs in env.observation_space, obs.shape
                assert (
                    # BUG: This isn't working: (sometimes!)
                    step < max_steps_per_episode
                ), "Max steps per episode should have been respected."
                rewards = env.send(env.action_space.sample())
                if batch_size is None:
                    all_rewards.append(rewards)
                else:
                    all_rewards.extend(rewards)
            # Since in the VectorEnv, 'episodes' are infinite, we must have
            # reached the limit of the number of steps, while in a single
            # environment, the episode might have been shorter.
            assert step <= max_steps_per_episode - 1
        assert epoch == epochs - 1

    if batch_size in [None, 1]:
        # Some episodes might last shorter than the max number of steps per episode,
        # therefore the total should be at most this much:
        assert len(all_rewards) <= epochs * max_steps_per_episode
    else:
        # The maximum number of steps per episode is set, but the env is vectorized,
        # so the number of 'total' rewards we get from all envs should be *exactly*
        # this much:
        assert len(all_rewards) == epochs * max_steps_per_episode * batch_size