Exemples Python d'EnvDataset (sequoia.common.gym_wrappers.EnvDataset)

Exemple #1

0

Afficher le fichier

Fichier : gym_dataloader_test.py Projet : ryanlindeborg/Sequoia

def test_batched_state(env_name: str, batch_size: int):
    """Check the shapes of batched observations, actions and rewards.

    A batched env is wrapped in an `EnvDataset` and then in a `GymDataLoader`.
    Five batches are drawn from the loader; after each one a random batch of
    actions is sent back and the shape of the returned reward is checked.
    """
    max_steps_per_episode = 10

    env = make_batched_env(env_name, batch_size=batch_size)
    dataset = EnvDataset(env, max_steps_per_episode=max_steps_per_episode)

    env: GymDataLoader = GymDataLoader(
        dataset,
        batch_size=batch_size,
    )
    # Read everything needed from the temporary single env while it is still
    # open, so nothing below touches it after the `with` block has closed it.
    with gym.make(env_name) as temp_env:
        single_action_space = temp_env.action_space
        state_shape = (batch_size, *temp_env.observation_space.shape)
        action_shape = (batch_size, *single_action_space.shape)
    reward_shape = (batch_size, )

    try:
        state = env.reset()
        assert state.shape == state_shape
        env.seed(123)

        # Stays at 0 if the loader yields nothing, so the final check fails
        # with a clear assertion error.
        num_batches = 0
        for num_batches, obs_batch in enumerate(take(env, 5), start=1):
            assert obs_batch.shape == state_shape

            random_actions = env.action_space.sample()
            assert torch.as_tensor(random_actions).shape == action_shape
            assert single_action_space.contains(random_actions[0])

            reward = env.send(random_actions)
            assert reward.shape == reward_shape
        assert num_batches == 5
    finally:
        # Release the loader (and the envs it wraps) even when a check fails.
        env.close()

Exemple #2

0

Afficher le fichier

Fichier : gym_dataloader_test.py Projet : ryanlindeborg/Sequoia

def test_multiple_epochs_works(env_name: str, batch_size: int):
    """Iterate over the same `GymDataLoader` for several consecutive epochs.

    Every pass over the loader has to stop within `max_steps_per_episode`
    steps, and the number of rewards collected over all epochs is compared
    against that limit.
    """
    epochs = 3
    max_steps_per_episode = 10

    batched_env = make_batched_env(env_name, batch_size=batch_size)
    dataset = EnvDataset(batched_env, max_steps_per_episode=max_steps_per_episode)
    env: GymDataLoader = GymDataLoader(dataset, )

    is_single_env = batch_size is None
    all_rewards = []
    # Without a batch size each step's reward is stored as one item; with a
    # batch size the returned rewards are flattened into the list.
    store_rewards = all_rewards.append if is_single_env else all_rewards.extend

    with env:
        env.reset()
        for epoch in range(epochs):
            for step, _ in enumerate(env):
                assert step < max_steps_per_episode, "Max steps per episode should have been respected."
                store_rewards(env.send(env.action_space.sample()))

            # Since in the VectorEnv, 'episodes' are infinite, we must have
            # reached the limit of the number of steps, while in a single
            # environment, the episode might have been shorter.
            assert step <= max_steps_per_episode - 1

        assert epoch == epochs - 1

    step_budget = epochs * max_steps_per_episode
    if is_single_env:
        assert len(all_rewards) <= step_budget
    else:
        assert len(all_rewards) == step_budget * batch_size

Exemple #3

0

Afficher le fichier

Fichier : gym_dataloader_test.py Projet : ryanlindeborg/Sequoia

def test_spaces(env_name: str, batch_size: int):
    """Check that the spaces of a `GymDataLoader` match those of its dataset.

    The loader's observation space must equal the dataset's, its action space
    must equal the batched single action space, and the observations, actions
    and rewards seen while iterating must all be members of those spaces.
    """
    dataset = EnvDataset(make_batched_env(env_name, batch_size=batch_size))
    expected_obs_space = dataset.observation_space
    # NOTE: the VectorEnv class creates the 'batched' action space by creating a
    # Tuple of the single action space, of length 'N', which seems a bit weird,
    # so the expected space is built from the single action space instead.
    expected_action_space = batch_space(dataset.single_action_space, batch_size)

    loader = GymDataLoader(dataset, batch_size=batch_size)
    assert loader.observation_space == expected_obs_space
    assert loader.action_space == expected_action_space

    loader.reset()
    for observations in take(loader, 3):
        # Tensors are converted to numpy arrays before the membership check.
        as_array = (
            observations.cpu().numpy()
            if isinstance(observations, Tensor)
            else observations
        )
        assert as_array in expected_obs_space

        sampled_actions = loader.action_space.sample()
        assert len(sampled_actions) == batch_size
        assert sampled_actions in expected_action_space

        received_rewards = loader.send(sampled_actions)
        assert len(received_rewards) == batch_size
        assert received_rewards in loader.reward_space

Exemple #4

0

Afficher le fichier

Fichier : gym_dataloader_test.py Projet : ryanlindeborg/Sequoia

def test_max_steps_is_respected(env_name: str, batch_size: int):
    """Check that iterating the loader stops after exactly `max_steps` steps.

    The `EnvDataset` is created with `max_steps=5`; the loop over the
    `GymDataLoader` must therefore end with the last index equal to 4.
    """
    max_steps = 5
    # NOTE(review): this overrides the `env_name` argument, so the test always
    # runs on CartPole-v0 -- confirm that this is intended.
    env_name = "CartPole-v0"
    env = make_batched_env(env_name, batch_size=batch_size)
    dataset = EnvDataset(env, max_steps=max_steps)
    env: GymDataLoader = GymDataLoader(dataset, )
    try:
        env.reset()
        # Sentinel: if the loader yields nothing, the final check fails with an
        # assertion error instead of a NameError on an unbound index.
        i = -1
        for i, batch in enumerate(env):
            assert i < max_steps, f"Max steps should have been respected: {i}"
            env.send(env.action_space.sample())
        assert i == max_steps - 1
    finally:
        # Close the loader even when one of the assertions above fails.
        env.close()

Exemple #5

0

Afficher le fichier

def test_done_is_sometimes_True_when_iterating_through_env(batch_size: int):
    """Test that when *iterating* through the env, done is sometimes 'True'.

    Up to 100 observations are drawn from an `EnvDataset` built on an
    asynchronous vectorized CartPole env; the test passes as soon as any entry
    of `obs["done"]` is truthy and fails if that never happens.
    """
    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=True)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)
    try:
        for i, obs in zip(range(100), env):
            print(i, obs)
            _ = env.send(env.action_space.sample())
            if any(obs["done"]):
                break
        else:
            # Raised explicitly: a bare `assert False` is stripped under `-O`.
            raise AssertionError("Never encountered done=True!")
    finally:
        # The env was created with asynchronous=True; close it so that it is
        # released whether the test passes or fails.
        env.close()

Exemple #6

0

Afficher le fichier

 def test_max_steps_is_respected(self, env_name: str, batch_size: int):
     """Check that an `ActionLimit` wrapper bounds iteration over the loader.

     The dataset is wrapped in an `ActionLimit` whose limit is `max_steps`
     multiplied by the batch size (or by 1 when there is no batch size), and
     the loop over the loader must end with the last index at `max_steps - 1`.
     """
     from sequoia.common.gym_wrappers.action_limit import ActionLimit

     max_steps = 5
     # NOTE(review): this overrides the `env_name` argument, so the test always
     # runs on CartPole-v0 -- confirm that this is intended.
     env_name = "CartPole-v0"
     env = make_batched_env(env_name, batch_size=batch_size)
     dataset = EnvDataset(env)
     # Presumably the limit counts one action per env in the batch, hence the
     # scaling by the batch size -- TODO confirm against ActionLimit.
     dataset = ActionLimit(dataset, max_steps=max_steps * (batch_size or 1))
     env: GymDataLoader = self.GymDataLoader(dataset)
     try:
         env.reset()
         i = 0
         for i, obs in enumerate(env):
             assert obs in env.observation_space
             assert i < max_steps, f"Max steps should have been respected: {i}"
             env.send(env.action_space.sample())
         assert i == max_steps - 1
     finally:
         # Close the loader even when one of the assertions above fails.
         env.close()

Exemple #7

0

Afficher le fichier

def test_measure_RL_performance_batched_env():
    """Compare the online performance of a batched env to expected metrics.

    Three `DummyEnvironment`s (starting at 0, 1 and 2, with target 5) are
    batched with `SyncVectorEnv`, wrapped in an `EnvDataset` and a
    `MeasureRLPerformanceWrapper`, and stepped 100 times with an action of all
    ones. The wrapper's `get_online_performance()` is then compared with a
    hand-built dictionary of `EpisodeMetrics`.
    """
    from collections import defaultdict
    from sequoia.common.metrics import Metrics

    batch_size = 3
    start = list(range(batch_size))
    target = 5
    env = EnvDataset(
        SyncVectorEnv([
            partial(DummyEnvironment,
                    start=env_start,
                    target=target,
                    max_value=target * 2) for env_start in start
        ]))
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    for step, obs in enumerate(itertools.islice(env, 100)):
        print(f"step {step} obs: {obs}")
        action = np.ones(batch_size)  # always increment the counter
        env.send(action)
        print(env.done_)
        # print(obs, reward, done, info)
    assert step == 99

    expected_metrics = defaultdict(Metrics)
    # One entry per env at every positive multiple of `target`, up to and
    # including 100.
    for i in range(target, 101, target):
        for _ in range(batch_size):
            expected_metrics[i] += EpisodeMetrics(
                n_samples=1,
                mean_episode_reward=
                10.,  # ? FIXME: Actually understand this condition
                mean_episode_length=target,
            )

        # FIXME: This test is a bit too complicated, hard to follow. I'll keep the
        # batches synced-up for now.
        # if i > 0 and (i + env_index) % target == 0:
        #     expected_metrics[i] += EpisodeMetrics(
        #         n_samples=1,
        #         mean_episode_reward=sum(target - (i + env_index % target) for j in range(start[env_index], target)),
        #         mean_episode_length=target - start[env_index] - 1
        #     )

    assert env.get_online_performance() == expected_metrics

Exemple #8

0

Afficher le fichier

    def test_reward_isnt_always_one(self, env_name: str, batch_size: int):
        """Check that the rewards collected over a few epochs are not all 1.

        The loader is iterated for three epochs, a random action is sent at
        every step, and the flattened list of rewards is compared against a
        list of ones of the same length.
        """
        num_epochs = 3
        max_steps_per_episode = 100

        batched_env = make_batched_env(env_name, batch_size=batch_size)
        dataset = EnvDataset(batched_env,
                             max_steps_per_episode=max_steps_per_episode)
        loader: GymDataLoader = self.GymDataLoader(env=dataset)

        collected_rewards = []
        with loader:
            loader.reset()
            for _ in range(num_epochs):
                for _ in loader:
                    step_rewards = loader.send(loader.action_space.sample())
                    collected_rewards.extend(step_rewards)

        all_ones = np.ones(len(collected_rewards)).tolist()
        assert collected_rewards != all_ones

Exemple #9

0

Afficher le fichier

    def test_batched_pixels(self, env_name: str, batch_size: int):
        """Check batched pixel observations coming out of the data loader.

        The env is created with a `PixelObservationWrapper`. The loader's
        observation space must be a 4-dimensional `Box` whose first dimension
        is the batch size, and every batch, action and reward seen while
        iterating is checked against the expected spaces and shapes.
        """
        max_steps_per_episode = 10

        wrappers = [PixelObservationWrapper]
        env = make_batched_env(env_name,
                               wrappers=wrappers,
                               batch_size=batch_size)
        dataset = EnvDataset(env, max_steps_per_episode=max_steps_per_episode)

        with gym.make(env_name) as temp_env:
            for wrapper in wrappers:
                temp_env = wrapper(temp_env)

            # Read what is needed while the env is still open, so that nothing
            # below touches it after the `with` block has closed it.
            single_action_space = temp_env.action_space
            action_shape = (batch_size, *single_action_space.shape)

        reward_shape = (batch_size, )

        env = self.GymDataLoader(
            dataset,
            batch_size=batch_size,
        )
        assert isinstance(env.observation_space, spaces.Box)
        assert len(env.observation_space.shape) == 4
        assert env.observation_space.shape[0] == batch_size

        env.seed(1234)
        for batch in env:
            assert len(batch) == batch_size

            # Tensors are converted to numpy arrays before the membership check.
            if isinstance(batch, Tensor):
                batch = batch.cpu().numpy()
            assert batch in env.observation_space

            random_actions = env.action_space.sample()
            assert torch.as_tensor(random_actions).shape == action_shape
            assert single_action_space.contains(random_actions[0])

            reward = env.send(random_actions)
            assert reward.shape == reward_shape

Exemple #10

0

Afficher le fichier

def test_measure_RL_performance_iteration():
    """Compare the wrapper's online performance with metrics tracked by hand.

    A `DummyEnvironment` is wrapped in an `EnvDataset`, a `TimeLimit` of 50
    steps and a `MeasureRLPerformanceWrapper`, then iterated for five episodes
    with random actions. The length and total reward of each episode are
    recorded and used to build the expected `EpisodeMetrics`, keyed by the
    cumulative number of steps at the end of each episode.
    """
    from gym.wrappers import TimeLimit

    max_episode_steps = 50
    env = DummyEnvironment(start=0, target=5, max_value=10)
    env = TimeLimit(EnvDataset(env), max_episode_steps=max_episode_steps)

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_steps = []
    all_episode_rewards = []
    for episode in range(5):
        rewards_this_episode = []
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            rewards_this_episode.append(env.send(action))
            # print(obs, reward, done, info)
            assert step <= max_episode_steps, "shouldn't be able to iterate longer than that."

        all_episode_steps.append(len(rewards_this_episode))
        all_episode_rewards.append(sum(rewards_this_episode))

    expected_metrics = {
        cumul_step: EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=total_reward,
            mean_episode_length=n_steps,
        )
        for n_steps, cumul_step, total_reward in zip(
            all_episode_steps,
            accumulate(all_episode_steps),
            all_episode_rewards,
        )
    }

    assert env.get_online_performance() == expected_metrics

Exemple #11

0

Afficher le fichier

def test_measure_RL_performance_iteration():
    """Compare the wrapper's online performance with metrics tracked by hand.

    A `DummyEnvironment` wrapped in an `EnvDataset` and a
    `MeasureRLPerformanceWrapper` is iterated for five episodes with random
    actions. The per-episode step counts and total rewards are used to build
    the expected `EpisodeMetrics`, keyed by the cumulative step count.
    """
    from sequoia.settings.active.continual.continual_rl_setting import \
        ContinualRLSetting
    from itertools import accumulate

    env = EnvDataset(DummyEnvironment(start=0, target=5, max_value=10))

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    steps_per_episode = []
    reward_per_episode = []
    for episode in range(5):
        n_steps, total_reward = 0, 0
        for _, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            total_reward += env.send(env.action_space.sample())
            n_steps += 1
            # print(obs, reward, done, info)

        steps_per_episode.append(n_steps)
        reward_per_episode.append(total_reward)

    # Key each episode's metrics by the total number of steps taken so far.
    cumulative_steps = list(accumulate(steps_per_episode))
    expected_metrics = {}
    for index, cumul_step in enumerate(cumulative_steps):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=reward_per_episode[index],
            mean_episode_length=steps_per_episode[index],
        )

    assert env.get_online_performance() == expected_metrics

Exemple #12

0

Afficher le fichier

def test_with_controllable_episode_lengths(batch_size: int, monkeypatch):
    """Check the PolicyHead's loss on an env whose episode lengths are known.

    The `FakeEnvironment` is configured with a first episode of length 5 for
    the first env, length 10 for the others, and length 10 for every new
    episode. The head's `policy_gradient` is replaced by a mock that returns
    the episode length, so the loss expected at each step is known in advance.

    TODO: Test out the PolicyHead in a very controlled environment, where we
    know exactly the lengths of each episode.
    """
    env = FakeEnvironment(
        partial(gym.make, "CartPole-v0"),
        batch_size=batch_size,
        episode_lengths=[5, *(10 for _ in range(batch_size - 1))],
        new_episode_length=lambda env_index: 10,
    )
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    obs_space = env.single_observation_space
    x_dim = flatdim(obs_space["x"])
    # Create some dummy encoder.
    encoder = nn.Linear(x_dim, x_dim)
    representation_space = obs_space["x"]

    output_head = PolicyHead(
        input_space=representation_space,
        action_space=env.single_action_space,
        reward_space=env.single_reward_space,
        hparams=PolicyHead.HParams(
            max_episode_window_length=100,
            min_episodes_before_update=1,
            accumulate_losses_before_backward=False,
        ),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    # The optimizer is set on the class through `monkeypatch` so the attribute
    # is restored (or removed) after the test instead of leaking into other
    # tests. `raising=False` allows the attribute not to exist beforehand.
    monkeypatch.setattr(
        PolicyHead,
        "base_model_optimizer",
        torch.optim.Adam(output_head.parameters(), lr=1e-3),
        raising=False,
    )

    # Simplify the loss function so we know exactly what the loss should be at
    # each step.

    def mock_policy_gradient(
        rewards: Sequence[float], log_probs: Sequence[float], gamma: float = 0.95
    ) -> Optional[Loss]:
        # (x - x.clone()) + 1 is 1 everywhere but still depends on log_probs.
        log_probs = (log_probs - log_probs.clone()) + 1
        # Return the length of the episode, but with a "gradient" flowing back into log_probs.
        return len(rewards) * log_probs.mean()

    monkeypatch.setattr(output_head, "policy_gradient", mock_policy_gradient)

    batch_size = env.batch_size

    obs = env.reset()
    # `np.bool` was only an alias of the builtin `bool` and was removed in
    # NumPy 1.24, so the builtin is used directly.
    step_done = np.zeros(batch_size, dtype=bool)

    for step in range(200):
        x, obs_done = obs["x"], obs["done"]

        # The done from the obs should always be the same as the 'done' from the 'step' function.
        assert np.array_equal(obs_done, step_done)

        representations = encoder(x)
        observations = ContinualRLSetting.Observations(x=x, done=obs_done,)

        actions_obj = output_head(observations, representations)
        actions = actions_obj.y_pred

        # TODO: kinda useless to wrap a single tensor in an object..
        forward_pass = ForwardPass(
            observations=observations, representations=representations, actions=actions,
        )
        obs, rewards, step_done, info = env.step(actions)

        rewards_obj = ContinualRLSetting.Rewards(y=rewards)
        loss = output_head.get_loss(
            forward_pass=forward_pass, actions=actions_obj, rewards=rewards_obj,
        )
        print(f"Step {step}")
        print(f"num episodes since update: {output_head.num_episodes_since_update}")
        print(f"steps left in episode: {env.steps_left_in_episode}")
        print(f"Loss for that step: {loss}")

        if any(obs_done):
            assert loss != 0.0

        if step == 5:
            # Env 0 first episode from steps 0 -> 5
            assert loss.loss == 5.0
            assert loss.metrics["gradient_usage"].used_gradients == 5.0
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 10:
            # Envs[1:batch_size], first episode, from steps 0 -> 10
            # NOTE: At this point, both envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 10.0 * (
                batch_size - 1
            )
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 15:
            # Env 0 second episode from steps 5 -> 15
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step == 20:
            # Envs[1:batch_size]: second episode, from steps 0 -> 10
            # NOTE: At this point, both envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step == 25:
            # Env 0 third episode from steps 5 -> 15
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step > 0 and step % 10 == 0:
            # Same pattern as step 20 above
            assert loss.loss == 10.0 * (batch_size - 1), step
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step > 0 and step % 5 == 0:
            # Same pattern as step 25 above
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        else:
            # No other step is expected to produce a loss.
            assert loss.loss == 0.0, step

Exemple #13

0

Afficher le fichier

def test_loss_is_nonzero_at_episode_end_iterate(batch_size: int):
    """Test the loss of the output head while *iterating* through the env.

    Going through the env active-dataloader style, the loss returned by the
    head must be non-zero on every step where an observation is flagged as
    done, and zero on all other steps. At least one non-zero loss has to be
    seen within 100 steps.
    """
    # A single (non-vectorized) env is only used to read the spaces from.
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)

        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        default_reward_space = spaces.Box(*temp_env.reward_range, shape=())
        reward_space = getattr(temp_env, "reward_space", default_reward_space)

    env = EnvDataset(
        ConvertToFromTensors(
            AddDoneToObservation(
                gym.vector.make(
                    "CartPole-v0", num_envs=batch_size, asynchronous=False
                )
            )
        )
    )

    head = PolicyHead(
        # observation_space=obs_space,
        input_space=obs_space["x"],
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )

    env.seed(123)
    non_zero_losses = 0

    for i, obs in zip(range(100), env):
        print(i, obs)
        x, done = obs["x"], obs["done"]
        # No encoder here: the observations are used as the representations.
        representations = x
        assert isinstance(x, Tensor)
        assert isinstance(done, Tensor)
        observations = ContinualRLSetting.Observations(
            x=x,
            done=done,
            # info=info,
        )
        head_output = head.forward(observations, representations=representations)

        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        rewards = ContinualRLSetting.Rewards(env.send(actions))

        # print(f"Step {i}, obs: {obs}, done: {done}")
        assert isinstance(representations, Tensor)
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        # Index of the first env flagged as done in this batch, if any.
        env_index = next(
            (index for index, flag in enumerate(observations.done) if flag),
            None,
        )
        if env_index is None:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss.total_loss == 0.0
        else:
            print(f"Episode ended for env {env_index} at step {i}")
            assert loss.total_loss != 0.0
            non_zero_losses += 1

    assert non_zero_losses > 0

Exemple #14

0

Afficher le fichier

def test_loss_is_nonzero_at_episode_end(batch_size: int):
    """Test the loss of the output head while *stepping* through the env.

    The `done` flags returned by one call to `env.step` are passed to the head
    with the following observations. The loss must be non-zero whenever one of
    those flags is set and zero (or None) otherwise, and at least one non-zero
    loss has to be seen within 100 steps.
    """
    # A single (non-vectorized) env is only used to read the spaces from.
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        default_reward_space = spaces.Box(*temp_env.reward_range, shape=())
        reward_space = getattr(temp_env, "reward_space", default_reward_space)

    env = EnvDataset(
        ConvertToFromTensors(
            AddDoneToObservation(
                gym.vector.make(
                    "CartPole-v0", num_envs=batch_size, asynchronous=False
                )
            )
        )
    )

    head = PolicyHead(
        input_space=obs_space.x,
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(head.parameters(), lr=1e-3)
    head.train()

    env.seed(123)
    obs = env.reset()

    # obs = torch.as_tensor(obs, dtype=torch.float32)

    # No env is done before the first step.
    done = torch.zeros(batch_size, dtype=bool)
    info = np.array([{} for _ in range(batch_size)])
    loss = None

    non_zero_losses = 0

    encoder = nn.Linear(4, 4)
    encoder.train()

    for i in range(100):
        x = obs["x"]
        representations = encoder(x)

        observations = ContinualRLSetting.Observations(
            x=x,
            done=done,
            # info=info,
        )
        head_output = head.forward(observations, representations=representations)
        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        obs, rewards, done, info = env.step(actions)
        done = torch.as_tensor(done, dtype=bool)
        rewards = ContinualRLSetting.Rewards(rewards)
        assert len(info) == batch_size

        print(f"Step {i}, obs: {obs}, done: {done}, info: {info}")

        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        assert observations.done is not None
        # Index of the first env whose flag was set in the observations given
        # to the head, if any.
        env_index = next(
            (index for index, flag in enumerate(observations.done) if flag),
            None,
        )
        if env_index is None:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss is None or loss.loss == 0.0
        else:
            print(f"Episode ended for env {env_index} at step {i}")
            assert loss.loss != 0.0
            non_zero_losses += 1

    assert non_zero_losses > 0

Exemple #15

0

Afficher le fichier

    def test_multiple_epochs_works(self, batch_size: Optional[int],
                                   seed: Optional[int]):
        """Iterate over the same data loader for several consecutive epochs.

        Each env is a `DummyEnvironment` wrapped with `AddDoneToObservation`
        and a `TimeLimit`. Every pass over the loader has to stop within
        `max_steps_per_episode` steps, and the number of rewards collected
        over all epochs is compared against that limit.
        """
        from gym.wrappers import TimeLimit
        from sequoia.conftest import DummyEnvironment
        from sequoia.common.gym_wrappers import AddDoneToObservation
        from sequoia.common.gym_wrappers.convert_tensors import ConvertToFromTensors

        epochs = 3
        max_steps_per_episode = 10

        def env_fn():
            # FIXME: Using the DummyEnvironment for now since it's easier to debug with.
            # env = gym.make(env_name)
            wrapped = AddDoneToObservation(DummyEnvironment())
            return TimeLimit(wrapped, max_episode_steps=max_steps_per_episode)

        # assert False, [env_fn(i).unwrapped for i in range(4)]
        # env = gym.vector.make(env_name, num_envs=(batch_size or 1))
        batched_env = make_batched_env(env_fn, batch_size=batch_size)

        # from sequoia.common.gym_wrappers.episode_limit import EpisodeLimit
        # env = EpisodeLimit(env, max_episodes=epochs)
        env = EnvDataset(
            ConvertToFromTensors(batched_env),
            max_steps_per_episode=max_steps_per_episode,
        )

        env: GymDataLoader = self.GymDataLoader(env)
        # BUG: Seems to be a little bug in the shape of the items yielded by the env due
        # to the concat_fn of the DataLoader.
        # if batch_size and batch_size >= 1:
        #     assert False, (env.reset().shape, env.observation_space, next(iter(env)).shape)
        env.seed(seed)

        all_rewards = []
        # Without a batch size each step's reward is stored as one item; with
        # a batch size the returned rewards are flattened into the list.
        store_rewards = (
            all_rewards.append if batch_size is None else all_rewards.extend
        )
        with env:
            for epoch in range(epochs):
                for step, obs in enumerate(env):
                    print(f"'epoch' {epoch}, step {step}:, obs: {obs}")
                    assert obs in env.observation_space, obs.shape
                    assert (  # BUG: This isn't working: (sometimes!)
                        step < max_steps_per_episode
                    ), "Max steps per episode should have been respected."
                    store_rewards(env.send(env.action_space.sample()))

                # Since in the VectorEnv, 'episodes' are infinite, we must have
                # reached the limit of the number of steps, while in a single
                # environment, the episode might have been shorter.
                assert step <= max_steps_per_episode - 1

            assert epoch == epochs - 1

        step_budget = epochs * max_steps_per_episode
        if batch_size in [None, 1]:
            # Some episodes might last shorter than the max number of steps per episode,
            # therefore the total should be at most this much:
            assert len(all_rewards) <= step_budget
        else:
            # The maximum number of steps per episode is set, but the env is vectorized,
            # so the number of 'total' rewards we get from all envs should be *exactly*
            # this much:
            assert len(all_rewards) == step_budget * batch_size