Example #1
import gym

# Project-specific wrappers; exact import path assumed:
from sequoia.common.gym_wrappers import (AddDoneToObservation,
                                         ConvertToFromTensors, EnvDataset)


def test_done_is_sometimes_True_when_iterating_through_env(batch_size: int):
    """Test that when *iterating* through the env, done is sometimes True."""
    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=True)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)
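    # EnvDataset makes the vectorized env iterable: iterating yields the
    # observations, and actions are passed back in with env.send(...), which
    # steps the env and returns the rewards.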
    for i, obs in zip(range(100), env):
        print(i, obs)
        _ = env.send(env.action_space.sample())
        if any(obs["done"]):
            break
    else:
        assert False, "Never encountered done=True!"
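For context, here is a minimal sketch of the send-style iteration protocol that EnvDataset appears to implement, written as a plain Python generator. The env_dataset function is a hypothetical stand-in for illustration (assuming the classic gym step API), not the project's actual implementation:

import gym

def env_dataset(env: gym.Env, max_steps: int = 100):
    """Yield observations; receive actions via send, which returns the reward."""
    obs = env.reset()
    for _ in range(max_steps):
        action = yield obs      # the caller supplies an action via send(...)
        obs, reward, done, _ = env.step(action)
        yield reward            # returned to the caller from send(...)
        if done:
            obs = env.reset()

with gym.make("CartPole-v0") as env:
    dataset = env_dataset(env)
    for obs in dataset:
        reward = dataset.send(env.action_space.sample())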
Example #2
import gym
from gym import spaces
from torch import Tensor

# Project-specific classes; exact import paths assumed:
from sequoia.common.gym_wrappers import (AddDoneToObservation,
                                         ConvertToFromTensors, EnvDataset)
from sequoia.methods.models.forward_pass import ForwardPass
from sequoia.methods.models.output_heads import PolicyHead
from sequoia.settings import ContinualRLSetting


def test_loss_is_nonzero_at_episode_end_iterate(batch_size: int):
    """Test that when *iterating* through the env (active-dataloader style),
    the output head returns a non-zero loss when an episode ends.
    """
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)

        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
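        # Fall back to a Box built from the env's reward_range when the env
        # doesn't expose a reward_space attribute.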
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

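    # With accumulate_losses_before_backward=False, the head is expected to
    # return each episode's loss as soon as that episode ends, which is what
    # the assertions below check.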
    head = PolicyHead(
        input_space=obs_space["x"],
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )

    env.seed(123)
    non_zero_losses = 0

    for i, obs in zip(range(100), env):
        print(i, obs)
        x = obs["x"]
        done = obs["done"]
        representations = x
        assert isinstance(x, Tensor)
        assert isinstance(done, Tensor)
        observations = ContinualRLSetting.Observations(x=x, done=done)
        head_output = head.forward(observations, representations=representations)

        actions = head_output.actions.numpy().tolist()

        rewards = env.send(actions)

        # print(f"Step {i}, obs: {obs}, done: {done}")
        assert isinstance(representations, Tensor)
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        rewards = ContinualRLSetting.Rewards(rewards)
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.total_loss != 0.0
                non_zero_losses += 1
                break
        else:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss.total_loss == 0.0

    assert non_zero_losses > 0
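The behaviour this test pins down is that the policy head produces a non-zero loss for a given environment only once that environment's episode has ended. Below is a hedged sketch of that contract using an undiscounted REINFORCE-style loss; EpisodicLossBuffer is purely illustrative and not PolicyHead's actual implementation:

from collections import defaultdict
from typing import Dict, List

import torch
from torch import Tensor

class EpisodicLossBuffer:
    """Buffers per-env log-probs and rewards; emits a loss only at episode end."""

    def __init__(self) -> None:
        self.log_probs: Dict[int, List[Tensor]] = defaultdict(list)
        self.rewards: Dict[int, List[float]] = defaultdict(list)

    def push(self, env_index: int, log_prob: Tensor, reward: float) -> None:
        # Mid-episode steps only get buffered; no loss is emitted yet.
        self.log_probs[env_index].append(log_prob)
        self.rewards[env_index].append(reward)

    def pop_episode_loss(self, env_index: int) -> Tensor:
        # Undiscounted REINFORCE loss for the episode that just ended. A
        # non-zero value appears exactly when done=True, as asserted above.
        episode_return = sum(self.rewards.pop(env_index))
        return -torch.stack(self.log_probs.pop(env_index)).sum() * episode_return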