Exemple #1
0
def test_sanity_check_cartpole_done_vector():
    """TODO: Sanity check, make sure that cartpole has done=True at some point
    when using a BatchedEnv.
    """
    batch_size = 5

    starting_values = [i for i in range(batch_size)]
    targets = [10 for i in range(batch_size)]

    env = make_batched_env("CartPole-v0",
                           batch_size=5,
                           wrappers=[PixelObservationWrapper])
    env = AddDoneToObservation(env)
    # env = AddInfoToObservation(env)

    # env = BatchedVectorEnv([
    #     partial(gym.make, "CartPole-v0") for i in range(batch_size)
    # ])
    obs = env.reset()

    for i in range(100):
        obs, rewards, done, info = env.step(env.action_space.sample())
        assert all(obs[1] == done), i
        if any(done):

            break
    else:
        assert False, "Should have had at least one done=True, over the 100 steps!"
Exemple #2
0
def test_loss_is_nonzero_at_episode_end(batch_size: int):
    """ Test that when stepping through the env, when the episode ends, a
    non-zero loss is returned by the output head.
    """
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        input_space=obs_space.x,
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(head.parameters(), lr=1e-3)
    head.train()

    env.seed(123)
    obs = env.reset()

    # obs = torch.as_tensor(obs, dtype=torch.float32)

    done = torch.zeros(batch_size, dtype=bool)
    info = np.array([{} for _ in range(batch_size)])
    loss = None

    non_zero_losses = 0

    encoder = nn.Linear(4, 4)
    encoder.train()

    for i in range(100):
        representations = encoder(obs["x"])

        observations = ContinualRLSetting.Observations(
            x=obs["x"],
            done=done,
            # info=info,
        )
        head_output = head.forward(observations, representations=representations)
        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        obs, rewards, done, info = env.step(actions)
        done = torch.as_tensor(done, dtype=bool)
        rewards = ContinualRLSetting.Rewards(rewards)
        assert len(info) == batch_size

        print(f"Step {i}, obs: {obs}, done: {done}, info: {info}")

        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        assert observations.done is not None
        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.loss != 0.0
                non_zero_losses += 1
                break
        else:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss is None or loss.loss == 0.0

    assert non_zero_losses > 0