예제 #1
0
def _create_env_model(observation_space, action_space):
    batch_size = 3
    time_limit = 5

    terminations = MutableBatchConstantTermination(observation_space, batch_size)
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model=model,
            reward_model=ConstantReward(observation_space, action_space, -1.0),
            termination_model=terminations,
            initial_state_distribution_model=DeterministicInitialStateModel(observation),
            batch_size=batch_size,
        ),
        duration=time_limit,
    )

    actions = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    # Initial time step
    env_model.reset()

    observations = np.squeeze(
        np.repeat(np.expand_dims(observation, axis=0), batch_size, axis=0)
    )
    return terminations, observations, actions, env_model
예제 #2
0
def assert_rollouts_are_close_to_actuals(model, max_steps):
    tf_env = tf_py_environment.TFPyEnvironment(
        create_pendulum_environment(max_steps))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())

    test_trajectory = policy_evaluation(tf_env,
                                        collect_policy,
                                        num_episodes=1,
                                        max_buffer_capacity=200,
                                        use_function=True)

    start_state = test_trajectory.observation[0, 0, :]

    env_model = TFTimeLimit(
        EnvironmentModel(
            model,
            PendulumReward(tf_env.observation_spec(), tf_env.action_spec()),
            ConstantFalseTermination(tf_env.observation_spec()),
            DeterministicInitialStateModel(start_state),
            batch_size=30,
        ),
        max_steps + 1,
    )

    replayed_trajectories = replay_actions_across_batch_transition_models(
        env_model, test_trajectory.action[0])

    prediction_mean = tf.reduce_mean(replayed_trajectories.observation, axis=0)
    np.testing.assert_allclose(prediction_mean,
                               test_trajectory.observation[0],
                               atol=1e-1,
                               rtol=2e-1)
예제 #3
0
def test_replay_actions_across_batches(observation_space, action_space,
                                       horizon, batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(
        observation_space)

    env_model = TFTimeLimit(
        EnvironmentModel(transition_model, reward, termination,
                         initial_state_sampler, batch_size),
        horizon,
    )

    actions_distribution = create_uniform_initial_state_distribution(
        observation_space)
    actions = actions_distribution.sample((horizon, ))
    trajectory = replay_actions_across_batch_transition_models(
        env_model, actions)

    assert (trajectory.observation.shape == (
        batch_size,
        horizon,
    ) + observation_space.shape)
예제 #4
0
def test_generate_virtual_rollouts_assert_no_time_limit_wrapper(mocker):
    env_model = TFTimeLimit(mocker.MagicMock(spec=EnvironmentModel), 100)
    policy = mocker.MagicMock(spec=TFPolicy)

    with pytest.raises(AssertionError) as excinfo:
        virtual_rollouts_buffer_and_driver(env_model, policy, 64)

    assert "should not be wrapped" in str(excinfo)
예제 #5
0
def _wrapped_environment_fixture(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    wrapped_environment_model = TFTimeLimit(env_model, 2)

    action = create_uniform_distribution_from_spec(action_space).sample(
        (batch_size, ))

    return wrapped_environment_model, action
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load("MountainCar-v0"))

    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(),
                            -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(EnvironmentModel(model, reward, terminates,
                                               initial_state_sampler),
                              duration=200)

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()

    first_batch_step_type = trajectory.step_type[0, :]
    assert (first_batch_step_type[0] == StepType.FIRST
            and first_batch_step_type[-1] == StepType.LAST)
예제 #7
0
def test_tf_time_limit_wrapper_with_environment_model(observation_space,
                                                      action_space,
                                                      trajectory_length):
    """
    This test checks that the environment wrapper can in turn be wrapped by the `TimeLimit`
    environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space,
                                       action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )
    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
예제 #8
0
# %% [markdown]
"""
## Training on samples

We define an environment which uses the trained transition model for the dynamics, along with a
reward function, episode termination condition, initial state distributions and bound on episode
length.
"""

# %%
reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
terminates = MountainCarTermination(tf_env.observation_spec())
initial_state_distribution = MountainCarInitialState(tf_env.observation_spec())
environment_model = TFTimeLimit(
    EnvironmentModel(transition_model, reward, terminates, initial_state_distribution),
    duration=200,
)

# %% [markdown]
"""
The agent is trained on data gathered from the environment model. Using the environment interface
means the TF-Agents drivers can be used to generate rollouts.
"""

# %%
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_capacity,
)