Example 1
def test_incorrect_termination_model():
    """
    The generic model-based agent should only allow a ConstantFalseTermination model.
    """

    # setup arguments for the model-based agent constructor
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    network = LinearTransitionNetwork(observation_spec)
    transition_model = KerasTransitionModel([network], observation_spec, action_spec)
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)
    termination_model = MountainCarTermination(observation_spec)
    policy = RandomTFPolicy(time_step_spec, action_spec)

    with pytest.raises(AssertionError) as excinfo:
        ModelBasedAgent(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            termination_model,
            initial_state_distribution_model,
            policy,
            policy,
        )

    assert "Only constant false termination supported" in str(excinfo.value)
def _test_ensemble_model_close_to_actuals(trajectories, tf_env,
                                          trajectory_sampling_strategy):
    keras_transition_networks = [
        LinearTransitionNetwork(tf_env.observation_spec(), True)
        for _ in range(_ENSEMBLE_SIZE)
    ]
    model = KerasTransitionModel(
        keras_transition_networks,
        tf_env.observation_spec(),
        tf_env.action_spec(),
        predict_state_difference=False,
        trajectory_sampling_strategy=trajectory_sampling_strategy,
    )

    training_spec = KerasTrainingSpec(
        epochs=1000,
        training_batch_size=256,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor="loss", patience=20)
        ],
    )

    model.train(trajectories, training_spec)

    assert_rollouts_are_close_to_actuals(model, max_steps=1)
Example 3
def _create_wrapped_environment(observation_space, action_space, reward):
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    return EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space, reward),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
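A short usage sketch for the helper above; this assumes EnvironmentModel exposes the usual TF-Agents TFEnvironment interface (reset/step), with an illustrative reward value and a batch of one:

# illustrative only: reset the wrapped model and take a single uniformly sampled step
wrapped_env = _create_wrapped_environment(observation_space, action_space, 0.0)
first_time_step = wrapped_env.reset()
action = create_uniform_distribution_from_spec(action_space).sample(sample_shape=(1,))
next_time_step = wrapped_env.step(action)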
def get_cross_entropy_policy(observation_space, action_space, horizon,
                             batch_size):
    time_step_space = time_step_spec(observation_space)
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        batch_size,
    )
    policy = CrossEntropyMethodPolicy(time_step_space, action_space, horizon,
                                      batch_size)
    return env_model, policy
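A sketch of how the returned pair might be wired into a trajectory optimiser; it only reuses calls that appear elsewhere in these examples (PolicyTrajectoryOptimiser, restart), and the horizon and batch size are illustrative:

# illustrative only: optimise a single planning step with the cross-entropy policy
horizon, batch_size, number_of_particles = 5, 10, 1
env_model, policy = get_cross_entropy_policy(observation_space, action_space,
                                             horizon, batch_size)
trajectory_optimiser = PolicyTrajectoryOptimiser(policy, horizon, batch_size,
                                                 number_of_particles)
observation = create_uniform_distribution_from_spec(
    observation_space).sample(sample_shape=(1,))
initial_time_step = restart(observation, batch_size=1)
trajectory_optimiser.optimise(initial_time_step, env_model)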
Example 5
def _transition_fixture(mountain_car_environment, batch_size):
    network = LinearTransitionNetwork(
        mountain_car_environment.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        mountain_car_environment.observation_spec(),
        mountain_car_environment.action_spec(),
    )

    reward_model = ConstantReward(mountain_car_environment.observation_spec(),
                                  mountain_car_environment.action_spec())

    transition = sample_uniformly_distributed_transitions(
        transition_model, 2 * batch_size, reward_model)

    return mountain_car_environment, transition
Example 6
def test_mismatch_ensemble_size(observation_space, action_space,
                                trajectory_sampling_strategy_factory,
                                batch_size):
    """
    Ensure that the ensemble size specified in the trajectory sampling strategy is equal to the
    number of networks in the model.
    """
    strategy = trajectory_sampling_strategy_factory(batch_size, 2)
    if isinstance(strategy, SingleFunction):
        pytest.skip("SingleFunction strategy is not an ensemble strategy.")

    with pytest.raises(AssertionError):
        KerasTransitionModel(
            [LinearTransitionNetwork(observation_space)],
            observation_space,
            action_space,
            trajectory_sampling_strategy=strategy,
        )
def test_train_method_increments_counter_for_generic_background_planning(
        mocker, agent_class):
    """
    The docstring for the `_train` method of a TFAgent requires that the implementation increments
    the `train_step_counter`.
    """
    population_size = 1
    horizon = 10
    model_free_training_iterations = 1

    mf_agent = create_mock_model_free_agent(mocker, TIMESTEP_SPEC, ACTION_SPEC,
                                            agent_class)
    network = LinearTransitionNetwork(OBSERVATION_SPEC)
    transition_model = KerasTransitionModel([network], OBSERVATION_SPEC,
                                            ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(
        OBSERVATION_SPEC)

    train_step_counter = common.create_variable("train_step_counter",
                                                shape=(),
                                                dtype=tf.float64)
    model_based_agent = BackgroundPlanningAgent(
        (transition_model, TransitionModelTrainingSpec(1, 1)),
        reward_model,
        initial_state_model,
        mf_agent,
        population_size,
        horizon,
        model_free_training_iterations,
        train_step_counter=train_step_counter,
    )

    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC,
        ACTION_SPEC,
        batch_size=population_size,
        trajectory_length=horizon)
    train_kwargs = {
        TRAIN_ARGSPEC_COMPONENT_ID: EnvironmentModelComponents.TRANSITION.value
    }
    model_based_agent.train(dummy_trajectories, **train_kwargs)

    assert train_step_counter.value() == 1
Example 8
def test_planning_policy_batch_environment_model():
    """
    Ensure that the planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Example 9
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from TF-Agents can be used
    to generate a rollout. This also serves as an example of how to construct "random shooting"
    rollouts from an environment model.

    The assertion in this test is that the selected action has the expected log_prob value,
    consistent with sampling from a uniform distribution. All this really checks is that the
    preceding code has run successfully.
    """

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )

    transition_observer = _RecordLastLogProbTransitionObserver()

    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability

    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())

    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
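The transition observer used above is a private helper that is not part of this listing; a hypothetical reconstruction, assuming the TF-Agents convention that transition observers are called with (time_step, policy_step, next_time_step) tuples, could look like this:

class _RecordLastLogProbTransitionObserver:
    """Hypothetical sketch: keep the most recent action and its log-probability."""

    def __init__(self):
        self.action = None
        self.last_log_probability = None

    def __call__(self, transition):
        time_step, policy_step, next_time_step = transition
        self.action = policy_step.action
        # emit_log_probability=True on the policy places the log-probability
        # in the policy step info
        self.last_log_probability = policy_step.info.log_probability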
def test_train_method_increments_counter_for_model_free_supported_agents(
    mocker, agent_class, train_component
):
    """
    The docstring for the `_train` method of a TFAgent requires that the implementation increments
    the `train_step_counter`.
    """
    population_size = 1
    number_of_particles = 1
    horizon = 10

    mf_agent = create_mock_model_free_agent(mocker, TIMESTEP_SPEC, ACTION_SPEC, agent_class)
    trajectory_optimiser = random_shooting_trajectory_optimisation(
        TIMESTEP_SPEC, ACTION_SPEC, horizon, population_size, number_of_particles
    )
    network = LinearTransitionNetwork(OBSERVATION_SPEC)
    transition_model = KerasTransitionModel([network], OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)

    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    agent = ModelFreeSupportedDecisionTimePlanningAgent(
        TIMESTEP_SPEC,
        ACTION_SPEC,
        (transition_model, TransitionModelTrainingSpec(1, 1)),
        reward_model,
        initial_state_model,
        trajectory_optimiser,
        mf_agent,
        train_step_counter=train_step_counter,
    )

    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {TRAIN_ARGSPEC_COMPONENT_ID: train_component.value}
    agent.train(dummy_trajectories, **train_kwargs)

    assert train_step_counter.value() == 1
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load("MountainCar-v0"))

    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(),
                            -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(EnvironmentModel(model, reward, terminates,
                                               initial_state_sampler),
                              duration=200)

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()

    first_batch_step_type = trajectory.step_type[0, :]
    assert (first_batch_step_type[0] == StepType.FIRST
            and first_batch_step_type[-1] == StepType.LAST)
Example 12
def test_tf_time_limit_wrapper_with_environment_model(observation_space,
                                                      action_space,
                                                      trajectory_length):
    """
    This test checks that the environment wrapper can in turn be wrapped by the `TimeLimit`
    environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space,
                                       action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )
    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
Example 13
def test_invalid_num_elites(observation_space, action_space, horizon):

    # some fixed parameters
    population_size = 10
    number_of_particles = 1

    # set up the environment model
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    environment_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        population_size,
    )

    # set up the trajectory optimizer
    time_step_space = time_step_spec(observation_space)
    optimiser = cross_entropy_method_trajectory_optimisation(
        time_step_space,
        action_space,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        num_elites=population_size + 1,
        learning_rate=0.1,
        max_iterations=1,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    initial_time_step = restart(observation, batch_size=1)

    # run
    with pytest.raises(AssertionError) as excinfo:
        optimiser.optimise(initial_time_step, environment_model)

    assert "num_elites" in str(excinfo)
Example 14
- no impulse: red
- right impulse: green

In these state-space plots, the x-axis is the agent's position and the y-axis is the velocity.
"""

# %%
batch_size = 64

training_spec = KerasTrainingSpec(
    epochs=5000,
    training_batch_size=256,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
    verbose=0,
)
linear_transition_network = LinearTransitionNetwork(tf_env.observation_spec())
trajectory_sampling_strategy = InfiniteHorizonTrajectorySampling(batch_size, 1)
transition_model = KerasTransitionModel(
    [linear_transition_network],
    tf_env.observation_spec(),
    tf_env.action_spec(),
)
reward_model = ConstantReward(tf_env.observation_spec(), tf_env.action_spec())
sample_transitions = sample_uniformly_distributed_transitions(
    transition_model, 1000, reward_model
)

# %%
plot_mountain_car_transitions(
    sample_transitions.observation.numpy(),
    sample_transitions.action.numpy(),