Example #1
def test_incorrect_termination_model():
    """
    The generic model-based agent should accept only a ConstantFalseTermination termination model.
    """

    # set up arguments for the model-based agent constructor
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    network = LinearTransitionNetwork(observation_spec)
    transition_model = KerasTransitionModel([network], observation_spec, action_spec)
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)
    termination_model = MountainCarTermination(observation_spec)
    policy = RandomTFPolicy(time_step_spec, action_spec)

    with pytest.raises(AssertionError) as excinfo:
        ModelBasedAgent(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            termination_model,
            initial_state_distribution_model,
            policy,
            policy,
        )

    assert "Only constant false termination supported" in str(excinfo.value)
Example #2
def test_unknown_transition_model():
    """
    The PETS agent accepts only prespecified transition model types; a RuntimeError
    should be raised for an unknown model.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # trajectory optimiser, transition model and trajectory sampler types
    trajectory_optimiser_type = TrajectoryOptimizationType.CrossEntropyMethod
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    learning_rate = 0.9
    max_iterations = 5
    population_size = num_elites + 10
    number_of_particles = 1
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        PetsAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            trajectory_optimiser_type,
            horizon,
            population_size,
            number_of_particles,
            num_elites,
            learning_rate,
            max_iterations,
        )

    assert "Unknown transition model" in str(excinfo.value)
Example #3
def test_ensemble_size_set_correctly():
    """
    For ensemble transition models the ensemble size needs to be larger than 1.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = TransitionModelType.DeterministicEnsemble
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 1
    population_size = 10
    horizon = 1

    # define the agent; many transition model and trajectory optimiser
    # parameters can be arbitrary
    with pytest.raises(AssertionError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "ensemble_size should be > 1" in str(excinfo.value)
Example #4
def test_unknown_transition_model():
    """
    The MBPO agent accepts only prespecified transition model types; a RuntimeError
    should be raised for an unknown model.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "Unknown transition model" in str(excinfo.value)
Example #5
def _fixture(mountain_car_environment, batch_size):
    observation_space = mountain_car_environment.observation_spec()
    action_space = mountain_car_environment.action_spec()

    observation_distr = create_uniform_distribution_from_spec(
        observation_space)
    batch_observations = observation_distr.sample(batch_size)

    reward = MountainCarReward(observation_space, action_space)
    action_distr = create_uniform_distribution_from_spec(action_space)
    batch_actions = action_distr.sample(batch_size)

    return reward, batch_observations, batch_actions, batch_size
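
A hedged usage sketch for this fixture. The step_reward method name, its argument order, and the reuse of the sampled observations as next observations are assumptions for illustration; a mountain car TF environment such as the tf_env built in the other examples is assumed to be available.

# Hedged sketch: exercise the fixture outputs with an (assumed) batched
# reward evaluation; consult the reward model interface for the exact API.
reward, observations, actions, batch_size = _fixture(tf_env, batch_size=8)
rewards = reward.step_reward(observations, actions, observations)
assert rewards.shape == (batch_size,)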
Example #6
def test_planning_policy_batch_environment_model():
    """
    Ensure that the planning policy is operational.
    """

    # number of trajectories for planning, planning horizon and particle count
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # set up the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # set up the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
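
Since the planning policy is driven by DynamicEpisodeDriver above, it follows the standard TF-Agents TFPolicy interface, so a single planned action can also be requested directly; a minimal sketch:

# Hedged sketch: one planned action outside a driver, via the standard
# TFPolicy entry point.
time_step = tf_env.reset()
action_step = planning_policy.action(time_step)
print(action_step.action)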
Example #7
    sample_transitions.observation.numpy(),
    sample_transitions.action.numpy(),
    sample_transitions.next_observation.numpy(),
)

# %% [markdown]
"""
## Training on samples

We define an environment which uses the trained transition model for the dynamics, along with a
reward function, an episode termination condition, an initial state distribution and a bound on
episode length.
"""

# %%
reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
terminates = MountainCarTermination(tf_env.observation_spec())
initial_state_distribution = MountainCarInitialState(tf_env.observation_spec())
environment_model = TFTimeLimit(
    EnvironmentModel(transition_model, reward, terminates, initial_state_distribution),
    duration=200,
)
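
# %% [markdown]
"""
As a quick sanity check (a sketch, not part of the original walkthrough), the
wrapped model can be reset directly, assuming it follows the standard
TF-Agents environment interface:
"""

# %%
time_step = environment_model.reset()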

# %% [markdown]
"""
The agent is trained on data gathered from the environment model. Using the environment interface
means the TF-Agents drivers can be used to generate rollouts.
"""

# %%
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
Example #8
def test_all_mepo_variants_work(transition_model, trajectory_sampler,
                                model_free_agent_type):
    """
    The MEPO agent accepts prespecified transition model, trajectory sampler and
    model-free agent types. Here we check that all combinations execute without errors.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    # define the agent; many transition model and trajectory optimiser
    # parameters can be arbitrary
    agent = MepoAgent(
        time_step_spec,
        action_spec,
        transition_model,
        1,
        10,
        tf.nn.relu,
        ensemble_size,
        False,
        1,
        1,
        [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler,
        horizon,
        population_size,
        model_free_agent_type,
        1,
        10,
        tf.nn.relu,
        2,
    )

    # we need some training data
    random_policy = RandomTFPolicy(
        time_step_spec,
        action_spec,
        info_spec=agent.collect_policy.info_spec,
    )
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        random_policy.trajectory_spec, batch_size=1, max_length=1000)
    collect_driver_random_policy = TFDriver(
        tf_env,
        random_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    initial_time_step = tf_env.reset()
    collect_driver_random_policy.run(initial_time_step)
    agent_trainer = BackgroundPlanningAgentTrainer(10, 10)
    tf_training_scheduler = agent_trainer.create_training_scheduler(
        agent, model_training_buffer)
    training_losses = tf_training_scheduler.maybe_train(
        tf.constant(10, dtype=tf.int64))
    assert EnvironmentModelComponents.TRANSITION in training_losses

    # test the agent
    collect_driver_planning_policy = TFDriver(
        tf_env,
        agent.collect_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
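
Finally, a hedged sketch of an evaluation rollout. That the trained agent exposes a greedy policy as agent.policy, alongside agent.collect_policy, is a TF-Agents convention assumed here rather than shown in the test.

# Hedged sketch: roll out the (assumed) greedy evaluation policy.
eval_driver = TFDriver(
    tf_env,
    agent.policy,
    observers=[model_training_buffer.add_batch],
    max_steps=10,
    disable_tf_function=True,
)
eval_driver.run(tf_env.reset())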