def test_batched_environment_model(observation_space, action_space,
                                   batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(
        observation_space)

    env_model = EnvironmentModel(transition_model, reward, termination,
                                 initial_state_sampler, batch_size)
    action_distr = create_uniform_distribution_from_spec(action_space)
    single_action = action_distr.sample()
    batch_actions = tf.convert_to_tensor(
        [single_action for _ in range(batch_size)])

    first_step = env_model.reset()
    assert (first_step.step_type == [
        StepType.FIRST for _ in range(batch_size)
    ]).numpy().all()
    assert first_step.observation.shape == [batch_size] + list(
        observation_space.shape)

    next_step = env_model.step(batch_actions)
    assert (next_step.step_type == [StepType.MID
                                    for _ in range(batch_size)]).numpy().all()
    assert next_step.observation.shape == [batch_size] + list(
        observation_space.shape)
    assert next_step.reward.shape == [batch_size]
Example #2
def test_replay_actions_across_batches(observation_space, action_space,
                                       horizon, batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(
        observation_space)

    env_model = TFTimeLimit(
        EnvironmentModel(transition_model, reward, termination,
                         initial_state_sampler, batch_size),
        horizon,
    )

    actions_distribution = create_uniform_distribution_from_spec(action_space)
    actions = actions_distribution.sample((horizon, ))
    trajectory = replay_actions_across_batch_transition_models(
        env_model, actions)

    assert (trajectory.observation.shape == (
        batch_size,
        horizon,
    ) + observation_space.shape)
Example #3
    def __init__(
        self,
        time_step_spec: TimeStep,
        action_spec: NestedTensorSpec,
        transition_model: Union[Tuple[TrainableTransitionModel,
                                      TransitionModelTrainingSpec],
                                TransitionModel],
        reward_model: RewardModel,
        initial_state_distribution_model: InitialStateDistributionModel,
        trajectory_optimiser: TrajectoryOptimiser,
        debug_summaries: bool = False,
        train_step_counter: Optional[tf.Variable] = None,
    ):
        """
        :param time_step_spec: A nest of tf.TypeSpec representing the time_steps.
        :param action_spec: A nest of BoundedTensorSpec representing the actions.
        :param transition_model: A component of the environment model that describes the
            transition dynamics. Either a tuple containing a trainable transition model together
            with training specs, or a pre-specified transition model.
        :param reward_model: A component of the environment model that describes the
            rewards. At the moment only pre-specified reward models are allowed, i.e. the agent
            assumes the reward function is known.
        :param initial_state_distribution_model: A component of the environment model that
            describes the initial state distribution (can be either deterministic or
            probabilistic). At the moment only pre-specified initial state distribution models
            are allowed, i.e. the agent assumes the initial state distribution is known.
        :param trajectory_optimiser: A TrajectoryOptimiser which takes an environment model and
            optimises a sequence of actions over a given horizon.
        :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
        :param train_step_counter: An optional counter to increment every time the train op is run.
            Defaults to the global_step.
        """

        # setting up the environment model and policy
        if isinstance(transition_model, tuple):
            _transition_model, _ = transition_model
        else:
            _transition_model = transition_model  # type: ignore
        environment_model = EnvironmentModel(
            transition_model=_transition_model,
            reward_model=reward_model,
            termination_model=ConstantFalseTermination(
                _transition_model.observation_space_spec),
            initial_state_distribution_model=initial_state_distribution_model,
        )
        planning_policy = PlanningPolicy(environment_model,
                                         trajectory_optimiser)

        super().__init__(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            environment_model.termination_model,
            initial_state_distribution_model,
            planning_policy,
            planning_policy,
            debug_summaries=debug_summaries,
            train_step_counter=train_step_counter,
        )
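
# Illustrative sketch (not from the library above): the `transition_model` argument accepts either
# a bare transition model or a tuple pairing a trainable model with its training spec. The helper
# below only demonstrates the unpacking pattern used in the constructor; `_FakeTransitionModel`
# and `_FakeTrainingSpec` are hypothetical stand-ins invented so the example is runnable.
from typing import Tuple, Union


class _FakeTransitionModel:
    """Hypothetical stand-in for a TransitionModel."""


class _FakeTrainingSpec:
    """Hypothetical stand-in for a TransitionModelTrainingSpec."""


def _unpack_transition_model(
    transition_model: Union[
        Tuple[_FakeTransitionModel, _FakeTrainingSpec], _FakeTransitionModel
    ]
) -> _FakeTransitionModel:
    # A tuple carries a trainable model plus its training spec; otherwise the model is used as-is.
    if isinstance(transition_model, tuple):
        model, _ = transition_model
        return model
    return transition_model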
Example #4
def test_generate_virtual_rollouts(observation_space, action_space, batch_size,
                                   horizon):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space),
                                   action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon)

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size, ))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
Example #5
def assert_rollouts_are_close_to_actuals(model, max_steps):
    tf_env = tf_py_environment.TFPyEnvironment(
        create_pendulum_environment(max_steps))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())

    test_trajectory = policy_evaluation(tf_env,
                                        collect_policy,
                                        num_episodes=1,
                                        max_buffer_capacity=200,
                                        use_function=True)

    start_state = test_trajectory.observation[0, 0, :]

    env_model = TFTimeLimit(
        EnvironmentModel(
            model,
            PendulumReward(tf_env.observation_spec(), tf_env.action_spec()),
            ConstantFalseTermination(tf_env.observation_spec()),
            DeterministicInitialStateModel(start_state),
            batch_size=30,
        ),
        max_steps + 1,
    )

    replayed_trajectories = replay_actions_across_batch_transition_models(
        env_model, test_trajectory.action[0])

    prediction_mean = tf.reduce_mean(replayed_trajectories.observation, axis=0)
    np.testing.assert_allclose(prediction_mean,
                               test_trajectory.observation[0],
                               atol=1e-1,
                               rtol=2e-1)
Example #6
def _create_env_model(observation_space, action_space):
    batch_size = 3
    time_limit = 5

    terminations = MutableBatchConstantTermination(observation_space, batch_size)
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model=model,
            reward_model=ConstantReward(observation_space, action_space, -1.0),
            termination_model=terminations,
            initial_state_distribution_model=DeterministicInitialStateModel(observation),
            batch_size=batch_size,
        ),
        duration=time_limit,
    )

    actions = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    # Initial time step
    env_model.reset()

    observations = np.squeeze(
        np.repeat(np.expand_dims(observation, axis=0), batch_size, axis=0)
    )
    return terminations, observations, actions, env_model
Example #7
def get_optimiser_and_environment_model(
        time_step_space,
        observation_space,
        action_space,
        population_size,
        number_of_particles,
        horizon,
        optimiser_policy_trajectory_optimiser_factory,
        sample_shape=(),
):
    reward = ConstantReward(observation_space, action_space, -1.0)

    batched_transition_network = DummyEnsembleTransitionNetwork(
        observation_space)
    batched_transition_model = KerasTransitionModel(
        [batched_transition_network],
        observation_space,
        action_space,
    )

    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=sample_shape)
    environment_model = EnvironmentModel(
        transition_model=batched_transition_model,
        reward_model=reward,
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=population_size,
    )
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, horizon, population_size,
        number_of_particles)
    return trajectory_optimiser, environment_model
Example #8
    def _time_step_to_initial_observation(
        self,
        time_step: TimeStep,
        environment_model: EnvironmentModel,
    ):
        """
        Construct initial observation from time step.

        :param time_step: Initial time step from the real environment, with a nominal batch size
            of 1 (because the real environment is assumed not to be "batchable").
        :param environment_model: An `EnvironmentModel` is a model of the MDP that represents
            the environment, consisting of a transition, reward, termination and initial state
            distribution model, of which some are trainable and some are fixed.

        :return: Initial observation that has the appropriate batch size as first dimension.
        """

        observation = time_step.observation
        batch_size = get_outer_shape(observation,
                                     environment_model.observation_spec())
        # the time step comes from the real environment
        assert batch_size == (
            1,
        ), f"batch_size of time_step.observation = {batch_size} and it should be 1"
        initial_observation = tf.repeat(observation,
                                        repeats=self._batch_size,
                                        axis=0)
        return initial_observation
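
# Illustrative sketch (not part of the method above): how `tf.repeat` tiles a batch-1 observation
# from the real environment up to the environment model's batch size. The shapes and batch size
# below are arbitrary example values.
import tensorflow as tf

real_env_observation = tf.constant([[0.1, -0.2, 0.3]])  # shape (1, 3): nominal batch size of 1
model_batch_size = 4
tiled_observation = tf.repeat(real_env_observation, repeats=model_batch_size, axis=0)
assert tiled_observation.shape == (model_batch_size, 3)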
Example #9
def _create_wrapped_environment(observation_space, action_space, reward):
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    return EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space, reward),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
Example #10
def test_tf_env_wrapper_is_reset_at_the_start_of_each_iteration(action_space):

    observations_array = [
        # First iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.LAST, StepType.MID],
        [StepType.FIRST, StepType.MID],
        [StepType.LAST, StepType.LAST],
        # Second iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.LAST],
        [StepType.MID, StepType.FIRST],
    ]
    observations = [
        tf.concat(ob_array, axis=0) for ob_array in observations_array
    ]

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=2,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False,
                            validate_args=False)
    policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=3,
        population_size=2,
        max_iterations=2,
        policy_state_updater=policy_state_updater,
    )

    initial_time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST),
                                               axis=0),
                                batch_size=1)

    trajectory_optimiser.optimise(initial_time_step, environment_model)

    for stored_trajectories in policy_state_updater.step_types:
        np.testing.assert_equal(stored_trajectories[:, 0], np.array([0, 0]))
Example #11
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from TF-Agents can be used
    to generate a rollout. This also serves as an example of how to construct "random shooting"
    rollouts from an environment model.

    The assertion in this test is that the selected action has the expected log_prob value,
    consistent with actions drawn from a uniform distribution. All this really checks is that the
    preceding code has run successfully.
    """

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )

    transition_observer = _RecordLastLogProbTransitionObserver()

    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability

    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())

    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
Example #12
def test_trajectory_optimiser_pathological_trajectories(
        action_space, horizon, batch_size):
    """
    The replay buffer is a FIFO buffer of fixed capacity. Ensure that the capacity is sufficient
    such that the initial observation is still present in the buffer even in the pathological case
    where all trajectories are of length 2.
    """

    # construct the environment model
    observations = list(
        chain.from_iterable(
            repeat(
                [
                    replicate(tf.constant(StepType.FIRST), [batch_size]),
                    replicate(tf.constant(StepType.LAST), [batch_size]),
                ],
                horizon,
            )))

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=batch_size,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space)
    stub_policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon,
        population_size=batch_size,
        max_iterations=1,
        policy_state_updater=stub_policy_state_updater,
    )

    time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST), axis=0),
                        batch_size=1)

    trajectory_optimiser.optimise(time_step, environment_model)

    stored_trajectory = stub_policy_state_updater.step_types[0]
    assert stored_trajectory[0][0] == StepType.FIRST
def get_cross_entropy_policy(observation_space, action_space, horizon,
                             batch_size):
    time_step_space = time_step_spec(observation_space)
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        batch_size,
    )
    policy = CrossEntropyMethodPolicy(time_step_space, action_space, horizon,
                                      batch_size)
    return env_model, policy
Example #14
def test_planning_policy_batch_environment_model():
    """
    Ensure that the planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Example #15
def _wrapped_environment_fixture(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    wrapped_environment_model = TFTimeLimit(env_model, 2)

    action = create_uniform_distribution_from_spec(action_space).sample(
        (batch_size, ))

    return wrapped_environment_model, action
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load("MountainCar-v0"))

    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(),
                            -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(EnvironmentModel(model, reward, terminates,
                                               initial_state_sampler),
                              duration=200)

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()

    first_batch_step_type = trajectory.step_type[0, :]
    assert (first_batch_step_type[0] == StepType.FIRST
            and first_batch_step_type[-1] == StepType.LAST)
Example #17
def test_tf_time_limit_wrapper_with_environment_model(observation_space,
                                                      action_space,
                                                      trajectory_length):
    """
    This test checks that the environment wrapper can in turn be wrapped by the `TimeLimit`
    environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space,
                                       action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )
    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
Example #18
def test_invalid_num_elites(observation_space, action_space, horizon):

    # some fixed parameters
    population_size = 10
    number_of_particles = 1

    # set up the environment model
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    environment_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        population_size,
    )

    # set up the trajectory optimiser
    time_step_space = time_step_spec(observation_space)
    optimiser = cross_entropy_method_trajectory_optimisation(
        time_step_space,
        action_space,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        num_elites=population_size + 1,
        learning_rate=0.1,
        max_iterations=1,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    initial_time_step = restart(observation, batch_size=1)

    # run
    with pytest.raises(AssertionError) as excinfo:
        optimiser.optimise(initial_time_step, environment_model)

    assert "num_elites" in str(excinfo)
Example #19
def test_mismatch_between_optimizer_and_environment_model_batch_size(
        observation_space, action_space,
        optimiser_policy_trajectory_optimiser_factory):
    time_step_space = time_step_spec(observation_space)
    environment_model = EnvironmentModel(
        StubTrainableTransitionModel(observation_space,
                                     action_space,
                                     predict_state_difference=True),
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
    population_size = environment_model.batch_size + 1
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, 1, population_size, 1)
    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    time_step = restart(observation, batch_size=1)
    with pytest.raises(AssertionError) as excinfo:
        _ = trajectory_optimiser.optimise(time_step, environment_model)

    assert "batch_size parameter is not equal to environment_model.batch_size" in str(
        excinfo)
Example #20
def test_trajectory_optimiser_with_particles_actions_shape(
        action_space, horizon, population_size, number_of_particles):
    observation = create_uniform_distribution_from_spec(
        OBSERVATION_SPACE_SPEC).sample(sample_shape=(population_size *
                                                     number_of_particles, ))
    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, repeat(observation))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = ConstantFalseTermination(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=population_size * number_of_particles,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        max_iterations=2,
    )

    initial_time_step = restart(tf.expand_dims(observation[0], axis=0))
    optimal_actions = trajectory_optimiser.optimise(initial_time_step,
                                                    environment_model)

    assert optimal_actions.shape == (horizon + 1, ) + action_space.shape
Example #21
    def __init__(
        self,
        transition_model: Union[
            Tuple[TrainableTransitionModel, TransitionModelTrainingSpec], TransitionModel
        ],
        reward_model: RewardModel,
        initial_state_distribution_model: InitialStateDistributionModel,
        model_free_agent: TFAgent,
        planner_batch_size: int,
        planning_horizon: int,
        model_free_training_iterations: int,
        debug_summaries=False,
        train_step_counter=None,
    ):
        """
        :param transition_model: A component of the environment model that describes the
            transition dynamics. Either a tuple containing a trainable transition model together
            with training specs, or a pre-specified transition model.
        :param reward_model: A component of the environment model that describes the
            rewards. At the moment only pre-specified reward models are allowed, i.e. the agent
            assumes the reward function is known.
        :param initial_state_distribution_model: A component of the environment model that
            describes the initial state distribution (can be either deterministic or
            probabilistic). At the moment only pre-specified initial state distribution models
            are allowed, i.e. the agent assumes the initial state distribution is known.
        :param model_free_agent: A model-free agent that is trained virtually with samples from the
            model. This could be, for example, an on-policy agent such as PPO or TRPO, or an
            off-policy agent such as DDPG, SAC or TD3.
        :param planner_batch_size: Number of parallel virtual trajectories.
        :param planning_horizon: Number of steps taken in the environment in each virtual rollout.
        :param model_free_training_iterations: Number of model-free training iterations per
            train call.
        :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
        :param train_step_counter: An optional counter to increment every time the train op is run.
            Defaults to the global_step.
        """
        assert planner_batch_size > 0, "Planner batch size must be positive."
        assert planning_horizon > 0, "Planning horizon must be positive."
        assert (
            model_free_training_iterations > 0
        ), "Model-free train iterations must be positive."

        self._assert_model_free_agent(model_free_agent)

        # for setting up the environment model and policy
        if isinstance(transition_model, tuple):
            _transition_model, _ = transition_model
        else:
            _transition_model = transition_model  # type: ignore

        assert (
            _transition_model.observation_space_spec
            == model_free_agent.time_step_spec.observation
        ), "Transition model observation spec needs to match the model-free agent's one."
        assert (
            _transition_model.action_space_spec == model_free_agent.action_spec
        ), "Transition model action spec needs to match the model-free agent's one."

        super().__init__(
            model_free_agent.time_step_spec,
            model_free_agent.action_spec,
            transition_model,
            reward_model,
            ConstantFalseTermination(_transition_model.observation_space_spec),
            initial_state_distribution_model,
            model_free_agent.policy,
            model_free_agent.collect_policy,
            debug_summaries=debug_summaries,
            train_step_counter=train_step_counter,
        )

        environment_model = EnvironmentModel(
            transition_model=_transition_model,
            reward_model=reward_model,
            termination_model=self.termination_model,
            initial_state_distribution_model=initial_state_distribution_model,
            batch_size=planner_batch_size,
        )

        (
            self._virtual_rollouts_replay_buffer,
            self._virtual_rollouts_driver,
            self._environment_model,
        ) = virtual_rollouts_buffer_and_driver(
            environment_model, model_free_agent.collect_policy, planning_horizon
        )

        self._model_free_agent = model_free_agent
        self._model_free_training_iterations = model_free_training_iterations
Example #22
def test_trajectory_optimiser_each_iteration_starts_with_the_initial_observation(
        action_space, horizon, batch_size, max_iterations):
    class WrappedRandomTFPolicy(TFPolicy):
        def __init__(
            self,
            ts_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            env_model: EnvironmentModel,
        ):
            super().__init__(ts_spec, action_spec)

            self._internal_policy = RandomTFPolicy(ts_spec, action_space)

            self._environment_model = env_model

        def _action(
            self,
            time_step: ts.TimeStep,
            policy_state: types.NestedTensor,
            seed: Optional[types.Seed],
        ) -> policy_step.PolicyStep:
            np.testing.assert_array_equal(
                time_step.observation,
                self._environment_model.current_time_step().observation)
            return self._internal_policy._action(time_step, policy_state, seed)

        def _distribution(
                self, time_step: ts.TimeStep,
                policy_state: types.NestedTensorSpec
        ) -> policy_step.PolicyStep:
            raise NotImplementedError()

    observations = list(
        repeat(replicate(tf.constant(StepType.MID), [batch_size]),
               max_iterations * (horizon + 1)))

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=batch_size,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = WrappedRandomTFPolicy(time_step_space, action_space,
                                   environment_model)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=batch_size,
        max_iterations=max_iterations,
    )

    initial_time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST),
                                               axis=0),
                                batch_size=1)

    trajectory_optimiser.optimise(initial_time_step, environment_model)
Example #23
# %% [markdown]
"""
## Training on samples

We define an environment which uses the trained transition model for the dynamics, along with a
reward function, an episode termination condition, an initial state distribution and a bound on
episode length.
"""

# %%
reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
terminates = MountainCarTermination(tf_env.observation_spec())
initial_state_distribution = MountainCarInitialState(tf_env.observation_spec())
environment_model = TFTimeLimit(
    EnvironmentModel(transition_model, reward, terminates, initial_state_distribution),
    duration=200,
)

# %% [markdown]
"""
The agent is trained on data gathered from the environment model. Using the environment interface
means that TF-Agents drivers can be used to generate rollouts, as sketched below.
"""

# %%
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_capacity,
)
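
# %% [markdown]
"""
As a sketch (not part of the original tutorial code), a TF-Agents episode driver can collect
virtual rollouts from `environment_model` into this replay buffer, in the same way drivers are
used in the examples above. It assumes `tf_agent` exposes a `collect_policy`, as TF-Agents agents
do; the number of episodes per iteration is an arbitrary illustrative value.
"""

# %%
from tf_agents.drivers import dynamic_episode_driver

model_collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    environment_model,
    tf_agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=2,  # illustrative value
)
model_collect_driver.run()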
Example #24
    def __init__(
        self,
        environment_model: EnvironmentModel,
        trajectory_optimiser: TrajectoryOptimiser,
        clip: bool = True,
        emit_log_probability: bool = False,
        automatic_state_reset: bool = True,
        observation_and_action_constraint_splitter: Optional[
            types.Splitter] = None,
        validate_args: bool = True,
        name: Optional[Text] = None,
    ):
        """
        Initializes the class.

        :param environment_model: An `EnvironmentModel` is a model of the MDP that represents
            the environment, consisting of a transition, reward, termination and initial state
            distribution model, of which some are trainable and some are fixed.
        :param trajectory_optimiser: A `TrajectoryOptimiser` takes an environment model and
            optimises a sequence of actions over a given horizon using virtual rollouts.
        :param clip: Whether to clip actions to spec before returning them. By default True.
        :param emit_log_probability: Emit log-probabilities of actions, if supported. If
            True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
            Please consult utility methods provided in policy_step for setting and
            retrieving these. When working with custom policies, either provide a
            dictionary info_spec or a namedtuple with the field 'log_probability'.
        :param automatic_state_reset: If `True`, then `get_initial_policy_state` is used
            to clear state in `action()` and `distribution()` for time steps
            where `time_step.is_first()`.
        :param observation_and_action_constraint_splitter: A function used to process
            observations with action constraints. These constraints can indicate,
            for example, a mask of valid/invalid actions for a given state of the
            environment. The function takes in a full observation and returns a
            tuple consisting of 1) the part of the observation intended as input to
            the network and 2) the constraint. An example
            `observation_and_action_constraint_splitter` could be as simple as:
            ```
            def observation_and_action_constraint_splitter(observation):
                return observation['network_input'], observation['constraint']
            ```
            *Note*: when using `observation_and_action_constraint_splitter`, make
              sure the provided `q_network` is compatible with the network-specific
              half of the output of the
              `observation_and_action_constraint_splitter`. In particular,
              `observation_and_action_constraint_splitter` will be called on the
              observation before passing to the network. If
              `observation_and_action_constraint_splitter` is None, action
              constraints are not applied.
        :param validate_args: Python bool.  Whether to verify inputs to, and outputs of,
            functions like `action` and `distribution` against spec structures,
            dtypes, and shapes. Research code may prefer to set this value to `False`
            to allow iterating on input and output structures without being hamstrung
            by overly rigid checking (at the cost of harder-to-debug errors). See also
            `TFAgent.validate_args`.
        :param name: A name for this module. Defaults to the class name.
        """

        self.trajectory_optimiser = trajectory_optimiser
        self._environment_model = environment_model

        # making sure the batch_size in environment_model is correctly set
        self._environment_model.batch_size = self.trajectory_optimiser.batch_size

        super(PlanningPolicy, self).__init__(
            time_step_spec=environment_model.time_step_spec(),
            action_spec=environment_model.action_spec(),
            policy_state_spec=(),
            info_spec=(),
            clip=clip,
            emit_log_probability=emit_log_probability,
            automatic_state_reset=automatic_state_reset,
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            validate_args=validate_args,
            name=name,
        )
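
# Illustrative usage sketch, mirroring `test_planning_policy_batch_environment_model` above. It
# assumes `tf_env` is the MountainCar `TFPyEnvironment` built in that example and that the same
# imports are available; the horizon and population size are arbitrary example values.
network = LinearTransitionNetwork(tf_env.observation_spec())
transition_model = KerasTransitionModel(
    [network],
    tf_env.observation_spec(),
    tf_env.action_spec(),
)
environment_model = EnvironmentModel(
    transition_model=transition_model,
    reward_model=MountainCarReward(tf_env.observation_spec(), tf_env.action_spec()),
    termination_model=MountainCarTermination(tf_env.observation_spec()),
    initial_state_distribution_model=MountainCarInitialState(tf_env.observation_spec()),
)
trajectory_optimiser = PolicyTrajectoryOptimiser(
    RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec()),
    horizon=5,
    population_size=3,
)
planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)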