Example #1
def sample_uniformly_distributed_transitions(
        model: TransitionModel, number_of_transitions: int,
        reward_model: RewardModel) -> Transition:
    """
    Sample `number_of_transitions` transitions from the model. Draw observations and actions from a
    uniform distribution over the respective spaces. Get corresponding rewards from a reward model.
    """

    observation_distribution = create_uniform_distribution_from_spec(
        model.observation_space_spec)
    action_distribution = create_uniform_distribution_from_spec(
        model.action_space_spec)

    observations = observation_distribution.sample((number_of_transitions, ))
    actions = action_distribution.sample((number_of_transitions, ))
    next_observations = model.step(observations, actions)
    rewards = reward_model.step_reward(observations, actions,
                                       next_observations)

    return Transition(
        observation=observations,
        action=actions,
        reward=rewards,
        next_observation=next_observations,
    )
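A minimal usage sketch for the helper above. The construction of the transition model and reward model mirrors the later examples in this listing; the spaces and the sample count are illustrative assumptions, not part of the original example.

# Illustrative usage sketch (assumes the same imports as the surrounding examples).
network = LinearTransitionNetwork(observation_space)
model = KerasTransitionModel([network], observation_space, action_space)
reward_model = ConstantReward(observation_space, action_space, 0.0)

# Each field of the returned Transition is batched with a leading dimension of 100.
transitions = sample_uniformly_distributed_transitions(model, 100, reward_model)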
Example #2
def test_step_call_shape(
    transition_network,
    observation_space,
    action_space,
    batch_size,
    ensemble_size,
):
    network_list = [
        transition_network(observation_space, bootstrap_data=True)
        for _ in range(ensemble_size)
    ]
    transition_model = KerasTransitionModel(
        network_list,
        observation_space,
        action_space,
        predict_state_difference=True,
        trajectory_sampling_strategy=OneStepTrajectorySampling(batch_size, ensemble_size),
    )
    observation_distribution = create_uniform_distribution_from_spec(observation_space)
    observations = observation_distribution.sample((batch_size,))
    action_distribution = create_uniform_distribution_from_spec(action_space)
    actions = action_distribution.sample((batch_size,))

    next_observations = transition_model.step(observations, actions)

    assert next_observations.shape == (batch_size,) + observation_space.shape
    assert observation_space.is_compatible_with(next_observations[0])
Example #3
def _create_env_model(observation_space, action_space):
    batch_size = 3
    time_limit = 5

    terminations = MutableBatchConstantTermination(observation_space, batch_size)
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model=model,
            reward_model=ConstantReward(observation_space, action_space, -1.0),
            termination_model=terminations,
            initial_state_distribution_model=DeterministicInitialStateModel(observation),
            batch_size=batch_size,
        ),
        duration=time_limit,
    )

    actions = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    # Initial time step
    env_model.reset()

    observations = np.squeeze(
        np.repeat(np.expand_dims(observation, axis=0), batch_size, axis=0)
    )
    return terminations, observations, actions, env_model
Example #4
def _fixture(observation_space, action_space, batch_size):
    observation_distr = create_uniform_distribution_from_spec(
        observation_space)
    batch_observations = observation_distr.sample(batch_size)

    reward = ConstantReward(observation_space, action_space, REWARD_TARGET)
    action_distr = create_uniform_distribution_from_spec(action_space)
    batch_actions = action_distr.sample(batch_size)

    return reward, batch_observations, batch_actions, batch_size
Example #5
def _fixture(mountain_car_environment, batch_size):
    observation_space = mountain_car_environment.observation_spec()
    action_space = mountain_car_environment.action_spec()

    observation_distr = create_uniform_distribution_from_spec(
        observation_space)
    batch_observations = observation_distr.sample(batch_size)

    reward = MountainCarReward(observation_space, action_space)
    action_distr = create_uniform_distribution_from_spec(action_space)
    batch_actions = action_distr.sample(batch_size)

    return reward, batch_observations, batch_actions, batch_size
Example #6
    def _distribution(self, time_step, policy_state):
        # The planning subroutine outputs a nested structure of distributions or actions. The
        # trajectory optimiser returns a sequence of actions, one for each virtual planning
        # step. Planners do not currently support batched environments, but the TFEnvironment
        # expects a batch dimension of length one, hence the reshape below.

        # The condition below catches initial states for the optimiser that are terminal
        # (TF-Agents drivers query policies for these states as well). In that case we return a
        # random sample (NaN cannot work for both continuous and discrete actions); otherwise
        # we call the optimiser.
        actions_or_distributions = tf.cond(
            tf.equal(time_step.is_last(), tf.constant(True)),
            lambda: tf.reshape(
                create_uniform_distribution_from_spec(self.action_spec).sample(
                ),
                1 + self.action_spec.shape,
            ),
            lambda: self.trajectory_optimiser.optimise(
                time_step, self._environment_model)[None, 0],
        )

        def _to_distribution(action_or_distribution):
            if isinstance(action_or_distribution, tf.Tensor):
                # This is an action tensor, so wrap it in a deterministic distribution.
                return tfp.distributions.Deterministic(
                    loc=action_or_distribution)
            return action_or_distribution

        distributions = tf.nest.map_structure(_to_distribution,
                                              actions_or_distributions)
        return policy_step.PolicyStep(distributions, policy_state)
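The `_to_distribution` helper above relies on wrapping a concrete action tensor in `tfp.distributions.Deterministic`. A standalone sketch of that trick, independent of the policy class (illustrative only):

import tensorflow as tf
import tensorflow_probability as tfp

# Wrapping an action tensor as a Deterministic distribution lets downstream code
# treat actions and action distributions uniformly: sampling simply returns the action.
action = tf.constant([0.5, -0.25])
action_distribution = tfp.distributions.Deterministic(loc=action)
tf.debugging.assert_equal(action_distribution.sample(), action)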
Example #7
def test_batched_environment_model(observation_space, action_space,
                                   batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(
        observation_space)

    env_model = EnvironmentModel(transition_model, reward, termination,
                                 initial_state_sampler, batch_size)
    action_distr = create_uniform_distribution_from_spec(action_space)
    single_action = action_distr.sample()
    batch_actions = tf.convert_to_tensor(
        [single_action for _ in range(batch_size)])

    first_step = env_model.reset()
    assert (first_step.step_type == [
        StepType.FIRST for _ in range(batch_size)
    ]).numpy().all()
    assert first_step.observation.shape == [batch_size] + list(
        observation_space.shape)

    next_step = env_model.step(batch_actions)
    assert (next_step.step_type == [StepType.MID
                                    for _ in range(batch_size)]).numpy().all()
    assert next_step.observation.shape == [batch_size] + list(
        observation_space.shape)
    assert next_step.reward.shape == [batch_size]
Example #8
def get_optimiser_and_environment_model(
        time_step_space,
        observation_space,
        action_space,
        population_size,
        number_of_particles,
        horizon,
        optimiser_policy_trajectory_optimiser_factory,
        sample_shape=(),
):
    reward = ConstantReward(observation_space, action_space, -1.0)

    batched_transition_network = DummyEnsembleTransitionNetwork(
        observation_space)
    batched_transition_model = KerasTransitionModel(
        [batched_transition_network],
        observation_space,
        action_space,
    )

    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=sample_shape)
    environment_model = EnvironmentModel(
        transition_model=batched_transition_model,
        reward_model=reward,
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=population_size,
    )
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, horizon, population_size,
        number_of_particles)
    return trajectory_optimiser, environment_model
Example #9
def test_sample_shape_from_uniform_distribution_from_tensor_spec_by_dtype(
        gym_space_shape, dtype):
    tensor_spec = BoundedTensorSpec(gym_space_shape, dtype, 0, 1)
    uniform_distribution = create_uniform_distribution_from_spec(tensor_spec)

    sample = uniform_distribution.sample()
    assert sample.shape == gym_space_shape
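For concreteness, a small sketch of the behaviour this test checks. The spec values are illustrative; `create_uniform_distribution_from_spec` is the helper exercised throughout this listing.

import tensorflow as tf
from tf_agents.specs import BoundedTensorSpec

# A uniform distribution built from a bounded spec samples tensors with the spec's shape.
spec = BoundedTensorSpec((2, 3), tf.float32, minimum=0.0, maximum=1.0)
distribution = create_uniform_distribution_from_spec(spec)

single_sample = distribution.sample()         # shape (2, 3)
batch_of_samples = distribution.sample((5,))  # shape (5, 2, 3)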
Example #10
def test_planning_policy_action_shape(
        observation_space, action_space,
        optimiser_policy_trajectory_optimiser_factory):
    """
    Ensure the action shape of the planning policy is correct.
    """
    population_size = 10
    number_of_particles = 1
    horizon = 7
    time_step_space = time_step_spec(observation_space)
    trajectory_optimiser, environment_model = get_optimiser_and_environment_model(
        time_step_space,
        observation_space,
        action_space,
        population_size=population_size,
        number_of_particles=number_of_particles,
        horizon=horizon,
        optimiser_policy_trajectory_optimiser_factory=
        optimiser_policy_trajectory_optimiser_factory,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    time_step = restart(observation, batch_size=1)

    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    policy_step = planning_policy.action(time_step)
    action = policy_step.action
    assert get_outer_shape(action, action_space) == (1, )
    assert action_space.is_compatible_with(action[0])
Example #11
def test_generate_virtual_rollouts(observation_space, action_space, batch_size,
                                   horizon):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space),
                                   action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon)

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size, ))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
Example #12
def _wrapped_environment_fixture(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    wrapped_environment_model = TFTimeLimit(env_model, 2)

    action = create_uniform_distribution_from_spec(action_space).sample(
        (batch_size, ))

    return wrapped_environment_model, action
Example #13
def test_step_call_goal_state_transform(
    transition_network,
    observation_space_latent_obs,
    action_space_latent_obs,
    batch_size,
    ensemble_size,
):
    latent_observation_space_spec = BoundedTensorSpec(
        shape=observation_space_latent_obs.shape[:-1]
        + [observation_space_latent_obs.shape[-1] - 1],
        dtype=observation_space_latent_obs.dtype,
        minimum=observation_space_latent_obs.minimum,
        maximum=observation_space_latent_obs.maximum,
        name=observation_space_latent_obs.name,
    )
    network_list = [
        transition_network(latent_observation_space_spec, bootstrap_data=True)
        for _ in range(ensemble_size)
    ]
    observation_transformation = GoalStateObservationTransformation(
        latent_observation_space_spec=latent_observation_space_spec,
        goal_state_start_index=-1,
    )
    transition_model = KerasTransitionModel(
        network_list,
        observation_space_latent_obs,
        action_space_latent_obs,
        predict_state_difference=True,
        trajectory_sampling_strategy=OneStepTrajectorySampling(batch_size, ensemble_size),
        observation_transformation=observation_transformation,
    )
    observation_distribution = create_uniform_distribution_from_spec(
        observation_space_latent_obs
    )
    observations = observation_distribution.sample((batch_size,))
    action_distribution = create_uniform_distribution_from_spec(action_space_latent_obs)
    actions = action_distribution.sample((batch_size,))

    next_observations = transition_model.step(observations, actions)

    assert next_observations.shape == (batch_size,) + observation_space_latent_obs.shape
    assert observation_space_latent_obs.is_compatible_with(next_observations[0])
    tf.assert_equal(next_observations[..., -1], observations[..., -1])
Example #14
def create_uniform_initial_state_distribution(
    state_spec: BoundedTensorSpec,
) -> InitialStateDistributionModel:
    """
    Helper function to create uniform initial state distributions.
    """
    state_sampler = ProbabilisticInitialStateDistributionModel(
        create_uniform_distribution_from_spec(state_spec)
    )
    return state_sampler
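A short sketch of how this helper is typically plugged into an `EnvironmentModel`, mirroring the other examples in this listing. The transition, reward and termination models, and the batch size, are illustrative assumptions.

# Illustrative sketch (components built as in the other examples in this listing).
network = LinearTransitionNetwork(observation_space)
transition_model = KerasTransitionModel([network], observation_space, action_space)

env_model = EnvironmentModel(
    transition_model=transition_model,
    reward_model=ConstantReward(observation_space, action_space, 0.0),
    termination_model=ConstantFalseTermination(observation_space),
    initial_state_distribution_model=create_uniform_initial_state_distribution(observation_space),
    batch_size=8,
)
first_time_step = env_model.reset()  # initial observations drawn uniformly from observation_space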
Example #15
def test_decorate_policy_with_particles_action_shapes(
    observation_space, action_space, population_size, number_of_particles
):
    time_step_space = time_step_spec(observation_space)
    policy = RandomTFPolicy(time_step_space, action_space)
    decorated_policy = decorate_policy_with_particles(policy, number_of_particles)

    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(population_size * number_of_particles,)
    )
    initial_time_step = restart(observation, batch_size=population_size * number_of_particles)
    policy_step = decorated_policy.action(initial_time_step)
    actions = policy_step.action
    assert actions.shape == [population_size * number_of_particles] + action_space.shape.dims
Example #16
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from TF-Agents can be used
    to generate a rollout. This also serves as an example of how to construct "random shooting"
    rollouts from an environment model.

    The assertion in this test is that selected action has the expected log_prob value consistent
    with optimisers from a uniform distribution. All this is really checking is that the preceeding
    code has run successfully.
    """

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )

    transition_observer = _RecordLastLogProbTransitionObserver()

    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability

    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())

    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
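The log-probability comparison above works because every in-range sample of a uniform distribution has the same log-probability. A standalone illustration with TensorFlow Probability, independent of the helper under test (values are illustrative):

import numpy as np
import tensorflow_probability as tfp

# Two independent Uniform(-1, 1) dimensions: each spans a width of 2, so the
# per-dimension log-probability of any in-range sample is log(1/2).
uniform = tfp.distributions.Uniform(low=[-1.0, -1.0], high=[1.0, 1.0])
sample = uniform.sample()
np.testing.assert_allclose(uniform.log_prob(sample).numpy(), np.log(0.5), rtol=1e-6)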
Example #17
def test_invalid_num_elites(observation_space, action_space, horizon):

    # some fixed parameters
    population_size = 10
    number_of_particles = 1

    # set up the environment model
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    environment_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        population_size,
    )

    # set up the trajectory optimizer
    time_step_space = time_step_spec(observation_space)
    optimiser = cross_entropy_method_trajectory_optimisation(
        time_step_space,
        action_space,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        num_elites=population_size + 1,
        learning_rate=0.1,
        max_iterations=1,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    initial_time_step = restart(observation, batch_size=1)

    # run
    with pytest.raises(AssertionError) as excinfo:
        optimiser.optimise(initial_time_step, environment_model)

    assert "num_elites" in str(excinfo)
Example #18
def sample_uniformly_distributed_observations_and_get_actions(
        policy: TFPolicy, number_of_samples: int):
    """
    Sample observations from a uniform distribution over the space of observations, and then get
    corresponding actions from the policy.

    :param policy: A policy, instance of `TFPolicy`, from which observations and actions are
                   sampled.
    :param number_of_samples: Number of observation action pairs that will be sampled.

    :return: Dictionary (`dict`) consisting of 'observations' and 'actions'.
    """
    observation_distribution = create_uniform_distribution_from_spec(
        policy.time_step_spec.observation)

    observations = observation_distribution.sample((number_of_samples, ))
    rewards = tf.zeros((number_of_samples, ), dtype=tf.float32)

    time_step = transition(observations, rewards)

    actions = policy.action(time_step).action

    return {"observations": observations, "actions": actions}
Example #19
def test_trajectory_optimiser_with_particles_actions_shape(
        action_space, horizon, population_size, number_of_particles):
    observation = create_uniform_distribution_from_spec(
        OBSERVATION_SPACE_SPEC).sample(sample_shape=(population_size *
                                                     number_of_particles, ))
    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, repeat(observation))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = ConstantFalseTermination(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=population_size * number_of_particles,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        max_iterations=2,
    )

    initial_time_step = restart(tf.expand_dims(observation[0], axis=0))
    optimal_actions = trajectory_optimiser.optimise(initial_time_step,
                                                    environment_model)

    assert optimal_actions.shape == (horizon + 1, ) + action_space.shape
Example #20
def test_mismatch_between_optimizer_and_environment_model_batch_size(
        observation_space, action_space,
        optimiser_policy_trajectory_optimiser_factory):
    time_step_space = time_step_spec(observation_space)
    environment_model = EnvironmentModel(
        StubTrainableTransitionModel(observation_space,
                                     action_space,
                                     predict_state_difference=True),
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
    population_size = environment_model.batch_size + 1
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, 1, population_size, 1)
    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    time_step = restart(observation, batch_size=1)
    with pytest.raises(AssertionError) as excinfo:
        _ = trajectory_optimiser.optimise(time_step, environment_model)

    assert "batch_size parameter is not equal to environment_model.batch_size" in str(
        excinfo)
Example #21
def test_constant_termination(observation_space):
    constant_termination = ConstantFalseTermination(observation_space)

    observation = create_uniform_distribution_from_spec(observation_space).sample()

    assert not constant_termination.terminates(observation)