def sample_uniformly_distributed_transitions(
    model: TransitionModel,
    number_of_transitions: int,
    reward_model: RewardModel,
) -> Transition:
    """
    Sample `number_of_transitions` transitions from the model. Draw observations and
    actions from a uniform distribution over the respective spaces. Get corresponding
    rewards from a reward model.
    """
    observation_distribution = create_uniform_distribution_from_spec(
        model.observation_space_spec
    )
    action_distribution = create_uniform_distribution_from_spec(model.action_space_spec)

    observations = observation_distribution.sample((number_of_transitions,))
    actions = action_distribution.sample((number_of_transitions,))
    next_observations = model.step(observations, actions)
    rewards = reward_model.step_reward(observations, actions, next_observations)

    return Transition(
        observation=observations,
        action=actions,
        reward=rewards,
        next_observation=next_observations,
    )
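# Usage sketch (illustrative, not from the original source): draw 100 uniformly
# distributed transitions from a minimal transition model with a constant reward.
# The helper name `_example_sample_uniform_transitions` and the choice of
# LinearTransitionNetwork/ConstantReward are assumptions, mirroring the tests below.
def _example_sample_uniform_transitions(observation_space, action_space):
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    reward_model = ConstantReward(observation_space, action_space, 0.0)
    transitions = sample_uniformly_distributed_transitions(model, 100, reward_model)
    assert transitions.observation.shape[0] == 100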
def test_step_call_shape(
    transition_network, observation_space, action_space, batch_size, ensemble_size
):
    network_list = [
        transition_network(observation_space, bootstrap_data=True)
        for _ in range(ensemble_size)
    ]
    transition_model = KerasTransitionModel(
        network_list,
        observation_space,
        action_space,
        predict_state_difference=True,
        trajectory_sampling_strategy=OneStepTrajectorySampling(batch_size, ensemble_size),
    )

    observation_distribution = create_uniform_distribution_from_spec(observation_space)
    observations = observation_distribution.sample((batch_size,))
    action_distribution = create_uniform_distribution_from_spec(action_space)
    actions = action_distribution.sample((batch_size,))

    next_observations = transition_model.step(observations, actions)

    assert next_observations.shape == (batch_size,) + observation_space.shape
    assert observation_space.is_compatible_with(next_observations[0])
def _create_env_model(observation_space, action_space):
    batch_size = 3
    time_limit = 5
    terminations = MutableBatchConstantTermination(observation_space, batch_size)
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model=model,
            reward_model=ConstantReward(observation_space, action_space, -1.0),
            termination_model=terminations,
            initial_state_distribution_model=DeterministicInitialStateModel(observation),
            batch_size=batch_size,
        ),
        duration=time_limit,
    )

    actions = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    # Initial time step
    env_model.reset()

    observations = np.squeeze(
        np.repeat(np.expand_dims(observation, axis=0), batch_size, axis=0)
    )
    return terminations, observations, actions, env_model
def _fixture(observation_space, action_space, batch_size):
    observation_distr = create_uniform_distribution_from_spec(observation_space)
    batch_observations = observation_distr.sample(batch_size)
    reward = ConstantReward(observation_space, action_space, REWARD_TARGET)
    action_distr = create_uniform_distribution_from_spec(action_space)
    batch_actions = action_distr.sample(batch_size)
    return reward, batch_observations, batch_actions, batch_size
def _fixture(mountain_car_environment, batch_size):
    observation_space = mountain_car_environment.observation_spec()
    action_space = mountain_car_environment.action_spec()
    observation_distr = create_uniform_distribution_from_spec(observation_space)
    batch_observations = observation_distr.sample(batch_size)
    reward = MountainCarReward(observation_space, action_space)
    action_distr = create_uniform_distribution_from_spec(action_space)
    batch_actions = action_distr.sample(batch_size)
    return reward, batch_observations, batch_actions, batch_size
def _distribution(self, time_step, policy_state):
    # The planning subroutine outputs a nested structure of distributions or
    # actions. The trajectory optimiser returns a sequence of actions, one for
    # each virtual planning step. While planners do not currently support
    # batched environments, the TFEnvironment expects a batch dimension of
    # length one in this case.
    #
    # The condition below catches initial states for the optimiser that are
    # terminal (TF-Agents drivers query policies for these states as well). For
    # such states we return a random sample, since NaN cannot work for both
    # continuous and discrete actions; otherwise we call the optimiser.
    actions_or_distributions = tf.cond(
        tf.equal(time_step.is_last(), tf.constant(True)),
        lambda: tf.reshape(
            create_uniform_distribution_from_spec(self.action_spec).sample(),
            1 + self.action_spec.shape,
        ),
        lambda: self.trajectory_optimiser.optimise(
            time_step, self._environment_model
        )[None, 0],
    )

    def _to_distribution(action_or_distribution):
        if isinstance(action_or_distribution, tf.Tensor):
            # This is an action tensor, so wrap it in a deterministic distribution.
            return tfp.distributions.Deterministic(loc=action_or_distribution)
        return action_or_distribution

    distributions = tf.nest.map_structure(_to_distribution, actions_or_distributions)
    return policy_step.PolicyStep(distributions, policy_state)
def test_batched_environment_model(observation_space, action_space, batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(observation_space)

    env_model = EnvironmentModel(
        transition_model, reward, termination, initial_state_sampler, batch_size
    )

    action_distr = create_uniform_distribution_from_spec(action_space)
    single_action = action_distr.sample()
    batch_actions = tf.convert_to_tensor([single_action for _ in range(batch_size)])

    first_step = env_model.reset()
    assert (
        first_step.step_type == [StepType.FIRST for _ in range(batch_size)]
    ).numpy().all()
    assert first_step.observation.shape == [batch_size] + list(observation_space.shape)

    next_step = env_model.step(batch_actions)
    assert (
        next_step.step_type == [StepType.MID for _ in range(batch_size)]
    ).numpy().all()
    assert next_step.observation.shape == [batch_size] + list(observation_space.shape)
    assert next_step.reward.shape == [batch_size]
def get_optimiser_and_environment_model(
    time_step_space,
    observation_space,
    action_space,
    population_size,
    number_of_particles,
    horizon,
    optimiser_policy_trajectory_optimiser_factory,
    sample_shape=(),
):
    reward = ConstantReward(observation_space, action_space, -1.0)
    batched_transition_network = DummyEnsembleTransitionNetwork(observation_space)
    batched_transition_model = KerasTransitionModel(
        [batched_transition_network],
        observation_space,
        action_space,
    )
    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=sample_shape
    )
    environment_model = EnvironmentModel(
        transition_model=batched_transition_model,
        reward_model=reward,
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(observation),
        batch_size=population_size,
    )
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, horizon, population_size, number_of_particles
    )
    return trajectory_optimiser, environment_model
def test_sample_shape_from_uniform_distribution_from_tensor_spec_by_dtype(
    gym_space_shape, dtype
):
    tensor_spec = BoundedTensorSpec(gym_space_shape, dtype, 0, 1)
    uniform_distribution = create_uniform_distribution_from_spec(tensor_spec)
    sample = uniform_distribution.sample()
    assert sample.shape == gym_space_shape
def test_planning_policy_action_shape(
    observation_space, action_space, optimiser_policy_trajectory_optimiser_factory
):
    """
    Ensure action shape of the planning policy is correct.
    """
    population_size = 10
    number_of_particles = 1
    horizon = 7
    time_step_space = time_step_spec(observation_space)

    trajectory_optimiser, environment_model = get_optimiser_and_environment_model(
        time_step_space,
        observation_space,
        action_space,
        population_size=population_size,
        number_of_particles=number_of_particles,
        horizon=horizon,
        optimiser_policy_trajectory_optimiser_factory=optimiser_policy_trajectory_optimiser_factory,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(1,)
    )
    time_step = restart(observation, batch_size=1)

    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    policy_step = planning_policy.action(time_step)
    action = policy_step.action

    assert get_outer_shape(action, action_space) == (1,)
    assert action_space.is_compatible_with(action[0])
def test_generate_virtual_rollouts(observation_space, action_space, batch_size, horizon):
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space), action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon
    )

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size,))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
def _wrapped_environment_fixture(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(observation),
        batch_size=batch_size,
    )
    wrapped_environment_model = TFTimeLimit(env_model, 2)

    action = create_uniform_distribution_from_spec(action_space).sample((batch_size,))
    return wrapped_environment_model, action
def test_step_call_goal_state_transform(
    transition_network,
    observation_space_latent_obs,
    action_space_latent_obs,
    batch_size,
    ensemble_size,
):
    latent_observation_space_spec = BoundedTensorSpec(
        shape=observation_space_latent_obs.shape[:-1]
        + [observation_space_latent_obs.shape[-1] - 1],
        dtype=observation_space_latent_obs.dtype,
        minimum=observation_space_latent_obs.minimum,
        maximum=observation_space_latent_obs.maximum,
        name=observation_space_latent_obs.name,
    )
    network_list = [
        transition_network(latent_observation_space_spec, bootstrap_data=True)
        for _ in range(ensemble_size)
    ]
    observation_transformation = GoalStateObservationTransformation(
        latent_observation_space_spec=latent_observation_space_spec,
        goal_state_start_index=-1,
    )
    transition_model = KerasTransitionModel(
        network_list,
        observation_space_latent_obs,
        action_space_latent_obs,
        predict_state_difference=True,
        trajectory_sampling_strategy=OneStepTrajectorySampling(batch_size, ensemble_size),
        observation_transformation=observation_transformation,
    )

    observation_distribution = create_uniform_distribution_from_spec(
        observation_space_latent_obs
    )
    observations = observation_distribution.sample((batch_size,))
    action_distribution = create_uniform_distribution_from_spec(action_space_latent_obs)
    actions = action_distribution.sample((batch_size,))

    next_observations = transition_model.step(observations, actions)

    assert next_observations.shape == (batch_size,) + observation_space_latent_obs.shape
    assert observation_space_latent_obs.is_compatible_with(next_observations[0])
    tf.assert_equal(next_observations[..., -1], observations[..., -1])
def create_uniform_initial_state_distribution(
    state_spec: BoundedTensorSpec,
) -> InitialStateDistributionModel:
    """
    Helper function to create uniform initial state distributions.
    """
    state_sampler = ProbabilisticInitialStateDistributionModel(
        create_uniform_distribution_from_spec(state_spec)
    )
    return state_sampler
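# Usage sketch (illustrative, not from the original source): plug the uniform
# initial-state sampler into an EnvironmentModel, as the tests in this module do.
# The helper name `_example_env_model_with_uniform_initial_state` and the batch
# size of 2 are assumptions.
def _example_env_model_with_uniform_initial_state(observation_space, action_space):
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    return EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, 0.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=create_uniform_initial_state_distribution(
            observation_space
        ),
        batch_size=2,
    )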
def test_decorate_policy_with_particles_action_shapes(
    observation_space, action_space, population_size, number_of_particles
):
    time_step_space = time_step_spec(observation_space)
    policy = RandomTFPolicy(time_step_space, action_space)
    decorated_policy = decorate_policy_with_particles(policy, number_of_particles)

    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(population_size * number_of_particles,)
    )
    initial_time_step = restart(
        observation, batch_size=population_size * number_of_particles
    )

    policy_step = decorated_policy.action(initial_time_step)
    actions = policy_step.action
    assert (
        actions.shape
        == [population_size * number_of_particles] + action_space.shape.dims
    )
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from
    TF-Agents can be used to generate a rollout. It also serves as an example of
    how to construct "random shooting" rollouts from an environment model.

    The assertion in this test is that the selected action has the expected
    log_prob value, consistent with sampling from a uniform distribution. All
    this really checks is that the preceding code has run successfully.
    """
    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )
    transition_observer = _RecordLastLogProbTransitionObserver()

    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability
    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())
    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
def test_invalid_num_elites(observation_space, action_space, horizon):
    # some fixed parameters
    population_size = 10
    number_of_particles = 1

    # set up the environment model
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    environment_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        population_size,
    )

    # set up the trajectory optimiser
    time_step_space = time_step_spec(observation_space)
    optimiser = cross_entropy_method_trajectory_optimisation(
        time_step_space,
        action_space,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        num_elites=population_size + 1,
        learning_rate=0.1,
        max_iterations=1,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(1,)
    )
    initial_time_step = restart(observation, batch_size=1)

    # run
    with pytest.raises(AssertionError) as excinfo:
        optimiser.optimise(initial_time_step, environment_model)

    assert "num_elites" in str(excinfo)
def sample_uniformly_distributed_observations_and_get_actions(
    policy: TFPolicy, number_of_samples: int
):
    """
    Sample observations from a uniform distribution over the space of observations,
    and then get the corresponding actions from the policy.

    :param policy: A policy, an instance of `TFPolicy`, from which actions are
        obtained.
    :param number_of_samples: Number of observation-action pairs that will be
        sampled.
    :return: Dictionary (`dict`) consisting of 'observations' and 'actions'.
    """
    observation_distribution = create_uniform_distribution_from_spec(
        policy.time_step_spec.observation
    )
    observations = observation_distribution.sample((number_of_samples,))
    rewards = tf.zeros((number_of_samples,), dtype=tf.float32)
    time_step = transition(observations, rewards)
    actions = policy.action(time_step).action
    return {"observations": observations, "actions": actions}
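# Usage sketch (illustrative, not from the original source): draw 100
# observation-action pairs from a RandomTFPolicy; the helper name
# `_example_observation_action_pairs` is hypothetical.
def _example_observation_action_pairs(observation_space, action_space):
    policy = RandomTFPolicy(time_step_spec(observation_space), action_space)
    pairs = sample_uniformly_distributed_observations_and_get_actions(policy, 100)
    assert pairs["observations"].shape[0] == 100
    assert pairs["actions"].shape[0] == 100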
def test_trajectory_optimiser_with_particles_actions_shape(
    action_space, horizon, population_size, number_of_particles
):
    observation = create_uniform_distribution_from_spec(OBSERVATION_SPACE_SPEC).sample(
        sample_shape=(population_size * number_of_particles,)
    )
    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, repeat(observation)
    )
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = ConstantFalseTermination(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(StepType.FIRST),
        batch_size=population_size * number_of_particles,
    )
    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space, automatic_state_reset=False)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        max_iterations=2,
    )

    initial_time_step = restart(tf.expand_dims(observation[0], axis=0))

    optimal_actions = trajectory_optimiser.optimise(initial_time_step, environment_model)

    assert optimal_actions.shape == (horizon + 1,) + action_space.shape
def test_mismatch_between_optimizer_and_environment_model_batch_size(
    observation_space, action_space, optimiser_policy_trajectory_optimiser_factory
):
    time_step_space = time_step_spec(observation_space)
    environment_model = EnvironmentModel(
        StubTrainableTransitionModel(
            observation_space, action_space, predict_state_difference=True
        ),
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
    population_size = environment_model.batch_size + 1
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, 1, population_size, 1
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(1,)
    )
    time_step = restart(observation, batch_size=1)

    with pytest.raises(AssertionError) as excinfo:
        _ = trajectory_optimiser.optimise(time_step, environment_model)

    assert "batch_size parameter is not equal to environment_model.batch_size" in str(
        excinfo
    )
def test_constant_termination(observation_space):
    constant_termination = ConstantFalseTermination(observation_space)
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    assert not constant_termination.terminates(observation)