def test_replay_actions_across_batches(
    observation_space, action_space, horizon, batch_size
):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(observation_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model, reward, termination, initial_state_sampler, batch_size
        ),
        horizon,
    )

    actions_distribution = create_uniform_distribution_from_spec(action_space)
    actions = actions_distribution.sample((horizon,))

    trajectory = replay_actions_across_batch_transition_models(env_model, actions)

    assert trajectory.observation.shape == (batch_size, horizon) + observation_space.shape

def test_batched_environment_model(observation_space, action_space, batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(observation_space)
    env_model = EnvironmentModel(
        transition_model, reward, termination, initial_state_sampler, batch_size
    )

    action_distr = create_uniform_distribution_from_spec(action_space)
    single_action = action_distr.sample()
    batch_actions = tf.convert_to_tensor([single_action for _ in range(batch_size)])

    first_step = env_model.reset()
    assert (
        first_step.step_type == [StepType.FIRST for _ in range(batch_size)]
    ).numpy().all()
    assert first_step.observation.shape == [batch_size] + list(observation_space.shape)

    next_step = env_model.step(batch_actions)
    assert (
        next_step.step_type == [StepType.MID for _ in range(batch_size)]
    ).numpy().all()
    assert next_step.observation.shape == [batch_size] + list(observation_space.shape)
    assert next_step.reward.shape == [batch_size]

def _create_wrapped_environment(observation_space, action_space, reward):
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    return EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space, reward),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

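# A minimal usage sketch for `_create_wrapped_environment` (hypothetical helper,
# not part of the original suite): stepping the wrapped model should surface the
# constant reward it was constructed with.
def _example_constant_reward_roundtrip(observation_space, action_space):
    env_model = _create_wrapped_environment(observation_space, action_space, 1.0)
    env_model.reset()
    actions = create_uniform_distribution_from_spec(action_space).sample(
        sample_shape=(env_model.batch_size,)
    )
    next_step = env_model.step(actions)
    np.testing.assert_allclose(next_step.reward.numpy(), 1.0)
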
def get_cross_entropy_policy(observation_space, action_space, horizon, batch_size):
    time_step_space = time_step_spec(observation_space)
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        batch_size,
    )
    policy = CrossEntropyMethodPolicy(time_step_space, action_space, horizon, batch_size)
    return env_model, policy

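# A usage sketch for `get_cross_entropy_policy` (hypothetical, relying only on
# the generic TFPolicy interface): the cross-entropy method policy is stateful,
# so `policy_state` must be threaded through successive `action` calls.
def _example_cross_entropy_policy_rollout(observation_space, action_space):
    env_model, policy = get_cross_entropy_policy(
        observation_space, action_space, horizon=10, batch_size=5
    )
    time_step = env_model.reset()
    policy_state = policy.get_initial_state(env_model.batch_size)
    for _ in range(3):
        policy_step = policy.action(time_step, policy_state)
        time_step = env_model.step(policy_step.action)
        policy_state = policy_step.state
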
def test_train_method_increments_counter_for_generic_background_planning(
    mocker, agent_class
):
    """
    The docstring for the `_train` method of a TFAgent requires that the
    implementation increments the `train_step_counter`.
    """
    population_size = 1
    horizon = 10
    model_free_training_iterations = 1

    mf_agent = create_mock_model_free_agent(
        mocker, TIMESTEP_SPEC, ACTION_SPEC, agent_class
    )
    network = LinearTransitionNetwork(OBSERVATION_SPEC)
    transition_model = KerasTransitionModel([network], OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)
    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    model_based_agent = BackgroundPlanningAgent(
        (transition_model, TransitionModelTrainingSpec(1, 1)),
        reward_model,
        initial_state_model,
        mf_agent,
        population_size,
        horizon,
        model_free_training_iterations,
        train_step_counter=train_step_counter,
    )
    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {
        TRAIN_ARGSPEC_COMPONENT_ID: EnvironmentModelComponents.TRANSITION.value
    }
    model_based_agent.train(dummy_trajectories, **train_kwargs)

    assert train_step_counter.value() == 1

def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from
    TF-Agents can be used to generate a rollout. It also serves as an example of
    how to construct "random shooting" rollouts from an environment model.

    The assertion in this test is that the selected action has the expected
    log_prob value, consistent with sampling from a uniform distribution. All
    this really checks is that the preceding code has run successfully.
    """
    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )
    transition_observer = _RecordLastLogProbTransitionObserver()
    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability
    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())
    np.testing.assert_array_almost_equal(actual, expected, decimal=4)

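# For reference, a sketch of the arithmetic behind the assertion above (assuming
# a uniform distribution over a bounded Box action space): the log probability
# of any sample is the negative log volume of the space.
def _uniform_log_prob_by_hand(action_space):
    # Broadcast in case the spec stores scalar bounds for a multi-dimensional space.
    ranges = np.broadcast_to(
        np.asarray(action_space.maximum) - np.asarray(action_space.minimum),
        tuple(action_space.shape),
    )
    return -np.sum(np.log(ranges))
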
def test_train_method_increments_counter_for_model_free_supported_agents(
    mocker, agent_class, train_component
):
    """
    The docstring for the `_train` method of a TFAgent requires that the
    implementation increments the `train_step_counter`.
    """
    population_size = 1
    number_of_particles = 1
    horizon = 10

    mf_agent = create_mock_model_free_agent(
        mocker, TIMESTEP_SPEC, ACTION_SPEC, agent_class
    )
    trajectory_optimiser = random_shooting_trajectory_optimisation(
        TIMESTEP_SPEC, ACTION_SPEC, horizon, population_size, number_of_particles
    )
    network = LinearTransitionNetwork(OBSERVATION_SPEC)
    transition_model = KerasTransitionModel([network], OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)
    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    agent = ModelFreeSupportedDecisionTimePlanningAgent(
        TIMESTEP_SPEC,
        ACTION_SPEC,
        (transition_model, TransitionModelTrainingSpec(1, 1)),
        reward_model,
        initial_state_model,
        trajectory_optimiser,
        mf_agent,
        train_step_counter=train_step_counter,
    )
    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {TRAIN_ARGSPEC_COMPONENT_ID: train_component.value}
    agent.train(dummy_trajectories, **train_kwargs)

    assert train_step_counter.value() == 1

def test_tf_time_limit_wrapper_with_environment_model(
    observation_space, action_space, trajectory_length
):
    """
    This test checks that the environment wrapper can in turn be wrapped by the
    `TimeLimit` environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec, batch_size=1, max_length=replay_buffer_capacity
    )

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )
    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)

def test_invalid_num_elites(observation_space, action_space, horizon):
    # some fixed parameters
    population_size = 10
    number_of_particles = 1

    # set up the environment model
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    environment_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        population_size,
    )

    # set up the trajectory optimiser with more elites than the population holds
    time_step_space = time_step_spec(observation_space)
    optimiser = cross_entropy_method_trajectory_optimisation(
        time_step_space,
        action_space,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        num_elites=population_size + 1,
        learning_rate=0.1,
        max_iterations=1,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(1,)
    )
    initial_time_step = restart(observation, batch_size=1)

    # run
    with pytest.raises(AssertionError) as excinfo:
        optimiser.optimise(initial_time_step, environment_model)

    assert "num_elites" in str(excinfo)

def test_train_oracle_transition_model():
    """
    Ensure that a non-trainable oracle transition model does not cause the agent
    `train` method to fail.
    """
    population_size = 1
    number_of_particles = 1
    horizon = 10

    trajectory_optimiser = random_shooting_trajectory_optimisation(
        TIMESTEP_SPEC, ACTION_SPEC, horizon, population_size, number_of_particles
    )
    transition_model = StubTransitionModel(OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)
    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    with pytest.warns(RuntimeWarning):
        agent = DecisionTimePlanningAgent(
            TIMESTEP_SPEC,
            ACTION_SPEC,
            transition_model,
            reward_model,
            initial_state_model,
            trajectory_optimiser,
            train_step_counter=train_step_counter,
        )
    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {
        TRAIN_ARGSPEC_COMPONENT_ID: EnvironmentModelComponents.TRANSITION.value
    }
    loss_info = agent.train(dummy_trajectories, **train_kwargs)

    assert loss_info.loss is None
    assert loss_info.extra is None

def test_mismatch_between_optimizer_and_environment_model_batch_size(
    observation_space, action_space, optimiser_policy_trajectory_optimiser_factory
):
    time_step_space = time_step_spec(observation_space)
    environment_model = EnvironmentModel(
        StubTrainableTransitionModel(
            observation_space, action_space, predict_state_difference=True
        ),
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
    population_size = environment_model.batch_size + 1
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, 1, population_size, 1
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(1,)
    )
    time_step = restart(observation, batch_size=1)

    with pytest.raises(AssertionError) as excinfo:
        _ = trajectory_optimiser.optimise(time_step, environment_model)

    assert "batch_size parameter is not equal to environment_model.batch_size" in str(
        excinfo
    )