def _create_env_model(observation_space, action_space):
    batch_size = 3
    time_limit = 5
    terminations = MutableBatchConstantTermination(observation_space, batch_size)
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model=model,
            reward_model=ConstantReward(observation_space, action_space, -1.0),
            termination_model=terminations,
            initial_state_distribution_model=DeterministicInitialStateModel(observation),
            batch_size=batch_size,
        ),
        duration=time_limit,
    )

    actions = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    # Initial time step
    env_model.reset()

    observations = np.squeeze(
        np.repeat(np.expand_dims(observation, axis=0), batch_size, axis=0)
    )

    return terminations, observations, actions, env_model
def assert_rollouts_are_close_to_actuals(model, max_steps):
    tf_env = tf_py_environment.TFPyEnvironment(create_pendulum_environment(max_steps))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    test_trajectory = policy_evaluation(
        tf_env,
        collect_policy,
        num_episodes=1,
        max_buffer_capacity=200,
        use_function=True,
    )

    start_state = test_trajectory.observation[0, 0, :]

    env_model = TFTimeLimit(
        EnvironmentModel(
            model,
            PendulumReward(tf_env.observation_spec(), tf_env.action_spec()),
            ConstantFalseTermination(tf_env.observation_spec()),
            DeterministicInitialStateModel(start_state),
            batch_size=30,
        ),
        max_steps + 1,
    )

    replayed_trajectories = replay_actions_across_batch_transition_models(
        env_model, test_trajectory.action[0]
    )

    prediction_mean = tf.reduce_mean(replayed_trajectories.observation, axis=0)
    np.testing.assert_allclose(
        prediction_mean, test_trajectory.observation[0], atol=1e-1, rtol=2e-1
    )
def test_replay_actions_across_batches(
    observation_space, action_space, horizon, batch_size
):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(observation_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model, reward, termination, initial_state_sampler, batch_size
        ),
        horizon,
    )

    actions_distribution = create_uniform_distribution_from_spec(action_space)
    actions = actions_distribution.sample((horizon,))
    trajectory = replay_actions_across_batch_transition_models(env_model, actions)

    assert (
        trajectory.observation.shape == (batch_size, horizon) + observation_space.shape
    )
def test_generate_virtual_rollouts_assert_no_time_limit_wrapper(mocker):
    env_model = TFTimeLimit(mocker.MagicMock(spec=EnvironmentModel), 100)
    policy = mocker.MagicMock(spec=TFPolicy)

    with pytest.raises(AssertionError) as excinfo:
        virtual_rollouts_buffer_and_driver(env_model, policy, 64)

    assert "should not be wrapped" in str(excinfo)
def _wrapped_environment_fixture(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(observation),
        batch_size=batch_size,
    )
    wrapped_environment_model = TFTimeLimit(env_model, 2)

    action = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    return wrapped_environment_model, action
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load("MountainCar-v0"))

    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(), -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(
        EnvironmentModel(model, reward, terminates, initial_state_sampler), duration=200
    )

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec, batch_size=1, max_length=replay_buffer_capacity
    )

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()
    first_batch_step_type = trajectory.step_type[0, :]
    assert (
        first_batch_step_type[0] == StepType.FIRST
        and first_batch_step_type[-1] == StepType.LAST
    )
def test_tf_time_limit_wrapper_with_environment_model(
    observation_space, action_space, trajectory_length
):
    """
    This test checks that the environment model can in turn be wrapped by the `TimeLimit`
    environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec, batch_size=1, max_length=replay_buffer_capacity
    )

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )

    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
# %% [markdown]
"""
## Training on samples

We define an environment model which uses the trained transition model for the dynamics,
along with a reward function, an episode termination condition, an initial state
distribution and a bound on episode length.
"""

# %%
reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
terminates = MountainCarTermination(tf_env.observation_spec())
initial_state_distribution = MountainCarInitialState(tf_env.observation_spec())
environment_model = TFTimeLimit(
    EnvironmentModel(transition_model, reward, terminates, initial_state_distribution),
    duration=200,
)

# %% [markdown]
"""
The agent is trained on data gathered from the environment model. Because the model
implements the environment interface, the TF-Agents drivers can be used to generate
rollouts.
"""

# %%
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_capacity,
)
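# %% [markdown]
"""
As a minimal sketch of that last point, a `DynamicEpisodeDriver` can roll out whole
episodes inside the environment model and record them in the replay buffer. This assumes
the `tf_agent` defined earlier exposes a `collect_policy`, as TF-Agents agents do; the
number of episodes here is purely illustrative.
"""

# %%
from tf_agents.drivers import dynamic_episode_driver

virtual_rollout_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    environment_model,
    tf_agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=2,
)
# Each episode starts from the initial state distribution and runs until the termination
# condition fires or the 200-step limit imposed by the TFTimeLimit wrapper is reached.
virtual_rollout_driver.run()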