def test_tf_environment_with_random(n_episodes=20):
    """Test tf environment through random actions."""
    print(f'Testing tf environment over {n_episodes} episodes.')
    env = LakeMonsterEnvironment(**params)
    env = TFPyEnvironment(env)
    policy = RandomTFPolicy(time_step_spec=env.time_step_spec(),
                            action_spec=env.action_spec())

    ts = env.reset()
    rewards = []
    n_steps = []
    for _ in tqdm(range(n_episodes)):
        n_step = 0
        while not ts.is_last():
            action = policy.action(ts).action
            ts = env.step(action)
            n_step += 1
        reward = ts.reward
        rewards.append(reward)
        n_steps.append(n_step)
        ts = env.reset()

    # print results
    print('average num of steps per episode:', np.mean(n_steps))
    print('average reward per episode', np.mean(rewards))
class WrappedRandomTFPolicy(TFPolicy):
    def __init__(
        self,
        ts_spec: ts.TimeStep,
        action_spec: types.NestedTensorSpec,
        env_model: EnvironmentModel,
    ):
        super().__init__(ts_spec, action_spec)

        self._internal_policy = RandomTFPolicy(ts_spec, action_spec)
        self._environment_model = env_model

    def _action(
        self,
        time_step: ts.TimeStep,
        policy_state: types.NestedTensor,
        seed: Optional[types.Seed],
    ) -> policy_step.PolicyStep:
        np.testing.assert_array_equal(
            time_step.observation,
            self._environment_model.current_time_step().observation)
        return self._internal_policy._action(time_step, policy_state, seed)

    def _distribution(
        self, time_step: ts.TimeStep, policy_state: types.NestedTensorSpec
    ) -> policy_step.PolicyStep:
        raise NotImplementedError()
def test_generate_virtual_rollouts(observation_space, action_space, batch_size, horizon):
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space), action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon)

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size,))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
def test_get_batch_of_actions(observation_space, action_space, batch_size):
    policy = RandomTFPolicy(time_step_spec(observation_space), action_space)
    samples = sample_uniformly_distributed_observations_and_get_actions(policy, batch_size)

    for i in range(batch_size):
        assert action_space.is_compatible_with(samples["actions"][i, ...])
def test_incorrect_termination_model():
    """
    The generic model-based agent should only allow a ConstantFalseTermination model.
    """
    # setup arguments for the model-based agent constructor
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    network = LinearTransitionNetwork(observation_spec)
    transition_model = KerasTransitionModel([network], observation_spec, action_spec)
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)
    termination_model = MountainCarTermination(observation_spec)
    policy = RandomTFPolicy(time_step_spec, action_spec)

    with pytest.raises(AssertionError) as excinfo:
        ModelBasedAgent(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            termination_model,
            initial_state_distribution_model,
            policy,
            policy,
        )

    assert "Only constant false termination supported" in str(excinfo.value)
def assert_rollouts_are_close_to_actuals(model, max_steps):
    tf_env = tf_py_environment.TFPyEnvironment(create_pendulum_environment(max_steps))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    test_trajectory = policy_evaluation(tf_env,
                                        collect_policy,
                                        num_episodes=1,
                                        max_buffer_capacity=200,
                                        use_function=True)

    start_state = test_trajectory.observation[0, 0, :]
    env_model = TFTimeLimit(
        EnvironmentModel(
            model,
            PendulumReward(tf_env.observation_spec(), tf_env.action_spec()),
            ConstantFalseTermination(tf_env.observation_spec()),
            DeterministicInitialStateModel(start_state),
            batch_size=30,
        ),
        max_steps + 1,
    )

    replayed_trajectories = replay_actions_across_batch_transition_models(
        env_model, test_trajectory.action[0])

    prediction_mean = tf.reduce_mean(replayed_trajectories.observation, axis=0)
    np.testing.assert_allclose(prediction_mean,
                               test_trajectory.observation[0],
                               atol=1e-1,
                               rtol=2e-1)
def test_tf_env_wrapper_is_reset_at_the_start_of_each_iteration(action_space):
    observations_array = [
        # First iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.LAST, StepType.MID],
        [StepType.FIRST, StepType.MID],
        [StepType.LAST, StepType.LAST],
        # Second iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.LAST],
        [StepType.MID, StepType.FIRST],
    ]
    observations = [tf.concat(ob_array, axis=0) for ob_array in observations_array]
    transition_model = TrajectoryOptimiserTransitionModel(action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(StepType.FIRST),
        batch_size=2,
    )
    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False,
                            validate_args=False)

    policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=3,
        population_size=2,
        max_iterations=2,
        policy_state_updater=policy_state_updater,
    )

    initial_time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST), axis=0),
                                batch_size=1)

    trajectory_optimiser.optimise(initial_time_step, environment_model)

    for stored_trajectories in policy_state_updater.step_types:
        np.testing.assert_equal(stored_trajectories[:, 0], np.array([0, 0]))
def test_trajectory_optimiser_pathological_trajectories(action_space, horizon, batch_size):
    """
    The replay buffer is a FIFO buffer of fixed capacity. Ensure that the capacity is
    sufficient such that the initial observation is still present in the buffer even in
    the pathological case where all trajectories are of length 2.
    """
    # construct the environment model
    observations = list(
        chain.from_iterable(
            repeat(
                [
                    replicate(tf.constant(StepType.FIRST), [batch_size]),
                    replicate(tf.constant(StepType.LAST), [batch_size]),
                ],
                horizon,
            )))
    transition_model = TrajectoryOptimiserTransitionModel(action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(StepType.FIRST),
        batch_size=batch_size,
    )
    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space)
    stub_policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon,
        population_size=batch_size,
        max_iterations=1,
        policy_state_updater=stub_policy_state_updater,
    )

    time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST), axis=0), batch_size=1)

    trajectory_optimiser.optimise(time_step, environment_model)

    stored_trajectory = stub_policy_state_updater.step_types[0]
    assert stored_trajectory[0][0] == StepType.FIRST
def test_decorate_policy_with_particles_action_shapes(
    observation_space, action_space, population_size, number_of_particles
):
    time_step_space = time_step_spec(observation_space)
    policy = RandomTFPolicy(time_step_space, action_space)
    decorated_policy = decorate_policy_with_particles(policy, number_of_particles)

    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(population_size * number_of_particles,)
    )
    initial_time_step = restart(observation,
                                batch_size=population_size * number_of_particles)

    policy_step = decorated_policy.action(initial_time_step)
    actions = policy_step.action

    assert actions.shape == [population_size * number_of_particles] + action_space.shape.dims
def __init__(self, agent, enviroment):
    self._replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=enviroment.batch_size,
        max_length=50000)

    self._random_policy = RandomTFPolicy(enviroment.time_step_spec(),
                                         enviroment.action_spec())

    self._fill_buffer(enviroment, self._random_policy, steps=100)

    self.dataset = self._replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=BATCH_SIZE,
        num_steps=2).prefetch(3)

    self.iterator = iter(self.dataset)
def __init__(self, agent, enviroment, batch_size):
    self._replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=enviroment.batch_size,
        max_length=50000)

    self._random_policy = RandomTFPolicy(enviroment.time_step_spec(),
                                         enviroment.action_spec())

    self._fill_buffer(enviroment, self._random_policy, steps=100)

    self.dataset = self._replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=2,
        single_deterministic_pass=False).prefetch(3)

    self.iterator = iter(self.dataset)
def test_planning_policy_batch_environment_model():
    """
    Ensure that planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
def _pendulum_training_data_fixture():
    max_steps = 50
    num_episodes = 80

    tf_env = tf_py_environment.TFPyEnvironment(create_pendulum_environment(max_steps))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    trajectories = policy_evaluation(
        tf_env,
        collect_policy,
        num_episodes=num_episodes,
        max_buffer_capacity=1000,
        use_function=True,
    )
    tf_env.close()

    return trajectories, tf_env
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from TF-Agents
    can be used to generate a rollout. This also serves as an example of how to construct
    "random shooting" rollouts from an environment model.

    The assertion in this test is that the selected action has the expected log_prob
    value, consistent with sampling from a uniform distribution. All this is really
    checking is that the preceding code has run successfully.
    """
    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )
    transition_observer = _RecordLastLogProbTransitionObserver()

    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability
    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())
    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load("MountainCar-v0"))

    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(), -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(
        EnvironmentModel(model, reward, terminates, initial_state_sampler), duration=200)

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()
    first_batch_step_type = trajectory.step_type[0, :]
    assert (first_batch_step_type[0] == StepType.FIRST
            and first_batch_step_type[-1] == StepType.LAST)
def generate_pendulum_trajectories(
    batch_size: int, max_steps: int
) -> Tuple[Trajectory, BoundedTensorSpec, BoundedTensorSpec]:
    """
    Utility function for generating batches of trajectories from the Pendulum-v0 gym
    environment.

    :param batch_size: Number of trajectories to generate
    :param max_steps: Length of trajectories

    :return: A tuple consisting of
        * A `Trajectory` object containing the batch of trajectories
        * The observation spec from the Pendulum-v0 environment
        * The action spec from the Pendulum-v0 environment
    """
    tf_env = tf_py_environment.TFPyEnvironment(
        BatchedPyEnvironment(
            [create_pendulum_environment(max_steps) for _ in range(batch_size)]))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    replay_buffer_capacity = 1000
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=batch_size,
        max_length=replay_buffer_capacity,
    )

    collect_episodes_per_iteration = 1
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[model_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    tf_env.close()

    training_data = model_training_buffer.gather_all()

    return training_data, tf_env.observation_spec(), tf_env.action_spec()
def test_tf_time_limit_wrapper_with_environment_model(observation_space, action_space,
                                                      trajectory_length):
    """
    This test checks that the environment wrapper can in turn be wrapped by the
    `TimeLimit` environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )

    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
def _mountain_car_data_fixture():
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load("MountainCar-v0"))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    replay_buffer_capacity = 5000
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity,
    )

    collect_episodes_per_iteration = 10
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[model_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    tf_env.close()

    return tf_env, model_training_buffer.gather_all()
def test_trajectory_optimiser_with_particles_actions_shape(action_space, horizon,
                                                           population_size,
                                                           number_of_particles):
    observation = create_uniform_distribution_from_spec(OBSERVATION_SPACE_SPEC).sample(
        sample_shape=(population_size * number_of_particles,))
    transition_model = TrajectoryOptimiserTransitionModel(action_space, repeat(observation))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = ConstantFalseTermination(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(StepType.FIRST),
        batch_size=population_size * number_of_particles,
    )
    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space, automatic_state_reset=False)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        max_iterations=2,
    )

    initial_time_step = restart(tf.expand_dims(observation[0], axis=0))

    optimal_actions = trajectory_optimiser.optimise(initial_time_step, environment_model)

    assert optimal_actions.shape == (horizon + 1,) + action_space.shape
def create_real_drivers(
    self,
    real_replay_buffer: ReplayBuffer,
    train_metrics: List[TFStepMetric],
) -> Tuple[Driver, Driver]:
    """
    Create the drivers for interacting with the real environment.

    This method creates two drivers: one uses the agent's "collect" policy, the other
    uses a uniform random policy.

    Note that the random policy is defined with the same `info_spec` as the agent's
    "collect" policy. The `info_spec` of the trajectories generated by the random policy
    must conform to the expectations of the agent when the data is used for training.
    """
    agent_collect_driver = TFDriver(
        self._environment,
        self._agent.collect_policy,
        observers=[real_replay_buffer.add_batch] + train_metrics,
        max_steps=self._max_steps,
        disable_tf_function=not self._use_tf_function,
    )
    random_policy = RandomTFPolicy(
        self._environment.time_step_spec(),
        self._environment.action_spec(),
        info_spec=self._agent.collect_policy.info_spec,
    )
    random_policy_collect_driver = TFDriver(
        self._environment,
        random_policy,
        observers=[real_replay_buffer.add_batch] + train_metrics,
        max_steps=self._max_steps,
        disable_tf_function=not self._use_tf_function,
    )

    return agent_collect_driver, random_policy_collect_driver
def random_shooting_trajectory_optimisation(
    time_step_spec: TimeStep,
    action_spec: types.NestedTensorSpec,
    horizon: int,
    population_size: int,
    number_of_particles: int,
) -> TrajectoryOptimiser:
    """
    Construct a trajectory optimiser which uses the random shooting method. This method
    relies on `RandomTFPolicy` as a uniformly random policy.

    :param time_step_spec: A `TimeStep` spec of the expected time_steps.
    :param action_spec: A nest of `BoundedTensorSpec` representing the actions.
    :param horizon: Number of steps taken in the environment in each virtual rollout.
    :param population_size: The number of candidate sequences of actions at each iteration.
    :param number_of_particles: Number of monte-carlo rollouts of each action trajectory.

    :return: A `TrajectoryOptimiser` object which uses the random shooting method.
    """
    policy = RandomTFPolicy(time_step_spec, action_spec)
    trajectory_optimiser = PolicyTrajectoryOptimiser(policy, horizon, population_size,
                                                     number_of_particles)
    return trajectory_optimiser
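# A minimal usage sketch of the factory above: the specs are simply taken from a
# TF-Agents environment. The environment name and the planner sizes are illustrative
# placeholders, not values prescribed by the function itself.
from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment

tf_env = TFPyEnvironment(suite_gym.load("MountainCarContinuous-v0"))
optimiser = random_shooting_trajectory_optimisation(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    horizon=10,
    population_size=100,
    number_of_particles=1,
)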
logging.getLogger().setLevel(logging.INFO)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

collect_driver = DynamicStepDriver(
    tf_env,                # Env to play with
    agent.collect_policy,  # Collect policy of the agent
    observers=[replay_buffer_observer] + train_metrics,  # pass to all observers
    num_steps=1)

# Speed up as tensorflow function
collect_driver.run = function(collect_driver.run)

initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())

init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(init_replay_buffer)],
    num_steps=init_replay_buffer)

final_time_step, final_policy_state = init_driver.run()

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

dataset = replay_buffer.as_dataset(
    sample_batch_size=dataset_sample_batch_size,
    num_steps=dataset_num_steps,
def breakout_v4(seed=42):
    env = suite_gym.load("Breakout-v4")
    env.seed(seed)
    env.reset()
    repeating_env = ActionRepeat(env, times=4)
    for name in dir(tf_agents.environments.wrappers):
        obj = getattr(tf_agents.environments.wrappers, name)
        if hasattr(obj, "__base__") and issubclass(
                obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):
            print("{:27s} {}".format(name, obj.__doc__.split("\n")[0]))

    limited_repeating_env = suite_gym.load(
        "Breakout-v4",
        gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],
        env_wrappers=[partial(ActionRepeat, times=4)],
    )

    max_episode_steps = 27000  # <=> 108k ALE frames since 1 step = 4 frames
    environment_name = "BreakoutNoFrameskip-v4"
    env = suite_atari.load(
        environment_name,
        max_episode_steps=max_episode_steps,
        gym_env_wrappers=[AtariPreprocessing, FrameStack4],
    )
    env.seed(42)
    env.reset()
    time_step = env.step(np.array(1))  # FIRE
    for _ in range(4):
        time_step = env.step(np.array(3))  # LEFT

    def plot_observation(obs):
        # Since there are only 3 color channels, you cannot display 4 frames
        # with one primary color per frame. So this code computes the delta between
        # the current frame and the mean of the other frames, and it adds this delta
        # to the red and blue channels to get a pink color for the current frame.
        obs = obs.astype(np.float32)
        img_ = obs[..., :3]
        current_frame_delta = np.maximum(obs[..., 3] - obs[..., :3].mean(axis=-1), 0.0)
        img_[..., 0] += current_frame_delta
        img_[..., 2] += current_frame_delta
        img_ = np.clip(img_ / 150, 0, 1)
        plt.imshow(img_)
        plt.axis("off")

    plt.figure(figsize=(6, 6))
    plot_observation(time_step.observation)
    plt.tight_layout()
    plt.savefig("./images/preprocessed_breakout_plot.png", format="png", dpi=300)
    plt.show()

    tf_env = TFPyEnvironment(env)

    preprocessing_layer = keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 255.0)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]
    q_net = QNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        preprocessing_layers=preprocessing_layer,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
    )

    # see TF-agents issue #113
    # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
    #                                      epsilon=0.00001, centered=True)
    train_step = tf.Variable(0)
    update_period = 4  # run a training step every 4 collect steps
    optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                    decay=0.95,
                                                    momentum=0.0,
                                                    epsilon=0.00001,
                                                    centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,            # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01,
    )  # final ε
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step),
    )
    agent.initialize()

    from tf_agents.replay_buffers import tf_uniform_replay_buffer

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000000)

    replay_buffer_observer = replay_buffer.add_batch

    class ShowProgress:
        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    from tf_agents.metrics import tf_metrics
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

from tf_agents.eval.metric_utils import log_metrics
import logging

logging.getLogger().setLevel(logging.INFO)
log_metrics(train_metrics)

from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer_observer] + train_metrics,
    num_steps=update_period,
)  # collect 4 steps for each training iteration

from tf_agents.policies.random_tf_policy import RandomTFPolicy

initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(20000)],
    num_steps=20000,
)  # <=> 80,000 ALE frames

final_time_step, final_policy_state = init_driver.run()
def test_all_mepo_variants_work(transition_model, trajectory_sampler, model_free_agent_type):
    """
    Mepo Agent has prespecified transition model, trajectory sampler and model-free agent
    types. Here we check that all combinations execute without errors.
    """
    # setup the environment and prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    # define agent; many transition model and trajectory optimiser parameters can
    # be arbitrary
    agent = MepoAgent(
        time_step_spec,
        action_spec,
        transition_model,
        1,
        10,
        tf.nn.relu,
        ensemble_size,
        False,
        1,
        1,
        [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler,
        horizon,
        population_size,
        model_free_agent_type,
        1,
        10,
        tf.nn.relu,
        2,
    )

    # we need some training data
    random_policy = RandomTFPolicy(
        time_step_spec,
        action_spec,
        info_spec=agent.collect_policy.info_spec,
    )
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        random_policy.trajectory_spec, batch_size=1, max_length=1000)
    collect_driver_random_policy = TFDriver(
        tf_env,
        random_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    initial_time_step = tf_env.reset()
    collect_driver_random_policy.run(initial_time_step)
    pets_agent_trainer = BackgroundPlanningAgentTrainer(10, 10)
    tf_training_scheduler = pets_agent_trainer.create_training_scheduler(
        agent, model_training_buffer)
    training_losses = tf_training_scheduler.maybe_train(tf.constant(10, dtype=tf.int64))
    assert EnvironmentModelComponents.TRANSITION in training_losses

    # test the agent
    collect_driver_planning_policy = TFDriver(
        tf_env,
        agent.collect_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
class TrainDDQN():
    """Wrapper for DDQN training, validation, saving etc."""

    def __init__(self, episodes: int, warmup_steps: int, learning_rate: float,
                 gamma: float, min_epsilon: float, decay_episodes: int,
                 model_path: str = None, log_dir: str = None, batch_size: int = 64,
                 memory_length: int = None, collect_steps_per_episode: int = 1,
                 val_every: int = None, target_update_period: int = 1,
                 target_update_tau: float = 1.0, progressbar: bool = True,
                 n_step_update: int = 1, gradient_clipping: float = 1.0,
                 collect_every: int = 1) -> None:
        """
        Wrapper to make training easier.
        Code is partly based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial

        :param episodes: Number of training episodes
        :type episodes: int
        :param warmup_steps: Number of episodes to fill Replay Buffer with random state-action pairs before training starts
        :type warmup_steps: int
        :param learning_rate: Learning Rate for the Adam Optimizer
        :type learning_rate: float
        :param gamma: Discount factor for the Q-values
        :type gamma: float
        :param min_epsilon: Lowest and final value for epsilon
        :type min_epsilon: float
        :param decay_episodes: Amount of episodes to decay from 1 to `min_epsilon`
        :type decay_episodes: int
        :param model_path: Location to save the trained model
        :type model_path: str
        :param log_dir: Location to save the logs, useful for TensorBoard
        :type log_dir: str
        :param batch_size: Number of samples in minibatch to train on each step
        :type batch_size: int
        :param memory_length: Maximum size of the Replay Buffer
        :type memory_length: int
        :param collect_steps_per_episode: Amount of data to collect for Replay Buffer each episode
        :type collect_steps_per_episode: int
        :param collect_every: Step interval to collect data during training
        :type collect_every: int
        :param val_every: Validate the model every X episodes using the `collect_metrics()` function
        :type val_every: int
        :param target_update_period: Update the target Q-network every X episodes
        :type target_update_period: int
        :param target_update_tau: Parameter for softening the `target_update_period`
        :type target_update_tau: float
        :param progressbar: Enable or disable the progressbar for collecting data and training
        :type progressbar: bool

        :return: None
        :rtype: NoneType
        """
        self.episodes = episodes  # Total episodes
        self.warmup_steps = warmup_steps  # Amount of warmup steps before training
        self.batch_size = batch_size  # Batch size of Replay Memory
        self.collect_steps_per_episode = collect_steps_per_episode  # Amount of steps to collect data each episode
        self.collect_every = collect_every  # Step interval to collect data during training
        self.learning_rate = learning_rate  # Learning Rate
        self.gamma = gamma  # Discount factor
        self.min_epsilon = min_epsilon  # Minimal chance of choosing random action
        self.decay_episodes = decay_episodes  # Number of episodes to decay from 1.0 to `EPSILON`
        self.target_update_period = target_update_period  # Period for soft updates
        self.target_update_tau = target_update_tau
        self.progressbar = progressbar  # Enable or disable the progressbar for collecting data and training
        self.n_step_update = n_step_update
        self.gradient_clipping = gradient_clipping  # Clip the loss
        self.compiled = False
        NOW = datetime.now().strftime("%Y%m%d_%H%M%S")

        if memory_length is not None:
            self.memory_length = memory_length  # Max Replay Memory length
        else:
            self.memory_length = warmup_steps

        if val_every is not None:
            self.val_every = val_every  # Validate the policy every `val_every` episodes
        else:
            # Can't validate the model 50 times if self.episodes < 50
            self.val_every = self.episodes // min(50, self.episodes)

        if model_path is not None:
            self.model_path = model_path
        else:
            self.model_path = "./models/" + NOW + ".pkl"

        if log_dir is None:
            log_dir = "./logs/" + NOW
        self.writer = tf.summary.create_file_writer(log_dir)

    def compile_model(self, X_train, y_train, layers: list = [], imb_ratio: float = None,
                      loss_fn=common.element_wise_squared_loss) -> None:
        """Initializes the neural networks, DDQN-agent, collect policies and replay buffer.

        :param X_train: Training data for the model.
        :type X_train: np.ndarray
        :param y_train: Labels corresponding to `X_train`. 1 for the positive class, 0 for the negative class.
        :param y_train: np.ndarray
        :param layers: List of layers to feed into the TF-agents custom Sequential(!) layer.
        :type layers: list
        :param imb_ratio: The imbalance ratio of the data.
        :type imb_ratio: float
        :param loss_fn: Callable loss function
        :type loss_fn: tf.compat.v1.losses

        :return: None
        :rtype: NoneType
        """
        if imb_ratio is None:
            imb_ratio = imbalance_ratio(y_train)

        self.train_env = TFPyEnvironment(ClassifierEnv(X_train, y_train, imb_ratio))
        self.global_episode = tf.Variable(0, name="global_episode", dtype=np.int64,
                                          trainable=False)  # Global train episode counter

        # Custom epsilon decay: https://github.com/tensorflow/agents/issues/339
        epsilon_decay = tf.compat.v1.train.polynomial_decay(
            1.0, self.global_episode, self.decay_episodes,
            end_learning_rate=self.min_epsilon)

        self.q_net = Sequential(layers, self.train_env.observation_spec())

        self.agent = DdqnAgent(self.train_env.time_step_spec(),
                               self.train_env.action_spec(),
                               q_network=self.q_net,
                               optimizer=Adam(learning_rate=self.learning_rate),
                               td_errors_loss_fn=loss_fn,
                               train_step_counter=self.global_episode,
                               target_update_period=self.target_update_period,
                               target_update_tau=self.target_update_tau,
                               gamma=self.gamma,
                               epsilon_greedy=epsilon_decay,
                               n_step_update=self.n_step_update,
                               gradient_clipping=self.gradient_clipping)
        self.agent.initialize()

        self.random_policy = RandomTFPolicy(self.train_env.time_step_spec(),
                                            self.train_env.action_spec())

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.memory_length)

        self.warmup_driver = DynamicStepDriver(
            self.train_env,
            self.random_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.warmup_steps)  # Uses a random policy

        self.collect_driver = DynamicStepDriver(
            self.train_env,
            self.agent.collect_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.collect_steps_per_episode)  # Uses the epsilon-greedy policy of the agent

        self.agent.train = common.function(self.agent.train)  # Optimization
        self.warmup_driver.run = common.function(self.warmup_driver.run)
        self.collect_driver.run = common.function(self.collect_driver.run)

        self.compiled = True

    def train(self, *args) -> None:
        """Starts the training of the model. Includes warmup period, metrics collection and model saving.

        :param *args: All arguments will be passed to `collect_metrics()`.
            This can be useful to pass callables, testing environments or validation data.
            Overwrite the TrainDDQN.collect_metrics() function to use your own *args.
        :type *args: Any

        :return: None
        :rtype: NoneType, last step is saving the model as a side-effect
        """
        assert self.compiled, "Model must be compiled with model.compile_model(X_train, y_train, layers) before training."

        # Warmup period, fill memory with random actions
        if self.progressbar:
            print(f"\033[92mCollecting data for {self.warmup_steps:_} steps... This might take a few minutes...\033[0m")

        self.warmup_driver.run(
            time_step=None,
            policy_state=self.random_policy.get_initial_state(self.train_env.batch_size))

        if self.progressbar:
            print(f"\033[92m{self.replay_buffer.num_frames():_} frames collected!\033[0m")

        dataset = self.replay_buffer.as_dataset(
            sample_batch_size=self.batch_size,
            num_steps=self.n_step_update + 1,
            num_parallel_calls=data.experimental.AUTOTUNE).prefetch(
                data.experimental.AUTOTUNE)
        iterator = iter(dataset)

        def _train():
            experiences, _ = next(iterator)
            return self.agent.train(experiences).loss

        _train = common.function(_train)  # Optimization

        ts = None
        policy_state = self.agent.collect_policy.get_initial_state(
            self.train_env.batch_size)

        self.collect_metrics(*args)  # Initial collection for step 0
        pbar = tqdm(total=self.episodes, disable=(not self.progressbar),
                    desc="Training the DDQN")  # TQDM progressbar
        for _ in range(self.episodes):
            if not self.global_episode % self.collect_every:
                # Collect a few steps using collect_policy and save to `replay_buffer`
                if self.collect_steps_per_episode != 0:
                    ts, policy_state = self.collect_driver.run(time_step=ts,
                                                               policy_state=policy_state)
                pbar.update(self.collect_every)  # More stable TQDM updates, collecting could take some time

            # Sample a batch of data from `replay_buffer` and update the agent's network
            train_loss = _train()

            if not self.global_episode % self.val_every:
                with self.writer.as_default():
                    tf.summary.scalar("train_loss", train_loss, step=self.global_episode)

                self.collect_metrics(*args)
        pbar.close()

    def collect_metrics(self, X_val: np.ndarray, y_val: np.ndarray, save_best: str = None):
        """Collects metrics using the trained Q-network.

        :param X_val: Features of validation data, same shape as X_train
        :type X_val: np.ndarray
        :param y_val: Labels of validation data, same shape as y_train
        :type y_val: np.ndarray
        :param save_best: Saving the best model of all validation runs based on given metric:
            Choose one of: {Gmean, F1, Precision, Recall, TP, TN, FP, FN}
            This improves stability since the model at the last episode is not guaranteed to be the best model.
        :type save_best: str
        """
        y_pred = network_predictions(self.agent._target_q_network, X_val)
        stats = classification_metrics(y_val, y_pred)
        avgQ = np.mean(decision_function(self.agent._target_q_network, X_val))  # Max action for each x in X

        if save_best is not None:
            if not hasattr(self, "best_score"):  # If no best model yet
                self.best_score = 0.0

            if stats.get(save_best) >= self.best_score:  # Overwrite best model
                self.save_network()  # Saving directly to avoid shallow copy without trained weights
                self.best_score = stats.get(save_best)

        with self.writer.as_default():
            tf.summary.scalar("AverageQ", avgQ, step=self.global_episode)  # Average Q-value for this epoch
            for k, v in stats.items():
                tf.summary.scalar(k, v, step=self.global_episode)

    def evaluate(self, X_test, y_test, X_train=None, y_train=None):
        """
        Final evaluation of trained Q-network with X_test and y_test.
        Optional PR and ROC curve comparison to X_train, y_train to ensure no
        overfitting is taking place.

        :param X_test: Features of test data, same shape as X_train
        :type X_test: np.ndarray
        :param y_test: Labels of test data, same shape as y_train
        :type y_test: np.ndarray
        :param X_train: Features of train data
        :type X_train: np.ndarray
        :param y_train: Labels of train data
        :type y_train: np.ndarray
        """
        if hasattr(self, "best_score"):
            print(f"\033[92mBest score: {self.best_score:6f}!\033[0m")
            network = self.load_network(self.model_path)  # Load best saved model
        else:
            network = self.agent._target_q_network  # Load latest target model

        if (X_train is not None) and (y_train is not None):
            plot_pr_curve(network, X_test, y_test, X_train, y_train)
            plot_roc_curve(network, X_test, y_test, X_train, y_train)

        y_pred = network_predictions(network, X_test)
        return classification_metrics(y_test, y_pred)

    def save_network(self):
        """Saves Q-network as pickle to `model_path`."""
        with open(self.model_path, "wb") as f:  # Save Q-network as pickle
            pickle.dump(self.agent._target_q_network, f)

    @staticmethod
    def load_network(fp: str):
        """Static method to load Q-network pickle from given filepath.

        :param fp: Filepath to the saved pickle of the network
        :type fp: str

        :returns: The network-object loaded from a pickle file.
        :rtype: tensorflow.keras.models.Model
        """
        with open(fp, "rb") as f:  # Load the Q-network
            network = pickle.load(f)

        return network
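# A hypothetical end-to-end sketch of how the class above is meant to be driven.
# The data arrays (X_train, y_train, X_val, y_val, X_test, y_test), the layer sizes and
# all hyperparameter values are placeholders, not values from the original project.
layers = [tf.keras.layers.Dense(64, activation="relu"),
          tf.keras.layers.Dense(2)]  # assumed: one Q-value per class

model = TrainDDQN(episodes=2_000, warmup_steps=1_000, learning_rate=1e-3,
                  gamma=0.99, min_epsilon=0.05, decay_episodes=500)
model.compile_model(X_train, y_train, layers)
model.train(X_val, y_val)            # extra args are forwarded to collect_metrics()
print(model.evaluate(X_test, y_test))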
def create_random_policy(env: TFEnvironment):
    return RandomTFPolicy(action_spec=env.action_spec(),
                          time_step_spec=env.time_step_spec())
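# A small usage sketch for the helper above, assuming any TF-Agents TFEnvironment;
# the CartPole environment here is only an illustrative choice.
from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment

env = TFPyEnvironment(suite_gym.load("CartPole-v0"))
policy = create_random_policy(env)

time_step = env.reset()
action_step = policy.action(time_step)        # sample a uniformly random action
next_time_step = env.step(action_step.action)
print(next_time_step.reward.numpy())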
def collect_step(environment, policy):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)


# execute the random policy in the environment for a few steps
# and record the data (observations, actions, rewards etc) in the replay buffer
print("Collecting initial random steps")
random_policy = RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                   sample_batch_size=batch_size,
                                   num_steps=2).prefetch(3)
iterator = iter(dataset)

# train the agent
print("Training the agent")
tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 255.)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]
    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(lr=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε
    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] + train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer, ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=eval_tf_env.batch_size,
                                              buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")