def testPPOLearnerRunTPU(self):
  if tf.config.list_logical_devices('TPU'):
    tpu_strategy = self._get_tpu_strategy()
  else:
    logging.info('TPU hardware is not available, TPU strategy test skipped.')
    return

  batch_size = 1
  minibatch_size = 5
  num_epochs = 3
  n_time_steps = 10
  num_replicas = tpu_strategy.num_replicas_in_sync

  # Create a dataset with `num_collected_episodes` elements, each a sequence
  # of `n_time_steps` steps. This simulates a Reverb dataset.
  num_collected_episodes = 20
  traj = ppo_learner_test_utils.create_trajectories(
      n_time_steps=n_time_steps, batch_size=batch_size)
  info = ()

  def dataset_fn():
    return tf.data.Dataset.from_tensors(
        (traj, info),).repeat(num_collected_episodes)

  with tpu_strategy.scope():
    print('Number of devices for the strategy: {}'.format(
        tpu_strategy.num_replicas_in_sync))
    fake_agent = ppo_learner_test_utils.FakePPOAgent(tpu_strategy)
    learner = ppo_learner.PPOLearner(
        root_dir=FLAGS.test_tmpdir,
        train_step=tf.Variable(0, dtype=tf.int32),
        agent=fake_agent,
        experience_dataset_fn=dataset_fn,
        normalization_dataset_fn=dataset_fn,
        num_batches=num_collected_episodes,
        num_epochs=num_epochs,
        minibatch_size=minibatch_size,
        # Disable shuffling to have deterministic input into agent.train.
        shuffle_buffer_size=1,
        triggers=None,
        strategy=tpu_strategy)

  learner.run()

  # Check that the fake agent was called the expected number of times.
  num_train_frames = (
      num_collected_episodes * batch_size * n_time_steps * num_epochs)
  num_minibatches = num_train_frames / minibatch_size
  num_minibatches_per_replica = int(num_minibatches / num_replicas)
  self.assertEqual(fake_agent.train_called_times.numpy(),
                   num_minibatches_per_replica)

  # Check that the fake agent is called the expected number of times again on
  # a second learner.run().
  fake_agent.reset()
  learner.run()
  self.assertEqual(fake_agent.train_called_times.numpy(),
                   num_minibatches_per_replica)
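# The TPU test above calls a `_get_tpu_strategy` helper that is not shown in
# this section. Below is a minimal sketch of such a helper, assuming the
# standard TensorFlow TPU initialization flow; the actual helper in the
# original test module may differ.
def _get_tpu_strategy(self):
  # Resolve and initialize the TPU system, then build a TPUStrategy over it.
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  return tf.distribute.TPUStrategy(resolver)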
def test_one_element_dataset(self, num_epochs, num_parallel_environments,
                             minibatch_size, expected_train_times):
  # Create a dataset with one element that is a length 100 sequence. This
  # simulates a Reverb dataset if only one sequence was collected.
  traj = _create_trajectories(
      n_time_steps=100, batch_size=num_parallel_environments)
  info = ()
  dataset_fn = lambda: tf.data.Dataset.from_tensors((traj, info),)

  fake_agent = FakePPOAgent()

  learner = ppo_learner.PPOLearner(
      root_dir=FLAGS.test_tmpdir,
      train_step=tf.Variable(0, dtype=tf.int32),
      agent=fake_agent,
      experience_dataset_fn=dataset_fn,
      normalization_dataset_fn=dataset_fn,
      num_batches=1,
      num_epochs=num_epochs,
      minibatch_size=minibatch_size,
      # Disable shuffling to have deterministic input into agent.train.
      shuffle_buffer_size=1,
      triggers=None)
  learner.run()

  # Check that the fake agent was called the expected number of times.
  self.assertEqual(fake_agent.train_called_times, expected_train_times)

  # Check that agent.train() is receiving the expected trajectories.
  if minibatch_size:
    concated_traj = _concat_and_flatten(traj, multiplier=num_epochs)
    for i in range(expected_train_times):
      expected_traj = _get_expected_minibatch(
          concated_traj, minibatch_size, current_iteration=i)
      received_traj = fake_agent.experiences[i]
      tf.nest.map_structure(self.assertAllClose, received_traj, expected_traj)
  else:
    for i in range(num_epochs):
      expected_traj = traj
      received_traj = fake_agent.experiences[i]
      tf.nest.map_structure(self.assertAllClose, received_traj, expected_traj)
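# The test above takes (num_epochs, num_parallel_environments, minibatch_size,
# expected_train_times) as arguments, so it is presumably driven by an absl
# `parameterized.named_parameters` decorator. The tuples below are a hedged,
# illustrative sketch of such parameter sets, not the original ones: when a
# minibatch size is given, expected_train_times is
# 100 * num_parallel_environments * num_epochs / minibatch_size, and when it
# is None the agent trains once per epoch.
_ONE_ELEMENT_TEST_CASES = (
    # (name, num_epochs, num_parallel_environments, minibatch_size,
    #  expected_train_times)
    ('OneEpochMinibatch50', 1, 2, 50, 4),
    ('TwoEpochsMinibatch50', 2, 2, 50, 8),
    ('TwoEpochsNoMinibatch', 2, 2, None, 2),
)
# These could feed the test via
# @parameterized.named_parameters(*_ONE_ELEMENT_TEST_CASES).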
def test_parallel_iterations_run(self, parallel_iterations):
  num_episodes = 3

  # Create a dataset with three elements. Each element represents a collected
  # episode of length 40.
  get_shape = lambda x: x.shape
  get_dtype = lambda x: tf.as_dtype(x.dtype)
  traj = _create_trajectories(n_time_steps=40, batch_size=1)
  unused_info = ()
  shapes = tf.nest.map_structure(get_shape, (traj, unused_info))
  dtypes = tf.nest.map_structure(get_dtype, (traj, unused_info))

  def generate_data():
    for _ in range(num_episodes):
      yield (traj, unused_info)

  def dataset_fn():
    return tf.data.Dataset.from_generator(
        generate_data,
        dtypes,
        output_shapes=shapes,
    )

  fake_agent = FakePPOAgent()

  learner = ppo_learner.PPOLearner(
      root_dir=FLAGS.test_tmpdir,
      train_step=tf.Variable(0, dtype=tf.int32),
      agent=fake_agent,
      experience_dataset_fn=dataset_fn,
      normalization_dataset_fn=dataset_fn,
      num_batches=num_episodes,
      num_epochs=4,
      minibatch_size=10,
      # Disable shuffling to have deterministic input into agent.train.
      shuffle_buffer_size=1,
      triggers=None)
  loss = learner.run(parallel_iterations=parallel_iterations)

  # Check that the fake agent was called the expected number of times:
  # 3 episodes * 40 steps * 4 epochs / minibatch size of 10 = 48 train calls.
  self.assertEqual(fake_agent.train_called_times, 48)
  self.assertAllEqual(loss, (0.0, 0.0))
def test_multi_element_dataset_minibatch(self, num_epochs,
                                         num_parallel_environments,
                                         minibatch_size,
                                         expected_train_times):
  num_episodes = 3

  # Create a dataset with three elements. Each element represents a collected
  # episode of length 40.
  get_shape = lambda x: x.shape
  get_dtype = lambda x: tf.as_dtype(x.dtype)
  traj = _create_trajectories(
      n_time_steps=40, batch_size=num_parallel_environments)
  unused_info = ()
  shapes = tf.nest.map_structure(get_shape, (traj, unused_info))
  dtypes = tf.nest.map_structure(get_dtype, (traj, unused_info))

  def generate_data():
    for _ in range(num_episodes):
      yield (traj, unused_info)

  def dataset_fn():
    return tf.data.Dataset.from_generator(
        generate_data,
        dtypes,
        output_shapes=shapes,
    )

  fake_agent = FakePPOAgent()

  learner = ppo_learner.PPOLearner(
      root_dir=FLAGS.test_tmpdir,
      train_step=tf.Variable(0, dtype=tf.int32),
      agent=fake_agent,
      experience_dataset_fn=dataset_fn,
      normalization_dataset_fn=dataset_fn,
      num_batches=num_episodes,
      num_epochs=num_epochs,
      minibatch_size=minibatch_size,
      # Disable shuffling to have deterministic input into agent.train.
      shuffle_buffer_size=1,
      triggers=None)
  learner.run()

  # Check that the fake agent was called the expected number of times.
  self.assertEqual(fake_agent.train_called_times, expected_train_times)

  # Check that agent.train() is receiving the expected trajectories.
  if minibatch_size:
    concated_traj = _concat_and_flatten(
        traj, multiplier=num_episodes * num_epochs)
    for i in range(expected_train_times):
      expected_traj = _get_expected_minibatch(
          concated_traj, minibatch_size, current_iteration=i)
      received_traj = fake_agent.experiences[i]
      tf.nest.map_structure(self.assertAllClose, received_traj, expected_traj)
  else:
    for i in range(expected_train_times):
      expected_traj = traj
      received_traj = fake_agent.experiences[i]
      tf.nest.map_structure(self.assertAllClose, received_traj, expected_traj)
def train_eval(
    root_dir,
    env_name='HalfCheetah-v2',
    # Training params
    num_iterations=1600,
    actor_fc_layers=(64, 64),
    value_fc_layers=(64, 64),
    learning_rate=3e-4,
    collect_sequence_length=2048,
    minibatch_size=64,
    num_epochs=10,
    # Agent params
    importance_ratio_clipping=0.2,
    lambda_value=0.95,
    discount_factor=0.99,
    entropy_regularization=0.,
    value_pred_loss_coef=0.5,
    use_gae=True,
    use_td_lambda_return=True,
    gradient_clipping=0.5,
    value_clipping=None,
    # Replay params
    reverb_port=None,
    replay_capacity=10000,
    # Others
    policy_save_interval=5000,
    summary_interval=1000,
    eval_interval=10000,
    eval_episodes=100,
    debug_summaries=False,
    summarize_grads_and_vars=False):
  """Trains and evaluates PPO (Importance Ratio Clipping).

  Args:
    root_dir: Main directory path where checkpoints, saved_models, and
      summaries will be written to.
    env_name: Name of the Mujoco environment to load.
    num_iterations: The number of iterations to perform collection and
      training.
    actor_fc_layers: List of fully_connected parameters for the actor network,
      where each item is the number of units in the layer.
    value_fc_layers: List of fully_connected parameters for the value network,
      where each item is the number of units in the layer.
    learning_rate: Learning rate used on the Adam optimizer.
    collect_sequence_length: Number of steps to take in each collect run.
    minibatch_size: Number of elements in each mini batch. If `None`, the
      entire collected sequence will be treated as one batch.
    num_epochs: Number of iterations to repeat over all collected data per
      data collection step. (Schulman, 2017) sets this to 10 for Mujoco, 15
      for Roboschool and 3 for Atari.
    importance_ratio_clipping: Epsilon in the clipped, surrogate PPO objective.
      For more detail, see explanation at the top of the doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation. Defaults to
      `0.99`, which is the value used for all environments from (Schulman,
      2017).
    entropy_regularization: Coefficient for the entropy regularization loss
      term. Defaults to `0.0` because no entropy bonus was used in (Schulman,
      2017).
    value_pred_loss_coef: Multiplier for the value prediction loss, to balance
      it against the policy gradient loss. Defaults to `0.5`, which was used
      for all environments in the OpenAI baseline implementation. This
      parameter is irrelevant unless you are sharing part of actor_net and
      value_net. In that case, you would want to tune this coefficient, whose
      value depends on the network architecture of your choice.
    use_gae: If True, uses generalized advantage estimation for computing
      per-timestep advantage. Else, just subtracts value predictions from the
      empirical return.
    use_td_lambda_return: If True, uses td_lambda_return for training the
      value function; here:
      `td_lambda_return = gae_advantage + value_predictions`. `use_gae` must
      be set to `True` as well to enable TD-lambda returns. If
      `use_td_lambda_return` is set to True while `use_gae` is False, the
      empirical return will be used and a warning will be logged.
    gradient_clipping: Norm length to clip gradients.
    value_clipping: Difference between new and old value predictions is
      clipped to this threshold. Value clipping could be helpful when training
      very deep networks. Default: no clipping.
    reverb_port: Port for the reverb server. If None, a randomly chosen unused
      port is used.
    replay_capacity: The maximum number of elements for the replay buffer.
      Items will be wasted if this is smaller than collect_sequence_length.
    policy_save_interval: How often, in train_steps, the policy will be saved.
    summary_interval: How often to write data into Tensorboard.
    eval_interval: How often to run evaluation, in train_steps.
    eval_episodes: Number of episodes to evaluate over.
    debug_summaries: Boolean for whether to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.
  """
  collect_env = suite_mujoco.load(env_name)
  eval_env = suite_mujoco.load(env_name)
  num_environments = 1

  observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
      spec_utils.get_tensor_specs(collect_env))

  # TODO(b/172267869): Remove this conversion once TensorNormalizer stops
  # converting float64 inputs to float32.
  observation_tensor_spec = tf.TensorSpec(
      dtype=tf.float32, shape=observation_tensor_spec.shape)

  train_step = train_utils.create_train_step()

  actor_net_builder = ppo_actor_network.PPOActorNetwork()
  actor_net = actor_net_builder.create_sequential_actor_net(
      actor_fc_layers, action_tensor_spec)
  value_net = value_network.ValueNetwork(
      observation_tensor_spec,
      fc_layer_params=value_fc_layers,
      kernel_initializer=tf.keras.initializers.Orthogonal())

  current_iteration = tf.Variable(0, dtype=tf.int64)

  def learning_rate_fn():
    # Linearly decay the learning rate.
    return learning_rate * (1 - current_iteration / num_iterations)

  agent = ppo_clip_agent.PPOClipAgent(
      time_step_tensor_spec,
      action_tensor_spec,
      optimizer=tf.keras.optimizers.Adam(
          learning_rate=learning_rate_fn, epsilon=1e-5),
      actor_net=actor_net,
      value_net=value_net,
      importance_ratio_clipping=importance_ratio_clipping,
      lambda_value=lambda_value,
      discount_factor=discount_factor,
      entropy_regularization=entropy_regularization,
      value_pred_loss_coef=value_pred_loss_coef,
      # This is a legacy argument for the number of times we repeat the data
      # inside of the train function, incompatible with mini batch learning.
      # We set the epoch number from the replay buffer and tf.Data instead.
      num_epochs=1,
      use_gae=use_gae,
      use_td_lambda_return=use_td_lambda_return,
      gradient_clipping=gradient_clipping,
      value_clipping=value_clipping,
      # TODO(b/150244758): Default compute_value_and_advantage_in_train to
      # False after Reverb open source.
      compute_value_and_advantage_in_train=False,
      # Skips updating normalizers in the agent, as it's handled in the
      # learner.
      update_normalizers_in_train=False,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step)
  agent.initialize()

  reverb_server = reverb.Server(
      [
          reverb.Table(  # Replay buffer storing experience for training.
              name='training_table',
              sampler=reverb.selectors.Fifo(),
              remover=reverb.selectors.Fifo(),
              rate_limiter=reverb.rate_limiters.MinSize(1),
              max_size=replay_capacity,
              max_times_sampled=1,
          ),
          reverb.Table(  # Replay buffer storing experience for normalization.
              name='normalization_table',
              sampler=reverb.selectors.Fifo(),
              remover=reverb.selectors.Fifo(),
              rate_limiter=reverb.rate_limiters.MinSize(1),
              max_size=replay_capacity,
              max_times_sampled=1,
          )
      ],
      port=reverb_port)

  # Create the replay buffers.
  reverb_replay_train = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=collect_sequence_length,
      table_name='training_table',
      server_address='localhost:{}'.format(reverb_server.port),
      # The only collected sequence is used to populate the batches.
      max_cycle_length=1,
      rate_limiter_timeout_ms=1000)
  reverb_replay_normalization = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=collect_sequence_length,
      table_name='normalization_table',
      server_address='localhost:{}'.format(reverb_server.port),
      # The only collected sequence is used to populate the batches.
      max_cycle_length=1,
      rate_limiter_timeout_ms=1000)

  rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
      reverb_replay_train.py_client,
      ['training_table', 'normalization_table'],
      sequence_length=collect_sequence_length,
      stride_length=collect_sequence_length)

  saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
  collect_env_step_metric = py_metrics.EnvironmentSteps()
  learning_triggers = [
      triggers.PolicySavedModelTrigger(
          saved_model_dir,
          agent,
          train_step,
          interval=policy_save_interval,
          metadata_metrics={
              triggers.ENV_STEP_METADATA_KEY: collect_env_step_metric
          }),
      triggers.StepPerSecondLogTrigger(train_step, interval=summary_interval),
  ]

  def training_dataset_fn():
    return reverb_replay_train.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)

  def normalization_dataset_fn():
    return reverb_replay_normalization.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)

  agent_learner = ppo_learner.PPOLearner(
      root_dir,
      train_step,
      agent,
      experience_dataset_fn=training_dataset_fn,
      normalization_dataset_fn=normalization_dataset_fn,
      num_samples=1,
      num_epochs=num_epochs,
      minibatch_size=minibatch_size,
      shuffle_buffer_size=collect_sequence_length,
      triggers=learning_triggers)

  tf_collect_policy = agent.collect_policy
  collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
      tf_collect_policy, use_tf_function=True)

  collect_actor = actor.Actor(
      collect_env,
      collect_policy,
      train_step,
      steps_per_run=collect_sequence_length,
      observers=[rb_observer],
      metrics=actor.collect_metrics(buffer_size=10) +
      [collect_env_step_metric],
      reference_metrics=[collect_env_step_metric],
      summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
      summary_interval=summary_interval)

  eval_greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
      agent.policy, use_tf_function=True)

  if eval_interval:
    logging.info('Initial evaluation.')
    eval_actor = actor.Actor(
        eval_env,
        eval_greedy_policy,
        train_step,
        metrics=actor.eval_metrics(eval_episodes),
        reference_metrics=[collect_env_step_metric],
        summary_dir=os.path.join(root_dir, 'eval'),
        episodes_per_run=eval_episodes)

    eval_actor.run_and_log()

  logging.info('Training on %s', env_name)
  last_eval_step = 0
  for i in range(num_iterations):
    collect_actor.run()
    rb_observer.flush()
    agent_learner.run()
    reverb_replay_train.clear()
    reverb_replay_normalization.clear()
    current_iteration.assign_add(1)

    # Eval only if `eval_interval` has been set. Then, eval if the current
    # train step is equal to or greater than `last_eval_step` +
    # `eval_interval`, or if this is the last iteration. This logic exists
    # because agent_learner.run() does not return after every train step.
    if (eval_interval and
        (agent_learner.train_step_numpy >= eval_interval + last_eval_step or
         i == num_iterations - 1)):
      logging.info('Evaluating.')
      eval_actor.run_and_log()
      last_eval_step = agent_learner.train_step_numpy

  rb_observer.close()
  reverb_server.stop()
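# train_eval above is typically wired up as an absl app entry point. The
# sketch below assumes root_dir and env_name are exposed as flags; the flag
# names and defaults are illustrative and may not match the original binary.
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('root_dir', None,
                    'Root directory for checkpoints, summaries, and policies.')
flags.DEFINE_string('env_name', 'HalfCheetah-v2', 'Mujoco environment name.')


def main(_):
  train_eval(FLAGS.root_dir, env_name=FLAGS.env_name)


if __name__ == '__main__':
  flags.mark_flag_as_required('root_dir')
  app.run(main)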