Example 1
    def test_multi_element_dataset_minibatch(self, num_epochs,
                                             num_parallel_environments,
                                             minibatch_size,
                                             expected_train_times):
        num_episodes = 3
        # Create a dataset with three elements. Each element represents a collected
        # episode of length 40.
        get_shape = lambda x: x.shape
        get_dtype = lambda x: tf.as_dtype(x.dtype)
        traj = _create_trajectories(n_time_steps=40,
                                    batch_size=num_parallel_environments)
        unused_info = ()
        shapes = tf.nest.map_structure(get_shape, (traj, unused_info))
        dtypes = tf.nest.map_structure(get_dtype, (traj, unused_info))

        def generate_data():
            for _ in range(num_episodes):
                yield (traj, unused_info)

        dataset = tf.data.Dataset.from_generator(
            generate_data,
            dtypes,
            output_shapes=shapes,
        )

        fake_agent = FakePPOAgent()

        learner = ppo_learner.PPOLearner(
            root_dir=FLAGS.test_tmpdir,
            train_step=tf.Variable(0, dtype=tf.int32),
            agent=fake_agent,
            minibatch_size=minibatch_size,
            # Disable shuffling to have deterministic input into agent.train.
            shuffle_buffer_size=1,
            triggers=None)
        learner.run(iterations=num_epochs, dataset=dataset)

        # Check that fake agent was called the expected number of times.
        self.assertEqual(fake_agent.train_called_times.numpy(),
                         expected_train_times)

        # Check that agent.train() is receiving the expected trajectories.
        if minibatch_size:
            concated_traj = _concat_and_flatten(traj,
                                                multiplier=num_episodes *
                                                num_epochs)

            for i in range(expected_train_times):
                expected_traj = _get_expected_minibatch(concated_traj,
                                                        minibatch_size,
                                                        current_iteration=i)
                received_traj = fake_agent.experiences[i]
                tf.nest.map_structure(self.assertAllClose, received_traj,
                                      expected_traj)
        else:
            for i in range(expected_train_times):
                expected_traj = traj
                received_traj = fake_agent.experiences[i]
                tf.nest.map_structure(self.assertAllClose, received_traj,
                                      expected_traj)
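The helpers `_create_trajectories`, `_concat_and_flatten`, and `_get_expected_minibatch` are test utilities that are not reproduced here. As a rough, hypothetical sketch of the slicing the assertions rely on (the name and shapes below are assumptions, not the actual utilities), the expected i-th minibatch is a contiguous slice of the flattened, repeated trajectory:

    import tensorflow as tf

    # Hypothetical sketch: slice the i-th contiguous minibatch out of a trajectory
    # whose batch and time dimensions have already been flattened into one axis.
    def _get_expected_minibatch_sketch(flat_traj, minibatch_size, current_iteration):
        start = current_iteration * minibatch_size
        return tf.nest.map_structure(
            lambda t: t[start:start + minibatch_size], flat_traj)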
Example 2
    def testPPOLearnerRunTPU(self):
        if tf.config.list_logical_devices('TPU'):
            tpu_strategy = self._get_tpu_strategy()
        else:
            logging.info(
                'TPU hardware is not available, TPU strategy test skipped.')
            return

        batch_size = 1
        minibatch_size = 5
        num_epochs = 3
        n_time_steps = 10
        num_replicas = tpu_strategy.num_replicas_in_sync

        # Create a dataset with `num_collected_episodes` episodes of length 10.
        # This simulates a Reverb dataset.
        num_collected_episodes = 20
        traj = ppo_learner_test_utils.create_trajectories(
            n_time_steps=n_time_steps, batch_size=batch_size)
        info = ()

        def dataset_fn():
            return tf.data.Dataset.from_tensors(
                (traj, info)).repeat(num_collected_episodes)

        with tpu_strategy.scope():
            print('Number of devices for the strategy: {}'.format(
                tpu_strategy.num_replicas_in_sync))
            fake_agent = ppo_learner_test_utils.FakePPOAgent(tpu_strategy)

        learner = ppo_learner.PPOLearner(
            root_dir=FLAGS.test_tmpdir,
            train_step=tf.Variable(0, dtype=tf.int32),
            agent=fake_agent,
            experience_dataset_fn=dataset_fn,
            normalization_dataset_fn=dataset_fn,
            num_batches=num_collected_episodes,
            num_epochs=num_epochs,
            minibatch_size=minibatch_size,
            # Disable shuffling to have deterministic input into agent.train.
            shuffle_buffer_size=1,
            triggers=None,
            strategy=tpu_strategy)

        learner.run()

        # Check that fake agent was called the expected number of times.
        num_train_frames = (num_collected_episodes * batch_size *
                            n_time_steps * num_epochs)
        num_minibatches = num_train_frames / minibatch_size
        num_minibatches_per_replica = int(num_minibatches / num_replicas)
        self.assertEqual(fake_agent.train_called_times.numpy(),
                         num_minibatches_per_replica)

        # Check that the fake agent is called the expected number of times again
        # on the second learner.run() call.
        fake_agent.reset()
        learner.run()
        self.assertEqual(fake_agent.train_called_times.numpy(),
                         num_minibatches_per_replica)
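With the constants used in this test, the expected per-replica call count is plain arithmetic (the replica count below is an assumption for illustration; the test reads it from the strategy):

    # 20 episodes * batch size 1 * 10 time steps * 3 epochs = 600 training frames.
    num_train_frames = 20 * 1 * 10 * 3        # 600
    num_minibatches = num_train_frames // 5   # 600 / minibatch_size 5 = 120
    # On, e.g., an 8-replica TPU (num_replicas_in_sync == 8), each replica
    # processes 120 // 8 = 15 minibatches, so train_called_times should be 15.
    num_minibatches_per_replica = num_minibatches // 8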
Example 3
    def test_one_element_dataset(self, num_epochs, num_parallel_environments,
                                 minibatch_size, expected_train_times):
        # Create a dataset with one element that is a length 100 sequence. This
        # simulates a Reverb dataset if only one sequence was collected.
        traj = _create_trajectories(n_time_steps=100,
                                    batch_size=num_parallel_environments)
        info = ()

        dataset_fn = lambda: tf.data.Dataset.from_tensors((traj, info))

        fake_agent = FakePPOAgent()

        learner = ppo_learner.PPOLearner(
            root_dir=FLAGS.test_tmpdir,
            train_step=tf.Variable(0, dtype=tf.int32),
            agent=fake_agent,
            experience_dataset_fn=dataset_fn,
            normalization_dataset_fn=dataset_fn,
            num_batches=1,
            num_epochs=num_epochs,
            minibatch_size=minibatch_size,
            # Disable shuffling to have deterministic input into agent.train.
            shuffle_buffer_size=1,
            triggers=None)
        learner.run()

        # Check that fake agent was called the expected number of times.
        self.assertEqual(fake_agent.train_called_times.numpy(),
                         expected_train_times)

        # Check that agent.train() is receiving the expected trajectories.
        if minibatch_size:
            concated_traj = _concat_and_flatten(traj, multiplier=num_epochs)
            for i in range(expected_train_times):
                expected_traj = _get_expected_minibatch(concated_traj,
                                                        minibatch_size,
                                                        current_iteration=i)
                received_traj = fake_agent.experiences[i]
                tf.nest.map_structure(self.assertAllClose, received_traj,
                                      expected_traj)
        else:
            for i in range(num_epochs):
                expected_traj = traj
                received_traj = fake_agent.experiences[i]
                tf.nest.map_structure(self.assertAllClose, received_traj,
                                      expected_traj)
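All three tests above drive a `FakePPOAgent` test double whose definition is not shown. A minimal sketch of the behavior the assertions assume (counting `train` calls and recording the experience passed in); the real `ppo_learner_test_utils.FakePPOAgent` may differ:

    import tensorflow as tf

    class FakePPOAgentSketch:
        """Hypothetical stand-in recording how the learner drives agent.train()."""

        def __init__(self):
            self.train_called_times = tf.Variable(0, dtype=tf.int32)
            self.experiences = []

        def train(self, experience):
            # Count the call and keep the trajectory so a test can compare it
            # against the expected minibatch.
            self.train_called_times.assign_add(1)
            self.experiences.append(experience)

        def reset(self):
            self.train_called_times.assign(0)
            self.experiences = []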
Example 4
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        # Training params
        num_iterations=20000,
        actor_fc_layers=(64, 64),
        value_fc_layers=(64, 64),
        learning_rate=3e-4,
        collect_sequence_length=2048,
        minibatch_size=64,
        num_epochs=10,
        # Agent params
        importance_ratio_clipping=0.2,
        lambda_value=0.95,
        discount_factor=0.99,
        entropy_regularization=0.,
        value_pred_loss_coef=0.5,
        use_gae=True,
        use_td_lambda_return=True,
        gradient_clipping=None,
        value_clipping=None,
        # Replay params
        reverb_port=None,
        replay_capacity=10000,
        # Others
        policy_save_interval=5000,
        summary_interval=1000,
        eval_interval=10000,
        eval_episodes=30,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """Trains and evaluates PPO (Importance Ratio Clipping).

  Args:
    root_dir: Main directory path where checkpoints, saved_models, and summaries
      will be written to.
    env_name: Name for the Mujoco environment to load.
    num_iterations: The number of iterations to perform collection and training.
    actor_fc_layers: List of fully_connected parameters for the actor network,
      where each item is the number of units in the layer.
    value_fc_layers: List of fully_connected parameters for the value network,
      where each item is the number of units in the layer.
    learning_rate: Learning rate used by the Adam optimizer.
    collect_sequence_length: Number of steps to take in each collect run.
    minibatch_size: Number of elements in each mini batch. If `None`, the entire
      collected sequence will be treated as one batch.
    num_epochs: Number of times to iterate over all collected data per data
      collection step. (Schulman, 2017) sets this to 10 for Mujoco, 15 for
      Roboschool, and 3 for Atari.
    importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective. For
      more detail, see explanation at the top of the doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation. Defaults to `0.99`,
      which is the value used for all environments in (Schulman, 2017).
    entropy_regularization: Coefficient for the entropy regularization loss term.
      Defaults to `0.0` because no entropy bonus was used in (Schulman, 2017).
    value_pred_loss_coef: Multiplier for the value prediction loss to balance it
      with the policy gradient loss. Defaults to `0.5`, which was used for all
      environments in the OpenAI baseline implementation. This parameter is
      irrelevant unless part of actor_net and value_net is shared; in that case,
      you would want to tune this coefficient, whose value depends on the chosen
      network architecture.
    use_gae: If True (the default here), uses generalized advantage estimation
      for computing the per-timestep advantage. Otherwise, just subtracts value
      predictions from the empirical return.
    use_td_lambda_return: If True (the default here), uses td_lambda_return for
      training the value function, where `td_lambda_return = gae_advantage +
      value_predictions`. `use_gae` must also be set to `True` to enable
      TD-lambda returns. If `use_td_lambda_return` is True while `use_gae` is
      False, the empirical return is used and a warning is logged.
    gradient_clipping: Norm length to clip gradients.
    value_clipping: The difference between new and old value predictions is
      clipped to this threshold. Value clipping can help when training very deep
      networks. Default: no clipping.
    reverb_port: Port for the Reverb server. If None, a randomly chosen unused
      port is used.
    replay_capacity: The maximum number of elements for the replay buffer. Items
      will be wasted if this is smaller than collect_sequence_length.
    policy_save_interval: How often, in train_steps, the policy will be saved.
    summary_interval: How often to write data into TensorBoard.
    eval_interval: How often to run evaluation, in train_steps.
    eval_episodes: Number of episodes to evaluate over.
    debug_summaries: Boolean for whether to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.
  """
    collect_env = suite_mujoco.load(env_name)
    eval_env = suite_mujoco.load(env_name)
    num_environments = 1

    observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
        spec_utils.get_tensor_specs(collect_env))

    train_step = train_utils.create_train_step()

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_tensor_spec,
        action_tensor_spec,
        fc_layer_params=actor_fc_layers,
        activation_fn=tf.nn.tanh,
        kernel_initializer=tf.keras.initializers.Orthogonal())
    value_net = value_network.ValueNetwork(
        observation_tensor_spec,
        fc_layer_params=value_fc_layers,
        kernel_initializer=tf.keras.initializers.Orthogonal())

    current_iteration = tf.Variable(0, dtype=tf.int64)

    def learning_rate_fn():
        # Linearly decay the learning rate.
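        # For example, with learning_rate=3e-4 and num_iterations=20000, this
        # yields 3e-4 at iteration 0, 1.5e-4 halfway through, and approaches 0
        # by the final iteration.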
        return learning_rate * (1 - current_iteration / num_iterations)

    agent = ppo_clip_agent.PPOClipAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate_fn, epsilon=1e-5),
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=importance_ratio_clipping,
        lambda_value=lambda_value,
        discount_factor=discount_factor,
        entropy_regularization=entropy_regularization,
        value_pred_loss_coef=value_pred_loss_coef,
        # This is a legacy argument for the number of times we repeat the data
        # inside of the train function, incompatible with mini batch learning.
        # We set the epoch number from the replay buffer and tf.Data instead.
        num_epochs=1,
        use_gae=use_gae,
        use_td_lambda_return=use_td_lambda_return,
        gradient_clipping=gradient_clipping,
        value_clipping=value_clipping,
        # TODO(b/150244758): Default compute_value_and_advantage_in_train to False
        # after Reverb open source.
        compute_value_and_advantage_in_train=False,
        # Skips updating normalizers in the agent, as it's handled in the learner.
        update_normalizers_in_train=False,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step)
    agent.initialize()

    reverb_server = reverb.Server(
        [
            reverb.Table(  # Replay buffer storing experience for training.
                name='training_table',
                sampler=reverb.selectors.Fifo(),
                remover=reverb.selectors.Fifo(),
                rate_limiter=reverb.rate_limiters.MinSize(1),
                max_size=replay_capacity,
                max_times_sampled=1,
            ),
            # Replay buffer storing experience for normalization.
            reverb.Table(
                name='normalization_table',
                sampler=reverb.selectors.Fifo(),
                remover=reverb.selectors.Fifo(),
                rate_limiter=reverb.rate_limiters.MinSize(1),
                max_size=replay_capacity,
                max_times_sampled=1,
            )
        ],
        port=reverb_port)

    # Create the replay buffer.
    reverb_replay_train = reverb_replay_buffer.ReverbReplayBuffer(
        agent.collect_data_spec,
        sequence_length=collect_sequence_length,
        table_name='training_table',
        server_address='localhost:{}'.format(reverb_server.port),
        # The only collected sequence is used to populate the batches.
        max_cycle_length=1,
        rate_limiter_timeout_ms=1000)
    reverb_replay_normalization = reverb_replay_buffer.ReverbReplayBuffer(
        agent.collect_data_spec,
        sequence_length=collect_sequence_length,
        table_name='normalization_table',
        server_address='localhost:{}'.format(reverb_server.port),
        # The only collected sequence is used to populate the batches.
        max_cycle_length=1,
        rate_limiter_timeout_ms=1000)

    rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
        reverb_replay_train.py_client,
        ['training_table', 'normalization_table'],
        sequence_length=collect_sequence_length,
        stride_length=collect_sequence_length)
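    # Note: stride_length equal to sequence_length means successive sequences
    # written by the observer do not overlap, so each collected step is written
    # to the tables exactly once.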

    saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
    collect_env_step_metric = py_metrics.EnvironmentSteps()
    learning_triggers = [
        triggers.PolicySavedModelTrigger(saved_model_dir,
                                         agent,
                                         train_step,
                                         interval=policy_save_interval,
                                         metadata_metrics={
                                             triggers.ENV_STEP_METADATA_KEY:
                                             collect_env_step_metric
                                         }),
        triggers.StepPerSecondLogTrigger(train_step,
                                         interval=summary_interval),
    ]

    def training_dataset_fn():
        return reverb_replay_train.as_dataset(
            sample_batch_size=num_environments,
            sequence_preprocess_fn=agent.preprocess_sequence)

    def normalization_dataset_fn():
        return reverb_replay_normalization.as_dataset(
            sample_batch_size=num_environments,
            sequence_preprocess_fn=agent.preprocess_sequence)

    agent_learner = ppo_learner.PPOLearner(
        root_dir,
        train_step,
        agent,
        experience_dataset_fn=training_dataset_fn,
        normalization_dataset_fn=normalization_dataset_fn,
        num_batches=1,
        num_epochs=num_epochs,
        minibatch_size=minibatch_size,
        shuffle_buffer_size=collect_sequence_length,
        triggers=learning_triggers)

    tf_collect_policy = agent.collect_policy
    collect_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_collect_policy,
                                                        use_tf_function=True)

    collect_actor = actor.Actor(collect_env,
                                collect_policy,
                                train_step,
                                steps_per_run=collect_sequence_length,
                                observers=[rb_observer],
                                metrics=actor.collect_metrics(buffer_size=10) +
                                [collect_env_step_metric],
                                reference_metrics=[collect_env_step_metric],
                                summary_dir=os.path.join(
                                    root_dir, learner.TRAIN_DIR),
                                summary_interval=summary_interval)

    tf_greedy_policy = agent.policy
    greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_greedy_policy,
                                                       use_tf_function=True)

    if eval_interval:
        logging.info('Initial evaluation.')
        eval_actor = actor.Actor(eval_env,
                                 greedy_policy,
                                 train_step,
                                 metrics=actor.eval_metrics(eval_episodes),
                                 summary_dir=os.path.join(root_dir, 'eval'),
                                 episodes_per_run=eval_episodes)

        eval_actor.run_and_log()

    logging.info('Training.')
    for _ in range(num_iterations):
        collect_actor.run()
        # TODO(b/159615593): Update to use observer.flush.
        # Reset the reverb observer to make sure the data collected is flushed and
        # written to the RB.
        rb_observer.reset()
        agent_learner.run()
        reverb_replay_train.clear()
        reverb_replay_normalization.clear()
        current_iteration.assign_add(1)

        if eval_interval and agent_learner.train_step_numpy % eval_interval == 0:
            logging.info('Evaluating.')
            eval_actor.run_and_log()

    rb_observer.close()
    reverb_server.stop()
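For reference, a minimal, hypothetical direct invocation of this entry point (the output path is a placeholder; a real binary would normally wire the arguments through its own flags):

    if __name__ == '__main__':
        train_eval(
            root_dir='/tmp/ppo_halfcheetah',  # placeholder output directory
            num_iterations=100,               # shortened run for a smoke test
            eval_interval=None)               # skip evaluation in this sketch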