Example #1
    def _create_optimizer(self):
        """Creates optimizer."""
        params = self.params
        # TODO(b/139414679): Explore the difference between using
        # LearningRateSchedule and callback for GPU runs, and try to merge them.
        lr_schedule = optimizer.LearningRateSchedule(
            params["learning_rate"], params["hidden_size"],
            params["learning_rate_warmup_steps"])
        opt = tf.keras.optimizers.Adam(
            lr_schedule if self.use_tpu else params["learning_rate"],
            params["optimizer_adam_beta1"],
            params["optimizer_adam_beta2"],
            epsilon=params["optimizer_adam_epsilon"])

        if params["dtype"] == tf.float16:
            opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                opt,
                loss_scale=flags_core.get_loss_scale(
                    self.flags_obj, default_for_fp16="dynamic"))
        if self.flags_obj.fp16_implementation == "graph_rewrite":
            # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
            # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
            # which will ensure tf.compat.v2.keras.mixed_precision and
            # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
            # up.
            opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                opt)

        return opt
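
In both this example and the next, optimizer.LearningRateSchedule is the warmup schedule from the TensorFlow official Transformer model. A minimal sketch of an equivalent schedule, assuming the standard Transformer formula (the class name TransformerLRSchedule is chosen for illustration and is not part of the original code):

import tensorflow as tf

class TransformerLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup followed by rsqrt decay, scaled by 1/sqrt(hidden_size)."""

    def __init__(self, initial_learning_rate, hidden_size, warmup_steps):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.hidden_size = hidden_size
        self.warmup_steps = float(warmup_steps)

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        # During warmup the linear term step * warmup_steps**-1.5 is the
        # smaller one, so the rate ramps up; afterwards 1/sqrt(step) wins
        # and the rate decays.
        lr = self.initial_learning_rate
        lr /= tf.sqrt(tf.cast(self.hidden_size, tf.float32))
        lr *= tf.minimum(tf.math.rsqrt(step), step * self.warmup_steps ** -1.5)
        return lr
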
Example #2
  def train_and_eval(self):
    """Trains and evaluates the model."""
    lr_schedule = optimizer.LearningRateSchedule(self.params["learning_rate"],
                                                 self.params["hidden_size"],
                                                 self.params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(lr_schedule,
                                   self.params["optimizer_adam_beta1"],
                                   self.params["optimizer_adam_beta2"],
                                   epsilon=self.params["optimizer_adam_epsilon"])
    self.train_model.compile(opt)
    self.train_model.summary()

    # create train dataset
    train_ds = data_pipeline.train_input_fn(self.params,
                                            shuffle_seed = 42,
                                            num_ranks = tnt.get_size(),
                                            rank = tnt.get_rank())

    # enable global callbacks
    callbacks = []
    if self.flags_obj.enable_tensorboard and self.flags_obj.model_dir:
      callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir))

    # enable logging callbacks only on the master rank
    if self.flags_obj.enable_time_history:
      time_callback = keras_utils.TimeHistory(self.params["batch_size"],
                                              self.params["num_sentences"],
                                              logdir = None)
      tnt_time_callback = tnt.keras.callbacks.Callback(time_callback,
                                                       aggregate_logs = False,
                                                       run_on_all_ranks = False)
      callbacks.append(tnt_time_callback)

    # print messages only once
    if tnt.is_master_rank():
      logging.info("Start train")

    stats = {}
    for epoch in range(0, self.params["train_epochs"], self.params["epochs_between_evals"]):
      # as our dataset is distributed manually, disable the automatic Tarantella distribution
      history = self.train_model.fit(train_ds,
                                     callbacks = callbacks,
                                     tnt_distribute_dataset = False,
                                     initial_epoch = epoch,
                                     epochs = epoch + min(self.params["epochs_between_evals"],
                                                          self.params["train_epochs"]-epoch),
                                     verbose = 2)

      # collect statistics and run evaluation only on the master rank
      if tnt.is_master_rank():
        logging.info("Train history: {}".format(history.history))
        stats = misc.build_stats(history, callbacks)

        eval_stats = self.eval()
        stats.update(eval_stats)

    return stats
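
The fit call above passes tnt_distribute_dataset=False because data_pipeline.train_input_fn already shards the data per rank via num_ranks and rank. A minimal sketch of what such manual sharding could look like with tf.data (the "data_pattern" key and the TFRecord input are assumptions for illustration, not the original pipeline):

import tensorflow as tf

def train_input_fn(params, shuffle_seed, num_ranks, rank):
    """Builds a per-rank training dataset by sharding the input manually."""
    ds = tf.data.Dataset.list_files(params["data_pattern"], shuffle=False)
    # Give each rank a disjoint slice of the files; every rank uses the
    # same shuffle seed so the epoch order stays consistent across ranks.
    ds = ds.shard(num_shards=num_ranks, index=rank)
    ds = ds.interleave(tf.data.TFRecordDataset,
                       num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.shuffle(buffer_size=10000, seed=shuffle_seed)
    ds = ds.batch(params["batch_size"])
    return ds.prefetch(tf.data.AUTOTUNE)
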
Example #3
  def _create_optimizer(self):
    """Creates optimizer."""
    params = self.params
    lr_schedule = optimizer.LearningRateSchedule(
        params["learning_rate"], params["hidden_size"],
        params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(
        lr_schedule,
        params["optimizer_adam_beta1"],
        params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])

    opt = performance.configure_optimizer(
        opt,
        use_float16=params["dtype"] == tf.float16,
        loss_scale=flags_core.get_loss_scale(
            self.flags_obj, default_for_fp16="dynamic"))

    return opt
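
Here performance.configure_optimizer replaces the hand-written mixed-precision branches from Example #1. A minimal sketch of equivalent behavior, assuming the helper only needs to add loss scaling for float16 (this mirrors the intent of the TensorFlow official models' helper, not its exact code):

import tensorflow as tf

def configure_optimizer(opt, use_float16, loss_scale="dynamic"):
    """Wraps the optimizer with loss scaling when training in float16."""
    if use_float16:
        if loss_scale == "dynamic":
            # Dynamic loss scaling adjusts the scale automatically when
            # gradients overflow.
            opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)
        else:
            # A fixed numeric loss scale disables the dynamic adjustment.
            opt = tf.keras.mixed_precision.LossScaleOptimizer(
                opt, dynamic=False, initial_scale=loss_scale)
    return opt
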