Example #1
 def test_batch_size(self):
   self.assertEqual(
       distribution_utils.per_replica_batch_size(147, num_gpus=0), 147)
   self.assertEqual(
       distribution_utils.per_replica_batch_size(147, num_gpus=1), 147)
   self.assertEqual(
       distribution_utils.per_replica_batch_size(147, num_gpus=7), 21)
Example #2
 def test_batch_size(self):
     self.assertEqual(
         distribution_utils.per_replica_batch_size(147, num_gpus=0), 147)
     self.assertEqual(
         distribution_utils.per_replica_batch_size(147, num_gpus=1), 147)
     self.assertEqual(
         distribution_utils.per_replica_batch_size(147, num_gpus=7), 21)
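Taken together with the remainder tests in Examples #11 and #12 below, these tests pin down the contract of per_replica_batch_size: with 0 or 1 GPUs the global batch size is returned unchanged, and with more GPUs it must divide evenly across them. The following is a minimal sketch consistent with that contract, a hypothetical reconstruction for illustration rather than the library's actual source:

def per_replica_batch_size(batch_size, num_gpus):
  """Split a global batch size evenly across GPU replicas (sketch)."""
  if num_gpus <= 1:
    # CPU-only or single-GPU runs keep the global batch size.
    return batch_size
  if batch_size % num_gpus:
    # Uneven splits raise, as exercised in Examples #11 and #12.
    raise ValueError(
        "Batch size must be a multiple of the number of GPUs: "
        "batch_size={}, num_gpus={}".format(batch_size, num_gpus))
  return batch_size // num_gpus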
Example #3
 def input_fn_eval():
     return input_function(
         is_training=False,
         data_dir=flags_obj.data_dir,
         batch_size=distribution_utils.per_replica_batch_size(
             flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
         num_epochs=1,
         dtype=flags_core.get_tf_dtype(flags_obj))
Example #4
 def input_fn_eval():
   return input_function(
       is_training=False,
       data_dir=flags_obj.data_dir,
       batch_size=distribution_utils.per_replica_batch_size(
           flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
       num_epochs=1,
       dtype=flags_core.get_tf_dtype(flags_obj))
Example #5
 def input_fn_train(num_epochs, input_context=None):
   return input_function(
       is_training=True,
       data_dir=flags_obj.data_dir,
       batch_size=distribution_utils.per_replica_batch_size(
           flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
       num_epochs=num_epochs,
       dtype=flags_core.get_tf_dtype(flags_obj),
       datasets_num_private_threads=flags_obj.datasets_num_private_threads,
       input_context=input_context)
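These closures are then handed to a tf.estimator.Estimator, as the fuller training loops below show. A minimal sketch of that wiring, assuming an already-constructed estimator and the flags_obj used above:

# Hypothetical wiring; mirrors the train/eval pattern in the examples below.
estimator.train(input_fn=lambda: input_fn_train(flags_obj.epochs_between_evals))
eval_results = estimator.evaluate(input_fn=input_fn_eval)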
Example #6
 def input_fn_train(num_epochs, input_context=None):
   return input_function(
       is_training=True,
       data_dir=flags_obj.data_dir,
       batch_size=distribution_utils.per_replica_batch_size(
           flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
       num_epochs=num_epochs,
       dtype=flags_core.get_tf_dtype(flags_obj),
       datasets_num_private_threads=flags_obj.datasets_num_private_threads,
       num_parallel_batches=flags_obj.datasets_num_parallel_batches,
       input_context=input_context)
Example #7
 def input_fn_eval():
     return input_function(
         is_training=False,
         augment=False,  # experiments.opt[flags_obj.opt_id].hyper.augmentation,  # *SC*
         rand_labels=False,  # experiments.opt[flags_obj.opt_id].dataset.random_labels,  # *SC*
         data_dir=flags_obj.data_dir,
         batch_size=distribution_utils.per_replica_batch_size(
             experiments.opt[flags_obj.opt_id].hyper.batch_size,
             flags_core.get_num_gpus(flags_obj)),  # *SC*
         num_epochs=1,
         dtype=flags_core.get_tf_dtype(flags_obj))
Example #8
 def input_fn_train(num_epochs, input_context=None):
     return input_function(
         is_training=True,
         augment=experiments.opt[flags_obj.opt_id].hyper.augmentation,  # *SC*
         rand_labels=experiments.opt[flags_obj.opt_id].dataset.random_labels,  # *SC*
         data_dir=flags_obj.data_dir,
         batch_size=distribution_utils.per_replica_batch_size(
             experiments.opt[flags_obj.opt_id].hyper.batch_size,
             flags_core.get_num_gpus(flags_obj)),  # *SC*
         num_epochs=num_epochs,
         dtype=flags_core.get_tf_dtype(flags_obj),
         datasets_num_private_threads=flags_obj.datasets_num_private_threads,
         input_context=input_context)
Example #9
def run_deep_speech(_):
    """Run deep speech training and eval loop."""
    tf.set_random_seed(flags_obj.seed)
    # Data preprocessing
    tf.logging.info("Data preprocessing...")
    train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

    # Number of label classes. Label string is "[a-z]' -"
    num_classes = len(train_speech_dataset.speech_labels)

    # Use distribution strategy for multi-gpu training
    num_gpus = flags_core.get_num_gpus(flags_obj)
    distribution_strategy = distribution_utils.get_distribution_strategy(
        num_gpus=num_gpus)
    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=flags_obj.model_dir,
                                       config=run_config,
                                       params={
                                           "num_classes": num_classes,
                                       })

    # Benchmark logging
    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
        "rnn_hidden_size": flags_obj.rnn_hidden_size,
        "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
        "rnn_type": flags_obj.rnn_type,
        "is_bidirectional": flags_obj.is_bidirectional,
        "use_bias": flags_obj.use_bias
    }

    dataset_name = "LibriSpeech"
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info("deep_speech",
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    per_replica_batch_size = distribution_utils.per_replica_batch_size(
        flags_obj.batch_size, num_gpus)

    def input_fn_train():
        return dataset.input_fn(per_replica_batch_size, train_speech_dataset)

    def input_fn_eval():
        return dataset.input_fn(per_replica_batch_size, eval_speech_dataset)

    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)
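    # E.g. train_epochs=10 with epochs_between_evals=2 gives 5 train/eval cycles.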
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1,
                        total_training_cycle)

        # Perform batch_wise dataset shuffling
        train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle(
            train_speech_dataset.entries, cycle_index, flags_obj.sortagrad,
            flags_obj.batch_size)

        estimator.train(input_fn=input_fn_train,
                        hooks=train_hooks,
                        max_steps=flags_obj.max_train_steps)

        if flags_obj.skip_eval:
            break

        # Evaluation
        tf.logging.info("Starting to evaluate...")

        eval_results = evaluate_model(estimator,
                                      eval_speech_dataset.speech_labels,
                                      eval_speech_dataset.entries,
                                      input_fn_eval)

        # Log the WER and CER results.
        benchmark_logger.log_evaluation_result(eval_results)
        tf.logging.info("Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY]))

        # Stop early if the WER stop threshold has been met.
        if model_helpers.past_stop_threshold(flags_obj.wer_threshold,
                                             eval_results[_WER_KEY]):
            break
Example #10
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.

  Returns:
    Dict of results of the run.  Contains the keys `eval_results`,
    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list of
    the hook instances used during training.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["vocab_file"] = flags_obj.vocab_file
    params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["max_length"] = flags_obj.max_length or params["max_length"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    total_batch_size = params["batch_size"]
    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_replica_batch_size(
            params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=total_batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    stats = run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)

    if flags_obj.export_dir and not params["use_tpu"]:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file},
            strip_default_attrs=True)
    return stats
Example #11
 def test_batch_size_with_remainder(self):
   with self.assertRaises(ValueError):
       distribution_utils.per_replica_batch_size(147, num_gpus=5)
Example #12
 def test_batch_size_with_remainder(self):
     with self.assertRaises(ValueError):
         distribution_utils.per_replica_batch_size(147, num_gpus=5)
Example #13
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.

  Returns:
    Dict of results of the run.  Contains the keys `eval_results`,
    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list of
    the hook instances used during training.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  total_batch_size = params["batch_size"]
  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_replica_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=total_batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  stats = run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
  return stats
Example #14
def run_deep_speech(_):
  """Run deep speech training and eval loop."""
  tf.set_random_seed(flags_obj.seed)
  # Data preprocessing
  tf.logging.info("Data preprocessing...")
  train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
  eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

  # Number of label classes. Label string is "[a-z]' -"
  num_classes = len(train_speech_dataset.speech_labels)

  # Use distribution strategy for multi-gpu training
  num_gpus = flags_core.get_num_gpus(flags_obj)
  distribution_strategy = distribution_utils.get_distribution_strategy(num_gpus)
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      model_dir=flags_obj.model_dir,
      config=run_config,
      params={
          "num_classes": num_classes,
      }
  )

  # Benchmark logging
  run_params = {
      "batch_size": flags_obj.batch_size,
      "train_epochs": flags_obj.train_epochs,
      "rnn_hidden_size": flags_obj.rnn_hidden_size,
      "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
      "rnn_type": flags_obj.rnn_type,
      "is_bidirectional": flags_obj.is_bidirectional,
      "use_bias": flags_obj.use_bias
  }

  dataset_name = "LibriSpeech"
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info("deep_speech", dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  per_replica_batch_size = distribution_utils.per_replica_batch_size(
      flags_obj.batch_size, num_gpus)

  def input_fn_train():
    return dataset.input_fn(
        per_replica_batch_size, train_speech_dataset)

  def input_fn_eval():
    return dataset.input_fn(
        per_replica_batch_size, eval_speech_dataset)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: %d/%d",
                    cycle_index + 1, total_training_cycle)

    # Perform batch_wise dataset shuffling
    train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle(
        train_speech_dataset.entries, cycle_index, flags_obj.sortagrad,
        flags_obj.batch_size)

    estimator.train(input_fn=input_fn_train, hooks=train_hooks)

    # Evaluation
    tf.logging.info("Starting to evaluate...")

    eval_results = evaluate_model(
        estimator, eval_speech_dataset.speech_labels,
        eval_speech_dataset.entries, input_fn_eval)

    # Log the WER and CER results.
    benchmark_logger.log_evaluation_result(eval_results)
    tf.logging.info(
        "Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY]))

    # Stop early if the WER stop threshold has been met.
    if model_helpers.past_stop_threshold(
        flags_obj.wer_threshold, eval_results[_WER_KEY]):
      break
Example #15
def run(_):
    tf.compat.v1.set_random_seed(flags_obj.seed)
    print("Data pre-processing...")
    train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)
    num_classes = len(train_speech_dataset.speech_labels)

    num_gpus = flags_core.get_num_gpus(flags_obj)
    distribution_strategy = distribution_utils.get_distribution_strategy(
        num_gpus=num_gpus)
    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=flags_obj.model_dir,
                                       config=run_config,
                                       params={
                                           "num_classes": num_classes,
                                       })

    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
        "rnn_hidden_size": flags_obj.rnn_hidden_size,
        "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
        "rnn_type": flags_obj.rnn_type,
        "is_bidirectional": flags_obj.is_bidirectional,
        "use_bias": flags_obj.use_bias
    }

    dataset_name = "LibriSpeech"
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info("deep_speech",
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    per_replica_batch_size = distribution_utils.per_replica_batch_size(
        flags_obj.batch_size, num_gpus)

    def input_fn_train():
        return input_fn(per_replica_batch_size, train_speech_dataset)

    def input_fn_eval():
        return input_fn(per_replica_batch_size, eval_speech_dataset)

    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)

    for cycle_index in range(total_training_cycle):
        print('Starting a training cycle: %d/%d' %
              (cycle_index + 1, total_training_cycle))

        train_speech_dataset.entries = fn.batch_wise_dataset_shuffle(
            train_speech_dataset.entries, cycle_index, flags_obj.sortagrad,
            flags_obj.batch_size)

        estimator.train(input_fn=input_fn_train, hooks=train_hooks)

        eval_results = evaluate_model(estimator,
                                      eval_speech_dataset.speech_labels,
                                      eval_speech_dataset.entries,
                                      input_fn_eval)

        print("Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY]))

        if model_helpers.past_stop_threshold(flags_obj.wer_threshold,
                                             eval_results[_WER_KEY]):
            break