def run_model(flags_obj):
    """Run training and eval loop."""

    num_class = dataset.get_num_class(flags_obj.dataset)

    tf.logging.info("Loading the dataset...")

    train_input_fn, eval_input_fn = dataset.construct_input_fns(
        flags_obj.dataset,
        flags_obj.batch_size,
        flags_obj.vocabulary_size,
        flags_obj.sentence_length,
        repeat=flags_obj.epochs_between_evals)

    keras_model = sentiment_model.CNN(flags_obj.embedding_dim,
                                      flags_obj.vocabulary_size,
                                      flags_obj.sentence_length,
                                      flags_obj.cnn_filters, num_class,
                                      flags_obj.dropout_rate)
    num_gpus = flags_core.get_num_gpus(FLAGS)
    tf.logging.info("Creating Estimator from Keras model...")
    estimator = convert_keras_to_estimator(keras_model, num_gpus,
                                           flags_obj.model_dir)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        batch_size=flags_obj.batch_size  # for ExamplesPerSecondHook
    )
    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="sentiment_analysis",
                                  dataset_name=flags_obj.dataset,
                                  run_params=run_params,
                                  test_id=flags_obj.benchmark_test_id)

    # Training and evaluation cycle
    total_training_cycle = (flags_obj.train_epochs
                            // flags_obj.epochs_between_evals)

    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        # Train the model
        estimator.train(input_fn=train_input_fn, hooks=train_hooks)

        # Evaluate the model
        eval_results = estimator.evaluate(input_fn=eval_input_fn)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)

        tf.logging.info("Iteration {}".format(eval_results))

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
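The `convert_keras_to_estimator` helper used above is not part of this listing. A minimal sketch of what it plausibly does under TF 1.x, assuming it compiles the Keras model and wraps `tf.keras.estimator.model_to_estimator`; the optimizer, loss, and distribution handling here are placeholders, not the original implementation:

import tensorflow as tf

def convert_keras_to_estimator(keras_model, num_gpus, model_dir):
    """Hypothetical helper: compile the Keras model and convert it to an Estimator."""
    keras_model.compile(optimizer="adam",
                        loss="sparse_categorical_crossentropy",
                        metrics=["accuracy"])
    # Assumption: mirror the model across GPUs only when more than one is available.
    distribution = (tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
                    if num_gpus > 1 else None)
    run_config = tf.estimator.RunConfig(train_distribute=distribution)
    return tf.keras.estimator.model_to_estimator(
        keras_model=keras_model, model_dir=model_dir, config=run_config)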
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
    params["batch_size"] = flags_obj.batch_size or (
        params["default_batch_size_tpu"]
        if params["use_tpu"] else params["default_batch_size"])
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file_path=os.path.join(flags_obj.data_dir, flags_obj.vocab_file))
Example #3
 def test_config_benchmark_file_logger(self):
   # Set the benchmark_log_dir first since the benchmark_logger_type will need
   # the value to be set when it does the validation.
   with flagsaver.flagsaver(benchmark_log_dir='/tmp'):
     with flagsaver.flagsaver(benchmark_logger_type='BenchmarkFileLogger'):
       logger.config_benchmark_logger()
       self.assertIsInstance(logger.get_benchmark_logger(),
                             logger.BenchmarkFileLogger)
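For reference, a minimal usage sketch of the logger being tested above, assuming the `official.utils.logs.logger` module from the TensorFlow models repository is importable and that `BenchmarkFileLogger` accepts the logging directory directly; it writes run metadata and metric records as JSON under that directory:

from official.utils.logs import logger  # assumed: TF models "official" utils on the path

# Build the file logger directly rather than configuring it through flags.
benchmark_logger = logger.BenchmarkFileLogger("/tmp/benchmark")
benchmark_logger.log_run_info(model_name="demo", dataset_name="synthetic",
                              run_params={"batch_size": 32}, test_id=None)
benchmark_logger.log_evaluation_result({"accuracy": 0.9, "global_step": 100})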
Example #5
def get_examples_per_second_callback(every_n_steps=1,
                                     batch_size=32,
                                     metric_logger=None,
                                     **kwargs):  # pylint: disable=unused-argument
    """Function to get ExamplesPerSecondCallback."""
    return ExamplesPerSecondCallback(batch_size=batch_size,
                                     every_n_steps=every_n_steps,
                                     metric_logger=metric_logger
                                     or logger.get_benchmark_logger())
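A short usage sketch for the factory above, assuming `ExamplesPerSecondCallback` follows the standard `tf.keras.callbacks.Callback` interface and that the snippet above is defined or imported in the same module:

import numpy as np
import tensorflow as tf

# Toy model and data; the callback reports throughput during fit().
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="sgd", loss="mse")
x, y = np.random.rand(256, 4), np.random.rand(256, 1)

throughput_callback = get_examples_per_second_callback(every_n_steps=1, batch_size=64)
model.fit(x, y, batch_size=64, epochs=1, callbacks=[throughput_callback])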
Example #6
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    # Determine training schedule based on flags.
    if flags_obj.train_steps is not None:
        train_eval_iterations = (flags_obj.train_steps //
                                 flags_obj.steps_between_evals)
        single_iteration_train_steps = flags_obj.steps_between_evals
        single_iteration_train_epochs = None
    else:
        train_epochs = flags_obj.train_epochs or DEFAULT_TRAIN_EPOCHS
        train_eval_iterations = train_epochs // flags_obj.epochs_between_evals
        single_iteration_train_steps = None
        single_iteration_train_epochs = flags_obj.epochs_between_evals

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    params.data_dir = flags_obj.data_dir
    params.num_parallel_calls = flags_obj.num_parallel_calls
    params.epochs_between_evals = flags_obj.epochs_between_evals
    params.repeat_dataset = single_iteration_train_epochs
    params.batch_size = flags_obj.batch_size or params.batch_size

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=params.batch_size  # for ExamplesPerSecondHook
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params.__dict__,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=flags_obj.model_dir,
                                       params=params)

    train_schedule(
        estimator=estimator,
        # Training arguments
        train_eval_iterations=train_eval_iterations,
        single_iteration_train_steps=single_iteration_train_steps,
        single_iteration_train_epochs=single_iteration_train_epochs,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file_path=os.path.join(flags_obj.data_dir, flags_obj.vocab_file))
Example #7
def run_loop(name, train_input_fn, eval_input_fn, model_column_fn,
             build_estimator_fn, flags_obj, tensors_to_log, early_stop=False):
  """Define training loop."""
  model_helpers.apply_clean(flags.FLAGS)
  tf.logging.info('model type: %s', flags_obj.model_type)
  tf.logging.info('batch size: %s', flags_obj.batch_size)
  model = build_estimator_fn(
      model_dir=flags_obj.model_dir, model_type=flags_obj.model_type,
      model_column_fn=model_column_fn,
      inter_op=flags_obj.inter_op_parallelism_threads,
      intra_op=flags_obj.intra_op_parallelism_threads)

  run_params = {
      'batch_size': flags_obj.batch_size,
      'train_epochs': flags_obj.train_epochs,
      'model_type': flags_obj.model_type,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('wide_deep', name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
  tensors_to_log = {k: v.format(loss_prefix=loss_prefix)
                    for k, v in tensors_to_log.items()}
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log)

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    model.train(input_fn=train_input_fn, hooks=train_hooks)

    results = model.evaluate(input_fn=eval_input_fn)

    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * flags_obj.epochs_between_evals,
                    flags_obj.train_epochs)
    tf.logging.info('-' * 60)

    for key in sorted(results):
      tf.logging.info('%s: %s' % (key, results[key]))

    benchmark_logger.log_evaluation_result(results)

    if early_stop and model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    export_model(model, flags_obj.model_type, flags_obj.export_dir,
                 model_column_fn)
Example #8
def run_wide_deep(flags_obj):
    shutil.rmtree(flags_obj.model_dir, ignore_errors=True)
    model = build_estimator(flags_obj.model_dir, flags_obj.model_type)

    train_file = os.path.join(flags_obj.data_dir, 'train.data')
    test_file = os.path.join(flags_obj.data_dir, 'test.data')

    # Train and evaluate the model every `flags.epochs_between_evals` epochs.
    def train_input_fn():
        return input_fn(
            train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)

    def eval_input_fn():
        return input_fn(test_file, 1, False, flags_obj.batch_size)

    run_params = {
        'batch_size': flags_obj.batch_size,
        'train_epochs': flags_obj.train_epochs,
        'model_type': flags_obj.model_type,
    }

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('wide_deep', 'Yelp POI', run_params,
                                    test_id=flags_obj.benchmark_test_id)

    loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks, batch_size=flags_obj.batch_size,
        tensors_to_log={'average_loss': loss_prefix + 'head/truediv',
                        'loss': loss_prefix + 'head/weighted_loss/Sum'})

    # Train and evaluate the model every `flags.epochs_between_evals` epochs.
    for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
        model.train(input_fn=train_input_fn, hooks=train_hooks)
        results = model.evaluate(input_fn=eval_input_fn)

        # Display evaluation metrics
        tf.logging.info('Results at epoch %d / %d',
                        (n + 1) * flags_obj.epochs_between_evals,
                        flags_obj.train_epochs)
        tf.logging.info('-' * 50)

        for key in sorted(results):
            tf.logging.info('%s: %s' % (key, results[key]))

        benchmark_logger.log_evaluation_result(results)

        if model_helpers.past_stop_threshold(
            flags_obj.stop_threshold, results['accuracy']):
            break

    if flags_obj.export_dir is not None:
        export_model(model, flags_obj.model_type, flags_obj.export_dir)
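The `input_fn` referenced by `train_input_fn` and `eval_input_fn` above is not included in this excerpt. In the wide-and-deep examples it typically reads a CSV file and batches it roughly as sketched below; the column names and defaults here are placeholders, not the actual schema:

import tensorflow as tf

_CSV_COLUMNS = ["feature_a", "feature_b", "label"]   # placeholder schema
_CSV_COLUMN_DEFAULTS = [[0.0], [0.0], [0]]

def input_fn(data_file, num_epochs, shuffle, batch_size):
    """Parse a CSV file into a (features, label) dataset for the Estimator."""
    def parse_csv(line):
        columns = tf.decode_csv(line, record_defaults=_CSV_COLUMN_DEFAULTS)
        features = dict(zip(_CSV_COLUMNS, columns))
        label = features.pop("label")
        return features, label

    dataset = tf.data.TextLineDataset(data_file)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
    return dataset.map(parse_csv).repeat(num_epochs).batch(batch_size)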
def run_loop(name, train_input_fn, eval_input_fn, model_column_fn,
             build_estimator_fn, flags_obj, tensors_to_log, early_stop=False):
  """Define training loop."""
  model_helpers.apply_clean(flags.FLAGS)
  model = build_estimator_fn(
      model_dir=flags_obj.model_dir, model_type=flags_obj.model_type,
      model_column_fn=model_column_fn,
      inter_op=flags_obj.inter_op_parallelism_threads,
      intra_op=flags_obj.intra_op_parallelism_threads)

  run_params = {
      'batch_size': flags_obj.batch_size,
      'train_epochs': flags_obj.train_epochs,
      'model_type': flags_obj.model_type,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('wide_deep', name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
  tensors_to_log = {k: v.format(loss_prefix=loss_prefix)
                    for k, v in tensors_to_log.items()}
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log)

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    model.train(input_fn=train_input_fn, hooks=train_hooks)

    results = model.evaluate(input_fn=eval_input_fn)

    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * flags_obj.epochs_between_evals,
                    flags_obj.train_epochs)
    tf.logging.info('-' * 60)

    for key in sorted(results):
      tf.logging.info('%s: %s' % (key, results[key]))

    benchmark_logger.log_evaluation_result(results)

    if early_stop and model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    export_model(model, flags_obj.model_type, flags_obj.export_dir,
                 model_column_fn)
Example #10
def get_logging_metric_hook(tensors_to_log=None, every_n_secs=600, **kwargs):  # pylint: disable=unused-argument
    """Function to get LoggingMetricHook.

  Args:
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If not set, log _TENSORS_TO_LOG by default.
    every_n_secs: `int`, the frequency for logging the metric. Default to every
      10 mins.

  Returns:
    Returns a LoggingMetricHook that saves tensor values in a JSON format.
  """
    if tensors_to_log is None:
        tensors_to_log = _TENSORS_TO_LOG
    return metric_hook.LoggingMetricHook(
        tensors=tensors_to_log,
        metric_logger=logger.get_benchmark_logger(),
        every_n_secs=every_n_secs)
Example #11
def get_logging_metric_hook(tensors_to_log=None, every_n_secs=600, **kwargs):  # pylint: disable=unused-argument
    """Function to get LoggingMetricHook.

  Args:
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If not set, log _TENSORS_TO_LOG by default.
    every_n_secs: `int`, the frequency for logging the metric. Default to every
      10 mins.

  Returns:
    Returns a ProfilerHook that writes out timelines that can be loaded into
    profiling tools like chrome://tracing.
  """
    if tensors_to_log is None:
        tensors_to_log = _TENSORS_TO_LOG
    return metric_hook.LoggingMetricHook(
        tensors=tensors_to_log,
        metric_logger=logger.get_benchmark_logger(),
        every_n_secs=every_n_secs)
Example #12
def get_logging_metric_hook(tensors_to_log=None,
                            every_n_secs=600,
                            **kwargs):  # pylint: disable=unused-argument
  """Function to get LoggingMetricHook.

  Args:
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If not set, log _TENSORS_TO_LOG by default.
    every_n_secs: `int`, the frequency for logging the metric. Defaults to every
      10 minutes.

  Returns:
    Returns a LoggingMetricHook that saves tensor values in a JSON format.
  """
  if tensors_to_log is None:
    tensors_to_log = _TENSORS_TO_LOG
  return metric_hook.LoggingMetricHook(
      tensors=tensors_to_log,
      metric_logger=logger.get_benchmark_logger(),
      every_n_secs=every_n_secs)
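For context, `hooks_helper.get_train_hooks` (called throughout these examples) dispatches a list of hook names to factory functions like the ones shown in this listing. A simplified sketch of that dispatch, with an illustrative registry rather than the exact upstream mapping:

# Hypothetical registry; the real hooks_helper maps more hook names.
HOOKS = {
    "examplespersecondhook": get_examples_per_second_hook,
    "loggingmetrichook": get_logging_metric_hook,
}

def get_train_hooks(name_list, **kwargs):
    """Build one SessionRunHook per requested name, forwarding shared kwargs."""
    if not name_list:
        return []
    train_hooks = []
    for name in name_list:
        hook_fn = HOOKS.get(name.strip().lower())
        if hook_fn is None:
            raise ValueError("Unrecognized training hook requested: {}".format(name))
        train_hooks.append(hook_fn(**kwargs))
    return train_hooks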
Example #13
def get_examples_per_second_hook(every_n_steps=100,
                                 batch_size=128,
                                 warm_steps=5,
                                 **kwargs):  # pylint: disable=unused-argument
    """Function to get ExamplesPerSecondHook.

    Args:
      every_n_steps: `int`, print current and average examples per second every
        N steps.
      batch_size: `int`, total batch size used to calculate examples/second from
        global time.
      warm_steps: skip this number of steps before logging and running average.
      **kwargs: a dictionary of arguments to ExamplesPerSecondHook.

    Returns:
      Returns an ExamplesPerSecondHook that logs the current and average number
      of examples processed per second.
    """
    return hooks.ExamplesPerSecondHook(
        batch_size=batch_size, every_n_steps=every_n_steps,
        warm_steps=warm_steps, metric_logger=logger.get_benchmark_logger())
Example #14
def get_examples_per_second_hook(every_n_steps=100,
                                 batch_size=128,
                                 warm_steps=5,
                                 **kwargs):  # pylint: disable=unused-argument
  """Function to get ExamplesPerSecondHook.

  Args:
    every_n_steps: `int`, print current and average examples per second every
      N steps.
    batch_size: `int`, total batch size used to calculate examples/second from
      global time.
    warm_steps: skip this number of steps before logging and running average.
    **kwargs: a dictionary of arguments to ExamplesPerSecondHook.

  Returns:
    Returns an ExamplesPerSecondHook that logs the current and average number
    of examples processed per second.
  """
  return hooks.ExamplesPerSecondHook(
      batch_size=batch_size, every_n_steps=every_n_steps,
      warm_steps=warm_steps, metric_logger=logger.get_benchmark_logger())
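For intuition, a simplified version of what `ExamplesPerSecondHook` measures; the real hook in `official.utils.logs.hooks` also tracks a running average over the whole run and skips `warm_steps`, so this is only a sketch:

import time
import tensorflow as tf

class SimpleExamplesPerSecondHook(tf.train.SessionRunHook):
    """Logs examples/sec every `every_n_steps` from wall time and batch size."""

    def __init__(self, batch_size, every_n_steps=100):
        self._batch_size = batch_size
        self._every_n_steps = every_n_steps
        self._step = 0
        self._last_time = None

    def after_create_session(self, session, coord):
        self._last_time = time.time()

    def after_run(self, run_context, run_values):
        self._step += 1
        if self._step % self._every_n_steps == 0:
            elapsed = time.time() - self._last_time
            tf.logging.info("examples/sec: %g",
                            self._batch_size * self._every_n_steps / elapsed)
            self._last_time = time.time()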
def get_logging_metric_hook(tensors_to_log=None,
                            every_n_secs=600,
                            **kwargs):  # pylint: disable=unused-argument
  """Function to get LoggingMetricHook.

  Args:
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If not set, log _TENSORS_TO_LOG by default.
    every_n_secs: `int`, the frequency for logging the metric. Defaults to every
      10 minutes.

  Returns:
    Returns a LoggingMetricHook that saves tensor values in a JSON format.
  """
  if tensors_to_log is None:
    tensors_to_log = _TENSORS_TO_LOG
  return metric_hook.LoggingMetricHook(
      tensors=tensors_to_log,
      metric_logger=logger.get_benchmark_logger(),
      every_n_secs=every_n_secs)
def log_and_get_hooks(eval_batch_size):
    """Convenience function for hook and logger creation."""
    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    return benchmark_logger, train_hooks
Example #17
def log_and_get_hooks(eval_batch_size):
  """Convenience function for hook and logger creation."""
  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  return benchmark_logger, train_hooks
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.

  Returns:
    Dict of results of the run.  Contains the keys `eval_results`,
    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list the
    instances of hooks used during training.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["vocab_file"] = flags_obj.vocab_file
    params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["max_length"] = flags_obj.max_length or params["max_length"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    total_batch_size = params["batch_size"]
    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_replica_batch_size(
            params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=total_batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    stats = run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)

    if flags_obj.export_dir and not params["use_tpu"]:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file},
            strip_default_attrs=True)
    return stats
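After `export_savedmodel`, the SavedModel can be reloaded for inference. A minimal sketch using the TF 1.x predictor API; the export path is hypothetical (export_savedmodel writes a timestamped subdirectory) and the feed key must match whatever the serving input receiver defines:

import tensorflow as tf

saved_model_dir = "/tmp/transformer_export/1554321098"  # hypothetical timestamped export dir
predict_fn = tf.contrib.predictor.from_saved_model(saved_model_dir)
outputs = predict_fn({"input": [[17, 23, 5, 1]]})  # token ids; key depends on the serving signature
print(outputs)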
Example #19
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    if FLAGS.use_synthetic_data:
        ncf_dataset = None
        cleanup_fn = lambda: None
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_neg=FLAGS.num_neg,
            epochs_per_cycle=FLAGS.epochs_between_evals,
            match_mlperf=FLAGS.ml_perf,
            deterministic=FLAGS.seed is not None,
            use_subprocess=FLAGS.use_subprocess,
            cache_id=FLAGS.cache_id)
        num_users = ncf_dataset.num_users
        num_items = ncf_dataset.num_items
        num_train_steps = int(
            np.ceil(FLAGS.epochs_between_evals *
                    ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) /
                    FLAGS.batch_size))
        num_eval_steps = int(
            np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                    eval_batch_size))

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "use_seed": FLAGS.seed is not None,
            "hash_pipeline": FLAGS.hash_pipeline,
            "batch_size": batch_size,
            "eval_batch_size": eval_batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": num_users,
            "num_items": num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "num_neg": FLAGS.num_neg,
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
            "beta1": FLAGS.beta1,
            "beta2": FLAGS.beta2,
            "epsilon": FLAGS.epsilon,
            "match_mlperf": FLAGS.ml_perf,
            "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    pred_input_fn = None
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
        assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                value=cycle_index)

        # Train the model
        train_input_fn, train_record_dir, batch_count = (
            data_preprocessing.make_input_fn(
                ncf_dataset=ncf_dataset, is_training=True))

        if batch_count != num_train_steps:
            raise ValueError(
                "Step counts do not match. ({} vs. {}) The async process is "
                "producing incorrect shards.".format(batch_count,
                                                     num_train_steps))

        train_estimator.train(input_fn=train_input_fn,
                              hooks=train_hooks,
                              steps=num_train_steps)
        if train_record_dir:
            tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        if pred_input_fn is None:
            pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
                ncf_dataset=ncf_dataset, is_training=False)

            if eval_batch_count != num_eval_steps:
                raise ValueError(
                    "Step counts do not match. ({} vs. {}) The async process is "
                    "producing incorrect shards.".format(
                        eval_batch_count, num_eval_steps))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = eval_estimator.evaluate(pred_input_fn,
                                               steps=num_eval_steps)
        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        tf.logging.info("Evaluation complete.")

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                                value={
                                    "epoch": cycle_index,
                                    "value": FLAGS.hr_threshold
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                                value={
                                    "epoch": cycle_index,
                                    "value": hr
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                                value={
                                    "epoch": cycle_index,
                                    "value": rconst.NUM_EVAL_NEGATIVES
                                })

        # Logged by the async process during record creation.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                                deferred=True)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                value=cycle_index)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
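Two small helpers used throughout these examples, `distribution_utils.per_device_batch_size` and `model_helpers.past_stop_threshold`, behave roughly as follows (simplified sketches, not the exact upstream code):

def per_device_batch_size(batch_size, num_gpus):
    """Split the global batch size across GPUs, requiring even divisibility."""
    if num_gpus <= 1:
        return batch_size
    if batch_size % num_gpus:
        raise ValueError("batch_size ({}) must be divisible by num_gpus ({})"
                         .format(batch_size, num_gpus))
    return batch_size // num_gpus

def past_stop_threshold(stop_threshold, eval_metric):
    """Return True once an evaluation metric reaches the requested threshold."""
    if stop_threshold is None:
        return False
    return eval_metric >= stop_threshold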
Example #20
def run_keras_model_benchmark(_):
    """Run the benchmark on keras model."""
    new_job_thread = threading.Thread(target=receive,
                                      args=(
                                          FLAGS.server_address.split(':')[0],
                                          FLAGS.port,
                                      ),
                                      daemon=True)
    new_job_thread.start()
    # Ensure a valid model name was supplied via command line argument
    if FLAGS.model not in MODELS.keys():
        raise AssertionError("The --model command line argument should "
                             "be a key in the `MODELS` dictionary.")
    # print(FLAGS.gpus_list)
    # exit()
    # Check if eager execution is enabled
    if FLAGS.eager:
        tf.logging.info("Eager execution is enabled...")
        tf.enable_eager_execution()

    # Load the model
    tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
    keras_model = MODELS[FLAGS.model]
    model = keras_model(weights=None)

    # Get dataset
    dataset_name = "ImageNet"
    if FLAGS.use_synthetic_data:
        tf.logging.info("Using synthetic dataset...")
        dataset_name += "_Synthetic"
        train_dataset = dataset.generate_synthetic_input_dataset(
            FLAGS.model, FLAGS.batch_size)
        val_dataset = dataset.generate_synthetic_input_dataset(
            FLAGS.model, FLAGS.batch_size)
    else:
        raise ValueError("Only synthetic dataset is supported!")

    num_gpus = flags_core.get_num_gpus(FLAGS)

    distribution = None
    # Use distribution strategy
    if FLAGS.dist_strat:
        distribution = distribution_utils.get_distribution_strategy(
            num_gpus=num_gpus)
    elif num_gpus > 1:
        # Run with multi_gpu_model
        # If eager execution is enabled, only one GPU is utilized even if multiple
        # GPUs are provided.
        if FLAGS.eager:
            tf.logging.warning(
                "{} GPUs are provided, but only one GPU is utilized as "
                "eager execution is enabled.".format(num_gpus))
        model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

    # Adam optimizer and some other optimizers doesn't work well with
    # distribution strategy (b/113076709)
    # Use GradientDescentOptimizer here
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"],
                  distribute=distribution)

    # Create benchmark logger for benchmark logging
    run_params = {
        "batch_size": FLAGS.batch_size,
        "synthetic_data": FLAGS.use_synthetic_data,
        "train_epochs": FLAGS.train_epochs,
        "num_train_images": FLAGS.num_train_images,
        "num_eval_images": FLAGS.num_eval_images,
    }

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name=FLAGS.model,
                                  dataset_name=dataset_name,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    class LossHistory(tf.keras.callbacks.Callback):
        def __init__(self):
            self.start = time.time()

        def on_train_begin(self, logs={}):
            return

        def on_epoch_end(self, epoch, logs={}):
            global training_flags, have_trained
            if job_status in ('g', 's'):
                training_flags = 1
                have_trained = epoch + 1
                self.model.stop_training = True

        def on_batch_end(self, batch, logs={}):
            global lock
            if batch == 49 and lock is True:
                hundred = time.time() - self.start
                # calculate the speed and unlock job
                msg = {}
                msg['id'] = FLAGS.id
                msg['status'] = 'un'
                msg['ep_tm'] = FLAGS.num_train_images * hundred / (
                    FLAGS.batch_size * 50)
                send_msg(FLAGS.server_address, msg)
                lock = False

    # Create callbacks that log metric values about the training and evaluation
    callbacks = model_callbacks.get_model_callbacks(
        FLAGS.callbacks,
        batch_size=FLAGS.batch_size,
        metric_logger=benchmark_logger)
    callbacks.append(LossHistory())
    # Train and evaluate the model
    history = model.fit(
        train_dataset,
        epochs=FLAGS.train_epochs,
        callbacks=callbacks,
        validation_data=val_dataset,
        steps_per_epoch=int(np.ceil(FLAGS.num_train_images /
                                    FLAGS.batch_size)),
    )
    ''' No need for evaluation part
    tf.logging.info("Logging the evaluation results...")
    for epoch in range(FLAGS.train_epochs):
        eval_results = {
                "accuracy": history.history["val_acc"][epoch],
                "loss": history.history["val_loss"][epoch],
                tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
                        FLAGS.num_eval_images/FLAGS.batch_size)
        }
        benchmark_logger.log_evaluation_result(eval_results)
    '''

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
    # Now end the training send back message
    msg = {}
    remain_ep = FLAGS.train_epochs - have_trained
    if training_flags == 0 or remain_ep == 0:
        msg['status'] = 'e'
        msg['id'] = FLAGS.id
        # send_msg(FLAGS.server_address, msg)
    else:
        # ask the scheduler to re-run
        # growing is needed
        gpus_loc = {}
        flags_gpu_list = [int(i) for i in FLAGS.gpus_list]
        if job_status == 'g':
            new_gpus_list = gpus + flags_gpu_list
            msg['status'] = 'g'
        else:
            new_gpus_list = list(set(flags_gpu_list).difference(set(gpus)))
            msg['status'] = 's'
        # TODO hardcoded here
        gpus_loc['localhost'] = new_gpus_list
        msg['gpus_loc'] = gpus_loc
        msg['id'] = FLAGS.id
        msg['ep'] = FLAGS.train_epochs - have_trained
        # send_msg(FLAGS.server_address, msg)

    global exit_code
    exit_code = True
    time.sleep(1)
    send_msg(FLAGS.server_address, msg)
    print('exit')
    exit()
Example #21
def get_examples_per_second_callback(
    every_n_steps=1, batch_size=32, metric_logger=None, **kwargs):  # pylint: disable=unused-argument
  """Function to get ExamplesPerSecondCallback."""
  return ExamplesPerSecondCallback(
      batch_size=batch_size, every_n_steps=every_n_steps,
      metric_logger=metric_logger or logger.get_benchmark_logger())
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
def densenet_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_densenet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

  model_helpers.apply_clean(flags.FLAGS)

  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  '''
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=1,
      intra_op_parallelism_threads=1,
      allow_soft_placement=True)
  '''
  
  session_config = tf.ConfigProto(allow_soft_placement=True)

  # sirius:
  distribution_strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy, session_config=session_config,
      save_summary_steps=500)
  
  # print all flags inside model main
  # for k,v in tf.flags.FLAGS.__flags.items():
  # print('=================================')
  # for k,v in flags_obj.items():
  #     print('***',v.__dict__['name'],v.__dict__['_value'])
  
  # Note: flags_obj here defines flags of several different types
  # print(flags_obj)

  train_dir = r'E:\denseNet\resnet_cifar10\train_dir'
  export_dir_all = r'E:\denseNet\resnet_cifar10\export_dir'
  model_name = 'd_{}_k_{}'.format(flags.FLAGS.d, flags.FLAGS.k)
  model_dir = os.path.join(train_dir, model_name)
  export_dir = os.path.join(export_dir_all, model_name)

  # Note flags
  # parameters that will be passed into model fn
  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=model_dir, config=run_config,
      params={
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          # network parameters
          'd': flags_obj.d,
          'k':flags_obj.k,
          'compressionRate':flags_obj.compressionRate,
          'expansion':flags_obj.expansion,
          'bottleneck':flags_obj.bottleneck
      })
  
  # Note flags
  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('densenet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=model_dir,
      batch_size=flags_obj.batch_size)

  def input_fn_train():
    return input_function(
        is_training=True, data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=flags_obj.epochs_between_evals,
        num_gpus=flags_core.get_num_gpus(flags_obj))

  def input_fn_eval():
    return input_function(
        is_training=False, data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)

  # print('*** total_training_cycle',total_training_cycle)

  for cycle_index in range(total_training_cycle):
    tf.logging.info('Starting a training cycle: %d/%d',
                    cycle_index, total_training_cycle)

    classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                     max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data, which
    # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
    # eval (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)

    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break
    
  # export model at last
  input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
       shape, batch_size=flags_obj.batch_size)
  classifier.export_savedmodel(export_dir, input_receiver_fn)
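The `export.build_tensor_serving_input_receiver_fn` helper used for export above builds the serving signature; a simplified sketch of what such a helper does (the placeholder name and single-tensor receiver are assumptions, not the exact upstream code):

import tensorflow as tf

def build_tensor_serving_input_receiver_fn(shape, dtype=tf.float32, batch_size=1):
    """Return a serving_input_receiver_fn that feeds a single dense tensor."""
    def serving_input_receiver_fn():
        features = tf.placeholder(dtype=dtype, shape=[batch_size] + shape,
                                  name="input_tensor")
        return tf.estimator.export.TensorServingInputReceiver(
            features=features, receiver_tensors=features)
    return serving_input_receiver_fn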
def run_keras_model_benchmark(_):
  """Run the benchmark on keras model."""
  # Ensure a valid model name was supplied via command line argument
  if FLAGS.model not in MODELS.keys():
    raise AssertionError("The --model command line argument should "
                         "be a key in the `MODELS` dictionary.")

  # Load the model
  tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
  keras_model = MODELS[FLAGS.model]
  model = keras_model(weights=None)

  # Get dataset
  dataset_name = "ImageNet"
  if FLAGS.use_synthetic_data:
    tf.logging.info("Using synthetic dataset...")
    dataset_name += "_Synthetic"
    train_num_images = FLAGS.batch_size
    val_num_images = FLAGS.batch_size
    train_dataset = dataset.generate_synthetic_input_dataset(
        FLAGS.model, train_num_images)
    val_dataset = dataset.generate_synthetic_input_dataset(
        FLAGS.model, val_num_images)
  else:
    raise ValueError("Only synthetic dataset is supported!")

  # If run with multiple GPUs
  num_gpus = flags_core.get_num_gpus(FLAGS)
  if num_gpus > 0:
    model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

  # Configure the model
  model.compile(loss="categorical_crossentropy",
                optimizer="sgd",
                metrics=["accuracy"])

  # Create benchmark logger for benchmark logging
  run_params = {
      "batch_size": FLAGS.batch_size,
      "synthetic_data": FLAGS.use_synthetic_data,
      "train_epochs": FLAGS.train_epochs
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name=FLAGS.model,
      dataset_name=dataset_name,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Create callbacks that log metric values about the training and evaluation
  callbacks = model_callbacks.get_model_callbacks(
      FLAGS.callbacks,
      batch_size=FLAGS.batch_size,
      metric_logger=benchmark_logger)
  # Train and evaluate the model
  history = model.fit(
      train_dataset,
      epochs=FLAGS.train_epochs,
      callbacks=callbacks,
      validation_data=val_dataset,
      steps_per_epoch=int(np.ceil(train_num_images / FLAGS.batch_size)),
      validation_steps=int(np.ceil(val_num_images / FLAGS.batch_size))
  )

  tf.logging.info("Logging the evaluation results...")
  for epoch in range(FLAGS.train_epochs):
    eval_results = {
        "accuracy": history.history["val_acc"][epoch],
        "loss": history.history["val_loss"][epoch],
        tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
            train_num_images/FLAGS.batch_size)
    }
    benchmark_logger.log_evaluation_result(eval_results)

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
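The next `run_loop` variant parses a dumped GraphDef and estimates per-tensor sizes; the excerpt omits its imports, which are presumably along these lines:

import json
from google.protobuf import text_format as pbtf          # used as pbtf.Parse(...)
from tensorflow.core.framework import graph_pb2 as gpb   # used as gpb.GraphDef()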
def run_loop(name, train_input_fn, eval_input_fn, model_column_fn,
             build_estimator_fn, flags_obj, tensors_to_log, early_stop=False):
  """Define training loop."""
  model_helpers.apply_clean(flags.FLAGS)
  model = build_estimator_fn(
      model_dir=flags_obj.model_dir, model_type=flags_obj.model_type,
      model_column_fn=model_column_fn)

  run_params = {
      'batch_size': flags_obj.batch_size,
      'train_epochs': flags_obj.train_epochs,
      'model_type': flags_obj.model_type,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('wide_deep', name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
  tensors_to_log = {k: v.format(loss_prefix=loss_prefix)
                    for k, v in tensors_to_log.items()}
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log)

  profiler_hook = tf.train.ProfilerHook(
      save_steps=100, save_secs=None, output_dir="profs",
      show_memory=True, show_dataflow=True)
  
  # Debug: parse the dumped GraphDef and estimate the size of each operation's output tensor.
  gdef = gpb.GraphDef()
 
  with open('/tmp/census_model/graph.pbtxt', 'r') as fh:
      graph_str = fh.read()

  pbtf.Parse(graph_str, gdef)

  with tf.Graph().as_default() as graph:
      tf.import_graph_def(gdef)

      operations_tensors = {}
      operations_names = tf.get_default_graph().get_operations()
      count1 = 0
      count2 = 0

      for operation in operations_names:
          operation_name = operation.name
          operations_info = tf.get_default_graph().get_operation_by_name(operation_name).values()
          if len(operations_info) > 0:
              if not (operations_info[0].shape.ndims is None):
                  operation_shape = operations_info[0].shape.as_list()
                  operation_dtype_size = operations_info[0].dtype.size
                  if not (operation_dtype_size is None):
                      operation_no_of_elements = 1
                      for dim in operation_shape:
                          if not(dim is None):
                              operation_no_of_elements = operation_no_of_elements * dim
                      total_size = operation_no_of_elements * operation_dtype_size
                      operations_tensors[operation_name] = total_size
                  else:
                      count1 = count1 + 1
              else:
                  count1 = count1 + 1
                  operations_tensors[operation_name] = -1
          else:
              count2 = count2 + 1
              operations_tensors[operation_name] = -1

      tf.logging.info('Ops with unknown dtype size or shape: %d', count1)
      tf.logging.info('Ops with no output tensors: %d', count2)

  with open('tensors_sz.json', 'w') as f:
      json.dump(operations_tensors, f)
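  # tensors_sz.json now maps operation names from the imported graph to the
  # static byte size of their first output tensor; ops with an unknown shape or
  # no outputs are recorded as -1, and ops whose dtype size is unknown are
  # skipped. This is purely a debugging aid and does not affect training below.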
  
  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
      model.train(input_fn=train_input_fn, hooks=[profiler_hook])

      results = model.evaluate(input_fn=eval_input_fn)

      # Display evaluation metrics
      tf.logging.info('Results at epoch %d / %d',
                      (n + 1) * flags_obj.epochs_between_evals,
                      flags_obj.train_epochs)
      tf.logging.info('-' * 60)

      for key in sorted(results):
        tf.logging.info('%s: %s' % (key, results[key]))

      benchmark_logger.log_evaluation_result(results)

      if early_stop and model_helpers.past_stop_threshold(
          flags_obj.stop_threshold, results['accuracy']):
        break

  # Export the model
  if flags_obj.export_dir is not None:
      export_model(model, flags_obj.model_type, flags_obj.export_dir,
                   model_column_fn)
Example #26
 def test_config_base_benchmark_logger(self):
   logger.config_benchmark_logger("")
   self.assertIsInstance(logger.get_benchmark_logger(),
                         logger.BaseBenchmarkLogger)
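# Read together with the later logger tests in this listing, these snippets
# suggest the dispatch rule (an inference from the tests, not from the logger
# module itself): an empty string selects BaseBenchmarkLogger, a directory
# path selects BenchmarkFileLogger, and the benchmark_logger_type flag can
# force a specific class, falling back to BaseBenchmarkLogger for unknown values.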
Example #27
def run_ncf(_):
  """Run NCF training and eval loop."""
  # Data preprocessing
  # The file name of training and test dataset
  train_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TRAIN_RATINGS_FILENAME)
  test_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_RATINGS_FILENAME)
  neg_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_NEG_FILENAME)

  assert os.path.exists(train_fname), (
      "Run data_download.py first to download and extract {} dataset".format(
          FLAGS.dataset))

  tf.logging.info("Data preprocessing...")
  ncf_dataset = dataset.data_preprocessing(
      train_fname, test_fname, neg_fname, FLAGS.num_neg)

  # Create NeuMF model and convert it to Estimator
  tf.logging.info("Creating Estimator from Keras model...")
  layers = [int(layer) for layer in FLAGS.layers]
  mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
  keras_model = neumf_model.NeuMF(
      ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors,
      layers, FLAGS.batch_size, FLAGS.mf_regularization,
      mlp_regularization)
  num_gpus = flags_core.get_num_gpus(FLAGS)
  estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Training and evaluation cycle
  def train_input_fn():
    return dataset.input_fn(
        True,
        distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.epochs_between_evals)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    estimator.train(input_fn=train_input_fn, hooks=train_hooks)

    # Evaluate the model
    eval_results = evaluate_model(
        estimator, FLAGS.batch_size, num_gpus, ncf_dataset)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
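# A minimal sketch (an assumption, not this repo's code) of what the
# convert_keras_to_estimator helper used above might do: compile the Keras
# model and wrap it with tf.keras.estimator.model_to_estimator, distributing
# across GPUs via the same distribution_utils helper these snippets rely on.
def convert_keras_to_estimator_sketch(keras_model, num_gpus, model_dir=None):
  keras_model.compile(optimizer="adam", loss="binary_crossentropy")
  distribution = distribution_utils.get_distribution_strategy(num_gpus)
  run_config = tf.estimator.RunConfig(train_distribute=distribution)
  return tf.keras.estimator.model_to_estimator(
      keras_model=keras_model, model_dir=model_dir, config=run_config)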
Example #28
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

    model_helpers.apply_clean(flags.FLAGS)

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config)

    # Initialize the model with all but the dense layer from the pretrained ResNet.
    if flags_obj.pretrained_model_checkpoint_path is not None:
        warm_start_settings = tf.estimator.WarmStartSettings(
            flags_obj.pretrained_model_checkpoint_path,
            vars_to_warm_start='^(?!.*dense)')
    else:
        warm_start_settings = None

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj),
            'fine_tune': flags_obj.fine_tune
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train(num_epochs):
        return input_function(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=num_epochs,
            num_gpus=flags_core.get_num_gpus(flags_obj),
            dtype=flags_core.get_tf_dtype(flags_obj))

    def input_fn_eval():
        return input_function(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1,
            dtype=flags_core.get_tf_dtype(flags_obj))

    if flags_obj.eval_only or not flags_obj.train_epochs:
        # If --eval_only is set, perform a single loop with zero train epochs.
        schedule, n_loops = [0], 1
    else:
        # Compute the number of times to loop while training. All but the last
        # pass will train for `epochs_between_evals` epochs, while the last will
        # train for the number needed to reach `training_epochs`. For instance if
        #   train_epochs = 25 and epochs_between_evals = 10
        # schedule will be set to [10, 10, 5]. That is to say, the loop will:
        #   Train for 10 epochs and then evaluate.
        #   Train for another 10 epochs and then evaluate.
        #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
        n_loops = math.ceil(flags_obj.train_epochs /
                            flags_obj.epochs_between_evals)
        schedule = [
            flags_obj.epochs_between_evals for _ in range(int(n_loops))
        ]
        schedule[-1] = flags_obj.train_epochs - sum(
            schedule[:-1])  # over counting.

    for cycle_index, num_train_epochs in enumerate(schedule):
        tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

        if num_train_epochs:
            classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                             hooks=train_hooks,
                             max_steps=flags_obj.max_train_steps)

        tf.logging.info('Starting to evaluate.')

        # flags_obj.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data, which
        # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
        # eval (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags_obj.max_train_steps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags_obj.batch_size)
        classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
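# Hedged sketch of what export.build_tensor_serving_input_receiver_fn (used
# just above) plausibly produces: a serving_input_receiver_fn that feeds one
# dense image placeholder of the given shape to the exported SavedModel.
def tensor_serving_input_receiver_fn_sketch(shape, batch_size=1, dtype=tf.float32):
    def serving_input_receiver_fn():
        features = tf.placeholder(dtype=dtype, shape=[batch_size] + shape,
                                  name='input_tensor')
        return tf.estimator.export.TensorServingInputReceiver(
            features=features, receiver_tensors=features)
    return serving_input_receiver_fn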
Example #29
 def test_config_benchmark_bigquery_logger(self, mock_bigquery_client):
   with flagsaver.flagsaver(benchmark_logger_type='BenchmarkBigQueryLogger'):
     logger.config_benchmark_logger()
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BenchmarkBigQueryLogger)
Example #30
 def test_config_benchmark_file_logger(self):
   logger.config_benchmark_logger("/tmp/abc")
   self.assertIsInstance(logger.get_benchmark_logger(),
                         logger.BenchmarkFileLogger)
Example #31
 def test_config_base_benchmark_logger(self):
   with flagsaver.flagsaver(benchmark_logger_type='BaseBenchmarkLogger'):
     logger.config_benchmark_logger()
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BaseBenchmarkLogger)
Example #32
 def test_get_default_benchmark_logger(self):
   with flagsaver.flagsaver(benchmark_logger_type='foo'):
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BaseBenchmarkLogger)
Example #33
def run_deep_speech(_):
  """Run deep speech training and eval loop."""
  tf.set_random_seed(flags_obj.seed)
  # Data preprocessing
  tf.logging.info("Data preprocessing...")
  train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
  eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

  # Number of label classes. Label string is "[a-z]' -"
  num_classes = len(train_speech_dataset.speech_labels)

  # Use distribution strategy for multi-gpu training
  num_gpus = flags_core.get_num_gpus(flags_obj)
  distribution_strategy = distribution_utils.get_distribution_strategy(num_gpus)
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      model_dir=flags_obj.model_dir,
      config=run_config,
      params={
          "num_classes": num_classes,
      }
  )

  # Benchmark logging
  run_params = {
      "batch_size": flags_obj.batch_size,
      "train_epochs": flags_obj.train_epochs,
      "rnn_hidden_size": flags_obj.rnn_hidden_size,
      "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
      "rnn_type": flags_obj.rnn_type,
      "is_bidirectional": flags_obj.is_bidirectional,
      "use_bias": flags_obj.use_bias
  }

  dataset_name = "LibriSpeech"
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info("deep_speech", dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  per_device_batch_size = distribution_utils.per_device_batch_size(
      flags_obj.batch_size, num_gpus)

  def input_fn_train():
    return dataset.input_fn(
        per_device_batch_size, train_speech_dataset)

  def input_fn_eval():
    return dataset.input_fn(
        per_device_batch_size, eval_speech_dataset)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: %d/%d",
                    cycle_index + 1, total_training_cycle)

    # Perform batch_wise dataset shuffling
    train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle(
        train_speech_dataset.entries, cycle_index, flags_obj.sortagrad,
        flags_obj.batch_size)

    estimator.train(input_fn=input_fn_train, hooks=train_hooks)

    # Evaluation
    tf.logging.info("Starting to evaluate...")

    eval_results = evaluate_model(
        estimator, eval_speech_dataset.speech_labels,
        eval_speech_dataset.entries, input_fn_eval)

    # Log the WER and CER results.
    benchmark_logger.log_evaluation_result(eval_results)
    tf.logging.info(
        "Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY]))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(
        flags_obj.wer_threshold, eval_results[_WER_KEY]):
      break
Example #34
 def test_config_base_benchmark_logger(self):
   with flagsaver.flagsaver(benchmark_logger_type="BaseBenchmarkLogger"):
     logger.config_benchmark_logger()
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BaseBenchmarkLogger)
Example #35
 def test_config_benchmark_bigquery_logger(self, mock_bigquery_client):
   with flagsaver.flagsaver(benchmark_logger_type="BenchmarkBigQueryLogger"):
     logger.config_benchmark_logger()
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BenchmarkBigQueryLogger)
Example #36
def run_keras_model_benchmark(_):
    """Run the benchmark on keras model."""
    # Ensure a valid model name was supplied via command line argument
    if FLAGS.model not in MODELS.keys():
        raise AssertionError("The --model command line argument should "
                             "be a key in the `MODELS` dictionary.")

    # Check if eager execution is enabled
    if FLAGS.eager:
        tf.logging.info("Eager execution is enabled...")
        tf.enable_eager_execution()

    # Load the model
    tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
    keras_model = MODELS[FLAGS.model]

    # Get dataset
    dataset_name = "ImageNet"
    if FLAGS.use_synthetic_data:
        tf.logging.info("Using synthetic dataset...")
        dataset_name += "_Synthetic"
        train_dataset = dataset.generate_synthetic_input_dataset(
            FLAGS.model, FLAGS.batch_size)
        val_dataset = dataset.generate_synthetic_input_dataset(
            FLAGS.model, FLAGS.batch_size)
        model = keras_model(weights=None)
    else:
        tf.logging.info("Using CIFAR-10 dataset...")
        dataset_name = "CIFAR-10"
        ds = dataset.Cifar10Dataset(FLAGS.batch_size)
        train_dataset = ds.train_dataset
        val_dataset = ds.test_dataset
        model = keras_model(weights=None,
                            input_shape=ds.input_shape,
                            classes=ds.num_classes)

    num_gpus = flags_core.get_num_gpus(FLAGS)

    distribution = None
    # Use distribution strategy
    if FLAGS.dist_strat:
        distribution = distribution_utils.get_distribution_strategy(
            num_gpus=num_gpus)
    elif num_gpus > 1:
        # Run with multi_gpu_model
        # If eager execution is enabled, only one GPU is utilized even if multiple
        # GPUs are provided.
        if FLAGS.eager:
            tf.logging.warning(
                "{} GPUs are provided, but only one GPU is utilized as "
                "eager execution is enabled.".format(num_gpus))
        model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

    # The Adam optimizer and some other optimizers don't work well with
    # distribution strategy (b/113076709), so use GradientDescentOptimizer here.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"],
                  distribute=distribution)

    # Create benchmark logger for benchmark logging
    run_params = {
        "batch_size": FLAGS.batch_size,
        "synthetic_data": FLAGS.use_synthetic_data,
        "train_epochs": FLAGS.train_epochs,
        "num_train_images": FLAGS.num_train_images,
        "num_eval_images": FLAGS.num_eval_images,
    }

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name=FLAGS.model,
                                  dataset_name=dataset_name,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    # Create callbacks that log metric values about the training and evaluation
    callbacks = model_callbacks.get_model_callbacks(
        FLAGS.callbacks,
        batch_size=FLAGS.batch_size,
        metric_logger=benchmark_logger)
    # Train and evaluate the model
    history = model.fit(train_dataset,
                        epochs=FLAGS.train_epochs,
                        callbacks=callbacks,
                        validation_data=val_dataset,
                        steps_per_epoch=int(
                            np.ceil(FLAGS.num_train_images /
                                    FLAGS.batch_size)),
                        validation_steps=int(
                            np.ceil(FLAGS.num_eval_images / FLAGS.batch_size)))

    tf.logging.info("Logging the evaluation results...")
    for epoch in range(FLAGS.train_epochs):
        eval_results = {
            "accuracy": history.history["val_acc"][epoch],
            "loss": history.history["val_loss"][epoch],
            tf.GraphKeys.GLOBAL_STEP:
                (epoch + 1) * np.ceil(FLAGS.num_eval_images / FLAGS.batch_size)
        }
        benchmark_logger.log_evaluation_result(eval_results)

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
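# An illustrative guess (not the benchmark's implementation) at the shape of
# dataset.generate_synthetic_input_dataset used above: a single batch of
# random images and one-hot labels, repeated endlessly.
def generate_synthetic_input_dataset_sketch(batch_size,
                                            image_shape=(224, 224, 3),
                                            num_classes=1000):
    images = tf.random_uniform([batch_size] + list(image_shape))
    labels = tf.one_hot(
        tf.random_uniform([batch_size], maxval=num_classes, dtype=tf.int32),
        num_classes)
    return tf.data.Dataset.from_tensors((images, labels)).repeat()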
Example #37
 def test_get_default_benchmark_logger(self):
   self.assertIsInstance(logger.get_benchmark_logger(),
                         logger.BaseBenchmarkLogger)
Example #38
 def test_get_default_benchmark_logger(self):
   with flagsaver.flagsaver(benchmark_logger_type="foo"):
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BaseBenchmarkLogger)
Example #39
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    if flags_core.get_num_gpus(flags_obj) == 0:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
    elif flags_core.get_num_gpus(flags_obj) == 1:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
    else:
        distribution = tf.contrib.distribute.MirroredStrategy(
            num_gpus=flags_core.get_num_gpus(flags_obj))

    run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                        session_config=session_config)

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj)
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + "-synthetic"

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train():
        return input_function(is_training=True,
                              data_dir=flags_obj.data_dir,
                              batch_size=per_device_batch_size(
                                  flags_obj.batch_size,
                                  flags_core.get_num_gpus(flags_obj)),
                              num_epochs=flags_obj.epochs_between_evals,
                              num_gpus=flags_core.get_num_gpus(flags_obj))

    def input_fn_eval():
        return input_function(is_training=False,
                              data_dir=flags_obj.data_dir,
                              batch_size=per_device_batch_size(
                                  flags_obj.batch_size,
                                  flags_core.get_num_gpus(flags_obj)),
                              num_epochs=1)

    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info('Starting a training cycle: %d/%d', cycle_index,
                        total_training_cycle)

        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         max_steps=flags_obj.max_train_steps)

        tf.logging.info('Starting to evaluate.')

        # flags_obj.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data, which
        # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
        # eval (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags_obj.max_train_steps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags_obj.batch_size)
        classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
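# per_device_batch_size is called without a module prefix in the snippet above;
# a minimal sketch of its likely behavior (an assumption): split the global
# batch size evenly across GPUs and reject uneven splits.
def per_device_batch_size_sketch(batch_size, num_gpus):
    if num_gpus <= 1:
        return batch_size
    if batch_size % num_gpus:
        raise ValueError('batch_size (%d) must be divisible by num_gpus (%d)' %
                         (batch_size, num_gpus))
    return batch_size // num_gpus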
Example #40
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)
    movielens_dataset.construct_train_eval_csv(
        data_dir=FLAGS.data_dir, dataset=FLAGS.dataset)

  tf.logging.info("Data preprocessing...")
  ncf_dataset = movielens_dataset.data_preprocessing(
      FLAGS.data_dir, FLAGS.dataset, FLAGS.num_neg)

  model_helpers.apply_clean(flags.FLAGS)

  # Create NeuMF model and convert it to Estimator
  tf.logging.info("Creating Estimator from Keras model...")
  layers = [int(layer) for layer in FLAGS.layers]
  mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
  keras_model = neumf_model.NeuMF(
      ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors,
      layers, FLAGS.batch_size, FLAGS.mf_regularization,
      mlp_regularization)
  num_gpus = flags_core.get_num_gpus(FLAGS)
  estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Training and evaluation cycle
  def get_train_input_fn():
    return movielens_dataset.get_input_fn(
        True,
        distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.data_dir, FLAGS.dataset, FLAGS.epochs_between_evals)

  def get_pred_input_fn():
    return movielens_dataset.get_input_fn(
        False,
        distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.data_dir, FLAGS.dataset, 1)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    estimator.train(input_fn=get_train_input_fn(), hooks=train_hooks)

    # Evaluate the model
    eval_results = evaluate_model(
        estimator, FLAGS.batch_size, num_gpus, ncf_dataset, get_pred_input_fn())

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
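# Standard definitions of the metrics logged above (hedged: these may differ
# from the repo's exact evaluate_model implementation). With one held-out
# positive item per user ranked against sampled negatives:
#   HR@K   = fraction of users whose positive item appears in the top K.
#   NDCG@K = mean over users of 1 / log2(rank_of_positive + 1) when the
#            positive item is in the top K, and 0 otherwise.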
Example #41
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None,
                num_images=None,
                zeroshot_eval=False):
    model_helpers.apply_clean(flags.FLAGS)

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = config_utils.get_session_config(flags_obj)
    run_config = config_utils.get_run_config(flags_obj, flags_core,
                                             session_config,
                                             num_images['train'])
    tf.logging.info("ERR1!!!!")

    def gen_estimator(period=None):
        resnet_size = int(flags_obj.resnet_size)
        data_format = flags_obj.data_format
        batch_size = flags_obj.batch_size
        resnet_version = int(flags_obj.resnet_version)
        loss_scale = flags_core.get_loss_scale(flags_obj)
        dtype_tf = flags_core.get_tf_dtype(flags_obj)
        num_epochs_per_decay = flags_obj.num_epochs_per_decay
        learning_rate_decay_factor = flags_obj.learning_rate_decay_factor
        end_learning_rate = flags_obj.end_learning_rate
        learning_rate_decay_type = flags_obj.learning_rate_decay_type
        weight_decay = flags_obj.weight_decay
        zero_gamma = flags_obj.zero_gamma
        lr_warmup_epochs = flags_obj.lr_warmup_epochs
        base_learning_rate = flags_obj.base_learning_rate
        use_resnet_d = flags_obj.use_resnet_d
        use_dropblock = flags_obj.use_dropblock
        dropblock_kp = [float(be) for be in flags_obj.dropblock_kp]
        label_smoothing = flags_obj.label_smoothing
        momentum = flags_obj.momentum
        bn_momentum = flags_obj.bn_momentum
        train_epochs = flags_obj.train_epochs
        piecewise_lr_boundary_epochs = [
            int(be) for be in flags_obj.piecewise_lr_boundary_epochs
        ]
        piecewise_lr_decay_rates = [
            float(dr) for dr in flags_obj.piecewise_lr_decay_rates
        ]
        use_ranking_loss = flags_obj.use_ranking_loss
        use_se_block = flags_obj.use_se_block
        use_sk_block = flags_obj.use_sk_block
        mixup_type = flags_obj.mixup_type
        dataset_name = flags_obj.dataset_name
        kd_temp = flags_obj.kd_temp
        no_downsample = flags_obj.no_downsample
        anti_alias_filter_size = flags_obj.anti_alias_filter_size
        anti_alias_type = flags_obj.anti_alias_type
        cls_loss_type = flags_obj.cls_loss_type
        logit_type = flags_obj.logit_type
        embedding_size = flags_obj.embedding_size
        pool_type = flags_obj.pool_type
        arc_s = flags_obj.arc_s
        arc_m = flags_obj.arc_m
        bl_alpha = flags_obj.bl_alpha
        bl_beta = flags_obj.bl_beta
        exp = None

        if install_hyperdash and flags_obj.use_hyperdash:
            exp = Experiment(flags_obj.model_dir.split("/")[-1])
            resnet_size = exp.param("resnet_size", int(flags_obj.resnet_size))
            batch_size = exp.param("batch_size", flags_obj.batch_size)
            exp.param("dtype", flags_obj.dtype)
            learning_rate_decay_type = exp.param(
                "learning_rate_decay_type", flags_obj.learning_rate_decay_type)
            weight_decay = exp.param("weight_decay", flags_obj.weight_decay)
            zero_gamma = exp.param("zero_gamma", flags_obj.zero_gamma)
            lr_warmup_epochs = exp.param("lr_warmup_epochs",
                                         flags_obj.lr_warmup_epochs)
            base_learning_rate = exp.param("base_learning_rate",
                                           flags_obj.base_learning_rate)
            use_dropblock = exp.param("use_dropblock", flags_obj.use_dropblock)
            dropblock_kp = exp.param(
                "dropblock_kp", [float(be) for be in flags_obj.dropblock_kp])
            piecewise_lr_boundary_epochs = exp.param(
                "piecewise_lr_boundary_epochs",
                [int(be) for be in flags_obj.piecewise_lr_boundary_epochs])
            piecewise_lr_decay_rates = exp.param(
                "piecewise_lr_decay_rates",
                [float(dr) for dr in flags_obj.piecewise_lr_decay_rates])
            mixup_type = exp.param("mixup_type", flags_obj.mixup_type)
            dataset_name = exp.param("dataset_name", flags_obj.dataset_name)
            exp.param("autoaugment_type", flags_obj.autoaugment_type)

        classifier = tf.estimator.Estimator(
            model_fn=model_function,
            model_dir=flags_obj.model_dir,
            config=run_config,
            params={
                'resnet_size': resnet_size,
                'data_format': data_format,
                'batch_size': batch_size,
                'resnet_version': resnet_version,
                'loss_scale': loss_scale,
                'dtype': dtype_tf,
                'num_epochs_per_decay': num_epochs_per_decay,
                'learning_rate_decay_factor': learning_rate_decay_factor,
                'end_learning_rate': end_learning_rate,
                'learning_rate_decay_type': learning_rate_decay_type,
                'weight_decay': weight_decay,
                'zero_gamma': zero_gamma,
                'lr_warmup_epochs': lr_warmup_epochs,
                'base_learning_rate': base_learning_rate,
                'use_resnet_d': use_resnet_d,
                'use_dropblock': use_dropblock,
                'dropblock_kp': dropblock_kp,
                'label_smoothing': label_smoothing,
                'momentum': momentum,
                'bn_momentum': bn_momentum,
                'embedding_size': embedding_size,
                'train_epochs': train_epochs,
                'piecewise_lr_boundary_epochs': piecewise_lr_boundary_epochs,
                'piecewise_lr_decay_rates': piecewise_lr_decay_rates,
                'with_drawing_bbox': flags_obj.with_drawing_bbox,
                'use_ranking_loss': use_ranking_loss,
                'use_se_block': use_se_block,
                'use_sk_block': use_sk_block,
                'mixup_type': mixup_type,
                'kd_temp': kd_temp,
                'no_downsample': no_downsample,
                'dataset_name': dataset_name,
                'anti_alias_filter_size': anti_alias_filter_size,
                'anti_alias_type': anti_alias_type,
                'cls_loss_type': cls_loss_type,
                'logit_type': logit_type,
                'arc_s': arc_s,
                'arc_m': arc_m,
                'pool_type': pool_type,
                'bl_alpha': bl_alpha,
                'bl_beta': bl_beta,
                'train_steps': total_train_steps,
            })
        return classifier, exp

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train(num_epochs):
        return input_function(is_training=True,
                              use_random_crop=flags_obj.training_random_crop,
                              num_epochs=num_epochs,
                              flags_obj=flags_obj)

    def input_fn_eval():
        return input_function(is_training=False,
                              use_random_crop=False,
                              num_epochs=1,
                              flags_obj=flags_obj)

    ckpt_keeper = checkpoint_utils.CheckpointKeeper(
        save_dir=flags_obj.model_dir,
        num_to_keep=flags_obj.num_best_ckpt_to_keep,
        keep_epoch=flags_obj.keep_ckpt_every_eval,
        maximize=True)

    if zeroshot_eval:
        dataset = data_config.get_config(dataset_name)
        model = model_fns_predict.Model(
            int(flags_obj.resnet_size),
            flags_obj.data_format,
            resnet_version=int(flags_obj.resnet_version),
            num_classes=dataset.num_classes,
            zero_gamma=flags_obj.zero_gamma,
            use_se_block=flags_obj.use_se_block,
            use_sk_block=flags_obj.use_sk_block,
            no_downsample=flags_obj.no_downsample,
            anti_alias_filter_size=flags_obj.anti_alias_filter_size,
            anti_alias_type=flags_obj.anti_alias_type,
            bn_momentum=flags_obj.bn_momentum,
            embedding_size=flags_obj.embedding_size,
            pool_type=flags_obj.pool_type,
            bl_alpha=flags_obj.bl_alpha,
            bl_beta=flags_obj.bl_beta,
            dtype=flags_core.get_tf_dtype(flags_obj),
            loss_type=flags_obj.cls_loss_type)

    def train_and_evaluate(hooks):
        tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

        if num_train_epochs:
            classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                             hooks=hooks,
                             steps=flags_obj.max_train_steps)

        tf.logging.info('Starting to evaluate.')

        if zeroshot_eval:
            tf.reset_default_graph()
            eval_results = recall_metric.recall_at_k(
                flags_obj,
                flags_core,
                input_fns.input_fn_ir_eval,
                model,
                num_images['validation'],
                eval_similarity=flags_obj.eval_similarity,
                return_embedding=True)
        else:
            eval_results = classifier.evaluate(input_fn=input_fn_eval)

        return eval_results

    total_train_steps = flags_obj.train_epochs * int(
        num_images['train'] / flags_obj.batch_size)

    if flags_obj.eval_only or not flags_obj.train_epochs:
        schedule, n_loops = [0], 1
    elif flags_obj.export_only:
        schedule, n_loops = [], 0
    else:
        n_loops = math.ceil(flags_obj.train_epochs /
                            flags_obj.epochs_between_evals)
        schedule = [
            flags_obj.epochs_between_evals for _ in range(int(n_loops))
        ]
        schedule[-1] = flags_obj.train_epochs - sum(
            schedule[:-1])  # over counting.

        schedule = config_utils.get_epoch_schedule(flags_obj, schedule,
                                                   num_images)
        tf.logging.info('epoch schedule:')
        tf.logging.info(schedule)

    classifier, exp = gen_estimator()
    if flags_obj.pretrained_model_checkpoint_path:
        warm_start_hook = WarmStartHook(
            flags_obj.pretrained_model_checkpoint_path)
        train_hooks.append(warm_start_hook)

    for cycle_index, num_train_epochs in enumerate(schedule):
        tf.logging.info("ERR123!!!!")
        eval_results = train_and_evaluate(train_hooks)
        return eval_results
        if zeroshot_eval:
            metric = eval_results['recall_at_1']
        else:
            metric = eval_results['accuracy']
        tf.logging.info("ERR1234!!!!")
        ckpt_keeper.save(metric, flags_obj.model_dir)
        if exp:
            exp.metric("accuracy", metric)
        benchmark_logger.log_evaluation_result(eval_results)
        tf.logging.info("ERR12345!!!!")
        if model_helpers.past_stop_threshold(flags_obj.stop_threshold, metric):
            break
        if model_helpers.past_stop_threshold(total_train_steps,
                                             eval_results['global_step']):
            break

    if exp:
        exp.end()

    if flags_obj.export_dir is not None:
        export_utils.export_pb(flags_core, flags_obj, shape, classifier)
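# Minimal sketch of the hyperdash API used in gen_estimator above (assuming
# the standard hyperdash SDK; install_hyperdash is presumably a module-level
# import guard in the original file):
from hyperdash import Experiment

exp = Experiment("resnet_run")             # shows up in the hyperdash dashboard
lr = exp.param("base_learning_rate", 0.1)  # record a hyperparameter, returns the value
exp.metric("accuracy", 0.76)               # stream a metric sample
exp.end()                                  # close the experiment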
Example #42
def run_loop(name, train_input_fn, eval_input_fn, pred_input_fn, model_column_fn,
             build_estimator_fn, flags_obj, tensors_to_log, early_stop=False):
  """Define training loop."""
  model_helpers.apply_clean(flags.FLAGS)
  model = build_estimator_fn(
      model_dir=flags_obj.model_dir, model_type=flags_obj.model_type,
      model_column_fn=model_column_fn,
      inter_op=flags_obj.inter_op_parallelism_threads,
      intra_op=flags_obj.intra_op_parallelism_threads)

  run_params = {
      'batch_size': flags_obj.batch_size,
      'train_epochs': flags_obj.train_epochs,
      'model_type': flags_obj.model_type,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('wide_deep', name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
  tensors_to_log = {k: v.format(loss_prefix=loss_prefix)
                    for k, v in tensors_to_log.items()}
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log)

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    model.train(input_fn=train_input_fn, hooks=train_hooks)

    results = model.evaluate(input_fn=eval_input_fn)
    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * flags_obj.epochs_between_evals,
                    flags_obj.train_epochs)
    tf.logging.info('-' * 60)

    for key in sorted(results):
      tf.logging.info('%s: %s' % (key, results[key]))

    benchmark_logger.log_evaluation_result(results)

    # Early stopping on 'accuracy' only applies to classification, so it is
    # disabled for this run.
#    if early_stop and model_helpers.past_stop_threshold(
#        flags_obj.stop_threshold, results['accuracy']):
#      break
  
  with open("./submission.csv", 'wb') as fout_preds:
    preds = model.predict(input_fn=pred_input_fn)
    idx = 1
    for i in preds:
      pred = i["predictions"][0]
      final_res = pred 
#      final_res = -math.log(1/pred -1) # y=sigmoid(x) ==> x=-ln(1/y-1)
#      final_res = math.exp(pred) / 1000 # y=log(x) ==> x=exp(y)
      out_list = [idx, final_res]
      out_str = ",".join(map(str, out_list))
      fout_preds.write(out_str + "\n")
      idx += 1

  # Export the model
  if flags_obj.export_dir is not None:
    export_model(model, flags_obj.model_type, flags_obj.export_dir,
                 model_column_fn)
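# Hedged sketch of the export_model helper called above, assuming it follows
# the common wide-and-deep pattern: build a parsing serving input receiver
# from the model's feature columns and export a SavedModel.
def export_model_sketch(model, model_type, export_dir, model_column_fn):
  wide_columns, deep_columns = model_column_fn()
  if model_type == 'wide':
    columns = wide_columns
  elif model_type == 'deep':
    columns = deep_columns
  else:
    columns = wide_columns + deep_columns
  feature_spec = tf.feature_column.make_parse_example_spec(columns)
  serving_input_fn = (
      tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec))
  model.export_savedmodel(export_dir, serving_input_fn)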
Example #43
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run. Contains the keys `eval_results` and
    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
    `train_hooks` is a list of the hook instances used during training.
  """

    model_helpers.apply_clean(flags.FLAGS)

    # Ensures flag override logic is only executed if explicitly triggered.
    if flags_obj.tf_gpu_thread_mode:
        override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

    # Configures cluster spec for distribution strategy.
    num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                       flags_obj.task_index)

    # Creates session config. allow_soft_placement = True, is required for
    # multi-GPU and is not harmful for other modes.
    session_config = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_core.get_num_gpus(flags_obj),
        num_workers=num_workers,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs)

    # Creates a `RunConfig` that checkpoints every 24 hours which essentially
    # results in checkpoints determined only by `epochs_between_evals`.
    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config,
                                        save_checkpoints_secs=60 * 60 * 24,
                                        save_checkpoints_steps=None)

    # Initializes model with all but the dense layer from pretrained ResNet.
    if flags_obj.pretrained_model_checkpoint_path is not None:
        warm_start_settings = tf.estimator.WarmStartSettings(
            flags_obj.pretrained_model_checkpoint_path,
            vars_to_warm_start='^(?!.*dense)')
    else:
        warm_start_settings = None

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj),
            'fine_tune': flags_obj.fine_tune,
            'num_workers': num_workers,
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
        'num_workers': num_workers,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train(num_epochs, input_context=None):
        return input_function(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_replica_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=num_epochs,
            dtype=flags_core.get_tf_dtype(flags_obj),
            datasets_num_private_threads=flags_obj.
            datasets_num_private_threads,
            num_parallel_batches=flags_obj.datasets_num_parallel_batches,
            input_context=input_context)

    def input_fn_eval():
        return input_function(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_replica_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1,
            dtype=flags_core.get_tf_dtype(flags_obj))

    train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
                    flags_obj.train_epochs)

    use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
    if use_train_and_evaluate:
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda input_context=None: input_fn_train(
                train_epochs, input_context=input_context),
            hooks=train_hooks,
            max_steps=flags_obj.max_train_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
        tf.compat.v1.logging.info('Starting to train and evaluate.')
        tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
        # tf.estimator.train_and_evaluate doesn't return anything in the multi-worker
        # case.
        return {}
    else:
        if train_epochs == 0:
            # If --eval_only is set, perform a single loop with zero train epochs.
            schedule, n_loops = [0], 1
        else:
            # Compute the number of times to loop while training. All but the last
            # pass will train for `epochs_between_evals` epochs, while the last will
            # train for the number needed to reach `training_epochs`. For instance if
            #   train_epochs = 25 and epochs_between_evals = 10
            # schedule will be set to [10, 10, 5]. That is to say, the loop will:
            #   Train for 10 epochs and then evaluate.
            #   Train for another 10 epochs and then evaluate.
            #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
            n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
            schedule = [
                flags_obj.epochs_between_evals for _ in range(int(n_loops))
            ]
            schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.

        for cycle_index, num_train_epochs in enumerate(schedule):
            tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index,
                                      int(n_loops))

            if num_train_epochs:
                # Since we are calling classifier.train immediately in each loop, the
                # value of num_train_epochs in the lambda function will not be changed
                # before it is used. So it is safe to ignore the pylint error here
                # pylint: disable=cell-var-from-loop
                classifier.train(
                    input_fn=lambda input_context=None: input_fn_train(
                        num_train_epochs, input_context=input_context),
                    hooks=train_hooks,
                    max_steps=flags_obj.max_train_steps)

            # flags_obj.max_train_steps is generally associated with testing and
            # profiling. As a result it is frequently called with synthetic data,
            # which will iterate forever. Passing steps=flags_obj.max_train_steps
            # allows the eval (which is generally unimportant in those circumstances)
            # to terminate.  Note that eval will run for max_train_steps each loop,
            # regardless of the global_step count.
            tf.compat.v1.logging.info('Starting to evaluate.')
            eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                               steps=flags_obj.max_train_steps)

            benchmark_logger.log_evaluation_result(eval_results)

            if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                                 eval_results['accuracy']):
                break

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        export_dtype = flags_core.get_tf_dtype(flags_obj)
        if flags_obj.image_bytes_as_serving_input:
            input_receiver_fn = functools.partial(image_bytes_serving_input_fn,
                                                  shape,
                                                  dtype=export_dtype)
        else:
            input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
                shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
        classifier.export_savedmodel(flags_obj.export_dir,
                                     input_receiver_fn,
                                     strip_default_attrs=True)

    stats = {}
    stats['eval_results'] = eval_results
    stats['train_hooks'] = train_hooks

    return stats
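# Hypothetical caller of the resnet_main variant above; resnet_model_fn and
# imagenet_input_fn are placeholder names, not this repo's API.
stats = resnet_main(flags.FLAGS, resnet_model_fn, imagenet_input_fn,
                    dataset_name='ImageNet', shape=[224, 224, 3])
tf.compat.v1.logging.info('top-1 accuracy: %s', stats['eval_results']['accuracy'])
tf.compat.v1.logging.info('train hooks: %s',
                          [type(h).__name__ for h in stats['train_hooks']])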
Example #44
def vgg_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for VGG Models.

  Args:
    flags_obj: An object containing parsed flags. See define_vgg_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run.
  """

  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
  # results in checkpoints determined only by `epochs_between_evals`.
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      save_checkpoints_secs=60*60*24)

  # Initializes model with all but the dense layer from pretrained VGG.
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      warm_start_from=warm_start_settings, params={
          'vgg_size': flags_obj.vgg_size,
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'vgg_size': flags_obj.vgg_size,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('vgg', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  train_hooks = list(train_hooks) + lottery.hooks_from_flags(flags_obj.flag_values_dict())

  def input_fn_train(num_epochs):
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_tf_dtype(flags_obj),
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        num_parallel_batches=flags_obj.datasets_num_parallel_batches)

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))

  if flags_obj.lth_generate_predictions:
    ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)

    if flags_obj.lth_no_pruning:
      m_hooks = []
    else:
      m_hooks = lottery.hooks_from_flags(flags_obj.flag_values_dict())

    eval_results = classifier.predict(
        input_fn=input_fn_eval,
        checkpoint_path=ckpt,
        hooks=m_hooks,
    )

    assert flags_obj.lth_prediction_result_dir
    with tf.gfile.Open(os.path.join(flags_obj.data_dir, 'test_batch.bin'), 'rb') as f:
      labels = list(f.read()[::32*32*3+1])

    eval_results = list(eval_results)
    if not tf.gfile.Exists(flags_obj.lth_prediction_result_dir):
      tf.gfile.MakeDirs(flags_obj.lth_prediction_result_dir)
    with tf.gfile.Open(os.path.join(flags_obj.lth_prediction_result_dir, 'predictions'), 'wb') as f:
      for label, res in zip(labels, eval_results):
        res['label'] = label
      pickle.dump(eval_results, f)
    return

  try:
    cpr = tf.train.NewCheckpointReader(tf.train.latest_checkpoint(flags_obj.model_dir))
    current_step = cpr.get_tensor('global_step')
  except:  # pylint: disable=bare-except
    # No usable checkpoint yet; start training from step 0.
    current_step = 0

  while current_step < flags_obj.max_train_steps:
    next_checkpoint = min(current_step + 10000, flags_obj.max_train_steps)
    classifier.train(input_fn=lambda: input_fn_train(1000), hooks=train_hooks, max_steps=next_checkpoint)
    current_step = next_checkpoint
    tf.logging.info('Starting to evaluate.')
    eval_results = classifier.evaluate(input_fn=input_fn_eval)
    benchmark_logger.log_evaluation_result(eval_results)

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)
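
The WarmStartSettings in vgg_main above rely on the negative-lookahead regex '^(?!.*dense)' to warm-start every pretrained variable except the dense classifier head. A small illustration of what that pattern selects; the variable names here are made up for the example:

import re

# Matches any variable name that does not contain the substring 'dense'.
pattern = re.compile(r'^(?!.*dense)')

names = [
    'vgg_model/conv2d/kernel',
    'vgg_model/batch_normalization/gamma',
    'vgg_model/dense/kernel',   # excluded: classifier head stays randomly initialized
    'vgg_model/dense_1/bias',   # excluded
]
warm_started = [n for n in names if pattern.match(n)]
print(warm_started)  # only the conv2d and batch_normalization variables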
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  if flags_core.get_num_gpus(flags_obj) == 0:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  elif flags_core.get_num_gpus(flags_obj) == 1:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  else:
    distribution = tf.contrib.distribute.MirroredStrategy(
        num_gpus=flags_core.get_num_gpus(flags_obj)
    )

  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      session_config=session_config)

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj)
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + "-synthetic"

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      batch_size=flags_obj.batch_size)

  def input_fn_train():
    return input_function(
        is_training=True, data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=flags_obj.epochs_between_evals)

  def input_fn_eval():
    return input_function(
        is_training=False, data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)
  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info('Starting a training cycle: %d/%d',
                    cycle_index, total_training_cycle)

    classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                     max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data, which
    # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
    # eval (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)

    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags_obj.batch_size)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
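
Both input functions above divide the global batch across GPUs with per_device_batch_size. A minimal sketch of that split, assuming (as the official-models utility does) that the global batch must divide evenly across devices:

def per_device_batch_size_sketch(batch_size, num_gpus):
  """Per-replica batch size for MirroredStrategy-style data parallelism."""
  if num_gpus <= 1:
    return batch_size
  if batch_size % num_gpus:
    raise ValueError(
        'Batch size {} is not divisible by the number of GPUs {}.'.format(
            batch_size, num_gpus))
  return batch_size // num_gpus


print(per_device_batch_size_sketch(256, 4))  # 64 examples per replica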
def main(_):

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    estimator_config = tf.estimator.RunConfig(
        save_checkpoints_secs=600,  # Save checkpoints every 600 seconds (10 minutes).
        keep_checkpoint_max=50,  # Retain the 50 most recent checkpoints.
    )
    classifier = tf.estimator.Estimator(
        model_fn=pop_resnet.resnet_model_fn,
        #model_dir="/home/ubuntu/one_octave_resnet/model",
        model_dir="./model",
        config=estimator_config,
        params=hparams)

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet', 'MAPS', hparams,
                                  test_id=TEST_ID)

    # Train and validate in turns
    if train_and_val:
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: dataset.tfrecord_train_input_fn(
                train_dataset_tfrecord,
                batch_size=hparams['batch_size'],
                num_epochs=hparams['train_epochs']),
            max_steps=hparams['train_steps'])
        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: dataset.tfrecord_val_input_fn(
                val_dataset_tfrecord,
                batch_size=hparams['batch_size'],
                num_epochs=1),
            steps=hparams['eval_steps'], throttle_secs=600)

        tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)

    # Train the Model.
    if train_flag:
        classifier.train(input_fn=lambda: dataset.tfrecord_train_input_fn(train_dataset_tfrecord,
                                                                          batch_size=hparams['batch_size'],
                                                                          num_epochs=hparams['train_epochs']),
                         steps=hparams['train_steps'])

    # Evaluate the model.
    if eval_flag:
        eval_result = classifier.evaluate(input_fn=lambda: dataset.tfrecord_val_input_fn(val_dataset_tfrecord,
                                                                                         batch_size=hparams['batch_size'],
                                                                                         num_epochs=1),
                                          steps=hparams['test_steps'])

        benchmark_logger.log_evaluation_result(eval_result)

    # Predict
    if predict_flag:
        predictions = classifier.predict(input_fn=lambda: dataset.tfrecord_test_input_fn(filepath=test_dataset_tfrecord,
                                                                                         batch_size=1, num_epochs=1))

        # The number of test frames is fixed ahead of time; it could instead be
        # derived by materializing the predictions generator, e.g.
        # num_test_frames = len(list(predictions)), at the cost of holding all
        # predictions in memory at once.
        num_test_frames = 11468
        print(num_test_frames)
        props = np.zeros((hparams['num_classes'], num_test_frames))
        notes = np.zeros((hparams['num_classes'], num_test_frames))
        index = 0
        for p in predictions:
            if index < hparams['num_test_examples']:
                props[:, index] = p['probabilities'][:]
                notes[:, index] = p['classes'][:]
            index += 1
        np.savez("props_MAPS_MUS-bor_ps6_ENSTDkCl_2018-11-11", props=props)
        #np.savez("notes_MAPS_MUS-chpn_op7_1_ENSTDkAm_2018-18-10", notes=notes)
        print(index)
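
As noted in the comments above, num_test_frames could be derived from the predictions themselves instead of being hard-coded. A sketch of that variant, which materializes the generator once and sizes the arrays from it (collect_predictions is a hypothetical helper, not part of the script):

import numpy as np


def collect_predictions(predictions, num_classes):
  """Size the probability/class arrays from the prediction list itself."""
  predictions = list(predictions)  # Materialize the generator once.
  num_frames = len(predictions)
  props = np.zeros((num_classes, num_frames))
  notes = np.zeros((num_classes, num_frames))
  for index, p in enumerate(predictions):
    props[:, index] = p['probabilities']
    notes[:, index] = p['classes']
  return props, notes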
Example #47
def get_logging_metric_callback(metric_logger=None, **kwargs):  # pylint: disable=unused-argument
  """Function to get LoggingMetricCallback."""
  return LoggingMetricCallback(
      metric_logger=metric_logger or logger.get_benchmark_logger())
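
LoggingMetricCallback is defined elsewhere in the repo; as a rough sketch of the idea only, here is a Keras-style callback that forwards epoch-end metrics to a logger exposing log_evaluation_result (the same method used throughout the loops above). The class name and behaviour are assumptions, not the repo's implementation:

import tensorflow as tf


class MetricLoggingCallbackSketch(tf.keras.callbacks.Callback):
  """Forward epoch-end metrics to a benchmark-style metric logger."""

  def __init__(self, metric_logger):
    super(MetricLoggingCallbackSketch, self).__init__()
    self._metric_logger = metric_logger

  def on_epoch_end(self, epoch, logs=None):
    results = dict(logs or {})
    results['epoch'] = epoch + 1
    self._metric_logger.log_evaluation_result(results)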
Example #48
def run_deep_speech(_):
    """Run deep speech training and eval loop."""
    tf.set_random_seed(flags_obj.seed)
    # Data preprocessing
    tf.logging.info("Data preprocessing...")
    train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

    # Number of label classes. Label string is "[a-z]' -"
    num_classes = len(train_speech_dataset.speech_labels)

    # Use distribution strategy for multi-gpu training
    num_gpus = flags_core.get_num_gpus(flags_obj)
    distribution_strategy = distribution_utils.get_distribution_strategy(
        num_gpus=num_gpus)
    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=flags_obj.model_dir,
                                       config=run_config,
                                       params={
                                           "num_classes": num_classes,
                                       })

    # Benchmark logging
    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
        "rnn_hidden_size": flags_obj.rnn_hidden_size,
        "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
        "rnn_type": flags_obj.rnn_type,
        "is_bidirectional": flags_obj.is_bidirectional,
        "use_bias": flags_obj.use_bias
    }

    dataset_name = "LibriSpeech"
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info("deep_speech",
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    per_replica_batch_size = distribution_utils.per_replica_batch_size(
        flags_obj.batch_size, num_gpus)

    def input_fn_train():
        return dataset.input_fn(per_replica_batch_size, train_speech_dataset)

    def input_fn_eval():
        return dataset.input_fn(per_replica_batch_size, eval_speech_dataset)

    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1,
                        total_training_cycle)

        # Perform batch_wise dataset shuffling
        train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle(
            train_speech_dataset.entries, cycle_index, flags_obj.sortagrad,
            flags_obj.batch_size)

        estimator.train(input_fn=input_fn_train,
                        hooks=train_hooks,
                        max_steps=flags_obj.max_train_steps)

        if flags_obj.skip_eval:
            break

        # Evaluation
        tf.logging.info("Starting to evaluate...")

        eval_results = evaluate_model(estimator,
                                      eval_speech_dataset.speech_labels,
                                      eval_speech_dataset.entries,
                                      input_fn_eval)

        # Log the WER and CER results.
        benchmark_logger.log_evaluation_result(eval_results)
        tf.logging.info("Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY]))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(flags_obj.wer_threshold,
                                             eval_results[_WER_KEY]):
            break
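
The WER logged above is conventionally the word-level edit distance between hypothesis and reference, divided by the reference length. A self-contained sketch of that metric (independent of the repo's evaluate_model implementation):

def word_error_rate(reference, hypothesis):
  """WER = word-level Levenshtein distance / number of reference words."""
  ref, hyp = reference.split(), hypothesis.split()
  # Dynamic-programming edit distance over words.
  dist = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
  for i in range(len(ref) + 1):
    dist[i][0] = i
  for j in range(len(hyp) + 1):
    dist[0][j] = j
  for i in range(1, len(ref) + 1):
    for j in range(1, len(hyp) + 1):
      cost = 0 if ref[i - 1] == hyp[j - 1] else 1
      dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                       dist[i][j - 1] + 1,         # insertion
                       dist[i - 1][j - 1] + cost)  # substitution
  return dist[len(ref)][len(hyp)] / float(max(len(ref), 1))


print(word_error_rate('the cat sat', 'the cat sat down'))  # ~0.33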
Example #49
def run_keras_model_benchmark(_):
  """Run the benchmark on keras model."""
  # Ensure a valid model name was supplied via command line argument
  if FLAGS.model not in MODELS.keys():
    raise AssertionError("The --model command line argument should "
                         "be a key in the `MODELS` dictionary.")

  # Check if eager execution is enabled
  if FLAGS.eager:
    tf.logging.info("Eager execution is enabled...")
    tf.enable_eager_execution()

  # Load the model
  tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
  keras_model = MODELS[FLAGS.model]
  model = keras_model(weights=None)

  # Get dataset
  dataset_name = "ImageNet"
  if FLAGS.use_synthetic_data:
    tf.logging.info("Using synthetic dataset...")
    dataset_name += "_Synthetic"
    train_dataset = dataset.generate_synthetic_input_dataset(
        FLAGS.model, FLAGS.batch_size)
    val_dataset = dataset.generate_synthetic_input_dataset(
        FLAGS.model, FLAGS.batch_size)
  else:
    raise ValueError("Only synthetic dataset is supported!")

  num_gpus = flags_core.get_num_gpus(FLAGS)

  distribution = None
  # Use distribution strategy
  if FLAGS.dist_strat:
    distribution = distribution_utils.get_distribution_strategy(
        num_gpus=num_gpus)
  elif num_gpus > 1:
    # Run with multi_gpu_model
    # If eager execution is enabled, only one GPU is utilized even if multiple
    # GPUs are provided.
    if FLAGS.eager:
      tf.logging.warning(
          "{} GPUs are provided, but only one GPU is utilized as "
          "eager execution is enabled.".format(num_gpus))
    model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

  # The Adam optimizer (and some others) does not work well with
  # distribution strategy (b/113076709), so use GradientDescentOptimizer here.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
  model.compile(loss="categorical_crossentropy",
                optimizer=optimizer,
                metrics=["accuracy"],
                distribute=distribution)

  # Create benchmark logger for benchmark logging
  run_params = {
      "batch_size": FLAGS.batch_size,
      "synthetic_data": FLAGS.use_synthetic_data,
      "train_epochs": FLAGS.train_epochs,
      "num_train_images": FLAGS.num_train_images,
      "num_eval_images": FLAGS.num_eval_images,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name=FLAGS.model,
      dataset_name=dataset_name,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Create callbacks that log metric values about the training and evaluation
  callbacks = model_callbacks.get_model_callbacks(
      FLAGS.callbacks,
      batch_size=FLAGS.batch_size,
      metric_logger=benchmark_logger)
  # Train and evaluate the model
  history = model.fit(
      train_dataset,
      epochs=FLAGS.train_epochs,
      callbacks=callbacks,
      validation_data=val_dataset,
      steps_per_epoch=int(np.ceil(FLAGS.num_train_images / FLAGS.batch_size)),
      validation_steps=int(np.ceil(FLAGS.num_eval_images / FLAGS.batch_size))
  )

  tf.logging.info("Logging the evaluation results...")
  for epoch in range(FLAGS.train_epochs):
    eval_results = {
        "accuracy": history.history["val_acc"][epoch],
        "loss": history.history["val_loss"][epoch],
        tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
            FLAGS.num_eval_images/FLAGS.batch_size)
    }
    benchmark_logger.log_evaluation_result(eval_results)

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
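
generate_synthetic_input_dataset is repo-specific; a minimal sketch of a synthetic image pipeline in the same TF 1.x style, assuming a 224x224x3 input and 1000 classes (both assumptions, not read from the chosen Keras model):

import tensorflow as tf


def synthetic_image_dataset(batch_size, height=224, width=224, num_classes=1000):
  """Endlessly repeated batch of random images and one-hot labels."""
  images = tf.random_uniform([batch_size, height, width, 3], dtype=tf.float32)
  labels = tf.one_hot(
      tf.random_uniform([batch_size], maxval=num_classes, dtype=tf.int32),
      num_classes)
  return tf.data.Dataset.from_tensors((images, labels)).repeat()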
Example #50
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
                                 ncf_dataset.num_users / eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }
  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)


  eval_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count, num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
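
The HR and NDCG values logged above are the usual leave-one-out ranking metrics: each user's held-out positive is scored against NUM_EVAL_NEGATIVES sampled negatives, HR@K records whether the positive lands in the top K, and NDCG@K is 1/log2(rank + 2) when it does. A self-contained sketch for a single user, assuming higher scores are better:

import math


def hit_rate_and_ndcg(positive_score, negative_scores, top_k=10):
  """HR@K and NDCG@K for one positive item versus sampled negatives."""
  # Zero-indexed rank of the positive among all candidates.
  rank = sum(1 for s in negative_scores if s > positive_score)
  if rank >= top_k:
    return 0.0, 0.0
  return 1.0, 1.0 / math.log(rank + 2, 2)


print(hit_rate_and_ndcg(0.9, [0.95, 0.4, 0.3]))  # positive ranked second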
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]  # Select the model size: base or big.
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data  # 什么叫使用合成数据?

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_device_batch_size(
            params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(  # Manages the training schedule: train steps, eval interval, etc.
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)  # Remove previously saved data and models, if the clean flag allows it.

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(  # Hooks used for logging during training.
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()  # Logger for benchmark run and eval results.
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(
        flags_obj, params,
        schedule_manager)  # Returns a tf.estimator.Estimator for training and evaluation.
    run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,  # Controls how many steps to train and how often to evaluate.
        train_hooks=train_hooks,  # Logging hooks.
        benchmark_logger=benchmark_logger,  # Benchmark metric logger.
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,  # Files used for BLEU calculation.
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)  # Vocabulary file.

    if flags_obj.export_dir:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file})
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run.
  """

  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
  # results in checkpoints determined only by `epochs_between_evals`.
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      save_checkpoints_secs=60*60*24)

  # Initializes model with all but the dense layer from pretrained ResNet.
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      warm_start_from=warm_start_settings, params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  def input_fn_train(num_epochs):
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_tf_dtype(flags_obj),
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        num_parallel_batches=flags_obj.datasets_num_parallel_batches)

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))

  if flags_obj.eval_only or not flags_obj.train_epochs:
    # If --eval_only is set, perform a single loop with zero train epochs.
    schedule, n_loops = [0], 1
  else:
    # Compute the number of times to loop while training. All but the last
    # pass will train for `epochs_between_evals` epochs, while the last will
    # train for the number needed to reach `training_epochs`. For instance if
    #   train_epochs = 25 and epochs_between_evals = 10
    # schedule will be set to [10, 10, 5]. That is to say, the loop will:
    #   Train for 10 epochs and then evaluate.
    #   Train for another 10 epochs and then evaluate.
    #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
    n_loops = math.ceil(flags_obj.train_epochs / flags_obj.epochs_between_evals)
    schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
    schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1])  # Avoid over-counting.

  for cycle_index, num_train_epochs in enumerate(schedule):
    tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

    if num_train_epochs:
      classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                       hooks=train_hooks, max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data, which
    # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
    # eval (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)

    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)
  return eval_results
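
The schedule arithmetic above is easiest to see with the numbers from the comment: train_epochs = 25 and epochs_between_evals = 10 produce [10, 10, 5]. A standalone sketch of that computation:

import math


def build_epoch_schedule(train_epochs, epochs_between_evals):
  """Epochs to train in each cycle before the next evaluation."""
  if not train_epochs:  # Eval-only: a single loop with zero train epochs.
    return [0]
  n_loops = int(math.ceil(train_epochs / float(epochs_between_evals)))
  schedule = [epochs_between_evals] * n_loops
  schedule[-1] = train_epochs - sum(schedule[:-1])  # Avoid over-counting.
  return schedule


print(build_epoch_schedule(25, 10))  # [10, 10, 5]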
Example #53
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives
                           * (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))


    # Train the model
    train_input_fn, train_record_dir, batch_count = \
      data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(
        eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
def run_wide_deep(flags_obj):
    """Run Wide-Deep training and eval loop."""

    # Clean up the model directory if present
    shutil.rmtree(flags_obj.model_dir, ignore_errors=True)
    model = build_estimator(flags_obj.model_dir, flags_obj.model_type)

    train_file = os.path.join(flags_obj.data_dir, 'train_data_normalization')
    test_file = os.path.join(flags_obj.data_dir,
                             'validation_data_normalization')

    # Train and evaluate the model every `flags.epochs_between_evals` epochs.
    def train_input_fn():
        return input_fn(train_file, flags_obj.epochs_between_evals, True,
                        flags_obj.batch_size)

    def eval_input_fn():
        return input_fn(test_file, 1, False, flags_obj.batch_size)

    run_params = {
        'batch_size': flags_obj.batch_size,
        'train_epochs': flags_obj.train_epochs,
        'model_type': flags_obj.model_type,
    }

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('wd_modeling',
                                  'Census Income',
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        batch_size=flags_obj.batch_size,
        tensors_to_log={
            'average_loss': loss_prefix + 'head/truediv',
            'loss': loss_prefix + 'head/weighted_loss/Sum'
        })

    # Train and evaluate the model every `flags.epochs_between_evals` epochs.
    for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
        model.train(input_fn=train_input_fn, hooks=train_hooks)
        results = model.evaluate(input_fn=eval_input_fn)

        # Display evaluation metrics
        tf.logging.info('Results at epoch %d / %d',
                        (n + 1) * flags_obj.epochs_between_evals,
                        flags_obj.train_epochs)
        tf.logging.info('-' * 60)

        for key in sorted(results):
            tf.logging.info('%s: %s' % (key, results[key]))

    # Export the trained model for serving.
    wideColumns, DeepColumns = build_model_columns()
    feature_columns = DeepColumns
    feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
    export_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    servable_model_dir = "/tmp/census_exported"
    servable_model_path = model.export_savedmodel(servable_model_dir,
                                                  export_input_fn)
    print(" Done Exporting at Path - %s", servable_model_path)
Example #55
def run_wide_deep(flags_obj):
  """Run Wide-Deep training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.
  """

  # Clean up the model directory if present
  shutil.rmtree(flags_obj.model_dir, ignore_errors=True)
  model = build_estimator(flags_obj.model_dir, flags_obj.model_type)

  train_file = os.path.join(flags_obj.data_dir, 'adult.data')
  test_file = os.path.join(flags_obj.data_dir, 'adult.test')

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  def train_input_fn():
    return input_fn(
        train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)

  def eval_input_fn():
    return input_fn(test_file, 1, False, flags_obj.batch_size)

  run_params = {
      'batch_size': flags_obj.batch_size,
      'train_epochs': flags_obj.train_epochs,
      'model_type': flags_obj.model_type,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('wide_deep', 'Census Income', run_params,
                                test_id=flags_obj.benchmark_test_id)

  loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, batch_size=flags_obj.batch_size,
      tensors_to_log={'average_loss': loss_prefix + 'head/truediv',
                      'loss': loss_prefix + 'head/weighted_loss/Sum'})

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    model.train(input_fn=train_input_fn, hooks=train_hooks)
    results = model.evaluate(input_fn=eval_input_fn)

    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * flags_obj.epochs_between_evals,
                    flags_obj.train_epochs)
    tf.logging.info('-' * 60)

    for key in sorted(results):
      tf.logging.info('%s: %s' % (key, results[key]))

    benchmark_logger.log_evaluation_result(results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    export_model(model, flags_obj.model_type, flags_obj.export_dir)
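
export_model is defined elsewhere in this variant; for reference, a sketch of how such an export can be built from the model's feature columns, following the same parsing-receiver pattern used in the Census variant above. The helper name and the feature_columns argument are assumptions:

import tensorflow as tf


def export_model_sketch(model, export_dir, feature_columns):
  """Export an Estimator with a tf.Example-parsing serving input receiver."""
  feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
  serving_input_fn = (
      tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec))
  return model.export_savedmodel(export_dir, serving_input_fn)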