Beispiel #1
0
 def test_config_benchmark_file_logger(self):
   # Set the benchmark_log_dir first since the benchmark_logger_type will need
   # the value to be set when it does the validation.
   with flagsaver.flagsaver(benchmark_log_dir='/tmp'):
     with flagsaver.flagsaver(benchmark_logger_type='BenchmarkFileLogger'):
       logger.config_benchmark_logger()
       self.assertIsInstance(logger.get_benchmark_logger(),
                             logger.BenchmarkFileLogger)
 def test_config_benchmark_file_logger(self):
   # Set the benchmark_log_dir first since the benchmark_logger_type will need
   # the value to be set when it does the validation.
   with flagsaver.flagsaver(benchmark_log_dir="/tmp"):
     with flagsaver.flagsaver(benchmark_logger_type="BenchmarkFileLogger"):
       logger.config_benchmark_logger()
       self.assertIsInstance(logger.get_benchmark_logger(),
                             logger.BenchmarkFileLogger)
    def train(self):
        self.createDatasets();

        if self.model is None:
            self._getModelEstimator();

        estimator = self.model.getEstimator();

        run_params = {
            'batch_size': self.flags.batch_size,
            'train_epochs': self.flags.train_epochs,
            'model_type': 'deep',
        }

        benchmark_logger = logger.config_benchmark_logger(self.flags)
        benchmark_logger.log_run_info('deep', 'Readmission Patient', run_params)

        # Train and evaluate the model every `flags.epochs_between_evals` epochs.
        for n in range(self.flags.train_epochs // self.flags.epochs_between_evals):
            estimator.train(input_fn=self._input_fn_train);
            # Display evaluation metrics
            tf.logging.info('Results at epoch %d / %d', (n + 1) * self.flags.epochs_between_evals, self.flags.train_epochs)
            tf.logging.info('-' * 60)

            results = estimator.predict(input_fn=self._input_fn_analyze);
            encodings = [p['encoding'] for p in results];
            basic_encodings = np.array(encodings);
            filename_basic_encodings = self.flags.model_dir + '/basic_encodings_' + str(n).zfill(5) + '.npy'
            np.save(filename_basic_encodings, basic_encodings);
Beispiel #4
0
def train_boosted_trees(flags_obj):
  """Train boosted_trees estimator on HIGGS data.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  # Clean up the model directory if present.
  if tf.gfile.Exists(flags_obj.model_dir):
    tf.gfile.DeleteRecursively(flags_obj.model_dir)
  tf.logging.info("## Data loading...")
  train_data, eval_data = read_higgs_data(
      flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
      flags_obj.eval_start, flags_obj.eval_count)
  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))
  # Data consists of one label column followed by 28 feature columns.
  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
  tf.logging.info("## Features prepared. Training starts...")

  # Create benchmark logger to log info about the training and metric values
  run_params = {
      "train_start": flags_obj.train_start,
      "train_count": flags_obj.train_count,
      "eval_start": flags_obj.eval_start,
      "eval_count": flags_obj.eval_count,
      "n_trees": flags_obj.n_trees,
      "max_depth": flags_obj.max_depth,
  }
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info(
      model_name="boosted_trees",
      dataset_name="higgs",
      run_params=run_params)

  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
  # training is yet provided as a contrib library.
  classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
      train_input_fn,
      feature_columns,
      model_dir=flags_obj.model_dir or None,
      n_trees=flags_obj.n_trees,
      max_depth=flags_obj.max_depth,
      learning_rate=flags_obj.learning_rate)

  # Evaluation.
  eval_results = classifier.evaluate(eval_input_fn)
  # Benchmark the evaluation results
  benchmark_logger.log_evaluation_result(eval_results)

  # Exporting the savedmodel with csv parsing.
  if flags_obj.export_dir is not None:
    classifier.export_savedmodel(
        flags_obj.export_dir,
        _make_csv_serving_input_receiver_fn(
            column_names=feature_names,
            # columns are all floats.
            column_defaults=[[0.0]] * len(feature_names)))
Beispiel #5
0
def run_wide_deep(flags_obj):
  """Run Wide-Deep training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.
  """

  # Clean up the model directory if present
  shutil.rmtree(flags_obj.model_dir, ignore_errors=True)
  model = build_estimator(flags_obj.model_dir, flags_obj.model_type)

  train_file = os.path.join(flags_obj.data_dir, 'adult.data')
  test_file = os.path.join(flags_obj.data_dir, 'adult.test')

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  def train_input_fn():
    return input_fn(
        train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)

  def eval_input_fn():
    return input_fn(test_file, 1, False, flags_obj.batch_size)

  run_params = {
      'batch_size': flags_obj.batch_size,
      'train_epochs': flags_obj.train_epochs,
      'model_type': flags_obj.model_type,
  }

  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info('wide_deep', 'Census Income', run_params)

  loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, batch_size=flags_obj.batch_size,
      tensors_to_log={'average_loss': loss_prefix + 'head/truediv',
                      'loss': loss_prefix + 'head/weighted_loss/Sum'})

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    model.train(input_fn=train_input_fn, hooks=train_hooks)
    results = model.evaluate(input_fn=eval_input_fn)

    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * flags_obj.epochs_between_evals,
                    flags_obj.train_epochs)
    tf.logging.info('-' * 60)

    for key in sorted(results):
      tf.logging.info('%s: %s' % (key, results[key]))

    benchmark_logger.log_evaluation_result(results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    export_model(model, flags_obj.model_type, flags_obj.export_dir)
Beispiel #6
0
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    # Determine training schedule based on flags.
    if flags_obj.train_steps is not None:
        train_eval_iterations = (flags_obj.train_steps //
                                 flags_obj.steps_between_evals)
        single_iteration_train_steps = flags_obj.steps_between_evals
        single_iteration_train_epochs = None
    else:
        train_epochs = flags_obj.train_epochs or DEFAULT_TRAIN_EPOCHS
        train_eval_iterations = train_epochs // flags_obj.epochs_between_evals
        single_iteration_train_steps = None
        single_iteration_train_epochs = flags_obj.epochs_between_evals

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    params.data_dir = flags_obj.data_dir
    params.num_parallel_calls = flags_obj.num_parallel_calls
    params.epochs_between_evals = flags_obj.epochs_between_evals
    params.repeat_dataset = single_iteration_train_epochs
    params.batch_size = flags_obj.batch_size or params.batch_size

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=params.batch_size  # for ExamplesPerSecondHook
    )
    benchmark_logger = logger.config_benchmark_logger(flags_obj)
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params.__dict__)

    # Train and evaluate transformer model
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=flags_obj.model_dir,
                                       params=params)
    train_schedule(
        estimator=estimator,
        # Training arguments
        train_eval_iterations=train_eval_iterations,
        single_iteration_train_steps=single_iteration_train_steps,
        single_iteration_train_epochs=single_iteration_train_epochs,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file_path=os.path.join(flags_obj.data_dir, flags_obj.vocab_file))
Beispiel #7
0
    def train(self):

        self.createDatasets()

        if self.model is None:
            self._getModelEstimator()

        estimator = self.model.getEstimator()

        run_params = {
            'batch_size': self.flags.batch_size,
            'train_epochs': self.flags.train_epochs,
            'model_type': 'deep',
        }

        benchmark_logger = logger.config_benchmark_logger(self.flags)
        benchmark_logger.log_run_info('deep', 'Readmission Patient',
                                      run_params)

        # Train and evaluate the model every `flags.epochs_between_evals` epochs.
        for n in range(self.flags.train_epochs //
                       self.flags.epochs_between_evals):
            # Break from loop if privacy budget is exceedeed and differential privacy is enabled
            if self.flags.enable_dp and self.model.is_privacy_budget_exceeded(
            ):
                break

            print('n: ' + str(n))
            estimator.train(input_fn=self._input_fn_train)
            results = estimator.evaluate(input_fn=self._input_fn_eval)
            # Display evaluation metrics
            tf.logging.info('Results at epoch %d / %d',
                            (n + 1) * self.flags.epochs_between_evals,
                            self.flags.train_epochs)
            tf.logging.info('-' * 60)

            for key in sorted(results):
                tf.logging.info('%s: %s' % (key, results[key]))

            benchmark_logger.log_evaluation_result(results)

            if model_helpers.past_stop_threshold(self.flags.stop_threshold,
                                                 results['accuracy']):
                break

            # Export the model
            print('export the model?')
            if n % 10 == 0 and self.flags.export_dir is not None:
                self.export_model()
def get_logging_metric_hook(benchmark_log_dir=None,
                            tensors_to_log=None,
                            every_n_secs=600,
                            **kwargs):  # pylint: disable=unused-argument
    """Function to get LoggingMetricHook.

  Args:
    benchmark_log_dir: `string`, directory path to save the metric log.
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If not set, log _TENSORS_TO_LOG by default.
    every_n_secs: `int`, the frequency for logging the metric. Default to every
      10 mins.

  Returns:
    Returns a ProfilerHook that writes out timelines that can be loaded into
    profiling tools like chrome://tracing.
  """
    logger.config_benchmark_logger(benchmark_log_dir)
    if tensors_to_log is None:
        tensors_to_log = _TENSORS_TO_LOG
    return metric_hook.LoggingMetricHook(
        tensors=tensors_to_log,
        metric_logger=logger.get_benchmark_logger(),
        every_n_secs=every_n_secs)
Beispiel #9
0
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    if flags_core.get_num_gpus(flags_obj) == 0:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
    elif flags_core.get_num_gpus(flags_obj) == 1:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
    else:
        distribution = tf.contrib.distribute.MirroredStrategy(
            num_gpus=flags_core.get_num_gpus(flags_obj))

    run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                        session_config=session_config)

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj)
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    benchmark_logger = logger.config_benchmark_logger(flags_obj)
    benchmark_logger.log_run_info('resnet', dataset_name, run_params)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train(num_epochs):
        return input_function(
            mode="train",
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=num_epochs,
            num_gpus=flags_core.get_num_gpus(flags_obj),
            dtype=flags_core.get_tf_dtype(flags_obj))

    def input_fn_eval():
        return input_function(mode="validate",
                              data_dir=flags_obj.data_dir,
                              batch_size=per_device_batch_size(
                                  flags_obj.batch_size,
                                  flags_core.get_num_gpus(flags_obj)),
                              num_epochs=1)

    def input_fn_pred():
        return input_function(mode="predict",
                              data_dir=flags_obj.data_dir,
                              batch_size=per_device_batch_size(
                                  flags_obj.batch_size,
                                  flags_core.get_num_gpus(flags_obj)),
                              num_epochs=1)

    #
    if flags_obj.predict_only:
        result = classifier.predict(input_fn=lambda: input_fn_pred())
        predicted_values = np.stack([r["predictions"] for r in result], axis=0)
        #print(predicted_values)
        df = pd.DataFrame(predicted_values)
        df.to_csv("validate_result.txt")
        return

    # train
    if flags_obj.eval_only or not flags_obj.train_epochs:
        # If --eval_only is set, perform a single loop with zero train epochs.
        schedule, n_loops = [0], 1
    else:
        # Compute the number of times to loop while training. All but the last
        # pass will train for `epochs_between_evals` epochs, while the last will
        # train for the number needed to reach `training_epochs`. For instance if
        #   train_epochs = 25 and epochs_between_evals = 10
        # schedule will be set to [10, 10, 5]. That is to say, the loop will:
        #   Train for 10 epochs and then evaluate.
        #   Train for another 10 epochs and then evaluate.
        #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
        n_loops = math.ceil(flags_obj.train_epochs /
                            flags_obj.epochs_between_evals)
        schedule = [
            flags_obj.epochs_between_evals for _ in range(int(n_loops))
        ]
        schedule[-1] = flags_obj.train_epochs - sum(
            schedule[:-1])  # over counting.

    for cycle_index, num_train_epochs in enumerate(schedule):
        tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

        if num_train_epochs:
            classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                             hooks=train_hooks,
                             max_steps=flags_obj.max_train_steps)

        tf.logging.info('Starting to evaluate.')

        # flags_obj.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data, which
        # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
        # eval (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=100)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['mse']):
            break

    # save model for serving
    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags_obj.batch_size)
        classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
 def test_config_base_benchmark_logger(self):
   with flagsaver.flagsaver(benchmark_logger_type="BaseBenchmarkLogger"):
     logger.config_benchmark_logger()
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BaseBenchmarkLogger)
Beispiel #11
0
def simpnet_main(flags, model_function, input_function, shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags: FLAGS object that contains the params for running. See
      SimpnetArgParser for created flags.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags.export_dir is passed.
  """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=1e9, session_config=session_config)
    classifier = tf.estimator.Estimator(model_fn=model_function,
                                        model_dir=flags.model_dir,
                                        config=run_config,
                                        params={
                                            'data_format': flags.data_format,
                                            'batch_size': flags.batch_size,
                                            'multi_gpu': flags.multi_gpu,
                                            'loss_scale': flags.loss_scale,
                                            'dtype': flags.dtype
                                        })

    benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info('simpnet')

    train_hooks = hooks_helper.get_train_hooks(
        flags.hooks,
        batch_size=flags.batch_size,
        benchmark_log_dir=flags.benchmark_log_dir)

    def input_fn_train():
        return input_function(True, flags.data_dir, flags.batch_size,
                              flags.epochs_between_evals,
                              flags.num_parallel_calls, flags.multi_gpu)

    def input_fn_eval():
        return input_function(False, flags.data_dir, flags.batch_size, 1,
                              flags.num_parallel_calls, flags.multi_gpu)

    total_training_cycle = flags.train_epochs // flags.epochs_between_evals
    for cycle_index in range(total_training_cycle):
        tf.logging.info('Starting a training cycle: %d/%d', cycle_index,
                        total_training_cycle)

        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         max_steps=flags.max_train_steps)

        tf.logging.info('Starting to evaluate.')
        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
Beispiel #12
0
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  if flags_core.get_num_gpus(flags_obj) == 0:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  elif flags_core.get_num_gpus(flags_obj) == 1:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  else:
    distribution = tf.contrib.distribute.MirroredStrategy(
        num_gpus=flags_core.get_num_gpus(flags_obj)
    )

  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      session_config=session_config)

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj)
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info('resnet', dataset_name, run_params)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      batch_size=flags_obj.batch_size)

  def input_fn_train():
    return input_function(
        is_training=True, data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=flags_obj.epochs_between_evals)

  def input_fn_eval():
    return input_function(
        is_training=False, data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info('Starting a training cycle: %d/%d',
                    cycle_index, total_training_cycle)

    classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                     max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data, which
    # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
    # eval (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)

    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags_obj.batch_size)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
Beispiel #13
0
 def test_config_benchmark_bigquery_logger(self, mock_bigquery_client):
   with flagsaver.flagsaver(benchmark_logger_type='BenchmarkBigQueryLogger'):
     logger.config_benchmark_logger()
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BenchmarkBigQueryLogger)
Beispiel #14
0
 def test_config_base_benchmark_logger(self):
     logger.config_benchmark_logger("")
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BaseBenchmarkLogger)
Beispiel #15
0
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    if flags_core.get_num_gpus(flags_obj) == 0:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
    elif flags_core.get_num_gpus(flags_obj) == 1:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
    else:
        distribution = tf.contrib.distribute.MirroredStrategy(
            num_gpus=flags_core.get_num_gpus(flags_obj))

    run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                        session_config=session_config)

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj)
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    benchmark_logger = logger.config_benchmark_logger(
        flags_obj.benchmark_log_dir)
    benchmark_logger.log_run_info('resnet', dataset_name, run_params)

    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        batch_size=flags_obj.batch_size,
        benchmark_log_dir=flags_obj.benchmark_log_dir)

    def input_fn_train():
        return input_function(is_training=True,
                              data_dir=flags_obj.data_dir,
                              batch_size=per_device_batch_size(
                                  flags_obj.batch_size,
                                  flags_core.get_num_gpus(flags_obj)),
                              num_epochs=flags_obj.epochs_between_evals)

    def input_fn_eval():
        return input_function(is_training=False,
                              data_dir=flags_obj.data_dir,
                              batch_size=per_device_batch_size(
                                  flags_obj.batch_size,
                                  flags_core.get_num_gpus(flags_obj)),
                              num_epochs=1)

    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info('Starting a training cycle: %d/%d', cycle_index,
                        total_training_cycle)

        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         max_steps=flags_obj.max_train_steps)

        tf.logging.info('Starting to evaluate.')

        # flags_obj.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data, which
        # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
        # eval (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags_obj.max_train_steps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags_obj.batch_size)
        classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
Beispiel #16
0
 def test_config_benchmark_file_logger(self):
   logger.config_benchmark_logger("/tmp/abc")
   self.assertIsInstance(logger.get_benchmark_logger(),
                         logger.BenchmarkFileLogger)
Beispiel #17
0
 def test_config_benchmark_file_logger(self):
     logger.config_benchmark_logger("/tmp/abc")
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BenchmarkFileLogger)
Beispiel #18
0
 def test_config_base_benchmark_logger(self):
   logger.config_benchmark_logger("")
   self.assertIsInstance(logger.get_benchmark_logger(),
                         logger.BaseBenchmarkLogger)
Beispiel #19
0
def main(_):
    # Data preprocessing
    # The file name of training and test dataset
    train_fname = os.path.join(
        FLAGS.data_dir, FLAGS.dataset + "-" + constants.TRAIN_RATINGS_FILENAME)
    test_fname = os.path.join(
        FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_RATINGS_FILENAME)
    neg_fname = os.path.join(FLAGS.data_dir,
                             FLAGS.dataset + "-" + constants.TEST_NEG_FILENAME)

    assert os.path.exists(train_fname), (
        "Run data_download.py first to download and extract {} dataset".format(
            FLAGS.dataset))

    tf.logging.info("Data preprocessing...")
    ncf_dataset = dataset.data_preprocessing(train_fname, test_fname,
                                             neg_fname, FLAGS.num_neg)

    # Create NeuMF model and convert it to Estimator
    tf.logging.info("Creating Estimator from Keras model...")
    layers = [int(layer) for layer in FLAGS.layers]
    mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
    keras_model = neumf_model.NeuMF(ncf_dataset.num_users,
                                    ncf_dataset.num_items, FLAGS.num_factors,
                                    layers, FLAGS.batch_size,
                                    FLAGS.mf_regularization,
                                    mlp_regularization)
    num_gpus = flags_core.get_num_gpus(FLAGS)
    estimator = convert_keras_to_estimator(keras_model, num_gpus,
                                           FLAGS.model_dir)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
    )
    run_params = {
        "batch_size": FLAGS.batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.config_benchmark_logger(FLAGS)
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params)

    # Training and evaluation cycle
    def train_input_fn():
        return dataset.input_fn(
            True, per_device_batch_size(FLAGS.batch_size, num_gpus),
            ncf_dataset, FLAGS.epochs_between_evals)

    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        # Train the model
        estimator.train(input_fn=train_input_fn, hooks=train_hooks)

        # Evaluate the model
        eval_results = evaluate_model(estimator, FLAGS.batch_size, num_gpus,
                                      ncf_dataset)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
Beispiel #20
0
def resnet_main(flags,
                model_function,
                input_function,
                num_train_samps,
                num_eval_samps,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags: FLAGS object that contains the params for running. See
      ResnetArgParser for created flags.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags.export_dir is passed.
  """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.

    myrank = 0
    numworkers = 1
    if (flags.enable_ml_comm == 1):

        # initialize the Cray PE ML Plugin
        # config the thread team (correcting the number of epochs for the effectice batch size))
        #totsize = sum([reduce(lambda x, y: x*y, v.get_shape().as_list()) for v in tf.trainable_variables()])

        totsize = 25551401  #Specific size for resnet50-v2
        mc.init(2, 1, totsize, "tensorflow")
        myrank = mc.get_rank()
        numworkers = mc.get_nranks()
        if (myrank == 0):
            print("ResNet with {:9d} parameters".format(totsize))

        max_steps_train = int(
            math.ceil(flags.train_epochs * (num_train_samps + num_eval_samps) /
                      (mc.get_nranks() * flags.batch_size)))
        #(0,0,num_steps_before_going_nonblock, max_steps_train, verbose=1, how_often_to_print=100)
        mc.config_team(0, 0, max_steps_train, max_steps_train, 1, 100)

        flags.model_dir = flags.model_dir if mc.get_rank() == 0 else None
        flags.benchmark_log_dir = flags.benchmark_log_dir if mc.get_rank(
        ) == 0 else None
        flags.export_dir = flags.export_dir if mc.get_rank() == 0 else None

    else:
        rank_id = myrank

    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_steps=500, session_config=session_config)

    classifier = tf.estimator.Estimator(model_fn=model_function,
                                        model_dir=flags.model_dir,
                                        config=run_config,
                                        params={
                                            'resnet_size': flags.resnet_size,
                                            'data_format': flags.data_format,
                                            'batch_size': flags.batch_size,
                                            'multi_gpu': flags.multi_gpu,
                                            'train_epochs': flags.train_epochs,
                                            'version': flags.version,
                                            'loss_scale': flags.loss_scale,
                                            'dtype': flags.dtype,
                                            'mlcomm': flags.enable_ml_comm,
                                            'log_freq':
                                            flags.global_perf_log_freq,
                                            'weight_decay': flags.weight_decay,
                                            'init_lr': flags.init_lr,
                                            'base_lr': flags.base_lr,
                                            'warmup_epochs':
                                            flags.warmup_epochs,
                                            'log_freq':
                                            flags.global_perf_log_freq,
                                        })

    benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info('resnet')

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)
        if (myrank == 0):
            print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu,
                                  numworkers, myrank)

        tsteps = math.ceil(
            float(flags.epochs_between_evals * num_train_samps) /
            (numworkers * flags.batch_size))
        classifier.train(input_fn=input_fn_train,
                         steps=tsteps,
                         max_steps=flags.max_train_steps)

        if (myrank == 0):
            print('Starting to evaluate.')

        # Evaluate the model and print results
        def input_fn_eval():
            return input_function(False, flags.data_dir, flags.batch_size, 3,
                                  flags.num_parallel_calls, flags.multi_gpu,
                                  numworkers, myrank)

        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        esteps = math.ceil(
            float(num_eval_samps) / (numworkers * flags.batch_size))
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=esteps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)

    if (flags.enable_ml_comm == 1):
        mc.finalize()
 def test_config_benchmark_bigquery_logger(self, mock_bigquery_client):
   with flagsaver.flagsaver(benchmark_logger_type="BenchmarkBigQueryLogger"):
     logger.config_benchmark_logger()
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BenchmarkBigQueryLogger)
Beispiel #22
0
 def test_config_base_benchmark_logger(self):
   with flagsaver.flagsaver(benchmark_logger_type='BaseBenchmarkLogger'):
     logger.config_benchmark_logger()
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BaseBenchmarkLogger)
Beispiel #23
0
def train_boosted_trees(flags_obj):
  """Train boosted_trees estimator on HIGGS data.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  # Clean up the model directory if present.
  if tf.gfile.Exists(flags_obj.model_dir):
    tf.gfile.DeleteRecursively(flags_obj.model_dir)
  tf.logging.info("## Data loading...")
  train_data, eval_data = read_higgs_data(
      flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
      flags_obj.eval_start, flags_obj.eval_count)
  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))
  # Data consists of one label column followed by 28 feature columns.
  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
  tf.logging.info("## Features prepared. Training starts...")

  # Create benchmark logger to log info about the training and metric values
  run_params = {
      "train_start": flags_obj.train_start,
      "train_count": flags_obj.train_count,
      "eval_start": flags_obj.eval_start,
      "eval_count": flags_obj.eval_count,
      "n_trees": flags_obj.n_trees,
      "max_depth": flags_obj.max_depth,
  }
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info(
      model_name="boosted_trees",
      dataset_name="higgs",
      run_params=run_params,
      test_id=flags_obj.benchmark_test_id)

  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
  # training is yet provided as a contrib library.
  classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
      train_input_fn,
      feature_columns,
      model_dir=flags_obj.model_dir or None,
      n_trees=flags_obj.n_trees,
      max_depth=flags_obj.max_depth,
      learning_rate=flags_obj.learning_rate)

  # Evaluation.
  eval_results = classifier.evaluate(eval_input_fn)
  # Benchmark the evaluation results
  benchmark_logger.log_evaluation_result(eval_results)

  # Exporting the savedmodel with csv parsing.
  if flags_obj.export_dir is not None:
    classifier.export_savedmodel(
        flags_obj.export_dir,
        _make_csv_serving_input_receiver_fn(
            column_names=feature_names,
            # columns are all floats.
            column_defaults=[[0.0]] * len(feature_names)),
        strip_default_attrs=True)
Beispiel #24
0
def main(_):
  # Data preprocessing
  # The file name of training and test dataset
  train_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TRAIN_RATINGS_FILENAME)
  test_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_RATINGS_FILENAME)
  neg_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_NEG_FILENAME)

  assert os.path.exists(train_fname), (
      "Run data_download.py first to download and extract {} dataset".format(
          FLAGS.dataset))

  tf.logging.info("Data preprocessing...")
  ncf_dataset = dataset.data_preprocessing(
      train_fname, test_fname, neg_fname, FLAGS.num_neg)

  # Create NeuMF model and convert it to Estimator
  tf.logging.info("Creating Estimator from Keras model...")
  layers = [int(layer) for layer in FLAGS.layers]
  mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
  keras_model = neumf_model.NeuMF(
      ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors,
      layers, FLAGS.batch_size, FLAGS.mf_regularization,
      mlp_regularization)
  num_gpus = flags_core.get_num_gpus(FLAGS)
  estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.config_benchmark_logger(FLAGS)
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params)

  # Training and evaluation cycle
  def train_input_fn():
    return dataset.input_fn(
        True, per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.epochs_between_evals)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    estimator.train(input_fn=train_input_fn, hooks=train_hooks)

    # Evaluate the model
    eval_results = evaluate_model(
        estimator, FLAGS.batch_size, num_gpus, ncf_dataset)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()