Example #1
    def test_end_to_end(self):
        ncf_dataset = data_preprocessing.instantiate_pipeline(
            dataset=DATASET,
            data_dir=self.temp_data_dir,
            batch_size=BATCH_SIZE,
            eval_batch_size=BATCH_SIZE,
            num_data_readers=2,
            num_neg=NUM_NEG)

        for _ in range(30):
            if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
                break
            time.sleep(1)  # allow `alive` file to be written

        g = tf.Graph()
        with g.as_default():
            input_fn, record_dir, batch_count = \
              data_preprocessing.make_train_input_fn(ncf_dataset)
            dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False})
        first_epoch = self.drain_dataset(dataset=dataset, g=g)
        user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
        item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

        train_examples = {
            True: set(),
            False: set(),
        }
        for features, labels in first_epoch:
            for u, i, l in zip(features[movielens.USER_COLUMN],
                               features[movielens.ITEM_COLUMN], labels):
                u_raw = user_inv_map[u]
                i_raw = item_inv_map[i]
                if ((u_raw, i_raw) in self.seen_pairs) != l:
                    # The evaluation item is not considered during false negative
                    # generation, so it will occasionally appear as a negative example
                    # during training.
                    assert not l
                    assert i_raw == self.holdout[u_raw][1]
                train_examples[l].add((u_raw, i_raw))
        num_positives_seen = len(train_examples[True])

        # The numbers don't match exactly because the last batch spills over into
        # the next epoch
        assert ncf_dataset.num_train_positives - num_positives_seen < BATCH_SIZE

        # This check is more heuristic because negatives are sampled with
        # replacement. It only checks that negative generation is reasonably random.
        assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9
Example #2
  def test_end_to_end(self):
    ncf_dataset = data_preprocessing.instantiate_pipeline(
        dataset=DATASET, data_dir=self.temp_data_dir,
        batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE, num_data_readers=2,
        num_neg=NUM_NEG)

    for _ in range(30):
      if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
        break
      time.sleep(1)  # allow `alive` file to be written

    g = tf.Graph()
    with g.as_default():
      input_fn, record_dir, batch_count = \
        data_preprocessing.make_train_input_fn(ncf_dataset)
      dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False})
    first_epoch = self.drain_dataset(dataset=dataset, g=g)
    user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
    item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

    train_examples = {
        True: set(),
        False: set(),
    }
    for features, labels in first_epoch:
      for u, i, l in zip(features[movielens.USER_COLUMN],
                         features[movielens.ITEM_COLUMN], labels):
        u_raw = user_inv_map[u]
        i_raw = item_inv_map[i]
        if ((u_raw, i_raw) in self.seen_pairs) != l:
          # The evaluation item is not considered during false negative
          # generation, so it will occasionally appear as a negative example
          # during training.
          assert not l
          assert i_raw == self.holdout[u_raw][1]
        train_examples[l].add((u_raw, i_raw))
    num_positives_seen = len(train_examples[True])

    # The numbers don't match exactly because the last batch spills over into
    # the next epoch
    assert ncf_dataset.num_train_positives - num_positives_seen < BATCH_SIZE

    # This check is more heuristic because negatives are sampled with
    # replacement. It only checks that negative generation is reasonably random.
    assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9
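
Both versions of test_end_to_end above rely on a drain_dataset helper defined elsewhere in the test case. A minimal sketch of what such a helper might look like (an assumption about its behavior, not the actual source; it presumes TF 1.x and a tf.test.TestCase subclass with tensorflow imported as tf):

  def drain_dataset(self, dataset, g):
    # Sketch only: materialize one epoch of `dataset` into NumPy
    # (features, labels) tuples by running the one-shot iterator until
    # it is exhausted.
    with g.as_default():
      batch = dataset.make_one_shot_iterator().get_next()
    results = []
    with self.test_session(graph=g) as sess:
      while True:
        try:
          results.append(sess.run(batch))
        except tf.errors.OutOfRangeError:
          break
    return results
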
Example #3
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)
    eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
    ncf_dataset = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf)

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "batch_size": batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": ncf_dataset.num_users,
            "num_items": ncf_dataset.num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
    )
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    approx_train_steps = int(ncf_dataset.num_train_positives *
                             (1 + FLAGS.num_neg) // FLAGS.batch_size)
    pred_input_fn = data_preprocessing.make_pred_input_fn(
        ncf_dataset=ncf_dataset)

    total_training_cycle = (1 if FLAGS.inference_only
                            else FLAGS.train_epochs // FLAGS.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        if not FLAGS.inference_only:
            # Train the model
            train_input_fn, train_record_dir, batch_count = \
                data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

            if np.abs(approx_train_steps - batch_count) > 1:
                tf.logging.warning(
                    "Estimated ({}) and reported ({}) number of batches differ by more "
                    "than one".format(approx_train_steps, batch_count))
            train_estimator.train(input_fn=train_input_fn,
                                  hooks=train_hooks,
                                  steps=batch_count)
            tf.gfile.DeleteRecursively(train_record_dir)

        # Evaluate the model
        eval_results = evaluate_model(eval_estimator, ncf_dataset,
                                      pred_input_fn)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        tf.logging.fatal("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # Export SavedModel
        if FLAGS.export_savedmodel:
            eval_estimator.export_savedmodel(FLAGS.model_dir,
                                             serving_input_receiver_fn)
            print("SavedModel successfully exported to: {}/<timestamp>".format(
                FLAGS.model_dir))

        # Some of the NumPy vector math can be quite large and likes to stay in
        # memory for a while.
        gc.collect()

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
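
The approx_train_steps estimate above follows from the fact that every training positive is paired with FLAGS.num_neg sampled negatives, so one epoch holds num_train_positives * (1 + num_neg) examples. A small worked illustration with assumed values (not taken from the example):

num_train_positives = 994169  # e.g. ML-1M after holding out one item per user
num_neg = 4
batch_size = 256

examples_per_epoch = num_train_positives * (1 + num_neg)  # 4,970,845
approx_train_steps = examples_per_epoch // batch_size     # 19,417
# The pipeline may report one more batch because the last, partially filled
# batch spills into the next epoch; the warning above only fires when the
# estimated and reported counts differ by more than one.
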
Example #4
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives
                           * (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
      data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(
        eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
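
model_helpers.past_stop_threshold decides whether the HR metric has reached FLAGS.hr_threshold so the training loop can stop early. A minimal sketch of that check (an approximation of the helper's behavior, not its actual source):

import tensorflow as tf

def past_stop_threshold(stop_threshold, eval_metric):
  # Sketch: no threshold configured means never stop early.
  if stop_threshold is None:
    return False
  if eval_metric >= stop_threshold:
    tf.logging.info("Stop threshold of {} was passed with metric value {}."
                    .format(stop_threshold, eval_metric))
    return True
  return False
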
Example #5
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None)

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "use_seed": FLAGS.seed is not None,
            "hash_pipeline": FLAGS.hash_pipeline,
            "batch_size": batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": ncf_dataset.num_users,
            "num_items": ncf_dataset.num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "num_neg": FLAGS.num_neg,
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
            "beta1": FLAGS.beta1,
            "beta2": FLAGS.beta2,
            "epsilon": FLAGS.epsilon,
            "match_mlperf": FLAGS.ml_perf,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    approx_train_steps = int(ncf_dataset.num_train_positives *
                             (1 + FLAGS.num_neg) // FLAGS.batch_size)
    pred_input_fn = data_preprocessing.make_pred_input_fn(
        ncf_dataset=ncf_dataset)

    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        # Train the model
        train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

        if np.abs(approx_train_steps - batch_count) > 1:
            tf.logging.warning(
                "Estimated ({}) and reported ({}) number of batches differ by more "
                "than one".format(approx_train_steps, batch_count))

        train_estimator.train(input_fn=train_input_fn,
                              hooks=train_hooks,
                              steps=batch_count)
        tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        eval_results = eval_estimator.evaluate(pred_input_fn)
        tf.logging.info("Evaluation complete.")

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[rconst.HR_KEY]
        ndcg = eval_results[rconst.NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
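
The eval batch size adjustment near the top of this example keeps each user's full candidate set (one positive plus rconst.NUM_EVAL_NEGATIVES sampled negatives) inside a single batch. A worked illustration of the rounding, using assumed values (999 eval negatives, as in the MLPerf-style NCF evaluation, and a hypothetical requested batch size):

NUM_EVAL_NEGATIVES = 999                  # assumed value for illustration
eval_per_user = NUM_EVAL_NEGATIVES + 1    # 1000 eval examples per user

requested = 16384                         # hypothetical --eval_batch_size
if requested % eval_per_user:
  # Round down to a multiple of eval_per_user so no user's candidates are
  # split across two batches.
  requested = requested // eval_per_user * eval_per_user
print(requested)  # 16000
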