コード例 #1
0
ファイル: movielens_main.py プロジェクト: 812864539/models
def run_movie(flags_obj):
  """Set up the MovieLens inputs and hand them to the wide-deep run loop.

  If `flags_obj.download_if_missing` is set, `movielens.download` is called
  first. The train/eval input functions and the model column function are
  then obtained from `movielens_dataset.construct_input_fns` and passed,
  together with `build_estimator`, to `wide_deep_run_loop.run_loop`.

  Args:
    flags_obj: Object containing user specified flags. The attributes read
      here are download_if_missing, dataset, data_dir, batch_size and
      epochs_between_evals; the object itself is forwarded to run_loop.
  """
  if flags_obj.download_if_missing:
    movielens.download(
        dataset=flags_obj.dataset,
        data_dir=flags_obj.data_dir,
    )

  input_fns = movielens_dataset.construct_input_fns(
      dataset=flags_obj.dataset,
      data_dir=flags_obj.data_dir,
      batch_size=flags_obj.batch_size,
      repeat=flags_obj.epochs_between_evals,
  )
  train_input_fn, eval_input_fn, model_column_fn = input_fns

  wide_deep_run_loop.run_loop(
      name="MovieLens",
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      model_column_fn=model_column_fn,
      build_estimator_fn=build_estimator,
      flags_obj=flags_obj,
      # The template is passed through as-is; it is not formatted here.
      tensors_to_log={'loss': '{loss_prefix}head/weighted_loss/value'},
      early_stop=False,
  )
コード例 #2
0
ファイル: ncf_common.py プロジェクト: techRnD/models
def get_inputs(params):
  """Build the data producer and derive dataset sizes and step counts.

  Optionally downloads the dataset and seeds NumPy, then either creates a
  synthetic producer or instantiates the real preprocessing pipeline.

  Args:
    params: Mapping of run parameters. It is forwarded to
      `data_preprocessing.instantiate_pipeline`, and its "batches_per_step"
      entry is read here; both happen on the real-data path only.

  Returns:
    A tuple (num_users, num_items, num_train_steps, num_eval_steps, producer).
  """
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  seed = FLAGS.seed
  if seed is not None:
    np.random.seed(seed)

  if not FLAGS.use_synthetic_data:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=seed is not None)

    batches_per_step = params["batches_per_step"]
    num_train_steps, train_leftover = divmod(
        producer.train_batches_per_epoch, batches_per_step)
    num_eval_steps, eval_leftover = divmod(
        producer.eval_batches_per_epoch, batches_per_step)
    # Batches per epoch must split evenly into steps.
    assert not train_leftover
    assert not eval_leftover
  else:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH

  return num_users, num_items, num_train_steps, num_eval_steps, producer
コード例 #3
0
ファイル: ncf_common.py プロジェクト: rder96/models
def get_inputs(params):
  """Create the input producer and report sizes and per-epoch step counts.

  Args:
    params: Mapping of run parameters. On the real-data path it is passed to
      `data_preprocessing.instantiate_pipeline` and its "batches_per_step"
      entry is used to convert batches per epoch into steps.

  Returns:
    A tuple (num_users, num_items, num_train_steps, num_eval_steps, producer).
  """
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  seeded = FLAGS.seed is not None
  if seeded:
    np.random.seed(FLAGS.seed)

  # Synthetic data: fixed step counts and a table lookup for the sizes.
  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    size_table = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS
    num_users, num_items = size_table[FLAGS.dataset]
    num_train_steps = num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    return num_users, num_items, num_train_steps, num_eval_steps, producer

  # Real data: the pipeline reports the sizes and batches per epoch.
  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset,
      data_dir=FLAGS.data_dir,
      params=params,
      constructor_type=FLAGS.constructor_type,
      deterministic=seeded)

  train_batches = producer.train_batches_per_epoch
  eval_batches = producer.eval_batches_per_epoch
  per_step = params["batches_per_step"]

  num_train_steps = train_batches // per_step
  num_eval_steps = eval_batches // per_step
  # Each epoch must contain a whole number of steps.
  assert not train_batches % per_step
  assert not eval_batches % per_step

  return num_users, num_items, num_train_steps, num_eval_steps, producer
コード例 #4
0
def run_movie(flags_obj):
    """Build the MovieLens input functions and invoke the run loop.

    `movielens.download` is called first when
    `flags_obj.download_if_missing` is set. The input functions and model
    column function come from `movielens_dataset.construct_input_fns` and
    are passed, with `build_estimator`, to `wide_deep_run_loop.run_loop`.

    Args:
      flags_obj: Object containing user specified flags. The attributes
        read here are download_if_missing, dataset, data_dir, batch_size
        and epochs_between_evals; the object is also forwarded to run_loop.
    """
    if flags_obj.download_if_missing:
        movielens.download(
            dataset=flags_obj.dataset,
            data_dir=flags_obj.data_dir,
        )

    input_fns = movielens_dataset.construct_input_fns(
        dataset=flags_obj.dataset,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        repeat=flags_obj.epochs_between_evals,
    )
    train_input_fn, eval_input_fn, model_column_fn = input_fns

    run_loop_kwargs = {
        "name": "MovieLens",
        "train_input_fn": train_input_fn,
        "eval_input_fn": eval_input_fn,
        "model_column_fn": model_column_fn,
        "build_estimator_fn": build_estimator,
        "flags_obj": flags_obj,
        # The template is passed through as-is; it is not formatted here.
        "tensors_to_log": {'loss': '{loss_prefix}head/weighted_loss/value'},
        "early_stop": False,
    }
    wide_deep_run_loop.run_loop(**run_loop_kwargs)
コード例 #5
0
def prepare_raw_data(flag_obj):
    """Download the raw dataset and build an offline data-generation pipeline.

    Args:
      flag_obj: Flags object. The attributes read here are dataset, data_dir,
        num_train_epochs, prebatch_size, num_negative_samples and
        constructor_type.

    Returns:
      A tuple (producer, input_metadata): the producer returned by
      `data_preprocessing.instantiate_pipeline`, and a dict describing the
      generated data (user/item counts, constructor type, train/eval element
      counts, number of train epochs and prebatch size).
    """
    dataset = flag_obj.dataset
    data_dir = flag_obj.data_dir
    movielens.download(dataset, data_dir)

    train_epochs = flag_obj.num_train_epochs
    prebatch_size = flag_obj.prebatch_size
    constructor_type = flag_obj.constructor_type

    # Train and eval batches both use the prebatch size.
    pipeline_params = {
        "train_epochs": train_epochs,
        "batch_size": prebatch_size,
        "eval_batch_size": prebatch_size,
        "batches_per_step": 1,
        "stream_files": True,
        "num_neg": flag_obj.num_negative_samples,
    }

    # Epoch data goes to the same directory as the raw data.
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=dataset,
        data_dir=data_dir,
        params=pipeline_params,
        constructor_type=constructor_type,
        epoch_dir=data_dir,
        generate_data_offline=True)

    # The element counts are only exposed as private producer attributes.
    # pylint: disable=protected-access
    return producer, {
        "num_users": num_users,
        "num_items": num_items,
        "constructor_type": constructor_type,
        "num_train_elements": producer._elements_in_epoch,
        "num_eval_elements": producer._eval_elements_in_epoch,
        "num_train_epochs": train_epochs,
        "prebatch_size": prebatch_size,
    }
    # pylint: enable=protected-access
コード例 #6
0
ファイル: movielens_dataset.py プロジェクト: 812864539/models
def main(_):
  """Download the dataset named by the flags and build its input functions.

  Args:
    _: Unused.
  """
  parsed_flags = flags.FLAGS
  dataset, data_dir = parsed_flags.dataset, parsed_flags.data_dir
  movielens.download(dataset=dataset, data_dir=data_dir)
  construct_input_fns(dataset, data_dir)
コード例 #7
0
ファイル: ncf_main.py プロジェクト: pooyadavoodi/models
def run_ncf(_):
  """Run NCF training and eval loop.

  Builds the data producer (synthetic or real), constructs the estimator and
  then runs `train_epochs // epochs_between_evals` cycles, each one a training
  pass followed by an evaluation. The loop ends early once the evaluated hit
  rate passes `FLAGS.hr_threshold`.

  The producer is started before the estimator is built. It is always asked
  to stop and then joined, including when setup, training or evaluation
  raises, so a failed run does not leave the producer running.

  Args:
    _: Unused.
  """
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  params = parse_flags(FLAGS)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    # Batches per epoch must split evenly into steps.
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]
  producer.start()

  target_reached = False
  try:
    params["num_users"], params["num_items"] = num_users, num_items
    model_helpers.apply_clean(flags.FLAGS)

    estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

    benchmark_logger, train_hooks = log_and_get_hooks(
        params["eval_batch_size"])

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
      # With the MLPerf logger enabled, every epoch must be evaluated.
      assert (FLAGS.epochs_between_evals == 1 or
              not mlperf_helper.LOGGER.enabled)
      tf.logging.info("Starting a training cycle: {}/{}".format(
          cycle_index + 1, total_training_cycle))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                              value=cycle_index)

      train_input_fn = producer.make_input_fn(is_training=True)
      estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                      steps=num_train_steps)

      tf.logging.info("Beginning evaluation.")
      eval_input_fn = producer.make_input_fn(is_training=False)

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")

      hr = float(eval_results[rconst.HR_KEY])
      ndcg = float(eval_results[rconst.NDCG_KEY])
      loss = float(eval_results["loss"])

      mlperf_helper.ncf_print(
          key=mlperf_helper.TAGS.EVAL_TARGET,
          value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                              value={"epoch": cycle_index, "value": hr})
      mlperf_helper.ncf_print(
          key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
          value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                              value=cycle_index)

      # Benchmark the evaluation results
      benchmark_logger.log_evaluation_result(eval_results)
      # Log the HR and NDCG results.
      tf.logging.info(
          "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
              cycle_index + 1, hr, ndcg, loss))

      # If some evaluation threshold is met
      if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
        target_reached = True
        break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
  finally:
    # Shut the producer down on failure too. On success this runs right
    # after RUN_STOP is logged, so the logging order is unchanged.
    producer.stop_loop()
    producer.join()

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
コード例 #8
0
def run_ncf(_):
    """Run NCF training and eval loop.

    Sets up the data pipeline (or synthetic-data placeholders), builds the
    train and eval estimators, then runs
    `train_epochs // epochs_between_evals` cycles of training followed by
    evaluation. The loop ends early once the evaluated hit rate passes
    `FLAGS.hr_threshold`.

    The pipeline's `cleanup_fn` is always invoked, including when estimator
    construction, training or evaluation raises, so data-construction
    artifacts and the subprocess are not left behind on failure.

    Args:
      _: Unused.

    Raises:
      ValueError: If the batch count reported by
        `data_preprocessing.make_input_fn` differs from the expected number
        of train or eval steps.
    """
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max(FLAGS.batch_size, eval_per_user))
    if eval_batch_size % eval_per_user:
        # Round down to a whole number of users, but never below one user's
        # worth of examples: a smaller value would round to 0 and divide by
        # zero when num_eval_steps is computed below.
        eval_batch_size = max(
            eval_per_user,
            eval_batch_size // eval_per_user * eval_per_user)
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    if FLAGS.use_synthetic_data:
        ncf_dataset = None
        cleanup_fn = lambda: None
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_neg=FLAGS.num_neg,
            epochs_per_cycle=FLAGS.epochs_between_evals,
            match_mlperf=FLAGS.ml_perf,
            deterministic=FLAGS.seed is not None,
            use_subprocess=FLAGS.use_subprocess,
            cache_id=FLAGS.cache_id)
        num_users = ncf_dataset.num_users
        num_items = ncf_dataset.num_items
        num_train_steps = int(
            np.ceil(FLAGS.epochs_between_evals *
                    ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) /
                    FLAGS.batch_size))
        num_eval_steps = int(
            np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                    eval_batch_size))

    target_reached = False
    try:
        model_helpers.apply_clean(flags.FLAGS)

        train_estimator, eval_estimator = construct_estimator(
            num_gpus=num_gpus,
            model_dir=FLAGS.model_dir,
            params={
                "use_seed": FLAGS.seed is not None,
                "hash_pipeline": FLAGS.hash_pipeline,
                "batch_size": batch_size,
                "eval_batch_size": eval_batch_size,
                "learning_rate": FLAGS.learning_rate,
                "num_users": num_users,
                "num_items": num_items,
                "mf_dim": FLAGS.num_factors,
                "model_layers": [int(layer) for layer in FLAGS.layers],
                "mf_regularization": FLAGS.mf_regularization,
                "mlp_reg_layers": [
                    float(reg) for reg in FLAGS.mlp_regularization],
                "num_neg": FLAGS.num_neg,
                "use_tpu": FLAGS.tpu is not None,
                "tpu": FLAGS.tpu,
                "tpu_zone": FLAGS.tpu_zone,
                "tpu_gcp_project": FLAGS.tpu_gcp_project,
                "beta1": FLAGS.beta1,
                "beta2": FLAGS.beta2,
                "epsilon": FLAGS.epsilon,
                "match_mlperf": FLAGS.ml_perf,
                "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
            },
            batch_size=flags.FLAGS.batch_size,
            eval_batch_size=eval_batch_size)

        # Create hooks that log information about the training and metric
        # values
        train_hooks = hooks_helper.get_train_hooks(
            FLAGS.hooks,
            model_dir=FLAGS.model_dir,
            batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
            tensors_to_log={"cross_entropy": "cross_entropy"})
        run_params = {
            "batch_size": FLAGS.batch_size,
            "eval_batch_size": eval_batch_size,
            "number_factors": FLAGS.num_factors,
            "hr_threshold": FLAGS.hr_threshold,
            "train_epochs": FLAGS.train_epochs,
        }
        benchmark_logger = logger.get_benchmark_logger()
        benchmark_logger.log_run_info(model_name="recommendation",
                                      dataset_name=FLAGS.dataset,
                                      run_params=run_params,
                                      test_id=FLAGS.benchmark_test_id)

        # The eval input_fn is built on the first cycle and reused afterwards.
        pred_input_fn = None
        total_training_cycle = (
            FLAGS.train_epochs // FLAGS.epochs_between_evals)
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
        for cycle_index in range(total_training_cycle):
            # With the MLPerf logger enabled, every epoch must be evaluated.
            assert (FLAGS.epochs_between_evals == 1 or
                    not mlperf_helper.LOGGER.enabled)
            tf.logging.info("Starting a training cycle: {}/{}".format(
                cycle_index + 1, total_training_cycle))

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                    value=cycle_index)

            # Train the model
            train_input_fn, train_record_dir, batch_count = (
                data_preprocessing.make_input_fn(
                    ncf_dataset=ncf_dataset, is_training=True))

            if batch_count != num_train_steps:
                raise ValueError(
                    "Step counts do not match. ({} vs. {}) The async process is "
                    "producing incorrect shards.".format(batch_count,
                                                         num_train_steps))

            train_estimator.train(input_fn=train_input_fn,
                                  hooks=train_hooks,
                                  steps=num_train_steps)
            if train_record_dir:
                tf.gfile.DeleteRecursively(train_record_dir)

            tf.logging.info("Beginning evaluation.")
            if pred_input_fn is None:
                pred_input_fn, _, eval_batch_count = (
                    data_preprocessing.make_input_fn(
                        ncf_dataset=ncf_dataset, is_training=False))

                if eval_batch_count != num_eval_steps:
                    raise ValueError(
                        "Step counts do not match. ({} vs. {}) The async process is "
                        "producing incorrect shards.".format(
                            eval_batch_count, num_eval_steps))

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                    value=cycle_index)
            eval_results = eval_estimator.evaluate(pred_input_fn,
                                                   steps=num_eval_steps)
            hr = float(eval_results[rconst.HR_KEY])
            ndcg = float(eval_results[rconst.NDCG_KEY])
            tf.logging.info("Evaluation complete.")

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                                    value={
                                        "epoch": cycle_index,
                                        "value": FLAGS.hr_threshold
                                    })
            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                                    value={
                                        "epoch": cycle_index,
                                        "value": hr
                                    })
            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                                    value={
                                        "epoch": cycle_index,
                                        "value": rconst.NUM_EVAL_NEGATIVES
                                    })

            # Logged by the async process during record creation.
            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                                    deferred=True)

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                    value=cycle_index)

            # Benchmark the evaluation results
            benchmark_logger.log_evaluation_result(eval_results)
            # Log the HR and NDCG results.
            tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
                cycle_index + 1, hr, ndcg))

            # If some evaluation threshold is met
            if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
                target_reached = True
                break

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                                value={"success": target_reached})
    finally:
        # Cleanup data construction artifacts and subprocess. This runs on
        # failure too; on success it still follows the RUN_STOP log.
        cleanup_fn()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
コード例 #9
0
ファイル: ncf_main.py プロジェクト: jameslee5656/models
def run_ncf(_):
  """Train and evaluate the NCF model in alternating cycles.

  Instantiates the data pipeline, builds the train and eval estimators, then
  runs `train_epochs // epochs_between_evals` cycles. Each cycle trains on a
  freshly made train input_fn, deletes that cycle's train record directory,
  evaluates, and logs HR and NDCG. The loop ends early once HR passes
  `FLAGS.hr_threshold`.

  Args:
    _: Unused.
  """
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  per_device_batch = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  # Fall back to the training batch size when no eval batch size is given.
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)

  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset,
      data_dir=FLAGS.data_dir,
      batch_size=per_device_batch,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  model_params = {
      "batch_size": per_device_batch,
      "learning_rate": FLAGS.learning_rate,
      "num_users": ncf_dataset.num_users,
      "num_items": ncf_dataset.num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
  }
  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus,
      model_dir=FLAGS.model_dir,
      params=model_params,
      batch_size=flags.FLAGS.batch_size,
      eval_batch_size=eval_batch_size)

  # Hooks that log information about the training and metric values.
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size)  # for ExamplesPerSecondHook

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params={
          "batch_size": FLAGS.batch_size,
          "eval_batch_size": eval_batch_size,
          "number_factors": FLAGS.num_factors,
          "hr_threshold": FLAGS.hr_threshold,
          "train_epochs": FLAGS.train_epochs,
      },
      test_id=FLAGS.benchmark_test_id)

  # Expected batches per cycle; only used for the sanity warning below.
  examples_per_cycle = ncf_dataset.num_train_positives * (1 + FLAGS.num_neg)
  approx_train_steps = int(examples_per_cycle // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    iteration = cycle_index + 1
    tf.logging.info("Starting a training cycle: {}/{}".format(
        iteration, total_training_cycle))

    # Train on this cycle's records, then delete them.
    train_bundle = data_preprocessing.make_train_input_fn(
        ncf_dataset=ncf_dataset)
    train_input_fn, train_record_dir, batch_count = train_bundle

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(
        input_fn=train_input_fn, hooks=train_hooks, steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate, then log the results to the benchmark logger.
    eval_results = evaluate_model(eval_estimator, ncf_dataset, pred_input_fn)
    benchmark_logger.log_evaluation_result(eval_results)

    hr, ndcg = eval_results[_HR_KEY], eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(iteration, hr, ndcg))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # Stop early once the evaluation threshold is met.
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
コード例 #10
0
def run_ncf(_):
    """Run NCF training and eval loop.

    Builds the data producer (synthetic or real), constructs the estimator
    and then runs `train_epochs // epochs_between_evals` cycles, each one a
    training pass followed by an evaluation. The loop ends early once the
    evaluated hit rate passes `FLAGS.hr_threshold`.

    The producer is started before the estimator is built. It is always
    asked to stop and then joined, including when setup, training or
    evaluation raises, so a failed run does not leave the producer running.

    Args:
      _: Unused.
    """
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    params = parse_flags(FLAGS)
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    if FLAGS.use_synthetic_data:
        producer = data_pipeline.DummyConstructor()
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            params=params,
            constructor_type=FLAGS.constructor_type,
            deterministic=FLAGS.seed is not None)

        num_train_steps = (producer.train_batches_per_epoch //
                           params["batches_per_step"])
        num_eval_steps = (producer.eval_batches_per_epoch //
                          params["batches_per_step"])
        # Batches per epoch must split evenly into steps.
        assert not producer.train_batches_per_epoch % params["batches_per_step"]
        assert not producer.eval_batches_per_epoch % params["batches_per_step"]
    producer.start()

    target_reached = False
    try:
        params["num_users"], params["num_items"] = num_users, num_items
        model_helpers.apply_clean(flags.FLAGS)

        estimator = construct_estimator(model_dir=FLAGS.model_dir,
                                        params=params)

        benchmark_logger, train_hooks = log_and_get_hooks(
            params["eval_batch_size"])

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
        for cycle_index in range(total_training_cycle):
            # With the MLPerf logger enabled, every epoch must be evaluated.
            assert (FLAGS.epochs_between_evals == 1 or
                    not mlperf_helper.LOGGER.enabled)
            tf.logging.info("Starting a training cycle: {}/{}".format(
                cycle_index + 1, total_training_cycle))

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                    value=cycle_index)

            train_input_fn = producer.make_input_fn(is_training=True)
            estimator.train(input_fn=train_input_fn,
                            hooks=train_hooks,
                            steps=num_train_steps)

            tf.logging.info("Beginning evaluation.")
            eval_input_fn = producer.make_input_fn(is_training=False)

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                    value=cycle_index)
            eval_results = estimator.evaluate(eval_input_fn,
                                              steps=num_eval_steps)
            tf.logging.info("Evaluation complete.")

            hr = float(eval_results[rconst.HR_KEY])
            ndcg = float(eval_results[rconst.NDCG_KEY])
            loss = float(eval_results["loss"])

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                                    value={
                                        "epoch": cycle_index,
                                        "value": FLAGS.hr_threshold
                                    })
            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                                    value={
                                        "epoch": cycle_index,
                                        "value": hr
                                    })
            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                                    value={
                                        "epoch": cycle_index,
                                        "value": rconst.NUM_EVAL_NEGATIVES
                                    })

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                    value=cycle_index)

            # Benchmark the evaluation results
            benchmark_logger.log_evaluation_result(eval_results)
            # Log the HR and NDCG results.
            tf.logging.info(
                "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
                    cycle_index + 1, hr, ndcg, loss))

            # If some evaluation threshold is met
            if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
                target_reached = True
                break

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                                value={"success": target_reached})
    finally:
        # Shut the producer down on failure too. On success this runs right
        # after RUN_STOP is logged, so the logging order is unchanged.
        producer.stop_loop()
        producer.join()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
コード例 #11
0
def run_ncf(_):
    """Train and/or evaluate the NCF model in cycles.

    Instantiates the data pipeline and builds the train and eval estimators.
    When `FLAGS.inference_only` is set, a single cycle runs and training is
    skipped; otherwise `train_epochs // epochs_between_evals` cycles run,
    each training before evaluating. Every cycle logs HR and NDCG, exports a
    SavedModel if `FLAGS.export_savedmodel` is set, and ends the loop once HR
    passes `FLAGS.hr_threshold`.

    Args:
      _: Unused.
    """
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    per_device_batch = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)
    # Fall back to the training batch size when no eval batch size is given.
    eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)

    ncf_dataset = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=per_device_batch,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf)

    model_helpers.apply_clean(flags.FLAGS)

    model_params = {
        "batch_size": per_device_batch,
        "learning_rate": FLAGS.learning_rate,
        "num_users": ncf_dataset.num_users,
        "num_items": ncf_dataset.num_items,
        "mf_dim": FLAGS.num_factors,
        "model_layers": [int(layer) for layer in FLAGS.layers],
        "mf_regularization": FLAGS.mf_regularization,
        "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
        "use_tpu": FLAGS.tpu is not None,
        "tpu": FLAGS.tpu,
        "tpu_zone": FLAGS.tpu_zone,
        "tpu_gcp_project": FLAGS.tpu_gcp_project,
    }
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params=model_params,
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Hooks that log information about the training and metric values.
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size)  # for ExamplesPerSecondHook

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(
        model_name="recommendation",
        dataset_name=FLAGS.dataset,
        run_params={
            "batch_size": FLAGS.batch_size,
            "eval_batch_size": eval_batch_size,
            "number_factors": FLAGS.num_factors,
            "hr_threshold": FLAGS.hr_threshold,
            "train_epochs": FLAGS.train_epochs,
        },
        test_id=FLAGS.benchmark_test_id)

    # Expected batches per cycle; only used for the sanity warning below.
    examples_per_cycle = (
        ncf_dataset.num_train_positives * (1 + FLAGS.num_neg))
    approx_train_steps = int(examples_per_cycle // FLAGS.batch_size)
    pred_input_fn = data_preprocessing.make_pred_input_fn(
        ncf_dataset=ncf_dataset)

    # Inference-only runs evaluate exactly once.
    if FLAGS.inference_only:
        total_training_cycle = 1
    else:
        total_training_cycle = (
            FLAGS.train_epochs // FLAGS.epochs_between_evals)

    for cycle_index in range(total_training_cycle):
        iteration = cycle_index + 1
        tf.logging.info("Starting a training cycle: {}/{}".format(
            iteration, total_training_cycle))

        if not FLAGS.inference_only:
            # Train on this cycle's records, then delete them.
            train_bundle = data_preprocessing.make_train_input_fn(
                ncf_dataset=ncf_dataset)
            train_input_fn, train_record_dir, batch_count = train_bundle

            if np.abs(approx_train_steps - batch_count) > 1:
                tf.logging.warning(
                    "Estimated ({}) and reported ({}) number of batches differ by more "
                    "than one".format(approx_train_steps, batch_count))
            train_estimator.train(
                input_fn=train_input_fn,
                hooks=train_hooks,
                steps=batch_count)
            tf.gfile.DeleteRecursively(train_record_dir)

        # Evaluate, then log the results to the benchmark logger.
        eval_results = evaluate_model(
            eval_estimator, ncf_dataset, pred_input_fn)
        benchmark_logger.log_evaluation_result(eval_results)

        # HR and NDCG are logged at FATAL level, as in the original code.
        hr, ndcg = eval_results[_HR_KEY], eval_results[_NDCG_KEY]
        tf.logging.fatal("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            iteration, hr, ndcg))

        # Export SavedModel
        if FLAGS.export_savedmodel:
            eval_estimator.export_savedmodel(
                FLAGS.model_dir, serving_input_receiver_fn)
            print("SavedModel successfully exported to: {}/<timestamp>".format(
                FLAGS.model_dir))

        # Some of the NumPy vector math can be quite large and likes to stay
        # in memory for a while.
        gc.collect()

        # Stop early once the evaluation threshold is met.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
コード例 #12
0
ファイル: ncf_main.py プロジェクト: 812864539/models
def run_ncf(_):
  """Run NCF training and eval loop.

  Sets up the data pipeline (or synthetic-data placeholders when
  FLAGS.use_synthetic_data is set), builds either an Estimator pair or an
  NcfModelRunner depending on FLAGS.use_estimator, and then alternates
  training and evaluation for FLAGS.train_epochs // FLAGS.epochs_between_evals
  cycles. The loop exits early once model_helpers.past_stop_threshold returns
  a truthy value for (FLAGS.hr_threshold, hr).

  cleanup_fn is invoked from a `finally` clause so that it still runs when
  model construction, training, or evaluation raises.

  Args:
    _: Unused.

  Raises:
    ValueError: On the Estimator path, if the batch count reported by
      data_preprocessing.make_input_fn differs from the precomputed number of
      train or eval steps.
  """
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  # eval_batch_size has to be a multiple of eval_per_user
  # (NUM_EVAL_NEGATIVES + 1); round it down when it is not.
  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    # No real pipeline is created, so there is nothing to clean up.
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    # Expected step counts; checked against the pipeline's reported batch
    # counts inside the training loop (Estimator path).
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
                                 ncf_dataset.num_users / eval_batch_size))

  # Everything from here to RUN_STOP is guarded so that cleanup_fn runs on
  # both the success and the failure path.
  try:
    model_helpers.apply_clean(flags.FLAGS)

    params = {
        "use_seed": FLAGS.seed is not None,
        "hash_pipeline": FLAGS.hash_pipeline,
        "batch_size": batch_size,
        "eval_batch_size": eval_batch_size,
        "learning_rate": FLAGS.learning_rate,
        "num_users": num_users,
        "num_items": num_items,
        "mf_dim": FLAGS.num_factors,
        "model_layers": [int(layer) for layer in FLAGS.layers],
        "mf_regularization": FLAGS.mf_regularization,
        "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
        "num_neg": FLAGS.num_neg,
        "use_tpu": FLAGS.tpu is not None,
        "tpu": FLAGS.tpu,
        "tpu_zone": FLAGS.tpu_zone,
        "tpu_gcp_project": FLAGS.tpu_gcp_project,
        "beta1": FLAGS.beta1,
        "beta2": FLAGS.beta2,
        "epsilon": FLAGS.epsilon,
        "match_mlperf": FLAGS.ml_perf,
        "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
        "use_estimator": FLAGS.use_estimator,
    }
    if FLAGS.use_estimator:
      train_estimator, eval_estimator = construct_estimator(
          num_gpus=num_gpus, model_dir=FLAGS.model_dir,
          iterations=num_train_steps, params=params,
          batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
    else:
      runner = model_runner.NcfModelRunner(
          ncf_dataset, params, num_train_steps, num_eval_steps,
          FLAGS.use_while_loop)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"}
    )
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(
        model_name="recommendation",
        dataset_name=FLAGS.dataset,
        run_params=run_params,
        test_id=FLAGS.benchmark_test_id)

    # The eval input_fn is created once, on the first cycle, and then reused.
    eval_input_fn = None
    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
      assert (FLAGS.epochs_between_evals == 1 or
              not mlperf_helper.LOGGER.enabled)
      tf.logging.info("Starting a training cycle: {}/{}".format(
          cycle_index + 1, total_training_cycle))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                              value=cycle_index)

      # Train the model
      if FLAGS.use_estimator:
        train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

        if batch_count != num_train_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  batch_count, num_train_steps))

        train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                              steps=num_train_steps)
        # The records for this cycle are no longer needed once trained on.
        if train_record_dir:
          tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        if eval_input_fn is None:
          eval_input_fn, _, eval_batch_count = \
            data_preprocessing.make_input_fn(
                ncf_dataset=ncf_dataset, is_training=False)

          if eval_batch_count != num_eval_steps:
            raise ValueError(
                "Step counts do not match. ({} vs. {}) The async process is "
                "producing incorrect shards.".format(
                    eval_batch_count, num_eval_steps))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = eval_estimator.evaluate(eval_input_fn,
                                               steps=num_eval_steps)
        tf.logging.info("Evaluation complete.")
      else:
        runner.train()
        tf.logging.info("Beginning evaluation.")
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = runner.eval()
        tf.logging.info("Evaluation complete.")
      hr = float(eval_results[rconst.HR_KEY])
      ndcg = float(eval_results[rconst.NDCG_KEY])

      mlperf_helper.ncf_print(
          key=mlperf_helper.TAGS.EVAL_TARGET,
          value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                              value={"epoch": cycle_index, "value": hr})
      mlperf_helper.ncf_print(
          key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
          value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

      # Logged by the async process during record creation.
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                              deferred=True)

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                              value=cycle_index)

      # Benchmark the evaluation results
      benchmark_logger.log_evaluation_result(eval_results)
      # Log the HR and NDCG results.
      tf.logging.info(
          "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
              cycle_index + 1, hr, ndcg))

      # If some evaluation threshold is met
      if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
        target_reached = True
        break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
  finally:
    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
コード例 #13
0
def run_ncf(_):
    """Train and periodically evaluate the NeuMF model.

    Optionally downloads the dataset and builds the train/eval CSVs, runs the
    data preprocessing, wraps the Keras NeuMF model in an Estimator and then
    alternates one training pass with one evaluation pass for
    FLAGS.train_epochs // FLAGS.epochs_between_evals cycles. The loop ends
    early when model_helpers.past_stop_threshold returns a truthy value for
    the current hit rate.

    Args:
        _: Unused.
    """
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)
        movielens_dataset.construct_train_eval_csv(
            data_dir=FLAGS.data_dir, dataset=FLAGS.dataset)

    tf.logging.info("Data preprocessing...")
    ncf_dataset = movielens_dataset.data_preprocessing(
        FLAGS.data_dir, FLAGS.dataset, FLAGS.num_neg)

    model_helpers.apply_clean(flags.FLAGS)

    # Build the Keras NeuMF model, then hand it over to the Estimator API.
    tf.logging.info("Creating Estimator from Keras model...")
    hidden_units = list(map(int, FLAGS.layers))
    mlp_reg = list(map(float, FLAGS.mlp_regularization))
    model = neumf_model.NeuMF(
        ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors,
        hidden_units, FLAGS.batch_size, FLAGS.mf_regularization, mlp_reg)
    num_gpus = flags_core.get_num_gpus(FLAGS)
    estimator = convert_keras_to_estimator(model, num_gpus, FLAGS.model_dir)

    # Hooks that report training progress and metric values.
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
    )

    run_info = {
        "batch_size": FLAGS.batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    bench = logger.get_benchmark_logger()
    bench.log_run_info(
        model_name="recommendation",
        dataset_name=FLAGS.dataset,
        run_params=run_info,
        test_id=FLAGS.benchmark_test_id)

    def build_input_fn(training, repeat):
        # A fresh input_fn is requested for every train / eval pass.
        per_device = distribution_utils.per_device_batch_size(
            FLAGS.batch_size, num_gpus)
        return movielens_dataset.get_input_fn(
            training, per_device, ncf_dataset,
            FLAGS.data_dir, FLAGS.dataset, repeat)

    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    for cycle_number in range(1, total_training_cycle + 1):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_number, total_training_cycle))

        # Training pass.
        train_fn = build_input_fn(True, FLAGS.epochs_between_evals)
        estimator.train(input_fn=train_fn, hooks=train_hooks)

        # Evaluation pass.
        metrics = evaluate_model(
            estimator, FLAGS.batch_size, num_gpus, ncf_dataset,
            build_input_fn(False, 1))

        # Record the metrics with the benchmark logger, then log HR / NDCG.
        bench.log_evaluation_result(metrics)
        hit_rate = metrics[_HR_KEY]
        ndcg_score = metrics[_NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_number, hit_rate, ndcg_score))

        # Stop as soon as the hit-rate threshold has been passed.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hit_rate):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
コード例 #14
0
def run_ncf(_):
    """Run NCF training and eval loop.

    Instantiates the data pipeline, builds the train / eval Estimators and
    then alternates training and evaluation for
    FLAGS.train_epochs // FLAGS.epochs_between_evals cycles, stopping early
    when model_helpers.past_stop_threshold returns a truthy value for the
    current hit rate.

    cleanup_fn (returned by data_preprocessing.instantiate_pipeline) is
    called from a `finally` clause so that it still runs when Estimator
    construction, training, or evaluation raises.

    Args:
        _: Unused.
    """
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    # eval_batch_size has to be a multiple of eval_per_user
    # (NUM_EVAL_NEGATIVES + 1); round it down when it is not.
    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None)

    # From here on the pipeline exists; guarantee cleanup_fn runs on both the
    # success and the failure path.
    try:
        model_helpers.apply_clean(flags.FLAGS)

        train_estimator, eval_estimator = construct_estimator(
            num_gpus=num_gpus,
            model_dir=FLAGS.model_dir,
            params={
                "use_seed": FLAGS.seed is not None,
                "hash_pipeline": FLAGS.hash_pipeline,
                "batch_size": batch_size,
                "learning_rate": FLAGS.learning_rate,
                "num_users": ncf_dataset.num_users,
                "num_items": ncf_dataset.num_items,
                "mf_dim": FLAGS.num_factors,
                "model_layers": [int(layer) for layer in FLAGS.layers],
                "mf_regularization": FLAGS.mf_regularization,
                "mlp_reg_layers": [
                    float(reg) for reg in FLAGS.mlp_regularization
                ],
                "num_neg": FLAGS.num_neg,
                "use_tpu": FLAGS.tpu is not None,
                "tpu": FLAGS.tpu,
                "tpu_zone": FLAGS.tpu_zone,
                "tpu_gcp_project": FLAGS.tpu_gcp_project,
                "beta1": FLAGS.beta1,
                "beta2": FLAGS.beta2,
                "epsilon": FLAGS.epsilon,
                "match_mlperf": FLAGS.ml_perf,
            },
            batch_size=flags.FLAGS.batch_size,
            eval_batch_size=eval_batch_size)

        # Create hooks that log information about the training and metric
        # values
        train_hooks = hooks_helper.get_train_hooks(
            FLAGS.hooks,
            model_dir=FLAGS.model_dir,
            batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
            tensors_to_log={"cross_entropy": "cross_entropy"})
        run_params = {
            "batch_size": FLAGS.batch_size,
            "eval_batch_size": eval_batch_size,
            "number_factors": FLAGS.num_factors,
            "hr_threshold": FLAGS.hr_threshold,
            "train_epochs": FLAGS.train_epochs,
        }
        benchmark_logger = logger.get_benchmark_logger()
        benchmark_logger.log_run_info(model_name="recommendation",
                                      dataset_name=FLAGS.dataset,
                                      run_params=run_params,
                                      test_id=FLAGS.benchmark_test_id)

        # Rough estimate used only to warn when the pipeline reports a
        # noticeably different number of batches.
        approx_train_steps = int(ncf_dataset.num_train_positives *
                                 (1 + FLAGS.num_neg) // FLAGS.batch_size)
        pred_input_fn = data_preprocessing.make_pred_input_fn(
            ncf_dataset=ncf_dataset)

        total_training_cycle = (
            FLAGS.train_epochs // FLAGS.epochs_between_evals)
        for cycle_index in range(total_training_cycle):
            tf.logging.info("Starting a training cycle: {}/{}".format(
                cycle_index + 1, total_training_cycle))

            # Train the model
            train_input_fn, train_record_dir, batch_count = \
              data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

            if np.abs(approx_train_steps - batch_count) > 1:
                tf.logging.warning(
                    "Estimated ({}) and reported ({}) number of batches "
                    "differ by more than one".format(
                        approx_train_steps, batch_count))

            train_estimator.train(input_fn=train_input_fn,
                                  hooks=train_hooks,
                                  steps=batch_count)
            # The records for this cycle are no longer needed once trained
            # on.
            tf.gfile.DeleteRecursively(train_record_dir)

            tf.logging.info("Beginning evaluation.")
            eval_results = eval_estimator.evaluate(pred_input_fn)
            tf.logging.info("Evaluation complete.")

            # Benchmark the evaluation results
            benchmark_logger.log_evaluation_result(eval_results)
            # Log the HR and NDCG results.
            hr = eval_results[rconst.HR_KEY]
            ndcg = eval_results[rconst.NDCG_KEY]
            tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
                cycle_index + 1, hr, ndcg))

            # If some evaluation threshold is met
            if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
                break
    finally:
        cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
コード例 #15
0
def main(_):
    """Download the dataset named by the flags and build its input fns.

    Args:
        _: Unused.
    """
    opts = flags.FLAGS
    movielens.download(dataset=opts.dataset, data_dir=opts.data_dir)
    construct_input_fns(opts.dataset, opts.data_dir)
コード例 #16
0
def main(_):
  """Download the dataset named by the flags and build the train/eval CSVs.

  Args:
    _: Unused.
  """
  opts = flags.FLAGS
  movielens.download(dataset=opts.dataset, data_dir=opts.data_dir)
  construct_train_eval_csv(opts.data_dir, opts.dataset)
コード例 #17
0
ファイル: ncf_main.py プロジェクト: AlwaysTheBeginer/models
def run_ncf(_):
  """Train and periodically evaluate the NeuMF model.

  Optionally downloads the dataset and builds the train/eval CSVs, runs the
  data preprocessing, wraps the Keras NeuMF model in an Estimator and then
  alternates one training pass with one evaluation pass for
  FLAGS.train_epochs // FLAGS.epochs_between_evals cycles. The loop ends early
  when model_helpers.past_stop_threshold returns a truthy value for the
  current hit rate.

  Args:
    _: Unused.
  """
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)
    movielens_dataset.construct_train_eval_csv(data_dir=FLAGS.data_dir,
                                               dataset=FLAGS.dataset)

  tf.logging.info("Data preprocessing...")
  ncf_dataset = movielens_dataset.data_preprocessing(FLAGS.data_dir,
                                                     FLAGS.dataset,
                                                     FLAGS.num_neg)

  model_helpers.apply_clean(flags.FLAGS)

  # Build the Keras NeuMF model, then hand it over to the Estimator API.
  tf.logging.info("Creating Estimator from Keras model...")
  hidden_units = list(map(int, FLAGS.layers))
  mlp_reg = list(map(float, FLAGS.mlp_regularization))
  model = neumf_model.NeuMF(ncf_dataset.num_users,
                            ncf_dataset.num_items,
                            FLAGS.num_factors,
                            hidden_units,
                            FLAGS.batch_size,
                            FLAGS.mf_regularization,
                            mlp_reg)
  num_gpus = flags_core.get_num_gpus(FLAGS)
  estimator = convert_keras_to_estimator(model, num_gpus, FLAGS.model_dir)

  # Hooks that report training progress and metric values.
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
  )

  run_info = {
      "batch_size": FLAGS.batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  bench = logger.get_benchmark_logger()
  bench.log_run_info(model_name="recommendation",
                     dataset_name=FLAGS.dataset,
                     run_params=run_info,
                     test_id=FLAGS.benchmark_test_id)

  def make_input_fn(training, repeat):
    # A fresh input_fn is requested for every train / eval pass.
    per_device = distribution_utils.per_device_batch_size(
        FLAGS.batch_size, num_gpus)
    return movielens_dataset.get_input_fn(
        training, per_device, ncf_dataset,
        FLAGS.data_dir, FLAGS.dataset, repeat)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  for cycle_number in range(1, total_training_cycle + 1):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_number, total_training_cycle))

    # Training pass.
    train_fn = make_input_fn(True, FLAGS.epochs_between_evals)
    estimator.train(input_fn=train_fn, hooks=train_hooks)

    # Evaluation pass.
    metrics = evaluate_model(estimator, FLAGS.batch_size, num_gpus,
                             ncf_dataset, make_input_fn(False, 1))

    # Record the metrics with the benchmark logger, then log HR / NDCG.
    bench.log_evaluation_result(metrics)
    hit_rate = metrics[_HR_KEY]
    ndcg_score = metrics[_NDCG_KEY]
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_number, hit_rate, ndcg_score))

    # Stop as soon as the hit-rate threshold has been passed.
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hit_rate):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
コード例 #18
0
def instantiate_pipeline(dataset,
                         data_dir,
                         batch_size,
                         eval_batch_size,
                         num_data_readers=None,
                         num_neg=4,
                         epochs_per_cycle=1):
    """Preprocess data and start negative generation subprocess.

    Downloads the dataset, builds the cache via construct_cache, and launches
    the script at _ASYNC_GEN_PATH as a child process. Two atexit handlers are
    registered: one calling _shutdown on the child process and one removing
    the cache root directory.

    Args:
        dataset: Dataset name, forwarded to movielens.download and
            construct_cache.
        data_dir: Data directory, forwarded to movielens.download,
            construct_cache and the subprocess ("--data_dir").
        batch_size: Passed to the subprocess as "--train_batch_size".
        eval_batch_size: Passed to the subprocess as "--eval_batch_size".
        num_data_readers: Forwarded to construct_cache.
        num_neg: Passed to the subprocess as "--num_neg".
        epochs_per_cycle: Passed to the subprocess as "--epochs_per_cycle".

    Returns:
        The dataset object returned by construct_cache.
    """
    # Local import so this block does not depend on the file's import header.
    import sys

    movielens.download(dataset=dataset, data_dir=data_dir)
    tf.logging.info("Beginning data preprocessing.")
    ncf_dataset = construct_cache(dataset=dataset,
                                  data_dir=data_dir,
                                  num_data_readers=num_data_readers)

    tf.logging.info("Creating training file subprocess.")

    subproc_env = os.environ.copy()

    # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
    # resources and by default will try to allocate GPU memory. This would cause
    # contention with the main training process.
    subproc_env["CUDA_VISIBLE_DEVICES"] = ""

    # Run the child under the interpreter that is running this process so it
    # sees the same installed packages (a bare "python3" resolved through PATH
    # may be a different installation). sys.executable can be empty or None
    # when Python cannot determine it; fall back to the generic name then.
    python = sys.executable or ("python3" if six.PY3 else "python2")

    # By limiting the number of workers we guarantee that the worker
    # pool underlying the training generation doesn't starve other processes.
    # Clamp to one so a single-core host does not request zero workers.
    num_workers = max(1, int(multiprocessing.cpu_count() * 0.75))

    subproc_args = [
        python,
        _ASYNC_GEN_PATH,
        "--data_dir",
        data_dir,
        "--cache_id",
        str(ncf_dataset.cache_paths.cache_id),
        "--num_neg",
        str(num_neg),
        "--num_train_positives",
        str(ncf_dataset.num_train_positives),
        "--num_items",
        str(ncf_dataset.num_items),
        "--num_readers",
        str(ncf_dataset.num_data_readers),
        "--epochs_per_cycle",
        str(epochs_per_cycle),
        "--train_batch_size",
        str(batch_size),
        "--eval_batch_size",
        str(eval_batch_size),
        "--num_workers",
        str(num_workers),
        "--spillover",
        "True",  # This allows the training input function to
        # guarantee batch size and significantly improves
        # performance. (~5% increase in examples/sec on
        # GPU, and needed for TPU XLA.)
        "--redirect_logs",
        "True",
        "--seed",
        str(int(stat_utils.random_int32()))
    ]

    tf.logging.info("Generation subprocess command: {}".format(
        " ".join(subproc_args)))

    # NOTE(review): stdout/stderr are piped but never read in this function;
    # presumably "--redirect_logs True" keeps the child from writing to them.
    # Confirm, since a full pipe would block the child.
    proc = subprocess.Popen(args=subproc_args,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            shell=False,
                            env=subproc_env)

    # Best-effort teardown when the interpreter exits.
    atexit.register(_shutdown, proc=proc)
    atexit.register(tf.gfile.DeleteRecursively,
                    ncf_dataset.cache_paths.cache_root)

    return ncf_dataset
コード例 #19
0
def main(_):
    """Download the dataset named by the flags and build the train/eval CSVs.

    Args:
        _: Unused.
    """
    opts = flags.FLAGS
    movielens.download(dataset=opts.dataset, data_dir=opts.data_dir)
    construct_train_eval_csv(opts.data_dir, opts.dataset)