Example 1
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  return num_users, num_items, num_train_steps, num_eval_steps, producer
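
For orientation, here is a minimal sketch of how the values returned by get_inputs are typically consumed, mirroring the Estimator train/eval loop shown in Example 16 below (it assumes the same FLAGS, params, and the construct_estimator helper from that example):

num_users, num_items, num_train_steps, num_eval_steps, producer = get_inputs(params)
producer.start()

params["num_users"], params["num_items"] = num_users, num_items
estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

for _ in range(FLAGS.train_epochs // FLAGS.epochs_between_evals):
  # Train for one epoch's worth of steps, then evaluate, re-using the
  # producer's input functions.
  estimator.train(input_fn=producer.make_input_fn(is_training=True),
                  steps=num_train_steps)
  eval_results = estimator.evaluate(producer.make_input_fn(is_training=False),
                                    steps=num_eval_steps)

producer.stop_loop()
producer.join()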
Example 2
def prepare_raw_data(flag_obj):
    """Downloads and prepares raw data for data generation."""
    movielens.download(flag_obj.dataset, flag_obj.data_dir)

    data_processing_params = {
        "train_epochs": flag_obj.num_train_epochs,
        "batch_size": flag_obj.prebatch_size,
        "eval_batch_size": flag_obj.prebatch_size,
        "batches_per_step": 1,
        "stream_files": True,
        "num_neg": flag_obj.num_negative_samples,
    }

    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=flag_obj.dataset,
        data_dir=flag_obj.data_dir,
        params=data_processing_params,
        constructor_type=flag_obj.constructor_type,
        epoch_dir=flag_obj.data_dir,
        generate_data_offline=True)

    # pylint: disable=protected-access
    input_metadata = {
        "num_users": num_users,
        "num_items": num_items,
        "constructor_type": flag_obj.constructor_type,
        "num_train_elements": producer._elements_in_epoch,
        "num_eval_elements": producer._eval_elements_in_epoch,
        "num_train_epochs": flag_obj.num_train_epochs,
        "prebatch_size": flag_obj.prebatch_size,
    }
    # pylint: enable=protected-access

    return producer, input_metadata
Example 3
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  return num_users, num_items, num_train_steps, num_eval_steps, producer
Example 4
    def test_end_to_end(self):
        ncf_dataset = data_preprocessing.instantiate_pipeline(
            dataset=DATASET,
            data_dir=self.temp_data_dir,
            batch_size=BATCH_SIZE,
            eval_batch_size=BATCH_SIZE,
            num_data_readers=2,
            num_neg=NUM_NEG)

        for _ in range(30):
            if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
                break
            time.sleep(1)  # allow `alive` file to be written

        g = tf.Graph()
        with g.as_default():
            input_fn, record_dir, batch_count = \
              data_preprocessing.make_train_input_fn(ncf_dataset)
            dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False})
        first_epoch = self.drain_dataset(dataset=dataset, g=g)
        user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
        item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

        train_examples = {
            True: set(),
            False: set(),
        }
        for features, labels in first_epoch:
            for u, i, l in zip(features[movielens.USER_COLUMN],
                               features[movielens.ITEM_COLUMN], labels):
                u_raw = user_inv_map[u]
                i_raw = item_inv_map[i]
                if ((u_raw, i_raw) in self.seen_pairs) != l:
                    # The evaluation item is not considered during false negative
                    # generation, so it will occasionally appear as a negative example
                    # during training.
                    assert not l
                    assert i_raw == self.holdout[u_raw][1]
                train_examples[l].add((u_raw, i_raw))
        num_positives_seen = len(train_examples[True])

        # The numbers don't match exactly because the last batch spills over into
        # the next epoch
        assert ncf_dataset.num_train_positives - num_positives_seen < BATCH_SIZE

        # This check is more heuristic because negatives are sampled with
        # replacement. It only checks that negative generation is reasonably random.
        assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9
Example 5
    def test_end_to_end(self):
        ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
            dataset=DATASET,
            data_dir=self.temp_data_dir,
            batch_size=BATCH_SIZE,
            eval_batch_size=EVAL_BATCH_SIZE,
            num_cycles=1,
            num_data_readers=2,
            num_neg=NUM_NEG)

        g = tf.Graph()
        with g.as_default():
            input_fn, record_dir, batch_count = \
              data_preprocessing.make_input_fn(ncf_dataset, True)
            dataset = input_fn({
                "batch_size": BATCH_SIZE,
                "use_tpu": False,
                "use_xla_for_gpu": False
            })
        first_epoch = self.drain_dataset(dataset=dataset, g=g)
        user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
        item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

        train_examples = {
            True: set(),
            False: set(),
        }
        for features, labels in first_epoch:
            for u, i, l in zip(features[movielens.USER_COLUMN],
                               features[movielens.ITEM_COLUMN], labels):

                u_raw = user_inv_map[u]
                i_raw = item_inv_map[i]
                if ((u_raw, i_raw) in self.seen_pairs) != l:
                    # The evaluation item is not considered during false negative
                    # generation, so it will occasionally appear as a negative example
                    # during training.
                    assert not l
                    assert i_raw == self.holdout[u_raw][1]
                train_examples[l].add((u_raw, i_raw))
        num_positives_seen = len(train_examples[True])

        assert ncf_dataset.num_train_positives == num_positives_seen

        # This check is more heuristic because negatives are sampled with
        # replacement. It only checks that negative generation is reasonably random.
        assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9
Example 6
  def test_end_to_end(self):
    ncf_dataset = data_preprocessing.instantiate_pipeline(
        dataset=DATASET, data_dir=self.temp_data_dir,
        batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE, num_data_readers=2,
        num_neg=NUM_NEG)

    for _ in range(30):
      if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
        break
      time.sleep(1)  # allow `alive` file to be written

    g = tf.Graph()
    with g.as_default():
      input_fn, record_dir, batch_count = \
        data_preprocessing.make_train_input_fn(ncf_dataset)
      dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False})
    first_epoch = self.drain_dataset(dataset=dataset, g=g)
    user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
    item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

    train_examples = {
        True: set(),
        False: set(),
    }
    for features, labels in first_epoch:
      for u, i, l in zip(features[movielens.USER_COLUMN],
                         features[movielens.ITEM_COLUMN], labels):
        u_raw = user_inv_map[u]
        i_raw = item_inv_map[i]
        if ((u_raw, i_raw) in self.seen_pairs) != l:
          # The evaluation item is not considered during false negative
          # generation, so it will occasionally appear as a negative example
          # during training.
          assert not l
          assert i_raw == self.holdout[u_raw][1]
        train_examples[l].add((u_raw, i_raw))
    num_positives_seen = len(train_examples[True])

    # The numbers don't match exactly because the last batch spills over into
    # the next epoch
    assert ncf_dataset.num_train_positives - num_positives_seen < BATCH_SIZE

    # This check is more heuristic because negatives are sampled with
    # replacement. It only checks that negative generation is reasonably random.
    assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9
Example 7
  def test_end_to_end(self):
    ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
        dataset=DATASET, data_dir=self.temp_data_dir,
        batch_size=BATCH_SIZE, eval_batch_size=EVAL_BATCH_SIZE,
        num_cycles=1, num_data_readers=2, num_neg=NUM_NEG)

    g = tf.Graph()
    with g.as_default():
      input_fn, record_dir, batch_count = \
        data_preprocessing.make_input_fn(ncf_dataset, True)
      dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False,
                          "use_xla_for_gpu": False})
    first_epoch = self.drain_dataset(dataset=dataset, g=g)
    user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
    item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

    train_examples = {
        True: set(),
        False: set(),
    }
    for features, labels in first_epoch:
      for u, i, l in zip(features[movielens.USER_COLUMN],
                         features[movielens.ITEM_COLUMN], labels):

        u_raw = user_inv_map[u]
        i_raw = item_inv_map[i]
        if ((u_raw, i_raw) in self.seen_pairs) != l:
          # The evaluation item is not considered during false negative
          # generation, so it will occasionally appear as a negative example
          # during training.
          assert not l
          assert i_raw == self.holdout[u_raw][1]
        train_examples[l].add((u_raw, i_raw))
    num_positives_seen = len(train_examples[True])

    assert ncf_dataset.num_train_positives == num_positives_seen

    # This check is more heuristic because negatives are sampled with
    # replacement. It only checks that negative generation is reasonably random.
    assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9
Example 8
def main(_):
    """Train NCF model and evaluate its hit rate (HR) metric."""
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    master = tpu_cluster_resolver.master()

    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        # TODO(shizhiw): support multihost.
        batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        num_neg=FLAGS.num_neg,
        num_cycles=_NUM_EPOCHS,
        epochs_per_cycle=1,
        match_mlperf=FLAGS.ml_perf,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)

    train_params, eval_params = create_params(ncf_dataset)

    eval_graph_spec = build_graph(eval_params, ncf_dataset,
                                  tpu_embedding.INFERENCE)

    for epoch in range(_NUM_EPOCHS):
        tf.logging.info("Training {}...".format(epoch))
        # Build the training graph each epoch, since the number of batches per
        # epoch (i.e. batch_count) might change by 1 between epochs.
        train_graph_spec = build_graph(train_params, ncf_dataset,
                                       tpu_embedding.TRAINING)

        run_graph(master, train_graph_spec, epoch)

        tf.logging.info("Evaluating {}...".format(epoch))
        run_graph(master, eval_graph_spec, epoch)

    cleanup_fn()  # Clean up data construction artifacts and the subprocess.
Example 9
    def _test_fresh_randomness(self, constructor_type):
        train_epochs = 5
        params = self.make_params(train_epochs=train_epochs)
        _, _, producer = data_preprocessing.instantiate_pipeline(
            dataset=DATASET,
            data_dir=self.temp_data_dir,
            params=params,
            constructor_type=constructor_type,
            deterministic=True)

        producer.start()

        results = []
        g = tf.Graph()
        with g.as_default():
            for _ in range(train_epochs):
                input_fn = producer.make_input_fn(is_training=True)
                dataset = input_fn(params)
                results.extend(self.drain_dataset(dataset=dataset, g=g))

        producer.join()
        assert producer._fatal_exception is None

        positive_counts, negative_counts = defaultdict(int), defaultdict(int)
        md5 = hashlib.md5()
        for features, labels in results:
            data_list = [
                features[movielens.USER_COLUMN],
                features[movielens.ITEM_COLUMN],
                features[rconst.VALID_POINT_MASK], labels
            ]
            for i in data_list:
                md5.update(i.tobytes())

            for u, i, v, l in zip(*data_list):
                if not v:
                    continue  # ignore padding

                if l:
                    positive_counts[(u, i)] += 1
                else:
                    negative_counts[(u, i)] += 1

        self.assertRegexpMatches(md5.hexdigest(), FRESH_RANDOMNESS_MD5)

        # The positive examples should appear exactly once each epoch
        self.assertAllEqual(list(positive_counts.values()),
                            [train_epochs for _ in positive_counts])

        # The threshold for the negatives is heuristic: repeats are expected in
        # general, but they should not appear too frequently.

        pair_cardinality = NUM_USERS * NUM_ITEMS
        neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)

        # Approximation of the expected number of times that a particular
        # negative will appear in a given epoch. Implicit in this calculation is
        # the treatment of all negative pairs as equally likely, which is not
        # necessarily reasonable in general; however, the generation in
        # self.setUp() approximates this behavior well enough for heuristic
        # testing.
        e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality

        # The frequency of occurrence of a given negative pair should follow an
        # approximately binomial distribution in the limit that the cardinality of
        # the negative pair set >> number of samples per epoch.
        approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs + 1),
                                           n=train_epochs,
                                           p=e_sample)

        # Tally the actual observed counts.
        count_distribution = [0 for _ in range(train_epochs + 1)]
        for i in negative_counts.values():
            i = min([i, train_epochs])  # round down tail for simplicity.
            count_distribution[i] += 1
        count_distribution[0] = neg_pair_cardinality - sum(
            count_distribution[1:])

        # Check that the frequency of negative pairs is approximately binomial.
        for i in range(train_epochs + 1):
            if approx_pdf[i] < 0.05:
                continue  # Variance will be high at the tails.

            observed_fraction = count_distribution[i] / neg_pair_cardinality
            deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
                         (observed_fraction + approx_pdf[i]))

            self.assertLess(deviation, 0.2)
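
As a standalone illustration of the binomial reference distribution used in the check above (a sketch with hypothetical toy sizes, not values taken from the test), e_sample and the resulting PMF can be computed directly:

import numpy as np
import scipy.stats

# Hypothetical toy sizes standing in for NUM_USERS, NUM_ITEMS, NUM_NEG, etc.
num_users, num_items, num_neg = 100, 50, 4
num_seen_pairs, train_epochs = 1000, 5

neg_pair_cardinality = num_users * num_items - num_seen_pairs
# Expected number of times a particular negative pair is drawn in one epoch.
e_sample = num_seen_pairs * num_neg / neg_pair_cardinality

# Probability that a given negative pair appears k times over train_epochs epochs.
approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs + 1),
                                   n=train_epochs, p=e_sample)
print(approx_pdf)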
Example 10
  def _test_end_to_end(self, constructor_type):
    params = self.make_params(train_epochs=1)
    _, _, producer = data_preprocessing.instantiate_pipeline(
        dataset=DATASET, data_dir=self.temp_data_dir, params=params,
        constructor_type=constructor_type, deterministic=True)

    producer.start()
    producer.join()
    assert producer._fatal_exception is None

    user_inv_map = {v: k for k, v in producer.user_map.items()}
    item_inv_map = {v: k for k, v in producer.item_map.items()}

    # ==========================================================================
    # == Training Data =========================================================
    # ==========================================================================
    g = tf.Graph()
    with g.as_default():
      input_fn = producer.make_input_fn(is_training=True)
      dataset = input_fn(params)

    first_epoch = self.drain_dataset(dataset=dataset, g=g)

    counts = defaultdict(int)
    train_examples = {
        True: set(),
        False: set(),
    }

    md5 = hashlib.md5()
    for features, labels in first_epoch:
      data_list = [
          features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
          features[rconst.VALID_POINT_MASK], labels]
      for i in data_list:
        md5.update(i.tobytes())

      for u, i, v, l in zip(*data_list):
        if not v:
          continue  # ignore padding

        u_raw = user_inv_map[u]
        i_raw = item_inv_map[i]
        if ((u_raw, i_raw) in self.seen_pairs) != l:
          # The evaluation item is not considered during false negative
          # generation, so it will occasionally appear as a negative example
          # during training.
          assert not l
          self.assertEqual(i_raw, self.holdout[u_raw][1])
        train_examples[l].add((u_raw, i_raw))
        counts[(u_raw, i_raw)] += 1

    self.assertRegexpMatches(md5.hexdigest(), END_TO_END_TRAIN_MD5)

    num_positives_seen = len(train_examples[True])
    self.assertEqual(producer._train_pos_users.shape[0], num_positives_seen)

    # This check is more heuristic because negatives are sampled with
    # replacement. It only checks that negative generation is reasonably random.
    self.assertGreater(
        len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)

    # This checks that the samples produced are independent by checking the
    # number of duplicate entries. If workers are not properly independent there
    # will be lots of repeated pairs.
    self.assertLess(np.mean(list(counts.values())), 1.1)

    # ==========================================================================
    # == Eval Data =============================================================
    # ==========================================================================
    with g.as_default():
      input_fn = producer.make_input_fn(is_training=False)
      dataset = input_fn(params)

    eval_data = self.drain_dataset(dataset=dataset, g=g)

    current_user = None
    md5 = hashlib.md5()
    for features in eval_data:
      data_list = [
          features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
          features[rconst.DUPLICATE_MASK]]
      for i in data_list:
        md5.update(i.tobytes())

      for idx, (u, i, d) in enumerate(zip(*data_list)):
        u_raw = user_inv_map[u]
        i_raw = item_inv_map[i]
        if current_user is None:
          current_user = u

        # Ensure that users appear in blocks, as the evaluation logic expects
        # this structure.
        self.assertEqual(u, current_user)

        # The structure of evaluation data is 999 negative examples followed
        # by the holdout positive.
        if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
          # Check that the last element in each chunk is the holdout item.
          self.assertEqual(i_raw, self.holdout[u_raw][1])
          current_user = None

        elif i_raw == self.holdout[u_raw][1]:
          # Because the holdout item is not given to the negative generation
          # process, it can appear as a negative. In that case, it should be
          # masked out as a duplicate. (Since the true positive is placed at
          # the end and would therefore lose the tie.)
          assert d

        else:
          # Otherwise check that the other 999 points for a user are selected
          # from the negatives.
          assert (u_raw, i_raw) not in self.seen_pairs

    self.assertRegexpMatches(md5.hexdigest(), END_TO_END_EVAL_MD5)
Example 11
    def _test_end_to_end(self, constructor_type):
        params = self.make_params(train_epochs=1)
        _, _, producer = data_preprocessing.instantiate_pipeline(
            dataset=DATASET,
            data_dir=self.temp_data_dir,
            params=params,
            constructor_type=constructor_type,
            deterministic=True)

        producer.start()
        producer.join()
        assert producer._fatal_exception is None

        user_inv_map = {v: k for k, v in producer.user_map.items()}
        item_inv_map = {v: k for k, v in producer.item_map.items()}

        # ==========================================================================
        # == Training Data =========================================================
        # ==========================================================================
        g = tf.Graph()
        with g.as_default():
            input_fn = producer.make_input_fn(is_training=True)
            dataset = input_fn(params)

        first_epoch = self.drain_dataset(dataset=dataset, g=g)

        counts = defaultdict(int)
        train_examples = {
            True: set(),
            False: set(),
        }

        md5 = hashlib.md5()
        for features, labels in first_epoch:
            data_list = [
                features[movielens.USER_COLUMN],
                features[movielens.ITEM_COLUMN],
                features[rconst.VALID_POINT_MASK], labels
            ]
            for i in data_list:
                md5.update(i.tobytes())

            for u, i, v, l in zip(*data_list):
                if not v:
                    continue  # ignore padding

                u_raw = user_inv_map[u]
                i_raw = item_inv_map[i]
                if ((u_raw, i_raw) in self.seen_pairs) != l:
                    # The evaluation item is not considered during false negative
                    # generation, so it will occasionally appear as a negative example
                    # during training.
                    assert not l
                    self.assertEqual(i_raw, self.holdout[u_raw][1])
                train_examples[l].add((u_raw, i_raw))
                counts[(u_raw, i_raw)] += 1

        self.assertRegexpMatches(md5.hexdigest(), END_TO_END_TRAIN_MD5)

        num_positives_seen = len(train_examples[True])
        self.assertEqual(producer._train_pos_users.shape[0],
                         num_positives_seen)

        # This check is more heuristic because negatives are sampled with
        # replacement. It only checks that negative generation is reasonably random.
        self.assertGreater(
            len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)

        # This checks that the samples produced are independent by checking the
        # number of duplicate entries. If workers are not properly independent there
        # will be lots of repeated pairs.
        self.assertLess(np.mean(list(counts.values())), 1.1)

        # ==========================================================================
        # == Eval Data =============================================================
        # ==========================================================================
        with g.as_default():
            input_fn = producer.make_input_fn(is_training=False)
            dataset = input_fn(params)

        eval_data = self.drain_dataset(dataset=dataset, g=g)

        current_user = None
        md5 = hashlib.md5()
        for features in eval_data:
            data_list = [
                features[movielens.USER_COLUMN],
                features[movielens.ITEM_COLUMN],
                features[rconst.DUPLICATE_MASK]
            ]
            for i in data_list:
                md5.update(i.tobytes())

            for idx, (u, i, d) in enumerate(zip(*data_list)):
                u_raw = user_inv_map[u]
                i_raw = item_inv_map[i]
                if current_user is None:
                    current_user = u

                # Ensure that users appear in blocks, as the evaluation logic expects
                # this structure.
                self.assertEqual(u, current_user)

                # The structure of evaluation data is 999 negative examples followed
                # by the holdout positive.
                if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
                    # Check that the last element in each chunk is the holdout item.
                    self.assertEqual(i_raw, self.holdout[u_raw][1])
                    current_user = None

                elif i_raw == self.holdout[u_raw][1]:
                    # Because the holdout item is not given to the negative generation
                    # process, it can appear as a negative. In that case, it should be
                    # masked out as a duplicate. (Since the true positive is placed at
                    # the end and would therefore lose the tie.)
                    assert d

                else:
                    # Otherwise check that the other 999 points for a user are selected
                    # from the negatives.
                    assert (u_raw, i_raw) not in self.seen_pairs

        self.assertRegexpMatches(md5.hexdigest(), END_TO_END_EVAL_MD5)
Example 12
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
                                 ncf_dataset.num_users / eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }
  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)


  eval_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count, num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Clean up data construction artifacts and the subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
Example 13
  def _test_fresh_randomness(self, constructor_type):
    train_epochs = 5
    params = self.make_params(train_epochs=train_epochs)
    _, _, producer = data_preprocessing.instantiate_pipeline(
        dataset=DATASET, data_dir=self.temp_data_dir, params=params,
        constructor_type=constructor_type, deterministic=True)

    producer.start()

    results = []
    g = tf.Graph()
    with g.as_default():
      for _ in range(train_epochs):
        input_fn = producer.make_input_fn(is_training=True)
        dataset = input_fn(params)
        results.extend(self.drain_dataset(dataset=dataset, g=g))

    producer.join()
    assert producer._fatal_exception is None

    positive_counts, negative_counts = defaultdict(int), defaultdict(int)
    md5 = hashlib.md5()
    for features, labels in results:
      data_list = [
          features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
          features[rconst.VALID_POINT_MASK], labels]
      for i in data_list:
        md5.update(i.tobytes())

      for u, i, v, l in zip(*data_list):
        if not v:
          continue  # ignore padding

        if l:
          positive_counts[(u, i)] += 1
        else:
          negative_counts[(u, i)] += 1

    self.assertRegexpMatches(md5.hexdigest(), FRESH_RANDOMNESS_MD5)

    # The positive examples should appear exactly once each epoch
    self.assertAllEqual(list(positive_counts.values()),
                        [train_epochs for _ in positive_counts])

    # The threshold for the negatives is heuristic: repeats are expected in
    # general, but they should not appear too frequently.

    pair_cardinality = NUM_USERS * NUM_ITEMS
    neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)

    # Approximation of the expected number of times that a particular
    # negative will appear in a given epoch. Implicit in this calculation is
    # the treatment of all negative pairs as equally likely, which is not
    # necessarily reasonable in general; however, the generation in
    # self.setUp() approximates this behavior well enough for heuristic
    # testing.
    e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality

    # The frequency of occurrence of a given negative pair should follow an
    # approximately binomial distribution in the limit that the cardinality of
    # the negative pair set >> number of samples per epoch.
    approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs+1),
                                       n=train_epochs, p=e_sample)

    # Tally the actual observed counts.
    count_distribution = [0 for _ in range(train_epochs + 1)]
    for i in negative_counts.values():
      i = min([i, train_epochs])  # round down tail for simplicity.
      count_distribution[i] += 1
    count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:])

    # Check that the frequency of negative pairs is approximately binomial.
    for i in range(train_epochs + 1):
      if approx_pdf[i] < 0.05:
        continue  # Variance will be high at the tails.

      observed_fraction = count_distribution[i] / neg_pair_cardinality
      deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
                   (observed_fraction + approx_pdf[i]))

      self.assertLess(deviation, 0.2)
Example 14
def main(_):
    """Train NCF model and evaluate its hit rate (HR) metric."""

    params = create_params()

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    if FLAGS.use_synthetic_data:
        producer = data_pipeline.DummyConstructor()
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            epoch_dir=os.path.join(params["model_dir"], "epoch"),
            params=get_params_for_dataset(params),
            constructor_type=FLAGS.constructor_type,
            deterministic=FLAGS.seed is not None)

        num_train_steps = (producer.train_batches_per_epoch //
                           params["batches_per_step"])
        num_eval_steps = (producer.eval_batches_per_epoch //
                          params["batches_per_step"])
        assert not producer.train_batches_per_epoch % params["batches_per_step"]
        assert not producer.eval_batches_per_epoch % params["batches_per_step"]
    producer.start()

    params["num_users"] = num_users
    params["num_items"] = num_items

    feature_columns = create_feature_columns(params)

    model_fn = create_model_fn(feature_columns)
    estimator = create_tpu_estimator(model_fn, feature_columns, params)

    train_hooks = hooks_helper.get_train_hooks(
        ["ProfilerHook"],
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})

    for cycle_index in range(FLAGS.train_epochs):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, FLAGS.train_epochs))
        train_input_fn = producer.make_input_fn(is_training=True)
        estimator.train(input_fn=train_input_fn,
                        hooks=train_hooks,
                        steps=num_train_steps)
        tf.logging.info("Beginning evaluation.")
        eval_input_fn = producer.make_input_fn(is_training=False)
        eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
        tf.logging.info("Evaluation complete.")

        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        loss = float(eval_results["loss"])
        tf.logging.info(
            "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
                cycle_index + 1, hr, ndcg, loss))

    producer.stop_loop()
    producer.join()
Example 15
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None)

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "use_seed": FLAGS.seed is not None,
            "hash_pipeline": FLAGS.hash_pipeline,
            "batch_size": batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": ncf_dataset.num_users,
            "num_items": ncf_dataset.num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "num_neg": FLAGS.num_neg,
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
            "beta1": FLAGS.beta1,
            "beta2": FLAGS.beta2,
            "epsilon": FLAGS.epsilon,
            "match_mlperf": FLAGS.ml_perf,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    approx_train_steps = int(ncf_dataset.num_train_positives *
                             (1 + FLAGS.num_neg) // FLAGS.batch_size)
    pred_input_fn = data_preprocessing.make_pred_input_fn(
        ncf_dataset=ncf_dataset)

    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        # Train the model
        train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

        if np.abs(approx_train_steps - batch_count) > 1:
            tf.logging.warning(
                "Estimated ({}) and reported ({}) number of batches differ by more "
                "than one".format(approx_train_steps, batch_count))

        train_estimator.train(input_fn=train_input_fn,
                              hooks=train_hooks,
                              steps=batch_count)
        tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        eval_results = eval_estimator.evaluate(pred_input_fn)
        tf.logging.info("Evaluation complete.")

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[rconst.HR_KEY]
        ndcg = eval_results[rconst.NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    cleanup_fn()  # Clean up data construction artifacts and the subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
Example 16
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  params = parse_flags(FLAGS)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]
  producer.start()

  params["num_users"], params["num_items"] = num_users, num_items
  model_helpers.apply_clean(flags.FLAGS)

  estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

  benchmark_logger, train_hooks = log_and_get_hooks(params["eval_batch_size"])

  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    train_input_fn = producer.make_input_fn(is_training=True)
    estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                    steps=num_train_steps)

    tf.logging.info("Beginning evaluation.")
    eval_input_fn = producer.make_input_fn(is_training=False)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
    tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    loss = float(eval_results["loss"])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
            cycle_index + 1, hr, ndcg, loss))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  producer.stop_loop()
  producer.join()

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
Example 17
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)
    eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
    ncf_dataset = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf)

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "batch_size": batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": ncf_dataset.num_users,
            "num_items": ncf_dataset.num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
    )
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    approx_train_steps = int(ncf_dataset.num_train_positives *
                             (1 + FLAGS.num_neg) // FLAGS.batch_size)
    pred_input_fn = data_preprocessing.make_pred_input_fn(
        ncf_dataset=ncf_dataset)

    total_training_cycle = (
        1 if FLAGS.inference_only
        else FLAGS.train_epochs // FLAGS.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        if not FLAGS.inference_only:
            # Train the model
            train_input_fn, train_record_dir, batch_count = \
                data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

            if np.abs(approx_train_steps - batch_count) > 1:
                tf.logging.warning(
                    "Estimated ({}) and reported ({}) number of batches differ by more "
                    "than one".format(approx_train_steps, batch_count))
            train_estimator.train(input_fn=train_input_fn,
                                  hooks=train_hooks,
                                  steps=batch_count)
            tf.gfile.DeleteRecursively(train_record_dir)

        # Evaluate the model
        eval_results = evaluate_model(eval_estimator, ncf_dataset,
                                      pred_input_fn)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # Export SavedModel
        if FLAGS.export_savedmodel:
            eval_estimator.export_savedmodel(FLAGS.model_dir,
                                             serving_input_receiver_fn)
            print("SavedModel successfully exported to: {}/<timestamp>".format(
                FLAGS.model_dir))

        # Some of the NumPy vector math can be quite large and likes to stay in
        # memory for a while.
        gc.collect()

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
Example 18
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    if FLAGS.use_synthetic_data:
        ncf_dataset = None
        cleanup_fn = lambda: None
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_neg=FLAGS.num_neg,
            epochs_per_cycle=FLAGS.epochs_between_evals,
            match_mlperf=FLAGS.ml_perf,
            deterministic=FLAGS.seed is not None,
            use_subprocess=FLAGS.use_subprocess,
            cache_id=FLAGS.cache_id)
        num_users = ncf_dataset.num_users
        num_items = ncf_dataset.num_items
        num_train_steps = int(
            np.ceil(FLAGS.epochs_between_evals *
                    ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) /
                    FLAGS.batch_size))
        num_eval_steps = int(
            np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                    eval_batch_size))

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "use_seed": FLAGS.seed is not None,
            "hash_pipeline": FLAGS.hash_pipeline,
            "batch_size": batch_size,
            "eval_batch_size": eval_batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": num_users,
            "num_items": num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "num_neg": FLAGS.num_neg,
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
            "beta1": FLAGS.beta1,
            "beta2": FLAGS.beta2,
            "epsilon": FLAGS.epsilon,
            "match_mlperf": FLAGS.ml_perf,
            "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    pred_input_fn = None
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
        assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                value=cycle_index)

        # Train the model
        train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

        if batch_count != num_train_steps:
            raise ValueError(
                "Step counts do not match. ({} vs. {}) The async process is "
                "producing incorrect shards.".format(batch_count,
                                                     num_train_steps))

        train_estimator.train(input_fn=train_input_fn,
                              hooks=train_hooks,
                              steps=num_train_steps)
        if train_record_dir:
            tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        if pred_input_fn is None:
            pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
                ncf_dataset=ncf_dataset, is_training=False)

            if eval_batch_count != num_eval_steps:
                raise ValueError(
                    "Step counts do not match. ({} vs. {}) The async process is "
                    "producing incorrect shards.".format(
                        eval_batch_count, num_eval_steps))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = eval_estimator.evaluate(pred_input_fn,
                                               steps=num_eval_steps)
        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        tf.logging.info("Evaluation complete.")

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                                value={
                                    "epoch": cycle_index,
                                    "value": FLAGS.hr_threshold
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                                value={
                                    "epoch": cycle_index,
                                    "value": hr
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                                value={
                                    "epoch": cycle_index,
                                    "value": rconst.NUM_EVAL_NEGATIVES
                                })

        # Logged by the async process during record creation.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                                deferred=True)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                value=cycle_index)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
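One detail worth noting in the example above is the eval batch rounding: every evaluated user contributes rconst.NUM_EVAL_NEGATIVES sampled negatives plus one positive, so eval_batch_size is rounded down to a multiple of eval_per_user, keeping each user's examples inside a single batch. A small illustrative check follows; the constant value of 999 and the requested batch size are assumptions.

NUM_EVAL_NEGATIVES = 999  # assumed value of rconst.NUM_EVAL_NEGATIVES
eval_per_user = NUM_EVAL_NEGATIVES + 1  # one positive plus the sampled negatives

requested = 100123  # hypothetical --eval_batch_size
eval_batch_size = requested // eval_per_user * eval_per_user
print(eval_batch_size)  # 100000: no user's examples are split across batches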
Example n. 19
0
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives
                           * (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
      data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(
        eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
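The approx_train_steps estimate in the example above follows directly from the sampling scheme: each training positive is paired with num_neg sampled negatives, so one epoch holds roughly num_train_positives * (1 + num_neg) examples. A worked instance with illustrative MovieLens-1M-sized numbers (the positive count and flag values are assumptions, not read from the pipeline) is below.

num_train_positives = 994169  # roughly ml-1m ratings minus one held-out item per user (assumed)
num_neg = 4                   # illustrative number of sampled negatives per positive
batch_size = 256              # illustrative --batch_size

approx_train_steps = num_train_positives * (1 + num_neg) // batch_size
print(approx_train_steps)  # 19417 batches per epoch under these assumptions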
Example n. 20
0
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    params = parse_flags(FLAGS)
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    if FLAGS.use_synthetic_data:
        producer = data_pipeline.DummyConstructor()
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            params=params,
            constructor_type=FLAGS.constructor_type,
            deterministic=FLAGS.seed is not None)

        num_train_steps = (producer.train_batches_per_epoch //
                           params["batches_per_step"])
        num_eval_steps = (producer.eval_batches_per_epoch //
                          params["batches_per_step"])
        assert not producer.train_batches_per_epoch % params["batches_per_step"]
        assert not producer.eval_batches_per_epoch % params["batches_per_step"]
    producer.start()

    params["num_users"], params["num_items"] = num_users, num_items
    model_helpers.apply_clean(flags.FLAGS)

    estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

    benchmark_logger, train_hooks = log_and_get_hooks(
        params["eval_batch_size"])

    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
        assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                value=cycle_index)

        train_input_fn = producer.make_input_fn(is_training=True)
        estimator.train(input_fn=train_input_fn,
                        hooks=train_hooks,
                        steps=num_train_steps)

        tf.logging.info("Beginning evaluation.")
        eval_input_fn = producer.make_input_fn(is_training=False)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
        tf.logging.info("Evaluation complete.")

        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        loss = float(eval_results["loss"])

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                                value={
                                    "epoch": cycle_index,
                                    "value": FLAGS.hr_threshold
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                                value={
                                    "epoch": cycle_index,
                                    "value": hr
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                                value={
                                    "epoch": cycle_index,
                                    "value": rconst.NUM_EVAL_NEGATIVES
                                })

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                value=cycle_index)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info(
            "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
                cycle_index + 1, hr, ndcg, loss))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
    producer.stop_loop()
    producer.join()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
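All of the run_ncf variants above take a single unused argument and read their configuration from absl flags, which is the calling convention absl's app.run expects. A minimal sketch of how such an entry point is typically wired up is shown below; define_ncf_flags is assumed to be the flag-registration helper from the same code base and is not defined in this listing.

from absl import app as absl_app
from absl import flags
import tensorflow as tf


def main(_):
    # The real scripts also wrap this call in benchmark logging and an MLPerf
    # logging context; this sketch only shows the dispatch.
    run_ncf(flags.FLAGS)


if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.INFO)
    define_ncf_flags()  # assumed helper that registers --dataset, --data_dir, etc.
    absl_app.run(main)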