Beispiel #1
0
def construct_cache(dataset, data_dir, num_data_readers, match_mlperf,
                    deterministic):
    # type: (str, str, int, bool, bool) -> NCFDataset
    """Load and digest data CSV into a usable form.

    Args:
      dataset: The name of the dataset to be used.
      data_dir: The root directory of the dataset.
      num_data_readers: The number of parallel processes which will request
        data during training. A falsy value falls back to half the CPU count
        (at least 1).
      match_mlperf: If True, change the behavior of the cache construction to
        match the MLPerf reference implementation.
      deterministic: Try to enforce repeatable behavior, even at the cost of
        performance.

    Returns:
      An NCFDataset built from the user/item maps and the cache paths.

    Raises:
      ValueError: If the cache directory already exists, or if the number of
        users or items found in the ratings file differs from the counts
        registered for `dataset` in DATASET_TO_NUM_USERS_AND_ITEMS.
    """
    cache_paths = rconst.Paths(data_dir=data_dir)
    num_data_readers = (num_data_readers
                        or int(multiprocessing.cpu_count() / 2) or 1)
    approx_num_shards = int(movielens.NUM_RATINGS[dataset] //
                            rconst.APPROX_PTS_PER_TRAIN_SHARD) or 1

    st = timeit.default_timer()
    # Guard the exact directory that is created (and reported) below. Joining
    # data_dir onto cache_root again could test a different path than the one
    # MakeDirs creates whenever cache_root is not absolute.
    if tf.gfile.Exists(cache_paths.cache_root):
        raise ValueError("{} unexpectedly already exists.".format(
            cache_paths.cache_root))
    tf.logging.info(
        "Creating cache directory. This should be deleted on exit.")
    tf.gfile.MakeDirs(cache_paths.cache_root)

    raw_rating_path = os.path.join(data_dir, dataset, movielens.RATINGS_FILE)
    df, user_map, item_map = _filter_index_sort(raw_rating_path, match_mlperf)
    num_users, num_items = DATASET_TO_NUM_USERS_AND_ITEMS[dataset]

    # Sanity-check the digested data against the expected dataset statistics.
    if num_users != len(user_map):
        raise ValueError("Expected to find {} users, but found {}".format(
            num_users, len(user_map)))
    if num_items != len(item_map):
        raise ValueError("Expected to find {} items, but found {}".format(
            num_items, len(item_map)))

    generate_train_eval_data(df=df,
                             approx_num_shards=approx_num_shards,
                             num_items=len(item_map),
                             cache_paths=cache_paths,
                             match_mlperf=match_mlperf)
    del approx_num_shards  # value may have changed.

    # NOTE(review): len(df) - len(user_map) presumably excludes one held-out
    # rating per user -- confirm against generate_train_eval_data.
    ncf_dataset = NCFDataset(user_map=user_map,
                             item_map=item_map,
                             num_data_readers=num_data_readers,
                             cache_paths=cache_paths,
                             num_train_positives=len(df) - len(user_map),
                             deterministic=deterministic)

    run_time = timeit.default_timer() - st
    tf.logging.info(
        "Cache construction complete. Time: {:.1f} sec.".format(run_time))

    return ncf_dataset
Beispiel #2
0
    def test_shard_randomness(self):
        """Eval data lists user 0 then user 1, with differing item draws."""
        ratings = pd.DataFrame({
            movielens.USER_COLUMN: [0, 0, 0, 0, 1, 1, 1, 1],
            movielens.ITEM_COLUMN: [0, 2, 4, 6, 0, 2, 4, 6],
            movielens.TIMESTAMP_COLUMN: [1, 2, 3, 4, 1, 2, 3, 4]
        })
        paths = rconst.Paths(data_dir=self.temp_data_dir)

        # Seed before generation so the run is repeatable.
        np.random.seed(1)
        data_preprocessing.generate_train_eval_data(
            ratings,
            approx_num_shards=2,
            num_items=10,
            cache_paths=paths,
            match_mlperf=True)

        with tf.gfile.Open(paths.eval_raw_file, "rb") as eval_file:
            eval_data = pickle.load(eval_file)
        features = eval_data[0]

        # Every user contributes one block of NUM_EVAL_NEGATIVES + 1 rows.
        block = rconst.NUM_EVAL_NEGATIVES + 1
        expected_users = [0] * block + [1] * block
        self.assertAllClose(features[movielens.USER_COLUMN], expected_users)

        # Each shard process should generate different random items.
        item_column = features[movielens.ITEM_COLUMN]
        self.assertNotAllClose(item_column[:block], item_column[block:])
Beispiel #3
0
    def test_shard_randomness(self):
        """Process each raw train shard and check the shards draw different items.

        Two users with identical rating histories are written out as two
        shards. Each shard is processed directly via
        data_async_generation._process_shard with its own random seed, and the
        resulting item columns for the two users must not be equal.
        """
        users = [0, 0, 0, 0, 1, 1, 1, 1]
        items = [0, 2, 4, 6, 0, 2, 4, 6]
        times = [1, 2, 3, 4, 1, 2, 3, 4]
        df = pd.DataFrame({
            movielens.USER_COLUMN: users,
            movielens.ITEM_COLUMN: items,
            movielens.TIMESTAMP_COLUMN: times
        })
        cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
        np.random.seed(1)

        num_shards = 2
        num_items = 10
        data_preprocessing.generate_train_eval_data(
            df,
            approx_num_shards=num_shards,
            num_items=num_items,
            cache_paths=cache_paths,
            match_mlperf=True)

        raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
        # Use a TestCase assertion: a bare assert is stripped under -O and
        # reports nothing useful on failure.
        self.assertEqual(len(raw_shards), num_shards)

        # Iterate the shards that actually exist rather than a hard-coded
        # count, so this loop cannot drift from num_shards.
        sharded_eval_data = []
        for shard_name in raw_shards:
            shard_path = os.path.join(cache_paths.train_shard_subdir,
                                      shard_name)
            sharded_eval_data.append(
                data_async_generation._process_shard(
                    (shard_path, num_items, rconst.NUM_EVAL_NEGATIVES,
                     stat_utils.random_int32(), False, True)))

        if sharded_eval_data[0][0][0] == 1:
            # Order is not assured for this part of the pipeline; put the
            # shard holding user 0 first so the expectations below line up.
            sharded_eval_data.reverse()

        # Positions 0 and 1 of each shard result are treated as the user and
        # item columns; only those two are needed by the checks below.
        eval_data = {
            movielens.USER_COLUMN:
                np.concatenate([shard[0] for shard in sharded_eval_data]),
            movielens.ITEM_COLUMN:
                np.concatenate([shard[1] for shard in sharded_eval_data]),
        }

        eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
        self.assertAllClose(eval_data[movielens.USER_COLUMN],
                            [0] * eval_items_per_user +
                            [1] * eval_items_per_user)

        # Each shard process should generate different random items.
        self.assertNotAllClose(
            eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
            eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])
Beispiel #4
0
def main(_):
    """Run the data generation loop, optionally redirecting output to a log file.

    When the `redirect_logs` flag is set, stdout, stderr and the absl logger
    are pointed at a log file under the data directory (or under the system
    temp directory when the data directory is a "gs://" path, because the log
    is opened with the builtin open()). On exit the original streams are
    restored and the extra logging handler is removed before the log file is
    closed, so nothing is left writing to a closed file.

    Args:
      _: Unused positional argument.

    Raises:
      Any exception raised by the generation loop other than
      KeyboardInterrupt is printed via traceback and re-raised.
    """
    redirect_logs = flags.FLAGS.redirect_logs
    cache_paths = rconst.Paths(data_dir=flags.FLAGS.data_dir,
                               cache_id=flags.FLAGS.cache_id)

    log_file_name = "data_gen_proc_{}.log".format(cache_paths.cache_id)
    log_file = os.path.join(cache_paths.data_dir, log_file_name)
    if log_file.startswith("gs://") and redirect_logs:
        fallback_log_file = os.path.join(tempfile.gettempdir(), log_file_name)
        print("Unable to log to {}. Falling back to {}".format(
            log_file, fallback_log_file))
        log_file = fallback_log_file

    # Remember the real streams so they can be put back before the log file
    # is closed.
    original_stdout = sys.stdout
    original_stderr = sys.stderr
    log_stream = None
    log_handler = None

    # This server is generally run in a subprocess.
    if redirect_logs:
        print("Redirecting stdout and stderr to {}".format(log_file))
        log_stream = open(log_file, "wt")  # Note: not tf.gfile.Open().
    try:
        if redirect_logs:
            log_handler = logging.StreamHandler(stream=log_stream)
            absl_logging.get_absl_logger().addHandler(hdlr=log_handler)
            sys.stdout = log_stream
            sys.stderr = log_stream
            print("Logs redirected.")
        try:
            log_msg("sys.argv: {}".format(" ".join(sys.argv)))

            if flags.FLAGS.seed is not None:
                np.random.seed(flags.FLAGS.seed)

            _generation_loop(
                num_workers=flags.FLAGS.num_workers,
                cache_paths=cache_paths,
                num_readers=flags.FLAGS.num_readers,
                num_neg=flags.FLAGS.num_neg,
                num_train_positives=flags.FLAGS.num_train_positives,
                num_items=flags.FLAGS.num_items,
                spillover=flags.FLAGS.spillover,
                epochs_per_cycle=flags.FLAGS.epochs_per_cycle,
                train_batch_size=flags.FLAGS.train_batch_size,
                eval_batch_size=flags.FLAGS.eval_batch_size,
            )
        except KeyboardInterrupt:
            log_msg("KeyboardInterrupt registered.")
        except BaseException:
            # Record the traceback in the (possibly redirected) stderr, then
            # let the failure propagate.
            traceback.print_exc()
            raise
    finally:
        log_msg("Shutting down generation subprocess.")
        sys.stdout.flush()
        sys.stderr.flush()
        if redirect_logs:
            # Undo the redirection before closing the file; otherwise a
            # re-raised exception or later logging would write to a closed
            # stream.
            sys.stdout = original_stdout
            sys.stderr = original_stderr
            if log_handler is not None:
                absl_logging.get_absl_logger().removeHandler(log_handler)
            log_stream.close()
def main(_):
  """Parse the flagfile, set up optional log redirection, and run generation.

  Args:
    _: Unused positional argument.
  """
  global _log_file
  cache_paths = rconst.Paths(
      data_dir=flags.FLAGS.data_dir, cache_id=flags.FLAGS.cache_id)

  # Flags below are read only after the flagfile has been parsed.
  _parse_flagfile(os.path.join(cache_paths.cache_root, rconst.FLAGFILE))

  redirect_logs = flags.FLAGS.redirect_logs

  log_file_name = "data_gen_proc_{}.log".format(cache_paths.cache_id)
  log_path = os.path.join(cache_paths.data_dir, log_file_name)
  if redirect_logs and log_path.startswith("gs://"):
    # The log is opened with the builtin open(), so use a local temp path.
    local_fallback = os.path.join(tempfile.gettempdir(), log_file_name)
    print("Unable to log to {}. Falling back to {}"
          .format(log_path, local_fallback))
    log_path = local_fallback

  # This server is generally run in a subprocess.
  if redirect_logs:
    print("Redirecting output of data_async_generation.py process to {}"
          .format(log_path))
    _log_file = open(log_path, "wt")  # Note: not tf.gfile.Open().
  try:
    log_msg("sys.argv: {}".format(" ".join(sys.argv)))

    seed = flags.FLAGS.seed
    if seed is not None:
      np.random.seed(seed)

    parsed = flags.FLAGS
    loop_kwargs = dict(
        num_workers=parsed.num_workers,
        cache_paths=cache_paths,
        num_readers=parsed.num_readers,
        num_neg=parsed.num_neg,
        num_train_positives=parsed.num_train_positives,
        num_items=parsed.num_items,
        num_users=parsed.num_users,
        epochs_per_cycle=parsed.epochs_per_cycle,
        train_batch_size=parsed.train_batch_size,
        eval_batch_size=parsed.eval_batch_size,
        deterministic=parsed.seed is not None,
        match_mlperf=parsed.ml_perf,
    )
    _generation_loop(**loop_kwargs)
  except KeyboardInterrupt:
    log_msg("KeyboardInterrupt registered.")
  except:
    # Write the traceback to the log file, then propagate the failure.
    traceback.print_exc(file=_log_file)
    raise
  finally:
    log_msg("Shutting down generation subprocess.")
    for stream in (sys.stdout, sys.stderr):
      stream.flush()
    if redirect_logs:
      _log_file.close()
def construct_cache(dataset, data_dir, num_data_readers):
    # type: (str, str, int) -> NCFDataset
    """Load and digest data CSV into a usable form.

    Args:
      dataset: The name of the dataset to be used.
      data_dir: The root directory of the dataset.
      num_data_readers: The number of parallel processes which will request
        data during training. A falsy value falls back to half the CPU count
        (at least 1).

    Returns:
      An NCFDataset built from the user/item maps and the cache paths.

    Raises:
      ValueError: If the cache directory already exists.
    """
    cache_paths = rconst.Paths(data_dir=data_dir)
    num_data_readers = (num_data_readers
                        or int(multiprocessing.cpu_count() / 2) or 1)
    approx_num_shards = int(movielens.NUM_RATINGS[dataset] //
                            rconst.APPROX_PTS_PER_TRAIN_SHARD) or 1

    st = timeit.default_timer()
    # Guard the exact directory that is created (and reported) below. Joining
    # data_dir onto cache_root again could test a different path than the one
    # MakeDirs creates whenever cache_root is not absolute.
    if tf.gfile.Exists(cache_paths.cache_root):
        raise ValueError("{} unexpectedly already exists.".format(
            cache_paths.cache_root))
    tf.logging.info(
        "Creating cache directory. This should be deleted on exit.")
    tf.gfile.MakeDirs(cache_paths.cache_root)

    raw_rating_path = os.path.join(data_dir, dataset, movielens.RATINGS_FILE)
    df, user_map, item_map = _filter_index_sort(raw_rating_path)

    generate_train_eval_data(df=df,
                             approx_num_shards=approx_num_shards,
                             num_items=len(item_map),
                             cache_paths=cache_paths)
    del approx_num_shards  # value may have changed.

    # NOTE(review): len(df) - len(user_map) presumably excludes one held-out
    # rating per user -- confirm against generate_train_eval_data.
    ncf_dataset = NCFDataset(user_map=user_map,
                             item_map=item_map,
                             num_data_readers=num_data_readers,
                             cache_paths=cache_paths,
                             num_train_positives=len(df) - len(user_map))

    run_time = timeit.default_timer() - st
    tf.logging.info(
        "Cache construction complete. Time: {:.1f} sec.".format(run_time))

    return ncf_dataset
Beispiel #7
0
def main(_):
    """Announce liveness, load flags, and run the generation loop.

    Args:
      _: Unused positional argument.
    """
    # Note: The async process must execute the following two steps in the
    #       following order BEFORE doing anything else:
    #       1) Write the alive file
    #       2) Wait for the flagfile to be written.
    global _log_file
    cache_paths = rconst.Paths(data_dir=flags.FLAGS.data_dir,
                               cache_id=flags.FLAGS.cache_id)
    write_alive_file(cache_paths=cache_paths)

    _parse_flagfile(os.path.join(cache_paths.cache_root, rconst.FLAGFILE))

    redirect_logs = flags.FLAGS.redirect_logs

    log_file_name = "data_gen_proc_{}.log".format(cache_paths.cache_id)
    log_path = os.path.join(cache_paths.data_dir, log_file_name)
    if redirect_logs and log_path.startswith("gs://"):
        # The log is opened with the builtin open(), so use a local temp path.
        local_fallback = os.path.join(tempfile.gettempdir(), log_file_name)
        print("Unable to log to {}. Falling back to {}".format(
            log_path, local_fallback))
        log_path = local_fallback

    # This server is generally run in a subprocess.
    if redirect_logs:
        redirect_notice = (
            "Redirecting output of data_async_generation.py process to {}")
        print(redirect_notice.format(log_path))
        _log_file = open(log_path, "wt")  # Note: not tf.gfile.Open().
    try:
        log_msg("sys.argv: {}".format(" ".join(sys.argv)))

        seed = flags.FLAGS.seed
        if seed is not None:
            np.random.seed(seed)

        compliance_logger = mlperf_helper.LOGGER(
            enable=flags.FLAGS.output_ml_perf_compliance_logging)
        with compliance_logger:
            ncf_root = os.path.split(os.path.abspath(__file__))[0]
            mlperf_helper.set_ncf_root(ncf_root)

            parsed = flags.FLAGS
            loop_kwargs = dict(
                num_workers=parsed.num_workers,
                cache_paths=cache_paths,
                num_readers=parsed.num_readers,
                num_neg=parsed.num_neg,
                num_train_positives=parsed.num_train_positives,
                num_items=parsed.num_items,
                num_users=parsed.num_users,
                epochs_per_cycle=parsed.epochs_per_cycle,
                num_cycles=parsed.num_cycles,
                train_batch_size=parsed.train_batch_size,
                eval_batch_size=parsed.eval_batch_size,
                deterministic=parsed.seed is not None,
                match_mlperf=parsed.ml_perf,
            )
            _generation_loop(**loop_kwargs)
    except KeyboardInterrupt:
        log_msg("KeyboardInterrupt registered.")
    except:
        # Write the traceback to the log file, then propagate the failure.
        traceback.print_exc(file=_log_file)
        raise
    finally:
        log_msg("Shutting down generation subprocess.")
        for stream in (sys.stdout, sys.stderr):
            stream.flush()
        if redirect_logs:
            _log_file.close()