def construct_cache(dataset, data_dir, num_data_readers, match_mlperf,
                    deterministic):
    # type: (str, str, int, bool, bool) -> NCFDataset
    """Load and digest data CSV into a usable form.

    Args:
      dataset: The name of the dataset to be used.
      data_dir: The root directory of the dataset.
      num_data_readers: The number of parallel processes which will request
        data during training. A falsy value (e.g. 0 or None) falls back to
        half of the CPU count, and to 1 if that is zero as well.
      match_mlperf: If True, change the behavior of the cache construction to
        match the MLPerf reference implementation.
      deterministic: Try to enforce repeatable behavior, even at the cost of
        performance.

    Returns:
      The NCFDataset describing the newly constructed cache.

    Raises:
      ValueError: If the cache directory already exists, or if the number of
        users or items found in the ratings file differs from the counts
        registered for `dataset` in DATASET_TO_NUM_USERS_AND_ITEMS.
    """
    cache_paths = rconst.Paths(data_dir=data_dir)
    num_data_readers = (num_data_readers
                        or multiprocessing.cpu_count() // 2
                        or 1)
    approx_num_shards = int(movielens.NUM_RATINGS[dataset]
                            // rconst.APPROX_PTS_PER_TRAIN_SHARD) or 1

    st = timeit.default_timer()

    # Test the very path that is reported and created below. The check used to
    # re-join `data_dir` onto `cache_paths.cache_root`, which names the same
    # directory only when `cache_root` is absolute; otherwise an existing
    # cache directory went undetected.
    if tf.gfile.Exists(cache_paths.cache_root):
        raise ValueError("{} unexpectedly already exists.".format(
            cache_paths.cache_root))
    tf.logging.info(
        "Creating cache directory. This should be deleted on exit.")
    tf.gfile.MakeDirs(cache_paths.cache_root)

    raw_rating_path = os.path.join(data_dir, dataset, movielens.RATINGS_FILE)
    df, user_map, item_map = _filter_index_sort(raw_rating_path, match_mlperf)

    # Guard against a truncated or otherwise unexpected ratings file.
    num_users, num_items = DATASET_TO_NUM_USERS_AND_ITEMS[dataset]
    if num_users != len(user_map):
        raise ValueError("Expected to find {} users, but found {}".format(
            num_users, len(user_map)))
    if num_items != len(item_map):
        raise ValueError("Expected to find {} items, but found {}".format(
            num_items, len(item_map)))

    generate_train_eval_data(df=df, approx_num_shards=approx_num_shards,
                             num_items=len(item_map), cache_paths=cache_paths,
                             match_mlperf=match_mlperf)
    del approx_num_shards  # value may have changed.

    # NOTE(review): presumably one rating per user is held out for evaluation,
    # hence the subtraction -- confirm against generate_train_eval_data.
    ncf_dataset = NCFDataset(user_map=user_map, item_map=item_map,
                             num_data_readers=num_data_readers,
                             cache_paths=cache_paths,
                             num_train_positives=len(df) - len(user_map),
                             deterministic=deterministic)

    run_time = timeit.default_timer() - st
    tf.logging.info(
        "Cache construction complete. Time: {:.1f} sec.".format(run_time))

    return ncf_dataset
def test_shard_randomness(self):
    """Check that the eval items generated for the two users differ."""
    ratings = {
        movielens.USER_COLUMN: [0, 0, 0, 0, 1, 1, 1, 1],
        movielens.ITEM_COLUMN: [0, 2, 4, 6, 0, 2, 4, 6],
        movielens.TIMESTAMP_COLUMN: [1, 2, 3, 4, 1, 2, 3, 4],
    }
    ratings_df = pd.DataFrame(ratings)
    paths = rconst.Paths(data_dir=self.temp_data_dir)

    np.random.seed(1)
    data_preprocessing.generate_train_eval_data(
        ratings_df,
        approx_num_shards=2,
        num_items=10,
        cache_paths=paths,
        match_mlperf=True)

    with tf.gfile.Open(paths.eval_raw_file, "rb") as eval_file:
        eval_data = pickle.load(eval_file)
    first_entry = eval_data[0]

    # One positive item plus the negatives for every user.
    per_user = rconst.NUM_EVAL_NEGATIVES + 1
    expected_users = [0] * per_user + [1] * per_user
    self.assertAllClose(first_entry[movielens.USER_COLUMN], expected_users)

    # Each shard process should generate different random items.
    eval_items = first_entry[movielens.ITEM_COLUMN]
    self.assertNotAllClose(eval_items[:per_user], eval_items[per_user:])
def test_shard_randomness(self):
    """Check that the two training shards yield different eval items."""
    users = [0, 0, 0, 0, 1, 1, 1, 1]
    items = [0, 2, 4, 6, 0, 2, 4, 6]
    times = [1, 2, 3, 4, 1, 2, 3, 4]
    df = pd.DataFrame({
        movielens.USER_COLUMN: users,
        movielens.ITEM_COLUMN: items,
        movielens.TIMESTAMP_COLUMN: times,
    })
    cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
    np.random.seed(1)

    num_shards = 2
    num_items = 10
    data_preprocessing.generate_train_eval_data(
        df, approx_num_shards=num_shards, num_items=num_items,
        cache_paths=cache_paths, match_mlperf=True)

    raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
    # A unittest assertion (not a bare `assert`) so the check survives `-O`
    # and reports both values on failure.
    self.assertEqual(len(raw_shards), num_shards)

    # Process every listed shard rather than a hard-coded count, so this stays
    # in step with `num_shards`.
    sharded_eval_data = [
        data_async_generation._process_shard(
            (os.path.join(cache_paths.train_shard_subdir, shard_name),
             num_items, rconst.NUM_EVAL_NEGATIVES, stat_utils.random_int32(),
             False, True))
        for shard_name in raw_shards
    ]

    if sharded_eval_data[0][0][0] == 1:
        # Order is not assured for this part of the pipeline.
        sharded_eval_data.reverse()

    # Each shard result is indexed as three arrays here; only the first two
    # (users and items) are inspected below.
    eval_data = [
        np.concatenate([shard[i] for shard in sharded_eval_data])
        for i in range(3)
    ]
    eval_data = {
        movielens.USER_COLUMN: eval_data[0],
        movielens.ITEM_COLUMN: eval_data[1],
    }

    eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    self.assertAllClose(
        eval_data[movielens.USER_COLUMN],
        [0] * eval_items_per_user + [1] * eval_items_per_user)

    # Each shard process should generate different random items.
    self.assertNotAllClose(
        eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
        eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])
def main(_):
    """Run the data generation loop, optionally redirecting output to a file.

    When the `redirect_logs` flag is set, stdout, stderr and the absl logger
    are pointed at a log file for the duration of the run. The original
    streams are restored and the extra logging handler is removed before the
    file is closed, so nothing is left referring to a closed file.

    Args:
      _: Ignored.
    """
    redirect_logs = flags.FLAGS.redirect_logs
    cache_paths = rconst.Paths(data_dir=flags.FLAGS.data_dir,
                               cache_id=flags.FLAGS.cache_id)

    log_file_name = "data_gen_proc_{}.log".format(cache_paths.cache_id)
    log_file = os.path.join(cache_paths.data_dir, log_file_name)
    if log_file.startswith("gs://") and redirect_logs:
        # The log is opened with the builtin open() below, which cannot write
        # to a gs:// path, so fall back to the local temp directory.
        fallback_log_file = os.path.join(tempfile.gettempdir(), log_file_name)
        print("Unable to log to {}. Falling back to {}".format(
            log_file, fallback_log_file))
        log_file = fallback_log_file

    # Remember the real streams so they can be put back on the way out.
    original_stdout, original_stderr = sys.stdout, sys.stderr
    log_stream = None
    log_handler = None

    # This server is generally run in a subprocess.
    if redirect_logs:
        print("Redirecting stdout and stderr to {}".format(log_file))
        log_stream = open(log_file, "wt")  # Note: not tf.gfile.Open().

    try:
        if redirect_logs:
            log_handler = logging.StreamHandler(stream=log_stream)
            absl_logging.get_absl_logger().addHandler(hdlr=log_handler)
            sys.stdout = log_stream
            sys.stderr = log_stream
            print("Logs redirected.")

        try:
            log_msg("sys.argv: {}".format(" ".join(sys.argv)))

            if flags.FLAGS.seed is not None:
                np.random.seed(flags.FLAGS.seed)

            _generation_loop(
                num_workers=flags.FLAGS.num_workers,
                cache_paths=cache_paths,
                num_readers=flags.FLAGS.num_readers,
                num_neg=flags.FLAGS.num_neg,
                num_train_positives=flags.FLAGS.num_train_positives,
                num_items=flags.FLAGS.num_items,
                spillover=flags.FLAGS.spillover,
                epochs_per_cycle=flags.FLAGS.epochs_per_cycle,
                train_batch_size=flags.FLAGS.train_batch_size,
                eval_batch_size=flags.FLAGS.eval_batch_size,
            )
        except KeyboardInterrupt:
            log_msg("KeyboardInterrupt registered.")
        except:  # Deliberately bare: record the traceback, then re-raise.
            traceback.print_exc()
            raise
    finally:
        log_msg("Shutting down generation subprocess.")
        sys.stdout.flush()
        sys.stderr.flush()
        if redirect_logs:
            # Detach everything from the log stream before closing it.
            if log_handler is not None:
                absl_logging.get_absl_logger().removeHandler(log_handler)
            sys.stdout = original_stdout
            sys.stderr = original_stderr
            log_stream.close()
def main(_):
    """Parse the flagfile, set up logging, and run the generation loop.

    Flags other than `data_dir` and `cache_id` are read only after the
    flagfile has been parsed. When `redirect_logs` is set, the module-level
    `_log_file` is opened for the run and closed on the way out.

    Args:
      _: Ignored.
    """
    global _log_file

    cache_paths = rconst.Paths(data_dir=flags.FLAGS.data_dir,
                               cache_id=flags.FLAGS.cache_id)
    _parse_flagfile(os.path.join(cache_paths.cache_root, rconst.FLAGFILE))

    opts = flags.FLAGS
    redirect_logs = opts.redirect_logs

    log_file_name = "data_gen_proc_{}.log".format(cache_paths.cache_id)
    log_path = os.path.join(cache_paths.data_dir, log_file_name)
    if redirect_logs and log_path.startswith("gs://"):
        # The log is opened with the builtin open(), so use a local path.
        local_log_path = os.path.join(tempfile.gettempdir(), log_file_name)
        print("Unable to log to {}. Falling back to {}".format(
            log_path, local_log_path))
        log_path = local_log_path

    # This server is generally run in a subprocess.
    if redirect_logs:
        notice = "Redirecting output of data_async_generation.py process to {}"
        print(notice.format(log_path))
        _log_file = open(log_path, "wt")  # Note: not tf.gfile.Open().

    try:
        log_msg("sys.argv: {}".format(" ".join(sys.argv)))

        seed = opts.seed
        if seed is not None:
            np.random.seed(seed)

        _generation_loop(
            num_workers=opts.num_workers,
            cache_paths=cache_paths,
            num_readers=opts.num_readers,
            num_neg=opts.num_neg,
            num_train_positives=opts.num_train_positives,
            num_items=opts.num_items,
            num_users=opts.num_users,
            epochs_per_cycle=opts.epochs_per_cycle,
            train_batch_size=opts.train_batch_size,
            eval_batch_size=opts.eval_batch_size,
            deterministic=seed is not None,
            match_mlperf=opts.ml_perf,
        )
    except KeyboardInterrupt:
        log_msg("KeyboardInterrupt registered.")
    except:  # Deliberately bare: record the traceback, then re-raise.
        traceback.print_exc(file=_log_file)
        raise
    finally:
        log_msg("Shutting down generation subprocess.")
        sys.stdout.flush()
        sys.stderr.flush()
        if redirect_logs:
            _log_file.close()
def construct_cache(dataset, data_dir, num_data_readers):
    # type: (str, str, int) -> NCFDataset
    """Load and digest data CSV into a usable form.

    Args:
      dataset: The name of the dataset to be used.
      data_dir: The root directory of the dataset.
      num_data_readers: The number of parallel processes which will request
        data during training. A falsy value (e.g. 0 or None) falls back to
        half of the CPU count, and to 1 if that is zero as well.

    Returns:
      The NCFDataset describing the newly constructed cache.

    Raises:
      ValueError: If the cache directory already exists.
    """
    cache_paths = rconst.Paths(data_dir=data_dir)
    num_data_readers = (num_data_readers
                        or multiprocessing.cpu_count() // 2
                        or 1)
    approx_num_shards = int(movielens.NUM_RATINGS[dataset]
                            // rconst.APPROX_PTS_PER_TRAIN_SHARD) or 1

    st = timeit.default_timer()

    # Test the very path that is reported and created below. The check used to
    # re-join `data_dir` onto `cache_paths.cache_root`, which names the same
    # directory only when `cache_root` is absolute; otherwise an existing
    # cache directory went undetected.
    if tf.gfile.Exists(cache_paths.cache_root):
        raise ValueError("{} unexpectedly already exists.".format(
            cache_paths.cache_root))
    tf.logging.info(
        "Creating cache directory. This should be deleted on exit.")
    tf.gfile.MakeDirs(cache_paths.cache_root)

    raw_rating_path = os.path.join(data_dir, dataset, movielens.RATINGS_FILE)
    df, user_map, item_map = _filter_index_sort(raw_rating_path)

    generate_train_eval_data(df=df, approx_num_shards=approx_num_shards,
                             num_items=len(item_map), cache_paths=cache_paths)
    del approx_num_shards  # value may have changed.

    # NOTE(review): presumably one rating per user is held out for evaluation,
    # hence the subtraction -- confirm against generate_train_eval_data.
    ncf_dataset = NCFDataset(user_map=user_map, item_map=item_map,
                             num_data_readers=num_data_readers,
                             cache_paths=cache_paths,
                             num_train_positives=len(df) - len(user_map))

    run_time = timeit.default_timer() - st
    tf.logging.info(
        "Cache construction complete. Time: {:.1f} sec.".format(run_time))

    return ncf_dataset
def main(_):
    """Announce liveness, load flags, set up logging, and generate data.

    When `redirect_logs` is set, the module-level `_log_file` is opened for
    the run and closed on the way out.

    Args:
      _: Ignored.
    """
    # Note: The async process must execute the following two steps in the
    #       following order BEFORE doing anything else:
    #       1) Write the alive file
    #       2) Wait for the flagfile to be written.
    global _log_file

    cache_paths = rconst.Paths(data_dir=flags.FLAGS.data_dir,
                               cache_id=flags.FLAGS.cache_id)
    write_alive_file(cache_paths=cache_paths)
    _parse_flagfile(os.path.join(cache_paths.cache_root, rconst.FLAGFILE))

    opts = flags.FLAGS
    redirect_logs = opts.redirect_logs

    log_file_name = "data_gen_proc_{}.log".format(cache_paths.cache_id)
    log_path = os.path.join(cache_paths.data_dir, log_file_name)
    if redirect_logs and log_path.startswith("gs://"):
        # The log is opened with the builtin open(), so use a local path.
        local_log_path = os.path.join(tempfile.gettempdir(), log_file_name)
        print("Unable to log to {}. Falling back to {}".format(
            log_path, local_log_path))
        log_path = local_log_path

    # This server is generally run in a subprocess.
    if redirect_logs:
        notice = "Redirecting output of data_async_generation.py process to {}"
        print(notice.format(log_path))
        _log_file = open(log_path, "wt")  # Note: not tf.gfile.Open().

    try:
        log_msg("sys.argv: {}".format(" ".join(sys.argv)))

        seed = opts.seed
        if seed is not None:
            np.random.seed(seed)

        with mlperf_helper.LOGGER(
                enable=opts.output_ml_perf_compliance_logging):
            # Directory containing this source file.
            mlperf_helper.set_ncf_root(
                os.path.dirname(os.path.abspath(__file__)))
            _generation_loop(
                num_workers=opts.num_workers,
                cache_paths=cache_paths,
                num_readers=opts.num_readers,
                num_neg=opts.num_neg,
                num_train_positives=opts.num_train_positives,
                num_items=opts.num_items,
                num_users=opts.num_users,
                epochs_per_cycle=opts.epochs_per_cycle,
                num_cycles=opts.num_cycles,
                train_batch_size=opts.train_batch_size,
                eval_batch_size=opts.eval_batch_size,
                deterministic=seed is not None,
                match_mlperf=opts.ml_perf,
            )
    except KeyboardInterrupt:
        log_msg("KeyboardInterrupt registered.")
    except:  # Deliberately bare: record the traceback, then re-raise.
        traceback.print_exc(file=_log_file)
        raise
    finally:
        log_msg("Shutting down generation subprocess.")
        sys.stdout.flush()
        sys.stderr.flush()
        if redirect_logs:
            _log_file.close()