def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_cycles, num_data_readers=None, num_neg=4,
                         epochs_per_cycle=1, match_mlperf=False,
                         deterministic=False, use_subprocess=True,
                         cache_id=None):
  # type: (...) -> (NCFDataset, typing.Callable)
  """Build the data cache and hand the run configuration to the generator.

  The function creates `data_dir`, builds the cache through `construct_cache`,
  optionally launches the generation subprocess, registers an `atexit` cleanup
  hook, waits for the `subproc_alive` marker file, clears the system caches and
  finally writes the flagfile via `write_flagfile`.

  Args:
    dataset: Forwarded to `construct_cache`.
    data_dir: Directory that is created if needed; forwarded to
      `construct_cache`, to the subprocess command line and to the flagfile.
    batch_size: Stored in the flagfile as "train_batch_size".
    eval_batch_size: Stored in the flagfile as "eval_batch_size".
    num_cycles: Stored in the flagfile as "num_cycles".
    num_data_readers: Forwarded to `construct_cache`.
    num_neg: Stored in the flagfile as "num_neg".
    epochs_per_cycle: Stored in the flagfile as "epochs_per_cycle".
    match_mlperf: Forwarded to `construct_cache` and stored as "ml_perf".
    deterministic: Forwarded to `construct_cache`.
    use_subprocess: If True, a subprocess is spawned from
      `popen_helper.INVOCATION` and is shut down by the cleanup hook.
    cache_id: Forwarded to `construct_cache`.

  Returns:
    A tuple of the object returned by `construct_cache` and the `cleanup`
    callable (which is also registered with `atexit`).

  Raises:
    ValueError: if the `subproc_alive` file does not exist after polling once
      per second for 300 attempts.
  """
  tf.logging.info("Beginning data preprocessing.")
  tf.gfile.MakeDirs(data_dir)
  ncf_dataset = construct_cache(
      dataset=dataset,
      data_dir=data_dir,
      num_data_readers=num_data_readers,
      match_mlperf=match_mlperf,
      deterministic=deterministic,
      cache_id=cache_id)

  # Capping the worker count below the CPU count keeps the pool which backs
  # training data generation from starving other processes.
  num_workers = max(int(multiprocessing.cpu_count() * 0.75), 1)

  flags_ = dict(
      data_dir=data_dir,
      cache_id=ncf_dataset.cache_paths.cache_id,
      num_neg=num_neg,
      num_train_positives=ncf_dataset.num_train_positives,
      num_items=ncf_dataset.num_items,
      num_users=ncf_dataset.num_users,
      num_readers=ncf_dataset.num_data_readers,
      epochs_per_cycle=epochs_per_cycle,
      num_cycles=num_cycles,
      train_batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_workers=num_workers,
      redirect_logs=use_subprocess,
      use_tf_logging=not use_subprocess,
      ml_perf=match_mlperf,
      output_ml_perf_compliance_logging=mlperf_helper.LOGGER.enabled,
  )

  if use_subprocess:
    tf.logging.info("Creating training file subprocess.")

    # The child only needs TensorFlow for tf.gfile. Hiding the GPUs stops it
    # from grabbing GPU memory and contending with the main training process.
    child_env = os.environ.copy()
    child_env["CUDA_VISIBLE_DEVICES"] = ""

    child_cmd = popen_helper.INVOCATION + [
        "--data_dir", data_dir,
        "--cache_id", str(ncf_dataset.cache_paths.cache_id),
    ]
    command_text = " ".join(child_cmd)
    tf.logging.info("Generation subprocess command: {}".format(command_text))
    generation_proc = subprocess.Popen(
        args=child_cmd, shell=False, env=child_env)

  # A mutable cell lets the nested function remember that it already ran, so
  # an explicit call followed by the atexit call does the work only once.
  cleanup_state = {"finished": False}

  @atexit.register
  def cleanup():
    """Stop the generation subprocess (if any) and delete the cache root."""
    if not cleanup_state["finished"]:
      if use_subprocess:
        _shutdown(generation_proc)

      try:
        tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
      except tf.errors.NotFoundError:
        # The cache directory is already gone; nothing left to remove.
        pass

      cleanup_state["finished"] = True

  # Give the generation process time to write its `alive` marker file.
  for _attempt in range(300):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)
  if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
    raise ValueError("Generation subprocess did not start correctly. Data will "
                     "not be available; exiting to avoid waiting forever.")

  # The async process signals that it is alive and then loops waiting for the
  # flagfile. Only once that signal has been seen are the system caches cleared
  # and the run started, after which the flagfile is written.
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_CLEAR_CACHES)
  mlperf_helper.clear_system_caches()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_START)
  write_flagfile(flags_, ncf_dataset)

  return ncf_dataset, cleanup
def _filter_index_sort(raw_rating_path, match_mlperf):
  # type: (str, bool) -> (pd.DataFrame, dict, dict)
  """Load the ratings CSV and return it filtered, re-indexed and sorted.

  Three transformations are applied to the raw CSV of positive items:
    1) Users with fewer than `rconst.MIN_NUM_RATINGS` ratings are dropped.
    2) User and item ids are remapped to zero-based indices in order of first
       appearance, so the largest user id is `num_users - 1` and the largest
       item id is `num_items - 1`.
    3) Rows are sorted by user id with timestamp as the secondary key. A
       user's rows are therefore contiguous and their last row is the one
       with the latest timestamp.

  Everything is done with Pandas. The original authors noted that this takes
  about two minutes, runs only once per run, and that parallelizing it would
  add complexity for no gain on this dataset size (a larger dataset could
  benefit).

  Args:
    raw_rating_path: The path to the CSV which contains the raw dataset.
    match_mlperf: If True, pre-sort by timestamp and then use a mergesort for
      the (user, timestamp) sort, to match the MLPerf reference
      implementation.

  Returns:
    A tuple of the processed dataframe (with the pre-sort index kept as a
    column by `reset_index()`), a dict mapping raw user ids to remapped user
    ids, and a dict mapping raw item ids to remapped item ids.
  """
  with tf.gfile.Open(raw_rating_path) as csv_file:
    df = pd.read_csv(csv_file)

  # Keep only users that have at least rconst.MIN_NUM_RATINGS ratings.
  df = df.groupby(movielens.USER_COLUMN).filter(
      lambda user_rows: len(user_rows) >= rconst.MIN_NUM_RATINGS
  )  # type: pd.DataFrame

  original_users = df[movielens.USER_COLUMN].unique()
  original_items = df[movielens.ITEM_COLUMN].unique()
  num_users = len(original_users)
  num_items = len(original_items)

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_MIN_RATINGS,
                          value=rconst.MIN_NUM_RATINGS)

  # Replace raw ids by their position in the list of unique ids.
  tf.logging.info("Generating user_map and item_map...")
  user_map = dict(zip(original_users, range(num_users)))
  item_map = dict(zip(original_items, range(num_items)))
  for column, id_map in ((movielens.USER_COLUMN, user_map),
                         (movielens.ITEM_COLUMN, item_map)):
    df[column] = df[column].apply(id_map.__getitem__)

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_NUM_EVAL,
                          value=rconst.NUM_EVAL_NEGATIVES)
  mlperf_helper.ncf_print(
      key=mlperf_helper.TAGS.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
      value=match_mlperf)

  # Sanity checks: the counts fit the integer types checked here and the
  # remapped ids are dense.
  assert num_users <= np.iinfo(np.int32).max
  assert num_items <= np.iinfo(np.uint16).max
  assert df[movielens.USER_COLUMN].max() == num_users - 1
  assert df[movielens.ITEM_COLUMN].max() == num_items - 1

  # The sort is what allows the dataframe to be sharded by user, and later
  # allows the last item of a user to be picked for validation.
  tf.logging.info("Sorting by user, timestamp...")
  sort_keys = [movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN]
  sort_options = {}
  if match_mlperf:
    # Equivalent to the non-MLPerf sort except for the relative order of rows
    # that share both user and timestamp. For some reason this ordering gives
    # a better evaluation hit-rate, matching the MLPerf reference
    # implementation.
    df.sort_values(by=movielens.TIMESTAMP_COLUMN, inplace=True)
    sort_options["kind"] = "mergesort"
  df.sort_values(sort_keys, inplace=True, **sort_options)

  # Neither the filter nor the sort rebuilds the index, so do it here.
  return df.reset_index(), user_map, item_map
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False, deterministic=False,
                         use_subprocess=True, cache_id=None):
  # type: (...) -> (NCFDataset, typing.Callable)
  """Prepare the cached dataset and start the negative generation process.

  Steps, in order: create `data_dir`, call `construct_cache`, optionally spawn
  the generation subprocess, register the `atexit` cleanup hook, wait for the
  `subproc_alive` marker file, clear the system caches, and write the flagfile
  with `write_flagfile`.

  Args:
    dataset: Forwarded to `construct_cache`.
    data_dir: Directory that is created if needed; forwarded to
      `construct_cache`, to the subprocess command line and to the flagfile.
    batch_size: Written to the flagfile under "train_batch_size".
    eval_batch_size: Written to the flagfile under "eval_batch_size".
    num_data_readers: Forwarded to `construct_cache`.
    num_neg: Written to the flagfile under "num_neg".
    epochs_per_cycle: Written to the flagfile under "epochs_per_cycle".
    match_mlperf: Forwarded to `construct_cache`; written as "ml_perf".
    deterministic: Forwarded to `construct_cache`.
    use_subprocess: If True, the generator is started as a subprocess using
      `popen_helper.INVOCATION`, and the cleanup hook shuts it down.
    cache_id: Forwarded to `construct_cache`.

  Returns:
    The object returned by `construct_cache` together with the `cleanup`
    callable, which has also been registered with `atexit`.

  Raises:
    ValueError: if `subproc_alive` is still missing after 300 one-second
      polling attempts.
  """
  tf.logging.info("Beginning data preprocessing.")
  tf.gfile.MakeDirs(data_dir)

  cache_kwargs = {
      "dataset": dataset,
      "data_dir": data_dir,
      "num_data_readers": num_data_readers,
      "match_mlperf": match_mlperf,
      "deterministic": deterministic,
      "cache_id": cache_id,
  }
  ncf_dataset = construct_cache(**cache_kwargs)

  # Use only a fraction of the cores (but at least one) so that the worker
  # pool behind training data generation does not starve other processes.
  worker_budget = int(multiprocessing.cpu_count() * 0.75)
  num_workers = worker_budget if worker_budget else 1

  flags_ = {
      "data_dir": data_dir,
      "cache_id": ncf_dataset.cache_paths.cache_id,
      "num_neg": num_neg,
      "num_train_positives": ncf_dataset.num_train_positives,
      "num_items": ncf_dataset.num_items,
      "num_users": ncf_dataset.num_users,
      "num_readers": ncf_dataset.num_data_readers,
      "epochs_per_cycle": epochs_per_cycle,
      "train_batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "num_workers": num_workers,
      "redirect_logs": use_subprocess,
      "use_tf_logging": not use_subprocess,
      "ml_perf": match_mlperf,
      "output_ml_perf_compliance_logging": mlperf_helper.LOGGER.enabled,
  }

  if use_subprocess:
    tf.logging.info("Creating training file subprocess.")

    # TensorFlow is only used for tf.gfile in the subprocess. Without this
    # override it would try to allocate GPU memory by default and compete
    # with the main training process.
    subproc_env = os.environ.copy()
    subproc_env["CUDA_VISIBLE_DEVICES"] = ""

    extra_args = ["--data_dir", data_dir,
                  "--cache_id", str(ncf_dataset.cache_paths.cache_id)]
    subproc_args = popen_helper.INVOCATION + extra_args
    tf.logging.info("Generation subprocess command: {}".format(
        " ".join(subproc_args)))
    proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  # Mutable flag shared with the closure below, making cleanup run-once.
  cleanup_called = {"finished": False}

  @atexit.register
  def cleanup():
    """Shut down the subprocess (when used) and remove the cache directory."""
    if cleanup_called["finished"]:
      return

    if use_subprocess:
      _shutdown(proc)

    try:
      tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
    except tf.errors.NotFoundError:
      # Already deleted; treat as success.
      pass

    cleanup_called["finished"] = True

  # Poll once per second until the `alive` file has been written.
  for _poll in range(300):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)

  alive_file_present = tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive)
  if not alive_file_present:
    raise ValueError(
        "Generation subprocess did not start correctly. Data will "
        "not be available; exiting to avoid waiting forever.")

  # At this point the async process has signaled that it is alive and is
  # looping until the flagfile appears. Clear the system caches, mark the
  # start of the run, and then publish the flagfile.
  mlperf_helper.clear_system_caches()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_START)
  write_flagfile(flags_, ncf_dataset)

  return ncf_dataset, cleanup
def _filter_index_sort(raw_rating_path, match_mlperf):
  # type: (str, bool) -> (pd.DataFrame, dict, dict)
  """Read the raw ratings CSV and produce a filtered, dense, sorted frame.

  The raw CSV of positive items goes through three steps:
    1) Every user with fewer than `rconst.MIN_NUM_RATINGS` ratings is removed.
    2) User and item ids are replaced by zero-based indices (assigned in order
       of first appearance), so that the largest user id is `num_users - 1`
       and the largest item id is `num_items - 1`.
    3) The frame is sorted by user id, then timestamp. Each user's rows are
       thus contiguous, with the most recent rating last.

  The work is done by Pandas. According to the original authors it takes
  roughly two minutes and happens once per run, so a parallel implementation
  was judged not worth the complexity (it could pay off on a larger dataset).

  Args:
    raw_rating_path: The path to the CSV which contains the raw dataset.
    match_mlperf: If True, sort by timestamp first and then do the
      (user, timestamp) sort with mergesort, matching the MLPerf reference
      implementation.

  Returns:
    A tuple `(df, user_map, item_map)`: the processed dataframe (the index
    from before `reset_index()` is retained as a column), a dict from raw user
    id to remapped user id, and a dict from raw item id to remapped item id.
  """
  user_col = movielens.USER_COLUMN
  item_col = movielens.ITEM_COLUMN
  time_col = movielens.TIMESTAMP_COLUMN

  with tf.gfile.Open(raw_rating_path) as ratings_file:
    df = pd.read_csv(ratings_file)

  def _has_enough_ratings(group):
    return len(group) >= rconst.MIN_NUM_RATINGS

  # Drop users that have fewer than rconst.MIN_NUM_RATINGS ratings.
  df = df.groupby(user_col).filter(_has_enough_ratings)  # type: pd.DataFrame

  original_users = df[user_col].unique()
  original_items = df[item_col].unique()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_MIN_RATINGS,
                          value=rconst.MIN_NUM_RATINGS)

  # Assign each raw id its position among the unique ids.
  tf.logging.info("Generating user_map and item_map...")
  user_map = {}
  for new_id, raw_id in enumerate(original_users):
    user_map[raw_id] = new_id
  item_map = {}
  for new_id, raw_id in enumerate(original_items):
    item_map[raw_id] = new_id

  df[user_col] = df[user_col].apply(user_map.__getitem__)
  df[item_col] = df[item_col].apply(item_map.__getitem__)

  num_users = len(original_users)
  num_items = len(original_items)

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_NUM_EVAL,
                          value=num_users * (1 + rconst.NUM_EVAL_NEGATIVES))
  mlperf_helper.ncf_print(
      key=mlperf_helper.TAGS.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
      value=match_mlperf)

  # The counts must fit the integer types checked here, and ids must be dense.
  assert num_users <= np.iinfo(np.int32).max
  assert num_items <= np.iinfo(np.uint16).max
  assert df[user_col].max() == num_users - 1
  assert df[item_col].max() == num_items - 1

  # Sorting makes it possible to shard the frame by user and, later on, to
  # take a user's last item for validation.
  tf.logging.info("Sorting by user, timestamp...")
  if not match_mlperf:
    df.sort_values([user_col, time_col], inplace=True)
  else:
    # Same result as the branch above, except that rows sharing a user and a
    # timestamp may end up in a different relative order. For some reason
    # this ordering yields a better evaluation hit-rate, in line with the
    # MLPerf reference implementation.
    df.sort_values(by=time_col, inplace=True)
    df.sort_values([user_col, time_col], inplace=True, kind="mergesort")

  # The index is not rebuilt by the filter or sort steps; rebuild it now.
  df = df.reset_index()

  return df, user_map, item_map
def construct_model(user_input, item_input, params, need_strip=False):
  # type: (tf.Tensor, tf.Tensor, dict, bool) -> tf.keras.Model
  """Assemble the NeuMF keras model.

  A single embedding table per side (user / item) holds both the MF and the
  MLP portions; Lambda layers slice the first `mf_dim` columns for the GMF
  branch and the remaining columns for the MLP branch.

  Args:
    user_input: keras input layer for users
    item_input: keras input layer for items
    params: Dict of hyperparameters. Reads "num_users", "num_items",
      "model_layers", "mf_regularization", "mlp_reg_layers", "mf_dim" and,
      when `need_strip` is True, "batch_size".
    need_strip: If True, both inputs are first passed through
      `_strip_first_and_last_dimension` (with the batch size) before the
      embedding lookup.

  Raises:
    ValueError: if the first model layer is not even.

  Returns:
    model: a keras Model for computing the logits
  """
  num_users = params["num_users"]
  num_items = params["num_items"]
  model_layers = params["model_layers"]
  mf_regularization = params["mf_regularization"]
  mlp_reg_layers = params["mlp_reg_layers"]
  mf_dim = params["mf_dim"]

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MF_DIM, value=mf_dim)
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MLP_LAYER_SIZES,
                          value=model_layers)

  if model_layers[0] % 2:
    raise ValueError("The first layer size should be multiple of 2!")

  if need_strip:
    batch_size = params["batch_size"]

    def _strip(x):
      return _strip_first_and_last_dimension(x, batch_size)

    user_ids = tf.keras.layers.Lambda(_strip)(user_input)
    item_ids = tf.keras.layers.Lambda(_strip)(item_input)
  else:
    user_ids, item_ids = user_input, item_input

  # Storing the MF and MLP embedding portions in one table and slicing as
  # needed turned out to be significantly more efficient.
  table_width = mf_dim + model_layers[0] // 2

  def _lookup(vocab_size, layer_name, ids):
    """Create one combined embedding layer and apply it to `ids`."""
    table = tf.keras.layers.Embedding(
        vocab_size,
        table_width,
        embeddings_initializer="glorot_uniform",
        embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
        input_length=1,
        name=layer_name)
    return table(ids)

  def _take_mf(x):
    return x[:, :mf_dim]

  def _take_mlp(x):
    return x[:, mf_dim:]

  embedding_user = _lookup(num_users, "embedding_user", user_ids)
  embedding_item = _lookup(num_items, "embedding_item", item_ids)

  # GMF part
  mf_user_latent = tf.keras.layers.Lambda(
      _take_mf, name="embedding_user_mf")(embedding_user)
  mf_item_latent = tf.keras.layers.Lambda(
      _take_mf, name="embedding_item_mf")(embedding_item)

  # MLP part
  mlp_user_latent = tf.keras.layers.Lambda(
      _take_mlp, name="embedding_user_mlp")(embedding_user)
  mlp_item_latent = tf.keras.layers.Lambda(
      _take_mlp, name="embedding_item_mlp")(embedding_item)

  # GMF branch: element-wise product of the two MF latent vectors.
  mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

  # MLP branch: concatenated latent vectors fed through the dense stack.
  # model_layers[0] only sizes the embeddings, so the stack starts at index 1.
  mlp_vector = tf.keras.layers.concatenate([mlp_user_latent, mlp_item_latent])
  for depth in xrange(1, len(model_layers)):
    dense = tf.keras.layers.Dense(
        model_layers[depth],
        kernel_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[depth]),
        activation="relu")
    mlp_vector = dense(mlp_vector)

  # Join both branches and project to a single logit.
  predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])
  output_layer = tf.keras.layers.Dense(
      1,
      activation=None,
      kernel_initializer="lecun_uniform",
      name=movielens.RATING_COLUMN)
  logits = output_layer(predict_vector)

  # Print model topology.
  model = tf.keras.models.Model([user_input, item_input], logits)
  model.summary()
  sys.stdout.flush()

  return model
def construct_model(users, items, params):
  # type: (tf.Tensor, tf.Tensor, dict) -> tf.keras.Model
  """Assemble the NeuMF keras model from user and item id tensors.

  When `params["use_tpu"]` is truthy, one combined variable per side is created
  with `tf.get_variable` and sliced into its MLP part (leading
  `model_layers[0] // 2` columns) and its MF part (following `mf_dim`
  columns). Otherwise four separate keras Embedding layers are used.

  Args:
    users: Tensor of user ids.
    items: Tensor of item ids.
    params: Dict of hyperparameters. Reads "num_users", "num_items",
      "model_layers", "mf_regularization", "mlp_reg_layers", "mf_dim" and
      "use_tpu".

  Raises:
    ValueError: if the first model layer is not even.

  Returns:
    model: a keras Model whose output is the logits
  """
  num_users = params["num_users"]
  num_items = params["num_items"]
  model_layers = params["model_layers"]
  mf_regularization = params["mf_regularization"]
  mlp_reg_layers = params["mlp_reg_layers"]
  mf_dim = params["mf_dim"]

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MF_DIM, value=mf_dim)
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MLP_LAYER_SIZES,
                          value=model_layers)

  if model_layers[0] % 2:
    raise ValueError("The first layer size should be multiple of 2!")

  # Input variables
  user_input = tf.keras.layers.Input(tensor=users)
  item_input = tf.keras.layers.Input(tensor=items)
  batch_size = user_input.get_shape()[0]

  # Each side devotes half of the first MLP layer size to its MLP embedding.
  mlp_dim = model_layers[0] // 2

  if params["use_tpu"]:
    with tf.variable_scope("embed_weights", reuse=tf.AUTO_REUSE):
      cmb_embedding_user = tf.get_variable(
          name="embeddings_mf_user",
          shape=[num_users, mf_dim + mlp_dim],
          initializer=tf.glorot_uniform_initializer())
      cmb_embedding_item = tf.get_variable(
          name="embeddings_mf_item",
          shape=[num_items, mf_dim + mlp_dim],
          initializer=tf.glorot_uniform_initializer())

      def _gather_layer(table):
        """Lambda layer looking up rows of `table` by id."""
        return tf.keras.layers.Lambda(lambda ids: tf.gather(table, ids))

      def _column_slice_layer(first_column, width):
        """Lambda layer keeping `width` columns from `first_column` on."""
        return tf.keras.layers.Lambda(
            lambda x: tf.slice(x, [0, first_column], [batch_size, width]))

      cmb_user_latent = _gather_layer(cmb_embedding_user)(user_input)
      cmb_item_latent = _gather_layer(cmb_embedding_item)(item_input)

      # The MLP portion occupies the leading columns of the combined table.
      mlp_user_latent = _column_slice_layer(0, mlp_dim)(cmb_user_latent)
      mlp_item_latent = _column_slice_layer(0, mlp_dim)(cmb_item_latent)

      # The MF portion follows directly after the MLP portion.
      mf_user_latent = _column_slice_layer(mlp_dim, mf_dim)(cmb_user_latent)
      mf_item_latent = _column_slice_layer(mlp_dim, mf_dim)(cmb_item_latent)
  else:
    def _embedding_layer(vocab_size, width, l2_weight):
      """Build one keras Embedding layer with an L2 regularizer."""
      return tf.keras.layers.Embedding(
          vocab_size,
          width,
          embeddings_initializer="glorot_uniform",
          embeddings_regularizer=tf.keras.regularizers.l2(l2_weight),
          input_length=1)

    # Embedding layers of GMF and MLP
    mf_embedding_user = _embedding_layer(num_users, mf_dim, mf_regularization)
    mf_embedding_item = _embedding_layer(num_items, mf_dim, mf_regularization)
    mlp_embedding_user = _embedding_layer(
        num_users, mlp_dim, mlp_reg_layers[0])
    mlp_embedding_item = _embedding_layer(
        num_items, mlp_dim, mlp_reg_layers[0])

    # GMF part
    mf_user_latent = mf_embedding_user(user_input)
    mf_item_latent = mf_embedding_item(item_input)

    # MLP part
    mlp_user_latent = mlp_embedding_user(user_input)
    mlp_item_latent = mlp_embedding_item(item_input)

  # GMF branch: element-wise product of the MF latent vectors.
  mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

  # MLP branch: concatenate the latent vectors and run the dense stack.
  # Index 0 of model_layers sizes the embeddings, so start at index 1.
  mlp_vector = tf.keras.layers.concatenate([mlp_user_latent, mlp_item_latent])
  for depth in xrange(1, len(model_layers)):
    dense = tf.keras.layers.Dense(
        model_layers[depth],
        kernel_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[depth]),
        activation="relu")
    mlp_vector = dense(mlp_vector)

  # Concatenate GMF and MLP parts, then apply the final prediction layer.
  predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])
  logits = tf.keras.layers.Dense(
      1,
      activation=None,
      kernel_initializer="lecun_uniform",
      name=movielens.RATING_COLUMN)(predict_vector)

  # Print model topology.
  model = tf.keras.models.Model([user_input, item_input], logits)
  model.summary()
  sys.stdout.flush()

  return model
def run_ncf(_):
  """Run the NCF train / evaluate cycles.

  Sets up the data producer (synthetic or real), builds the estimator, and then
  alternates `estimator.train` and `estimator.evaluate` for
  `FLAGS.train_epochs // FLAGS.epochs_between_evals` cycles, stopping early
  once `model_helpers.past_stop_threshold` reports that the hit rate passed
  `FLAGS.hr_threshold`. MLPerf compliance tags are emitted along the way.

  Args:
    _: Unused positional argument.
  """
  use_synthetic = FLAGS.use_synthetic_data
  if FLAGS.download_if_missing and not use_synthetic:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  params = parse_flags(FLAGS)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  if use_synthetic:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    pipeline = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)
    num_users, num_items, producer = pipeline

    batches_per_step = params["batches_per_step"]
    num_train_steps = producer.train_batches_per_epoch // batches_per_step
    num_eval_steps = producer.eval_batches_per_epoch // batches_per_step

    # Every step must consume a whole number of batches.
    assert not producer.train_batches_per_epoch % batches_per_step
    assert not producer.eval_batches_per_epoch % batches_per_step

  # Started on both paths because stop_loop()/join() below are unconditional.
  producer.start()

  params["num_users"] = num_users
  params["num_items"] = num_items

  model_helpers.apply_clean(flags.FLAGS)
  estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)
  benchmark_logger, train_hooks = log_and_get_hooks(params["eval_batch_size"])

  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    # MLPerf logging is only permitted when evaluating after every epoch.
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    estimator.train(
        input_fn=producer.make_input_fn(is_training=True),
        hooks=train_hooks,
        steps=num_train_steps)

    tf.logging.info("Beginning evaluation.")
    eval_input_fn = producer.make_input_fn(is_training=False)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
    tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    loss = float(eval_results["loss"])

    # Per-epoch compliance entries, emitted in a fixed order.
    per_epoch_entries = (
        (mlperf_helper.TAGS.EVAL_TARGET, FLAGS.hr_threshold),
        (mlperf_helper.TAGS.EVAL_ACCURACY, hr),
        (mlperf_helper.TAGS.EVAL_HP_NUM_NEG, rconst.NUM_EVAL_NEGATIVES),
    )
    for tag, entry_value in per_epoch_entries:
      mlperf_helper.ncf_print(
          key=tag, value={"epoch": cycle_index, "value": entry_value})

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
            cycle_index + 1, hr, ndcg, loss))

    # Stop training as soon as the evaluation threshold is met.
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  producer.stop_loop()
  producer.join()

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
def construct_model(users, items, params):
  # type: (tf.Tensor, tf.Tensor, dict) -> tf.keras.Model
  """Create the NeuMF keras model on top of user and item id tensors.

  Each side (user / item) uses one combined embedding table. The first
  `mf_dim` columns feed the GMF branch and the remaining columns feed the MLP
  branch; both branches are concatenated before the final single-unit layer.

  Args:
    users: Tensor of user ids.
    items: Tensor of item ids.
    params: Dict of hyperparameters. Reads "num_users", "num_items",
      "model_layers", "mf_regularization", "mlp_reg_layers" and "mf_dim".

  Raises:
    ValueError: if the first model layer is not even.

  Returns:
    model: a keras Model for computing the logits
  """
  num_users = params["num_users"]
  num_items = params["num_items"]
  model_layers = params["model_layers"]
  mf_regularization = params["mf_regularization"]
  mlp_reg_layers = params["mlp_reg_layers"]
  mf_dim = params["mf_dim"]

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MF_DIM, value=mf_dim)
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MLP_LAYER_SIZES,
                          value=model_layers)

  first_layer_is_even = model_layers[0] % 2 == 0
  if not first_layer_is_even:
    raise ValueError("The first layer size should be multiple of 2!")

  # Input variables
  user_input = tf.keras.layers.Input(tensor=users, name="user_input")
  item_input = tf.keras.layers.Input(tensor=items, name="item_input")

  # Keeping the MF and MLP embedding portions in the same table, and slicing
  # as needed, turned out to be significantly more efficient.
  combined_dim = mf_dim + model_layers[0] // 2
  embedding_options = dict(
      embeddings_initializer="glorot_uniform",
      embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
      input_length=1)

  embedding_user = tf.keras.layers.Embedding(
      num_users, combined_dim, name="embedding_user",
      **embedding_options)(user_input)
  embedding_item = tf.keras.layers.Embedding(
      num_items, combined_dim, name="embedding_item",
      **embedding_options)(item_input)

  def _sliced(slice_fn, layer_name, table_output):
    """Apply a named Lambda layer running `slice_fn` to `table_output`."""
    return tf.keras.layers.Lambda(slice_fn, name=layer_name)(table_output)

  def _mf_columns(x):
    return x[:, :mf_dim]

  def _mlp_columns(x):
    return x[:, mf_dim:]

  # GMF part
  mf_user_latent = _sliced(_mf_columns, "embedding_user_mf", embedding_user)
  mf_item_latent = _sliced(_mf_columns, "embedding_item_mf", embedding_item)

  # MLP part
  mlp_user_latent = _sliced(_mlp_columns, "embedding_user_mlp", embedding_user)
  mlp_item_latent = _sliced(_mlp_columns, "embedding_item_mlp", embedding_item)

  # Element-wise multiply
  mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

  # Concatenation of two latent features, followed by the dense stack. The
  # entry at index 0 of model_layers sizes the embeddings, so it is skipped.
  mlp_vector = tf.keras.layers.concatenate([mlp_user_latent, mlp_item_latent])
  layer_index = 1
  while layer_index < len(model_layers):
    mlp_vector = tf.keras.layers.Dense(
        model_layers[layer_index],
        kernel_regularizer=tf.keras.regularizers.l2(
            mlp_reg_layers[layer_index]),
        activation="relu")(mlp_vector)
    layer_index += 1

  # Concatenate GMF and MLP parts
  predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])

  # Final prediction layer
  logits = tf.keras.layers.Dense(
      1,
      activation=None,
      kernel_initializer="lecun_uniform",
      name=movielens.RATING_COLUMN)(predict_vector)

  # Print model topology.
  model = tf.keras.models.Model([user_input, item_input], logits)
  model.summary()
  sys.stdout.flush()

  return model
def _construct_records(
    is_training,          # type: bool
    train_cycle,          # type: typing.Optional[int]
    num_workers,          # type: int
    cache_paths,          # type: rconst.Paths
    num_readers,          # type: int
    num_neg,              # type: int
    num_positives,        # type: int
    num_items,            # type: int
    epochs_per_cycle,     # type: int
    batch_size,           # type: int
    training_shards,      # type: typing.List[str]
    deterministic=False,  # type: bool
    match_mlperf=False    # type: bool
    ):
  """Generate false negatives and write TFRecords files.

  Shards are processed by a worker pool running `_process_shard`, the results
  are scattered into pre-allocated arrays (shuffled for training, in order for
  evaluation), the arrays are padded to a whole number of batches, and the
  batches are written to one TFRecords file per reader. Finally a JSON "ready"
  file holding the batch size and batch count is written.

  Args:
    is_training: Are training records (True) or eval records (False) created.
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the input_fn. This number is
      approximate; fewer shards will be created if not all shards are assigned
      batches. This can occur due to discretization in the assignment process.
    num_neg: The number of false negatives per positive example.
    num_positives: The number of positive examples. This value is used
      to pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    batch_size: The expected batch size used during training. This is used
      to properly batch data when writing TFRecords.
    training_shards: The picked positive examples from which to generate
      negatives.
    deterministic: If True, shard results are consumed through `pool.imap`
      (submission order); otherwise `pool.imap_unordered` is used.
    match_mlperf: Passed through to `_process_shard` with every shard.

  Raises:
    ValueError: if the batch boundaries do not line up with the padded number
      of points.
  """
  st = timeit.default_timer()

  if is_training:
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_TRAIN_NEG_GEN)
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_NUM_NEG, value=num_neg)

    # set inside _process_shard()
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)

  else:
    # Later logic assumes that all items for a given user are in the same batch.
    assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
    assert num_neg == rconst.NUM_EVAL_NEGATIVES

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_EVAL_NEG_GEN)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            value=num_positives)

  # Only training data may span more than one epoch per cycle.
  assert epochs_per_cycle == 1 or is_training
  # There is one task per (shard, epoch); more workers than tasks are useless.
  num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])

  num_pts = num_positives * (1 + num_neg)

  # Equivalent to `int(ceil(num_pts / batch_size)) * batch_size`, but without
  # precision concerns
  num_pts_with_padding = (num_pts + batch_size - 1) // batch_size * batch_size
  num_padding = num_pts_with_padding - num_pts

  # We choose a different random seed for each process, so that the processes
  # will not all choose the same random numbers.
  process_seeds = [stat_utils.random_int32()
                   for _ in training_shards * epochs_per_cycle]
  map_args = [
      (shard, num_items, num_neg, process_seeds[i], is_training, match_mlperf)
      for i, shard in enumerate(training_shards * epochs_per_cycle)]

  with popen_helper.get_pool(num_workers, init_worker) as pool:
    map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
    data_generator = map_fn(_process_shard, map_args)
    # Three parallel arrays: users (pre-filled with -1 as an "unwritten"
    # sentinel), items, and labels.
    data = [
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int32) - 1,
        np.zeros(shape=(num_pts_with_padding,), dtype=np.uint16),
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int8),
    ]

    # Training data is shuffled. Evaluation data MUST not be shuffled.
    # Downstream processing depends on the fact that evaluation data for a given
    # user is grouped within a batch.
    if is_training:
      index_destinations = np.random.permutation(num_pts)
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    else:
      index_destinations = np.arange(num_pts)

    # Scatter each finished segment into its slice of destination indices.
    # The generator must be drained while the pool is still open.
    start_ind = 0
    for data_segment in data_generator:
      n_in_segment = data_segment[0].shape[0]
      dest = index_destinations[start_ind:start_ind + n_in_segment]
      start_ind += n_in_segment
      for i in range(3):
        data[i][dest] = data_segment[i]

  # Only the padding slots may still hold the -1 sentinel at this point.
  assert np.sum(data[0] == -1) == num_padding

  if is_training:
    if num_padding:
      # In order to have a full batch, randomly include points from earlier in
      # the batch.

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
      pad_sample_indices = np.random.randint(
          low=0, high=num_pts, size=(num_padding,))
      dest = np.arange(start=start_ind, stop=start_ind + num_padding)
      start_ind += num_padding
      for i in range(3):
        data[i][dest] = data[i][pad_sample_indices]
  else:
    # For Evaluation, padding is all zeros. The evaluation input_fn knows how
    # to interpret and discard the zero padded entries.
    data[0][num_pts:] = 0

  # Check that no points were overlooked.
  assert not np.sum(data[0] == -1)

  if is_training:
    # The number of points is slightly larger than num_pts due to padding.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_SIZE,
                            value=int(data[0].shape[0]))
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_BATCH_SIZE,
                            value=batch_size)
  else:
    # num_pts is logged instead of int(data[0].shape[0]), because the size
    # of the data vector includes zero pads which are ignored.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_SIZE, value=num_pts)

  # Assign consecutive batch ids to files, `batches_per_file` at a time.
  batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
  current_file_id = -1
  current_batch_id = -1
  batches_by_file = [[] for _ in range(num_readers)]

  while True:
    current_batch_id += 1
    # Move on to the next file every `batches_per_file` batches.
    if (current_batch_id % batches_per_file) == 0:
      current_file_id += 1

    start_ind = current_batch_id * batch_size
    end_ind = start_ind + batch_size
    if end_ind > num_pts_with_padding:
      # Running past the end is only valid exactly at the padded boundary.
      if start_ind != num_pts_with_padding:
        raise ValueError("Batch padding does not line up")
      break
    batches_by_file[current_file_id].append(current_batch_id)

  # Drop shards which were not assigned batches
  batches_by_file = [i for i in batches_by_file if i]
  num_readers = len(batches_by_file)

  if is_training:
    # Empirically it is observed that placing the batch with repeated values at
    # the start rather than the end improves convergence.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    batches_by_file[0][0], batches_by_file[-1][-1] = \
      batches_by_file[-1][-1], batches_by_file[0][0]

  if is_training:
    template = rconst.TRAIN_RECORD_TEMPLATE
    record_dir = os.path.join(cache_paths.train_epoch_dir,
                              get_cycle_folder_name(train_cycle))
    tf.gfile.MakeDirs(record_dir)
  else:
    template = rconst.EVAL_RECORD_TEMPLATE
    record_dir = cache_paths.eval_data_subdir

  # Write one TFRecords file per reader, one record per batch.
  batch_count = 0
  for i in range(num_readers):
    fpath = os.path.join(record_dir, template.format(i))
    log_msg("Writing {}".format(fpath))
    with tf.python_io.TFRecordWriter(fpath) as writer:
      for j in batches_by_file[i]:
        start_ind = j * batch_size
        end_ind = start_ind + batch_size
        record_kwargs = dict(
            users=data[0][start_ind:end_ind],
            items=data[1][start_ind:end_ind],
        )

        if is_training:
          record_kwargs["labels"] = data[2][start_ind:end_ind]
        else:
          # Eval records carry a duplicate mask instead of labels. It is
          # computed over rows of `num_neg + 1` items.
          record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
              record_kwargs["items"].reshape(-1, num_neg + 1),
              axis=1).flatten().astype(np.int8)

        batch_bytes = _construct_record(**record_kwargs)

        writer.write(batch_bytes)
        batch_count += 1

  # We write to a temp file then atomically rename it to the final file, because
  # writing directly to the final file can cause the main process to read a
  # partially written JSON file.
  ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
  with tf.gfile.Open(ready_file_temp, "w") as f:
    json.dump({
        "batch_size": batch_size,
        "batch_count": batch_count,
    }, f)
  ready_file = os.path.join(record_dir, rconst.READY_FILE)
  tf.gfile.Rename(ready_file_temp, ready_file)

  if is_training:
    log_msg("Cycle {} complete. Total time: {:.1f} seconds"
            .format(train_cycle, timeit.default_timer() - st))
  else:
    log_msg("Eval construction complete. Total time: {:.1f} seconds"
            .format(timeit.default_timer() - st))
def _construct_records(
    is_training,          # type: bool
    train_cycle,          # type: typing.Optional[int]
    num_workers,          # type: int
    cache_paths,          # type: rconst.Paths
    num_readers,          # type: int
    num_neg,              # type: int
    num_positives,        # type: int
    num_items,            # type: int
    epochs_per_cycle,     # type: int
    batch_size,           # type: int
    training_shards,      # type: typing.List[str]
    deterministic=False,  # type: bool
    match_mlperf=False    # type: bool
    ):
  """Generate false negatives and write TFRecords files.

  `_process_shard` is mapped over the training shards in a worker pool. The
  returned segments are gathered into flat user / item / label arrays
  (scattered through a random permutation for training, kept in order for
  eval), padded up to a whole number of batches, and split across at most
  `num_readers` TFRecords files. Finally a JSON "ready" file holding the batch
  size and the number of batches written is placed next to the records.

  The function has no return value; its results are the files it writes.

  Args:
    is_training: Are training records (True) or eval records (False) created.
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the input_fn. This number is
      approximate; fewer shards will be created if not all shards are assigned
      batches. This can occur due to discretization in the assignment process.
    num_neg: The number of false negatives per positive example.
    num_positives: The number of positive examples. This value is used
      to pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    batch_size: The expected batch size used during training. This is used
      to properly batch data when writing TFRecords.
    training_shards: The picked positive examples from which to generate
      negatives.
    deterministic: If True, worker results are consumed with `pool.imap`
      (submission order) rather than `pool.imap_unordered`.
    match_mlperf: Forwarded unchanged to `_process_shard` for every shard.

  Raises:
    ValueError: if the padded point count is not a whole number of batches.
  """
  st = timeit.default_timer()

  if is_training:
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_STEP_TRAIN_NEG_GEN)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_HP_NUM_NEG,
                            value=num_neg)

    # set inside _process_shard()
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
  else:
    # Later logic assumes that all items for a given user are in the same batch.
    assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
    assert num_neg == rconst.NUM_EVAL_NEGATIVES

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_EVAL_NEG_GEN)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            value=num_positives)

  # Only training may build more than one epoch of data per cycle.
  assert epochs_per_cycle == 1 or is_training
  # There is one task per (shard, epoch) pair, so more workers are never useful.
  num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])

  num_pts = num_positives * (1 + num_neg)

  # Equivalent to `int(ceil(num_pts / batch_size)) * batch_size`, but without
  # precision concerns
  num_pts_with_padding = (num_pts + batch_size - 1) // batch_size * batch_size
  num_padding = num_pts_with_padding - num_pts

  # We choose a different random seed for each process, so that the processes
  # will not all choose the same random numbers.
  process_seeds = [
      stat_utils.random_int32() for _ in training_shards * epochs_per_cycle
  ]
  map_args = [(shard, num_items, num_neg, process_seeds[i], is_training,
               match_mlperf)
              for i, shard in enumerate(training_shards * epochs_per_cycle)]

  with popen_helper.get_pool(num_workers, init_worker) as pool:
    map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
    data_generator = map_fn(_process_shard, map_args)
    # data[0] holds users, data[1] items and data[2] labels (see the record
    # construction below). Users start at -1 so that unfilled slots can be
    # counted by the assertions that follow.
    data = [
        np.zeros(shape=(num_pts_with_padding, ), dtype=np.int32) - 1,
        np.zeros(shape=(num_pts_with_padding, ), dtype=np.uint16),
        np.zeros(shape=(num_pts_with_padding, ), dtype=np.int8),
    ]

    # Training data is shuffled. Evaluation data MUST not be shuffled.
    # Downstream processing depends on the fact that evaluation data for a given
    # user is grouped within a batch.
    if is_training:
      index_destinations = np.random.permutation(num_pts)
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    else:
      index_destinations = np.arange(num_pts)

    # Scatter each segment into the next unused run of destination indices.
    start_ind = 0
    for data_segment in data_generator:
      n_in_segment = data_segment[0].shape[0]
      dest = index_destinations[start_ind:start_ind + n_in_segment]
      start_ind += n_in_segment
      for i in range(3):
        data[i][dest] = data_segment[i]

  # Exactly the padding slots should still carry the -1 sentinel.
  assert np.sum(data[0] == -1) == num_padding

  if is_training:
    if num_padding:
      # In order to have a full batch, randomly include points from earlier in
      # the batch.
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
      pad_sample_indices = np.random.randint(
          low=0, high=num_pts, size=(num_padding, ))
      dest = np.arange(start=start_ind, stop=start_ind + num_padding)
      start_ind += num_padding
      for i in range(3):
        data[i][dest] = data[i][pad_sample_indices]
  else:
    # For Evaluation, padding is all zeros. The evaluation input_fn knows how
    # to interpret and discard the zero padded entries.
    data[0][num_pts:] = 0

  # Check that no points were overlooked.
  assert not np.sum(data[0] == -1)

  if is_training:
    # The number of points is slightly larger than num_pts due to padding.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_SIZE,
                            value=int(data[0].shape[0]))
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_BATCH_SIZE,
                            value=batch_size)
  else:
    # num_pts is logged instead of int(data[0].shape[0]), because the size
    # of the data vector includes zero pads which are ignored.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_SIZE, value=num_pts)

  # Assign consecutive batch ids to files, `batches_per_file` at a time. The
  # loop ends once the next batch would start exactly at the end of the data.
  batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
  current_file_id = -1
  current_batch_id = -1
  batches_by_file = [[] for _ in range(num_readers)]

  while True:
    current_batch_id += 1
    if (current_batch_id % batches_per_file) == 0:
      current_file_id += 1

    start_ind = current_batch_id * batch_size
    end_ind = start_ind + batch_size
    if end_ind > num_pts_with_padding:
      if start_ind != num_pts_with_padding:
        raise ValueError("Batch padding does not line up")
      break
    batches_by_file[current_file_id].append(current_batch_id)

  # Drop shards which were not assigned batches
  batches_by_file = [i for i in batches_by_file if i]
  num_readers = len(batches_by_file)

  if is_training:
    # Empirically it is observed that placing the batch with repeated values at
    # the start rather than the end improves convergence.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    batches_by_file[0][0], batches_by_file[-1][-1] = \
      batches_by_file[-1][-1], batches_by_file[0][0]

  if is_training:
    template = rconst.TRAIN_RECORD_TEMPLATE
    record_dir = os.path.join(cache_paths.train_epoch_dir,
                              get_cycle_folder_name(train_cycle))
    tf.gfile.MakeDirs(record_dir)
  else:
    template = rconst.EVAL_RECORD_TEMPLATE
    record_dir = cache_paths.eval_data_subdir

  batch_count = 0
  for i in range(num_readers):
    fpath = os.path.join(record_dir, template.format(i))
    log_msg("Writing {}".format(fpath))
    with tf.python_io.TFRecordWriter(fpath) as writer:
      for j in batches_by_file[i]:
        start_ind = j * batch_size
        end_ind = start_ind + batch_size
        record_kwargs = dict(
            users=data[0][start_ind:end_ind],
            items=data[1][start_ind:end_ind],
        )

        if is_training:
          record_kwargs["labels"] = data[2][start_ind:end_ind]
        else:
          # Eval items are viewed as rows of (num_neg + 1) entries so that
          # duplicates are masked along each row.
          record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
              record_kwargs["items"].reshape(-1, num_neg + 1),
              axis=1).flatten().astype(np.int8)

        batch_bytes = _construct_record(**record_kwargs)

        writer.write(batch_bytes)
        batch_count += 1

  # We write to a temp file then atomically rename it to the final file, because
  # writing directly to the final file can cause the main process to read a
  # partially written JSON file.
  ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
  with tf.gfile.Open(ready_file_temp, "w") as f:
    json.dump({
        "batch_size": batch_size,
        "batch_count": batch_count,
    }, f)
  ready_file = os.path.join(record_dir, rconst.READY_FILE)
  tf.gfile.Rename(ready_file_temp, ready_file)

  if is_training:
    log_msg("Cycle {} complete. Total time: {:.1f} seconds".format(
        train_cycle, timeit.default_timer() - st))
  else:
    log_msg(
        "Eval construction complete. Total time: {:.1f} seconds".format(
            timeit.default_timer() - st))
def run_ncf(_):
  """Run NCF training and eval loop, then export the model as ONNX.

  The estimator is trained and evaluated for up to
  `FLAGS.train_epochs // FLAGS.epochs_between_evals` cycles, stopping early
  once `model_helpers.past_stop_threshold` reports that the hit rate passed
  `FLAGS.hr_threshold`. Afterwards the estimator is exported as a SavedModel,
  the newest export is reloaded and frozen, the frozen graph is converted with
  `process_tf_graph`, and the serialized model bytes are handed to
  `movielens.run_pio_workflow`.

  Args:
    _: Unused.
  """
  params = ncf_common.parse_flags(FLAGS)

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

  benchmark_logger, train_hooks = log_and_get_hooks(
      params["eval_batch_size"])
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    # MLPerf logging is only permitted when evaluating after every epoch.
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    train_input_fn = producer.make_input_fn(is_training=True)
    estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                    steps=num_train_steps)

    logging.info("Beginning evaluation.")
    eval_input_fn = producer.make_input_fn(is_training=False)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
    logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    loss = float(eval_results["loss"])

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                            value={
                                "epoch": cycle_index,
                                "value": FLAGS.hr_threshold
                            })
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={
                                "epoch": cycle_index,
                                "value": hr
                            })
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                            value={
                                "epoch": cycle_index,
                                "value": rconst.NUM_EVAL_NEGATIVES
                            })

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
            cycle_index + 1, hr, ndcg, loss))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  # May be better with shape 1, this is the floor of the input at predict time
  def serving_input_fn():
    """Build a serving receiver with one-element user/item/mask placeholders.

    The same dict is used both as the features and as the receiver tensors.
    """
    x = tf.placeholder(dtype=tf.int64, shape=[1], name=movielens.USER_COLUMN)
    y = tf.placeholder(dtype=tf.int64, shape=[1], name=movielens.ITEM_COLUMN)
    mask = tf.placeholder(dtype=tf.float32, shape=[1], name="duplicate_mask")
    inputs = {
        movielens.USER_COLUMN: x,
        movielens.ITEM_COLUMN: y,
        "duplicate_mask": mask
    }
    return tf.estimator.export.ServingInputReceiver(inputs, inputs)

  saved_model_dir = "saved_model"
  estimator.export_saved_model(saved_model_dir, serving_input_fn)
  print("saved")

  # Pick the last export directory in sorted order, ignoring any directory
  # whose path contains 'temp'.
  # NOTE(review): presumably export directories are timestamp-named so the
  # last one is the newest - confirm.
  subdirs = [
      x for x in Path(saved_model_dir).iterdir()
      if x.is_dir() and 'temp' not in str(x)
  ]
  latest = str(sorted(subdirs)[-1])

  # Reload the export and freeze it with "concat:0" as the only output.
  with tf.Session() as sess_tf:
    loaded = tf.saved_model.loader.load(
        sess_tf, [tf.saved_model.tag_constants.SERVING], latest)
    graph = loaded.graph_def
    output_name = "concat:0"
    tf.import_graph_def(graph, name='')
    print("loaded")
    # Debug output: dumps every node of the loaded graph to stdout.
    for n in graph.node:
      print('\n', n)
    frozen_graph = loader.freeze_session(sess_tf, output_names=[output_name])

  # Start from an empty default graph before importing the frozen graph.
  tf.reset_default_graph()
  with tf.Session() as sess_tf:
    tf.import_graph_def(frozen_graph, name='')
    print(type(frozen_graph))
    # NOTE(review): the input names are hard-coded here while the serving
    # placeholders are named after movielens.USER_COLUMN / ITEM_COLUMN -
    # confirm that these agree.
    onnx_graph = process_tf_graph(sess_tf.graph,
                                  opset=7,
                                  input_names=["userid:0", "itemid:0"],
                                  output_names=[output_name])
    model_proto = onnx_graph.make_model("ncf")
    onnx_model_string = model_proto.SerializeToString()
    # Disabled: write the serialized model to a local file.
    #out_file = open("newNCF.onnx", "wb")
    #out_file.write(onnx_model_string)
    #out_file.close()
    onnx_model_bytes = bytearray(onnx_model_string)
    movielens.run_pio_workflow(onnx_model_bytes, movielens.user_map,
                               movielens.item_map, orig_sys_args)

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  producer.stop_loop()
  producer.join()

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
def neumf_model_fn(features, labels, mode, params):
  """Build the estimator spec for the NeuMF model in the requested mode.

  Args:
    features: Mapping that holds the user column, the item column and, for
      evaluation, the duplicate mask.
    labels: Training labels; only read in TRAIN mode.
    mode: One of the `tf.estimator.ModeKeys` values.
    params: Dict of hyperparameters and run configuration.

  Returns:
    For PREDICT and TRAIN, an `EstimatorSpec` (a `TPUEstimatorSpec` when
    `params["use_tpu"]` is truthy). For EVAL, the result of
    `compute_eval_loss_and_metrics`.

  Raises:
    NotImplementedError: if `mode` is not PREDICT, EVAL or TRAIN.
  """
  mode_keys = tf.estimator.ModeKeys

  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  user_ids = features[movielens.USER_COLUMN]
  item_ids = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)
  logits = construct_model(users=user_ids, items=item_ids, params=params)

  # A two-class softmax whose first logit is fixed at zero is the same as a
  # sigmoid over the remaining logit.
  zero_column = tf.zeros(logits.shape, dtype=logits.dtype)
  softmax_logits = tf.concat([zero_column, logits], axis=1)

  if mode == mode_keys.PREDICT:
    outputs = {
        movielens.ITEM_COLUMN: item_ids,
        movielens.RATING_COLUMN: logits,
    }
    if params["use_tpu"]:
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, predictions=outputs)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=outputs)

  if mode == mode_keys.EVAL:
    dupe_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return compute_eval_loss_and_metrics(
        logits, softmax_logits, dupe_mask, params["num_neg"],
        params["match_mlperf"],
        use_tpu_spec=params["use_tpu"] or params["use_xla_for_gpu"])

  if mode != mode_keys.TRAIN:
    raise NotImplementedError

  # Everything below is the TRAIN path.
  int_labels = tf.cast(labels, tf.int32)

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                          value=params["learning_rate"])
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                          value=params["beta1"])
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                          value=params["beta2"])
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                          value=params["epsilon"])

  optimizer = tf.train.AdamOptimizer(
      learning_rate=params["learning_rate"],
      beta1=params["beta1"],
      beta2=params["beta2"],
      epsilon=params["epsilon"])
  on_tpu = params["use_tpu"]
  if on_tpu:
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                          value=mlperf_helper.TAGS.BCE)
  loss = tf.losses.sparse_softmax_cross_entropy(labels=int_labels,
                                                logits=softmax_logits)

  # Logging hooks look this tensor up by name.
  tf.identity(loss, name="cross_entropy")

  step = tf.train.get_global_step()
  trainable_vars = tf.trainable_variables()
  grads_and_vars = _sparse_to_dense_grads(
      optimizer.compute_gradients(
          loss, trainable_vars, colocate_gradients_with_ops=True))
  apply_op = optimizer.apply_gradients(grads_and_vars,
                                       global_step=step,
                                       name="train")
  pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  train_op = tf.group(apply_op, pending_updates)

  if on_tpu:
    spec_cls = tf.contrib.tpu.TPUEstimatorSpec
  else:
    spec_cls = tf.estimator.EstimatorSpec
  return spec_cls(mode=mode, loss=loss, train_op=train_op)
def construct_model(users, items, params):
  # type: (tf.Tensor, tf.Tensor, dict) -> tf.Tensor
  """Initialize NeuMF model.

  The model has two towers that share the user and item inputs: a GMF tower
  (element-wise product of user and item embeddings) and an MLP tower
  (concatenated user and item embeddings fed through ReLU dense layers). The
  two tower outputs are concatenated and passed through a single linear unit.

  Args:
    users: Tensor of user ids.
    items: Tensor of item ids.
    params: Dict of hyperparameters. The keys read here are "num_users",
      "num_items", "model_layers", "mf_regularization", "mlp_reg_layers" and
      "mf_dim".

  Returns:
    The output tensor of the final one-unit dense layer (the logits).

  Raises:
    ValueError: if the first model layer is not even.
  """
  num_users = params["num_users"]
  num_items = params["num_items"]

  model_layers = params["model_layers"]

  mf_regularization = params["mf_regularization"]
  mlp_reg_layers = params["mlp_reg_layers"]

  mf_dim = params["mf_dim"]

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MF_DIM, value=mf_dim)
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MLP_LAYER_SIZES,
                          value=model_layers)

  # The first MLP layer size is split evenly between the user and the item
  # embedding, so it has to be even.
  if model_layers[0] % 2 != 0:
    raise ValueError("The first layer size should be multiple of 2!")

  # Input variables
  user_input = tf.keras.layers.Input(tensor=users)
  item_input = tf.keras.layers.Input(tensor=items)

  # Initializer for embedding layers
  embedding_initializer = "glorot_uniform"

  # Embedding layers of GMF and MLP
  mf_embedding_user = tf.keras.layers.Embedding(
      num_users,
      mf_dim,
      embeddings_initializer=embedding_initializer,
      embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
      input_length=1)
  mf_embedding_item = tf.keras.layers.Embedding(
      num_items,
      mf_dim,
      embeddings_initializer=embedding_initializer,
      embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
      input_length=1)

  mlp_embedding_user = tf.keras.layers.Embedding(
      num_users,
      model_layers[0] // 2,
      embeddings_initializer=embedding_initializer,
      embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
      input_length=1)
  mlp_embedding_item = tf.keras.layers.Embedding(
      num_items,
      model_layers[0] // 2,
      embeddings_initializer=embedding_initializer,
      embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
      input_length=1)

  # GMF part
  mf_user_latent = mf_embedding_user(user_input)
  mf_item_latent = mf_embedding_item(item_input)
  # Element-wise multiply
  mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

  # MLP part
  mlp_user_latent = mlp_embedding_user(user_input)
  mlp_item_latent = mlp_embedding_item(item_input)
  # Concatenation of two latent features
  mlp_vector = tf.keras.layers.concatenate(
      [mlp_user_latent, mlp_item_latent])

  num_layer = len(model_layers)  # Number of layers in the MLP
  # Layer 0 is the embedding layer built above, so dense layers start at 1.
  # The builtin `range` is used (not `xrange`) so that this works on Python 3
  # without relying on a compatibility import.
  for layer in range(1, num_layer):
    model_layer = tf.keras.layers.Dense(
        model_layers[layer],
        kernel_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[layer]),
        activation="relu")
    mlp_vector = model_layer(mlp_vector)

  # Concatenate GMF and MLP parts
  predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])

  # Final prediction layer
  logits = tf.keras.layers.Dense(
      1,
      activation=None,
      kernel_initializer="lecun_uniform",
      name=movielens.RATING_COLUMN)(predict_vector)

  # Print model topology.
  tf.keras.models.Model([user_input, item_input], logits).summary()
  sys.stdout.flush()

  return logits
def _filter_index_sort(raw_rating_path, cache_path):
  # type: (str, str) -> (dict, bool)
  """Read in data CSV, and output structured data.

  This function reads in the raw CSV of positive items, and performs three
  preprocessing transformations:

  1)  Filter out all users who have not rated at least a certain number
      of items. (Typically 20 items)

  2)  Zero index the users and items such that the largest user_id is
      `num_users - 1` and the largest item_id is `num_items - 1`

  3)  Sort the dataframe by user_id, with timestamp as a secondary sort key.
      This allows the dataframe to be sliced by user in-place, and for the last
      item to be selected simply by calling the `-1` index of a user's slice.

  While all of these transformations are performed by Pandas (and are therefore
  single-threaded), they only take ~2 minutes, and the overhead to apply a
  MapReduce pattern to parallel process the dataset adds significant complexity
  for no computational gain. For a larger dataset parallelizing this
  preprocessing could yield speedups. (Also, this preprocessing step is only
  performed once for an entire run.)

  The result is pickled to `cache_path`. An existing cache file is reused when
  it can be read, is a dict containing every key in `_EXPECTED_CACHE_KEYS`,
  and is not older than `rconst.CACHE_INVALIDATION_SEC`; otherwise it is
  deleted and rebuilt from the CSV.

  Args:
    raw_rating_path: The path to the CSV which contains the raw dataset.
    cache_path: The path to the file where results of this function are saved.

  Returns:
    A tuple `(data, valid_cache)`. `data` is a dict holding the train and eval
    user/item arrays, the raw-to-regularized user and item id maps, and the
    creation time. `valid_cache` is True when `data` came from the cache file
    and False when it was rebuilt.
  """
  valid_cache = tf.io.gfile.exists(cache_path)
  if valid_cache:
    cached_data = None
    try:
      with tf.io.gfile.GFile(cache_path, "rb") as f:
        # NOTE: unpickling can execute arbitrary code; `cache_path` must only
        # ever point at files written by this function.
        cached_data = pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError,
            IndexError):
      # A truncated or corrupt cache (e.g. from an interrupted write) is
      # treated as stale instead of failing every run until it is removed by
      # hand.
      logging.info("Raw data cache file could not be read.")

    valid_cache = (
        isinstance(cached_data, dict)
        and (time.time() - cached_data.get("create_time", 0)
             <= rconst.CACHE_INVALIDATION_SEC)
        and all(key in cached_data for key in _EXPECTED_CACHE_KEYS))

    if not valid_cache:
      logging.info("Removing stale raw data cache file.")
      tf.io.gfile.remove(cache_path)

  if valid_cache:
    data = cached_data
  else:
    with tf.io.gfile.GFile(raw_rating_path) as f:
      df = pd.read_csv(f)

    # Keep only users with at least rconst.MIN_NUM_RATINGS rated items.
    grouped = df.groupby(movielens.USER_COLUMN)
    df = grouped.filter(
        lambda x: len(x) >= rconst.MIN_NUM_RATINGS)  # type: pd.DataFrame

    original_users = df[movielens.USER_COLUMN].unique()
    original_items = df[movielens.ITEM_COLUMN].unique()

    # Map the ids of user and item to 0 based index for following processing
    logging.info("Generating user_map and item_map...")
    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    # Every id in the frame is a key of its map (the maps were built from the
    # frame's own unique values), so a dict-based `map` is a complete lookup.
    df[movielens.USER_COLUMN] = df[movielens.USER_COLUMN].map(user_map)
    df[movielens.ITEM_COLUMN] = df[movielens.ITEM_COLUMN].map(item_map)

    num_users = len(original_users)
    num_items = len(original_items)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_NUM_EVAL,
                            value=rconst.NUM_EVAL_NEGATIVES)

    # The ids must fit the dtypes they are stored with below, and the
    # remapping must have produced a dense 0..n-1 range.
    assert num_users <= np.iinfo(rconst.USER_DTYPE).max
    assert num_items <= np.iinfo(rconst.ITEM_DTYPE).max
    assert df[movielens.USER_COLUMN].max() == num_users - 1
    assert df[movielens.ITEM_COLUMN].max() == num_items - 1

    # This sort is used to shard the dataframe by user, and later to select
    # the last item for a user to be used in validation.
    logging.info("Sorting by user, timestamp...")

    # This sort is equivalent to
    #   df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
    #   inplace=True)
    # except that the order of items with the same user and timestamp are
    # sometimes different. For some reason, this sort results in a better
    # hit-rate during evaluation, matching the performance of the MLPerf
    # reference implementation.
    df.sort_values(by=movielens.TIMESTAMP_COLUMN, inplace=True)
    df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
                   inplace=True, kind="mergesort")

    # The dataframe does not reconstruct indices in the sort or filter steps.
    df = df.reset_index()

    # The last (most recent) row of every user is held out for evaluation;
    # all earlier rows are used for training.
    grouped = df.groupby(movielens.USER_COLUMN, group_keys=False)
    eval_df, train_df = grouped.tail(1), grouped.apply(lambda x: x.iloc[:-1])

    data = {
        rconst.TRAIN_USER_KEY: train_df[movielens.USER_COLUMN]
                               .values.astype(rconst.USER_DTYPE),
        rconst.TRAIN_ITEM_KEY: train_df[movielens.ITEM_COLUMN]
                               .values.astype(rconst.ITEM_DTYPE),
        rconst.EVAL_USER_KEY: eval_df[movielens.USER_COLUMN]
                              .values.astype(rconst.USER_DTYPE),
        rconst.EVAL_ITEM_KEY: eval_df[movielens.ITEM_COLUMN]
                              .values.astype(rconst.ITEM_DTYPE),
        rconst.USER_MAP: user_map,
        rconst.ITEM_MAP: item_map,
        "create_time": time.time(),
    }

    logging.info("Writing raw data cache.")
    with tf.io.gfile.GFile(cache_path, "wb") as f:
      pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

  # TODO(robieta): MLPerf cache clear.
  return data, valid_cache
def run_ncf(_):
  """Run NCF training and eval loop.

  Builds the data pipeline (or synthetic-data step counts), constructs the
  train and eval estimators, and then alternates training and evaluation for
  `FLAGS.train_epochs // FLAGS.epochs_between_evals` cycles, stopping early
  once `model_helpers.past_stop_threshold` reports that the hit rate passed
  `FLAGS.hr_threshold`.

  Args:
    _: Unused.

  Raises:
    ValueError: if the number of batches reported by
      `data_preprocessing.make_input_fn` differs from the expected number of
      train or eval steps.
  """
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)

  # Computed up front because the data pipeline needs to know how many cycles
  # of training data will be requested.
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  # Eval batches must hold whole users: one positive plus the eval negatives.
  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    # `num_cycles` is a required argument of `instantiate_pipeline`; omitting
    # it makes the call fail with a TypeError.
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(
        np.ceil(FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
                (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(
        np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "use_seed": FLAGS.seed is not None,
          "hash_pipeline": FLAGS.hash_pipeline,
          "batch_size": batch_size,
          "eval_batch_size": eval_batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": num_users,
          "num_items": num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "num_neg": FLAGS.num_neg,
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
          "beta1": FLAGS.beta1,
          "beta2": FLAGS.beta2,
          "epsilon": FLAGS.epsilon,
          "match_mlperf": FLAGS.ml_perf,
          "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(model_name="recommendation",
                                dataset_name=FLAGS.dataset,
                                run_params=run_params,
                                test_id=FLAGS.benchmark_test_id)

  # The eval input_fn is created on the first cycle and reused afterwards.
  pred_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    # MLPerf logging is only permitted when evaluating after every epoch.
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
      data_preprocessing.make_input_fn(
          ncf_dataset=ncf_dataset, is_training=True)

    if batch_count != num_train_steps:
      raise ValueError(
          "Step counts do not match. ({} vs. {}) The async process is "
          "producing incorrect shards.".format(batch_count, num_train_steps))

    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=num_train_steps)
    # The records of a finished cycle are no longer needed.
    if train_record_dir:
      tf.gfile.DeleteRecursively(train_record_dir)

    tf.logging.info("Beginning evaluation.")
    if pred_input_fn is None:
      pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
          ncf_dataset=ncf_dataset, is_training=False)

      if eval_batch_count != num_eval_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(
                eval_batch_count, num_eval_steps))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = eval_estimator.evaluate(pred_input_fn, steps=num_eval_steps)
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    tf.logging.info("Evaluation complete.")

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                            value={
                                "epoch": cycle_index,
                                "value": FLAGS.hr_threshold
                            })
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={
                                "epoch": cycle_index,
                                "value": hr
                            })
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                            value={
                                "epoch": cycle_index,
                                "value": rconst.NUM_EVAL_NEGATIVES
                            })

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
def run_ncf(_):
    """Run the NCF train/evaluate cycles, stopping early at the HR target.

    Sets up the input data (real pipeline or synthetic placeholders), builds
    either a train/eval Estimator pair or an ``NcfModelRunner`` depending on
    ``FLAGS.use_estimator``, and then alternates one training pass and one
    evaluation pass per cycle.  The loop ends after
    ``FLAGS.train_epochs // FLAGS.epochs_between_evals`` cycles, or sooner
    when ``model_helpers.past_stop_threshold`` accepts the measured hit rate.

    Args:
        _: Ignored; never referenced in the body.

    Raises:
        ValueError: If ``data_preprocessing.make_input_fn`` reports a batch
            count that differs from the step count computed here.
    """
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    seeded = FLAGS.seed is not None
    if seeded:
        np.random.seed(FLAGS.seed)

    gpu_count = flags_core.get_num_gpus(FLAGS)
    per_device_batch = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), gpu_count)
    cycle_total = FLAGS.train_epochs // FLAGS.epochs_between_evals

    # The eval batch is trimmed down to a multiple of the per-user example
    # count (NUM_EVAL_NEGATIVES + 1).
    examples_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(
        FLAGS.eval_batch_size or max(FLAGS.batch_size, examples_per_user))
    leftover = eval_batch_size % examples_per_user
    if leftover:
        eval_batch_size -= leftover
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    if FLAGS.use_synthetic_data:
        ncf_dataset = None

        def cleanup_fn():
            """No pipeline was started, so there is nothing to clean up."""
            return None

        num_users, num_items = (
            data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[FLAGS.dataset])
        num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            batch_size=per_device_batch,
            eval_batch_size=eval_batch_size,
            num_neg=FLAGS.num_neg,
            epochs_per_cycle=FLAGS.epochs_between_evals,
            num_cycles=cycle_total,
            match_mlperf=FLAGS.ml_perf,
            deterministic=seeded,
            use_subprocess=FLAGS.use_subprocess,
            cache_id=FLAGS.cache_id)
        num_users = ncf_dataset.num_users
        num_items = ncf_dataset.num_items

        # Step counts are the example totals divided by the batch size,
        # rounded up.
        train_examples = (FLAGS.epochs_between_evals
                          * ncf_dataset.num_train_positives
                          * (1 + FLAGS.num_neg))
        num_train_steps = int(np.ceil(train_examples / FLAGS.batch_size))
        eval_examples = (1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users
        num_eval_steps = int(np.ceil(eval_examples / eval_batch_size))

    model_helpers.apply_clean(flags.FLAGS)

    params = {
        "use_seed": seeded,
        "hash_pipeline": FLAGS.hash_pipeline,
        "batch_size": per_device_batch,
        "eval_batch_size": eval_batch_size,
        "learning_rate": FLAGS.learning_rate,
        "num_users": num_users,
        "num_items": num_items,
        "mf_dim": FLAGS.num_factors,
        "model_layers": list(map(int, FLAGS.layers)),
        "mf_regularization": FLAGS.mf_regularization,
        "mlp_reg_layers": list(map(float, FLAGS.mlp_regularization)),
        "num_neg": FLAGS.num_neg,
        "use_tpu": FLAGS.tpu is not None,
        "tpu": FLAGS.tpu,
        "tpu_zone": FLAGS.tpu_zone,
        "tpu_gcp_project": FLAGS.tpu_gcp_project,
        "beta1": FLAGS.beta1,
        "beta2": FLAGS.beta2,
        "epsilon": FLAGS.epsilon,
        "match_mlperf": FLAGS.ml_perf,
        "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
        "use_estimator": FLAGS.use_estimator,
    }

    if FLAGS.use_estimator:
        train_estimator, eval_estimator = construct_estimator(
            num_gpus=gpu_count,
            model_dir=FLAGS.model_dir,
            iterations=num_train_steps,
            params=params,
            batch_size=flags.FLAGS.batch_size,
            eval_batch_size=eval_batch_size)
    else:
        runner = model_runner.NcfModelRunner(
            ncf_dataset, params, num_train_steps, num_eval_steps,
            FLAGS.use_while_loop)

    # Hooks that log training information and metric values.
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})

    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(
        model_name="recommendation",
        dataset_name=FLAGS.dataset,
        run_params=run_params,
        test_id=FLAGS.benchmark_test_id)

    tags = mlperf_helper.TAGS
    eval_input_fn = None
    target_reached = False
    mlperf_helper.ncf_print(key=tags.TRAIN_LOOP)
    for cycle_index in range(cycle_total):
        assert not (FLAGS.epochs_between_evals != 1
                    and mlperf_helper.LOGGER.enabled)
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, cycle_total))
        mlperf_helper.ncf_print(key=tags.TRAIN_EPOCH, value=cycle_index)

        if not FLAGS.use_estimator:
            # Runner path: the runner owns both training and evaluation.
            runner.train()
            tf.logging.info("Beginning evaluation.")
            mlperf_helper.ncf_print(key=tags.EVAL_START, value=cycle_index)
            eval_results = runner.eval()
            tf.logging.info("Evaluation complete.")
        else:
            # Estimator path: train on freshly made input, then evaluate.
            train_inputs = data_preprocessing.make_input_fn(
                ncf_dataset=ncf_dataset, is_training=True)
            train_input_fn, train_record_dir, batch_count = train_inputs
            if batch_count != num_train_steps:
                raise ValueError(
                    "Step counts do not match. ({} vs. {}) The async process "
                    "is producing incorrect shards.".format(
                        batch_count, num_train_steps))

            train_estimator.train(
                input_fn=train_input_fn,
                hooks=train_hooks,
                steps=num_train_steps)
            if train_record_dir:
                tf.gfile.DeleteRecursively(train_record_dir)

            tf.logging.info("Beginning evaluation.")
            if eval_input_fn is None:
                # The eval input function is created once and then reused.
                eval_inputs = data_preprocessing.make_input_fn(
                    ncf_dataset=ncf_dataset, is_training=False)
                eval_input_fn, _, eval_batch_count = eval_inputs
                if eval_batch_count != num_eval_steps:
                    raise ValueError(
                        "Step counts do not match. ({} vs. {}) The async "
                        "process is producing incorrect shards.".format(
                            eval_batch_count, num_eval_steps))

            mlperf_helper.ncf_print(key=tags.EVAL_START, value=cycle_index)
            eval_results = eval_estimator.evaluate(
                eval_input_fn, steps=num_eval_steps)
            tf.logging.info("Evaluation complete.")

        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])

        mlperf_helper.ncf_print(
            key=tags.EVAL_TARGET,
            value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
        mlperf_helper.ncf_print(
            key=tags.EVAL_ACCURACY,
            value={"epoch": cycle_index, "value": hr})
        mlperf_helper.ncf_print(
            key=tags.EVAL_HP_NUM_NEG,
            value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

        # Logged by the async process during record creation.
        mlperf_helper.ncf_print(key=tags.EVAL_HP_NUM_USERS, deferred=True)

        mlperf_helper.ncf_print(key=tags.EVAL_STOP, value=cycle_index)

        # Benchmark the evaluation results.
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # Stop as soon as the evaluation threshold is met.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=tags.RUN_STOP,
                            value={"success": target_reached})
    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=tags.RUN_FINAL)
def run_ncf(_):
    """Alternate NCF training and evaluation until done or on target.

    Parses the flags into ``params``, obtains the step counts and the data
    producer from ``ncf_common.get_inputs``, starts the producer, and then
    runs one ``estimator.train`` / ``estimator.evaluate`` pair per cycle.
    The loop stops early once ``model_helpers.past_stop_threshold`` accepts
    the measured hit rate; afterwards the producer is stopped and joined.

    Args:
        _: Ignored; never referenced in the body.
    """
    params = ncf_common.parse_flags(FLAGS)

    inputs = ncf_common.get_inputs(params)
    num_users, num_items, num_train_steps, num_eval_steps, producer = inputs
    params["num_users"] = num_users
    params["num_items"] = num_items

    producer.start()

    model_helpers.apply_clean(flags.FLAGS)

    estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)
    benchmark_logger, train_hooks = log_and_get_hooks(
        params["eval_batch_size"])

    cycle_total = FLAGS.train_epochs // FLAGS.epochs_between_evals
    tags = mlperf_helper.TAGS
    target_reached = False
    mlperf_helper.ncf_print(key=tags.TRAIN_LOOP)
    for cycle_index in range(cycle_total):
        assert not (FLAGS.epochs_between_evals != 1
                    and mlperf_helper.LOGGER.enabled)
        logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, cycle_total))
        mlperf_helper.ncf_print(key=tags.TRAIN_EPOCH, value=cycle_index)

        # A fresh input function is requested from the producer every cycle.
        estimator.train(
            input_fn=producer.make_input_fn(is_training=True),
            hooks=train_hooks,
            steps=num_train_steps)

        logging.info("Beginning evaluation.")
        eval_input_fn = producer.make_input_fn(is_training=False)

        mlperf_helper.ncf_print(key=tags.EVAL_START, value=cycle_index)
        eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
        logging.info("Evaluation complete.")

        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        loss = float(eval_results["loss"])

        # Per-epoch compliance entries, emitted in this fixed order.
        epoch_entries = (
            (tags.EVAL_TARGET, FLAGS.hr_threshold),
            (tags.EVAL_ACCURACY, hr),
            (tags.EVAL_HP_NUM_NEG, rconst.NUM_EVAL_NEGATIVES),
        )
        for tag, entry_value in epoch_entries:
            mlperf_helper.ncf_print(
                key=tag, value={"epoch": cycle_index, "value": entry_value})

        mlperf_helper.ncf_print(key=tags.EVAL_STOP, value=cycle_index)

        # Benchmark the evaluation results.
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        logging.info(
            "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
                cycle_index + 1, hr, ndcg, loss))

        # Stop as soon as the evaluation threshold is met.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=tags.RUN_STOP,
                            value={"success": target_reached})
    producer.stop_loop()
    producer.join()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=tags.RUN_FINAL)
def neumf_model_fn(features, labels, mode, params):
    """Model function for the NeuMF Estimator.

    Builds the model from the user and item feature columns and returns
    either the evaluation result or the training spec, depending on ``mode``.

    Args:
        features: Mapping indexed by ``movielens.USER_COLUMN`` and
            ``movielens.ITEM_COLUMN``, plus ``rconst.DUPLICATE_MASK`` in EVAL
            mode or ``rconst.VALID_POINT_MASK`` in TRAIN mode.
        labels: Training labels; cast to ``tf.int32``. Only read in TRAIN
            mode.
        mode: A ``tf.estimator.ModeKeys`` value. Only EVAL and TRAIN are
            handled.
        params: Hyperparameter mapping. Keys read here: ``use_seed``
            (optional), ``num_neg``, ``match_mlperf``, ``use_xla_for_gpu``,
            ``learning_rate``, ``beta1``, ``beta2``, ``epsilon`` and
            ``use_tpu``. The whole mapping is also passed to
            ``construct_model``.

    Returns:
        In EVAL mode, whatever ``compute_eval_loss_and_metrics`` returns.
        In TRAIN mode, a ``tf.estimator.EstimatorSpec`` carrying the loss
        and the train op.

    Raises:
        NotImplementedError: For any mode other than EVAL or TRAIN.
    """
    if params.get("use_seed"):
        tf.set_random_seed(stat_utils.random_int32())

    users = features[movielens.USER_COLUMN]
    items = features[movielens.ITEM_COLUMN]

    logits = construct_model(users, items, params).output

    # Softmax with the first column of zeros is equivalent to sigmoid.
    # zeros_like takes its shape from the tensor at run time, so this also
    # works when the static shape has unknown dimensions (e.g. a dynamic
    # batch size), where tf.zeros(logits.shape) would raise.
    softmax_logits = tf.concat([tf.zeros_like(logits), logits], axis=1)

    if mode == tf.estimator.ModeKeys.EVAL:
        duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
        return compute_eval_loss_and_metrics(
            logits, softmax_logits, duplicate_mask, params["num_neg"],
            params["match_mlperf"],
            use_tpu_spec=params["use_xla_for_gpu"])

    elif mode == tf.estimator.ModeKeys.TRAIN:
        labels = tf.cast(labels, tf.int32)
        valid_pt_mask = features[rconst.VALID_POINT_MASK]

        # Record the optimizer configuration in the MLPerf log.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                                value=params["learning_rate"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                                value=params["beta1"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                                value=params["beta2"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                                value=params["epsilon"])

        optimizer = tf.train.AdamOptimizer(
            learning_rate=params["learning_rate"],
            beta1=params["beta1"],
            beta2=params["beta2"],
            epsilon=params["epsilon"])
        if params["use_tpu"]:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                                value=mlperf_helper.TAGS.BCE)
        # The valid-point mask is applied as per-example loss weights.
        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=softmax_logits,
            weights=tf.cast(valid_pt_mask, tf.float32)
        )

        # This tensor is used by logging hooks.
        tf.identity(loss, name="cross_entropy")

        global_step = tf.train.get_global_step()
        tvars = tf.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        # NOTE(review): presumably converts sparse gradients to dense ones,
        # going by the helper's name -- confirm against its definition.
        gradients = _sparse_to_dense_grads(gradients)
        minimize_op = optimizer.apply_gradients(
            gradients, global_step=global_step, name="train")
        # Group the collected UPDATE_OPS with the optimizer step so they run
        # together.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)

        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    else:
        raise NotImplementedError
def run_ncf(_):
    """Run the NCF train/eval cycles with a producer-backed input pipeline.

    Optionally downloads the dataset and seeds NumPy, builds ``params`` from
    the flags, creates either a dummy producer (synthetic data) or the real
    pipeline producer, and then runs one ``estimator.train`` /
    ``estimator.evaluate`` pair per cycle.  The loop stops early once
    ``model_helpers.past_stop_threshold`` accepts the measured hit rate;
    afterwards the producer is stopped and joined.

    Args:
        _: Ignored; never referenced in the body.
    """
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    params = parse_flags(FLAGS)
    cycle_total = FLAGS.train_epochs // FLAGS.epochs_between_evals

    if FLAGS.use_synthetic_data:
        producer = data_pipeline.DummyConstructor()
        num_users, num_items = (
            data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[FLAGS.dataset])
        num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        num_users, num_items, producer = (
            data_preprocessing.instantiate_pipeline(
                dataset=FLAGS.dataset,
                data_dir=FLAGS.data_dir,
                params=params,
                constructor_type=FLAGS.constructor_type,
                deterministic=FLAGS.seed is not None))

        # Batches per epoch must split evenly into steps; the remainders are
        # asserted to be zero after both quotients are computed.
        num_train_steps, train_remainder = divmod(
            producer.train_batches_per_epoch, params["batches_per_step"])
        num_eval_steps, eval_remainder = divmod(
            producer.eval_batches_per_epoch, params["batches_per_step"])
        assert not train_remainder
        assert not eval_remainder

    producer.start()

    params["num_users"] = num_users
    params["num_items"] = num_items

    model_helpers.apply_clean(flags.FLAGS)

    estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)
    benchmark_logger, train_hooks = log_and_get_hooks(
        params["eval_batch_size"])

    def log_epoch_value(tag, entry_value, epoch):
        """Emit one MLPerf entry holding a value for the given epoch."""
        mlperf_helper.ncf_print(
            key=tag, value={"epoch": epoch, "value": entry_value})

    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(cycle_total):
        assert not (FLAGS.epochs_between_evals != 1
                    and mlperf_helper.LOGGER.enabled)
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, cycle_total))
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                value=cycle_index)

        train_input_fn = producer.make_input_fn(is_training=True)
        estimator.train(
            input_fn=train_input_fn,
            hooks=train_hooks,
            steps=num_train_steps)

        tf.logging.info("Beginning evaluation.")
        eval_input_fn = producer.make_input_fn(is_training=False)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
        tf.logging.info("Evaluation complete.")

        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        loss = float(eval_results["loss"])

        log_epoch_value(
            mlperf_helper.TAGS.EVAL_TARGET, FLAGS.hr_threshold, cycle_index)
        log_epoch_value(mlperf_helper.TAGS.EVAL_ACCURACY, hr, cycle_index)
        log_epoch_value(
            mlperf_helper.TAGS.EVAL_HP_NUM_NEG, rconst.NUM_EVAL_NEGATIVES,
            cycle_index)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                value=cycle_index)

        # Benchmark the evaluation results.
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info(
            "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
                cycle_index + 1, hr, ndcg, loss))

        # Stop as soon as the evaluation threshold is met.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
    producer.stop_loop()
    producer.join()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
def neumf_model_fn(features, labels, mode, params):
    """Build the NeuMF graph and return the result for the requested mode.

    Args:
        features: Mapping indexed by ``movielens.USER_COLUMN`` and
            ``movielens.ITEM_COLUMN``, plus ``rconst.DUPLICATE_MASK`` in EVAL
            mode or ``rconst.VALID_POINT_MASK`` in TRAIN mode.
        labels: Training labels; cast to ``tf.int32``. Only read in TRAIN
            mode.
        mode: A ``tf.estimator.ModeKeys`` value. Only EVAL and TRAIN are
            handled.
        params: Hyperparameter mapping; also passed to ``construct_model``.

    Returns:
        In EVAL mode, whatever ``_get_estimator_spec_with_metrics`` returns.
        In TRAIN mode, a ``tf.estimator.EstimatorSpec`` carrying the loss
        and the train op.

    Raises:
        NotImplementedError: For any mode other than EVAL or TRAIN.
    """
    if params.get("use_seed"):
        tf.set_random_seed(stat_utils.random_int32())

    user_ids = features[movielens.USER_COLUMN]
    item_ids = features[movielens.ITEM_COLUMN]

    # Wrap the feature tensors as Keras inputs before building the model.
    keras_model = construct_model(
        tf.keras.layers.Input(tensor=user_ids),
        tf.keras.layers.Input(tensor=item_ids),
        params)
    logits = keras_model.output

    # Softmax with the first column of zeros is equivalent to sigmoid.
    softmax_logits = ncf_common.convert_to_softmax_logits(logits)

    mode_keys = tf.estimator.ModeKeys
    if mode == mode_keys.EVAL:
        duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
        return _get_estimator_spec_with_metrics(
            logits,
            softmax_logits,
            duplicate_mask,
            params["num_neg"],
            params["match_mlperf"],
            use_tpu_spec=params["use_xla_for_gpu"])

    if mode != mode_keys.TRAIN:
        raise NotImplementedError

    # ---- TRAIN ----
    labels = tf.cast(labels, tf.int32)
    valid_pt_mask = features[rconst.VALID_POINT_MASK]

    # Record the optimizer configuration in the MLPerf log; each value is
    # read right before it is logged and reused for the optimizer below.
    tags = mlperf_helper.TAGS
    mlperf_helper.ncf_print(key=tags.OPT_NAME, value="adam")
    learning_rate = params["learning_rate"]
    mlperf_helper.ncf_print(key=tags.OPT_LR, value=learning_rate)
    beta1 = params["beta1"]
    mlperf_helper.ncf_print(key=tags.OPT_HP_ADAM_BETA1, value=beta1)
    beta2 = params["beta2"]
    mlperf_helper.ncf_print(key=tags.OPT_HP_ADAM_BETA2, value=beta2)
    epsilon = params["epsilon"]
    mlperf_helper.ncf_print(key=tags.OPT_HP_ADAM_EPSILON, value=epsilon)

    optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=learning_rate,
        beta1=beta1,
        beta2=beta2,
        epsilon=epsilon)
    if params["use_tpu"]:
        # TODO(seemuch): remove this contrib import
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    mlperf_helper.ncf_print(key=tags.MODEL_HP_LOSS_FN, value=tags.BCE)
    # The valid-point mask is applied as per-example loss weights.
    loss_weights = tf.cast(valid_pt_mask, tf.float32)
    loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        labels=labels, logits=softmax_logits, weights=loss_weights)

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.compat.v1.train.get_global_step()
    trainable = tf.compat.v1.trainable_variables()
    grads_and_vars = optimizer.compute_gradients(
        loss, trainable, colocate_gradients_with_ops=True)
    grads_and_vars = _sparse_to_dense_grads(grads_and_vars)
    minimize_op = optimizer.apply_gradients(
        grads_and_vars, global_step=global_step, name="train")

    # Group the collected UPDATE_OPS with the optimizer step.
    update_ops = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def run_ncf(_):
    """Alternate NCF training and evaluation until done or on target.

    Parses the flags into ``params``, obtains the step counts and the data
    producer from ``ncf_common.get_inputs``, starts the producer, and then
    runs one ``estimator.train`` / ``estimator.evaluate`` pair per cycle.
    The loop stops early once ``model_helpers.past_stop_threshold`` accepts
    the measured hit rate; afterwards the producer is stopped and joined.

    Args:
        _: Ignored; never referenced in the body.
    """
    params = ncf_common.parse_flags(FLAGS)

    inputs = ncf_common.get_inputs(params)
    num_users, num_items, num_train_steps, num_eval_steps, producer = inputs
    params["num_users"] = num_users
    params["num_items"] = num_items

    producer.start()

    model_helpers.apply_clean(flags.FLAGS)

    estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)
    benchmark_logger, train_hooks = log_and_get_hooks(
        params["eval_batch_size"])

    cycle_total = FLAGS.train_epochs // FLAGS.epochs_between_evals
    tags = mlperf_helper.TAGS
    target_reached = False
    mlperf_helper.ncf_print(key=tags.TRAIN_LOOP)
    for cycle_index in range(cycle_total):
        assert not (FLAGS.epochs_between_evals != 1
                    and mlperf_helper.LOGGER.enabled)
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, cycle_total))
        mlperf_helper.ncf_print(key=tags.TRAIN_EPOCH, value=cycle_index)

        # A fresh input function is requested from the producer every cycle.
        estimator.train(
            input_fn=producer.make_input_fn(is_training=True),
            hooks=train_hooks,
            steps=num_train_steps)

        tf.logging.info("Beginning evaluation.")
        eval_input_fn = producer.make_input_fn(is_training=False)

        mlperf_helper.ncf_print(key=tags.EVAL_START, value=cycle_index)
        eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
        tf.logging.info("Evaluation complete.")

        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        loss = float(eval_results["loss"])

        # Per-epoch compliance entries, emitted in this fixed order.
        epoch_entries = (
            (tags.EVAL_TARGET, FLAGS.hr_threshold),
            (tags.EVAL_ACCURACY, hr),
            (tags.EVAL_HP_NUM_NEG, rconst.NUM_EVAL_NEGATIVES),
        )
        for tag, entry_value in epoch_entries:
            mlperf_helper.ncf_print(
                key=tag, value={"epoch": cycle_index, "value": entry_value})

        mlperf_helper.ncf_print(key=tags.EVAL_STOP, value=cycle_index)

        # Benchmark the evaluation results.
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info(
            "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
                cycle_index + 1, hr, ndcg, loss))

        # Stop as soon as the evaluation threshold is met.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=tags.RUN_STOP,
                            value={"success": target_reached})
    producer.stop_loop()
    producer.join()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=tags.RUN_FINAL)