def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False):
  """Preprocess data and start negative generation subprocess."""
  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers,
                                match_mlperf=match_mlperf)

  tf.logging.info("Creating training file subprocess.")

  subproc_env = os.environ.copy()

  # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
  # resources and by default will try to allocate GPU memory. This would cause
  # contention with the main training process.
  subproc_env["CUDA_VISIBLE_DEVICES"] = ""

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

  subproc_args = popen_helper.INVOCATION + [
      "--data_dir", data_dir,
      "--cache_id", str(ncf_dataset.cache_paths.cache_id),
      "--num_neg", str(num_neg),
      "--num_train_positives", str(ncf_dataset.num_train_positives),
      "--num_items", str(ncf_dataset.num_items),
      "--num_readers", str(ncf_dataset.num_data_readers),
      "--epochs_per_cycle", str(epochs_per_cycle),
      "--train_batch_size", str(batch_size),
      "--eval_batch_size", str(eval_batch_size),
      "--num_workers", str(num_workers),
      "--spillover", "True",  # This allows the training input function to
                              # guarantee batch size and significantly improves
                              # performance. (~5% increase in examples/sec on
                              # GPU, and needed for TPU XLA.)
      "--redirect_logs", "True",
      "--seed", str(int(stat_utils.random_int32()))
  ]

  tf.logging.info(
      "Generation subprocess command: {}".format(" ".join(subproc_args)))

  proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  atexit.register(_shutdown, proc=proc)
  atexit.register(tf.gfile.DeleteRecursively,
                  ncf_dataset.cache_paths.cache_root)

  for _ in range(15):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)  # allow `alive` file to be written
  if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
    raise ValueError("Generation subprocess did not start correctly. Data will "
                     "not be available; exiting to avoid waiting forever.")

  return ncf_dataset

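# The `_shutdown` helper registered with atexit above is not included in this
# collection. A minimal sketch, assuming it only needs to stop the generation
# subprocess and tolerate it already having exited (the real helper may do
# more, e.g. send a specific signal or wait with a timeout):
def _shutdown(proc):
  """Stop the async data generation subprocess (sketch)."""
  if proc.poll() is not None:
    return  # subprocess already exited
  try:
    proc.terminate()  # request shutdown
    proc.wait()       # reap the subprocess
  except OSError:
    pass              # raced with the subprocess exiting on its own
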
def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = features[movielens.ITEM_COLUMN]

  user_input = tf.keras.layers.Input(tensor=users)
  item_input = tf.keras.layers.Input(tensor=items)
  logits = construct_model(user_input, item_input, params).output

  # Softmax with the first column of zeros is equivalent to sigmoid.
  softmax_logits = ncf_common.convert_to_softmax_logits(logits)

  if mode == tf.estimator.ModeKeys.EVAL:
    duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return _get_estimator_spec_with_metrics(
        logits,
        softmax_logits,
        duplicate_mask,
        params["num_neg"],
        params["match_mlperf"],
        use_tpu_spec=params["use_tpu"])
  elif mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)
    valid_pt_mask = features[rconst.VALID_POINT_MASK]

    optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=params["learning_rate"],
        beta1=params["beta1"],
        beta2=params["beta2"],
        epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)

    loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        labels=labels,
        logits=softmax_logits,
        weights=tf.cast(valid_pt_mask, tf.float32)
    )

    tf.identity(loss, name="cross_entropy")

    global_step = tf.compat.v1.train.get_global_step()
    tvars = tf.compat.v1.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    gradients = sparse_to_dense_grads(gradients)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
  else:
    raise NotImplementedError

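# `ncf_common.convert_to_softmax_logits` is referenced above but not included
# in this collection. Based on the inline versions used by the other variants
# below ("Softmax with the first column of zeros is equivalent to sigmoid."),
# a plausible sketch is simply:
def convert_to_softmax_logits(logits):
  """Prepend a zero column so a 2-class softmax reproduces sigmoid(logits)."""
  # softmax([0, x]) = [1 - sigmoid(x), sigmoid(x)], so the probability of
  # class 1 matches the sigmoid of the raw logit.
  return tf.concat([tf.zeros(logits.shape, dtype=logits.dtype), logits],
                   axis=1)
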
def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)

  logits = construct_model(users=users, items=items, params=params)

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        movielens.ITEM_COLUMN: items,
        movielens.RATING_COLUMN: logits,
    }

    if params["use_tpu"]:
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                             predictions=predictions)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  elif mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)
    optimizer = tf.train.AdamOptimizer(
        learning_rate=params["learning_rate"], beta1=params["beta1"],
        beta2=params["beta2"], epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    # Softmax with the first column of zeros is equivalent to sigmoid.
    logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype), logits],
                       axis=1)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    minimize_op = optimizer.apply_gradients(gradients, global_step=global_step,
                                            name="train")
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    if params["use_tpu"]:
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, loss=loss,
                                             train_op=train_op)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  else:
    raise NotImplementedError

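# A quick NumPy check (not part of the original code) of why the prepended
# zero column makes a 2-class softmax equivalent to a sigmoid over the raw
# logit; the value of x is arbitrary:
def _check_softmax_equals_sigmoid(x=1.7):
  two_class = np.array([0.0, x])
  softmax = np.exp(two_class) / np.sum(np.exp(two_class))
  sigmoid = 1.0 / (1.0 + np.exp(-x))
  assert np.isclose(softmax[1], sigmoid)  # both ~0.8455 for x = 1.7
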
def _start_shuffle_iterator(self):
  if self._shuffle_with_forkpool:
    pool = popen_helper.get_forkpool(3, closing=False)
  else:
    pool = popen_helper.get_threadpool(1, closing=False)
  atexit.register(pool.close)
  args = [(self._elements_in_epoch, stat_utils.random_int32())
          for _ in range(self._maximum_number_epochs)]
  imap = pool.imap if self.deterministic else pool.imap_unordered
  self._shuffle_iterator = imap(stat_utils.permutation, args)

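# `stat_utils.permutation` is consumed through pool.imap above, so it must
# accept a single (num_elements, seed) tuple. A minimal sketch consistent with
# that call site (the real helper may be more memory-conscious):
def permutation(args):
  """Return a shuffled index array for one epoch, seeded per epoch."""
  num_elements, seed = args
  state = np.random.RandomState(seed=seed)
  return state.permutation(num_elements)
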
def test_shard_randomness(self):
  users = [0, 0, 0, 0, 1, 1, 1, 1]
  items = [0, 2, 4, 6, 0, 2, 4, 6]
  times = [1, 2, 3, 4, 1, 2, 3, 4]
  df = pd.DataFrame({
      movielens.USER_COLUMN: users,
      movielens.ITEM_COLUMN: items,
      movielens.TIMESTAMP_COLUMN: times
  })
  cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
  np.random.seed(1)

  num_shards = 2
  num_items = 10
  data_preprocessing.generate_train_eval_data(
      df, approx_num_shards=num_shards, num_items=num_items,
      cache_paths=cache_paths, match_mlperf=True)

  raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
  assert len(raw_shards) == num_shards

  sharded_eval_data = []
  for i in range(2):
    sharded_eval_data.append(data_async_generation._process_shard(
        (os.path.join(cache_paths.train_shard_subdir, raw_shards[i]),
         num_items, rconst.NUM_EVAL_NEGATIVES, stat_utils.random_int32(),
         False, True)))

  if sharded_eval_data[0][0][0] == 1:
    # Order is not assured for this part of the pipeline.
    sharded_eval_data.reverse()

  eval_data = [np.concatenate([shard[i] for shard in sharded_eval_data])
               for i in range(3)]
  eval_data = {
      movielens.USER_COLUMN: eval_data[0],
      movielens.ITEM_COLUMN: eval_data[1],
  }

  eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  self.assertAllClose(eval_data[movielens.USER_COLUMN],
                      [0] * eval_items_per_user + [1] * eval_items_per_user)

  # Each shard process should generate different random items.
  self.assertNotAllClose(
      eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
      eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])

def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False):
  """Preprocess data and start negative generation subprocess."""
  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers,
                                match_mlperf=match_mlperf)

  tf.logging.info("Creating training file subprocess.")

  subproc_env = os.environ.copy()

  # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
  # resources and by default will try to allocate GPU memory. This would cause
  # contention with the main training process.
  subproc_env["CUDA_VISIBLE_DEVICES"] = ""

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

  subproc_args = popen_helper.INVOCATION + [
      "--data_dir", data_dir,
      "--cache_id", str(ncf_dataset.cache_paths.cache_id),
      "--num_neg", str(num_neg),
      "--num_train_positives", str(ncf_dataset.num_train_positives),
      "--num_items", str(ncf_dataset.num_items),
      "--num_readers", str(ncf_dataset.num_data_readers),
      "--epochs_per_cycle", str(epochs_per_cycle),
      "--train_batch_size", str(batch_size),
      "--eval_batch_size", str(eval_batch_size),
      "--num_workers", str(num_workers),
      "--spillover", "True",  # This allows the training input function to
                              # guarantee batch size and significantly improves
                              # performance. (~5% increase in examples/sec on
                              # GPU, and needed for TPU XLA.)
      "--redirect_logs", "True",
      "--seed", str(int(stat_utils.random_int32()))
  ]

  tf.logging.info(
      "Generation subprocess command: {}".format(" ".join(subproc_args)))

  proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  atexit.register(_shutdown, proc=proc)
  atexit.register(tf.gfile.DeleteRecursively,
                  ncf_dataset.cache_paths.cache_root)

  return ncf_dataset

def write_flagfile(flags_, ncf_dataset):
  """Write flagfile to begin async data generation."""
  if ncf_dataset.deterministic:
    flags_["seed"] = stat_utils.random_int32()

  # We write to a temp file then atomically rename it to the final file,
  # because writing directly to the final file can cause the data generation
  # async process to read a partially written flagfile.
  flagfile_temp = os.path.join(ncf_dataset.cache_paths.cache_root,
                               rconst.FLAGFILE_TEMP)
  tf.logging.info("Preparing flagfile for async data generation in {} ..."
                  .format(flagfile_temp))
  with tf.gfile.Open(flagfile_temp, "w") as f:
    for k, v in six.iteritems(flags_):
      f.write("--{}={}\n".format(k, v))
  flagfile = os.path.join(ncf_dataset.cache_paths.cache_root, rconst.FLAGFILE)
  tf.gfile.Rename(flagfile_temp, flagfile)
  tf.logging.info(
      "Wrote flagfile for async data generation in {}.".format(flagfile))

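# The async generation process reads the flagfile written above. A minimal
# sketch of the consuming side, assuming plain "--key=value" lines are parsed
# directly (the real reader may instead hand the file to a flags library):
def read_flagfile(flagfile_path):
  """Parse a '--key=value' flagfile back into a dict of strings."""
  parsed = {}
  with tf.gfile.Open(flagfile_path, "r") as f:
    for line in f:
      line = line.strip()
      if not line.startswith("--"):
        continue
      key, _, value = line[2:].partition("=")
      parsed[key] = value
  return parsed
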
def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = features[movielens.ITEM_COLUMN]

  logits = construct_model(users, items, params).output

  # Softmax with the first column of zeros is equivalent to sigmoid.
  softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
                              logits], axis=1)

  if mode == tf.estimator.ModeKeys.EVAL:
    duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return compute_eval_loss_and_metrics(
        logits, softmax_logits, duplicate_mask, params["num_neg"],
        params["match_mlperf"], use_tpu_spec=params["use_xla_for_gpu"])

  elif mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)
    valid_pt_mask = features[rconst.VALID_POINT_MASK]

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                            value=params["learning_rate"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                            value=params["beta1"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                            value=params["beta2"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                            value=params["epsilon"])

    optimizer = tf.train.AdamOptimizer(
        learning_rate=params["learning_rate"], beta1=params["beta1"],
        beta2=params["beta2"], epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                            value=mlperf_helper.TAGS.BCE)
    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=labels,
        logits=softmax_logits,
        weights=tf.cast(valid_pt_mask, tf.float32)
    )

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    gradients = _sparse_to_dense_grads(gradients)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  else:
    raise NotImplementedError

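# `_sparse_to_dense_grads` is used above but not shown. The embedding lookups
# in this model yield tf.IndexedSlices gradients; a plausible sketch that
# densifies them before apply_gradients, consistent with the name (the actual
# helper may differ):
def _sparse_to_dense_grads(grads_and_vars):
  """Convert any IndexedSlices gradients to dense tensors."""
  # tf.convert_to_tensor densifies IndexedSlices and passes Tensors through.
  return [(tf.convert_to_tensor(g), v) for g, v in grads_and_vars]
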
def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)

  logits = construct_model(users=users, items=items, params=params)

  # Softmax with the first column of zeros is equivalent to sigmoid.
  softmax_logits = tf.concat(
      [tf.zeros(logits.shape, dtype=logits.dtype), logits], axis=1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        movielens.ITEM_COLUMN: items,
        movielens.RATING_COLUMN: logits,
    }

    if params["use_tpu"]:
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                             predictions=predictions)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  elif mode == tf.estimator.ModeKeys.EVAL:
    duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return compute_eval_loss_and_metrics(
        logits, softmax_logits, duplicate_mask, params["num_neg"],
        params["match_mlperf"],
        use_tpu_spec=params["use_tpu"] or params["use_xla_for_gpu"])

  elif mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)
    optimizer = tf.train.AdamOptimizer(
        learning_rate=params["learning_rate"], beta1=params["beta1"],
        beta2=params["beta2"], epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=softmax_logits)

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    gradients = _sparse_to_dense_grads(gradients)
    minimize_op = optimizer.apply_gradients(gradients,
                                            global_step=global_step,
                                            name="train")
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    if params["use_tpu"]:
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, loss=loss,
                                             train_op=train_op)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  else:
    raise NotImplementedError

def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False, deterministic=False,
                         use_subprocess=True, cache_id=None):
  # type: (...) -> (NCFDataset, typing.Callable)
  """Preprocess data and start negative generation subprocess."""
  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers,
                                match_mlperf=match_mlperf,
                                deterministic=deterministic,
                                cache_id=cache_id)

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

  flags_ = {
      "data_dir": data_dir,
      "cache_id": ncf_dataset.cache_paths.cache_id,
      "num_neg": num_neg,
      "num_train_positives": ncf_dataset.num_train_positives,
      "num_items": ncf_dataset.num_items,
      "num_users": ncf_dataset.num_users,
      "num_readers": ncf_dataset.num_data_readers,
      "epochs_per_cycle": epochs_per_cycle,
      "train_batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "num_workers": num_workers,
      "redirect_logs": use_subprocess,
      "use_tf_logging": not use_subprocess,
      "ml_perf": match_mlperf,
  }

  if ncf_dataset.deterministic:
    flags_["seed"] = stat_utils.random_int32()

  tf.gfile.MakeDirs(data_dir)
  # We write to a temp file then atomically rename it to the final file,
  # because writing directly to the final file can cause the data generation
  # async process to read a partially written JSON file.
  flagfile_temp = os.path.join(ncf_dataset.cache_paths.cache_root,
                               rconst.FLAGFILE_TEMP)
  tf.logging.info("Preparing flagfile for async data generation in {} ..."
                  .format(flagfile_temp))
  with tf.gfile.Open(flagfile_temp, "w") as f:
    for k, v in six.iteritems(flags_):
      f.write("--{}={}\n".format(k, v))
  flagfile = os.path.join(ncf_dataset.cache_paths.cache_root, rconst.FLAGFILE)
  tf.gfile.Rename(flagfile_temp, flagfile)
  tf.logging.info(
      "Wrote flagfile for async data generation in {}.".format(flagfile))

  if use_subprocess:
    tf.logging.info("Creating training file subprocess.")
    subproc_env = os.environ.copy()
    # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
    # resources and by default will try to allocate GPU memory. This would
    # cause contention with the main training process.
    subproc_env["CUDA_VISIBLE_DEVICES"] = ""
    subproc_args = popen_helper.INVOCATION + [
        "--data_dir", data_dir,
        "--cache_id", str(ncf_dataset.cache_paths.cache_id)]
    tf.logging.info(
        "Generation subprocess command: {}".format(" ".join(subproc_args)))
    proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  cleanup_called = {"finished": False}

  @atexit.register
  def cleanup():
    """Remove files and subprocess from data generation."""
    if cleanup_called["finished"]:
      return
    if use_subprocess:
      _shutdown(proc)
    try:
      tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
    except tf.errors.NotFoundError:
      pass

    cleanup_called["finished"] = True

  for _ in range(300):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)  # allow `alive` file to be written
  if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
    raise ValueError("Generation subprocess did not start correctly. Data will "
                     "not be available; exiting to avoid waiting forever.")

  return ncf_dataset, cleanup

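# A hypothetical usage sketch of the (dataset, cleanup) return contract above;
# the argument values are illustrative only and not taken from the original:
def _example_pipeline_usage():
  ncf_dataset, cleanup_fn = instantiate_pipeline(
      dataset="ml-20m",
      data_dir="/tmp/movielens-data",
      batch_size=2048,
      eval_batch_size=100000,
      num_neg=4,
      match_mlperf=False,
      deterministic=False,
      use_subprocess=True)
  try:
    pass  # run training / evaluation against ncf_dataset here
  finally:
    cleanup_fn()  # also registered with atexit; the guard makes it idempotent
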
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False, deterministic=False):
  # type: (...) -> (NCFDataset, typing.Callable)
  """Preprocess data and start negative generation subprocess."""
  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers,
                                match_mlperf=match_mlperf,
                                deterministic=deterministic)

  tf.logging.info("Creating training file subprocess.")

  subproc_env = os.environ.copy()

  # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
  # resources and by default will try to allocate GPU memory. This would cause
  # contention with the main training process.
  subproc_env["CUDA_VISIBLE_DEVICES"] = ""

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

  subproc_args = popen_helper.INVOCATION + [
      "--data_dir", data_dir,
      "--cache_id", str(ncf_dataset.cache_paths.cache_id),
      "--num_neg", str(num_neg),
      "--num_train_positives", str(ncf_dataset.num_train_positives),
      "--num_items", str(ncf_dataset.num_items),
      "--num_readers", str(ncf_dataset.num_data_readers),
      "--epochs_per_cycle", str(epochs_per_cycle),
      "--train_batch_size", str(batch_size),
      "--eval_batch_size", str(eval_batch_size),
      "--num_workers", str(num_workers),
      "--spillover", "True",  # This allows the training input function to
                              # guarantee batch size and significantly improves
                              # performance. (~5% increase in examples/sec on
                              # GPU, and needed for TPU XLA.)
      "--redirect_logs", "True"
  ]
  if ncf_dataset.deterministic:
    subproc_args.extend(["--seed", str(int(stat_utils.random_int32()))])

  tf.logging.info(
      "Generation subprocess command: {}".format(" ".join(subproc_args)))

  proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  cleanup_called = {"finished": False}

  @atexit.register
  def cleanup():
    """Remove files and subprocess from data generation."""
    if cleanup_called["finished"]:
      return
    _shutdown(proc)
    try:
      tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
    except tf.errors.NotFoundError:
      pass

    cleanup_called["finished"] = True

  for _ in range(300):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)  # allow `alive` file to be written
  if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
    raise ValueError("Generation subprocess did not start correctly. Data will "
                     "not be available; exiting to avoid waiting forever.")

  return ncf_dataset, cleanup

def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = features[movielens.ITEM_COLUMN]

  logits = construct_model(users, items, params).output

  # Softmax with the first column of zeros is equivalent to sigmoid.
  softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
                              logits], axis=1)

  if mode == tf.estimator.ModeKeys.EVAL:
    duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return compute_eval_loss_and_metrics(
        logits, softmax_logits, duplicate_mask, params["num_neg"],
        params["match_mlperf"], use_tpu_spec=params["use_xla_for_gpu"])

  if mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)
    valid_pt_mask = features[rconst.VALID_POINT_MASK]

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                            value=params["learning_rate"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                            value=params["beta1"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                            value=params["beta2"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                            value=params["epsilon"])

    optimizer = tf.train.AdamOptimizer(
        learning_rate=params["learning_rate"], beta1=params["beta1"],
        beta2=params["beta2"], epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                            value=mlperf_helper.TAGS.BCE)
    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=labels,
        logits=softmax_logits,
        weights=tf.cast(valid_pt_mask, tf.float32)
    )

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    gradients = _sparse_to_dense_grads(gradients)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  raise NotImplementedError

def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)

  keras_model = params.get("keras_model")
  if keras_model:
    logits = keras_model([users, items],
                         training=mode == tf.estimator.ModeKeys.TRAIN)
  else:
    keras_model = construct_model(users=users, items=items, params=params)
    logits = keras_model.output
    if not params["use_estimator"] and "keras_model" not in params:
      # When we are not using estimator, we need to reuse the Keras model when
      # this model_fn is called again, so that the variables are shared between
      # training and eval. So we mutate params to add the Keras model.
      params["keras_model"] = keras_model

  # Softmax with the first column of zeros is equivalent to sigmoid.
  softmax_logits = tf.concat(
      [tf.zeros(logits.shape, dtype=logits.dtype), logits], axis=1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        movielens.ITEM_COLUMN: items,
        movielens.RATING_COLUMN: logits,
    }

    if params["use_tpu"]:
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                             predictions=predictions)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  elif mode == tf.estimator.ModeKeys.EVAL:
    duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return compute_eval_loss_and_metrics(
        logits, softmax_logits, duplicate_mask, params["num_neg"],
        params["match_mlperf"],
        use_tpu_spec=params["use_tpu"] or params["use_xla_for_gpu"])

  elif mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                            value=params["learning_rate"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                            value=params["beta1"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                            value=params["beta2"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                            value=params["epsilon"])

    optimizer = tf.train.AdamOptimizer(
        learning_rate=params["learning_rate"], beta1=params["beta1"],
        beta2=params["beta2"], epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                            value=mlperf_helper.TAGS.BCE)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=softmax_logits)

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    gradients = _sparse_to_dense_grads(gradients)
    minimize_op = optimizer.apply_gradients(gradients,
                                            global_step=global_step,
                                            name="train")
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    if params["use_tpu"]:
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, loss=loss,
                                             train_op=train_op)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  else:
    raise NotImplementedError

def _construct_records(
    is_training,          # type: bool
    train_cycle,          # type: typing.Optional[int]
    num_workers,          # type: int
    cache_paths,          # type: rconst.Paths
    num_readers,          # type: int
    num_neg,              # type: int
    num_positives,        # type: int
    num_items,            # type: int
    epochs_per_cycle,     # type: int
    batch_size,           # type: int
    training_shards,      # type: typing.List[str]
    deterministic=False,  # type: bool
    match_mlperf=False    # type: bool
    ):
  """Generate false negatives and write TFRecords files.

  Args:
    is_training: Are training records (True) or eval records (False) created.
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the input_fn. This number is
      approximate; fewer shards will be created if not all shards are assigned
      batches. This can occur due to discretization in the assignment process.
    num_neg: The number of false negatives per positive example.
    num_positives: The number of positive examples. This value is used to
      pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    batch_size: The expected batch size used during training. This is used to
      properly batch data when writing TFRecords.
    training_shards: The pickled positive examples from which to generate
      negatives.
    deterministic: Whether the worker pool maps shards in order (pool.imap)
      rather than as they complete (pool.imap_unordered).
    match_mlperf: Whether negative generation should match the MLPerf
      reference behavior.
  """
  st = timeit.default_timer()

  if is_training:
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_TRAIN_NEG_GEN)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_HP_NUM_NEG,
                            value=num_neg)

    # set inside _process_shard()
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)

  else:
    # Later logic assumes that all items for a given user are in the same
    # batch.
    assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
    assert num_neg == rconst.NUM_EVAL_NEGATIVES

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_EVAL_NEG_GEN)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            value=num_positives)

  assert epochs_per_cycle == 1 or is_training
  num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])

  num_pts = num_positives * (1 + num_neg)

  # Equivalent to `int(ceil(num_pts / batch_size)) * batch_size`, but without
  # precision concerns
  num_pts_with_padding = (num_pts + batch_size - 1) // batch_size * batch_size
  num_padding = num_pts_with_padding - num_pts

  # We choose a different random seed for each process, so that the processes
  # will not all choose the same random numbers.
  process_seeds = [stat_utils.random_int32()
                   for _ in training_shards * epochs_per_cycle]
  map_args = [
      (shard, num_items, num_neg, process_seeds[i], is_training, match_mlperf)
      for i, shard in enumerate(training_shards * epochs_per_cycle)]

  with popen_helper.get_pool(num_workers, init_worker) as pool:
    map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
    data_generator = map_fn(_process_shard, map_args)
    data = [
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int32) - 1,
        np.zeros(shape=(num_pts_with_padding,), dtype=np.uint16),
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int8),
    ]

    # Training data is shuffled. Evaluation data MUST not be shuffled.
    # Downstream processing depends on the fact that evaluation data for a
    # given user is grouped within a batch.
    if is_training:
      index_destinations = np.random.permutation(num_pts)
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    else:
      index_destinations = np.arange(num_pts)

    start_ind = 0
    for data_segment in data_generator:
      n_in_segment = data_segment[0].shape[0]
      dest = index_destinations[start_ind:start_ind + n_in_segment]
      start_ind += n_in_segment
      for i in range(3):
        data[i][dest] = data_segment[i]

  assert np.sum(data[0] == -1) == num_padding

  if is_training:
    if num_padding:
      # In order to have a full batch, randomly include points from earlier in
      # the batch.
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
      pad_sample_indices = np.random.randint(
          low=0, high=num_pts, size=(num_padding,))
      dest = np.arange(start=start_ind, stop=start_ind + num_padding)
      start_ind += num_padding
      for i in range(3):
        data[i][dest] = data[i][pad_sample_indices]
  else:
    # For Evaluation, padding is all zeros. The evaluation input_fn knows how
    # to interpret and discard the zero padded entries.
    data[0][num_pts:] = 0

  # Check that no points were overlooked.
  assert not np.sum(data[0] == -1)

  if is_training:
    # The number of points is slightly larger than num_pts due to padding.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_SIZE,
                            value=int(data[0].shape[0]))
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_BATCH_SIZE,
                            value=batch_size)
  else:
    # num_pts is logged instead of int(data[0].shape[0]), because the size
    # of the data vector includes zero pads which are ignored.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_SIZE, value=num_pts)

  batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
  current_file_id = -1
  current_batch_id = -1
  batches_by_file = [[] for _ in range(num_readers)]

  while True:
    current_batch_id += 1
    if (current_batch_id % batches_per_file) == 0:
      current_file_id += 1
    start_ind = current_batch_id * batch_size
    end_ind = start_ind + batch_size
    if end_ind > num_pts_with_padding:
      if start_ind != num_pts_with_padding:
        raise ValueError("Batch padding does not line up")
      break
    batches_by_file[current_file_id].append(current_batch_id)

  # Drop shards which were not assigned batches
  batches_by_file = [i for i in batches_by_file if i]
  num_readers = len(batches_by_file)

  if is_training:
    # Empirically it is observed that placing the batch with repeated values at
    # the start rather than the end improves convergence.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    batches_by_file[0][0], batches_by_file[-1][-1] = \
      batches_by_file[-1][-1], batches_by_file[0][0]

  if is_training:
    template = rconst.TRAIN_RECORD_TEMPLATE
    record_dir = os.path.join(cache_paths.train_epoch_dir,
                              get_cycle_folder_name(train_cycle))
    tf.gfile.MakeDirs(record_dir)
  else:
    template = rconst.EVAL_RECORD_TEMPLATE
    record_dir = cache_paths.eval_data_subdir

  batch_count = 0
  for i in range(num_readers):
    fpath = os.path.join(record_dir, template.format(i))
    log_msg("Writing {}".format(fpath))
    with tf.python_io.TFRecordWriter(fpath) as writer:
      for j in batches_by_file[i]:
        start_ind = j * batch_size
        end_ind = start_ind + batch_size
        record_kwargs = dict(
            users=data[0][start_ind:end_ind],
            items=data[1][start_ind:end_ind],
        )

        if is_training:
          record_kwargs["labels"] = data[2][start_ind:end_ind]
        else:
          record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
              record_kwargs["items"].reshape(-1, num_neg + 1),
              axis=1).flatten().astype(np.int8)

        batch_bytes = _construct_record(**record_kwargs)

        writer.write(batch_bytes)
        batch_count += 1

  # We write to a temp file then atomically rename it to the final file,
  # because writing directly to the final file can cause the main process to
  # read a partially written JSON file.
  ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
  with tf.gfile.Open(ready_file_temp, "w") as f:
    json.dump({
        "batch_size": batch_size,
        "batch_count": batch_count,
    }, f)
  ready_file = os.path.join(record_dir, rconst.READY_FILE)
  tf.gfile.Rename(ready_file_temp, ready_file)

  if is_training:
    log_msg("Cycle {} complete. Total time: {:.1f} seconds"
            .format(train_cycle, timeit.default_timer() - st))
  else:
    log_msg("Eval construction complete. Total time: {:.1f} seconds"
            .format(timeit.default_timer() - st))

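# `_construct_record` serializes one batch into bytes for the TFRecordWriter
# above. A minimal sketch consistent with the call sites (users/items plus
# either labels or dupe_mask); the real serialization format may differ:
def _construct_record(users, items, labels=None, dupe_mask=None):
  """Pack one batch of parallel arrays into a serialized tf.train.Example."""
  feature_dict = {
      "users": tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[users.tobytes()])),
      "items": tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[items.tobytes()])),
  }
  if labels is not None:
    feature_dict["labels"] = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[labels.tobytes()]))
  if dupe_mask is not None:
    feature_dict["dupe_mask"] = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[dupe_mask.tobytes()]))
  return tf.train.Example(
      features=tf.train.Features(feature=feature_dict)).SerializeToString()
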
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4,
                         epochs_per_cycle=1):
  """Preprocess data and start negative generation subprocess."""
  movielens.download(dataset=dataset, data_dir=data_dir)

  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers)

  tf.logging.info("Creating training file subprocess.")

  subproc_env = os.environ.copy()

  # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
  # resources and by default will try to allocate GPU memory. This would cause
  # contention with the main training process.
  subproc_env["CUDA_VISIBLE_DEVICES"] = ""

  python = "python3" if six.PY3 else "python2"

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75)

  subproc_args = [
      python, _ASYNC_GEN_PATH,
      "--data_dir", data_dir,
      "--cache_id", str(ncf_dataset.cache_paths.cache_id),
      "--num_neg", str(num_neg),
      "--num_train_positives", str(ncf_dataset.num_train_positives),
      "--num_items", str(ncf_dataset.num_items),
      "--num_readers", str(ncf_dataset.num_data_readers),
      "--epochs_per_cycle", str(epochs_per_cycle),
      "--train_batch_size", str(batch_size),
      "--eval_batch_size", str(eval_batch_size),
      "--num_workers", str(num_workers),
      "--spillover", "True",  # This allows the training input function to
                              # guarantee batch size and significantly improves
                              # performance. (~5% increase in examples/sec on
                              # GPU, and needed for TPU XLA.)
      "--redirect_logs", "True",
      "--seed", str(int(stat_utils.random_int32()))
  ]

  tf.logging.info("Generation subprocess command: {}".format(
      " ".join(subproc_args)))

  proc = subprocess.Popen(args=subproc_args, stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                          shell=False, env=subproc_env)

  atexit.register(_shutdown, proc=proc)
  atexit.register(tf.gfile.DeleteRecursively,
                  ncf_dataset.cache_paths.cache_root)

  return ncf_dataset
