def input_fn_common(pattern, batch_size=hparams.test_batch_size, mode=tf.estimator.ModeKeys.EVAL, hvd_info=None):
    """
    Builds a zero-argument callable that invokes ``input_fn`` for the given input pattern.

    The vocabulary tables are read when the returned callable is invoked, not when this factory is called.

    :param pattern: Value forwarded to ``input_fn`` as ``input_pattern``
    :param batch_size: Batch size forwarded to ``input_fn``
    :param mode: Estimator mode key forwarded to ``input_fn``
    :param hvd_info: Value forwarded to ``input_fn`` as ``hvd_info``
    :return: Callable without arguments returning the result of ``input_fn``
    """
    def _build_dataset():
        text_vocab = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
        id_vocab = vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR)

        # Only the CNN feature extractor passes a non-zero filter window size
        if hparams.ftr_ext == 'cnn':
            window_size = max(hparams.filter_window_sizes)
        else:
            window_size = 0

        return input_fn(
            input_pattern=pattern,
            metadata_path=hparams.metadata_path,
            batch_size=batch_size,
            mode=mode,
            vocab_table=text_vocab,
            hvd_info=hvd_info,
            vocab_table_for_id_ftr=id_vocab,
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=window_size)

    return _build_dataset
def predict(self, output_dir, input_data_path, metadata_file, checkpoint_path, execution_context, schema_params):
    """
    Scores this worker's shard of the input data and writes the scores to one Avro part file.

    Returns early without writing anything when file-level sharding leaves no input files for this shard.

    :param output_dir: Directory in which ``part-<shard index>.avro`` is written
    :param input_data_path: Input location handed to ``shard_input_files``
    :param metadata_file: Not referenced by this method; ``self.model_params.metadata_path`` is used instead
    :param checkpoint_path: Not referenced by this method; ``self.best_checkpoint`` is used instead
    :param execution_context: Mapping read for ``constants.NUM_SHARDS`` and ``constants.SHARD_INDEX``
    :param schema_params: Forwarded to ``DetextWriter``
    :return: None
    """
    # Predict on the dataset
    sharded_dataset_paths, file_level_sharding = shard_input_files(
        input_data_path,
        execution_context[constants.NUM_SHARDS],
        execution_context[constants.SHARD_INDEX])
    if file_level_sharding and len(sharded_dataset_paths) == 0:
        logger.info("No input dataset is found, returning...")
        return

    def inference_dataset():
        params = self.model_params
        text_vocab = vocab_utils.read_tf_vocab(params.vocab_file, params.UNK)
        id_vocab = vocab_utils.read_tf_vocab(params.vocab_file_for_id_ftr, params.UNK_FOR_ID_FTR)
        if params.ftr_ext == 'cnn':
            window_size = max(params.filter_window_sizes)
        else:
            window_size = 0
        return input_fn(
            input_pattern=','.join(sharded_dataset_paths),
            # DeText uses metadata_path
            metadata_path=params.metadata_path,
            batch_size=params.test_batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            vocab_table=text_vocab,
            vocab_table_for_id_ftr=id_vocab,
            feature_names=params.feature_names,
            CLS=params.CLS,
            SEP=params.SEP,
            PAD=params.PAD,
            PAD_FOR_ID_FTR=params.PAD_FOR_ID_FTR,
            max_len=params.max_len,
            min_len=params.min_len,
            cnn_filter_window_size=window_size)

    self.estimator_based_model = detext_train.get_estimator(
        self.model_params,
        strategy=None,  # local mode
        best_checkpoint=self.best_checkpoint)
    output = self.estimator_based_model.predict(inference_dataset, yield_single_examples=False)

    detext_writer = DetextWriter(schema_params=schema_params)
    shard_index = execution_context[constants.SHARD_INDEX]
    output_file = os.path.join(output_dir, "part-{0:05d}.avro".format(shard_index))

    n_records = 0
    n_batch = 0
    for batch_score in output:
        # The first batch creates the file; every later batch is appended to it
        is_first_batch = n_batch == 0
        with tf.io.gfile.GFile(output_file, 'wb' if is_first_batch else 'ab+') as f:
            if is_first_batch:
                f.seekable = lambda: False
            else:
                f.seek(0, 2)
                f.seekable = lambda: True
                f.readable = lambda: True
            n_records, n_batch = detext_writer.save_batch(f, batch_score, output_file, n_records, n_batch)

    logger.info("{} batches, e.g. {} records inferenced".format(n_batch, n_records))
def testMultitaskInputFnBuilderTfrecord(self):
    """Test additional input from multitask training in eval mode"""
    # Build paths with os.path.join: plain string concatenation yields the root-anchored
    # '/../resources' when __file__ has no directory component
    res_dir = os.path.join(os.path.dirname(__file__), '..', 'resources')

    # create a vocab table
    vocab_table = vocab_utils.read_tf_vocab(os.path.join(res_dir, 'vocab.txt'), '[UNK]')

    # dataset dir
    data_dir = os.path.join(res_dir, 'train', 'multitask', 'tfrecord')

    # test minimum features required for multitask jobs
    feature_names = ('label', 'query', 'doc_field1', 'doc_field2', 'wide_ftrs', 'task_id')

    batch_size = 5
    dataset = data_fn.input_fn(input_pattern=data_dir,
                               metadata_path=None,
                               batch_size=batch_size,
                               mode=tf.estimator.ModeKeys.EVAL,
                               vocab_table=vocab_table,
                               vocab_table_for_id_ftr=vocab_table,
                               feature_names=feature_names,
                               CLS='[CLS]',
                               SEP='[SEP]',
                               PAD='[PAD]',
                               PAD_FOR_ID_FTR='[PAD]',
                               max_len=16,
                               cnn_filter_window_size=1)

    # Make iterator
    iterator = dataset.make_initializable_iterator()
    batch_data = iterator.get_next()

    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        sess.run([iterator.initializer])
        batch_data_val, = sess.run([batch_data])
        features, label = batch_data_val

        # First dimension of data should be batch_size
        for ftr_name in feature_names:
            if ftr_name != 'label':
                self.assertIn(ftr_name, features)
                self.assertEqual(features[ftr_name].shape[0], batch_size)

        self.assertEqual(label['label'].shape[0], batch_size)

        task_ids = features['task_id']

        # Check task_id dimension size
        self.assertEqual(len(task_ids.shape), 1)

        # Check task_id value in the sample data
        for t_id in task_ids:
            self.assertIn(t_id, (0, 1))
def testGetInputFnCommon(self):
    """Checks that get_input_fn_common can be invoked with a fully populated HParams (no result is asserted)"""
    feature_type2name = {
        InputFtrType.QUERY_COLUMN_NAME: 'query',
        InputFtrType.DOC_TEXT_COLUMN_NAMES: ['doc_completedQuery'],
        InputFtrType.DOC_ID_COLUMN_NAMES: ['docId_completedQuery'],
        InputFtrType.USER_TEXT_COLUMN_NAMES: ['usr_headline', 'usr_skills', 'usr_currTitles'],
        InputFtrType.USER_ID_COLUMN_NAMES: ['usrId_currTitles'],
        InputFtrType.DENSE_FTRS_COLUMN_NAMES: 'wide_ftrs',
        InputFtrType.LABEL_COLUMN_NAME: 'label',
        InputFtrType.WEIGHT_COLUMN_NAME: 'weight'
    }
    feature_name2num = {'wide_ftrs': 5}

    # Only the table (second element of the returned pair) is needed here
    _, vocab_tf_table = vocab_utils.read_tf_vocab(self.vocab_file, self.UNK)
    vocab_table = vocab_utils.read_vocab(self.vocab_file)

    data_dir = self.data_dir
    hparams_kwargs = dict(
        input_pattern=data_dir,
        filter_window_sizes=[10],
        CLS=self.CLS,
        PAD=self.PAD,
        SEP=self.SEP,
        UNK=self.UNK,
        UNK_FOR_ID_FTR=self.UNK,
        PAD_FOR_ID_FTR=self.PAD,
        min_len=1,
        max_len=16,
        vocab_file=self.vocab_file,
        vocab_file_for_id_ftr=self.vocab_file,
        PAD_ID=vocab_table[self.PAD],
        SEP_ID=vocab_table[self.SEP],
        CLS_ID=vocab_table[self.CLS],
        mode=tf.estimator.ModeKeys.EVAL,
        task_type=self.task_type,
        vocab_table=vocab_tf_table,
        vocab_table_for_id_ftr=vocab_tf_table,
        max_filter_window_size=3,
        vocab_hub_url='',
        vocab_hub_url_for_id_ftr='',
        embedding_hub_url='',
        embedding_hub_url_for_id_ftr='',
        feature_type2name=feature_type2name,
        feature_name2num=feature_name2num,
    )
    hparams = HParams(**hparams_kwargs)

    train_model_helper.get_input_fn_common(data_dir, 1, tf.estimator.ModeKeys.TRAIN, hparams)
def testVocabLookUp(self):
    """Tests whether vocab lookup return the same result for str and unicode in python 2."""
    if not six.PY2:
        return

    # Switch to eager execution
    switch_to(EAGER_MODE)
    try:
        cur_dir = os.path.dirname(__file__)
        vocab_file = os.path.join(cur_dir, '..', 'resources', 'multilingual_vocab.txt.gz')
        vocab_table = vocab_utils.read_tf_vocab(vocab_file)

        def get_index(some_string):
            return vocab_table.lookup(tf.constant([some_string])).numpy()[0]

        unk = get_index(vocab_utils.UNK)

        # Non exist word
        self.assertEqual(get_index('bj82149aksreuo'), unk)

        # ascii characters
        s = 'a'
        s_u = u'a'
        self.assertAllEqual(get_index(s), get_index(s_u))
        self.assertNotEqual(get_index(s), unk)

        # Special non-ascii characters
        s = '##' + '¥'
        s_u = '##' + u'¥'
        self.assertAllEqual(get_index(s), get_index(s_u))
        self.assertNotEqual(get_index(s), unk)

        # Special non-ascii characters
        s = ' '
        s_u = u' '
        self.assertAllEqual(get_index(s), get_index(s_u))
        self.assertNotEqual(get_index(s), unk)

        # Large-scale test
        # Disable this if you want fast build
        # GzipFile does not close the file object it wraps, so the underlying handle is managed explicitly
        with tf.gfile.Open(vocab_file, 'r') as raw_file, gzip.GzipFile(fileobj=raw_file) as fin:
            for line in fin:
                line = line.strip('\n')
                if line != vocab_utils.UNK:
                    self.assertAllEqual(get_index(line), get_index(vocab_utils.convert_to_unicode(line)))
                    self.assertNotEqual(get_index(line), unk)
    finally:
        # Switch to graph execution even when an assertion above fails, so eager mode
        # does not stay enabled for whatever runs after this test
        switch_to(GRAPH_MODE)
def get_usr_fields(hparams):
    """
    Creates one string placeholder per feature whose name starts with 'usr_' and runs it through text processing.

    :param hparams: hparams
    :return: Tuple (processed usr fields, raw usr text placeholders), both lists in feature_names order
    """
    raw_placeholders = []
    processed_fields = []
    text_vocab = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)

    for name in hparams.feature_names:
        if not name.startswith('usr_'):
            continue

        # If hparams.add_first_dim_for_usr_placeholder is True, the usr placeholders have dimension [None]
        # This is to use the usr field features as document features in model serving
        placeholder_shape = [None] if hparams.add_first_dim_for_usr_placeholder else []
        usr_placeholder = tf.placeholder(shape=placeholder_shape, dtype=tf.string, name=name + "_placeholder")
        raw_placeholders.append(usr_placeholder)

        field = usr_placeholder
        # add whitespace on both sides of punctuations if regex pattern is not None
        if hparams.regex_replace_pattern is not None:
            field = tf.regex_replace(input=field, pattern=hparams.regex_replace_pattern, rewrite=" \\1 ")

        # remove added dimension
        if hparams.add_first_dim_for_usr_placeholder:
            field = tf.squeeze(field, [0])

        field = tf.expand_dims(field, axis=0)

        if hparams.ftr_ext == 'cnn':
            window_size = max(hparams.filter_window_sizes)
        else:
            window_size = 0
        field = data_fn.process_text(
            field,
            text_vocab,
            hparams.CLS,
            hparams.SEP,
            hparams.PAD,
            hparams.max_len,
            hparams.min_len,
            cnn_filter_window_size=window_size)
        processed_fields.append(field)

    return processed_fields, raw_placeholders
def get_doc_id_fields(hparams):
    """
    Creates one string-vector placeholder per feature whose name starts with 'docId_' and processes it as an id field.

    :param hparams: hparams
    :return: Tuple (processed doc id fields, raw doc id placeholders), both lists in feature_names order
    """
    raw_placeholders = []
    processed_fields = []
    id_vocab = vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR)

    for name in hparams.feature_names:
        if not name.startswith('docId_'):
            continue

        # each document id field is a placeholder (a string vector)
        id_placeholder = tf.placeholder(shape=[None], dtype=tf.string, name=name + "_placeholder")
        raw_placeholders.append(id_placeholder)

        # the leading dimension is added after id processing
        field = data_fn.process_id(id_placeholder, id_vocab, hparams.PAD_FOR_ID_FTR)
        processed_fields.append(tf.expand_dims(field, axis=0))

    return processed_fields, raw_placeholders
def __init__(self, CLS, SEP, PAD, UNK, vocab_file):
    """ Initializes the vocabulary layer

    Reads the vocabulary both through read_tf_vocab (table initializer and table) and through read_vocab
    (used for the pad/cls/sep ids and the vocabulary size).

    :param CLS Token that represents the start of a sentence
    :param SEP Token that represents the end of a segment
    :param PAD Token that represents padding
    :param UNK Token that represents unknown tokens
    :param vocab_file Path to the vocabulary file
    """
    super().__init__()

    table_initializer, table = read_tf_vocab(vocab_file, UNK)
    self._vocab_table_initializer = table_initializer
    self.vocab_table = table

    self._CLS = CLS
    self._SEP = SEP
    self._PAD = PAD

    token2id = read_vocab(vocab_file)
    self._pad_id = token2id[PAD]

    # A falsy CLS / SEP token gets the id -1
    if CLS:
        self._cls_id = token2id[CLS]
    else:
        self._cls_id = -1
    if SEP:
        self._sep_id = token2id[SEP]
    else:
        self._sep_id = -1

    self._vocab_size = len(token2id)
def get_doc_fields(hparams):
    """
    Creates one string-vector placeholder per feature whose name starts with 'doc_' and runs it through
    text processing.

    :param hparams: hparams
    :return: Tuple (processed doc fields, raw doc text placeholders), both lists in feature_names order
    """
    raw_placeholders = []
    processed_fields = []
    text_vocab = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)

    for name in hparams.feature_names:
        if not name.startswith('doc_'):
            continue

        # each document field is a placeholder (a string vector)
        doc_placeholder = tf.placeholder(shape=[None], dtype=tf.string, name=name + "_placeholder")
        raw_placeholders.append(doc_placeholder)

        field = doc_placeholder
        # add whitespace on both sides of punctuations if regex pattern is not None
        if hparams.regex_replace_pattern is not None:
            field = tf.regex_replace(input=field, pattern=hparams.regex_replace_pattern, rewrite=" \\1 ")

        if hparams.ftr_ext == 'cnn':
            window_size = max(hparams.filter_window_sizes)
        else:
            window_size = 0
        field = data_fn.process_text(
            field,
            text_vocab,
            hparams.CLS,
            hparams.SEP,
            hparams.PAD,
            hparams.max_len,
            hparams.min_len,
            cnn_filter_window_size=window_size)

        # the leading dimension is added after text processing
        processed_fields.append(tf.expand_dims(field, axis=0))

    return processed_fields, raw_placeholders
def get_query(hparams, regex_replace_pattern, add_dimension=False):
    """
    Creates the query placeholder and, when a query tensor is returned for it, tokenizes that tensor.

    :param hparams: hparams
    :param regex_replace_pattern: The regex pattern to add a white space before and after
    :param add_dimension: whether to add a dimension then remove to query
        (this is to support online model for QAP as quasar model serving requires at least one dimension)
    :return: query and query_placeholder
    """
    # query text feature
    placeholder_shape = [None] if add_dimension else []
    query_placeholder, query = create_placeholder_for_ftrs(
        "query_placeholder", placeholder_shape, tf.string, 'query', hparams.feature_names)

    if query is None:
        return query, query_placeholder

    if add_dimension:
        # remove added dimension
        query = tf.squeeze(query, [0])

    # tokenize query
    if regex_replace_pattern is not None:
        query = tf.regex_replace(input=query, pattern=regex_replace_pattern, rewrite=" \\1 ")

    text_vocab = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    query = data_fn.process_text(
        query,
        text_vocab,
        hparams.CLS,
        hparams.SEP,
        hparams.PAD,
        hparams.max_len,
        hparams.min_len,
        cnn_filter_window_size=max(hparams.filter_window_sizes))
    return query, query_placeholder
def get_query(hparams):
    """
    Creates the query placeholder and, when a query tensor is returned for it, tokenizes that tensor.

    :param hparams: hparams
    :return: query and query_placeholder
    """
    # query text feature
    # If hparams.add_first_dim_for_query_placeholder is True, the query placeholder has dimension [None]
    # This is to use the query feature as a document feature in model serving
    placeholder_shape = [None] if hparams.add_first_dim_for_query_placeholder else []
    query_placeholder, query = create_placeholder_for_ftrs(
        "query_placeholder", placeholder_shape, tf.string, 'query', hparams.feature_names)

    if query is None:
        return query, query_placeholder

    if hparams.add_first_dim_for_query_placeholder:
        # remove added dimension
        query = tf.squeeze(query, [0])

    # tokenize query
    if hparams.regex_replace_pattern is not None:
        query = tf.regex_replace(input=query, pattern=hparams.regex_replace_pattern, rewrite=" \\1 ")

    text_vocab = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    if hparams.ftr_ext == 'cnn':
        window_size = max(hparams.filter_window_sizes)
    else:
        window_size = 0
    query = data_fn.process_text(
        query,
        text_vocab,
        hparams.CLS,
        hparams.SEP,
        hparams.PAD,
        hparams.max_len,
        hparams.min_len,
        cnn_filter_window_size=window_size)
    return query, query_placeholder
def get_usr_id_fields(hparams):
    """
    Creates one scalar string placeholder per feature whose name starts with 'usrId_' and processes it as an
    id field.

    :param hparams: hparams
    :return: Tuple (processed usr id fields, raw usr id placeholders), both lists in feature_names order
    """
    raw_placeholders = []
    processed_fields = []
    id_vocab = vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR)

    for name in hparams.feature_names:
        if not name.startswith('usrId_'):
            continue

        # each user id field is a placeholder (one string)
        id_placeholder = tf.placeholder(shape=[], dtype=tf.string, name=name + "_placeholder")
        raw_placeholders.append(id_placeholder)

        # the leading dimension is added before id processing
        expanded = tf.expand_dims(id_placeholder, axis=0)
        processed_fields.append(data_fn.process_id(expanded, id_vocab, hparams.PAD_FOR_ID_FTR))

    return processed_fields, raw_placeholders
def testInputFnBuilderTfrecord(self):
    """Test function input_fn_builder() in eval mode"""
    # Build paths with os.path.join: plain string concatenation yields the root-anchored
    # '/../resources' when __file__ has no directory component
    res_dir = os.path.join(os.path.dirname(__file__), '..', 'resources')

    # create a vocab table
    vocab_table = vocab_utils.read_tf_vocab(os.path.join(res_dir, 'vocab.txt'), '[UNK]')

    # dataset dir
    data_dir = os.path.join(res_dir, 'train', 'dataset')
    input_files = os.path.join(data_dir, '*.tfrecord')

    # create a dataset.
    # Read schema
    # Parse and process data in dataset
    feature_names = ('label', 'query', 'doc_completedQuery', 'usr_headline', 'usr_skills', 'usr_currTitles',
                     'usrId_currTitles', 'docId_completedQuery', 'wide_ftrs', 'weight')

    batch_size = 2
    dataset = data_fn.input_fn(input_pattern=input_files,
                               metadata_path=None,
                               batch_size=batch_size,
                               mode=tf.estimator.ModeKeys.EVAL,
                               vocab_table=vocab_table,
                               vocab_table_for_id_ftr=vocab_table,
                               feature_names=feature_names,
                               CLS='[CLS]',
                               SEP='[SEP]',
                               PAD='[PAD]',
                               PAD_FOR_ID_FTR='[PAD]',
                               max_len=16,
                               cnn_filter_window_size=1)

    # Make iterator
    iterator = dataset.make_initializable_iterator()
    batch_data = iterator.get_next()

    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        sess.run([iterator.initializer])
        batch_data_val, = sess.run([batch_data])
        features, label = batch_data_val

        # First dimension of data should be batch_size
        for ftr_name in feature_names:
            if ftr_name != 'label':
                self.assertIn(ftr_name, features)
                self.assertEqual(features[ftr_name].shape[0], batch_size)

        self.assertEqual(label['label'].shape[0], batch_size)

        doc_completedQuery = features['doc_completedQuery']
        docId_completedQuery = features['docId_completedQuery']
        usr_currTitles = features['usr_currTitles']
        usrId_currTitles = features['usrId_currTitles']

        # vocab[PAD] == PAD_ID
        self.assertEqual(doc_completedQuery[0, 0, -1], self.PAD_ID)
        self.assertEqual(docId_completedQuery[0, 0, -1], self.PAD_ID)

        # vocab[CLS] == CLS_ID
        self.assertTrue(np.all(doc_completedQuery[0, 0, 0] == self.CLS_ID))
        self.assertTrue(np.all(usr_currTitles[0, 0] == self.CLS_ID))

        # No CLS in id feature
        self.assertTrue(np.all(docId_completedQuery[:, :, 0] != self.CLS_ID))

        # In this TFRecord file, we populate docId_completeQuery using doc_completedQuery
        # doc id feature should be the same as doc text feature except CLS and SEP addition
        # Here we make sure this is correct for the first sample
        for text_arr, id_arr in zip(doc_completedQuery[0], docId_completedQuery[0]):
            self.assertAllEqual(text_arr[text_arr != self.PAD_ID][1:-1], id_arr[id_arr != self.PAD_ID])

        # In this TFRecord file, we populate usrId_currTitles using usr_currTitles
        # usr id feature should be the same as usr text feature except CLS and SEP addition
        for text_arr, id_arr in zip(usr_currTitles, usrId_currTitles):
            self.assertAllEqual(text_arr[text_arr != self.PAD_ID][1:-1], id_arr[id_arr != self.PAD_ID])
class TestFeatureGrouper(tf.test.TestCase, DataSetup):
    """Unit test for feature_grouper.py"""
    # Vocabulary fixtures shared by the tests; read once when the class body is executed
    _, vocab_tf_table = vocab_utils.read_tf_vocab(DataSetup.vocab_file, '[UNK]')
    vocab_table = vocab_utils.read_vocab(DataSetup.vocab_file)

    PAD_ID = vocab_table[DataSetup.PAD]
    SEP_ID = vocab_table[DataSetup.SEP]
    CLS_ID = vocab_table[DataSetup.CLS]
    UNK_ID = vocab_table[DataSetup.UNK]

    max_filter_window_size = 0

    def testFeatureGrouperKerasInput(self):
        """Tests FeatureGrouper with tf.keras.Input"""
        nums_dense_ftrs = [2, 3]
        nums_sparse_ftrs = [10, 30]
        layer = FeatureGrouper()
        inputs = {
            InputFtrType.QUERY_COLUMN_NAME: tf.keras.Input(shape=(), dtype='string'),
            InputFtrType.USER_TEXT_COLUMN_NAMES: [tf.keras.Input(shape=(), dtype='string')],
            InputFtrType.USER_ID_COLUMN_NAMES: [tf.keras.Input(shape=(), dtype='string')],
            InputFtrType.DOC_TEXT_COLUMN_NAMES: [tf.keras.Input(shape=(None, ), dtype='string')],
            InputFtrType.DOC_ID_COLUMN_NAMES: [tf.keras.Input(shape=(None, ), dtype='string')],
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: [
                tf.keras.Input(shape=(num_dense_ftrs, ), dtype='float32')
                for num_dense_ftrs in nums_dense_ftrs
            ],
            InputFtrType.SPARSE_FTRS_COLUMN_NAMES: [
                tf.keras.Input(shape=(num_sparse_ftrs, ), dtype='float32', sparse=True)
                for num_sparse_ftrs in nums_sparse_ftrs
            ]
        }
        outputs = layer(inputs)
        self.assertLen(outputs, len(inputs))

    def testFeatureGrouperTensor(self):
        """Tests FeatureGrouper with tensor input"""
        layer = FeatureGrouper()
        inputs = {
            InputFtrType.QUERY_COLUMN_NAME: tf.constant(['batch 1 user 1 build', 'batch 2 user 2 word'],
                                                        dtype=tf.string),
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: [
                tf.constant([[1, 1], [2, 2]], dtype=tf.float32),
                tf.constant([[0], [1]], dtype=tf.float32)
            ],
            InputFtrType.SPARSE_FTRS_COLUMN_NAMES: [
                tf.sparse.from_dense(tf.constant([[1, 0], [2, 0]], dtype=tf.float32)),
                tf.sparse.from_dense(tf.constant([[1], [1]], dtype=tf.float32))
            ]
        }
        expected_result = {
            InputFtrType.QUERY_COLUMN_NAME: tf.constant(['batch 1 user 1 build', 'batch 2 user 2 word'],
                                                        dtype=tf.string),
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: tf.constant([[1, 1, 0], [2, 2, 1]]),
            InputFtrType.SPARSE_FTRS_COLUMN_NAMES: [
                tf.constant([[1, 0], [2, 0]], dtype=tf.float32),
                tf.constant([[1], [1]], dtype=tf.float32)
            ]
        }
        outputs = layer(inputs)

        # The message must be passed to assertEqual itself; written after the closing
        # parenthesis it only forms a discarded tuple and is never reported
        self.assertEqual(len(outputs), len(expected_result), "Outputs must have the same shape")

        for ftr_type, expected_ftr in expected_result.items():
            output = outputs[ftr_type]
            if ftr_type == InputFtrType.SPARSE_FTRS_COLUMN_NAMES:
                # Sparse outputs are densified and compared one tensor at a time
                output = [tf.sparse.to_dense(t) for t in output]
                for e, o in zip(expected_ftr, output):
                    self.assertAllEqual(e, o)
                continue
            self.assertAllEqual(expected_ftr, output)

    def testConcatFtrOnLastDim(self):
        """Tests concatenate features on last dimension"""
        tensor_lst = [
            tf.constant([1, 2, 3], dtype='int32'),
            tf.constant([4, 5, 6], dtype='int32')
        ]
        result = feature_grouper.concat_on_last_axis_dense(tensor_lst)
        expected_output = tf.constant([1, 2, 3, 4, 5, 6], dtype='int32')
        self.assertAllEqual(result, expected_output)
def train(hparams, input_fn):
    """
    Main function for train/evaluate DeText ranking model

    Runs ``tf.estimator.train_and_evaluate`` on the train/dev files and afterwards evaluates the best exported
    checkpoint on the test file, printing the resulting metrics. The test evaluation runs on the evaluator or
    local-mode task without horovod, and on rank 0 with horovod.

    :param hparams: hparams
    :param input_fn: input function to create train/eval specs
    :return: None
    """
    # Use one truthiness check everywhere. Mixing ``is True`` / ``is False`` with plain truthiness let a
    # non-bool value skip the branch that defines ``best_model_name`` while still reaching the code that reads it.
    use_horovod = bool(hparams.use_horovod)

    hvd = None
    eval_log_file = None
    if use_horovod:
        import horovod.tensorflow as hvd
        eval_log_file = path_join(hparams.out_dir, 'eval_log.txt')

    train_strategy = tf.contrib.distribute.ParameterServerStrategy()
    estimator = get_estimator(hparams, strategy=train_strategy)

    is_hvd_primary = use_horovod and hvd.rank() == 0
    best_model_name = 'best_' + hparams.pmetric

    def make_input_fn(input_pattern, batch_size, mode, **extra_kwargs):
        """Returns a zero-argument callable that calls input_fn for one data split."""
        def _dataset_fn():
            # Vocab tables are read when the callable is invoked, as in the per-split lambdas this replaces
            return input_fn(
                input_pattern=input_pattern,
                metadata_path=hparams.metadata_path,
                batch_size=batch_size,
                mode=mode,
                vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
                vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(
                    hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
                feature_names=hparams.feature_names,
                CLS=hparams.CLS,
                SEP=hparams.SEP,
                PAD=hparams.PAD,
                PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
                max_len=hparams.max_len,
                min_len=hparams.min_len,
                cnn_filter_window_size=max(hparams.filter_window_sizes) if hparams.ftr_ext == 'cnn' else 0,
                **extra_kwargs)
        return _dataset_fn

    # Set model export config for evaluator or primary worker of horovod
    exporter_list = None
    if not use_horovod or is_hvd_primary:
        # Exporter to save best (in terms of pmetric) checkpoint in the folder [best_model_name],
        # and export to savedmodel for prediction.
        best_checkpoint_exporter = BestCheckpointCopier(
            name=best_model_name,
            serving_input_receiver_fn=lambda: serving_input_fn(hparams),
            checkpoints_to_keep=1,  # keeping the best checkpoint
            exports_to_keep=1,  # keeping the best savedmodel
            pmetric='metric/{}'.format(hparams.pmetric),
            compare_fn=lambda x, y: x.score > y.score,  # larger metric better
            sort_reverse=True,
            eval_log_file=eval_log_file)
        exporter_list = [best_checkpoint_exporter]

    # Handle sync distributed training case via use_horovod
    hooks = None
    if use_horovod:
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
        # rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights or
        # restored from a checkpoint.
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    # Create TrainSpec for model training
    train_spec = tf.estimator.TrainSpec(
        input_fn=make_input_fn(
            hparams.train_file,
            hparams.train_batch_size,
            tf.estimator.ModeKeys.TRAIN,
            # Add horovod information if applicable
            hvd_info=hparams.hvd_info if use_horovod else None),
        hooks=hooks,  # Ensure proper initialization with horovod
        max_steps=hparams.num_train_steps)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=make_input_fn(hparams.dev_file, hparams.test_batch_size, tf.estimator.ModeKeys.EVAL),
        exporters=exporter_list,
        steps=None,
        # Set throttle_secs to 10 min to avoid warning to spam logs
        # Set throttle to 0 for horovod: https://github.com/horovod/horovod/issues/182#issuecomment-533897757
        throttle_secs=0 if use_horovod else 600,
        start_delay_secs=10)

    # Training and evaluation with dev set
    tf.estimator.train_and_evaluate(estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
    print("***** Training finished. *****")

    # Evaluation with test set: create an estimator with the best_checkpoint_dir to load the best model
    task_type = executor_utils.get_executor_task_type()
    do_evaluate = task_type == executor_utils.EVALUATOR or task_type == executor_utils.LOCAL_MODE
    if (not use_horovod and do_evaluate) or is_hvd_primary:
        best_checkpoint_dir = path_join(hparams.out_dir, best_model_name)
        estimator_savedmodel = get_estimator(hparams, strategy=train_strategy, best_checkpoint=best_checkpoint_dir)
        result = estimator_savedmodel.evaluate(
            input_fn=make_input_fn(hparams.test_file, hparams.test_batch_size, tf.estimator.ModeKeys.EVAL))
        print("\n***** Evaluation on test set with best exported model: *****")
        for key in sorted(result.keys()):
            print("%s = %s" % (key, str(result[key])))
class TestData(tf.test.TestCase, DataSetup):
    """Unit tests for data_fn.input_fn_tfrecord on ranking, classification and multitask data."""
    # Vocabulary is read once, at class-definition time, from the vocab file provided by DataSetup.
    _, vocab_tf_table = vocab_utils.read_tf_vocab(DataSetup.vocab_file, '[UNK]')
    vocab_table = vocab_utils.read_vocab(DataSetup.vocab_file)

    # Special tokens and their ids looked up in the vocabulary above.
    CLS = '[CLS]'
    PAD = '[PAD]'
    SEP = '[SEP]'
    PAD_ID = vocab_table[PAD]
    SEP_ID = vocab_table[SEP]
    CLS_ID = vocab_table[CLS]

    # Number of sparse features; used as the last dimension of the expected sparse tensors.
    nums_sparse_ftrs = [20]

    def _assert_batch_structure(self, features, label, feature_type2name, batch_size):
        """Run the structural checks shared by every input_fn_tfrecord test on one batch.

        Checks performed:
          * label / weight / uid feature types map to exactly one feature name, and that name is a key of ``label``;
          * every other configured feature name is a key of ``features`` and its first dimension is ``batch_size``;
          * the weight and uid entries of ``label`` have shape ``[batch_size]``. When the feature type is not
            configured, the default names from ``constant.Constant()`` are looked up instead.

        :param features: features element of a (features, label) pair yielded by the dataset
        :param label: label element of a (features, label) pair yielded by the dataset
        :param feature_type2name: mapping from InputFtrType to feature name(s) passed to the input function
        :param batch_size: batch size the dataset was built with
        """
        label_side_ftr_types = (InputFtrType.LABEL_COLUMN_NAME,
                                InputFtrType.WEIGHT_COLUMN_NAME,
                                InputFtrType.UID_COLUMN_NAME)

        for ftr_type, ftr_name_lst in iterate_items_with_list_val(feature_type2name):
            if ftr_type in label_side_ftr_types:
                # The message is passed as the msg argument so it is actually reported on failure
                self.assertLen(ftr_name_lst, 1, f'Length for current ftr type ({ftr_type}) should be 1')
                self.assertIn(ftr_name_lst[0], label)
                continue
            for ftr_name in ftr_name_lst:
                self.assertIn(ftr_name, features)
                # First dimension of data should be batch_size
                self.assertEqual(features[ftr_name].shape[0], batch_size)

        weight_ftr_name = feature_type2name.get(InputFtrType.WEIGHT_COLUMN_NAME,
                                                constant.Constant()._DEFAULT_WEIGHT_FTR_NAME)
        self.assertAllEqual(tf.shape(label[weight_ftr_name]), [batch_size])

        uid_ftr_name = feature_type2name.get(InputFtrType.UID_COLUMN_NAME,
                                             constant.Constant()._DEFAULT_UID_FTR_NAME)
        self.assertAllEqual(tf.shape(label[uid_ftr_name]), [batch_size])

    def testRankingInputFnBuilderTfrecord(self):
        """Tests input_fn_tfrecord() on ranking data for every (strategy, feature config) combination."""
        one_device_strategy = distribution_utils.get_distribution_strategy('one_device', num_gpus=0)
        feature_type2name_list = [
            # Contains sparse features
            {InputFtrType.LABEL_COLUMN_NAME: 'label',
             InputFtrType.QUERY_COLUMN_NAME: 'query',
             InputFtrType.DOC_TEXT_COLUMN_NAMES: ['doc_headline', 'doc_title'],
             InputFtrType.USER_TEXT_COLUMN_NAMES: ['user_headline', 'user_title'],
             InputFtrType.DOC_ID_COLUMN_NAMES: ['doc_headline_id'],
             InputFtrType.USER_ID_COLUMN_NAMES: ['user_headline_id'],
             InputFtrType.DENSE_FTRS_COLUMN_NAMES: ['dense_ftrs'],
             InputFtrType.SPARSE_FTRS_COLUMN_NAMES: ['sparse_ftrs'],
             InputFtrType.WEIGHT_COLUMN_NAME: 'weight'
             },
            # No sparse features
            {InputFtrType.LABEL_COLUMN_NAME: 'label',
             InputFtrType.QUERY_COLUMN_NAME: 'query',
             InputFtrType.DOC_TEXT_COLUMN_NAMES: ['doc_headline', 'doc_title'],
             InputFtrType.USER_TEXT_COLUMN_NAMES: ['user_headline', 'user_title'],
             InputFtrType.DOC_ID_COLUMN_NAMES: ['doc_headline_id'],
             InputFtrType.USER_ID_COLUMN_NAMES: ['user_headline_id'],
             InputFtrType.DENSE_FTRS_COLUMN_NAMES: ['dense_ftrs'],
             InputFtrType.WEIGHT_COLUMN_NAME: 'weight'
             },
            # Sparse features only
            {InputFtrType.LABEL_COLUMN_NAME: 'label',
             InputFtrType.SPARSE_FTRS_COLUMN_NAMES: ['sparse_ftrs']}
        ]
        strategy_list = [None, one_device_strategy]
        for strategy, feature_type2name in product(strategy_list, feature_type2name_list):
            self._testRankingInputFnBuilderTfrecord(strategy, feature_type2name)

    def _testRankingInputFnBuilderTfrecord(self, strategy, feature_type2name):
        """Tests input_fn_tfrecord() on ranking data for the given strategy and feature config.

        :param strategy: distribution strategy used to distribute the dataset, or None to build the dataset directly
        :param feature_type2name: mapping from InputFtrType to feature name(s) to read
        """
        data_dir = self.ranking_data_dir
        feature_name2num = {'dense_ftrs': 2,
                            'sparse_ftrs': self.nums_sparse_ftrs[0]}
        batch_size = 2

        def _input_fn_tfrecord(ctx):
            return data_fn.input_fn_tfrecord(input_pattern=data_dir,
                                             batch_size=batch_size,
                                             mode=tf.estimator.ModeKeys.EVAL,
                                             feature_type2name=feature_type2name,
                                             feature_name2num=feature_name2num,
                                             input_pipeline_context=ctx)

        if strategy is not None:
            dataset = strategy.distribute_datasets_from_function(_input_fn_tfrecord)
        else:
            dataset = _input_fn_tfrecord(None)

        # Make iterator
        for features, label in dataset:
            self._assert_batch_structure(features, label, feature_type2name, batch_size)

            # First dimension of data should be batch_size
            self.assertEqual(label['label'].shape[0], batch_size)

            if InputFtrType.DOC_TEXT_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['doc_title'],
                                    tf.constant(
                                        [["document title 1", b"title 2 ?", b"doc title 3 ?", b"doc title 4 ?"],
                                         ["document title 1", b"title 2 ?", b"doc title 3 ?", b""]]
                                    ))

            if InputFtrType.DOC_ID_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['doc_headline_id'],
                                    tf.constant(
                                        [[b"document headline id 1", b"headline id 2 ?", b"doc headline id 3 ?",
                                          b"doc headline id 4 ?"],
                                         [b"document headline id 1", b"headline id 2 ?", b"doc headline id 3 ?", b""]]
                                    ))

            if InputFtrType.USER_TEXT_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['user_title'],
                                    tf.constant(
                                        [b"user title", b"user title"]
                                    ))

            if InputFtrType.USER_ID_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['user_headline_id'],
                                    tf.constant(
                                        [b"user headline id", b"user headline id"]
                                    ))

            if InputFtrType.DENSE_FTRS_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['dense_ftrs'],
                                    tf.constant(
                                        [[[23.0, 14.0], [44.0, -1.0], [22.0, 19.0], [22.0, 19.0]],
                                         [[23.0, 14.0], [44.0, -1.0], [22.0, 19.0], [0.0, 0.0]]]
                                    ))

            if InputFtrType.SPARSE_FTRS_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(tf.sparse.to_dense(features['sparse_ftrs']),
                                    tf.sparse.to_dense(tf.SparseTensor(indices=[[0, 0, 1],
                                                                                [0, 0, 5],
                                                                                [0, 1, 0],
                                                                                [0, 2, 2],
                                                                                [0, 3, 8],
                                                                                [1, 0, 1],
                                                                                [1, 0, 5],
                                                                                [1, 1, 0],
                                                                                [1, 2, 2]],
                                                                       values=[1., 5., 7., 12., -8., 1., 5., 7., 12.],
                                                                       dense_shape=[batch_size, 4, self.nums_sparse_ftrs[0]]))
                                    )
            # Only check the first batch
            break

    def testClassificationInputFnBuilderTfrecord(self):
        """Test classification input reader in eval mode"""
        data_dir = self.cls_data_dir

        feature_type2name = {
            InputFtrType.LABEL_COLUMN_NAME: 'label',
            InputFtrType.DOC_TEXT_COLUMN_NAMES: ['query_text'],
            InputFtrType.USER_TEXT_COLUMN_NAMES: ['user_headline'],
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: 'dense_ftrs',
        }
        feature_name2num = {
            'dense_ftrs': 8
        }

        batch_size = 2
        dataset = data_fn.input_fn_tfrecord(input_pattern=data_dir,
                                            batch_size=batch_size,
                                            mode=tf.estimator.ModeKeys.EVAL,
                                            task_type=TaskType.CLASSIFICATION,
                                            feature_type2name=feature_type2name,
                                            feature_name2num=feature_name2num)

        for features, label in dataset:
            # First dimension of data should be batch_size
            self._assert_batch_structure(features, label, feature_type2name, batch_size)

            self.assertAllEqual(label['label'].shape, [batch_size])

    def testBinaryClassificationInputFnBuilderTfrecord(self):
        """Test binary classification input reader """
        data_dir = self.binary_cls_data_dir

        feature_type2name = {
            InputFtrType.LABEL_COLUMN_NAME: 'label',
            InputFtrType.SPARSE_FTRS_COLUMN_NAMES: ['sparse_ftrs'],
            InputFtrType.SHALLOW_TOWER_SPARSE_FTRS_COLUMN_NAMES: ['shallow_tower_sparse_ftrs', 'sparse_ftrs']
        }
        feature_name2num = {
            'sparse_ftrs': 20,
            'shallow_tower_sparse_ftrs': 20
        }

        batch_size = 2
        dataset = data_fn.input_fn_tfrecord(input_pattern=data_dir,
                                            batch_size=batch_size,
                                            mode=tf.estimator.ModeKeys.EVAL,
                                            task_type=TaskType.BINARY_CLASSIFICATION,
                                            feature_type2name=feature_type2name,
                                            feature_name2num=feature_name2num
                                            )

        for features, label in dataset:
            # First dimension of data should be batch_size
            self._assert_batch_structure(features, label, feature_type2name, batch_size)

            self.assertAllEqual(label['label'].shape, [batch_size])
            self.assertAllEqual(tf.sparse.to_dense(features['sparse_ftrs']),
                                tf.sparse.to_dense(
                                    tf.SparseTensor(indices=[[0, 0],
                                                             [0, 2],
                                                             [0, 7],
                                                             [1, 0],
                                                             [1, 2],
                                                             [1, 7]],
                                                    values=[1, 0, 7, 1, 0, 7],
                                                    dense_shape=[batch_size, self.nums_sparse_ftrs[0]])
                                )
                                )
            # Only check first batch
            break

    def testRankingMultitaskInputFnBuilderTfrecord(self):
        """Test additional input from multitask training in eval mode"""
        data_dir = self.ranking_data_dir

        # Test minimum features required for multitask jobs
        feature_type2name = {
            InputFtrType.LABEL_COLUMN_NAME: 'label',
            InputFtrType.QUERY_COLUMN_NAME: 'query',
            InputFtrType.DOC_TEXT_COLUMN_NAMES: ['doc_headline', 'doc_title'],
            InputFtrType.USER_TEXT_COLUMN_NAMES: ['user_headline', 'user_title'],
            InputFtrType.DOC_ID_COLUMN_NAMES: ['doc_headline_id'],
            InputFtrType.USER_ID_COLUMN_NAMES: ['user_headline_id'],
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: ['dense_ftrs'],
            InputFtrType.WEIGHT_COLUMN_NAME: 'weight',
            InputFtrType.TASK_ID_COLUMN_NAME: 'task_id_field'
        }
        feature_name2num = {
            'dense_ftrs': 2
        }

        batch_size = 5
        dataset = data_fn.input_fn_tfrecord(input_pattern=data_dir,
                                            batch_size=batch_size,
                                            mode=tf.estimator.ModeKeys.EVAL,
                                            feature_type2name=feature_type2name,
                                            feature_name2num=feature_name2num)

        for features, label in dataset:
            # First dimension of data should be batch_size
            self._assert_batch_structure(features, label, feature_type2name, batch_size)

            # First dimension of data should be batch_size
            self.assertEqual(label['label'].shape[0], batch_size)

            task_ids = features['task_id_field']

            # Check task_id dimension size
            self.assertEqual(len(task_ids.shape), 1)

            # Check task_id value in the sample data
            for t_id in task_ids:
                self.assertAllEqual(t_id, 5)