Example #1
def input_fn_common(pattern, batch_size=hparams.test_batch_size, mode=tf.estimator.ModeKeys.EVAL, hvd_info=None):
    return lambda: input_fn(
        input_pattern=pattern, metadata_path=hparams.metadata_path, batch_size=batch_size, mode=mode,
        vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK), hvd_info=hvd_info,
        vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
        feature_names=hparams.feature_names, CLS=hparams.CLS, SEP=hparams.SEP, PAD=hparams.PAD,
        PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR, max_len=hparams.max_len, min_len=hparams.min_len,
        cnn_filter_window_size=max(hparams.filter_window_sizes) if hparams.ftr_ext == 'cnn' else 0)
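
A hedged usage note: input_fn_common returns a zero-argument closure, which is the form tf.estimator expects for input_fn. A minimal sketch of how it might be wired up (dev_pattern and estimator are assumed to be built elsewhere; they are not part of the snippet above):

    # Sketch only: "dev_pattern" and "estimator" are placeholders built elsewhere
    eval_input_fn = input_fn_common(dev_pattern, mode=tf.estimator.ModeKeys.EVAL)
    metrics = estimator.evaluate(input_fn=eval_input_fn)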
    def predict(self, output_dir, input_data_path, metadata_file,
                checkpoint_path, execution_context, schema_params):
        n_records = 0
        n_batch = 0
        # Predict on the dataset
        sharded_dataset_paths, file_level_sharding = shard_input_files(
            input_data_path, execution_context[constants.NUM_SHARDS],
            execution_context[constants.SHARD_INDEX])
        if file_level_sharding and len(sharded_dataset_paths) == 0:
            logger.info("No input dataset is found, returning...")
            return

        inference_dataset = lambda: input_fn(
            input_pattern=','.join(sharded_dataset_paths),  # noqa: E731
            # DeText uses metadata_path
            metadata_path=self.model_params.metadata_path,
            batch_size=self.model_params.test_batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            vocab_table=vocab_utils.read_tf_vocab(self.model_params.vocab_file,
                                                  self.model_params.UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(
                self.model_params.vocab_file_for_id_ftr, self.model_params.
                UNK_FOR_ID_FTR),
            feature_names=self.model_params.feature_names,
            CLS=self.model_params.CLS,
            SEP=self.model_params.SEP,
            PAD=self.model_params.PAD,
            PAD_FOR_ID_FTR=self.model_params.PAD_FOR_ID_FTR,
            max_len=self.model_params.max_len,
            min_len=self.model_params.min_len,
            cnn_filter_window_size=max(self.model_params.filter_window_sizes)
            if self.model_params.ftr_ext == 'cnn' else 0)

        self.estimator_based_model = detext_train.get_estimator(
            self.model_params,
            strategy=None,  # local mode
            best_checkpoint=self.best_checkpoint)
        output = self.estimator_based_model.predict(
            inference_dataset, yield_single_examples=False)
        detext_writer = DetextWriter(schema_params=schema_params)
        shard_index = execution_context[constants.SHARD_INDEX]
        output_file = os.path.join(output_dir,
                                   "part-{0:05d}.avro".format(shard_index))
        for batch_score in output:
            if n_batch == 0:
                with tf.io.gfile.GFile(output_file, 'wb') as f:
                    f.seekable = lambda: False
                    n_records, n_batch = detext_writer.save_batch(
                        f, batch_score, output_file, n_records, n_batch)
            else:
                with tf.io.gfile.GFile(output_file, 'ab+') as f:
                    f.seek(0, 2)
                    f.seekable = lambda: True
                    f.readable = lambda: True
                    n_records, n_batch = detext_writer.save_batch(
                        f, batch_score, output_file, n_records, n_batch)
        logger.info("{} batches, e.g. {} records inferenced".format(
            n_batch, n_records))
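
The sharded output naming above follows the Hadoop-style part-file convention. A minimal, self-contained sketch of the same format string:

    shard_index = 7
    output_file = "part-{0:05d}.avro".format(shard_index)
    assert output_file == "part-00007.avro"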
Example #3
    def testMultitaskInputFnBuilderTfrecord(self):
        """Test additional input from multitask training in eval mode"""
        res_dir = os.path.dirname(__file__) + '/../resources'

        # create a vocab table
        vocab_table = vocab_utils.read_tf_vocab(res_dir + '/vocab.txt', '[UNK]')

        # dataset dir
        data_dir = os.path.join(res_dir, 'train', 'multitask', 'tfrecord')

        # test minimum features required for multitask jobs
        feature_names = ('label', 'query', 'doc_field1', 'doc_field2', 'wide_ftrs', 'task_id')

        batch_size = 5
        dataset = data_fn.input_fn(input_pattern=data_dir,
                                   metadata_path=None,
                                   batch_size=batch_size,
                                   mode=tf.estimator.ModeKeys.EVAL,
                                   vocab_table=vocab_table,
                                   vocab_table_for_id_ftr=vocab_table,
                                   feature_names=feature_names,
                                   CLS='[CLS]',
                                   SEP='[SEP]',
                                   PAD='[PAD]',
                                   PAD_FOR_ID_FTR='[PAD]',
                                   max_len=16,
                                   cnn_filter_window_size=1)

        # Make iterator
        iterator = dataset.make_initializable_iterator()
        batch_data = iterator.get_next()

        with tf.Session() as sess:
            sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
            sess.run([iterator.initializer])
            batch_data_val, = sess.run([batch_data])
            features, label = batch_data_val

            # First dimension of data should be batch_size
            for ftr_name in feature_names:
                if ftr_name != 'label':
                    self.assertTrue(ftr_name in features)
                    self.assertTrue(features[ftr_name].shape[0] == batch_size)

            self.assertTrue(label['label'].shape[0] == batch_size)

            task_ids = features['task_id']

            # Check task_id dimension size
            self.assertEqual(len(task_ids.shape), 1)

            # Check task_id value in the sample data
            for t_id in task_ids:
                self.assertTrue(t_id in (0, 1))
Example #4
    def testGetInputFnCommon(self):
        """Tests get_input_fn_common"""
        feature_type2name = {
            InputFtrType.QUERY_COLUMN_NAME:
            'query',
            InputFtrType.DOC_TEXT_COLUMN_NAMES: ['doc_completedQuery'],
            InputFtrType.DOC_ID_COLUMN_NAMES: ['docId_completedQuery'],
            InputFtrType.USER_TEXT_COLUMN_NAMES:
            ['usr_headline', 'usr_skills', 'usr_currTitles'],
            InputFtrType.USER_ID_COLUMN_NAMES: ['usrId_currTitles'],
            InputFtrType.DENSE_FTRS_COLUMN_NAMES:
            'wide_ftrs',
            InputFtrType.LABEL_COLUMN_NAME:
            'label',
            InputFtrType.WEIGHT_COLUMN_NAME:
            'weight'
        }
        feature_name2num = {'wide_ftrs': 5}

        _, vocab_tf_table = vocab_utils.read_tf_vocab(self.vocab_file,
                                                      self.UNK)
        vocab_table = vocab_utils.read_vocab(self.vocab_file)
        data_dir = self.data_dir
        hparams = HParams(input_pattern=data_dir,
                          filter_window_sizes=[10],
                          CLS=self.CLS,
                          PAD=self.PAD,
                          SEP=self.SEP,
                          UNK=self.UNK,
                          UNK_FOR_ID_FTR=self.UNK,
                          PAD_FOR_ID_FTR=self.PAD,
                          min_len=1,
                          max_len=16,
                          vocab_file=self.vocab_file,
                          vocab_file_for_id_ftr=self.vocab_file,
                          PAD_ID=vocab_table[self.PAD],
                          SEP_ID=vocab_table[self.SEP],
                          CLS_ID=vocab_table[self.CLS],
                          mode=tf.estimator.ModeKeys.EVAL,
                          task_type=self.task_type,
                          vocab_table=vocab_tf_table,
                          vocab_table_for_id_ftr=vocab_tf_table,
                          max_filter_window_size=3,
                          vocab_hub_url='',
                          vocab_hub_url_for_id_ftr='',
                          embedding_hub_url='',
                          embedding_hub_url_for_id_ftr='',
                          feature_type2name=feature_type2name,
                          feature_name2num=feature_name2num)
        train_model_helper.get_input_fn_common(data_dir, 1,
                                               tf.estimator.ModeKeys.TRAIN,
                                               hparams)
Example #5
    def testVocabLookUp(self):
        """Tests whether vocab lookup return the same result for str and unicode in python 2."""
        if six.PY2:
            # Switch to eager execution
            switch_to(EAGER_MODE)

            cur_dir = os.path.dirname(__file__)
            vocab_file = os.path.join(cur_dir, '..', 'resources', 'multilingual_vocab.txt.gz')
            vocab_table = vocab_utils.read_tf_vocab(vocab_file)

            def get_index(some_string):
                return vocab_table.lookup(tf.constant([some_string])).numpy()[0]

            unk = get_index(vocab_utils.UNK)

            # Non-existent word
            self.assertEqual(get_index('bj82149aksreuo'), unk)

            # ascii characters
            s = 'a'
            s_u = u'a'

            self.assertAllEqual(get_index(s), get_index(s_u))
            self.assertTrue(get_index(s) != unk)

            # Special non-ascii characters
            s = '##' + '¥'
            s_u = '##' + u'¥'

            self.assertAllEqual(get_index(s), get_index(s_u))
            self.assertTrue(get_index(s) != unk)

            # Special non-ascii characters ('€' is a stand-in; the original character was garbled)
            s = '€'
            s_u = u'€'

            self.assertAllEqual(get_index(s), get_index(s_u))
            self.assertTrue(get_index(s) != unk)

            # Large-scale test
            # Disable this if you want fast build
            with gzip.GzipFile(fileobj=tf.gfile.Open(vocab_file, 'r')) as fin:
                for line in fin:
                    line = line.strip('\n')
                    if line != vocab_utils.UNK:
                        self.assertAllEqual(get_index(line), get_index(vocab_utils.convert_to_unicode(line)))
                        self.assertTrue(get_index(line) != unk)

            # Switch to graph execution
            switch_to(GRAPH_MODE)
Example #6
def get_usr_fields(hparams):
    """
    Each user field has a placeholder.
    The regex is to add whitespace on both sides of punctuations.
    :param hparams: hparams
    :return:
    """
    usr_text_placeholders = []
    usr_fields = []
    tf_vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    for ftr_name in hparams.feature_names:
        if ftr_name.startswith('usr_'):
            # If hparams.add_first_dim_for_usr_placeholder is True, the usr placeholders have dimension [None]
            # This is to use the usr field features as document features in model serving
            if hparams.add_first_dim_for_usr_placeholder:
                # each user field is a placeholder (one string)
                placeholder = tf.placeholder(shape=[None],
                                             dtype=tf.string,
                                             name=ftr_name + "_placeholder")
            else:
                placeholder = tf.placeholder(shape=[],
                                             dtype=tf.string,
                                             name=ftr_name + "_placeholder")
            usr_text_placeholders.append(placeholder)

            one_usr_field = placeholder
            # add whitespace on both sides of punctuations if regex pattern is not None
            if hparams.regex_replace_pattern is not None:
                one_usr_field = tf.regex_replace(
                    input=one_usr_field,
                    pattern=hparams.regex_replace_pattern,
                    rewrite=" \\1 ")

            # remove added dimension
            if hparams.add_first_dim_for_usr_placeholder:
                one_usr_field = tf.squeeze(one_usr_field, [0])
            one_usr_field = tf.expand_dims(one_usr_field, axis=0)
            one_usr_field = data_fn.process_text(
                one_usr_field,
                tf_vocab_table,
                hparams.CLS,
                hparams.SEP,
                hparams.PAD,
                hparams.max_len,
                hparams.min_len,
                cnn_filter_window_size=max(hparams.filter_window_sizes)
                if hparams.ftr_ext == 'cnn' else 0)
            usr_fields.append(one_usr_field)
    return usr_fields, usr_text_placeholders
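
A plain-Python sketch of the whitespace padding that tf.regex_replace performs above. The punctuation pattern is only an assumption here, since hparams.regex_replace_pattern is not shown in the snippet:

    import re

    pattern = r'([.,!?()])'  # assumed punctuation-capturing pattern
    print(re.sub(pattern, r' \1 ', 'hello,world!'))  # -> 'hello , world ! '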
Example #7
def get_doc_id_fields(hparams):
    """ Returns a list of processed doc id fields and the corresponding list of raw doc id placeholders
    Each document id field has a placeholder
    """
    doc_id_placeholders = []
    doc_fields = []
    tf_vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr,
                                               hparams.UNK_FOR_ID_FTR)
    for ftr_name in hparams.feature_names:
        if ftr_name.startswith('docId_'):
            # each document id field is a placeholder (a string vector)
            placeholder = tf.placeholder(shape=[None],
                                         dtype=tf.string,
                                         name=ftr_name + "_placeholder")
            doc_id_placeholders.append(placeholder)

            one_doc_field = placeholder
            one_doc_field = data_fn.process_id(one_doc_field, tf_vocab_table,
                                               hparams.PAD_FOR_ID_FTR)
            one_doc_field = tf.expand_dims(one_doc_field, axis=0)
            doc_fields.append(one_doc_field)
    return doc_fields, doc_id_placeholders
Example #8
    def __init__(self, CLS, SEP, PAD, UNK, vocab_file):
        """ Initializes the vocabulary layer

        :param CLS Token that represents the start of a sentence
        :param SEP Token that represents the end of a segment
        :param PAD Token that represents padding
        :param UNK Token that represents unknown tokens
        :param vocab_file Path to the vocabulary file
        """
        super().__init__()
        self._vocab_table_initializer, self.vocab_table = read_tf_vocab(
            vocab_file, UNK)

        self._CLS = CLS
        self._SEP = SEP
        self._PAD = PAD

        py_vocab_table = read_vocab(vocab_file)
        self._pad_id = py_vocab_table[PAD]
        self._cls_id = py_vocab_table[CLS] if CLS else -1
        self._sep_id = py_vocab_table[SEP] if SEP else -1
        self._vocab_size = len(py_vocab_table)
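
Note that these examples show two calling conventions for read_tf_vocab: some call sites use the return value directly as a lookup table (vocab_table = vocab_utils.read_tf_vocab(...)), while others, like this layer, unpack a pair of (initializer, table). Which form applies presumably depends on the DeText version in use.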
Example #9
def get_doc_fields(hparams):
    """
    Each document field has a placeholder.
    The regex is to add whitespace on both sides of punctuations.
    :param hparams: hparams
    :return:
    """
    doc_text_placeholders = []
    doc_fields = []
    tf_vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    for ftr_name in hparams.feature_names:
        if ftr_name.startswith('doc_'):
            # each document field is a placeholder (a string vector)
            placeholder = tf.placeholder(shape=[None],
                                         dtype=tf.string,
                                         name=ftr_name + "_placeholder")
            doc_text_placeholders.append(placeholder)

            one_doc_field = placeholder
            # add whitespace on both sides of punctuations if regex pattern is not None
            if hparams.regex_replace_pattern is not None:
                one_doc_field = tf.regex_replace(
                    input=one_doc_field,
                    pattern=hparams.regex_replace_pattern,
                    rewrite=" \\1 ")
            one_doc_field = data_fn.process_text(
                one_doc_field,
                tf_vocab_table,
                hparams.CLS,
                hparams.SEP,
                hparams.PAD,
                hparams.max_len,
                hparams.min_len,
                cnn_filter_window_size=max(hparams.filter_window_sizes)
                if hparams.ftr_ext == 'cnn' else 0)
            one_doc_field = tf.expand_dims(one_doc_field, axis=0)
            doc_fields.append(one_doc_field)
    return doc_fields, doc_text_placeholders
Example #10
def get_query(hparams, regex_replace_pattern, add_dimension=False):
    """
    Helper function to get query and query_placeholder
    :param hparams: hparams
    :param regex_replace_pattern: The regex pattern to add a white space before and after
    :param add_dimension: whether to add a dimension to the query and later remove it (this is to support the online
    model for QAP, as quasar model serving requires at least one dimension)
    :return: query and query_placeholder
    """
    # query text feature
    if add_dimension:
        query_placeholder, query = create_placeholder_for_ftrs(
            "query_placeholder", [None], tf.string, 'query',
            hparams.feature_names)
    else:
        query_placeholder, query = create_placeholder_for_ftrs(
            "query_placeholder", [], tf.string, 'query', hparams.feature_names)
    if query is not None:
        if add_dimension:
            # remove added dimension
            query = tf.squeeze(query, [0])

        # tokenize query
        if regex_replace_pattern is not None:
            query = tf.regex_replace(input=query,
                                     pattern=regex_replace_pattern,
                                     rewrite=" \\1 ")

        query = data_fn.process_text(
            query,
            vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
            hparams.CLS,
            hparams.SEP,
            hparams.PAD,
            hparams.max_len,
            hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes))
    return query, query_placeholder
Example #11
def get_query(hparams):
    """
    Helper function to get query and query_placeholder
    :param hparams: hparams
    :return: query and query_placeholder
    """
    # query text feature
    # If hparams.add_first_dim_for_query_placeholder is True, the query placeholder has dimension [None]
    # This is to use the query feature as a document feature in model serving
    if hparams.add_first_dim_for_query_placeholder:
        query_placeholder, query = create_placeholder_for_ftrs(
            "query_placeholder", [None], tf.string, 'query',
            hparams.feature_names)
    else:
        query_placeholder, query = create_placeholder_for_ftrs(
            "query_placeholder", [], tf.string, 'query', hparams.feature_names)
    if query is not None:
        if hparams.add_first_dim_for_query_placeholder:
            # remove added dimension
            query = tf.squeeze(query, [0])

        # tokenize query
        if hparams.regex_replace_pattern is not None:
            query = tf.regex_replace(input=query,
                                     pattern=hparams.regex_replace_pattern,
                                     rewrite=" \\1 ")

        query = data_fn.process_text(
            query,
            vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
            hparams.CLS,
            hparams.SEP,
            hparams.PAD,
            hparams.max_len,
            hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes)
            if hparams.ftr_ext == 'cnn' else 0)
    return query, query_placeholder
Example #12
def get_usr_id_fields(hparams):
    """ Returns a list of processed usr id fields and the corresponding list of raw usr id placeholders
    Each user field has a placeholder
    """
    usr_id_placeholders = []
    usr_fields = []
    tf_vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr,
                                               hparams.UNK_FOR_ID_FTR)
    for ftr_name in hparams.feature_names:
        if ftr_name.startswith('usrId_'):
            # each user id field is a placeholder (one string)
            placeholder = tf.placeholder(shape=[],
                                         dtype=tf.string,
                                         name=ftr_name + "_placeholder")
            usr_id_placeholders.append(placeholder)

            one_usr_field = placeholder

            one_usr_field = tf.expand_dims(one_usr_field, axis=0)
            one_usr_field = data_fn.process_id(one_usr_field, tf_vocab_table,
                                               hparams.PAD_FOR_ID_FTR)

            usr_fields.append(one_usr_field)
    return usr_fields, usr_id_placeholders
Example #13
    def testInputFnBuilderTfrecord(self):
        """Test function input_fn_builder() in eval mode"""
        res_dir = os.path.dirname(__file__) + '/../resources'

        # create a vocab table
        vocab_table = vocab_utils.read_tf_vocab(res_dir + '/vocab.txt',
                                                '[UNK]')

        # dataset dir
        data_dir = os.path.join(res_dir, 'train', 'dataset')
        input_files = os.path.join(data_dir, '*.tfrecord')

        # create a dataset.
        # Read schema
        # Parse and process data in dataset
        feature_names = ('label', 'query', 'doc_completedQuery',
                         'usr_headline', 'usr_skills', 'usr_currTitles',
                         'usrId_currTitles', 'docId_completedQuery',
                         'wide_ftrs', 'weight')

        batch_size = 2
        dataset = data_fn.input_fn(input_pattern=input_files,
                                   metadata_path=None,
                                   batch_size=batch_size,
                                   mode=tf.estimator.ModeKeys.EVAL,
                                   vocab_table=vocab_table,
                                   vocab_table_for_id_ftr=vocab_table,
                                   feature_names=feature_names,
                                   CLS='[CLS]',
                                   SEP='[SEP]',
                                   PAD='[PAD]',
                                   PAD_FOR_ID_FTR='[PAD]',
                                   max_len=16,
                                   cnn_filter_window_size=1)

        # Make iterator
        iterator = dataset.make_initializable_iterator()
        batch_data = iterator.get_next()

        with tf.Session() as sess:
            sess.run(
                [tf.global_variables_initializer(),
                 tf.tables_initializer()])
            sess.run([iterator.initializer])
            batch_data_val, = sess.run([batch_data])
            features, label = batch_data_val

            # First dimension of data should be batch_size
            for ftr_name in feature_names:
                if ftr_name != 'label':
                    self.assertTrue(ftr_name in features)
                    self.assertTrue(features[ftr_name].shape[0] == batch_size)

            self.assertTrue(label['label'].shape[0] == batch_size)

            doc_completedQuery = features['doc_completedQuery']
            docId_completedQuery = features['docId_completedQuery']
            usr_currTitles = features['usr_currTitles']
            usrId_currTitles = features['usrId_currTitles']

            # vocab[PAD] == PAD_ID
            self.assertTrue(doc_completedQuery[0, 0, -1] == self.PAD_ID)
            self.assertTrue(docId_completedQuery[0, 0, -1] == self.PAD_ID)

            # vocab[CLS] == CLS_ID
            self.assertTrue(np.all(doc_completedQuery[0, 0, 0] == self.CLS_ID))
            self.assertTrue(np.all(usr_currTitles[0, 0] == self.CLS_ID))

            # No CLS in id feature
            self.assertTrue(
                np.all(docId_completedQuery[:, :, 0] != self.CLS_ID))

            # In this TFRecord file, we populate docId_completedQuery using doc_completedQuery
            # doc id feature should be the same as doc text feature except CLS and SEP addition
            # Here we make sure this is correct for the first sample
            for text_arr, id_arr in zip(doc_completedQuery[0],
                                        docId_completedQuery[0]):
                self.assertAllEqual(text_arr[text_arr != self.PAD_ID][1:-1],
                                    id_arr[id_arr != self.PAD_ID])

            # In this TFRecord file, we populate usrId_currTitles using usr_currTitles
            # usr id feature should be the same as usr text feature except CLS and SEP addition
            for text_arr, id_arr in zip(usr_currTitles, usrId_currTitles):
                self.assertAllEqual(text_arr[text_arr != self.PAD_ID][1:-1],
                                    id_arr[id_arr != self.PAD_ID])
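
A minimal NumPy sketch of the comparison logic in the last two assertions: drop PAD ids, then strip the leading CLS and trailing SEP before comparing the text feature against the id feature. The concrete id values below are placeholders, not values from vocab.txt:

    import numpy as np

    PAD_ID, CLS_ID, SEP_ID = 0, 101, 102  # placeholder ids for illustration only
    text_arr = np.array([CLS_ID, 7, 8, 9, SEP_ID, PAD_ID, PAD_ID])
    id_arr = np.array([7, 8, 9, PAD_ID])
    assert np.array_equal(text_arr[text_arr != PAD_ID][1:-1],
                          id_arr[id_arr != PAD_ID])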
Example #14
class TestFeatureGrouper(tf.test.TestCase, DataSetup):
    """Unit test for feature_grouper.py"""
    _, vocab_tf_table = vocab_utils.read_tf_vocab(DataSetup.vocab_file,
                                                  '[UNK]')
    vocab_table = vocab_utils.read_vocab(DataSetup.vocab_file)

    PAD_ID = vocab_table[DataSetup.PAD]
    SEP_ID = vocab_table[DataSetup.SEP]
    CLS_ID = vocab_table[DataSetup.CLS]
    UNK_ID = vocab_table[DataSetup.UNK]

    max_filter_window_size = 0

    def testFeatureGrouperKerasInput(self):
        """Tests FeatureGrouper with tf.keras.Input"""
        nums_dense_ftrs = [2, 3]
        nums_sparse_ftrs = [10, 30]
        layer = FeatureGrouper()
        inputs = {
            InputFtrType.QUERY_COLUMN_NAME:
            tf.keras.Input(shape=(), dtype='string'),
            InputFtrType.USER_TEXT_COLUMN_NAMES:
            [tf.keras.Input(shape=(), dtype='string')],
            InputFtrType.USER_ID_COLUMN_NAMES:
            [tf.keras.Input(shape=(), dtype='string')],
            InputFtrType.DOC_TEXT_COLUMN_NAMES:
            [tf.keras.Input(shape=(None, ), dtype='string')],
            InputFtrType.DOC_ID_COLUMN_NAMES:
            [tf.keras.Input(shape=(None, ), dtype='string')],
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: [
                tf.keras.Input(shape=(num_dense_ftrs, ), dtype='float32')
                for num_dense_ftrs in nums_dense_ftrs
            ],
            InputFtrType.SPARSE_FTRS_COLUMN_NAMES: [
                tf.keras.Input(shape=(num_sparse_ftrs, ),
                               dtype='float32',
                               sparse=True)
                for num_sparse_ftrs in nums_sparse_ftrs
            ]
        }
        outputs = layer(inputs)
        self.assertLen(outputs, len(inputs))

    def testFeatureGrouperTensor(self):
        """Tests FeatureGrouper with tensor input"""
        layer = FeatureGrouper()
        inputs = {
            InputFtrType.QUERY_COLUMN_NAME:
            tf.constant(['batch 1 user 1 build', 'batch 2 user 2 word'],
                        dtype=tf.string),
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: [
                tf.constant([[1, 1], [2, 2]], dtype=tf.float32),
                tf.constant([[0], [1]], dtype=tf.float32)
            ],
            InputFtrType.SPARSE_FTRS_COLUMN_NAMES: [
                tf.sparse.from_dense(
                    tf.constant([[1, 0], [2, 0]], dtype=tf.float32)),
                tf.sparse.from_dense(tf.constant([[1], [1]], dtype=tf.float32))
            ]
        }
        expected_result = {
            InputFtrType.QUERY_COLUMN_NAME:
            tf.constant(['batch 1 user 1 build', 'batch 2 user 2 word'],
                        dtype=tf.string),
            InputFtrType.DENSE_FTRS_COLUMN_NAMES:
            tf.constant([[1, 1, 0], [2, 2, 1]]),
            InputFtrType.SPARSE_FTRS_COLUMN_NAMES: [
                tf.constant([[1, 0], [2, 0]], dtype=tf.float32),
                tf.constant([[1], [1]], dtype=tf.float32)
            ]
        }
        outputs = layer(inputs)

        self.assertEqual(len(outputs), len(expected_result),
                         "Outputs must have the same length")
        for ftr_type, expected_ftr in expected_result.items():
            output = outputs[ftr_type]
            if ftr_type == InputFtrType.SPARSE_FTRS_COLUMN_NAMES:
                output = [tf.sparse.to_dense(t) for t in output]
                for e, o in zip(expected_ftr, output):
                    self.assertAllEqual(e, o)
                continue
            self.assertAllEqual(expected_ftr, output)

    def testConcatFtrOnLastDim(self):
        """Tests concatenate features on last dimension"""
        tensor_lst = [
            tf.constant([1, 2, 3], dtype='int32'),
            tf.constant([4, 5, 6], dtype='int32')
        ]
        result = feature_grouper.concat_on_last_axis_dense(tensor_lst)
        expected_output = tf.constant([1, 2, 3, 4, 5, 6], dtype='int32')
        self.assertAllEqual(result, expected_output)
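
A short sketch of what concat_on_last_axis_dense is expected to compute in the test above, assuming it behaves like tf.concat on the last axis:

    import tensorflow as tf

    a = tf.constant([1, 2, 3], dtype='int32')
    b = tf.constant([4, 5, 6], dtype='int32')
    print(tf.concat([a, b], axis=-1))  # tf.Tensor([1 2 3 4 5 6], shape=(6,), dtype=int32)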
Example #15
def train(hparams, input_fn):
    """
    Main function for train/evaluate DeText ranking model
    :param hparams: hparams
    :param input_fn: input function to create train/eval specs
    :return:
    """
    eval_log_file = None
    if hparams.use_horovod is True:
        import horovod.tensorflow as hvd
        eval_log_file = path_join(hparams.out_dir, 'eval_log.txt')
    train_strategy = tf.contrib.distribute.ParameterServerStrategy()
    estimator = get_estimator(hparams, strategy=train_strategy)

    # Set model export config for evaluator or primary worker of horovod
    exporter_list = None
    if hparams.use_horovod is False or (hparams.use_horovod is True
                                        and hvd.rank() == 0):
        best_model_name = 'best_' + hparams.pmetric
        # Exporter to save best (in terms of pmetric) checkpoint in the folder [best_model_name],
        # and export to savedmodel for prediction.
        best_checkpoint_exporter = BestCheckpointCopier(
            name=best_model_name,
            serving_input_receiver_fn=lambda: serving_input_fn(hparams),
            checkpoints_to_keep=1,  # keeping the best checkpoint
            exports_to_keep=1,  # keeping the best savedmodel
            pmetric='metric/{}'.format(hparams.pmetric),
            compare_fn=lambda x, y: x.score > y.score,  # larger metric better
            sort_reverse=True,
            eval_log_file=eval_log_file)
        exporter_list = [best_checkpoint_exporter]

    # Handle sync distributed training case via use_horovod
    if hparams.use_horovod:
        import horovod.tensorflow as hvd

        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
        # rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights or
        # restored from a checkpoint.
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Create TrainSpec for model training
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(
            input_pattern=hparams.train_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.train_batch_size,
            mode=tf.estimator.ModeKeys.TRAIN,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.
                                                  UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(
                hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes)
            if hparams.ftr_ext == 'cnn' else 0,
            # Add horovod information if applicable
            hvd_info=hparams.hvd_info if hparams.use_horovod else None),
        hooks=[bcast_hook] if hparams.use_horovod else
        None,  # Ensure proper initialization with horovod
        max_steps=hparams.num_train_steps)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(
            input_pattern=hparams.dev_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.test_batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.
                                                  UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(
                hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes)
            if hparams.ftr_ext == 'cnn' else 0),
        exporters=exporter_list,
        steps=None,
        # Set throttle_secs to 10 min to avoid warning to spam logs
        # Set throttle to 0 for horovod: https://github.com/horovod/horovod/issues/182#issuecomment-533897757
        throttle_secs=0 if hparams.use_horovod else 600,
        start_delay_secs=10)

    # Training and evaluation with dev set
    tf.estimator.train_and_evaluate(estimator=estimator,
                                    train_spec=train_spec,
                                    eval_spec=eval_spec)
    print("***** Training finished. *****")

    # Evaluation with test set: create an estimator with the best_checkpoint_dir to load the best model
    task_type = executor_utils.get_executor_task_type()
    do_evaluate = task_type == executor_utils.EVALUATOR or task_type == executor_utils.LOCAL_MODE
    if (not hparams.use_horovod and do_evaluate) or (hparams.use_horovod
                                                     and hvd.rank() == 0):
        best_checkpoint_dir = path_join(hparams.out_dir, best_model_name)
        estimator_savedmodel = get_estimator(
            hparams,
            strategy=train_strategy,
            best_checkpoint=best_checkpoint_dir)
        result = estimator_savedmodel.evaluate(input_fn=lambda: input_fn(
            input_pattern=hparams.test_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.test_batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.
                                                  UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(
                hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes)
            if hparams.ftr_ext == 'cnn' else 0))
        print("\n***** Evaluation on test set with best exported model: *****")
        for key in sorted(result.keys()):
            print("%s = %s" % (key, str(result[key])))
Example #16
class TestData(tf.test.TestCase, DataSetup):
    """Unit test for data_fn."""
    _, vocab_tf_table = vocab_utils.read_tf_vocab(DataSetup.vocab_file, '[UNK]')
    vocab_table = vocab_utils.read_vocab(DataSetup.vocab_file)

    CLS = '[CLS]'
    PAD = '[PAD]'
    SEP = '[SEP]'

    PAD_ID = vocab_table[PAD]
    SEP_ID = vocab_table[SEP]
    CLS_ID = vocab_table[CLS]

    nums_sparse_ftrs = [20]

    def testRankingInputFnBuilderTfrecord(self):
        """ Tests function input_fn_builder() """
        one_device_strategy = distribution_utils.get_distribution_strategy('one_device', num_gpus=0)
        feature_type2name_list = [
            # Contains sparse features
            {InputFtrType.LABEL_COLUMN_NAME: 'label',
             InputFtrType.QUERY_COLUMN_NAME: 'query',
             InputFtrType.DOC_TEXT_COLUMN_NAMES: ['doc_headline', 'doc_title'],
             InputFtrType.USER_TEXT_COLUMN_NAMES: ['user_headline', 'user_title'],
             InputFtrType.DOC_ID_COLUMN_NAMES: ['doc_headline_id'],
             InputFtrType.USER_ID_COLUMN_NAMES: ['user_headline_id'],
             InputFtrType.DENSE_FTRS_COLUMN_NAMES: ['dense_ftrs'],
             InputFtrType.SPARSE_FTRS_COLUMN_NAMES: ['sparse_ftrs'],
             InputFtrType.WEIGHT_COLUMN_NAME: 'weight'
             },
            # No sparse features
            {InputFtrType.LABEL_COLUMN_NAME: 'label',
             InputFtrType.QUERY_COLUMN_NAME: 'query',
             InputFtrType.DOC_TEXT_COLUMN_NAMES: ['doc_headline', 'doc_title'],
             InputFtrType.USER_TEXT_COLUMN_NAMES: ['user_headline', 'user_title'],
             InputFtrType.DOC_ID_COLUMN_NAMES: ['doc_headline_id'],
             InputFtrType.USER_ID_COLUMN_NAMES: ['user_headline_id'],
             InputFtrType.DENSE_FTRS_COLUMN_NAMES: ['dense_ftrs'],
             InputFtrType.WEIGHT_COLUMN_NAME: 'weight'
             },
            # Sparse features only
            {InputFtrType.LABEL_COLUMN_NAME: 'label',
             InputFtrType.SPARSE_FTRS_COLUMN_NAMES: ['sparse_ftrs']}
        ]
        strategy_list = [None, one_device_strategy]

        for strategy, feature_type2name in product(strategy_list, feature_type2name_list):
            self._testRankingInputFnBuilderTfrecord(strategy, feature_type2name)

    def _testRankingInputFnBuilderTfrecord(self, strategy, feature_type2name):
        """ Tests function input_fn_builder() for given strategy """
        data_dir = self.ranking_data_dir
        feature_name2num = {'dense_ftrs': 2, 'sparse_ftrs': self.nums_sparse_ftrs[0]}

        def _input_fn_tfrecord(ctx):
            return data_fn.input_fn_tfrecord(input_pattern=data_dir,
                                             batch_size=batch_size,
                                             mode=tf.estimator.ModeKeys.EVAL,
                                             feature_type2name=feature_type2name,
                                             feature_name2num=feature_name2num,
                                             input_pipeline_context=ctx)

        batch_size = 2
        if strategy is not None:
            dataset = strategy.distribute_datasets_from_function(_input_fn_tfrecord)
        else:
            dataset = _input_fn_tfrecord(None)

        # Make iterator
        for features, label in dataset:
            for ftr_type, ftr_name_lst in iterate_items_with_list_val(feature_type2name):
                if ftr_type in (InputFtrType.LABEL_COLUMN_NAME, InputFtrType.WEIGHT_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME):
                    self.assertLen(ftr_name_lst, 1, f'Length for current ftr type ({ftr_type}) should be 1')
                    ftr_name = ftr_name_lst[0]
                    self.assertIn(ftr_name, label)
                    continue

                for ftr_name in ftr_name_lst:
                    self.assertIn(ftr_name, features)
                    # First dimension of data should be batch_size
                    self.assertTrue(features[ftr_name].shape[0] == batch_size)

            weight_ftr_name = feature_type2name.get(InputFtrType.WEIGHT_COLUMN_NAME, constant.Constant()._DEFAULT_WEIGHT_FTR_NAME)
            self.assertAllEqual(tf.shape(label[weight_ftr_name]), [batch_size])

            uid_ftr_name = feature_type2name.get(InputFtrType.UID_COLUMN_NAME, constant.Constant()._DEFAULT_UID_FTR_NAME)
            self.assertAllEqual(tf.shape(label[uid_ftr_name]), [batch_size])

            # First dimension of data should be batch_size
            self.assertEqual(label['label'].shape[0], batch_size)

            if InputFtrType.DOC_TEXT_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['doc_title'],
                                    tf.constant(
                                        [["document title 1", b"title 2 ?", b"doc title 3 ?", b"doc title 4 ?"],
                                         ["document title 1", b"title 2 ?", b"doc title 3 ?", b""]]
                                    ))

            if InputFtrType.DOC_ID_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['doc_headline_id'],
                                    tf.constant(
                                        [[b"document headline id 1", b"headline id 2 ?", b"doc headline id 3 ?", b"doc headline id 4 ?"],
                                         [b"document headline id 1", b"headline id 2 ?", b"doc headline id 3 ?", b""]]
                                    ))

            if InputFtrType.USER_TEXT_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['user_title'],
                                    tf.constant(
                                        [b"user title", b"user title"]
                                    ))
            if InputFtrType.USER_ID_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['user_headline_id'],
                                    tf.constant(
                                        [b"user headline id", b"user headline id"]
                                    ))

            if InputFtrType.DENSE_FTRS_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(features['dense_ftrs'],
                                    tf.constant(
                                        [[[23.0, 14.0], [44.0, -1.0], [22.0, 19.0], [22.0, 19.0]],
                                         [[23.0, 14.0], [44.0, -1.0], [22.0, 19.0], [0.0, 0.0]]]
                                    ))

            if InputFtrType.SPARSE_FTRS_COLUMN_NAMES in feature_type2name:
                self.assertAllEqual(tf.sparse.to_dense(features['sparse_ftrs']),
                                    tf.sparse.to_dense(tf.SparseTensor(indices=[[0, 0, 1],
                                                                                [0, 0, 5],
                                                                                [0, 1, 0],
                                                                                [0, 2, 2],
                                                                                [0, 3, 8],
                                                                                [1, 0, 1],
                                                                                [1, 0, 5],
                                                                                [1, 1, 0],
                                                                                [1, 2, 2]],
                                                                       values=[1., 5., 7., 12., -8., 1., 5., 7., 12.],
                                                                       dense_shape=[batch_size, 4, self.nums_sparse_ftrs[0]]))
                                    )

            # Only check the first batch
            break

    def testClassificationInputFnBuilderTfrecord(self):
        """Test classification input reader in eval mode"""
        data_dir = self.cls_data_dir

        feature_type2name = {
            InputFtrType.LABEL_COLUMN_NAME: 'label',
            InputFtrType.DOC_TEXT_COLUMN_NAMES: ['query_text'],
            InputFtrType.USER_TEXT_COLUMN_NAMES: ['user_headline'],
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: 'dense_ftrs',
        }
        feature_name2num = {
            'dense_ftrs': 8
        }

        batch_size = 2
        dataset = data_fn.input_fn_tfrecord(input_pattern=data_dir,
                                            batch_size=batch_size,
                                            mode=tf.estimator.ModeKeys.EVAL,
                                            task_type=TaskType.CLASSIFICATION,
                                            feature_type2name=feature_type2name,
                                            feature_name2num=feature_name2num)

        for features, label in dataset:
            # First dimension of data should be batch_size
            for ftr_type, ftr_name_lst in iterate_items_with_list_val(feature_type2name):
                if ftr_type in (InputFtrType.LABEL_COLUMN_NAME, InputFtrType.WEIGHT_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME):
                    self.assertLen(ftr_name_lst, 1, f'Length for current ftr type ({ftr_type}) should be 1')
                    ftr_name = ftr_name_lst[0]
                    self.assertIn(ftr_name, label)
                    continue
                for ftr_name in ftr_name_lst:
                    self.assertIn(ftr_name, features)
                    self.assertEqual(features[ftr_name].shape[0], batch_size)

            weight_ftr_name = feature_type2name.get(InputFtrType.WEIGHT_COLUMN_NAME, constant.Constant()._DEFAULT_WEIGHT_FTR_NAME)
            self.assertAllEqual(tf.shape(label[weight_ftr_name]), [batch_size])

            uid_ftr_name = feature_type2name.get(InputFtrType.UID_COLUMN_NAME, constant.Constant()._DEFAULT_UID_FTR_NAME)
            self.assertAllEqual(tf.shape(label[uid_ftr_name]), [batch_size])

            self.assertAllEqual(label['label'].shape, [batch_size])

    def testBinaryClassificationInputFnBuilderTfrecord(self):
        """Test binary classification input reader """
        data_dir = self.binary_cls_data_dir

        feature_type2name = {
            InputFtrType.LABEL_COLUMN_NAME: 'label',
            InputFtrType.SPARSE_FTRS_COLUMN_NAMES: ['sparse_ftrs'],
            InputFtrType.SHALLOW_TOWER_SPARSE_FTRS_COLUMN_NAMES: ['shallow_tower_sparse_ftrs', 'sparse_ftrs']
        }
        feature_name2num = {
            'sparse_ftrs': 20,
            'shallow_tower_sparse_ftrs': 20
        }

        batch_size = 2
        dataset = data_fn.input_fn_tfrecord(input_pattern=data_dir,
                                            batch_size=batch_size,
                                            mode=tf.estimator.ModeKeys.EVAL,
                                            task_type=TaskType.BINARY_CLASSIFICATION,
                                            feature_type2name=feature_type2name,
                                            feature_name2num=feature_name2num
                                            )

        for features, label in dataset:
            # First dimension of data should be batch_size
            for ftr_type, ftr_name_lst in iterate_items_with_list_val(feature_type2name):
                if ftr_type in (InputFtrType.LABEL_COLUMN_NAME, InputFtrType.WEIGHT_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME):
                    self.assertLen(ftr_name_lst, 1, f'Length for current ftr type ({ftr_type}) should be 1')
                    ftr_name = ftr_name_lst[0]
                    self.assertIn(ftr_name, label)
                    continue
                for ftr_name in ftr_name_lst:
                    self.assertIn(ftr_name, features)
                    self.assertEqual(features[ftr_name].shape[0], batch_size)

            weight_ftr_name = feature_type2name.get(InputFtrType.WEIGHT_COLUMN_NAME, constant.Constant()._DEFAULT_WEIGHT_FTR_NAME)
            self.assertAllEqual(tf.shape(label[weight_ftr_name]), [batch_size])

            uid_ftr_name = feature_type2name.get(InputFtrType.UID_COLUMN_NAME, constant.Constant()._DEFAULT_UID_FTR_NAME)
            self.assertAllEqual(tf.shape(label[uid_ftr_name]), [batch_size])

            self.assertAllEqual(label['label'].shape, [batch_size])
            self.assertAllEqual(tf.sparse.to_dense(features['sparse_ftrs']),
                                tf.sparse.to_dense(
                                    tf.SparseTensor(indices=[[0, 0],
                                                             [0, 2],
                                                             [0, 7],
                                                             [1, 0],
                                                             [1, 2],
                                                             [1, 7]],
                                                    values=[1, 0, 7, 1, 0, 7],
                                                    dense_shape=[batch_size, self.nums_sparse_ftrs[0]])
                                )
                                )

            # Only check first batch
            break

    def testRankingMultitaskInputFnBuilderTfrecord(self):
        """Test additional input from multitask training in eval mode"""
        data_dir = self.ranking_data_dir

        # Test minimum features required for multitask jobs
        feature_type2name = {
            InputFtrType.LABEL_COLUMN_NAME: 'label',
            InputFtrType.QUERY_COLUMN_NAME: 'query',
            InputFtrType.DOC_TEXT_COLUMN_NAMES: ['doc_headline', 'doc_title'],
            InputFtrType.USER_TEXT_COLUMN_NAMES: ['user_headline', 'user_title'],
            InputFtrType.DOC_ID_COLUMN_NAMES: ['doc_headline_id'],
            InputFtrType.USER_ID_COLUMN_NAMES: ['user_headline_id'],
            InputFtrType.DENSE_FTRS_COLUMN_NAMES: ['dense_ftrs'],
            InputFtrType.WEIGHT_COLUMN_NAME: 'weight',
            InputFtrType.TASK_ID_COLUMN_NAME: 'task_id_field'
        }
        feature_name2num = {
            'dense_ftrs': 2
        }

        batch_size = 5
        dataset = data_fn.input_fn_tfrecord(input_pattern=data_dir,
                                            batch_size=batch_size,
                                            mode=tf.estimator.ModeKeys.EVAL,
                                            feature_type2name=feature_type2name,
                                            feature_name2num=feature_name2num)

        for features, label in dataset:
            # First dimension of data should be batch_size
            for ftr_type, ftr_name_lst in iterate_items_with_list_val(feature_type2name):
                if ftr_type in (InputFtrType.LABEL_COLUMN_NAME, InputFtrType.WEIGHT_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME):
                    self.assertLen(ftr_name_lst, 1, f'Length for current ftr type ({ftr_type}) should be 1')
                    ftr_name = ftr_name_lst[0]
                    self.assertIn(ftr_name, label)
                    continue
                for ftr_name in ftr_name_lst:
                    self.assertIn(ftr_name, features)
                    self.assertEqual(features[ftr_name].shape[0], batch_size)

            weight_ftr_name = feature_type2name.get(InputFtrType.WEIGHT_COLUMN_NAME, constant.Constant()._DEFAULT_WEIGHT_FTR_NAME)
            self.assertAllEqual(tf.shape(label[weight_ftr_name]), [batch_size])

            uid_ftr_name = feature_type2name.get(InputFtrType.UID_COLUMN_NAME, constant.Constant()._DEFAULT_UID_FTR_NAME)
            self.assertAllEqual(tf.shape(label[uid_ftr_name]), [batch_size])

            # First dimension of data should be batch_size
            self.assertEqual(label['label'].shape[0], batch_size)

            task_ids = features['task_id_field']

            # Check task_id dimension size
            self.assertEqual(len(task_ids.shape), 1)

            # Check task_id value in the sample data
            for t_id in task_ids:
                self.assertAllEqual(t_id, 5)
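
For reference, a self-contained sketch of how the sparse assertion in testBinaryClassificationInputFnBuilderTfrecord above densifies (batch_size=2 and 20 sparse features, matching the test data):

    import tensorflow as tf

    st = tf.SparseTensor(indices=[[0, 0], [0, 2], [0, 7], [1, 0], [1, 2], [1, 7]],
                         values=[1, 0, 7, 1, 0, 7],
                         dense_shape=[2, 20])
    print(tf.sparse.to_dense(st))  # each row has values 1, 0, 7 at columns 0, 2, 7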