Example 1

A TF Transform preprocessing_fn that splits text into wordpieces and emits per-record vocabulary-coverage statistics (non-UNK wordpiece counts, preserved vs. dropped character counts) alongside the wordpieces and language code.
    def preprocessing_fn(inputs):
        """Preprocessing function used in TF Transform.

        Args:
            inputs: the input dataset of tf.Examples

        Returns:
            preprocessed outputs
        """
        vocab_table = tf.lookup.StaticHashTable(
            tf.lookup.TextFileInitializer(vocab_file, tf.string,
                                          tf.lookup.TextFileIndex.WHOLE_LINE,
                                          tf.int64,
                                          tf.lookup.TextFileIndex.LINE_NUMBER),
            -1)

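        # Basic tokenization first (text -> tokens), then wordpiece splitting
        # (tokens -> wordpieces), yielding a [batch, token, wordpiece] ragged
        # tensor. The no-arg BertTokenizer() is presumably a basic
        # (pre-wordpiece) tokenizer from the enclosing module, since
        # wordpiece splitting happens separately below.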
        tokenizer = BertTokenizer()
        tokens = tokenizer.tokenize(inputs[text_key])
        wordpiece_tokenizer = WordpieceTokenizer(vocab_table,
                                                 token_out_type=tf.string)
        wordpieces = wordpiece_tokenizer.tokenize(tokens)
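        # Round-trip through flat_values so the innermost values carry a
        # known static shape, then rebuild the ragged tensor from the same
        # nested row splits.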
        wordpieces_flat = wordpieces.flat_values
        wordpieces_flat.set_shape([None])
        wordpieces = tf.RaggedTensor.from_nested_row_splits(
            wordpieces_flat, wordpieces.nested_row_splits)

        known_mask = tf.cast(tf.not_equal(wordpieces, '[UNK]'), tf.int32)
        num_non_unk_wordpieces = tf.reduce_sum(known_mask, axis=[1, 2])

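        # Characters count as dropped when their token maps to any '[UNK]'
        # wordpiece; everything else is preserved.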
        wordpiece_is_unknown = tf.equal(wordpieces, '[UNK]')
        token_has_unknown = tf.reduce_any(wordpiece_is_unknown, axis=-1)
        unknown_tokens = tf.ragged.boolean_mask(tokens, token_has_unknown)
        unknown_lengths = tf.strings.length(unknown_tokens)
        num_dropped_chars = tf.math.reduce_sum(unknown_lengths, axis=1)

        token_lengths = tf.strings.length(tokens)
        total_chars = tf.reduce_sum(token_lengths, axis=-1)
        num_preserved_chars = total_chars - num_dropped_chars

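        # Collapse the token dimension: [batch, token, wordpiece] ->
        # [batch, wordpiece] (equivalent to wordpieces.merge_dims(1, 2)).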
        flattened = tf.RaggedTensor.from_row_splits(
            wordpieces.flat_values,
            tf.gather(wordpieces.values.row_splits, wordpieces.row_splits))

        outputs = {}
        outputs['num_non_unk_wordpieces'] = tf.cast(num_non_unk_wordpieces,
                                                    tf.int64)
        outputs['num_dropped_chars'] = tf.cast(num_dropped_chars, tf.int64)
        outputs['num_preserved_chars'] = tf.cast(num_preserved_chars, tf.int64)
        outputs['wordpieces'] = flattened.to_sparse()
        outputs['lang'] = tf.convert_to_tensor(inputs[language_code_key])

        return outputs
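
To make the ragged structure concrete, here is a minimal, self-contained sketch of the token -> wordpiece nesting that Example 1 relies on, and of the row-splits trick it uses to collapse the per-token dimension. The tiny in-memory vocabulary and the use of WhitespaceTokenizer as a stand-in for the basic tokenizer are assumptions for illustration, not part of the original pipeline.

    import tensorflow as tf
    import tensorflow_text as tf_text

    # Tiny in-memory vocab standing in for vocab_file; purely illustrative.
    vocab = ['they', "##'", '##re', 'the', '##ir', '[UNK]']
    vocab_table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            vocab, tf.range(len(vocab), dtype=tf.int64)), -1)

    tokens = tf_text.WhitespaceTokenizer().tokenize(["they're their", 'xyzzy'])
    wordpieces = tf_text.WordpieceTokenizer(
        vocab_table, token_out_type=tf.string).tokenize(tokens)
    # Ragged shape is [batch, token, wordpiece], hence the axis=[1, 2]
    # reductions in Example 1.
    print(wordpieces)
    # [[[b'they', b"##'", b'##re'], [b'the', b'##ir']], [[b'[UNK]']]]

    # Per-record count of non-UNK wordpieces, as in Example 1: [5, 0].
    known = tf.cast(tf.not_equal(wordpieces, '[UNK]'), tf.int32)
    print(tf.reduce_sum(known, axis=[1, 2]))

    # The from_row_splits/gather construction collapses the token dimension,
    # [batch, token, wordpiece] -> [batch, wordpiece]; it matches
    # wordpieces.merge_dims(1, 2).
    flattened = tf.RaggedTensor.from_row_splits(
        wordpieces.flat_values,
        tf.gather(wordpieces.values.row_splits, wordpieces.row_splits))
    print(flattened)
    # [[b'they', b"##'", b'##re', b'the', b'##ir'], [b'[UNK]']]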
Example 2

A simpler TF Transform preprocessing_fn that only tokenizes the text and passes the language code through.
  def preprocessing_fn(inputs):
    """Function used to transform dataset using TF Transform.

    Tokenizes the input text and passes the associated language code
    through unchanged.

    Args:
      inputs: dataset of tf.Examples containing text samples

    Returns:
      transformed outputs
    """

    outputs = {}

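    # As in Example 1, text_key and language_code_key are assumed to come
    # from the enclosing scope.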
    tokenizer = BertTokenizer()
    tokens = tokenizer.tokenize(inputs[text_key])
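    # Ragged tokens are converted to a tf.SparseTensor, a form TF Transform
    # accepts as an output value.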
    outputs['tokens'] = tokens.to_sparse()
    outputs['lang'] = tf.convert_to_tensor(inputs[language_code_key])

    return outputs
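
Either preprocessing_fn is meant to be handed to TF Transform's Beam implementation. Below is a minimal sketch of that wiring using the legacy instance-dict format; the feature names, feature spec, inline data, and temp directory are illustrative assumptions, not part of the original examples.

    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform.beam as tft_beam
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import schema_utils

    # Assumed names for the two raw features referenced via text_key and
    # language_code_key in the examples above.
    text_key = 'text'
    language_code_key = 'language_code'

    raw_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec({
            text_key: tf.io.FixedLenFeature([], tf.string),
            language_code_key: tf.io.FixedLenFeature([], tf.string),
        }))

    with beam.Pipeline() as pipeline:
      with tft_beam.Context(temp_dir='/tmp/tft_tmp'):  # assumed scratch dir
        raw_data = pipeline | beam.Create([
            {text_key: "they're their", language_code_key: 'en'},
        ])
        # preprocessing_fn is either of the functions defined above.
        (transformed_data, transformed_metadata), transform_fn = (
            (raw_data, raw_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))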