Example #1
    def testDetokenizeIsReversable(self):

        table = _CreateTable(_MIXED_LANG_VOCAB + [b""], 2)
        self.evaluate(table.initializer)

        tokenizer = WordpieceTokenizer(table)

        word_lists = [
            [b"hello", b"there", b"my", b"name", b"is", b"treadness"],
            [b"whatchamacallit?", b"you", b"said"],
            [_Utf8(u"大"), _Utf8(u"易")],
        ]
        words = ragged_factory_ops.constant(word_lists)

        subwords_ids = tokenizer.tokenize(words)

        # detokenize input shape is (batch, ragged-words, ragged-wordpieces)
        words_output = tokenizer.detokenize(subwords_ids)
        words_output = array_ops.squeeze(words_output, axis=-1)

        self.assertAllEqual(words_output, words)

        # detokenize input shape is (batch, ragged-wordpieces)
        subwords_id_seqs = subwords_ids.merge_dims(-2, -1)
        words_output = tokenizer.detokenize(subwords_id_seqs)
        self.assertAllEqual(words_output, words)

        # detokenize input shape is a dense (batch, padded-wordpieces)
        words_output = tokenizer.detokenize(
            subwords_ids.merge_dims(-2, -1)
            # ID len(_MIXED_LANG_VOCAB) maps to the padding token b""
            .to_tensor(default_value=len(_MIXED_LANG_VOCAB)))

        self.assertAllEqual(words_output, words)
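
A minimal sketch of the same round trip through the public tensorflow_text API (the test above relies on test-only helpers such as _CreateTable and _MIXED_LANG_VOCAB); the vocabulary and inputs here are illustrative, and it assumes a tensorflow_text release that provides WordpieceTokenizer.detokenize:

import tensorflow as tf
import tensorflow_text as tf_text

# Illustrative vocabulary; IDs are dense on [0, vocab_size), which
# detokenize() requires.
vocab = ["[UNK]", "hello", "there", "tread", "##ness"]
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string,
    value_dtype=tf.int64)
table = tf.lookup.StaticHashTable(init, default_value=0)

tokenizer = tf_text.WordpieceTokenizer(table, token_out_type=tf.int64)
words = tf.ragged.constant([["hello", "there"], ["treadness"]])
ids = tokenizer.tokenize(words)  # (batch, ragged-words, ragged-wordpieces)
roundtrip = tokenizer.detokenize(ids.merge_dims(-2, -1))
print(roundtrip)  # expected: [[b'hello', b'there'], [b'treadness']]
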
Example #2
    def preprocessing_fn(inputs):
        """Preprocessing function used in TF Transform.

    Args:
       inputs: the input dataset of tf.Examples

    Returns:
       preprocessed outputs
    """
        vocab_table = tf.lookup.StaticHashTable(
            tf.lookup.TextFileInitializer(vocab_file, tf.string,
                                          tf.lookup.TextFileIndex.WHOLE_LINE,
                                          tf.int64,
                                          tf.lookup.TextFileIndex.LINE_NUMBER),
            -1)

        tokenizer = BertTokenizer()
        tokens = tokenizer.tokenize(inputs[text_key])
        wordpiece_tokenizer = WordpieceTokenizer(vocab_table,
                                                 token_out_type=tf.string)
        wordpieces = wordpiece_tokenizer.tokenize(tokens)
        wordpieces_flat = wordpieces.flat_values
        wordpieces_flat.set_shape([None])
        wordpieces = tf.RaggedTensor.from_nested_row_splits(
            wordpieces_flat, wordpieces.nested_row_splits)

        known_mask = tf.cast(tf.not_equal(wordpieces, '[UNK]'), tf.int32)
        num_non_unk_wordpieces = tf.reduce_sum(known_mask, axis=[1, 2])

        wordpiece_is_unknown = tf.equal(wordpieces, '[UNK]')
        token_has_unknown = tf.reduce_any(wordpiece_is_unknown, axis=-1)
        unknown_tokens = tf.ragged.boolean_mask(tokens, token_has_unknown)
        unknown_lengths = tf.strings.length(unknown_tokens)
        num_dropped_chars = tf.math.reduce_sum(unknown_lengths, axis=1)

        token_lengths = tf.strings.length(tokens)
        total_chars = tf.reduce_sum(token_lengths, axis=-1)
        num_preserved_chars = total_chars - num_dropped_chars

        flattened = tf.RaggedTensor.from_row_splits(
            wordpieces.flat_values,
            tf.gather(wordpieces.values.row_splits, wordpieces.row_splits))

        outputs = {}
        outputs['num_non_unk_wordpieces'] = tf.cast(num_non_unk_wordpieces,
                                                    tf.int64)
        outputs['num_dropped_chars'] = tf.cast(num_dropped_chars, tf.int64)
        outputs['num_preserved_chars'] = tf.cast(num_preserved_chars, tf.int64)
        outputs['wordpieces'] = flattened.to_sparse()
        outputs['lang'] = tf.convert_to_tensor(inputs[language_code_key])

        return outputs
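
The [UNK] bookkeeping in the preprocessing function above is easy to check on a toy batch. A minimal eager-mode sketch with made-up tokens and wordpieces (illustrative values only):

import tensorflow as tf

# tokens is (batch, ragged-tokens); wordpieces is (batch, ragged-tokens,
# ragged-wordpieces), i.e. the shapes produced by the tokenizers above.
tokens = tf.ragged.constant([["hello", "qzx"], ["world"]])
wordpieces = tf.ragged.constant([[["hell", "##o"], ["[UNK]"]], [["world"]]])

wordpiece_is_unknown = tf.equal(wordpieces, "[UNK]")
token_has_unknown = tf.reduce_any(wordpiece_is_unknown, axis=-1)
unknown_tokens = tf.ragged.boolean_mask(tokens, token_has_unknown)
num_dropped_chars = tf.reduce_sum(tf.strings.length(unknown_tokens), axis=1)
print(num_dropped_chars)  # expected: [3 0] -- only "qzx" maps entirely to [UNK]
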
Example #3
  def testTensors(self,
                  tokens,
                  expected_subwords,
                  vocab,
                  expected_start=None,
                  expected_limit=None,
                  use_unknown_token=True,
                  token_out_type=dtypes.string):
    vocab_table = _CreateTable(vocab)
    self.evaluate(vocab_table.initializer)
    tokenizer = WordpieceTokenizer(vocab_table, token_out_type=token_out_type)
    subwords = tokenizer.tokenize(tokens)
    self.assertAllEqual(subwords, expected_subwords)
Example #4
  def testWordPieceOpWithMultipleRaggedRank(self,
                                            tokens,
                                            expected_subwords,
                                            vocab,
                                            expected_start=None,
                                            expected_limit=None,
                                            use_unknown_token=True,
                                            token_out_type=dtypes.string):
    for row_splits_dtype in (dtypes.int32, dtypes.int64):
      ragged_tokens = ragged_factory_ops.constant(
          tokens, row_splits_dtype=row_splits_dtype)
      vocab_table = _CreateTable(vocab)
      self.evaluate(vocab_table.initializer)
      tokenizer = WordpieceTokenizer(vocab_table, token_out_type=token_out_type)
      subwords = tokenizer.tokenize(ragged_tokens)
      self.assertAllEqual(subwords, expected_subwords)
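
A companion sketch of the behavior exercised above, through the public tensorflow_text API: tokenizing a ragged input with extra outer dimensions adds one inner ragged dimension of wordpieces and leaves the outer dimensions untouched. The vocabulary and inputs are illustrative:

import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["[UNK]", "they", "##'", "##re", "the", "great", "##est"]
init = tf.lookup.KeyValueTensorInitializer(
    vocab, tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string, value_dtype=tf.int64)
table = tf.lookup.StaticHashTable(init, default_value=0)
tokenizer = tf_text.WordpieceTokenizer(table, token_out_type=tf.string)

# (batch, ragged-sentences, ragged-tokens); int32 row splits work as well.
tokens = tf.ragged.constant(
    [[["they're", "the", "greatest"]], [["the", "greatest"]]],
    row_splits_dtype=tf.int32)
subwords = tokenizer.tokenize(tokens)
# expected: [[[["they", "##'", "##re"], ["the"], ["great", "##est"]]],
#            [[["the"], ["great", "##est"]]]]
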
Example #5
    def testDetokenizeFailsForSparseVocab(self):
        vocab = ["a", "##b", "##c"]
        ids = [0, 10, 20]
        init = lookup_ops.KeyValueTensorInitializer(vocab,
                                                    ids,
                                                    key_dtype=dtypes.string,
                                                    value_dtype=dtypes.int64)
        table = lookup_ops.StaticVocabularyTableV1(
            init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
        self.evaluate(table.initializer)

        tokenizer = WordpieceTokenizer(table)
        words = ragged_factory_ops.constant([["abb", "abc"], ["abcbc"]])
        subwords_ids = tokenizer.tokenize(words)

        with self.assertRaisesRegex(errors_impl.InvalidArgumentError,
                                    "detokenize.*?dense on the interval"):
            result = tokenizer.detokenize(subwords_ids)
            self.evaluate(result)
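
For contrast, a sketch of the same setup with IDs that are dense on [0, vocab_size), for which detokenize is expected to succeed. It uses the public tensorflow_text API with an illustrative vocabulary and assumes a release that provides WordpieceTokenizer.detokenize:

import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["a", "##b", "##c"]
init = tf.lookup.KeyValueTensorInitializer(
    vocab, [0, 1, 2], key_dtype=tf.string, value_dtype=tf.int64)
# Dense IDs 0..2, unlike the sparse [0, 10, 20] above.
table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=1)

tokenizer = tf_text.WordpieceTokenizer(table)
words = tf.ragged.constant([["abb", "abc"], ["abcbc"]])
ids = tokenizer.tokenize(words)
print(tokenizer.detokenize(ids.merge_dims(-2, -1)))
# expected: [[b'abb', b'abc'], [b'abcbc']]
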
Example #6
class BertTokenizer(Tokenizer):
  """Tokenizer used for BERT.

    This tokenizer applies an end-to-end, text string to wordpiece tokenization.
    It first applies basic tokenization, and then follwed by wordpiece
    tokenization.

    See BasicTokenizer and WordpieceTokenizer for their respective details.

  Attributes:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of decoding
      long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
    unknown_token: (optional) The value to use when an unknown token is found.
      Default is "[UNK]". If this is set to a string, and `token_out_type` is
      `tf.int64`, the `vocab_lookup_table` is used to convert the
      `unknown_token` to an integer. If this is set to `None`,
      out-of-vocabulary tokens are left as is.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
    lower_case: bool - If true, a preprocessing step is added to lowercase the
      text, apply NFD normalization, and strip accents characters.
    keep_whitespace: bool - If true, preserves whitespace characters instead of
      stripping them away.
    normalization_form: If true and lower_case=False, the input text will be
      normalized to `normalization_form`. See normalize_utf8() op for a list
      of valid values.
  """

  def __init__(self,
               vocab_lookup_table,
               suffix_indicator="##",
               max_bytes_per_word=100,
               max_chars_per_token=None,
               token_out_type=dtypes.int64,
               unknown_token="[UNK]",
               split_unknown_characters=False,
               lower_case=False,
               keep_whitespace=False,
               normalization_form=None):
    self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                           normalization_form)
    self._wordpiece_tokenizer = WordpieceTokenizer(
        vocab_lookup_table, suffix_indicator, max_bytes_per_word,
        max_chars_per_token, token_out_type, unknown_token,
        split_unknown_characters)

  def tokenize(self, text_input):
    """Performs untokenized text to wordpiece tokenization for BERT.

    Args:
      text_input: input: A `Tensor` or `RaggedTensor` of untokenized UTF-8
        strings.
    Returns:
      A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
      contents (or ID in the vocab_lookup_table representing that string)
      of the `jth` token in `input[i1...iN]`
    """
    tokens = self._basic_tokenizer.tokenize(text_input)
    return self._wordpiece_tokenizer.tokenize(tokens)
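
A short usage sketch for a tokenizer like the one above, via the public tensorflow_text package; the vocabulary is illustrative:

import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["[UNK]", "hello", "world", "!"]
init = tf.lookup.KeyValueTensorInitializer(
    vocab, tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string, value_dtype=tf.int64)
table = tf.lookup.StaticHashTable(init, default_value=0)

tokenizer = tf_text.BertTokenizer(table, token_out_type=tf.string,
                                  lower_case=True)
print(tokenizer.tokenize(["Hello world!"]))
# expected: [[[b'hello'], [b'world'], [b'!']]] -- basic tokenization lower-cases
# the text and splits off the punctuation before wordpiece tokenization runs.
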
Example #7
class BertTokenizer(TokenizerWithOffsets):
    r"""Tokenizer used for BERT.

    This tokenizer applies an end-to-end, text string to wordpiece tokenization.
    It first applies basic tokenization, and then follwed by wordpiece
    tokenization.

    See BasicTokenizer and WordpieceTokenizer for their respective details.

  Attributes:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords or a string which is the file path
      to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of decoding
      long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
    unknown_token: (optional) The value to use when an unknown token is found.
      Default is "[UNK]". If this is set to a string, and `token_out_type` is
      `tf.int64`, the `vocab_lookup_table` is used to convert the
      `unknown_token` to an integer. If this is set to `None`, out-of-vocabulary
      tokens are left as is.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
    lower_case: bool - If true, a preprocessing step is added to lowercase the
      text, apply NFD normalization, and strip accents characters.
    keep_whitespace: bool - If true, preserves whitespace characters instead of
      stripping them away.
    normalization_form: If true and lower_case=False, the input text will be
      normalized to `normalization_form`. See normalize_utf8() op for a list of
      valid values.
    preserve_unused_token: If true, text in the regex format `\\[unused\\d+\\]`
      will be treated as a token and thus remain preserved as is to be looked up
      in the vocabulary.
  """
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator="##",
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token="[UNK]",
                 split_unknown_characters=False,
                 lower_case=False,
                 keep_whitespace=False,
                 normalization_form=None,
                 preserve_unused_token=False):
        super(BertTokenizer, self).__init__()
        _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)
        if isinstance(vocab_lookup_table, str) or isinstance(
                vocab_lookup_table, ops.Tensor):
            init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
            vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
                init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)

        self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                               normalization_form,
                                               preserve_unused_token)
        self._wordpiece_tokenizer = WordpieceTokenizer(
            vocab_lookup_table, suffix_indicator, max_bytes_per_word,
            max_chars_per_token, token_out_type, unknown_token,
            split_unknown_characters)

    def tokenize_with_offsets(self, text_input):
        tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(
            text_input)
        wordpieces, wp_begin, wp_end = (
            self._wordpiece_tokenizer.tokenize_with_offsets(tokens))
        begin_expanded = array_ops.expand_dims(begin, axis=2)
        final_begin = begin_expanded + wp_begin
        final_end = begin_expanded + wp_end
        return wordpieces, final_begin, final_end

    def tokenize(self, text_input):
        """Performs untokenized text to wordpiece tokenization for BERT.

    Args:
      text_input: input: A `Tensor` or `RaggedTensor` of untokenized UTF-8
        strings.

    Returns:
      A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
      contents (or ID in the vocab_lookup_table representing that string)
      of the `jth` token in `input[i1...iN]`
    """
        tokens = self._basic_tokenizer.tokenize(text_input)
        return self._wordpiece_tokenizer.tokenize(tokens)
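
The offset arithmetic in tokenize_with_offsets above composes each wordpiece's offsets (relative to its basic token) with the basic token's offsets, yielding byte offsets into the original string. A hedged sketch with an illustrative vocabulary, using the public tensorflow_text package:

import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["[UNK]", "the", "great", "##est"]
init = tf.lookup.KeyValueTensorInitializer(
    vocab, tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string, value_dtype=tf.int64)
table = tf.lookup.StaticHashTable(init, default_value=0)
tokenizer = tf_text.BertTokenizer(table, token_out_type=tf.string)

wordpieces, begin, end = tokenizer.tokenize_with_offsets(["the greatest"])
print(wordpieces)  # expected: [[[b'the'], [b'great', b'##est']]]
print(begin)       # expected: [[[0], [4, 9]]]   byte offsets into "the greatest"
print(end)         # expected: [[[3], [9, 12]]]  end offsets are exclusive
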
Example #8
class BertTokenizer(TokenizerWithOffsets, Detokenizer):
    r"""Tokenizer used for BERT.

    This tokenizer applies an end-to-end, text string to wordpiece tokenization.
    It first applies basic tokenization, and then followed by wordpiece
    tokenization.

    See BasicTokenizer and WordpieceTokenizer for their respective details.

  Attributes:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords or a string which is the file path
      to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of decoding
      long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
    unknown_token: (optional) The value to use when an unknown token is found.
      Default is "[UNK]". If this is set to a string, and `token_out_type` is
      `tf.int64`, the `vocab_lookup_table` is used to convert the
      `unknown_token` to an integer. If this is set to `None`, out-of-vocabulary
      tokens are left as is.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
    lower_case: bool - If true, a preprocessing step is added to lowercase the
      text, apply NFD normalization, and strip accents characters.
    keep_whitespace: bool - If true, preserves whitespace characters instead of
      stripping them away.
    normalization_form: If true and lower_case=False, the input text will be
      normalized to `normalization_form`. See normalize_utf8() op for a list of
      valid values.
    preserve_unused_token: If true, text in the regex format `\\[unused\\d+\\]`
      will be treated as a token and thus remain preserved as is to be looked up
      in the vocabulary.
    basic_tokenizer_class: If set, the class to use instead of BasicTokenizer
  """
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator="##",
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token="[UNK]",
                 split_unknown_characters=False,
                 lower_case=False,
                 keep_whitespace=False,
                 normalization_form=None,
                 preserve_unused_token=False,
                 basic_tokenizer_class=BasicTokenizer):
        super(BertTokenizer, self).__init__()
        _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)

        self._basic_tokenizer = basic_tokenizer_class(lower_case,
                                                      keep_whitespace,
                                                      normalization_form,
                                                      preserve_unused_token)
        self._wordpiece_tokenizer = WordpieceTokenizer(
            vocab_lookup_table, suffix_indicator, max_bytes_per_word,
            max_chars_per_token, token_out_type, unknown_token,
            split_unknown_characters)

    def tokenize_with_offsets(self, text_input):
        tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(
            text_input)
        wordpieces, wp_begin, wp_end = (
            self._wordpiece_tokenizer.tokenize_with_offsets(tokens))
        begin_expanded = array_ops.expand_dims(begin, axis=2)
        final_begin = begin_expanded + wp_begin
        final_end = begin_expanded + wp_end
        return wordpieces, final_begin, final_end

    def tokenize(self, text_input):
        """Performs untokenized text to wordpiece tokenization for BERT.

    Args:
      text_input: input: A `Tensor` or `RaggedTensor` of untokenized UTF-8
        strings.

    Returns:
      A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
      contents (or ID in the vocab_lookup_table representing that string)
      of the `jth` token in `input[i1...iN]`
    """
        tokens = self._basic_tokenizer.tokenize(text_input)
        return self._wordpiece_tokenizer.tokenize(tokens)

    def detokenize(self, token_ids):
        """Convert a `Tensor` or `RaggedTensor` of wordpiece IDs to string-words.

    See `WordpieceTokenizer.detokenize` for details.

    Note: `BertTokenizer.tokenize`/`BertTokenizer.detokenize` does not round
    trip losslessly. The result of `detokenize` will not, in general, have the
    same content or offsets as the input to `tokenize`. This is because the
    "basic tokenization" step, that splits the strings into words before
    applying the `WordpieceTokenizer`, includes irreversible
    steps like lower-casing and splitting on punctuation. `WordpieceTokenizer`
    on the other hand **is** reversible.

    Note: This method assumes wordpiece IDs are dense on the interval
    `[0, vocab_size)`.

    Args:
      token_ids: A `RaggedTensor` or `Tensor` with an int dtype.

    Returns:
      A `RaggedTensor` with dtype `string` and the same rank as the input
      `token_ids`.
    """
        return self._wordpiece_tokenizer.detokenize(token_ids)
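
Finally, a sketch of the lossy round trip described in the docstring above, using the public tensorflow_text package with an illustrative vocabulary (and assuming a release where BertTokenizer.detokenize is available); casing removed by basic tokenization is not recovered:

import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["[UNK]", "hello", "world", "!"]
init = tf.lookup.KeyValueTensorInitializer(
    vocab, tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string, value_dtype=tf.int64)
table = tf.lookup.StaticHashTable(init, default_value=0)
tokenizer = tf_text.BertTokenizer(table, lower_case=True)

ids = tokenizer.tokenize(["Hello WORLD!"])            # (batch, words, wordpieces)
words = tokenizer.detokenize(ids.merge_dims(-2, -1))  # IDs must be dense
print(words)  # expected: [[b'hello', b'world', b'!']] -- casing is not restored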