Example #1
    def testDetokenizeIsReversable(self):

        table = _CreateTable(_MIXED_LANG_VOCAB + [b""], 2)
        self.evaluate(table.initializer)

        tokenizer = WordpieceTokenizer(table)

        word_lists = [
            [b"hello", b"there", b"my", b"name", b"is", b"treadness"],
            [b"whatchamacallit?", b"you", b"said"],
            [_Utf8(u"大"), _Utf8(u"易")],
        ]
        words = ragged_factory_ops.constant(word_lists)

        subwords_ids = tokenizer.tokenize(words)

        # detokenize input shape is (batch, ragged-words, ragged-wordpieces)
        words_output = tokenizer.detokenize(subwords_ids)
        words_output = array_ops.squeeze(words_output, axis=-1)

        self.assertAllEqual(words_output, words)

        # detokenize input shape is (batch, ragged-wordpieces)
        subwords_id_seqs = subwords_ids.merge_dims(-2, -1)
        words_output = tokenizer.detokenize(subwords_id_seqs)
        self.assertAllEqual(words_output, words)

        # detokenize input shape is a dense (batch, padded-wordpieces)
        words_output = tokenizer.detokenize(
            subwords_ids.merge_dims(-2, -1)
            # ID len(_MIXED_LANG_VOCAB) maps to the b"" padding entry appended above
            .to_tensor(default_value=len(_MIXED_LANG_VOCAB)))

        self.assertAllEqual(words_output, words)
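
For reference, here is a minimal sketch of the same round trip using the public tf.lookup and tensorflow_text APIs rather than the test's internal helpers (_CreateTable, lookup_ops, ragged_factory_ops). The vocabulary and names below are illustrative, and it assumes a tensorflow_text release whose WordpieceTokenizer supports detokenize():

import tensorflow as tf
import tensorflow_text as tf_text

# Wordpiece IDs must be dense on [0, len(vocab)) for detokenize() to work.
vocab = ["[UNK]", "they", "##'", "##re", "the", "great", "##est"]
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string,
    value_dtype=tf.int64)
table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=1)

tokenizer = tf_text.WordpieceTokenizer(table, token_out_type=tf.int64)

words = tf.ragged.constant([["they're", "the", "greatest"]])
subword_ids = tokenizer.tokenize(words)      # (batch, ragged-words, ragged-wordpieces)
flat_ids = subword_ids.merge_dims(-2, -1)    # (batch, ragged-wordpieces)
# Expected to recover the original words: [[b"they're", b"the", b"greatest"]]
print(tokenizer.detokenize(flat_ids).to_list())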
Example #2
    def testDetokenizeFailsForSparseVocab(self):
        vocab = ["a", "##b", "##c"]
        ids = [0, 10, 20]
        init = lookup_ops.KeyValueTensorInitializer(vocab,
                                                    ids,
                                                    key_dtype=dtypes.string,
                                                    value_dtype=dtypes.int64)
        table = lookup_ops.StaticVocabularyTableV1(
            init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
        self.evaluate(table.initializer)

        tokenizer = WordpieceTokenizer(table)
        words = ragged_factory_ops.constant([["abb", "abc"], ["abcbc"]])
        subwords_ids = tokenizer.tokenize(words)

        with self.assertRaisesRegex(errors_impl.InvalidArgumentError,
                                    "detokenize.*?dense on the interval"):
            result = tokenizer.detokenize(subwords_ids)
            self.evaluate(result)
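
For contrast, a minimal sketch (public tf.lookup and tensorflow_text APIs; names are illustrative) of the same vocabulary with contiguous IDs, which is what the dense-on-[0, vocab_size) requirement exercised above demands:

import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["a", "##b", "##c"]
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),  # 0, 1, 2 -- dense on [0, 3)
    key_dtype=tf.string,
    value_dtype=tf.int64)
table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=1)

tokenizer = tf_text.WordpieceTokenizer(table)
ids = tokenizer.tokenize(tf.ragged.constant([["abb", "abc"], ["abcbc"]]))
# Round-trips back to the input words instead of raising InvalidArgumentError.
print(tokenizer.detokenize(ids).to_list())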
Example #3
class BertTokenizer(TokenizerWithOffsets, Detokenizer):
    r"""Tokenizer used for BERT.

    This tokenizer applies an end-to-end, text string to wordpiece tokenization.
    It first applies basic tokenization, followed by wordpiece tokenization.

    See BasicTokenizer and WordpieceTokenizer for their respective details.

    Attributes:
      vocab_lookup_table: A lookup table implementing the LookupInterface
        containing the vocabulary of subwords, or a string which is the file
        path to the vocab.txt file.
      suffix_indicator: (optional) The characters prepended to a wordpiece to
        indicate that it is a suffix to another subword. Default is '##'.
      max_bytes_per_word: (optional) Max size of input token. Default is 100.
      max_chars_per_token: (optional) Max size of subwords, excluding suffix
        indicator. If known, providing this improves the efficiency of
        decoding long words.
      token_out_type: (optional) The type of the token to return. This can be
        `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
      unknown_token: (optional) The value to use when an unknown token is
        found. Default is "[UNK]". If this is set to a string, and
        `token_out_type` is `tf.int64`, the `vocab_lookup_table` is used to
        convert the `unknown_token` to an integer. If this is set to `None`,
        out-of-vocabulary tokens are left as is.
      split_unknown_characters: (optional) Whether to split out single unknown
        characters as subtokens. If False (default), words containing unknown
        characters will be treated as single unknown tokens.
      lower_case: bool - If true, a preprocessing step is added to lowercase
        the text, apply NFD normalization, and strip accent characters.
      keep_whitespace: bool - If true, preserves whitespace characters instead
        of stripping them away.
      normalization_form: If set to a valid value and lower_case=False, the
        input text will be normalized to `normalization_form`. See
        normalize_utf8() op for a list of valid values.
      preserve_unused_token: If true, text in the regex format
        `\\[unused\\d+\\]` will be treated as a token and thus remain
        preserved as is to be looked up in the vocabulary.
      basic_tokenizer_class: If set, the class to use instead of
        BasicTokenizer.
    """
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator="##",
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token="[UNK]",
                 split_unknown_characters=False,
                 lower_case=False,
                 keep_whitespace=False,
                 normalization_form=None,
                 preserve_unused_token=False,
                 basic_tokenizer_class=BasicTokenizer):
        super(BertTokenizer, self).__init__()
        _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)

        self._basic_tokenizer = basic_tokenizer_class(lower_case,
                                                      keep_whitespace,
                                                      normalization_form,
                                                      preserve_unused_token)
        self._wordpiece_tokenizer = WordpieceTokenizer(
            vocab_lookup_table, suffix_indicator, max_bytes_per_word,
            max_chars_per_token, token_out_type, unknown_token,
            split_unknown_characters)

    def tokenize_with_offsets(self, text_input):
        tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(
            text_input)
        wordpieces, wp_begin, wp_end = (
            self._wordpiece_tokenizer.tokenize_with_offsets(tokens))
        begin_expanded = array_ops.expand_dims(begin, axis=2)
        final_begin = begin_expanded + wp_begin
        final_end = begin_expanded + wp_end
        return wordpieces, final_begin, final_end

    def tokenize(self, text_input):
        """Performs untokenized text to wordpiece tokenization for BERT.

        Args:
          text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8
            strings.

        Returns:
          A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
          contents (or ID in the vocab_lookup_table representing that string)
          of the `jth` token in `input[i1...iN]`.
        """
        tokens = self._basic_tokenizer.tokenize(text_input)
        return self._wordpiece_tokenizer.tokenize(tokens)

    def detokenize(self, token_ids):
        """Convert a `Tensor` or `RaggedTensor` of wordpiece IDs to string-words.

        See `WordpieceTokenizer.detokenize` for details.

        Note: `BertTokenizer.tokenize`/`BertTokenizer.detokenize` does not
        round-trip losslessly. The result of `detokenize` will not, in
        general, have the same content or offsets as the input to `tokenize`.
        This is because the "basic tokenization" step, which splits the
        strings into words before applying the `WordpieceTokenizer`, includes
        irreversible steps like lower-casing and splitting on punctuation.
        `WordpieceTokenizer`, on the other hand, **is** reversible.

        Note: This method assumes wordpiece IDs are dense on the interval
        `[0, vocab_size)`.

        Args:
          token_ids: A `RaggedTensor` or `Tensor` with an int dtype.

        Returns:
          A `RaggedTensor` with dtype `string` and the same rank as the input
          `token_ids`.
        """
        return self._wordpiece_tokenizer.detokenize(token_ids)
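
A minimal usage sketch of the class above through the public tensorflow_text API. It is not part of the source file; the vocabulary, file path, and sentence are made up for illustration, and it assumes a tensorflow_text release in which BertTokenizer implements Detokenizer:

import pathlib
import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["[UNK]", "[CLS]", "[SEP]", "hello", "my", "name", "is",
         "tread", "##ness"]
vocab_path = "/tmp/bert_demo_vocab.txt"  # illustrative path
pathlib.Path(vocab_path).write_text("\n".join(vocab))

bert_tokenizer = tf_text.BertTokenizer(vocab_path, lower_case=True)

sentences = tf.constant(["Hello, my name is Treadness."])
# tokenize() runs basic tokenization (lower-casing, punctuation splitting)
# and then wordpiece tokenization, returning a RaggedTensor of wordpiece IDs.
token_ids = bert_tokenizer.tokenize(sentences)
# detokenize() maps the IDs back to words. As the docstring above notes, the
# basic-tokenization steps are not undone, so the result is lower-cased and
# punctuation stays split out as separate (here unknown) tokens.
print(bert_tokenizer.detokenize(token_ids).to_list())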