Example #1
    def testWordPieceOpAndVerifyOffsets(self,
                                        tokens,
                                        expected_subwords,
                                        vocab,
                                        expected_start=None,
                                        expected_limit=None,
                                        use_unknown_token=True,
                                        unknown_token="[UNK]",
                                        token_out_type=dtypes.string,
                                        max_bytes_per_word=100):
        tokens = ragged_factory_ops.constant(tokens)
        vocab_table = _CreateTable(vocab)
        self.evaluate(vocab_table.initializer)
        tokenizer = WordpieceTokenizer(vocab_table,
                                       unknown_token=unknown_token,
                                       token_out_type=token_out_type,
                                       max_bytes_per_word=max_bytes_per_word)
        subwords, begin, end = tokenizer.tokenize_with_offsets(tokens)
        self.assertRaggedEqual(subwords, expected_subwords)

        # Verify the indices by performing the following:
        # - Extract the subwords and join them together to form the original tokens.
        # - Then compare the extracted tokens and original tokens.
        tokens, begin, end = (self.evaluate((tokens, begin, end)))

        # If expected start/limit offsets were provided, check them explicitly.
        # Otherwise test the offsets by extracting subwords using token offsets
        # from the original 'tokens' input.
        if expected_start is None or expected_limit is None:
            extracted_tokens = _GetTokensFromWordpieceOffsets(
                tokens, begin, end)
            self.assertRaggedEqual(extracted_tokens, tokens)
        else:
            self.assertRaggedEqual(begin, expected_start)
            self.assertRaggedEqual(end, expected_limit)
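
The test above drives `WordpieceTokenizer.tokenize_with_offsets` through a helper vocab table (`_CreateTable`) that is not shown. A minimal standalone sketch of the same flow, using the public `tensorflow_text` API and a small hypothetical in-memory vocabulary, might look like:

import tensorflow as tf
import tensorflow_text as tf_text

# Hypothetical vocabulary; in the tests this comes from _CreateTable(vocab).
vocab = [b"don", b"##'", b"##t", b"tread", b"##ing", b"[UNK]"]
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string,
    value_dtype=tf.int64)
vocab_table = tf.lookup.StaticHashTable(init, default_value=-1)

tokenizer = tf_text.WordpieceTokenizer(
    vocab_table, unknown_token="[UNK]", token_out_type=tf.string)
subwords, begin, end = tokenizer.tokenize_with_offsets(
    tf.ragged.constant([[b"don't", b"treading"]]))
# subwords should come out roughly as [[[b"don", b"##'", b"##t"], [b"tread", b"##ing"]]];
# begin/end hold the byte offsets of each subword within its source token.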
Example #2
    def testWordPieceOpAndVerifyOffsets(self,
                                        tokens,
                                        expected_subwords,
                                        vocab,
                                        expected_start=None,
                                        expected_limit=None,
                                        use_unknown_token=True,
                                        unknown_token="[UNK]",
                                        token_out_type=tf.string):
        tokens = tf.ragged.constant(tokens)
        vocab_table = _CreateTable(vocab)
        self.evaluate(vocab_table.initializer)
        tokenizer = WordpieceTokenizer(vocab_table,
                                       unknown_token=unknown_token,
                                       token_out_type=token_out_type)
        subwords, begin, end = tokenizer.tokenize_with_offsets(tokens)
        self.assertRaggedEqual(subwords, expected_subwords)

        # Verify the indices by performing the following:
        # - Extract the subwords and join them together to form the original tokens.
        # - Then compare the extracted tokens and original tokens.
        tokens, begin, end = (self.evaluate((tokens, begin, end)))

        extracted_tokens = _GetTokensFromWordpieceOffsets(tokens, begin, end)
        self.assertRaggedEqual(extracted_tokens, tokens)
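
The offset check in these tests relies on a helper, `_GetTokensFromWordpieceOffsets`, whose definition is not shown. A hypothetical pure-Python equivalent of the idea (slice each original token by the returned byte offsets and re-join the pieces) could look like:

def rebuild_tokens(tokens, begin, end):
    # tokens: list of byte-string tokens for one example.
    # begin, end: per-token lists of [start, limit) byte offsets, one pair per subword.
    rebuilt = []
    for token, starts, limits in zip(tokens, begin, end):
        rebuilt.append(b"".join(token[s:l] for s, l in zip(starts, limits)))
    return rebuilt

# e.g. rebuild_tokens([b"treading"], [[0, 5]], [[5, 8]]) == [b"treading"]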
Example #3
  def testWordPieceOpWithIdReturned(self):
    """Let the table determine how to do a lookup on unknown tokens."""
    tokens = ragged_factory_ops.constant(
        [[b"don't", b"tread", b"cantfindme", b"treadcantfindme"]])
    vocab_table = _CreateTable(
        _ENGLISH_VOCAB,
        100  # OOV values
    )
    self.evaluate(vocab_table.initializer)
    tokenizer = WordpieceTokenizer(
        vocab_table, unknown_token=None, token_out_type=dtypes.int64)
    subwords, _, _ = tokenizer.tokenize_with_offsets(tokens)

    self.assertAllEqual(subwords, [[[0, 1, 2], [3], [96], [46]]])
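
With `unknown_token=None`, out-of-vocabulary lookups are delegated to the table itself; `_CreateTable(_ENGLISH_VOCAB, 100)` above builds a table with 100 OOV buckets. A sketch of such a table using the public TF lookup API, with a hypothetical vocabulary list, might be:

import tensorflow as tf

vocab = [b"don", b"##'", b"##t", b"tread", b"##ing"]   # hypothetical vocabulary
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string,
    value_dtype=tf.int64)
# Unknown keys hash into one of the 100 buckets appended after the vocab IDs,
# so every lookup succeeds even when unknown_token=None.
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=100)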
Example #4
    def testWordPieceOpAndVerifyOffsets(self,
                                        tokens,
                                        expected_subwords,
                                        vocab,
                                        expected_start=None,
                                        expected_limit=None,
                                        use_unknown_token=True,
                                        unknown_token="[UNK]",
                                        token_out_type=dtypes.string,
                                        max_bytes_per_word=100,
                                        split_unknown_characters=False):
        for horizon in self._FORWARD_COMPATIBILITY_HORIZONS:
            with compat.forward_compatibility_horizon(*horizon):
                tokens_t = ragged_factory_ops.constant(tokens)
                vocab_table = _CreateTable(vocab)
                self.evaluate(vocab_table.initializer)
                tokenizer = WordpieceTokenizer(
                    vocab_table,
                    unknown_token=unknown_token,
                    token_out_type=token_out_type,
                    max_bytes_per_word=max_bytes_per_word,
                    split_unknown_characters=split_unknown_characters,
                )
                subwords_t, begin_t, end_t = tokenizer.tokenize_with_offsets(
                    tokens_t)
                self.assertAllEqual(subwords_t, expected_subwords)

                # Verify the indices by performing the following:
                # - Extract subwords and join them together to form the original tokens.
                # - Then compare the extracted tokens and original tokens.
                begin, end = (self.evaluate((begin_t, end_t)))

                # If expected start/limit offsets were provided, check them explicitly.
                # Otherwise test the offsets by extracting subwords using token offsets
                # from the original 'tokens' input.
                if expected_start is None or expected_limit is None:
                    extracted_tokens = _GetTokensFromWordpieceOffsets(
                        tokens, begin, end)
                    self.assertAllEqual(extracted_tokens, tokens)
                else:
                    self.assertAllEqual(begin, expected_start)
                    self.assertAllEqual(end, expected_limit)
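
This variant additionally threads `split_unknown_characters` through the tokenizer and runs under `tf.compat.forward_compatibility_horizon`. A hedged sketch of exercising the flag outside the test harness (vocabulary and input are hypothetical, and the flag requires a sufficiently recent library version):

import tensorflow as tf
import tensorflow_text as tf_text

vocab = [b"tread", b"##ing", b"[UNK]"]                 # hypothetical vocabulary
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string,
    value_dtype=tf.int64)
vocab_table = tf.lookup.StaticHashTable(init, default_value=-1)

# With split_unknown_characters=True, characters not covered by the vocabulary
# are split out as individual unknown subtokens instead of turning the whole
# word into a single [UNK].
tokenizer = tf_text.WordpieceTokenizer(
    vocab_table,
    unknown_token="[UNK]",
    token_out_type=tf.string,
    split_unknown_characters=True)
subwords, begin, end = tokenizer.tokenize_with_offsets(
    tf.ragged.constant([["treading!"]]))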
Example #5
class BertTokenizer(TokenizerWithOffsets):
    """Tokenizer used for BERT.

    This tokenizer applies an end-to-end, text string to wordpiece tokenization.
    It first applies basic tokenization, followed by wordpiece
    tokenization.

    See BasicTokenizer and WordpieceTokenizer for their respective details.

  Attributes:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords or a string which is the file path
      to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of decoding
      long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
    unknown_token: (optional) The value to use when an unknown token is found.
      Default is "[UNK]". If this is set to a string, and `token_out_type` is
      `tf.int64`, the `vocab_lookup_table` is used to convert the
      `unknown_token` to an integer. If this is set to `None`,
      out-of-vocabulary tokens are left as is.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
    lower_case: bool - If true, a preprocessing step is added to lowercase the
      text, apply NFD normalization, and strip accent characters.
    keep_whitespace: bool - If true, preserves whitespace characters instead of
      stripping them away.
    normalization_form: If set and lower_case=False, the input text will be
      normalized to `normalization_form`. See normalize_utf8() op for a list
      of valid values.
  """
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator="##",
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token="[UNK]",
                 split_unknown_characters=False,
                 lower_case=False,
                 keep_whitespace=False,
                 normalization_form=None):
        if isinstance(vocab_lookup_table, str):
            init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
            vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
                init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)

        self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                               normalization_form)
        self._wordpiece_tokenizer = WordpieceTokenizer(
            vocab_lookup_table, suffix_indicator, max_bytes_per_word,
            max_chars_per_token, token_out_type, unknown_token,
            split_unknown_characters)

    def tokenize_with_offsets(self, text_input):
        tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(
            text_input)
        wordpieces, wp_begin, wp_end = (
            self._wordpiece_tokenizer.tokenize_with_offsets(tokens))
        begin_expanded = array_ops.expand_dims(begin, axis=2)
        final_begin = begin_expanded + wp_begin
        final_end = begin_expanded + wp_end
        return wordpieces, final_begin, final_end

    def tokenize(self, text_input):
        """Performs untokenized text to wordpiece tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8
        strings.

    Returns:
      A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
      contents (or ID in the vocab_lookup_table representing that string)
      of the `jth` token in `input[i1...iN]`.
    """
        tokens = self._basic_tokenizer.tokenize(text_input)
        return self._wordpiece_tokenizer.tokenize(tokens)
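
A short usage sketch for this class, assuming a hypothetical vocabulary file "vocab.txt" on disk (the public `tensorflow_text.BertTokenizer` exposes the same constructor arguments):

import tensorflow as tf
import tensorflow_text as tf_text

# "vocab.txt" is a hypothetical path to a BERT vocabulary file, one token per line.
tokenizer = tf_text.BertTokenizer(
    "vocab.txt", lower_case=True, token_out_type=tf.int64)

# tokenize: [batch] of strings -> RaggedTensor of shape [batch, words, wordpieces].
ids = tokenizer.tokenize(["TensorFlow Text makes tokenization easy."])

# tokenize_with_offsets also returns byte offsets into the original strings;
# as in the code above, the basic-token begin offsets are added to the
# wordpiece offsets so the final offsets index into the untokenized input.
pieces, begin, end = tokenizer.tokenize_with_offsets(
    ["TensorFlow Text makes tokenization easy."])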
Example #6
class BertTokenizer(TokenizerWithOffsets, Detokenizer):
    r"""Tokenizer used for BERT.

    This tokenizer applies an end-to-end, text string to wordpiece tokenization.
    It first applies basic tokenization, followed by wordpiece
    tokenization.

    See BasicTokenizer and WordpieceTokenizer for their respective details.

  Attributes:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords or a string which is the file path
      to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of decoding
      long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
    unknown_token: (optional) The value to use when an unknown token is found.
      Default is "[UNK]". If this is set to a string, and `token_out_type` is
      `tf.int64`, the `vocab_lookup_table` is used to convert the
      `unknown_token` to an integer. If this is set to `None`, out-of-vocabulary
      tokens are left as is.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
    lower_case: bool - If true, a preprocessing step is added to lowercase the
      text, apply NFD normalization, and strip accent characters.
    keep_whitespace: bool - If true, preserves whitespace characters instead of
      stripping them away.
    normalization_form: If set and lower_case=False, the input text will be
      normalized to `normalization_form`. See normalize_utf8() op for a list of
      valid values.
    preserve_unused_token: If true, text in the regex format `\\[unused\\d+\\]`
      will be treated as a token and thus remain preserved as is to be looked up
      in the vocabulary.
    basic_tokenizer_class: If set, the class to use instead of BasicTokenizer.
  """
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator="##",
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token="[UNK]",
                 split_unknown_characters=False,
                 lower_case=False,
                 keep_whitespace=False,
                 normalization_form=None,
                 preserve_unused_token=False,
                 basic_tokenizer_class=BasicTokenizer):
        super(BertTokenizer, self).__init__()
        _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)

        self._basic_tokenizer = basic_tokenizer_class(lower_case,
                                                      keep_whitespace,
                                                      normalization_form,
                                                      preserve_unused_token)
        self._wordpiece_tokenizer = WordpieceTokenizer(
            vocab_lookup_table, suffix_indicator, max_bytes_per_word,
            max_chars_per_token, token_out_type, unknown_token,
            split_unknown_characters)

    def tokenize_with_offsets(self, text_input):
        tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(
            text_input)
        wordpieces, wp_begin, wp_end = (
            self._wordpiece_tokenizer.tokenize_with_offsets(tokens))
        begin_expanded = array_ops.expand_dims(begin, axis=2)
        final_begin = begin_expanded + wp_begin
        final_end = begin_expanded + wp_end
        return wordpieces, final_begin, final_end

    def tokenize(self, text_input):
        """Performs untokenized text to wordpiece tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8
        strings.

    Returns:
      A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
      contents (or ID in the vocab_lookup_table representing that string)
      of the `jth` token in `input[i1...iN]`.
    """
        tokens = self._basic_tokenizer.tokenize(text_input)
        return self._wordpiece_tokenizer.tokenize(tokens)

    def detokenize(self, token_ids):
        """Convert a `Tensor` or `RaggedTensor` of wordpiece IDs to string-words.

    See `WordpieceTokenizer.detokenize` for details.

    Note: `BertTokenizer.tokenize`/`BertTokenizer.detokenize` does not round
    trip losslessly. The result of `detokenize` will not, in general, have the
    same content or offsets as the input to `tokenize`. This is because the
    "basic tokenization" step, that splits the strings into words before
    applying the `WordpieceTokenizer`, includes irreversible
    steps like lower-casing and splitting on punctuation. `WordpieceTokenizer`
    on the other hand **is** reversible.

    Note: This method assumes wordpiece IDs are dense on the interval
    `[0, vocab_size)`.

    Args:
      token_ids: A `RaggedTensor` or `Tensor` with an int dtype.

    Returns:
      A `RaggedTensor` with dtype `string` and the same rank as the input
      `token_ids`.
    """
        return self._wordpiece_tokenizer.detokenize(token_ids)
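
As the docstring notes, `tokenize` followed by `detokenize` is not a lossless round trip. A minimal sketch, again assuming a hypothetical "vocab.txt" vocabulary file:

import tensorflow as tf
import tensorflow_text as tf_text

tokenizer = tf_text.BertTokenizer("vocab.txt", lower_case=True)   # hypothetical vocab file

ids = tokenizer.tokenize(["They're the greatest!"])   # shape [batch, words, wordpieces]
# Flatten the wordpiece axis into one token list per example before detokenizing.
words = tokenizer.detokenize(ids.merge_dims(-2, -1))
# `words` is lower-cased and split on punctuation, so it will not in general
# match the original input exactly.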