def testDetokenizeIsReversable(self):
  table = _CreateTable(_MIXED_LANG_VOCAB + [b""], 2)
  self.evaluate(table.initializer)
  tokenizer = WordpieceTokenizer(table)
  word_lists = [
      [b"hello", b"there", b"my", b"name", b"is", b"treadness"],
      [b"whatchamacallit?", b"you", b"said"],
      [_Utf8(u"大"), _Utf8(u"易")],
  ]
  words = ragged_factory_ops.constant(word_lists)
  subwords_ids = tokenizer.tokenize(words)

  # detokenize input shape is (batch, ragged-words, ragged-wordpieces)
  words_output = tokenizer.detokenize(subwords_ids)
  words_output = array_ops.squeeze(words_output, axis=-1)
  self.assertAllEqual(words_output, words)

  # detokenize input shape is (batch, ragged-wordpieces)
  subwords_id_seqs = subwords_ids.merge_dims(-2, -1)
  words_output = tokenizer.detokenize(subwords_id_seqs)
  self.assertAllEqual(words_output, words)

  # detokenize input shape is a dense (batch, padded-wordpieces)
  words_output = tokenizer.detokenize(
      subwords_ids.merge_dims(-2, -1)
      # id len(_MIXED_LANG_VOCAB) maps to the padding token b""
      .to_tensor(default_value=len(_MIXED_LANG_VOCAB)))
  self.assertAllEqual(words_output, words)
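# For reference, a minimal self-contained sketch of the round trip this test
# exercises, using the public tensorflow_text API. The vocab and inputs below
# are illustrative, not the test's fixtures.
import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
init = tf.lookup.KeyValueTensorInitializer(
    vocab, tf.range(len(vocab), dtype=tf.int64))
table = tf.lookup.StaticHashTable(init, default_value=vocab.index("[UNK]"))

tokenizer = tf_text.WordpieceTokenizer(table)
words = tf.ragged.constant([["they're", "the", "greatest"]])
ids = tokenizer.tokenize(words)  # (batch, ragged-words, ragged-wordpieces)
# Merging the last two ragged dims gives (batch, ragged-wordpieces), which
# detokenize reassembles into the original words via the "##" suffix marker.
roundtrip = tokenizer.detokenize(ids.merge_dims(-2, -1))
# roundtrip.to_list() == [[b"they're", b"the", b"greatest"]]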
def preprocessing_fn(inputs):
  """Preprocessing function used in TF Transform.

  Args:
    inputs: the input dataset of tf.Examples

  Returns:
    preprocessed outputs
  """
  vocab_table = tf.lookup.StaticHashTable(
      tf.lookup.TextFileInitializer(vocab_file, tf.string,
                                    tf.lookup.TextFileIndex.WHOLE_LINE,
                                    tf.int64,
                                    tf.lookup.TextFileIndex.LINE_NUMBER), -1)
  tokenizer = BertTokenizer()
  tokens = tokenizer.tokenize(inputs[text_key])
  wordpiece_tokenizer = WordpieceTokenizer(vocab_table,
                                           token_out_type=tf.string)
  wordpieces = wordpiece_tokenizer.tokenize(tokens)
  wordpieces_flat = wordpieces.flat_values
  wordpieces_flat.set_shape([None])
  wordpieces = tf.RaggedTensor.from_nested_row_splits(
      wordpieces_flat, wordpieces.nested_row_splits)

  known_mask = tf.cast(tf.not_equal(wordpieces, '[UNK]'), tf.int32)
  num_non_unk_wordpieces = tf.reduce_sum(known_mask, axis=[1, 2])

  wordpiece_is_unknown = tf.equal(wordpieces, '[UNK]')
  token_has_unknown = tf.reduce_any(wordpiece_is_unknown, axis=-1)
  unknown_tokens = tf.ragged.boolean_mask(tokens, token_has_unknown)
  unknown_lengths = tf.strings.length(unknown_tokens)
  num_dropped_chars = tf.math.reduce_sum(unknown_lengths, axis=1)

  token_lengths = tf.strings.length(tokens)
  total_chars = tf.reduce_sum(token_lengths, axis=-1)
  num_preserved_chars = total_chars - num_dropped_chars

  flattened = tf.RaggedTensor.from_row_splits(
      wordpieces.flat_values,
      tf.gather(wordpieces.values.row_splits, wordpieces.row_splits))

  outputs = {}
  outputs['num_non_unk_wordpieces'] = tf.cast(num_non_unk_wordpieces,
                                              tf.int64)
  outputs['num_dropped_chars'] = tf.cast(num_dropped_chars, tf.int64)
  outputs['num_preserved_chars'] = tf.cast(num_preserved_chars, tf.int64)
  outputs['wordpieces'] = flattened.to_sparse()
  outputs['lang'] = tf.convert_to_tensor(inputs[language_code_key])

  return outputs
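# Downstream of this preprocessing_fn, the emitted per-example counts can be
# folded into a single coverage metric. A minimal sketch; the helper name is
# ours, not part of the pipeline:
def wordpiece_coverage(num_preserved_chars, num_dropped_chars):
  """Fraction of input characters kept by the wordpiece vocabulary."""
  preserved = tf.cast(num_preserved_chars, tf.float32)
  dropped = tf.cast(num_dropped_chars, tf.float32)
  # Guard against empty examples to avoid division by zero.
  return preserved / tf.maximum(preserved + dropped, 1.0)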
def testTensors(self,
                tokens,
                expected_subwords,
                vocab,
                expected_start=None,
                expected_limit=None,
                use_unknown_token=True,
                token_out_type=dtypes.string):
  vocab_table = _CreateTable(vocab)
  self.evaluate(vocab_table.initializer)
  tokenizer = WordpieceTokenizer(vocab_table, token_out_type=token_out_type)
  subwords = tokenizer.tokenize(tokens)
  self.assertAllEqual(subwords, expected_subwords)
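# The simplest case this helper exercises, as an illustrative (not actual)
# parameterization: a flat string tensor in, one extra ragged dimension of
# subwords out.
#   tokens            = [b"unwanted", b"running"]
#   vocab             = [b"un", b"##want", b"##ed", b"runn", b"##ing"]
#   expected_subwords = [[b"un", b"##want", b"##ed"], [b"runn", b"##ing"]]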
def testWordPieceOpWithMultipleRaggedRank(self,
                                          tokens,
                                          expected_subwords,
                                          vocab,
                                          expected_start=None,
                                          expected_limit=None,
                                          use_unknown_token=True,
                                          token_out_type=dtypes.string):
  for row_splits_dtype in (dtypes.int32, dtypes.int64):
    ragged_tokens = ragged_factory_ops.constant(
        tokens, row_splits_dtype=row_splits_dtype)
    vocab_table = _CreateTable(vocab)
    self.evaluate(vocab_table.initializer)
    tokenizer = WordpieceTokenizer(vocab_table,
                                   token_out_type=token_out_type)
    subwords = tokenizer.tokenize(ragged_tokens)
    self.assertAllEqual(subwords, expected_subwords)
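# With ragged rank > 1, tokenize still applies element-wise and appends one
# more ragged axis. Illustrative shapes (not the actual fixtures):
#   tokens            = [[[b"unwanted", b"running"]], [[b"the"]]]
#   expected_subwords = [[[[b"un", b"##want", b"##ed"],
#                          [b"runn", b"##ing"]]],
#                        [[[b"the"]]]]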
def testDetokenizeFailsForSparseVocab(self):
  vocab = ["a", "##b", "##c"]
  ids = [0, 10, 20]
  init = lookup_ops.KeyValueTensorInitializer(vocab, ids,
                                              key_dtype=dtypes.string,
                                              value_dtype=dtypes.int64)
  table = lookup_ops.StaticVocabularyTableV1(
      init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  self.evaluate(table.initializer)

  tokenizer = WordpieceTokenizer(table)
  words = ragged_factory_ops.constant([["abb", "abc"], ["abcbc"]])
  subwords_ids = tokenizer.tokenize(words)

  with self.assertRaisesRegex(errors_impl.InvalidArgumentError,
                              "detokenize.*?dense on the interval"):
    result = tokenizer.detokenize(subwords_ids)
    self.evaluate(result)
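# By contrast, detokenize succeeds when the exported vocabulary ids are
# dense on [0, vocab_size). A hedged sketch using the same modules as the
# test above (illustrative vocab):
dense_init = lookup_ops.KeyValueTensorInitializer(
    ["a", "##b", "##c"], [0, 1, 2],
    key_dtype=dtypes.string, value_dtype=dtypes.int64)
dense_table = lookup_ops.StaticHashTableV1(dense_init, default_value=-1)
# export() yields ids 0, 1, 2 with no gaps, so
# WordpieceTokenizer(dense_table).detokenize(...) does not raise.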
class BertTokenizer(Tokenizer):
  """Tokenizer used for BERT.

  This tokenizer applies an end-to-end, text string to wordpiece
  tokenization. It first applies basic tokenization, followed by wordpiece
  tokenization. See BasicTokenizer and WordpieceTokenizer for their
  respective details.

  Attributes:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of decoding
      long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
    unknown_token: (optional) The value to use when an unknown token is found.
      Default is "[UNK]". If this is set to a string, and `token_out_type` is
      `tf.int64`, the `vocab_lookup_table` is used to convert the
      `unknown_token` to an integer. If this is set to `None`,
      out-of-vocabulary tokens are left as is.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
    lower_case: bool - If true, a preprocessing step is added to lowercase
      the text, apply NFD normalization, and strip accent characters.
    keep_whitespace: bool - If true, preserves whitespace characters instead
      of stripping them away.
    normalization_form: If set and lower_case=False, the input text will be
      normalized to `normalization_form`. See normalize_utf8() op for a list
      of valid values.
  """

  def __init__(self,
               vocab_lookup_table,
               suffix_indicator="##",
               max_bytes_per_word=100,
               max_chars_per_token=None,
               token_out_type=dtypes.int64,
               unknown_token="[UNK]",
               split_unknown_characters=False,
               lower_case=False,
               keep_whitespace=False,
               normalization_form=None):
    self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                           normalization_form)
    self._wordpiece_tokenizer = WordpieceTokenizer(
        vocab_lookup_table, suffix_indicator, max_bytes_per_word,
        max_chars_per_token, token_out_type, unknown_token,
        split_unknown_characters)

  def tokenize(self, text_input):
    """Performs untokenized text to wordpiece tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

    Returns:
      A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
      contents (or ID in the vocab_lookup_table representing that string)
      of the `jth` token in `input[i1...iN]`.
    """
    tokens = self._basic_tokenizer.tokenize(text_input)
    return self._wordpiece_tokenizer.tokenize(tokens)
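# Example usage of the class above, assuming `vocab_table` is a string-to-id
# table built elsewhere (a sketch; the exact output depends on the
# vocabulary contents):
tokenizer = BertTokenizer(vocab_table, lower_case=True,
                          token_out_type=dtypes.string)
wordpieces = tokenizer.tokenize(["They're the greatest!"])
# -> RaggedTensor of shape (batch, words, wordpieces), e.g.
#    [[[b"they"], [b"'"], [b"re"], [b"the"], [b"great", b"##est"], [b"!"]]]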
class BertTokenizer(TokenizerWithOffsets):
  r"""Tokenizer used for BERT.

  This tokenizer applies an end-to-end, text string to wordpiece
  tokenization. It first applies basic tokenization, followed by wordpiece
  tokenization. See BasicTokenizer and WordpieceTokenizer for their
  respective details.

  Attributes:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords or a string which is the file
      path to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of decoding
      long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
    unknown_token: (optional) The value to use when an unknown token is found.
      Default is "[UNK]". If this is set to a string, and `token_out_type` is
      `tf.int64`, the `vocab_lookup_table` is used to convert the
      `unknown_token` to an integer. If this is set to `None`,
      out-of-vocabulary tokens are left as is.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
    lower_case: bool - If true, a preprocessing step is added to lowercase
      the text, apply NFD normalization, and strip accent characters.
    keep_whitespace: bool - If true, preserves whitespace characters instead
      of stripping them away.
    normalization_form: If set and lower_case=False, the input text will be
      normalized to `normalization_form`. See normalize_utf8() op for a list
      of valid values.
    preserve_unused_token: If true, text in the regex format `\[unused\d+\]`
      will be treated as a token and thus remain preserved as is to be looked
      up in the vocabulary.
  """

  def __init__(self,
               vocab_lookup_table,
               suffix_indicator="##",
               max_bytes_per_word=100,
               max_chars_per_token=None,
               token_out_type=dtypes.int64,
               unknown_token="[UNK]",
               split_unknown_characters=False,
               lower_case=False,
               keep_whitespace=False,
               normalization_form=None,
               preserve_unused_token=False):
    super(BertTokenizer, self).__init__()
    _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)
    if isinstance(vocab_lookup_table, (str, ops.Tensor)):
      init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
      vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
          init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
    self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                           normalization_form,
                                           preserve_unused_token)
    self._wordpiece_tokenizer = WordpieceTokenizer(
        vocab_lookup_table, suffix_indicator, max_bytes_per_word,
        max_chars_per_token, token_out_type, unknown_token,
        split_unknown_characters)

  def tokenize_with_offsets(self, text_input):
    tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(text_input)
    wordpieces, wp_begin, wp_end = (
        self._wordpiece_tokenizer.tokenize_with_offsets(tokens))
    begin_expanded = array_ops.expand_dims(begin, axis=2)
    final_begin = begin_expanded + wp_begin
    final_end = begin_expanded + wp_end
    return wordpieces, final_begin, final_end

  def tokenize(self, text_input):
    """Performs untokenized text to wordpiece tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

    Returns:
      A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
      contents (or ID in the vocab_lookup_table representing that string)
      of the `jth` token in `input[i1...iN]`.
    """
    tokens = self._basic_tokenizer.tokenize(text_input)
    return self._wordpiece_tokenizer.tokenize(tokens)
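# How the offset composition in tokenize_with_offsets plays out, assuming
# `tokenizer` is an instance of the class above with a vocab containing
# "great" and "##est" (illustrative example):
wordpieces, begin, end = tokenizer.tokenize_with_offsets(["greatest"])
# Basic tokenization locates the word "greatest" at byte offset 0; wordpiece
# offsets are relative to that word, so adding `begin_expanded` yields byte
# offsets into the original string:
#   "great" -> [0, 5)   "##est" -> [5, 8)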
class BertTokenizer(TokenizerWithOffsets, Detokenizer):
  r"""Tokenizer used for BERT.

  This tokenizer applies an end-to-end, text string to wordpiece
  tokenization. It first applies basic tokenization, followed by wordpiece
  tokenization. See BasicTokenizer and WordpieceTokenizer for their
  respective details.

  Attributes:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords or a string which is the file
      path to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of decoding
      long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`.
    unknown_token: (optional) The value to use when an unknown token is found.
      Default is "[UNK]". If this is set to a string, and `token_out_type` is
      `tf.int64`, the `vocab_lookup_table` is used to convert the
      `unknown_token` to an integer. If this is set to `None`,
      out-of-vocabulary tokens are left as is.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
    lower_case: bool - If true, a preprocessing step is added to lowercase
      the text, apply NFD normalization, and strip accent characters.
    keep_whitespace: bool - If true, preserves whitespace characters instead
      of stripping them away.
    normalization_form: If set and lower_case=False, the input text will be
      normalized to `normalization_form`. See normalize_utf8() op for a list
      of valid values.
    preserve_unused_token: If true, text in the regex format `\[unused\d+\]`
      will be treated as a token and thus remain preserved as is to be looked
      up in the vocabulary.
    basic_tokenizer_class: If set, the class to use instead of BasicTokenizer.
  """

  def __init__(self,
               vocab_lookup_table,
               suffix_indicator="##",
               max_bytes_per_word=100,
               max_chars_per_token=None,
               token_out_type=dtypes.int64,
               unknown_token="[UNK]",
               split_unknown_characters=False,
               lower_case=False,
               keep_whitespace=False,
               normalization_form=None,
               preserve_unused_token=False,
               basic_tokenizer_class=BasicTokenizer):
    super(BertTokenizer, self).__init__()
    _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)
    self._basic_tokenizer = basic_tokenizer_class(lower_case, keep_whitespace,
                                                  normalization_form,
                                                  preserve_unused_token)
    self._wordpiece_tokenizer = WordpieceTokenizer(
        vocab_lookup_table, suffix_indicator, max_bytes_per_word,
        max_chars_per_token, token_out_type, unknown_token,
        split_unknown_characters)

  def tokenize_with_offsets(self, text_input):
    tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(text_input)
    wordpieces, wp_begin, wp_end = (
        self._wordpiece_tokenizer.tokenize_with_offsets(tokens))
    begin_expanded = array_ops.expand_dims(begin, axis=2)
    final_begin = begin_expanded + wp_begin
    final_end = begin_expanded + wp_end
    return wordpieces, final_begin, final_end

  def tokenize(self, text_input):
    """Performs untokenized text to wordpiece tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

    Returns:
      A `RaggedTensor` of tokens where `tokens[i1...iN, j]` is the string
      contents (or ID in the vocab_lookup_table representing that string)
      of the `jth` token in `input[i1...iN]`.
    """
    tokens = self._basic_tokenizer.tokenize(text_input)
    return self._wordpiece_tokenizer.tokenize(tokens)

  def detokenize(self, token_ids):
    """Convert a `Tensor` or `RaggedTensor` of wordpiece IDs to string-words.

    See `WordpieceTokenizer.detokenize` for details.

    Note: `BertTokenizer.tokenize`/`BertTokenizer.detokenize` does not round
    trip losslessly. The result of `detokenize` will not, in general, have
    the same content or offsets as the input to `tokenize`. This is because
    the "basic tokenization" step, which splits the strings into words before
    applying the `WordpieceTokenizer`, includes irreversible steps like
    lower-casing and splitting on punctuation. `WordpieceTokenizer` on the
    other hand **is** reversible.

    Note: This method assumes wordpiece IDs are dense on the interval
    `[0, vocab_size)`.

    Args:
      token_ids: A `RaggedTensor` or `Tensor` with an int dtype.

    Returns:
      A `RaggedTensor` with dtype `string` and the same rank as the input
      `token_ids`.
    """
    return self._wordpiece_tokenizer.detokenize(token_ids)
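# Illustration of the lossy round trip described in the docstring above,
# assuming `tokenizer` is an instance of the class with lower_case=True and
# a vocab covering the words (a sketch, not the library's own example):
ids = tokenizer.tokenize(['John said, "Hi!"'])
words = tokenizer.detokenize(ids.merge_dims(-2, -1))
# -> e.g. [[b'john', b'said', b',', b'"', b'hi', b'!', b'"']]: every
#    wordpiece is recovered, but the casing and the original attachment of
#    punctuation to words are not.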