def test_count_lists(self):
  """count_words tallies tokens from a plain list of word lists."""
  word_lists = [['aaa', 'bb', 'c'], ['aaa', 'aaa'], ['c']]
  expected = collections.Counter({'aaa': 3, 'bb': 1, 'c': 2})
  self.assertEqual(learner.count_words(word_lists), expected)
def test_count_ragged_dataset(self):
  """count_words tallies tokens from a dataset of ragged string tensors."""
  text_ds = dataset_ops.DatasetV2.from_tensor_slices(
      ['aaa bb c', 'aaa aaa', 'c'])
  split_ds = text_ds.map(ragged_string_ops.string_split_v2)
  expected = collections.Counter({'aaa': 3, 'bb': 1, 'c': 2})
  self.assertEqual(learner.count_words(split_ds), expected)
def test_count_numpy_gen(self):
  """count_words tallies tokens from a generator of numpy string arrays."""

  def word_batches():
    # Same batches as the other tests, yielded lazily as numpy arrays.
    for batch in (['aaa', 'bb', 'c'], ['aaa', 'aaa'], ['c']):
      yield np.array(batch)

  expected = collections.Counter({'aaa': 3, 'bb': 1, 'c': 2})
  self.assertEqual(learner.count_words(word_batches()), expected)
def bert_vocab_from_dataset(dataset,
                            vocab_size: int,
                            reserved_tokens: List[str],
                            bert_tokenizer_params=None,
                            learn_params=None) -> List[str]:
  """Generate a Bert wordpiece vocabulary from a `tf.data.Dataset` of texts.

  ```
  import tensorflow_text as text

  vocab = bert_vocab_from_dataset(dataset, vocab_size, reserved_tokens,
                                  bert_tokenizer_params, learn_params)
  bert_tokenizer = text.BertTokenizer(vocab, **bert_tokenizer_params)
  token_ids = bert_tokenizer.tokenize(text)
  ```

  This uses Bert's splitting algorithm to split the text into words before
  generating the subword vocabulary from the resulting words. The resulting
  vocabulary _can_ be used directly with a `text.WordpieceTokenizer`, but note
  that the vocabulary will be sub-optimal or **broken** if you split the text
  into words a different way.

  ```
  wordpiece_tokenizer = text.WordpieceTokenizer(vocab, ...)
  words = split(text)
  token_ids = wordpiece_tokenizer.tokenize(words)
  ```

  Args:
    dataset: A `tf.data.Dataset` containing string-tensor elements.
    vocab_size: The target vocabulary size. This is the maximum size.
    reserved_tokens: A list of tokens that must be included in the vocabulary.
    bert_tokenizer_params: The `text.BertTokenizer` arguments relevant to
      vocabulary-generation:
      * `lower_case`
      * `keep_whitespace`
      * `normalization_form`
      * `preserve_unused_token`
      See `BertTokenizer` for details. You should use the same values for
      these to both generate the vocabulary and build the `BertTokenizer`.
    learn_params: A dict of additional keyword arguments for the vocabulary
      learning function. See `wordpiece_tokenizer_learner_lib.learn` for
      details.

  Returns:
    A list of strings containing the vocabulary.

  Raises:
    TypeError: If the dataset contains structured elements instead of single
      tensors.
  """
  if bert_tokenizer_params is None:
    bert_tokenizer_params = {}
  if learn_params is None:
    learn_params = {}

  element_spec = dataset.element_spec
  try:
    # Single-tensor elements expose `.shape`; structured elements (tuples,
    # dicts) are nests of specs and do not.
    element_spec.shape
  except AttributeError:
    # `from None` suppresses the internal AttributeError context so the user
    # sees only the actionable TypeError.
    raise TypeError(
        "The dataset should contain single-tensor elements.") from None

  tokenizer = bert_tokenizer.BasicTokenizer(**bert_tokenizer_params)
  words_dataset = dataset.map(tokenizer.tokenize)
  word_counts = learner.count_words(words_dataset)
  vocab = learner.learn(word_counts, vocab_size, reserved_tokens,
                        **learn_params)
  return vocab