def tokenize(self, text):
  """Tokenizes a piece of `text` and returns a list of `SubToken`s."""
  text = bert_tokenization.convert_to_unicode(text)

  # Begin with the entire input as a single string.
  subtokens = [SubToken(text, text, is_good=True)]
  del text  # unused after this point
  subtokens = self._clean_text(subtokens)

  # This was added on November 1st, 2018 for the multilingual and Chinese
  # models. It is now applied to the English models as well, but it doesn't
  # matter since the English models were not trained on any Chinese data
  # and generally don't have any Chinese data in them (there are Chinese
  # characters in the vocabulary only because English Wikipedia does contain
  # some Chinese words).
  subtokens = self._tokenize_chinese_chars(subtokens)

  # Split on punctuation, preserving special tokens.
  subtokens = whitespace_tokenize(subtokens)
  split_subtokens = []
  for subtoken, orig_subtoken, is_good in subtokens:
    assert subtoken == orig_subtoken
    if not is_good:
      # Non-text content (e.g. control characters) is kept but never split.
      split_subtokens.append(SubToken(subtoken, subtoken, is_good=False))
      continue
    if bert_tokenization.preserve_token(subtoken, self.vocab):
      # Special tokens found in the vocab pass through unsplit.
      split_subtokens.append(SubToken(subtoken, subtoken, is_good=True))
      continue
    split_subtokens.extend(
        self._run_split_on_punc(
            [SubToken(subtoken, subtoken, is_good=True)]))
  return split_subtokens
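# A minimal usage sketch (illustrative, not part of the original code). The
# loop above unpacks each element as `(subtoken, orig_subtoken, is_good)`,
# so `SubToken` behaves like a 3-field tuple of (normalized text, original
# text, is-good flag). The class name `BasicTokenizer` and the toy vocab
# below are assumptions for the example:
#
#   tokenizer = BasicTokenizer(vocab={"[Q]": 0, "[SEP]": 1})
#   for normalized, orig, is_good in tokenizer.tokenize("Hello, [Q] world!"):
#     print(normalized, orig, is_good)
#
# Given the logic above, whitespace is consumed by `whitespace_tokenize`,
# punctuation such as ',' and '!' is split into its own `SubToken` by
# `_run_split_on_punc`, and '[Q]' survives intact because
# `bert_tokenization.preserve_token` matches it against `self.vocab`.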
def tokenize(self, text):
  """Tokenizes a piece of `text` and returns a list of `SubToken`s."""
  split_tokens = []  # list of `SubToken`s.
  for token, orig_token, is_good_token in self.basic_tokenizer.tokenize(text):
    if not is_good_token:
      split_tokens.append(SubToken(token, orig_token, is_good=False))
      continue

    # Preserve special tokens such as '[Q]' and '[SEP]'.
    if bert_tokenization.preserve_token(token, self.vocab):
      split_tokens.append(SubToken(token, orig_token, is_good=True))
      continue

    # For everything else, send the text-like tokens that have survived
    # whitespace and punctuation splitting through the wordpiece tokenizer.
    for sub_token in self.wordpiece_tokenizer.tokenize(
        [SubToken(token, orig_token, is_good_token)]):
      # `sub_token` has type `SubToken`.
      split_tokens.append(sub_token)
  return split_tokens
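# A minimal end-to-end sketch (illustrative, not part of the original code).
# The wrapper class name `FullTokenizer`, its constructor signature, and the
# `normalized` field name (inferred from the 3-tuple unpacking above) are
# assumptions; the '##' continuation prefix follows the standard BERT
# wordpiece convention:
#
#   tokenizer = FullTokenizer(vocab_file="vocab.txt")
#   subtokens = tokenizer.tokenize("unaffable [SEP]")
#   print([st.normalized for st in subtokens])
#   # e.g. ['un', '##aff', '##able', '[SEP]']: '[SEP]' is preserved whole,
#   # while ordinary words are split into pieces from the vocabulary.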