def tokenize(self, text): """ :param text: :return: """ text = convert_to_unicode(text) split_tokens = text.split(self.split_char) return split_tokens
def tokenize(self, text): """Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. For example: input = "unaffable" output = ["un", "##aff", "##able"] Args: text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer. Returns: A list of wordpiece tokens. """ text = convert_to_unicode(text) output_tokens = [] for token in whitespace_tokenize(text): chars = list(token) if len(chars) > self.max_input_chars_per_word: output_tokens.append(self.unk_token) continue is_bad = False start = 0 sub_tokens = [] while start < len(chars): end = len(chars) cur_substr = None while start < end: substr = "".join(chars[start:end]) if start > 0: substr = "##" + substr if substr in self.vocabulary.vocab_dict: cur_substr = substr break end -= 1 if cur_substr is None: is_bad = True break sub_tokens.append(cur_substr) start = end if is_bad: output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) return output_tokens
def tokenize(self, text): """Tokenizes a piece of text into its word pieces. Returns: A list of wordpiece tokens. """ text = text.lower() if self.do_lower_case else text text = convert_to_unicode(text) output_tokens = [] for token in text.split(self.split_token): if token in self.vocab: output_tokens.append(token) else: sp_tokens = self.tokenizer.EncodeAsPieces(token) for sp_token in sp_tokens: if sp_token in self.vocab: output_tokens.append(sp_token) return output_tokens
def tokenize(self, text): """Tokenizes a piece of text into its word pieces. Returns: A list of wordpiece tokens. """ text = text.lower() if self.do_lower_case else text text = convert_to_unicode(text.replace("\1", " ")) tokens = self.tokenizer.EncodeAsPieces(text) output_tokens = [] for token in tokens: if token == self.sp_unk_token: token = self.unk_token if token in self.vocab: output_tokens.append(token) else: output_tokens.append(self.unk_token) return output_tokens
def load_vocab(self): """ :return: """ vocab_dict = collections.OrderedDict() id_dict = collections.OrderedDict() file_vocab = open(self.vocab_path) for num, line in enumerate(file_vocab): items = convert_to_unicode(line.strip()).split("\t") if len(items) > 2: break token = items[0] if len(items) == 2: index = items[1] else: index = num token = token.strip() vocab_dict[token] = int(index) id_dict[index] = token return vocab_dict, id_dict
def tokenize(self, text): """Tokenizes a piece of text.""" text = convert_to_unicode(text) text = self._clean_text(text) # This was added on November 1st, 2018 for the multilingual and Chinese # models. This is also applied to the English models now, but it doesn't # matter since the English models were not trained on any Chinese data # and generally don't have any Chinese data in them (there are Chinese # characters in the vocabulary because Wikipedia does have some Chinese # words in the English Wikipedia.). text = self._tokenize_chinese_chars(text) orig_tokens = whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: if self.do_lower_case: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token)) output_tokens = whitespace_tokenize(" ".join(split_tokens)) return output_tokens
def tokenize(self, text): """ :param text: :return: """ text = convert_to_unicode(text) output_tokens = [] for token in whitespace_tokenize(text): chars = list(token) if len(chars) > self.max_input_chars_per_word: output_tokens.append(self.unk_token) continue is_bad = False start = 0 sub_tokens = [] while start < len(chars): end = len(chars) cur_substr = None while start < end: substr = "".join(chars[start:end]) if start == 0: substr = u'\u2581' + substr if substr in self.vocabulary.vocab_dict: cur_substr = substr break end -= 1 if cur_substr is None: is_bad = True break sub_tokens.append(cur_substr) start = end if is_bad: output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) return output_tokens