        return [self.tokenize(text) for text in texts]

    #overrides
    def tokenize(self, text):
        if self._lowercase_characters:
            text = text.lower()
        if self._byte_encoding is not None:
            # We add 1 here so that we can still use 0 for masking, no matter
            # what bytes we get out of this.
            tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
        else:
            tokens = [Token(t) for t in list(text)]
        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            # End tokens get idx=-1 (not 0), matching the convention used by
            # WordTokenizer._filter_and_stem below.
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=-1)
            else:
                token = Token(text=end_token, idx=-1)
            tokens.append(token)
        return tokens


CharacterTokenizer = Tokenizer.register(u"character")(CharacterTokenizer)
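# A minimal usage sketch, not from the original source: it assumes the
# constructor exposes the ``lowercase_characters``, ``byte_encoding``,
# ``start_tokens``, and ``end_tokens`` attributes referenced above.
#
#     tokenizer = CharacterTokenizer(lowercase_characters=True)
#     tokenizer.tokenize(u"Hi!")
#     # -> [Token(u"h"), Token(u"i"), Token(u"!")], one Token per character
#
#     byte_tokenizer = CharacterTokenizer(byte_encoding=u"utf-8")
#     byte_tokenizer.tokenize(u"Hi")
#     # -> [Token(text_id=73), Token(text_id=106)]; each byte is shifted by 1
#     # so that id 0 stays free for masking/padding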
    #overrides
    def tokenize(self, text):
        u"""
        Does whatever processing is required to convert a string of text into a
        sequence of tokens.  At a minimum, this uses a ``WordSplitter`` to split
        the text into words.  It may also do stemming or stopword removal,
        depending on the parameters given to the constructor.
        """
        words = self._word_splitter.split_words(text)
        return self._filter_and_stem(words)

    #overrides
    def batch_tokenize(self, texts):
        batched_words = self._word_splitter.batch_split_words(texts)
        return [self._filter_and_stem(words) for words in batched_words]

    def _filter_and_stem(self, words):
        filtered_words = self._word_filter.filter_words(words)
        stemmed_words = [self._word_stemmer.stem_word(word) for word in filtered_words]
        for start_token in self._start_tokens:
            stemmed_words.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            stemmed_words.append(Token(end_token, -1))
        return stemmed_words


WordTokenizer = Tokenizer.register(u"word")(WordTokenizer)
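# A minimal usage sketch, not from the original source: it assumes the
# constructor defaults to pass-through ``word_filter`` and ``word_stemmer``
# implementations and a spaCy-backed default ``word_splitter``.
#
#     tokenizer = WordTokenizer()
#     tokens = tokenizer.tokenize(u"Don't tokenize me.")
#     # [t.text for t in tokens] is e.g. [u"Do", u"n't", u"tokenize", u"me", u"."];
#     # filtering and stemming are no-ops under the assumed defaults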