def _initialize_preprocessing_tools(self, tokenize, stemmer, excluded_words, is_valid_word):
    """Install the tokenizer, stemmer, excluded-word set, and word filter.

    Any argument may be ``None``, in which case a default is used:

    tokenize       -- callable splitting text into tokens; defaults to
                      ``word_tokenize``.
    stemmer        -- stemmer passed to ``stem_all``; defaults to
                      ``LancasterStemmer()``.
    excluded_words -- iterable of words to exclude (stemmed before storage);
                      defaults to the NLTK English stopword list.  An
                      explicitly empty iterable now means "exclude nothing".
    is_valid_word  -- predicate deciding whether a token is kept; defaults
                      to: longer than 2 chars and not in the excluded set.
    """
    # Use explicit `is None` checks throughout.  The previous truthiness
    # test on `excluded_words` silently replaced an explicitly empty
    # collection with the stopword default, which a caller could not
    # override; `is None` makes the sentinel unambiguous and matches the
    # checks already used for `is_valid_word` and `tokenize`.
    self.stemmer = stemmer if stemmer is not None else LancasterStemmer()

    if excluded_words is None:
        excluded_words = stopwords.words("english")
    # Stem the exclusion list once so lookups compare stem-to-stem.
    self.excluded_words = set(stem_all(excluded_words, self.stemmer))

    if is_valid_word is None:
        # Default filter: keep tokens longer than two characters that are
        # not in the (stemmed) exclusion set.  Closes over self so later
        # reassignment of self.excluded_words is honored.
        is_valid_word = lambda word: (len(word) > 2) and (word not in self.excluded_words)
    self.is_valid_word = is_valid_word

    self.tokenize = word_tokenize if tokenize is None else tokenize
def _split_text(self, text):
    """Tokenize *text*, drop trailing periods from each token, and stem.

    Returns the list produced by ``stem_all`` over the cleaned tokens,
    using the tokenizer and stemmer configured on this instance.
    """
    # Strip trailing '.' so abbreviations / sentence-final tokens stem
    # the same as their bare forms.
    cleaned = [token.rstrip('.') for token in self.tokenize(text)]
    return stem_all(cleaned, self.stemmer)