コード例 #1
0
ファイル: text_process.py プロジェクト: cuptrail/citemachine
    def _initialize_preprocessing_tools(self, tokenize, stemmer, excluded_words, is_valid_word):
        """Set up the stemmer, exclusion set, word filter and tokenizer.

        Each argument is optional; when it is not supplied a default is
        installed on the instance instead.

        Args:
            tokenize: Callable stored as ``self.tokenize``. ``word_tokenize``
                is used when this is ``None``.
            stemmer: Object stored as ``self.stemmer``. Any falsy value
                (not only ``None``) is replaced by a ``LancasterStemmer()``.
            excluded_words: Words to exclude. Any falsy value, including an
                empty collection, is replaced by ``stopwords.words("english")``.
                The chosen words are run through ``stem_all`` with the
                selected stemmer and stored as a set.
            is_valid_word: Predicate stored as ``self.is_valid_word``. When
                ``None``, a default predicate is installed that accepts words
                longer than two characters that are not in
                ``self.excluded_words``.
        """
        self.stemmer = stemmer if stemmer else LancasterStemmer()

        # The exclusion list is stemmed with the same stemmer chosen above.
        raw_exclusions = excluded_words if excluded_words else stopwords.words("english")
        self.excluded_words = set(stem_all(raw_exclusions, self.stemmer))

        if is_valid_word is not None:
            self.is_valid_word = is_valid_word
        else:
            def _default_is_valid_word(word):
                # Reads self.excluded_words at call time, so later changes
                # to that attribute are picked up by the predicate.
                return len(word) > 2 and word not in self.excluded_words

            self.is_valid_word = _default_is_valid_word

        self.tokenize = word_tokenize if tokenize is None else tokenize
コード例 #2
0
 def _split_text(self, text):
     """Tokenize ``text``, strip trailing periods, and stem the tokens.

     The text is split with ``self.tokenize``; every trailing ``.`` is
     removed from each token, and the resulting list is passed to
     ``stem_all`` together with ``self.stemmer``.

     Returns:
         Whatever ``stem_all`` returns for the cleaned token list.
     """
     trimmed_tokens = [token.rstrip('.') for token in self.tokenize(text)]
     return stem_all(trimmed_tokens, self.stemmer)