コード例 #1
0
    def RemoveStopWords(self):
        """Drop punctuation and stop words from ``self.corpus`` in place."""
        LOGGER.debug("text before StopWords processing: {}".format(
            self.corpus))

        # Punctuation is stripped first; the stop-word pass then runs on
        # the already-cleaned corpus, so the order of steps matters.
        for cleanup_step in (self._remove_punct, self._remove_stop_words):
            self.corpus = cleanup_step()

        LOGGER.debug("text after StopWords processing: {}".format(self.corpus))
コード例 #2
0
 def Stem(self, text):
     """Tokenize *text* and stem every token with the Porter stemmer.

     The stemmed tokens are stored in ``self.corpus`` as a numpy array of
     strings; the method returns nothing.
     """
     LOGGER.debug("text before Stem processing: {}".format(text))
     tokens = word_tokenize(text)
     porter = PorterStemmer()
     # vectorizing function to able to call on list of tokens.
     # otypes=[str] keeps np.vectorize from raising ValueError when *text*
     # tokenizes to an empty list (it cannot infer the output dtype from a
     # size-0 input otherwise).
     stem_words = np.vectorize(porter.stem, otypes=[str])
     self.corpus = stem_words(tokens)
     LOGGER.debug("text after Stem processing: {}".format(self.corpus))
コード例 #3
0
    def Lemmatize(self):
        """Lemmatize every token in ``self.corpus`` with WordNet, in place."""
        LOGGER.debug("text before Lemm processing: {}".format(self.corpus))

        wordnet_lemmatizer = WordNetLemmatizer()
        # vectorizing function to able to call on list of tokens.
        # otypes=[str] avoids the ValueError np.vectorize raises on a size-0
        # corpus, where it cannot infer the output dtype from a first call.
        lemmatize_words = np.vectorize(wordnet_lemmatizer.lemmatize,
                                       otypes=[str])
        self.corpus = lemmatize_words(self.corpus)
        LOGGER.debug("text after Lemm processing: {}".format(self.corpus))
コード例 #4
0
    def Lemmatize(self):
        """
        Lemmatize ``self.corpus`` in place using spaCy.

        Spacy lemmatized much better than nltk,
        one of the examples risen -> rise, only spacy handled that.
        """
        LOGGER.debug("text before Lemm processing: {}".format(self.corpus))

        new_corpus = []
        nlp = spacy.load("en_core_web_sm")
        for el in self.corpus:
            # Bug fix: the original appended the raw spaCy Doc, whose text is
            # the *unlemmatized* input, so no lemmatization ever reached the
            # corpus. Join each token's lemma to store lemmatized strings.
            doc = nlp(str(el))
            new_corpus.append(" ".join(token.lemma_ for token in doc))
        # No .copy() needed: new_corpus is freshly built in this call.
        self.corpus = new_corpus
        LOGGER.debug("text after Lemm processing: {}".format(self.corpus))