Beispiel #1
0
    def applyStem(my_list):
        """Return a new list in which word-like tokens are replaced by their stems.

        A token is stemmed only when it *starts* with a lowercase ASCII letter
        (the pattern is anchored at the beginning by ``match``, not at the
        end); every other token is passed through unchanged. Input order is
        preserved and ``my_list`` itself is not modified.

        Args:
            my_list: Iterable of string tokens.

        Returns:
            A list with one entry per input token.
        """
        stemmer = Stemmer.PorterStemmer()
        starts_with_lowercase = re.compile(r'[a-z]+').match

        def _stem_if_word(word):
            # Tokens that do not begin with a-z (digits, punctuation,
            # uppercase-initial words, empty strings) are kept verbatim.
            if starts_with_lowercase(word) is None:
                return word
            # The stemmer takes the string plus two positions; presumably the
            # inclusive start/end indexes to stem -- confirm against Stemmer.
            return stemmer.stem(word, 0, len(word) - 1)

        return [_stem_if_word(word) for word in my_list]
Beispiel #2
0
def clean_text(doc):
    """Normalize a document into a string of stemmed, non-stop-word tokens.

    Processing steps:
      1. Replace every character that is not an ASCII letter or digit with
         a space.
      2. Lowercase the text and split it on whitespace.
      3. Drop English stop words (the NLTK list plus the extra token 'w').
      4. Stem each remaining word with ``Stemmer.PorterStemmer``.
      5. Join the stems with single spaces.

    Args:
        doc: The raw text to clean.

    Returns:
        A single string of space-separated stemmed words (empty when no
        word survives the filtering).
    """
    # The upper-case range must be ``A-Z``. The previous ``A-z`` range also
    # spans the ASCII characters between 'Z' and 'a' ("[", "\", "]", "^",
    # "_" and "`"), so those punctuation marks were never stripped.
    words = re.sub(r"[^a-zA-Z0-9]", " ", doc)
    clean_words = words.lower().split()

    # A set gives O(1) membership tests while filtering.
    stop = set(stopwords.words("english"))
    stop.add('w')
    important_words = [w for w in clean_words if w not in stop]

    # One stemmer instance is reused for all words instead of constructing a
    # new one per word.
    porter = Stemmer.PorterStemmer()
    last_words = [porter.stem(w, 0, len(w) - 1) for w in important_words]

    return " ".join(last_words)