def applyStem(my_list):
    """Return a new list with the tokens of ``my_list`` Porter-stemmed.

    A token is stemmed only when the pattern ``[a-z]+`` matches at its
    start, i.e. it begins with a lowercase ASCII letter. Every other
    token is copied to the result unchanged.

    Args:
        my_list: Iterable of string tokens.

    Returns:
        A list with one entry per input token, in the same order.
    """
    # NOTE(review): tokens that do not match are kept as-is rather than
    # dropped -- confirm this is the intended behavior.
    stemmer = Stemmer.PorterStemmer()
    starts_lowercase = re.compile(r'[a-z]+').match
    return [
        stemmer.stem(word, 0, len(word) - 1) if starts_lowercase(word) else word
        for word in my_list
    ]
def clean_text(doc):
    """Normalize a document into a string of stemmed, non-stop-word tokens.

    The text is processed in this order:

    1. Every character that is not an ASCII letter or digit is replaced
       by a space.
    2. The result is lowercased and split on whitespace.
    3. English stop words (NLTK list) and the token ``'w'`` are removed.
    4. Each remaining word is stemmed with the Porter stemmer.

    Args:
        doc: Raw document text.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when
        no token survives the filtering.
    """
    # The class must be A-Z, not A-z: the wider range also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be
    # kept inside tokens instead of being treated as separators.
    words = re.sub(r"[^a-zA-Z0-9]", " ", doc)
    clean_words = words.lower().split()

    # A set gives O(1) membership tests; 'w' is additionally treated as a
    # stop word on top of the NLTK English list.
    stop = set(stopwords.words("english"))
    stop.add('w')
    important_words = [w for w in clean_words if w not in stop]

    # One stemmer instance is shared by all words instead of building a
    # new one per word (applyStem in this file reuses an instance the
    # same way).
    porter = Stemmer.PorterStemmer()
    last_words = [porter.stem(w, 0, len(w) - 1) for w in important_words]

    return " ".join(last_words)