def filter(text):
    """Tokenize *text* and return a list of cleaned, stemmed tokens.

    Processing steps, in order:

    1. Split on runs of the listed punctuation characters, spaces,
       newlines and digits.
    2. Lower-case each piece.
    3. If the module's ``is_ascii`` helper rejects the piece, strip its
       non-ASCII characters via ``encode('ascii', 'ignore')``.
    4. Discard pieces of three characters or fewer and pieces present in
       the module-level ``stop`` collection.
    5. Porter-stem the remainder; stems that consist only of digits are
       skipped.

    No spell checking is performed.

    NOTE: the name shadows the builtin ``filter``; it is kept to preserve
    the existing interface.

    :param text: the raw string to tokenize.
    :returns: list of stemmed tokens, in order of appearance.
    """
    # One stemmer instance serves the whole call; no need to build one
    # per token.
    stemmer = PorterStemmer()
    tokens = []
    # Raw string so the regex escapes (\-, \d) reach ``re`` untouched.
    for piece in re.split(r"[,. \-!?:_'%$/#@&;\n\d]+", text):
        word = piece.lower()
        if not is_ascii(word):
            word = word.encode('ascii', 'ignore')
        if len(word) <= 3 or word in stop:
            continue
        stemmed = stemmer.stem_word(word)
        if isinstance(stemmed, unicode):
            stemmed = stemmed.encode('ascii', 'ignore')
        if not stemmed.isdigit():
            tokens.append(stemmed)
    return tokens
def filter(text):
    """Tokenize *text* and return a list of cleaned, stemmed tokens.

    Processing steps, in order:

    1. Split on runs of whitespace and the listed punctuation characters.
    2. Lower-case each piece.
    3. Remove the substrings ``votetrump`` and ``votehillary`` wherever
       they occur inside a piece.
    4. Discard pieces shorter than two characters, pieces rejected by the
       module's ``is_ascii`` helper, and pieces present in the
       module-level ``stop`` collection.
    5. Porter-stem the remainder; stems that consist only of digits are
       skipped.

    NOTE: the name shadows the builtin ``filter``; it is kept to preserve
    the existing interface.

    :param text: the raw string to tokenize.
    :returns: list of stemmed tokens, in order of appearance.
    """
    # Delimiter class, written as a raw string so every escape reaches
    # ``re`` unchanged.  ``\\`` is a literal backslash and ``\s`` is any
    # whitespace; the letter "s" itself must NOT be a delimiter, otherwise
    # every word containing an "s" is cut into fragments.
    # ``\n-!`` is a character range (newline through "!"), so the control
    # characters inside that span act as delimiters as well.
    delimiters = r"""[,. ()\-\\\s=\n-!?#:_'%$/@"]+"""

    # One stemmer instance serves the whole call.
    stemmer = PorterStemmer()
    tokens = []
    for piece in re.split(delimiters, text):
        # str.replace is a no-op when the tag is absent, so no membership
        # test is needed beforehand.
        word = piece.lower().replace('votetrump', '').replace('votehillary', '')
        if len(word) <= 1 or not is_ascii(word) or word in stop:
            continue
        stemmed = stemmer.stem_word(word)
        if isinstance(stemmed, unicode):
            stemmed = stemmed.encode('ascii', 'ignore')
        if not stemmed.isdigit():
            tokens.append(stemmed)
    return tokens
def filter(text, removeWords):
    """Tokenize *text* and return a list of cleaned, stemmed tokens.

    Processing steps, in order:

    1. Split on runs of whitespace and the listed punctuation characters.
    2. Lower-case each piece.
    3. Remove every fragment named in *removeWords* (for example
       ``votetrump`` or ``votehillary``) wherever it occurs inside a piece.
    4. Discard pieces shorter than two characters, pieces rejected by the
       module's ``is_ascii`` helper, and pieces present in the
       module-level ``stop`` collection.
    5. Porter-stem the remainder; stems that consist only of digits are
       skipped.

    NOTE: the name shadows the builtin ``filter``; it is kept to preserve
    the existing interface.

    :param text: the raw string to tokenize.
    :param removeWords: comma-separated fragments to delete from tokens.
        Matching is case-insensitive and spaces in the list are ignored.
        ``None`` or an empty string means "remove nothing".
    :returns: list of stemmed tokens, in order of appearance.
    """
    # Delimiter class, written as a raw string so every escape reaches
    # ``re`` unchanged.  ``\\`` is a literal backslash and ``\s`` is any
    # whitespace; the letter "s" itself must NOT be a delimiter, otherwise
    # every word containing an "s" is cut into fragments.
    # ``\n-!`` is a character range (newline through "!"), so the control
    # characters inside that span act as delimiters as well.
    delimiters = r"""[,. ()\-\\\s=\n-!?#:_'%$/@"]+"""

    # Parse the removal list once per call rather than once per token, and
    # drop the empty entries produced by stray or trailing commas.
    fragments = []
    if removeWords:
        fragments = [
            entry
            for entry in removeWords.lower().replace(" ", "").split(",")
            if entry
        ]

    # One stemmer instance serves the whole call.  Older NLTK releases
    # expose stem_word(); per the note left by the original author, NLTK
    # 3.2.2 uses stem() instead, so fall back to it when stem_word is
    # missing.
    stemmer = PorterStemmer()
    stem = getattr(stemmer, 'stem_word', None) or stemmer.stem

    tokens = []
    for piece in re.split(delimiters, text):
        word = piece.lower()
        for fragment in fragments:
            word = word.replace(fragment, '')
        if len(word) <= 1 or not is_ascii(word) or word in stop:
            continue
        stemmed = stem(word)
        if isinstance(stemmed, unicode):
            stemmed = stemmed.encode('ascii', 'ignore')
        if not stemmed.isdigit():
            tokens.append(stemmed)
    return tokens