Beispiel #1
0
def filter(text):
    """Tokenize *text* and return a list of stemmed, filtered tokens.

    Processing steps, in order:

    1. Split on runs of punctuation, spaces, newlines and ASCII digits.
    2. Lower-case each token.
    3. If ``is_ascii`` reports the token as non-ASCII, pass it through
       ``encode('ascii', 'ignore')``.
    4. Discard tokens of three characters or fewer and tokens found in
       ``stop``.
    5. Stem the token with ``PorterStemmer.stem_word`` and, if the stem
       is a ``unicode`` object, encode it to ASCII.
    6. Skip stems that consist only of digits.

    No spell checking is performed.

    Relies on ``re``, ``stop``, ``is_ascii`` and ``PorterStemmer`` being
    available at module level.

    NOTE: the name shadows the built-in ``filter``; it is kept unchanged
    so that existing callers keep working.
    """
    # One stemmer is enough for the whole call; no need to build a new
    # instance for every token.
    stemmer = PorterStemmer()
    return_list = []
    # Raw string so the regex escapes (\-, \n, \d) reach the re module
    # untouched instead of relying on Python passing unknown escapes through.
    for token in re.split(r"[,. \-!?:_'%$/#@&;\n\d]+", text):
        word = token.lower()
        if not is_ascii(word):
            # NOTE(review): on Python 2 this raises UnicodeDecodeError when
            # `word` is a byte str with non-ASCII bytes; this presumably
            # expects `text` to be unicode -- confirm against callers.
            word = word.encode('ascii', 'ignore')
        # Length is checked after the ASCII conversion because dropping
        # characters can shorten the token.
        if len(word) <= 3 or word in stop:
            continue
        stemmed = stemmer.stem_word(word)
        if isinstance(stemmed, unicode):
            stemmed = stemmed.encode('ascii', 'ignore')
        if not stemmed.isdigit():
            return_list.append(stemmed)
    return return_list
Beispiel #2
0
def filter(text):
    """Tokenize *text* and return a list of stemmed, filtered tokens.

    Processing steps, in order:

    1. Split on runs of whitespace, ASCII control characters, backslashes
       and the punctuation characters ``, . ( ) - = ! ? # : _ ' % $ / @ "``.
    2. Lower-case each token.
    3. Remove the substrings ``votetrump`` and ``votehillary`` from the
       token.
    4. Discard tokens of one character or fewer, tokens for which
       ``is_ascii`` is false, and tokens found in ``stop``.
    5. Stem the token with ``PorterStemmer.stem_word`` and, if the stem
       is a ``unicode`` object, encode it to ASCII.
    6. Skip stems that consist only of digits.

    Relies on ``re``, ``stop``, ``is_ascii`` and ``PorterStemmer`` being
    available at module level.

    NOTE: the name shadows the built-in ``filter``; it is kept unchanged
    so that existing callers keep working.
    """
    # The previous pattern listed a literal backslash followed by a literal
    # "s" inside the character class, so every word was cut at each letter
    # "s". It also contained an accidental newline-to-"!" range. The class
    # below spells the delimiters out explicitly: whitespace (\s), the
    # control characters the old range happened to cover (\x0e-\x1f), the
    # punctuation set, and the backslash.
    delimiters = r"""[\s\x0e-\x1f,.()\-=!?#:_'%$/@"\\]+"""
    # One stemmer is enough for the whole call.
    stemmer = PorterStemmer()
    return_list = []
    for token in re.split(delimiters, text):
        word = token.lower()
        # str.replace leaves the string unchanged when the substring is
        # absent, so no membership test is needed first.
        word = word.replace('votetrump', '').replace('votehillary', '')
        if len(word) <= 1 or not is_ascii(word) or word in stop:
            continue
        stemmed = stemmer.stem_word(word)
        if isinstance(stemmed, unicode):
            stemmed = stemmed.encode('ascii', 'ignore')
        if not stemmed.isdigit():
            return_list.append(stemmed)
    return return_list
Beispiel #3
0
def filter(text, removeWords):
    """Tokenize *text* and return a list of stemmed, filtered tokens.

    Args:
        text: The string to tokenize.
        removeWords: Comma-separated substrings to strip from every token,
            e.g. ``"votetrump, votehillary"``. Matching is done on the
            lower-cased token; spaces in this argument are ignored. An
            empty string or ``None`` means nothing is stripped.

    Returns:
        A list of stems, in the order their tokens appear in *text*.

    Processing steps, in order:

    1. Split on runs of whitespace, ASCII control characters, backslashes
       and the punctuation characters ``, . ( ) - = ! ? # : _ ' % $ / @ "``.
    2. Lower-case each token.
    3. Remove every substring listed in *removeWords* from the token.
    4. Discard tokens of one character or fewer, tokens for which
       ``is_ascii`` is false, and tokens found in ``stop``.
    5. Stem the token with ``PorterStemmer.stem_word`` and, if the stem
       is a ``unicode`` object, encode it to ASCII.
    6. Skip stems that consist only of digits.

    Relies on ``re``, ``stop``, ``is_ascii`` and ``PorterStemmer`` being
    available at module level.

    NOTE: the name shadows the built-in ``filter``; it is kept unchanged
    so that existing callers keep working.
    """
    # The previous pattern listed a literal backslash followed by a literal
    # "s" inside the character class, so every word was cut at each letter
    # "s". It also contained an accidental newline-to-"!" range. The class
    # below spells the delimiters out explicitly: whitespace (\s), the
    # control characters the old range happened to cover (\x0e-\x1f), the
    # punctuation set, and the backslash.
    delimiters = r"""[\s\x0e-\x1f,.()\-=!?#:_'%$/@"\\]+"""

    # Parse the removal list once per call rather than once per token.
    # Empty entries (from "" or a trailing comma) are dropped; replacing
    # an empty substring with nothing would be a no-op anyway.
    remove_list = [
        entry
        for entry in (removeWords or "").lower().replace(" ", "").split(",")
        if entry
    ]

    # One stemmer is enough for the whole call.
    # NOTE: under NLTK 3.2.2 the equivalent call is PorterStemmer().stem(word).
    stemmer = PorterStemmer()
    return_list = []
    for token in re.split(delimiters, text):
        word = token.lower()
        # Strip unwanted substrings such as 'votetrump' or 'votehillary'.
        for unwanted in remove_list:
            word = word.replace(unwanted, '')
        if len(word) <= 1 or not is_ascii(word) or word in stop:
            continue
        stemmed = stemmer.stem_word(word)
        if isinstance(stemmed, unicode):
            stemmed = stemmed.encode('ascii', 'ignore')
        if not stemmed.isdigit():
            return_list.append(stemmed)
    return return_list