def clean(self):
        # initialize Porter stemmer and load stop words
        stemmer = PorterStemmer()
        stop_words = [word.decode('utf-8') for word in DataLoader.load('data/stop_words/stop_words.txt', dtype=str)]

        for article_id in self.article_dict:
            a = self.article_dict[article_id]

            # convert characters to lower
            a.title = a.title.lower()
            a.body = a.body.lower()

            # replace new line characters with spaces
            a.body = a.body.replace('\n', ' ')

            # remove numbers and punctuation
            a.title = sub(r'[^a-z ]', '', a.title)
            a.body = sub(r'[^a-z ]', '', a.body)

            # stem and skip stop words
            title = []
            for word in a.title.split():
                word = stemmer.stem(word)
                if word not in stop_words:
                    title.append(word)
            a.title = ' '.join(title)

            body = []
            for word in a.body.split():
                word = stemmer.stem(word)
                if word not in stop_words:
                    body.append(word)
            a.body = ' '.join(body)

        self._save('articles_preprocessed.json', self.article_dict, 'cleaned (stemmed, removed stop words)')
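For comparison, the same cleaning pipeline can be written as a standalone helper. This is a minimal sketch, assuming NLTK's PorterStemmer and a plain Python list of stop words rather than the DataLoader used above; clean_text is a hypothetical name, not part of the original class.

from re import sub
from nltk import PorterStemmer

def clean_text(text, stop_words):
    # lowercase, drop newlines, and strip everything except letters and spaces
    text = sub(r'[^a-z ]', '', text.lower().replace('\n', ' '))
    stemmer = PorterStemmer()
    # stem each remaining word and keep it only if the stem is not a stop word
    stems = (stemmer.stem(w) for w in text.split())
    return ' '.join(s for s in stems if s not in stop_words)

# clean_text('Falcons Soar Over the Stadium!', ['the', 'over'])  -> roughly 'falcon soar stadium'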
def stemming(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in lines, stem them

    Return: stemmed_list (list of strings(terms that stemmed))
    """
    stemmed_list = []
    stemmer = PorterStemmer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation; simply deleting punctuation characters would glue
        # words together (e.g. amazon.com => amazoncom), so replace them with a space
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # list to store stemmed terms
        stemmed_line = []
        for term in line_token:
            term = stemmer.stem(term)
            stemmed_line.append(term)
        # back to sentence as a string
        stemmed_sentence = ' '.join(stemmed_line)
        stemmed_list.append(stemmed_sentence)
    return stemmed_list
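A quick usage sketch for stemming(); it assumes the names the function relies on: re, PorterStemmer, and wt as an alias for NLTK's word_tokenize (with the punkt tokenizer data downloaded).

import re
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize as wt

docs = ["Amazon.com ships books quickly.", "The runners were running."]
print(stemming(docs))
# roughly ['amazon com ship book quickli', 'the runner were run']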
Example #3
def make_tags(title_string):
    stemmer = PorterStemmer()
    ret = []
    for word in title_string.split():
        if word not in stop_words:
            ret.append(stemmer.stem(word.lower()))
    return ret
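A usage sketch for make_tags(); the snippet relies on a module-level stop_words collection that it does not define, so a small assumed list is supplied here.

from nltk import PorterStemmer

stop_words = ['the', 'of', 'a']  # assumed global that make_tags() expects

print(make_tags("The Taming of the Shrew"))
# roughly ['the', 'tame', 'shrew'] -- the stop-word check runs before lowercasing,
# so the capitalized 'The' slips through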
Example #4
  def __process_email(self, email_contents, vocab):
    '''
    Preprocesses the body of an email and returns a
    list of word indices.

    Arguments:
      email_contents (str): Email body.
      vocab (dict): Dictionary mapping words to their indices.

    Return:
      (int list): Indices into vocab of the words found in the email body.
    '''
    # Lower case.
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >, contains
    # no other < or > inside, and replaces it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Tokenize and also get rid of any punctuation
    word_list = re.split(r'''[ @$/#.\-:&*+=\[\]?!(){},'">_<;%]''', email_contents)

    # Drop empty strings and single-character words.
    word_list = [s for s in word_list if s and len(s) > 1]

    # Remove any non-alphanumeric characters left inside the words
    word_list = [re.sub('[^a-zA-Z0-9]', '', s) for s in word_list]

    # Drop entries that became empty or too short after the stripping.
    word_list = [s for s in word_list if s and len(s) > 1]

    # Stem the words
    ps = PorterStemmer()
    word_list = [ps.stem(s) for s in word_list]
    word_indices = []

    # Find index in vocab list.
    for w in word_list:
      if w in vocab:
        word_indices.append(vocab[w])
    return word_indices
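To see what the normalization steps above produce, here is a standalone sketch that applies the same substitutions to a toy email body; it does not need the class or the vocabulary.

import re

s = "Visit http://example.com or mail me at a@b.com, only $100!".lower()
s = re.sub('<[^<>]+>', ' ', s)                       # strip HTML tags
s = re.sub('[0-9]+', 'number', s)                    # digits -> 'number'
s = re.sub(r'(http|https)://[^\s]*', 'httpaddr', s)  # URLs -> 'httpaddr'
s = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', s)         # email addresses -> 'emailaddr'
s = re.sub('[$]+', 'dollar', s)                      # dollar signs -> 'dollar'
print(s)
# roughly 'visit httpaddr or mail me at emailaddr only dollarnumber!'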
 def getStemmedWords(self,html):
     
     stemmed_words=[]
     #stemmer = SnowballStemmer("english")
     stemmer = PorterStemmer()
     for token in html:  # 'html' is expected to be an iterable of tokens
         stemmed_words.append(stemmer.stem(token))
         
     return ' '.join(stemmed_words)
Example #6
def stemWordsCountFrequency():
    global sentenceDictionary, backwardStemming, wordDictionary

    import nltk
    from nltk import PorterStemmer
    from nltk.corpus import stopwords

    for sentence in sentenceDictionary:
        #removes all characters from the content that create problems for tokenization
        sentence = sentence.replace('\n', ' ')
        sentence = sentence.replace('[', ' ')
        sentence = sentence.replace(']', ' ')
        sentence = sentence.replace('\x92', '\'')
        sentence = sentence.replace('\x85', '...')
        sentence = sentence.replace('\x96', '-')
        sentence = sentence.replace('\x93', '\"')
        sentence = sentence.replace('\x94', '\"')
        sentence = sentence.replace('\xa0', ' ')
        sentence = sentence.replace('\x97', ' ')
        sentence = sentence.lower()

        try:
            tokens = nltk.word_tokenize(sentence)
        except Exception:
            print " TOKENIZER CRASHED "
            print " Copy this string into the shell to see the unicode characters: "
            print sentence
            continue  # otherwise tokens from the previous sentence would be reused

        #stems the tokens. For example 'complications' becomes u'complic'
        stems = []
        stemmer = PorterStemmer()  #uses the Porter stemming algorithm
        for token in tokens:
            stem = str(stemmer.stem(token))
            backwardStemming[stem] = token
            stems.append(stem)

        #gets rid of useless English words (or, and, this, that...)
        stop = stopwords.words('english')

        tokens = [i for i in stems if i not in stop]

        #counts the tokens
        for token in tokens:
            if token in wordDictionary:  #update count
                wordDictionary[token] = wordDictionary[token] + 1
            else:
                wordDictionary[token] = 1  #if the entry is new, set to 1

        #print wordDictionary

    processWordDictionary()
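The stem-and-count core of the routine above can also be written compactly with collections.Counter. This is only an illustrative sketch over a plain list of sentences, not the module's sentenceDictionary/wordDictionary globals, and it assumes NLTK's punkt and stopwords data are available.

from collections import Counter

import nltk
from nltk import PorterStemmer
from nltk.corpus import stopwords

def count_stem_frequencies(sentences):
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    counts = Counter()
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence.lower())
        stems = [stemmer.stem(t) for t in tokens]
        # as in the routine above, stop words are filtered after stemming
        counts.update(s for s in stems if s not in stop)
    return counts

# count_stem_frequencies(["Complications arise.", "More complications!"])
# roughly Counter({'complic': 2, 'aris': 1, '.': 1, '!': 1})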
 def __init__(self,text):
     lmtzr = WordNetLemmatizer()
     porter_stem = PorterStemmer()
     wordnet_tag ={'NN':'n','JJ':'a','VB':'v','RB':'r'}
     data = text.lower()
     tokens = nltk.word_tokenize(data)
     tagged = nltk.pos_tag(tokens)
     word_list = []
     for t in tagged:
         try:
             word_list.append(lmtzr.lemmatize(t[0],wordnet_tag[t[1][:2]]))
         except KeyError:
             # POS tags not covered by wordnet_tag fall back to the Porter stemmer
             word_list.append(porter_stem.stem(t[0]))
     self.filtered_words = [w for w in word_list if w not in stopwords.words('english')]
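The tag-driven fallback above is easiest to see on a short sentence. This sketch only shows the nltk.pos_tag output that feeds the wordnet_tag lookup and a couple of lemmatizer calls; tags missing from wordnet_tag (such as 'DT') are the ones that fall through to the Porter stemmer. It assumes the relevant NLTK tagger and WordNet data are downloaded.

import nltk
from nltk.stem import WordNetLemmatizer

tagged = nltk.pos_tag(nltk.word_tokenize("the striped bats were hanging"))
# roughly [('the', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('were', 'VBD'), ('hanging', 'VBG')]
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('bats', 'n'), lmtzr.lemmatize('hanging', 'v'))   # bat hang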
Example #8
def process_email(email_contents):
    """
    Preprocesses the body of an email and returns a list of word indices.

    Parameters
    ----------
    email_contents : string
        The email content.

    Returns
    -------
    list
        A list of word indices.

    """
    vocab_list = get_vocablist()

    email_contents = email_contents.lower()
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
    email_contents = re.sub('[0-9]+', 'number', email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    words = re.split(r"""[ @$/#.\-:&*+=\[\]?!(){},'">_<;%\n\r]""", email_contents)
    word_indices = []
    stemmer = PorterStemmer()
    for word in words:
        word = re.sub('[^a-zA-Z0-9]', '', word)
        if word == '':
            continue
        word = stemmer.stem(word)
        print word,
        if word in vocab_list:
            idx = vocab_list.index(word)
            word_indices.append(idx)

    return word_indices
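A usage sketch for process_email(); it assumes get_vocablist() is available in the same module and returns the vocabulary as a list of words, so the exact indices depend entirely on that list.

sample = "Buy now at http://deals.example.com for only $99"
indices = process_email(sample)
# prints each processed token and returns, for every token found in the
# vocabulary, its position in the list returned by get_vocablist()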
    df_to_append = tmp_df_agg.sort_values('AUC test mean', ascending=False).head(1) # pick the row with the maximum AUC
    df_to_append = df_to_append[['C', 'AUC test mean']] # keep only the needed columns
    df_to_append.columns = ['C', 'CV AUC'] # rename the selected columns
    df_to_append.insert(0, 'Vectorizer', train_data_ind) # add columns
    df_to_append.insert(1, 'Stemming', stemming) # add columns
    df_auc_agg = df_auc_agg.append(df_to_append, ignore_index=True) # append the row to the dataframe of aggregates

for x in vectorized_text:
    run_cv(x, stemming = False)

# Stemming of the original text
from nltk import PorterStemmer
ps = PorterStemmer()
import re
stemmed_train_text = [' '.join([ps.stem(x) for x in re.findall(r"[\w']+", y)]) for y in twenty_train.data]
# Dictionary of vectorizers for the text after stemming
vectorizers_stem = {'CountVect': CountVectorizer(binary = False), 'CountVectBin': CountVectorizer(binary = True), 'TFIDFVect': TfidfVectorizer()}
# Vectorize the stemmed text
vectorized_stemmed_text_train = {}
for i in vectorizers_stem:
    vectorized_stemmed_text_train[i] = vectorizers_stem[i].fit_transform(stemmed_train_text)
vectorized_stemmed_text_train
for x in vectorized_stemmed_text_train:
    run_cv(x, stemming = True)
print('Best models')
df_auc_agg.sort_values('CV AUC', ascending=False)
# Fit the best model on the full training set and compute AUC on the training and test sets
best_model = LogisticRegression(class_weight = 'balanced', penalty = 'l1', C = 6.0).fit(vectorized_stemmed_text_train['TFIDFVect'], train_labels)
train_auc = calc_auc(y_labels = train_labels, y_predicted = best_model.predict_proba(vectorized_stemmed_text_train['TFIDFVect'])[:, 1])
print('Train AUC = ' + str(train_auc))
Example #10
def stem_word(word):
    return PorterStemmer().stem(word)
Example #11
import cjson
from nltk import PorterStemmer

infile = './stop_word_list_new'
x = PorterStemmer()
f = open(infile, 'r')
listt = cjson.decode(f.readline())
nw = list(set(listt))
new_list = []
for word in nw:
    word1 = x.stem(word)
    if word1 not in new_list:
        new_list.append(word1)
newlist = list(set(new_list))
print new_list
print len(new_list)

outfile = './stop_word_porter_stems'
o = open(outfile, 'w')
o.write(cjson.encode(new_list))
outfile1 = './stop_word_list_new'
o1 = open(outfile1, 'w')
o1.write(cjson.encode(nw))