def clean(self):
    # initialize Porter stemmer and load stop words
    stemmer = PorterStemmer()
    stop_words = [word.decode('utf-8') for word in
                  DataLoader.load('data/stop_words/stop_words.txt', dtype=str)]
    for article_id in self.article_dict:
        a = self.article_dict[article_id]
        # convert characters to lower case
        a.title = a.title.lower()
        a.body = a.body.lower()
        # replace newline characters with spaces
        a.body = a.body.replace('\n', ' ')
        # remove numbers and punctuation
        a.title = sub(r'[^a-z ]', '', a.title)
        a.body = sub(r'[^a-z ]', '', a.body)
        # stem each whitespace-separated token and skip stop words
        title = []
        for word in a.title.split():
            word = stemmer.stem(word)
            if word not in stop_words:
                title.append(word)
        a.title = ' '.join(title)
        body = []
        for word in a.body.split():
            word = stemmer.stem(word)
            if word not in stop_words:
                body.append(word)
        a.body = ' '.join(body)
    self._save('articles_preprocessed.json', self.article_dict,
               'cleaned (stemmed, removed stop words)')
def stemming(line_list):
    """
    Input: line_list (list of strings (sentences/documents)), e.g. dataset.data.
    Iterates over all terms in each line and stems them.
    Return: stemmed_list (list of strings of stemmed terms).
    """
    stemmed_list = []
    stemmer = PorterStemmer()
    for line in line_list:
        # lowercase
        line = line.lower()
        # remove punctuation; deleting punctuation characters outright would
        # merge tokens such as 'amazon.com' into 'amazoncom', so replace runs
        # of non-alphanumeric characters with a space instead
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # stem each term
        stemmed_line = [stemmer.stem(term) for term in line_token]
        # join the stemmed terms back into a single string
        stemmed_list.append(' '.join(stemmed_line))
    return stemmed_list
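A minimal usage sketch of stemming(), assuming wt is NLTK's word_tokenize imported under that alias and the punkt tokenizer data is installed:

import re
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize as wt

docs = ["The runners were running quickly.", "Visit amazon.com today!"]
print(stemming(docs))
# e.g. ['the runner were run quickli', 'visit amazon com today']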
def make_tags(title_string):
    stemmer = PorterStemmer()
    ret = []
    for word in title_string.split():
        # lowercase before the stop-word check so that e.g. 'The' matches 'the'
        word = word.lower()
        if word not in stop_words:
            ret.append(stemmer.stem(word))
    return ret
def __process_email(self, email_contents, vocab):
    '''
    Preprocess the body of an email and return a list of word indices.

    Arguments:
        email_contents (str): Email body.
        vocab (dict): Word-to-index dictionary.

    Return:
        (int list): Indices into vocab of the recognized words.
    '''
    # lower case
    email_contents = email_contents.lower()
    # strip all HTML: any expression that starts with < and ends with >
    # and has no < or > inside the tag is replaced with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
    # handle numbers: one or more digits become the token 'number'
    email_contents = re.sub('[0-9]+', 'number', email_contents)
    # handle URLs: strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    # handle email addresses: strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    # handle $ signs
    email_contents = re.sub('[$]+', 'dollar', email_contents)
    # tokenize and get rid of punctuation; the delimiters live in a character
    # class so that characters like $, ., * and the brackets are taken literally
    word_list = re.split(r"""[ @$/#.\-:&*+=\[\]?!(){},'">_<;%]""", email_contents)
    # remove any non-alphanumeric characters left inside tokens
    word_list = [re.sub('[^a-zA-Z0-9]', '', s) for s in word_list]
    # remove empty strings and skip words that are too short
    word_list = [s for s in word_list if len(s) > 1]
    # stem each word
    ps = PorterStemmer()
    word_list = [ps.stem(s) for s in word_list]
    # map each word to its index in the vocabulary
    word_indices = [vocab[w] for w in word_list if w in vocab]
    return word_indices
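A standalone trace of the normalization steps above on a toy email body (the sample string is invented for illustration):

import re

sample = '<p>Buy now at http://deals.example.com for $99! Contact me@example.com</p>'
s = sample.lower()
s = re.sub('<[^<>]+>', ' ', s)                        # strip HTML tags
s = re.sub('[0-9]+', 'number', s)                     # normalize digits
s = re.sub(r'(http|https)://[^\s]*', 'httpaddr', s)   # normalize URLs
s = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', s)          # normalize addresses
s = re.sub('[$]+', 'dollar', s)                       # normalize $ signs
print(s)  # ' buy now at httpaddr for dollarnumber! contact emailaddr '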
def getStemmedWords(self, html):
    stemmed_words = []
    # stemmer = SnowballStemmer("english")
    stemmer = PorterStemmer()
    for token in html:
        stemmed_words.append(stemmer.stem(token))
    return ' '.join(stemmed_words)
def stemWordsCountFrequency():
    global sentenceDictionary, backwardStemming, wordDictionary
    import nltk
    from nltk import PorterStemmer
    from nltk.corpus import stopwords
    for sentence in sentenceDictionary:
        # remove characters that break tokenization
        sentence = sentence.replace('\n', ' ')
        sentence = sentence.replace('[', ' ')
        sentence = sentence.replace(']', ' ')
        # map Windows-1252 control bytes to their intended punctuation
        sentence = sentence.replace('\x92', '\'')
        sentence = sentence.replace('\x85', '...')
        sentence = sentence.replace('\x96', '-')
        sentence = sentence.replace('\x93', '"')
        sentence = sentence.replace('\x94', '"')
        sentence = sentence.replace('\xa0', ' ')
        sentence = sentence.replace('\x97', ' ')
        sentence = sentence.lower()
        try:
            tokens = nltk.word_tokenize(sentence)
        except Exception:
            print(" TOKENIZER CRASHED ")
            print(" [Copy this string in the shell to see unicode characters ")
            print(sentence)
            continue  # skip this sentence instead of reusing the previous tokens
        # stem the tokens, e.g. 'complications' becomes 'complic', and remember
        # the original form of each stem
        stems = []
        stemmer = PorterStemmer()  # uses the Porter stemming algorithm
        for token in tokens:
            stem = stemmer.stem(token)
            backwardStemming[stem] = token
            stems.append(stem)
        # get rid of common English function words (or, and, this, that, ...)
        stop = stopwords.words('english')
        tokens = [i for i in stems if i not in stop]
        # count the tokens
        for token in tokens:
            wordDictionary[token] = wordDictionary.get(token, 0) + 1
    processWordDictionary()
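The replace calls above patch Windows-1252 bytes one at a time. A minimal alternative sketch, assuming the raw input is still available as bytes, is to decode with the cp1252 codec up front:

# decode Windows-1252 bytes directly, which maps \x85, \x92, \x93, \x94
# and friends to their real punctuation characters in one step
raw = b'complications\x85 \x93quoted\x94'
print(raw.decode('cp1252'))  # complications… “quoted”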
def __init__(self, text):
    lmtzr = WordNetLemmatizer()
    porter_stem = PorterStemmer()
    # map Penn Treebank tag prefixes to WordNet POS tags
    wordnet_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    data = text.lower()
    tokens = nltk.word_tokenize(data)
    tagged = nltk.pos_tag(tokens)
    word_list = []
    for t in tagged:
        try:
            # lemmatize when the POS tag maps to a WordNet tag
            word_list.append(lmtzr.lemmatize(t[0], wordnet_tag[t[1][:2]]))
        except KeyError:
            # otherwise fall back to Porter stemming
            word_list.append(porter_stem.stem(t[0]))
    self.filtered_words = [w for w in word_list
                           if w not in stopwords.words('english')]
def process_email(email_contents):
    """
    Preprocesses the body of an email and returns a list of word indices.

    Parameters
    ----------
    email_contents : string
        The email content.

    Returns
    -------
    list
        A list of word indices.
    """
    vocab_list = get_vocablist()
    email_contents = email_contents.lower()
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
    email_contents = re.sub('[0-9]+', 'number', email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub('[$]+', 'dollar', email_contents)
    # split on delimiters; a character class keeps $, ., * and the brackets literal
    words = re.split(r"""[ @$/#.\-:&*+=\[\]?!(){},'">_<;%\n\r]""", email_contents)
    word_indices = []
    stemmer = PorterStemmer()
    for word in words:
        word = re.sub('[^a-zA-Z0-9]', '', word)
        if word == '':
            continue
        word = stemmer.stem(word)
        print(word, end=' ')
        if word in vocab_list:
            idx = vocab_list.index(word)
            word_indices.append(idx)
    return word_indices
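vocab_list.index(word) rescans the list for every token, so each lookup is O(n). A sketch of a common alternative, assuming get_vocablist returns the same vocabulary list, builds a word-to-index dict once and looks words up in O(1):

vocab = {word: idx for idx, word in enumerate(get_vocablist())}

def words_to_indices(words, vocab):
    # dictionary lookups are O(1), unlike list.index
    return [vocab[w] for w in words if w in vocab]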
df_to_append = tmp_df_agg.sort_values('AUC test mean', ascending=False).head(1)  # pick the row with the highest AUC
df_to_append = df_to_append[['C', 'AUC test mean']]  # keep only the needed columns
df_to_append.columns = ['C', 'CV AUC']  # rename the selected columns
df_to_append.insert(0, 'Vectorizer', train_data_ind)  # add the descriptive columns
df_to_append.insert(1, 'Stemming', stemming)
df_auc_agg = pd.concat([df_auc_agg, df_to_append], ignore_index=True)  # append the row to the aggregate dataframe

for x in vectorized_text:
    run_cv(x, stemming=False)

# stem the raw training text
import re
from nltk import PorterStemmer
ps = PorterStemmer()
stemmed_train_text = [' '.join([ps.stem(x) for x in re.findall(r"[\w']+", y)])
                      for y in twenty_train.data]

# dictionary of vectorizers for the stemmed text
vectorizers_stem = {'CountVect': CountVectorizer(binary=False),
                    'CountVectBin': CountVectorizer(binary=True),
                    'TFIDFVect': TfidfVectorizer()}

# vectorize the stemmed text
vectorized_stemmed_text_train = {}
for i in vectorizers_stem:
    vectorized_stemmed_text_train[i] = vectorizers_stem[i].fit_transform(stemmed_train_text)

for x in vectorized_stemmed_text_train:
    run_cv(x, stemming=True)

print('Best models')
df_auc_agg.sort_values('CV AUC', ascending=False)

# fit the best model on the full training set and compute AUC on the training and test sets
best_model = LogisticRegression(class_weight='balanced', penalty='l1',
                                C=6.0).fit(vectorized_stemmed_text_train['TFIDFVect'], train_labels)
train_auc = calc_auc(y_labels=train_labels,
                     y_predicted=best_model.predict_proba(vectorized_stemmed_text_train['TFIDFVect'])[:, 1])
print('Train AUC = ' + str(train_auc))
def stem_word(word):
    # stem() is an instance method, so the stemmer must be
    # instantiated before it can be called
    return PorterStemmer().stem(word)
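Constructing a new PorterStemmer on every call works but is wasteful inside a tight loop. A minimal sketch of the usual pattern, with the module-level name _STEMMER chosen here for illustration:

from nltk import PorterStemmer

_STEMMER = PorterStemmer()  # created once, reused across calls

def stem_word(word):
    return _STEMMER.stem(word)

print(stem_word('running'))  # 'run'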
import json
from nltk import PorterStemmer

infile = './stop_word_list_new'
stemmer = PorterStemmer()

# load the stop-word list (standard-library json parser) and drop duplicates
with open(infile, 'r') as f:
    nw = list(set(json.loads(f.readline())))

# stem each stop word, keeping only unique stems
new_list = []
for word in nw:
    stem = stemmer.stem(word)
    if stem not in new_list:
        new_list.append(stem)

print(new_list)
print(len(new_list))

# write the stemmed stop words and the deduplicated originals back out
with open('./stop_word_porter_stems', 'w') as o:
    o.write(json.dumps(new_list))
with open('./stop_word_list_new', 'w') as o1:
    o1.write(json.dumps(nw))