def stem_file(Inputcsv, Outcsv, delim):
    """
    :param Inputcsv: The input file with the original news articles
    :param Outcsv: The name of the output file that we wish the stemmed articles to be saved to
    :param delim: The delimiter used in the reading file
    :return: Does not return any value
    """
    csv.register_dialect('perispwmeni', delimiter=delim)
    fw = open(Outcsv, 'wb')
    fw2 = csv.writer(fw, delimiter=delim)
    f = open(Inputcsv, 'r')
    try:
        reader = csv.reader(f, dialect='perispwmeni')
        cnt_row = 0
        for row in reader:  # read line by line
            cmp_two = []  # two strings that will be compared
            cnt_row = cnt_row + 1  # row counter
            if len(row) > 1:  # if len(row) == 1, it is an empty line; skip it
                for elem in [5, 6]:  # the news-article text is in columns 6 and 7 in the current format
                    line_out = ''  # line after the processing
                    line1 = row[elem]
                    words = line1.split()
                    for word in words:  # loop over each element of the list "words"
                        ww = stem.get_decoded_input(word)  # it was 'str' before and becomes 'unicode' from type(ww)
                        last_char_spec = ''.encode('utf-8')
                        last_char = ww[-1]
                        # if the last character is a special char (',', '.', '!', ';'), trim it
                        if last_char in (',', '.', '!', ';'):
                            ww = ww[:-1]
                            last_char_spec = last_char.encode('utf-8')
                        ww = strip_accents(ww)
                        if len(ww) < 1:
                            continue
                        english_char = re.search('[a-zA-Z]', ww)  # check whether the word contains English characters
                        cont_dig = not contains_digits(ww)
                        if (not ww[0].isupper()) and (english_char is None) and cont_dig:
                            # if the first letter is capital, the word contains English
                            # characters, or it is a number, we don't want to stem it
                            ww = ww.upper()
                            stemmed = stem.stem(ww)
                            stemmed = stemmed.lower()
                            stemmed = stemmed.encode('utf-8')
                            line_out = line_out + ' ' + stemmed + last_char_spec
                        elif cont_dig:
                            line_out = line_out + ' ' + ww.encode('utf-8') + last_char_spec
                        else:
                            line_out = line_out + ' NUM' + last_char_spec
                    # programmer comment: if cmp_two = [cmp_two, line_out], the encoding does not appear correctly
                    if elem == 5:
                        cmp_two = line_out
                    else:
                        cmp_two = [cmp_two, line_out]
                fw2.writerow(cmp_two)
            else:
                fw2.writerow([' '])
    finally:
        f.close()
        fw.close()
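# stem_file and search_and_create_data call strip_accents and contains_digits,
# which are not defined in these snippets. The sketches below are assumptions
# about their behaviour, shown only so the functions above are self-contained:
import unicodedata

def strip_accents(s):
    # drop combining marks after Unicode NFD decomposition
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def contains_digits(s):
    # True if the word contains at least one decimal digit
    return any(ch.isdigit() for ch in s)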
def stem_emotions(dirpath, emotionlist):
    for emotion in emotionlist:
        filepath = os.path.join(dirpath, emotion + '.txt')
        words = load_csv_file(filepath)
        stemmed_words = {}
        for word, score in words.items():
            stemmed_word = stem(word)
            # when several surface forms collapse to the same stem, keep the highest score
            if stemmed_word in stemmed_words:
                stemmed_words[stemmed_word] = max(stemmed_words[stemmed_word], score)
            else:
                stemmed_words[stemmed_word] = score
        write_csv_file(filepath, stemmed_words)
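# A minimal self-contained sketch of the max-keep merge in stem_emotions:
# when two surface forms collapse to the same stem, the higher score wins.
# The toy_stem function is a stand-in assumption, not the real stemmer.
def toy_stem(word):
    return word[:5]

scores = {'laughing': 0.8, 'laughed': 0.5, 'crying': 0.9}
merged = {}
for word, score in scores.items():
    s = toy_stem(word)
    merged[s] = max(merged.get(s, score), score)
print(merged)  # {'laugh': 0.8, 'cryin': 0.9}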
def search_and_create_data(fileIn, data):
    csv.register_dialect('perispwmeni', delimiter='~')
    f = open(fileIn, 'r')
    try:
        reader = csv.reader(f, dialect='perispwmeni')
        cnt_row = 0
        new_data = []
        for row in reader:  # read line by line
            cnt_row = cnt_row + 1
            if len(row) > 1:  # if len(row) == 1, it is an empty line; skip it
                for elem in [5, 6]:  # the news-article text is in columns 6 and 7 in the current format
                    line1 = row[elem]
                    words = line1.split()
                    for word in words:  # loop over each element of the list "words"
                        ww = stem.get_decoded_input(word)  # it was 'str' before and becomes 'unicode' from type(ww)
                        last_char = ww[-1]
                        # if the last character is a special char (',', '.', '!', ';'), trim it
                        if last_char in (',', '.', '!', ';'):
                            ww = ww[:-1]
                        if len(ww) <= 4:  # cut down the articles and small words
                            continue
                        ww = strip_accents(ww)
                        english_char = re.search('[a-zA-Z]', ww)  # check whether the word contains English characters
                        cont_dig = not contains_digits(ww)
                        if (not ww[0].isupper()) and (english_char is None) and cont_dig:
                            # if the first letter is capital, the word contains English
                            # characters, or it is a number, we don't want to stem it
                            ww = ww.upper()
                            stemmed = stem.stem(ww)
                            stemmed = stemmed.lower()
                            stemmed = stemmed.encode('utf-8')
                            # append the word to the new list only if it appears in
                            # neither the current list nor the new list
                            if stemmed not in data and stemmed not in new_data:
                                new_data.append(stemmed)
                                print(stemmed)
    finally:
        f.close()
    return new_data
def test_stem(self):
    words = [
        'aufeinander', 'aufeinanderbiss', 'aufeinanderfolge',
        'aufeinanderfolgen', 'aufeinanderfolgend', 'aufeinanderfolgende',
        'aufeinanderfolgenden', 'aufeinanderfolgender', 'aufeinanderfolgt',
        'Käufer', 'Kätzchen', 'katholischer', 'auffallen', 'auffallend',
        'auffallenden', 'auffallender', 'auffällig', 'auffälligen',
        'auffälliges'
    ]
    stems = [
        'aufeinand', 'aufeinanderbiss', 'aufeinanderfolg', 'aufeinanderfolg',
        'aufeinanderfolg', 'aufeinanderfolg', 'aufeinanderfolg',
        'aufeinanderfolg', 'aufeinanderfolgt', 'kauf', 'katzch', 'kathol',
        'auffall', 'auffall', 'auffall', 'auffall', 'auffall', 'auffall',
        'auffall'
    ]
    results = [stem(word) for word in words]
    self.assertEqual(results, stems)
def document_terms():
    for filepath, content, date in documents():
        print(filepath)
        extension = path.splitext(filepath)[1]
        words = None
        title = filename(filepath)
        if extension in ['.html', '.htm', '.jspy']:
            # prefer the HTML <title> over the filename when one is present
            html_title, words, links = tokenize_html(content)
            html_title = html_title.strip()
            if html_title:
                title = html_title
        else:
            words = tokenize_text(content)
        words = remove_stopwords(words, stopword_list)
        words = (stem(word) for word in words)
        yield title, filepath, words, date
google_tokenize = tk.tokenize(content_google)

# REMOVING STOPWORDS AND LOWERCASING
import remove_stopwords as stopwords

facebook_removed_stopwords = stopwords.remove(facebook_tokenize)
apple_removed_stopwords = stopwords.remove(apple_tokenize)
amazon_removed_stopwords = stopwords.remove(amazon_tokenize)
netflix_removed_stopwords = stopwords.remove(netflix_tokenize)
google_removed_stopwords = stopwords.remove(google_tokenize)

# STEMMING
import stemming as stemming

facebook_stemmed = stemming.stem(facebook_removed_stopwords)
apple_stemmed = stemming.stem(apple_removed_stopwords)
amazon_stemmed = stemming.stem(amazon_removed_stopwords)
netflix_stemmed = stemming.stem(netflix_removed_stopwords)
google_stemmed = stemming.stem(google_removed_stopwords)
import remove_stopwords as stopwords

imagine_dragons_removed_stopwords = stopwords.remove(imagine_dragons_tokenize)
maroon5_removed_stopwords = stopwords.remove(maroon5_tokenize)
one_republic_removed_stopwords = stopwords.remove(one_republic_tokenize)
coldplay_removed_stopwords = stopwords.remove(coldplay_tokenize)
the_beatles_removed_stopwords = stopwords.remove(the_beatles_tokenize)

# STEMMING
import stemming as stemming

imagine_dragons_stemmed = stemming.stem(imagine_dragons_removed_stopwords)
maroon5_stemmed = stemming.stem(maroon5_removed_stopwords)
one_republic_stemmed = stemming.stem(one_republic_removed_stopwords)
coldplay_stemmed = stemming.stem(coldplay_removed_stopwords)
the_beatles_stemmed = stemming.stem(the_beatles_removed_stopwords)

# import stemming
# ps = stemming.PorterStemmer()
# print(ps.stem('laughing'))
def words_steam_cleaner(first_list):
    # count stems, skipping Ukrainian stopwords
    return Counter(
        [stem(word) for word in first_list if word not in UKRAINIAN])
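# Illustrative only: the same count-of-stems pattern with a toy stemmer and
# stopword set; UKRAINIAN and stem() above are assumed to come from elsewhere.
from collections import Counter

toy_stopwords = {'the', 'a'}
toy_stem = lambda w: w.rstrip('s')
tokens = ['cats', 'cat', 'the', 'dogs']
print(Counter(toy_stem(w) for w in tokens if w not in toy_stopwords))
# Counter({'cat': 2, 'dog': 1})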
import itertools as it
import json
from pathlib import Path

import icu
from multidict import MultiDict  # assumed: the `multidict` package

# `stem` (the stemmer) and `dummy` (a sentinel token) are defined elsewhere in the module

capital_letters = 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'

# ICU transliterators: `decompose` applies Unicode NFD; `normalise` additionally
# strips combining marks and punctuation and upper-cases the result
decompose = icu.Transliterator.createInstance('any-NFD').transliterate
normalise = icu.Transliterator.createInstance(
    'any-NFD; '
    '[:nonspacing mark:] any-remove; '
    '[:punctuation:] any-remove; '
    'any-upper').transliterate

locations = [{
    **l, 'name': decompose(l['name'])
} for p in Path('data').glob('childrenJSON*')
  for l in json.load(p.open())['geonames']]
location_pairs = MultiDict(
    (' '.join(stem(w) for w in normalise(l['name']).split()), l)
    for l in locations)
location_stems = set(location_pairs)


def prepare_text(text):
    # keep only runs of capitalised tokens, then stem each normalised token;
    # the result is a set of stemmed candidate location names
    text = (i if i[0] in capital_letters else dummy
            for i in text.strip(' «»').split())
    text = it.groupby(text, lambda i: i is dummy)
    text = {
        ' '.join(stem(normalise(i)) for i in v)
        for k, v in text if k is False
    }
    return text
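# A minimal self-contained demo of the icu normalisation chain used above
# (requires PyICU; the Greek sample word is illustrative):
import icu

demo_normalise = icu.Transliterator.createInstance(
    'any-NFD; [:nonspacing mark:] any-remove; '
    '[:punctuation:] any-remove; any-upper').transliterate
print(demo_normalise('Αθήνα!'))  # ΑΘΗΝΑ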
def preprocessing(text):
    words = tokenize(text)
    words = stopword_removal(words)
    words = stemming.stem(words)
    return words
# sm.makeSimilarityMatrixToFile(stemmed_newsgroups_train)
print(nostop_newsgroups_train)
print("finish stop word")

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectors_noIDF_train = vectorizer.fit_transform(nostop_newsgroups_train)
vectors_noIDF_test = vectorizer.transform(nostop_newsgroups_test)
pprint(vectors_noIDF_train.shape)  # 2034
pprint(vectors_noIDF_test.shape)

######################## Stemming ###################
pprint("stemming")
import stemming

stemmed_newsgroups_train = stemming.stem(newsgroups_train.data)
stemmed_newsgroups_test = stemming.stem(newsgroups_test.data)
pprint("finish stemming")

#####################################################
pprint("vectorize")
vectorizer = CountVectorizer()
# vectors_noIDF_train = vectorizer.fit_transform(newsgroups_train.data)
# vectors_noIDF_test = vectorizer.transform(newsgroups_test.data)
# pprint(vectors_noIDF_train.shape)  # 2034
# pprint(vectors_noIDF_test.shape)   # 1353
vectors_noIDF_train = vectorizer.fit_transform(stemmed_newsgroups_train)
vectors_noIDF_test = vectorizer.transform(stemmed_newsgroups_test)
from document_read_write import doc_read, doc_write
from stemming import stem
from tf_idf_score import scoring
import re
import operator

all_sentence_list = []
stemmed_sentence_list = []

document = doc_read("input1.txt")
all_sentence_list = document.all_sentence
stm = stem()

# stem every word of every sentence, preserving sentence boundaries
for s in all_sentence_list:
    word_list = re.split(r'\s+', s)
    new_snt = " ".join(stm.stemmed(w) for w in word_list)
    stemmed_sentence_list.append(new_snt)

# score the stemmed sentences with tf-idf and rank the originals by score
snt_scr = scoring(stemmed_sentence_list)
snt_scr.update()
snt_scr_list = snt_scr.sentence_score_list

score = list(zip(all_sentence_list, snt_scr_list))
score.sort(key=operator.itemgetter(1), reverse=True)
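# Self-contained sketch of the final ranking step above: pair each original
# sentence with its tf-idf score, then sort descending by score.
import operator

sentences = ['first sentence', 'second sentence', 'third sentence']
scores = [0.2, 0.9, 0.5]
ranked = list(zip(sentences, scores))
ranked.sort(key=operator.itemgetter(1), reverse=True)
print(ranked[0])  # ('second sentence', 0.9)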
def stemmer(tokens, new_tokens):
    # append the stem of every non-stopword token to new_tokens
    for i in tokens:
        if i not in stopwords.dict:
            new_tokens.append(stemming.stem(i))
    return new_tokens
def stem_words(filepath):
    with open(filepath, 'r') as f:
        words = f.read().splitlines()  # splitlines() keeps the newlines out of the words
    with open(filepath, 'w') as f:
        f.writelines("%s\n" % stem(w) for w in words)
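# Self-contained round trip for stem_words with a stand-in stemmer (an
# assumption; the real stem() is imported from the stemming module):
import os
import tempfile

def stem(w):
    # stand-in: lowercase and strip a plural 's'
    return w.lower().rstrip('s')

demo_path = os.path.join(tempfile.gettempdir(), 'words_demo.txt')
with open(demo_path, 'w') as f:
    f.write('Cats\nDogs\n')
stem_words(demo_path)
with open(demo_path) as f:
    print(f.read())  # cat / dog, one per line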