def clean_document(document):
    """Normalise a raw document into a space-separated string of stems.

    The text is lowercased and tokenized. Tokens that are a single
    punctuation character, an English stop word, or one of a fixed list of
    measurement-unit words (tablespoon, cup, oz, ...) are discarded, and the
    remaining tokens are stemmed.

    Args:
        document: The text to clean (a string).

    Returns:
        The surviving stems joined by single spaces.
    """
    tokens = word_tokenize(document.lower())

    # A token is dropped only when it is exactly one punctuation character.
    punctuation = set(string.punctuation)

    # English stop words extended with measurement-unit words.
    ignored = set(stopwords.words('english'))
    ignored.update(
        ['tablespoon', 'tbsp', 'teaspoon', 'tsp', 'cup', 'oz', 'lb', 'c.']
    )

    # Stemming collapses multiple forms of the same word into one.
    stemmer = PS()
    stems = []
    for token in tokens:
        if token in punctuation or token in ignored:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)
def nlp2line(f_name='sentiment.txt'):
    """Read a text file and return each sentence as a list of stemmed words.

    Every non-empty line is split into sentences, each sentence is reduced
    to its alphanumeric words of two or more characters, stop words (as
    judged by ``is_stopword_takahashi``) are removed, and the remaining
    words are stemmed.

    Args:
        f_name: Path of the input file, read as latin-1.

    Returns:
        A list with one entry per sentence; each entry is that sentence's
        own list of stemmed words (possibly empty).
    """
    # Sentence boundary: one whitespace character preceded by . ; : ? or !
    # and followed by an uppercase ASCII letter. The pattern text is kept
    # as in the original file.
    reg = re.compile(r'''
        (?<=[.;:?!])  # (. or ; or : or ? or !) に続いて
        \s            # 空白文字
        (?=[A-Z])     # 英大文字が続く場合だけマッチする
        ''', flags=re.VERBOSE)

    sentences = []
    with open(f_name, "r", encoding='latin-1') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            sentences.extend(reg.split(line))

    ps = PS()
    sentence_word_list = []
    for sentence in sentences:
        words = re.findall("[a-zA-Z0-9]{2,}", sentence)
        # Build a fresh list per sentence and filter/stem every word in it.
        # NOTE(review): the previous loops started at index 1; the word
        # regex cannot match a one-character token, so index 0 is treated
        # here as an ordinary word - confirm against the input format.
        sentence_word_list.append(
            [ps.stem(word) for word in words
             if not is_stopword_takahashi(word)]
        )
    return sentence_word_list
def extract_features(file_path):
    """Load labelled sentences from a file and stem their words.

    Each non-blank line is split on whitespace; the first field is parsed
    as an integer label and the remaining fields are stemmed. Only stems
    accepted by ``check`` are kept.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(labels, docs)`` tuple: ``labels`` is a list of ints and
        ``docs`` a list of space-joined stem strings, in file order.

    Raises:
        ValueError: If the first field of a line is not an integer.
    """
    ps = PS()
    labels, docs = [], []
    # Use a context manager so the file handle is always closed.
    with open(file_path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: there is no label to unpack, so skip it.
                continue
            label, *sentence = fields
            tokens = [stem for stem in map(ps.stem, sentence) if check(stem)]
            labels.append(int(label))
            docs.append(" ".join(tokens))
    return labels, docs
def remove_noise(words):
    """Drop stop words and one-character words, then stem what remains.

    Args:
        words: Iterable of word strings.

    Returns:
        A list with the Porter stem of every word that is not a stop word
        (per ``ex71.is_stop_word``) and is longer than one character.
    """
    from nltk.stem.porter import PorterStemmer as PS

    stemmer = PS()
    # Remove stop words and single characters; stem each surviving word.
    return [
        stemmer.stem(word)
        for word in words
        if not ex71.is_stop_word(word) and len(word) > 1
    ]
def data_preprocessing(data):
    """Turn raw text into a space-separated string of stemmed content words.

    The text is tokenized, non-alphabetic tokens are discarded, the rest
    are lowercased, English stop words are removed and the remaining words
    are stemmed.

    Args:
        data: The text to clean (a string).

    Returns:
        The cleaned stems joined by single spaces.
    """
    tokens = word_tokenize(data)
    # Lowercase before the stop-word lookup so capitalised stop words
    # ("The", "And") are recognised.
    words = [token.lower() for token in tokens if token.isalpha()]

    stop_words = set(stopwords.words('english'))
    porter = PS()

    cleaned = []
    for word in words:
        # Filter on the surface form first: stemming mangles many stop
        # words ("this" -> "thi"), so a post-stemming check alone misses them.
        if word in stop_words:
            continue
        stem = porter.stem(word)
        # Keep the earlier post-stemming filter too, so nothing that used
        # to be removed is now let through.
        if stem in stop_words:
            continue
        cleaned.append(stem.lower())

    return " ".join(cleaned)
def getFeatures(line):
    """Split one labelled line into stemmed features and its label prefix.

    The first two characters of ``line`` are returned unchanged as the
    sentiment; the text from index 3 onward is split on whitespace.
    (Presumably the line looks like ``"+1 some text"`` - confirm against
    the caller.)

    Args:
        line: One line of input text.

    Returns:
        A ``(features, sentiment)`` tuple, where ``features`` is the list
        of stems of every word that is not a stop word (per
        ``is_stopword``) and is longer than one character.
    """
    stemmer = PS()
    sentiment = line[:2]
    candidates = (raw.strip() for raw in line[3:].split())
    features = [
        stemmer.stem(token)
        for token in candidates
        if not is_stopword(token) and len(token) > 1
    ]
    return features, sentiment
def doc_read(filename):
    """Read a text file and return its cleaned, stemmed tokens.

    The text is lowercased, every character that is neither a word
    character nor whitespace is removed, and the result is split on
    whitespace. Non-alphabetic tokens and English stop words are dropped,
    the rest are stemmed, and stems of a single character are discarded.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of stems in document order.
    """
    # Context manager guarantees the handle is closed even on a read error.
    with open(filename, 'r') as file:
        text = file.read()

    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # Split the cleaned text, not the raw text: splitting the raw text
    # would throw away the lowercasing and punctuation removal.
    tokens = [word for word in cleaned.split() if word.isalpha()]

    stop_words = set(stopwords.words('english'))
    ps = PS()
    tokens = [ps.stem(w) for w in tokens if w not in stop_words]
    return [word for word in tokens if len(word) > 1]
def doc_read(filename):
    """Read a text file and return its cleaned, stemmed text as one string.

    The text is lowercased and split on whitespace, ASCII punctuation is
    stripped from every token, non-alphabetic tokens and English stop words
    are dropped, the rest are stemmed, and stems of a single character are
    discarded.

    Args:
        filename: Path of the file to read.

    Returns:
        The surviving stems joined by single spaces.
    """
    # Context manager guarantees the handle is closed even on a read error.
    with open(filename, 'r') as file:
        text = file.read()

    # Split the lowercased text: splitting the raw text would leave
    # capitalised stop words ("The") unrecognised by the filter below.
    tokens = text.lower().split()

    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]

    stop_words = set(stopwords.words('english'))
    ps = PS()
    tokens = [ps.stem(w) for w in tokens if w not in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return " ".join(tokens)
def stemming(no_stopword_features="no_stopword_features.txt", features="features.txt"):
    """Stem every word of a labelled feature file and write the result.

    Each input line is split on single spaces; the first field is copied
    through unchanged as the label and every following field is replaced
    by its stem. A progress bar with a fixed total of 10662 is advanced
    once per line.

    Args:
        no_stopword_features: Path of the input file (latin-1).
        features: Path of the output file (latin-1), overwritten.
    """
    progress = tqdm(total=10662)
    with open(no_stopword_features, "r", encoding="latin-1") as src, \
            open(features, "w", encoding="latin-1") as dst:
        stemmer = PS()
        for raw_line in src:
            # split(" ") always yields at least one field, so the label
            # unpacking cannot fail.
            label, *words = raw_line.rstrip().split(" ")
            stems = [stemmer.stem(word) for word in words]
            dst.write(" ".join([label, *stems]) + "\n")
            progress.update(1)
    progress.close()
def Result():
    """Prompt for a sentence on stdin and print its predicted sentiment.

    The input is lowercased, stripped of punctuation, filtered to
    alphabetic non-stop-word tokens, stemmed, and stems of one character
    are dropped. Each stem then adds to a ``good`` or ``bad`` score using
    ``common`` / ``commonprob`` / ``finalp`` / ``finaln``, which are
    defined outside this function. A word that directly follows the
    intensifier stems ``'highli'`` or ``'much'`` is scored again with
    double weight.

    Prints ``Positive`` when the good score is strictly greater than the
    bad score, otherwise ``Negative``.
    """
    print("Enter a string to judge sentiment:")
    S = str(input())
    S = re.sub(r'[^\w\s]', '', S.lower())
    S = [word for word in S.split() if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    ps = PS()
    S = [ps.stem(w) for w in S if w not in stop_words]
    S = [word for word in S if len(word) > 1]

    good = 0
    bad = 0
    for word in S:
        if word in common:
            # commonprob is indexed in parallel with common; positions 1
            # and 2 of an entry feed the good and bad scores respectively.
            probs = commonprob[common.index(word)]
            good = good + (S.count(word) * probs[1])
            bad = bad + (S.count(word) * probs[2])
        elif word in finalp:
            good = good + 1
        elif word in finaln:
            bad = bad + 1

    # Pair each token with its successor; zip stops before the last token,
    # so an intensifier at the end of the input no longer indexes past the
    # list.
    for word, following in zip(S, S[1:]):
        if word != 'highli' and word != 'much':
            continue
        if following in common:
            # Look up and count the intensified word itself, not the
            # intensifier or the loop index.
            probs = commonprob[common.index(following)]
            good = good + (2 * S.count(following) * probs[1])
            bad = bad + (2 * S.count(following) * probs[2])
        elif following in finaln:
            bad = bad + 2
        elif following in finalp:
            good = good + 2

    if good > bad:
        print("\nPositive")
    else:
        print("\nNegative")
def extract_features(title):
    """Stem the words of a title and keep only the stems ``check`` accepts.

    Args:
        title: Text to process; it is split on single spaces.

    Returns:
        The accepted stems joined by single spaces.
    """
    stemmer = PS()
    stems = (stemmer.stem(token) for token in title.split(" "))
    return " ".join(stem for stem in stems if check(stem))
def stemming(words_list):
    """Return a new list holding the stem of every word in ``words_list``.

    Args:
        words_list: Iterable of word strings.

    Returns:
        A list of stems in the same order as the input.
    """
    stemmer = PS()
    return [stemmer.stem(word) for word in words_list]
def __init__(self):
    """Create one ``PS`` stemmer and keep it on ``self._porter_stemmer``."""
    # NOTE(review): PS is presumably the file's PorterStemmer alias -
    # confirm against the imports.
    self._porter_stemmer = PS()
def stemming():
    """Yield ``[word, stem]`` for every word produced by ``word_split()``.

    The ``stemming`` module named in the exercise reportedly does not
    support Python 3, so nltk's Porter stemming implementation is used in
    its place.
    """
    stemmer = PS()
    for token in word_split():
        stemmed = stemmer.stem(token)
        yield [token, stemmed]
def stem(x):
    """Return the stem of ``x`` computed by a freshly created ``PS`` stemmer."""
    stemmer = PS()
    return stemmer.stem(x)
stop_words.extend(
    [".", ",", ":", ";", "!", "?", "-", "--", "(", ")", "\"", "\'"])

# Characters removed from words. The entries are spliced into a regex
# character class, so the hyphen is escaped; a raw string keeps the
# backslash without relying on an invalid "\-" escape sequence.
symbol = [".", ",", ":", ";", "!", "?", r"\-", "(", ")", "\"", "\'"]


def isStopWord(word: str) -> bool:
    """Return True if ``word`` (case-insensitively) is in ``stop_words``."""
    return word.lower() in stop_words


def my_sub(word: str) -> str:
    """Return ``word`` with every character listed in ``symbol`` removed."""
    return re.sub(f"[{''.join(symbol)}]", "", word)


if __name__ == "__main__":
    words = []
    ps = PS()
    with open("sentiment.txt") as f:
        for line in f:
            # The first space-separated field of each line is skipped.
            for raw in line.split(" ")[1:]:
                # Strip before the stop-word test: the last word of a line
                # still carries its newline and would otherwise never match.
                token = raw.strip()
                if isStopWord(token):
                    continue
                cleaned = my_sub(token)
                # Words of one character (or none) are dropped as well.
                if len(cleaned) > 1:
                    words.append(ps.stem(cleaned))

    with open("stem_words.txt", "w") as f:
        # Sort the unique stems so the output file is reproducible.
        f.write("\n".join(sorted(w for w in set(words) if w)))
Created on Thu Aug 9 22:50:11 2018 @author: yohei """ import pyprind import pandas as pd import os from collections import Counter import re from sklearn.feature_extraction.text import CountVectorizer as CV from nltk.stem.porter import PorterStemmer as PS from nltk.corpus import stopwords count = CV() porter = PS() stop = stopwords.words('english') class Final_Assignment: def tokenizer_porter(self, text): return [porter.stem(word) for word in text.split()] def merge_dict_add_values(self, d1, d2): return dict(Counter(d1) + Counter(d2)) def making_csv(self): basepath = 'aclImdb' labels = {'pos': 1, 'neg': 1} pbar = pyprind.ProgBar(50000) df = pd.DataFrame()