def get_stopwords(filename):
    stopwords = []
    f = open(filename, 'r')
    for line in f:
        line = line.strip(" \n\t")
        stopwords.append(line)
    f.close()
    return stopwords
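# Minimal usage sketch for the loader above; the file name and its
# contents are hypothetical, one stopword per line.
with open("stopwords.txt", "w") as f:
    f.write("the\nis\nat\n")

print(get_stopwords("stopwords.txt"))  # -> ['the', 'is', 'at']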
def getTermsAndStopList(self, str):  # note: parameter name shadows the built-in str
    stopwords = []
    words = []
    tokens = self.tokenizer.tokenize(str)
    for token in tokens:
        if not token.isdigit():
            if token not in self.stop:
                words.append(token)
            else:
                stopwords.append(token)
    terms = [self.stemmer.stem(word) for word in words]
    return terms, stopwords
def read(self, list, path):  # note: parameter name shadows the built-in list
    # read the file (forcing utf8) and output a list
    lemmer = WordNetLemmatizer()
    # count the words and save those with a frequency of 5 or less (v < 6),
    # so that the algorithm doesn't get distracted by irrelevant non-recurrent words
    data_file = open(path, 'rt', encoding='utf8', errors='replace')
    cn = Counter(word for l in data_file for word in l.split())
    words = dict((word, v) for word, v in cn.items() if v < 6)
    words_list = words.keys()
    data_file.close()
    # read the stopwords file and create a list with them
    stopwords_f = open('stopwords.txt', 'rt', encoding='utf8', errors='replace')
    stopwords = []
    for line in stopwords_f:
        stopwords.append(str(line.strip()))
    stopwords_f.close()
    data_file = open(path, 'rt', encoding='utf8', errors='replace')
    for line in data_file:
        new_line = ''
        # manually eliminate punctuation and grammar tokens, according to statistics
        # of the data: the least repeated and most repeated (irrelevant) words
        line = str(line).replace(",", " ")
        line = str(line).replace(' " ', ' ')
        line = str(line).replace(".", "")
        line = str(line).replace(" ?", "")
        line = str(line).replace(" : ", " ")
        line = str(line).replace(" ; ", " ")
        line = str(line).replace(" ( ", " ")
        line = str(line).replace(" ) ", " ")
        line = str(line).replace(". . .", " ")
        line = str(line).replace(" -- ", " ")
        # remove the least used words (frequency of 5 or less) from the string
        line = self.replaceMultiple(str(line), words_list, ' ')
        # remove the stopwords
        line = self.replaceMultiple(str(line), stopwords, ' ')
        # lemmatize the final words in the line
        for word in line.split(' '):
            new_line = new_line + ' ' + str(lemmer.lemmatize(word))
        # append the lemmatized line; the original appended the un-lemmatized
        # `line`, which silently discarded the lemmatization above
        list.append(new_line.strip())
    data_file.close()
    # output the list
    return list
def remove_stop_word(tweet):
    stopwords = []
    with open("english") as files:
        for line in files:
            values = line.split()
            word = values[0]
            stopwords.append(word)
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    tweet = pattern.sub('', tweet)
    return tweet
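# Minimal usage sketch; assumes the function above is in scope (with `re`
# imported) and creates the "english" stopword file it expects, one word per line.
with open("english", "w") as f:
    f.write("the\nis\nat\n")

print(remove_stop_word("the cat is at home"))  # -> 'cat home'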
def remove_stopwords(df):
    """
    Removes stopwords based on a known set of stopwords
    available in the nltk package. In addition, we include our
    made up word in here.
    """
    # Luckily nltk already has a set of stopwords that we can remove from the texts.
    stopwords = nltk.corpus.stopwords.words('english')
    # we'll add our own special word in here 'qwerty'
    stopwords.append(our_special_word)
    df['stopwords_removed'] = list(
        map(lambda doc: [word for word in doc if word not in stopwords],
            df['tokenized_text']))
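# Minimal usage sketch; assumes the function above is in scope, the NLTK
# stopword corpus is downloaded, and our_special_word is the 'qwerty'
# token mentioned in the docstring.
import nltk
import pandas as pd

nltk.download('stopwords', quiet=True)
our_special_word = 'qwerty'

df = pd.DataFrame({'tokenized_text': [['this', 'is', 'qwerty', 'news']]})
remove_stopwords(df)
print(df['stopwords_removed'][0])  # -> ['news']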
def get_stopwords():
    initial_stopwords = [
        "a", "about", "above", "across", "after", "afterwards", "again", "against",
        "all", "almost", "alone", "along", "already", "also", "although", "always",
        "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
        "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around",
        "as", "at", "back", "be", "became", "because", "become", "becomes",
        "becoming", "been", "before", "beforehand", "behind", "being", "below",
        "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but",
        "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt",
        "cry", "de", "describe", "detail", "did", "do", "does", "done", "down",
        "due", "during", "each", "eg", "eight", "either", "eleven", "else",
        "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
        "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
        "find", "fire", "first", "five", "for", "former", "formerly", "forty",
        "found", "four", "from", "front", "full", "further", "get", "give", "go",
        "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
        "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
        "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
        "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
        "latterly", "least", "less", "ltd", "made", "many", "may", "me",
        "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
        "move", "much", "must", "my", "myself", "name", "namely", "neither",
        "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
        "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
        "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
        "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
        "please", "put", "rather", "re", "same", "see", "seem", "seemed",
        "seeming", "seems", "serious", "several", "she", "should", "show", "side",
        "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
        "something", "sometime", "sometimes", "somewhere", "still", "such",
        "system", "take", "ten", "than", "that", "the", "their", "them",
        "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
        "third", "this", "those", "though", "three", "through", "throughout",
        "thru", "thus", "to", "together", "too", "top", "toward", "towards",
        "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
        "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
        "whence", "whenever", "where", "whereafter", "whereas", "whereby",
        "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
        "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
        "within", "without", "would", "yet", "you", "your", "yours", "yourself",
        "yourselves"
    ]
    stopwords = list()
    for s in initial_stopwords:
        s = normalize_string(s)
        stopwords.append(s)
    # return the normalized list; the original returned initial_stopwords,
    # which discarded the normalization above
    return stopwords
def load_en_stopwords(filename):
    '''Loads English stop-words from a given file

    Return: a list of stop words
    Arguments: the stop-words file name
    '''
    stopwords = []
    with codecs.open(filename, mode='r', encoding='utf-8') as fSW:
        for line in fSW:
            stopwords.append(line.strip().lower())
    return stopwords
def ques23(entiresentence, string, np, x, y):
    stopwords = []
    querywords = string.split()
    #print("querywords", querywords)
    grammar = "chunk:{<VB.?|MD|RP|RB.?>+<DT>?<RB.?>*<JJ.?>*<NN.?|PRP|PRP$|POS|VBG|DT|CD|VBN>+}"
    res, sentence = findChunk(string, grammar, "chunk")
    #print("res", res, "sentence", sentence)
    # for subtree in sentence.subtrees():
    #     if subtree.label() == 'chunk':
    #         print(subtree)
    if len(sentence) > 0:
        for words in sentence:
            stopwords.append(words[0])
        #print("sp", stopwords)
        stopwords.append(np)
        stopwords.append(x)
        stopwords.append(y)
        stopwords = set(stopwords)
        #print(stopwords)
        resultwords = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print("rs", resultwords)
        WhatQues = "What " + x + " " + np + " " + y + " " + resultwords + "?"
        #print(WhatQues)
        AnsQues[entiresentence].append(WhatQues)
def get_hindi_stopwords(filename="stop-words.txt"):
    """Get stopwords in Hindi.

    Args:
        filename: stop-words file name
    Returns:
        set of Hindi stop words
    """
    stopwords = []
    # open as UTF-8 text so each line is already a str; the original mixed
    # Python 2 bytes.decode("utf8") with Python 3 file handling
    with open(filename, "r", encoding="utf-8") as stop_words:
        for word in stop_words.readlines():
            stopwords.append(unidecode(word.strip()))
    return set(stopwords)
def preprocessing(tagged_by_Sentence):
    """
    1. remove special characters, lowercase
    2. not / "n't" -> "not_" + stem(next word)
    3. remove special characters and digits
    4. remove stopwords
    5. stemming
    """
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    stopwords.remove("not")
    stopwords.remove('very')
    stopwords.append("'m")
    stopwords.append("'s")
    re_special = re.compile('[^A-Za-z0-9]+')  # anything that is not a letter or digit
    re_num = re.compile('[0-9]+')  # digits
    st = PorterStemmer()
    new_sent = []
    not_indice = []
    for sent in tagged_by_Sentence:
        # 1. remove special characters, lowercase
        text = [(tup[0].lower(), tup[1]) for tup in sent if not bool(re_special.match(tup[0]))]
        # 2. merge not / n't with the following word: when not / n't appears,
        # join it with the next word and remember that word's index so that
        # del_element_by_indice can remove the duplicate later
        new_text = []
        for index, tup in enumerate(text):
            if tup[0] == "n't" or tup[0] == "not":
                if index + 1 < len(text):
                    if not bool(re_special.match(text[index + 1][0])) or text[index + 1][1] != 'CD':
                        new_text.append("not_" + st.stem(text[index + 1][0]))
                        not_indice.append(index)
                else:
                    new_text.append("not")
            else:
                if not bool(re_num.match(tup[0])) or tup[1] != 'CD':  # 3. remove special characters and digits
                    new_text.append(tup[0])
        new_text = del_element_by_indice(new_text, not_indice)
        # 4, 5. remove stopwords and stem
        new_words = [st.stem(word) for word in new_text if word not in stopwords]
        new_sent.append(new_words)
    return new_sent
def tokenize_headlines_with_sentiment(df):
    headlines = df.title.tolist()
    headlines_string = (' '.join(filter(None, headlines))).lower()
    tokens = word_tokenize(headlines_string)
    # Remove single letter tokens
    tokens_sans_singles = [i for i in tokens if len(i) > 1]
    # Remove stop words
    stopwords = nltk.corpus.stopwords.words('english')
    new_words = ("s'", "'s", "election", "2020", "n't", "wo", "...", "'")
    for i in new_words:
        stopwords.append(i)
    tokens_sans_stop = [t for t in tokens_sans_singles if t not in stopwords]
    tokens_sans_stop = [t.replace('wins', 'win') for t in tokens_sans_stop]
    # Get bigrams and frequencies
    bi_grams = list(ngrams(tokens_sans_stop, 2))
    counter = Counter(bi_grams)
    # Convert counter dictionary to dataframe
    counter_df = pd.DataFrame.from_dict(counter, orient='index').reset_index().rename(
        columns={"index": "bigram", 0: "freq"})
    counter_df_sort = counter_df.sort_values(by=['freq'], ignore_index=True, ascending=False)
    # Create concatenated bigram string for sentiment scoring
    # (unpacking a tuple column via `.str` relies on behaviour deprecated in newer pandas)
    counter_df_sort['word1'], counter_df_sort['word2'] = counter_df_sort.bigram.str
    counter_df_sort['bigram_joined'] = counter_df_sort.word1 + " " + counter_df_sort.word2
    counter_df_sort = counter_df_sort.drop(['word1', 'word2'], axis=1)
    # Get sentiment for bigrams
    analyzer = SentimentIntensityAnalyzer()
    bigrams_scores = counter_df_sort['bigram_joined'].apply(analyzer.polarity_scores).tolist()
    df_bigrams_scores = pd.DataFrame(bigrams_scores).drop(['neg', 'neu', 'pos'], axis=1).rename(
        columns={"compound": "sentiment_compound"})
    bigrams_freq_and_scores = counter_df_sort.join(df_bigrams_scores, rsuffix='_right')
    print(f"There are {len(bigrams_freq_and_scores)} extracted bigrams across all headlines")
    return bigrams_freq_and_scores
def stem_tokenize(str_use):
    """
    Takes a string and tokenizes it, stripping it of punctuation and stopwords.
    Returns a list of strings.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    addstopwords = ["in", "on", "of", "''"]
    # extend, not append: append would add the whole list as a single element
    stopwords.extend(addstopwords)
    stemmer = wordnet.WordNetLemmatizer()
    # PunktWordTokenizer was removed from NLTK; WordPunctTokenizer is the
    # closest drop-in replacement
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    # removes stopwords and punctuation, then splits the string into a list of words
    token = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(str_use)
             if token.lower().strip(string.punctuation) not in stopwords]
    text = [word for word in token if re.search(r'[a-zA-Z]', word) is not None]
    stem = [stemmer.lemmatize(word) for word in text]
    # Returns a list of strings
    return stem
def count_words(text):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append('https')
    stopwords.append('http')
    stopwords.append('im')
    stopwords.append('# ')
    # RegEx for stopwords
    RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords))
    # replace '|' --> ' ', drop all stopwords, then split into words
    return (text.str.lower()
                .replace([r'\|', RE_stopwords], [' ', ''], regex=True)
                .str.cat(sep=' ')
                .split())
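# Minimal usage sketch on a hypothetical pandas Series; assumes the function
# above is in scope and the NLTK stopword corpus is downloaded.
import nltk
import pandas as pd

nltk.download('stopwords', quiet=True)

tweets = pd.Series(["Check http example|link", "We are home now"])
print(count_words(tweets))  # word list with stopwords and '|' removed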
def displaywordcloud(data=None, backgroundcolor='#fff', width=1000, height=1000):
    stopwords.append(product_name)
    wordcloud = WordCloud(
        # original path read 'C:Windows/...', missing the slash after the drive letter
        font_path='C:/Windows/Fonts/NanumGothicCoding-Bold.ttf',
        mask=mask_png,
        stopwords=stopwords,
        collocations=False,
        max_font_size=160,
        colormap='tab10',
        background_color=backgroundcolor,
        width=width,
        height=height).generate(data)
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation="bilinear", aspect='auto')
    plt.axis("off")
    # plt.show()
    fig.savefig('webservice/static/wordcloud.png')
def Cleaning(liste):
    import re
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    ps = PorterStemmer()
    stopwords = list(set(stopwords.words('english')))
    # treat single letters as stopwords too
    Liste = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
             'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    stopwords.extend(Liste)
    corpus = []
    for i in range(len(liste)):
        review = re.sub('[^a-zA-Z]', ' ', liste[i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(w) for w in review if w not in stopwords]
        review = ' '.join(review)
        corpus.append(review)
    return corpus
def is_ci_stem_stopword_set_match(self, a, b, threshold=0.5):
    """Check if a and b match after case-folding, stopword removal, and stemming."""
    # Get default English stopwords and extend with punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    # Create tokenizer and stemmer
    # (PunktWordTokenizer was removed from NLTK; WordPunctTokenizer is the
    # closest drop-in replacement)
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    tokens_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a)
                if token.lower().strip(string.punctuation) not in stopwords]
    tokens_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b)
                if token.lower().strip(string.punctuation) not in stopwords]
    stems_a = [stemmer.stem(token) for token in tokens_a]
    stems_b = [stemmer.stem(token) for token in tokens_b]
    # Calculate Jaccard similarity
    ratio = len(set(stems_a).intersection(stems_b)) / float(
        len(set(stems_a).union(stems_b)))
    return ratio >= threshold
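# Standalone sketch of the same Jaccard test, outside the class; assumes the
# NLTK stopword corpus is available. Both sentences below reduce to the stem
# set {cat, run}, so the ratio is 1.0.
import string
import nltk

nltk.download('stopwords', quiet=True)
stop = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation) | {''}
stemmer = nltk.stem.snowball.SnowballStemmer('english')
tok = nltk.tokenize.WordPunctTokenizer()

def stem_set(s):
    # lowercase, strip punctuation, drop stopwords, then stem
    return {stemmer.stem(t.lower().strip(string.punctuation))
            for t in tok.tokenize(s)
            if t.lower().strip(string.punctuation) not in stop}

a, b = stem_set("The cats are running"), stem_set("A cat runs")
print(len(a & b) / len(a | b))  # Jaccard ratio -> 1.0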
def nounphrase(tree):
    stopwords = []
    querywords = []
    grammar = "verb:{<VBG|VBN|VB.?|MD|RP>+}"
    res, verbtree = findChunkwithPOSTags(cstr, grammar, "verb")
    #print("res", res, "sentence", verbtree)
    if len(verbtree) > 0:
        for subtree in verbtree.subtrees():
            for words in subtree:
                stopwords.append(words[0])
        for subtree in tree.subtrees():
            for words in subtree:
                querywords.append(words[0])
        #print("querywords", querywords)
        resultwords = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print("rs", str(resultwords))
        return 1, resultwords
    else:
        return 0, ""
def update_stopwords(self, add_words=[], remove_words=[], update_corpus=True):
    # plain loops instead of list comprehensions used only for side effects
    stopwords = self.stopwords
    for x in add_words:
        stopwords.append(x)
    for x in remove_words:
        if x in stopwords:
            stopwords.remove(x)
    self._stopwords_ = stopwords
    if update_corpus:
        self.prepare_corpus()
def ques2_2(entiresentence, string, np, x, y):
    np = ""
    stopwords = []
    querywords = string.split()
    grammar = "chunk:{<IN>+<DT>?<RB.?>*<JJ.?>*<NN.?|PRP|PRP$|POS|VBG|DT|CD|VBN>+}"
    res, sentence = findChunk(string, grammar, "chunk")
    if res != 0:
        for words in sentence:
            stopwords.append(words[0])
        #print("sp", stopwords)
        stopwords.append(np)
        stopwords.append(x)
        # stopwords.append(y)
        #print(stopwords)
        # find the preposition
        preposition = ""
        for words in sentence:
            #print("words in sentence", words)
            if words[1] == 'IN':
                #print("prep found", words[0])
                preposition = words[0]
        #print("preposition is", preposition)
        resultwords = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print(resultwords)
        prepques = preposition + " what " + x + np + " " + resultwords + "?"
        #print(prepques)
        AnsQues[entiresentence].append(prepques)
def getStopWordList(stopWordFile):
    stopwords = []
    stopwords.append("AT_USER")
    stopwords.append("URL")
    with open(stopWordFile, 'r') as f:
        reader = csv.reader(f)
        for w in reader:
            stopwords.append(w[0])
    return stopwords
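# Hypothetical demo: a one-word-per-line file works because csv.reader
# yields each line as a list whose first field is the word.
import csv

with open("stop.csv", "w") as f:
    f.write("the\nis\n")

print(getStopWordList("stop.csv"))  # -> ['AT_USER', 'URL', 'the', 'is']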
print("Dataset Loaded...\n") # Tokenization and Stemming stemmer = SnowballStemmer('english') tokenizer = RegexpTokenizer(r'[a-zA-Z\']+') def tokenize(text): return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())] #Adding new stopwords stopwords = nltk.corpus.stopwords.words('english') with open('StopWords.txt', 'r') as file: for i in file: stopwords.append(i.strip()) # Feature extraction using TF-IDF tfidf = TfidfVectorizer(stop_words=stopwords, tokenizer=tokenize) X = tfidf.fit_transform(data) words = tfidf.get_feature_names() print(words) print("Number of features: " + str(len(words))) #check number of features print("Vectorization Completed...\n") ''' # Clustering using K-Means print("k-Means with 5 cluster:\n") kmeans_1 = KMeans(init='k-means++', n_clusters = 5, random_state = 42) kmeans_1.fit(X) common_words = kmeans_1.cluster_centers_.argsort()[:,-1:-21:-1] for num, centroid in enumerate(common_words):
import requests
import string
from nltk.tokenize import RegexpTokenizer  # needed for RegexpTokenizer below
from nltk.corpus import stopwords  # needed for stopwords.words below

# reads from the file defined in project specs on ecampus
url = "http://www.gutenberg.org/cache/epub/9845/pg9845.txt"
r = requests.get(url, allow_redirects=True)
# alternate:
# raw = "your string here" or read in a txt file

# tokenize to keep only words
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(r.text)

# get stopwords + extra for the era of the book at the url
stopwords = stopwords.words('english')
extrastop = ["thee", "thy", "ye", "thine", "thou"]
# extend, not append: append would add the whole list as a single element,
# so none of the extra words would ever be filtered
stopwords.extend(extrastop)
nostop = [w for w in tokens if not w in stopwords]

# remove repeats
norepeat = list(set(nostop))

# add all of the words + their initial location to a dictionary
dictionary = {}
for w in norepeat:
    dictionary[w.upper().encode('utf8')] = r.text.find(w)

# write to file
with open('tokens.txt', 'w') as file:
    file.write(str(dictionary))
def FeaturizeFile(df):
    # df = pd.read_csv(CSVfile, encoding='latin1')
    stats = []
    attribute_name = []
    sample = []
    id_value = []
    i = 0
    castability = []
    number_extraction = []
    avg_tokens = []
    ratio_dist_val = []
    ratio_nans = []
    keys = list(df.keys())
    attribute_name.extend(keys)
    summary_stat_result = summary_stats(df, keys)
    stats.extend(summary_stat_result)
    samples = get_sample(df, keys)
    sample.extend(samples)
    # castability.extend(castability_feature(df, keys))
    # number_extraction.extend(numeric_extraction(df, keys))
    # avg_tokens.extend(get_avg_tokens(samples))
    ratio_dist_val.extend(get_ratio_dist_val(summary_stat_result))
    ratio_nans.extend(get_ratio_nans(summary_stat_result))
    csv_names = [
        'Attribute_name', 'total_vals', 'num_nans', 'num_of_dist_val', 'mean',
        'std_dev', 'min_val', 'max_val', '%_dist_val', '%_nans', 'sample_1',
        'sample_2', 'sample_3', 'sample_4', 'sample_5'
    ]
    golden_data = pd.DataFrame(columns=csv_names)
    for i in range(len(attribute_name)):
        val_append = []
        val_append.append(attribute_name[i])
        val_append.extend(stats[i])
        val_append.append(ratio_dist_val[i])
        val_append.append(ratio_nans[i])
        val_append.extend(sample[i])
        # val_append.append(castability[i])
        # val_append.append(number_extraction[i])
        # val_append.append(avg_tokens[i])
        golden_data.loc[i] = val_append

    curdf = golden_data
    for row in curdf.itertuples():
        is_list = False
        curlst = [row[11], row[12], row[13], row[14], row[15]]  # the five sample values
        delim_cnt, url_cnt, email_cnt, date_cnt = 0, 0, 0, 0
        chars_totals, word_totals, stopwords, whitespaces, delims_count = [], [], [], [], []
        for value in curlst:
            word_totals.append(len(str(value).split(' ')))
            chars_totals.append(len(str(value)))
            whitespaces.append(str(value).count(' '))
            if del_reg.match(str(value)):
                delim_cnt += 1
            if url_reg.match(str(value)):
                url_cnt += 1
            if email_reg.match(str(value)):
                email_cnt += 1
            delims_count.append(len(delimeters.findall(str(value))))
            tokenized = word_tokenize(str(value))
            stopwords.append(len([w for w in tokenized if w in stop_words]))
            try:
                _ = pd.Timestamp(value)
                date_cnt += 1
            except ValueError:
                date_cnt += 0
        curdf.at[row.Index, 'has_delimiters'] = delim_cnt > 2
        curdf.at[row.Index, 'has_url'] = url_cnt > 2
        curdf.at[row.Index, 'has_email'] = email_cnt > 2
        curdf.at[row.Index, 'has_date'] = date_cnt > 2
        curdf.at[row.Index, 'mean_word_count'] = np.mean(word_totals)
        curdf.at[row.Index, 'std_dev_word_count'] = np.std(word_totals)
        curdf.at[row.Index, 'mean_stopword_total'] = np.mean(stopwords)
        curdf.at[row.Index, 'stdev_stopword_total'] = np.std(stopwords)
        curdf.at[row.Index, 'mean_char_count'] = np.mean(chars_totals)
        curdf.at[row.Index, 'stdev_char_count'] = np.std(chars_totals)
        curdf.at[row.Index, 'mean_whitespace_count'] = np.mean(whitespaces)
        curdf.at[row.Index, 'stdev_whitespace_count'] = np.std(whitespaces)
        # the original computed these from `whitespaces`; `delims_count`,
        # which was collected but never used, is almost certainly what was meant
        curdf.at[row.Index, 'mean_delim_count'] = np.mean(delims_count)
        curdf.at[row.Index, 'stdev_delim_count'] = np.std(delims_count)
        if curdf.at[row.Index, 'has_delimiters'] and curdf.at[row.Index, 'mean_char_count'] < 100:
            curdf.at[row.Index, 'is_list'] = True
        else:
            curdf.at[row.Index, 'is_list'] = False
        if curdf.at[row.Index, 'mean_word_count'] > 10:
            curdf.at[row.Index, 'is_long_sentence'] = True
        else:
            curdf.at[row.Index, 'is_long_sentence'] = False
    golden_data = curdf
    return golden_data
for word in text:
    if word in stopwords_set:
        continue
    if words.get(word) is None:
        words[word] = 1
    else:
        words[word] = words[word] + 1

word_lis = []
for word, no in words.items():
    word_lis.append([word, no])
word_lis = pd.DataFrame(word_lis, columns=["word", "freq"])
word_lis = word_lis.sort_values(by="freq", ascending=False)
# treat the 20 most frequent words as additional stopwords
for word in word_lis['word'][:20]:
    stopwords.append(word)
# print(stopwords)

# In[5]:

## Remove stop words
def s_wor_rm(text):
    words = []
    text = text.split()
    for word in text:
        if word in stopwords:
            continue
        words.append(word)
def http(text):
    # filter out URL tokens; the original test `'https' and 'http' in w`
    # reduces to `'http' in w`, which already covers both http and https
    stopwords = [w for w in text if 'http' in w]
    mynewtext = [w for w in text if w not in stopwords]
    return mynewtext
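# Quick check of the URL filter above on a pre-tokenized tweet.
tokens = ["check", "out", "https://example.com", "now"]
print(http(tokens))  # -> ['check', 'out', 'now']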
def remove_hashtag(sentence):
    base = tokinizer(sentence)
    # collect hashtag tokens, then drop them
    stopwords = [w for w in base if w.startswith('#')]
    mynewtext = [w for w in base if w not in stopwords]
    return WordList_to_sentence(mynewtext)
    dict[subject] = count
    return dict


if __name__ == "__main__":
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer("english")
    ## get stop words and normalize
    initial_stopwords = stopwords()
    stopwords = list()
    for s in initial_stopwords:
        s = normalize_string(s)
        stopwords.append(s)
    print("Loading index")
    mention_dict = load_index("../data/surface_forms_new.txt")
    subject_predicates_dict = []  # load_subject_predicates("data/SimpleQuestions_v2/freebase-FB2M.txt")
    subject_triple_counts = []  # load_subject_triple_counts("data/subject_triple_counts.txt")
    dataset_names = ["test"]
    max_ngram_size = 10
    exclude_small_ngrams = True
    exclude_stop_words = True
    for d in dataset_names:
        correct_count = 0
from ML_models import train_svm, train_knn, knn_accuracies, train_log_regression, \
    log_regression_stats, svm_stats, train_random_forest
from dataset_builder import add_sub_pol_to_dataset
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords

###################### Settings ######################
root_path = os.path.dirname(os.path.realpath(__file__))
ds_path = os.path.join(root_path, '..', 'common', 'dataset.csv')
clean_ds_path = os.path.join(root_path, '..', 'common', 'clean_dataset.csv')
stopwords = stopwords.words('french')
stopwords.append('a')
stopwords.append('e')
stopwords.append('Tre')
stopwords.append('cest')
amazon_img_path = 'https://i1.wp.com/www.joptimisemonsite.fr/wp-content/uploads/2015/02/logo-amazon.jpg?fit=810%2C295&ssl=1&is-pending-load=1'
#######################################################

###################### Utils ######################
@st.cache
def load_data():
    ds = pd.read_csv(ds_path)
    clean_ds = pd.read_csv(clean_ds_path)
    clean_ds = add_sub_pol_to_dataset(clean_ds)
    clean_ds = clean_ds.drop(clean_ds[clean_ds.Subjectivity > 1.0].index)
    return ds, clean_ds
def cluster_topics(json_data):
    import json
    import time
    import math
    import re
    import nltk
    import numpy as np
    import scipy as sp
    import scipy.sparse  # ensure the sparse submodule is loaded for sp.sparse.hstack
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import normalize

    def tokenize_and_stem(text):
        # first tokenize by sentence, then by word, so punctuation is caught as its own token
        tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token) and token not in stopwords:
                filtered_tokens.append(token)
        stems = [stemmer.stem(t) for t in filtered_tokens]
        return stems

    def tokenize_only(text):
        # first tokenize by sentence, then by word, so punctuation is caught as its own token
        tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search("[a-zA-Z]", token) and token not in stopwords:
                filtered_tokens.append(token)
        return filtered_tokens

    # data = json.loads(open("room_messages.json").read())
    data = json.loads(open(json_data).read())
    stopwords = stopwords.words("english")
    stopwords.append('blah')
    stemmer = SnowballStemmer("english")

    # Extract messages
    messages = []
    time_stamps = []
    for i in range(len(data["items"])):
        if "text" in data["items"][i]:
            message = data["items"][i]["text"]
            time_stamp = data["items"][i]["created"]
            t = float(re.search("[0-9]*:[0-9]*", time_stamp).group(0).replace(":", "."))
            messages.append(message)
            time_stamps.append(t)

    # Extract stemmed and tokenized vocab
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in messages:
        allwords_stemmed = tokenize_and_stem(i)  # for each item in 'synopses', tokenize/stem
        totalvocab_stemmed.extend(allwords_stemmed)  # extend the 'totalvocab_stemmed' list
        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    # vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    # print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

    # Cluster messages according to topic using K-means
    # define vectorizer parameters
    if len(allwords_stemmed) > 200:
        tfidf_vectorizer = TfidfVectorizer(max_df=0.99, max_features=200000,
                                           min_df=0.01, stop_words='english',
                                           use_idf=True, tokenizer=tokenize_and_stem,
                                           ngram_range=(1, 3))
    else:
        tfidf_vectorizer = TfidfVectorizer(max_df=1.0, max_features=200000,
                                           min_df=0.0, stop_words='english',
                                           use_idf=True, tokenizer=tokenize_and_stem,
                                           ngram_range=(1, 3))

    # fit the vectorizer to messages
    # t1 = time.clock()
    tfidf_matrix = tfidf_vectorizer.fit_transform(messages)
    # t2 = time.clock()
    # print("Tf-Idf fit time: ", t2 - t1)
    # print(tfidf_matrix.shape)
    terms = tfidf_vectorizer.get_feature_names()

    # insert an additional time feature
    new_column = np.asarray(time_stamps).reshape(-1, 1)
    time_norm = normalize(new_column)
    # print(new_column.shape)
    final = sp.sparse.hstack((tfidf_matrix, new_column))
    # print(final.shape)
    terms.append('time')

    # calculate distance between messages using cosine similarity of tf-idf
    dist = 1 - cosine_similarity(final)

    # K-means clustering
    # num_clusters = 5
    if len(messages) > 10:
        num_clusters = math.floor(math.sqrt(len(messages)) / 2)
    else:
        num_clusters = 1
    km = KMeans(n_clusters=num_clusters)
    # time.clock() was removed in Python 3.8; perf_counter is the replacement
    t3 = time.perf_counter()
    km.fit(final)
    t4 = time.perf_counter()
    # print("K-means fit time: ", t4 - t3)
    clusters = km.labels_.tolist()

    topics = {}
    for t in range(num_clusters):
        t_name = "topic" + str(t)
        topics[t_name] = {}
        t_messages = []
        for i in range(len(messages)):
            if clusters[i] == t:
                t_messages.append(messages[i])
        topics[t_name]['messages'] = t_messages

    # Export as json
    # with open('topics.json', 'w') as outfile:
    #     json.dump(topics, outfile)

    # Inspect clusters
    # sorted_messages = {'message': messages, 'cluster': clusters}
    # frame = pd.DataFrame(sorted_messages, index=[clusters], columns=['message', 'cluster'])
    # print(frame['cluster'].value_counts())
    #
    # # top words per cluster
    # print("Top terms per cluster:")
    # print()
    # # sort cluster centers by proximity to centroid
    # order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # for i in range(num_clusters):
    #     print("Cluster %d words:" % i, end='')
    #     for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
    #         print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    #     print()  # add whitespace
    #     print()  # add whitespace
    #     print("Cluster %d messages:" % i, end='')
    #     for message in frame.ix[i]['message'].values.tolist():
    #         print(' %s,' % message, end='')
    #     print()  # add whitespace
    #     print()  # add whitespace

    return topics
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer  # needed for WordNetLemmatizer below
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse.csr import csr_matrix
import re, pdb, os, json
import numpy as np
from collections import Counter
import nltk

np.random.seed(1234)
lemmatizer = WordNetLemmatizer()
pattern_train = r"(\d*),\"(.*)\",(\d*)"
pattern_test = r"(\d*),\"(.*)\""
stopwords = list(stopwords.words("english"))
stopwords.append("__EOS__")
sw = [
    "right", "love", "people", "feel", "yeah", "one", "see", "something",
    "want", "year", "yes", "still", "kind",
from nltk import bigrams, trigrams
from nltk.tokenize import RegexpTokenizer  # needed for RegexpTokenizer below
from nltk.stem import WordNetLemmatizer
import nltk
import re
import math
import json
import urllib2  # Python 2; use urllib.request on Python 3

stopwords = nltk.corpus.stopwords.words('english')
tokenizer = RegexpTokenizer("[\w’]+", flags=re.UNICODE)
# st = LancasterStemmer()
wnl = WordNetLemmatizer()

keywords = []
with open('keywords.txt', 'r') as f:
    for i in f:
        keywords.append(i.strip())
with open('stopwords.txt', 'r') as f:
    for i in f:
        stopwords.append(i.strip())


def freq(word, doc):
    return doc.count(word)


def word_count(doc):
    return len(doc)


def tf(word, doc):
    return freq(word, doc) / float(word_count(doc))


def calcu_tf(keyword):
    url = "http://en.wikipedia.com/wiki/" + keyword
    content = urllib2.urlopen(url).read()
def get_ngram(filename=None, _type=None, is_stopword=None):
    file_content = open(filename).read()
    # Get the tokens; word_tokenize splits on punctuation as well as spaces
    tokens = nltk.word_tokenize(file_content)
    text = nltk.Text(tokens)
    word_filter = lambda *w: word_to_find not in w  # unused

    ## Bigrams
    # ENABLE STOP WORDS
    # stopwords = stopwords.words('english')
    stopwords = []
    stopwords.append('.')
    # stopwords.append('The')
    stopwords.append(':')
    stopwords.append(',')
    stopwords.append(';')
    stopwords.append('`')
    stopwords.append('``')
    stopwords.append('\"')
    if is_stopword == 1:
        filtered_tokens = []
        for ftoken in tokens:
            if ftoken not in stopwords:
                filtered_tokens.append(ftoken)
    else:
        filtered_tokens = tokens
    if _type == 2:
        finder = BigramCollocationFinder.from_words(filtered_tokens, window_size=3)
    else:
        finder = TrigramCollocationFinder.from_words(filtered_tokens, window_size=3)
    # keep only n-grams that appear at least once
    finder.apply_freq_filter(1)
    # .items() works on both Python 2 and 3; viewitems() was Python 2 only
    lst = list(finder.ngram_fd.items())
    ll = sorted(lst, key=lambda x: x[1])
    res = []
    for i in ll:
        k = list(i)
        k1 = []
        k1.append(' '.join(k[0]))
        k1.append(k[1])
        res.append(k1)
    return res
        rows = cursor.fetchall()
    except:
        print "Pid : ", pid, " not found"
        return
    return rows

# print nltk.pos_tag(['flipkart', 'samsung'])

# lemmatiser and stopwords initialization
lemmatizer = nltk.WordNetLemmatizer()
from nltk.corpus import stopwords

# building the stopwords list
stopwords = stopwords.words('english')
stoplist = ['>', '<', '%', '.', 'br/', '(', ')', '=', '!']
for i in stoplist:
    stopwords.append(i)

# normalise each qualified word
def normalise(word):
    """Normalises a word to lowercase and lemmatizes it."""
    word = word.lower()
    word = lemmatizer.lemmatize(word)
    return word

def armAssoc(dust, strength):
    print

def writetofile(l):
    feat = []
    remove = ["flipkart", "problem", "time", "product", "awesome", "thing", "port",
              "delivery", "buying", "perfect", "mode", "reason", "anything", "point",
              "excellent", "hand", "till", "fact", "market", "weather", "brand",
              "life", "option", "guide", "money"]
    for i in range(0, len(l)):
import bs4 as bs
import json
import re
import nltk
import heapq
import pickle
import sys
from pprint import pprint
import os
import codecs
import string
from sklearn_crfsuite import metrics
from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from FeatureExtraction import sent2labels, sent2features
from PhraseEval import phrasesFromTestSenJustExtractionWithIndex
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import conlltags2tree, tree2conlltags

# Swapped hardcoded link with system argument
file_inLoc = sys.argv[1]
file_outLoc = sys.argv[1].split(".")[0] + "-DKE.txt"
file_outLoc = file_outLoc[15:]
with open(file_inLoc, 'r', encoding='utf-8-sig') as f:
    article_text = json.load(f)
pprint(article_text)

'''
scraped_data = urllib.request.Request(file_inLoc, headers={'User-Agent': "Magic Browser"})
scraped_data = urllib.request.urlopen(scraped_data)
parsed_article = bs.BeautifulSoup(article, 'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ""
    words = tknzr.tokenize(words)
    exclude = set(string.punctuation)
    words2 = [word for word in words if not word in exclude]
    words_tag = dict(pos_tag(words))
    words = [word.lower() for word in words2
             if not word in nltk.corpus.stopwords.words('english') and not word.isdigit()]
    # print(words)
    words = [lima(word, words) for word in words]
    # print(words)
    words = ' '.join(words)
    # print(words)
    return words


stopwords = stopwords.words('english')
stopwords.append('.')
# stopwords.union('sally')
# operators = set(('sally'))
# stop = set(nltk.corpus.stopwords.words('english')) + operators
# print(stopwords)

sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

chapters = []
with open('user_posts_1641812829207516.csv') as File:
    tfidfReader = csv.reader(File)
    for row in tfidfReader:
        chapters.append(clean(row[0]).encode('utf-8'))

num_chapters = len(chapters)
fvs_lexical = np.zeros((len(chapters), 3), np.float64)
fvs_punct = np.zeros((len(chapters), 3), np.float64)
i = 1
def analyze(soup_object, csv_name):
    # select only the headlines in each google search result
    base = soup_object.select("div.g.card h3")
    # collect the headlines, keeping only the text of each node
    headlines = []
    for row in base:
        headlines.append(row.text)
    # print(headlines)  # verify headlines are clean

    # tokenize headlines using the clean_tokens helper from earlier
    tokens = []
    for each in headlines:
        tokens.append(clean_tokens(each))
    # print('tokens =', tokens)

    # create stopwords list from nltk and add fallout 76 tokens as stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append("fallout")
    stopwords.append('76')
    stopwords.append("'fallout")

    # remove stopwords from tokens
    filtered = []
    for token_list in tokens:  # renamed from `list`, which shadowed the built-in
        x = [word for word in token_list if word not in stopwords]
        filtered.append(x)
    # print("filtered = ", filtered)

    # join the tokens back into headlines without stopwords
    combined = []
    for token_list in filtered:
        combined.append(" ".join(token_list))

    # import the sentiment analyzer, create the sia object, and collect results
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
    sia = SIA()
    results = []
    # analyze sentiment of each headline
    for line in combined:
        pol_score = sia.polarity_scores(line)
        results.append(pol_score)
    # for i in results:
    #     print(i)

    # write sentiment analysis to csv file (note: csv_name is unused; the path is hardcoded)
    with open('resultsfinal.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        for d in results:
            writer.writerow(['compound', d['compound']])
import nltk
from nltk.corpus import stopwords  # needed for stopwords.words below

tweets = pd.read_csv('../data/primary_debates_cleaned.csv')
tweets = tweets.drop(['URL', 'Location', 'Date', 'Line'], axis=1)
tweets = tweets.loc[tweets['Speaker'].isin(
    ['Bush', 'Carson', 'Chafee', 'Christie', 'Clinton', 'Cruz', 'Fiorina',
     'Gilmore', 'Graham', 'Huckabee', 'Jindal', 'Kasich', "O'Malley", 'Pataki',
     'Paul', 'Perry', 'Rubio', 'Sanders', 'Santorum', 'Trump', 'Walker', 'Webb'])]
tweets = tweets.loc[~tweets['Text'].isin(
    ['(APPLAUSE)', '(ANTHEM)', '(BELL)', '(BOOING)', '(COMMERCIAL)', '(CROSSTALK)',
     '(LAUGHTER)', '(MOMENT.OF.SILENCE)', '(SPANISH)', '(VIDEO.END)',
     '(VIDEO.START)', '(inaudible)'])]
# print(tweets)

democrat = tweets[tweets.Party == 'Democratic']
republican = tweets[tweets.Party == 'Republican']

stopwords = stopwords.words('english')
# add some unnecessary words to the stopwords list
stopwords.append("rt")
stopwords.append("u")
stopwords.append("amp")
stopwords.append("w")
stopwords.append("th")

clean_democrat = []
for d in democrat.Text:
    d = re.sub(r'https\S+', '', d)
    d = re.sub("[^a-zA-Z]", " ", d)
    d = d.lower()
    d = nltk.word_tokenize(d)
    d = [word for word in d if not word in set(stopwords)]
    lemma = nltk.WordNetLemmatizer()
    d = [lemma.lemmatize(word) for word in d]
    d = " ".join(d)
def pullAllTheStops(self):
    # note: despite the name, this collects the words that are NOT in self.stops
    stopwords = []
    for word in self.words:
        if word not in self.stops:
            stopwords.append(word)
    return stopwords
import csv
import random
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

stopwords = stopwords.words("english")
stopwords.append('.')
stopwords.append('?')
stopwords.append('\'')
stopwords.append(',')
stopwords.append('’')
stopwords.append(')')
stopwords.append('(')
stopwords.append('/')


class DataLoader:
    """ DataLoader Utility """

    def __init__(self, file):
        self.file = file
        self.documents = []
        self.all_words = []
        self.questions = []
        self.answers = []
        self.questions_and_answer = []

    # doc - document words; dictionary - corpus words
    def get_feature(self, doc, dictionary):
        vector = {}
        words = set(doc)
        for w in dictionary:
def perturb(self, stringseq, method=0):
    assert len(stringseq) > 0, "sentence must not be empty!"
    assert any(
        c.lower() in string.ascii_lowercase for c in stringseq
    ), "the sentence has to contain at least one alphabet letter!"
    tokens = nltk.wordpunct_tokenize(stringseq)
    # if method is not specified, we randomly select one
    if method == 0:
        method = randint(1, 4)
    elif method < 0:
        method = randint(1, 4 + method)  # eliminate certain methods
    # qwerty
    if method == 1:
        while True:
            randidx = randint(0, len(tokens) - 1)
            token = tokens[randidx]
            # has to contain at least one alphabetic character
            if not any(c.lower() in string.ascii_lowercase for c in token):
                continue
            res = self.pert_qwerty.perturb(token)
            if res[0] == res[1].lower():
                continue
            fr, to = self.spans(tokens, stringseq, randidx)
            return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                    (fr + res[2][0], res[2][1]), 0)
    # drop
    elif method == 2:
        stopwords = []  # indices of stopword tokens
        for i, token in enumerate(tokens):
            if token in self.stopwords:
                stopwords.append(i)
        if len(stopwords) > 0:
            randidx = choice(stopwords)
            token = tokens[randidx]
        else:
            randidx = randint(0, len(tokens) - 2)
            token = tokens[randidx]
        fr, to = self.spans(tokens, stringseq, randidx)
        # remove a whitespace if we drop the word
        if to < len(stringseq) and stringseq[to] == ' ':
            # trailing whitespace
            return (stringseq[:fr] + stringseq[to + 1:], tokens[randidx], (fr, 0), 2)
        elif stringseq[fr - 1] == ' ':
            # leading whitespace
            return (stringseq[:fr - 1] + stringseq[to + 1:], tokens[randidx], (fr - 1, 0), 1)
        else:
            return (stringseq[:fr] + stringseq[to + 1:], tokens[randidx], (fr, 0), 0)
    # delete chars
    elif method == 3:
        pos_tags = nltk.pos_tag(tokens)
        cnt = 0
        while True:
            cnt += 1
            if cnt > 5:
                break
            randidx = randint(0, len(tokens) - 2)
            token = tokens[randidx]
            if not any(c.lower() in string.ascii_lowercase for c in token):
                continue
            # pos_tag returns (word, tag) pairs, so compare the tag; the
            # original compared the whole tuple, which never matched
            if len(token) < 2 or pos_tags[randidx][1] in ["TO", "SYM"]:
                continue
            res = self.pert_delete.perturb(token, pos_tags[randidx])
            fr, to = self.spans(tokens, stringseq, randidx)
            return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                    (fr + res[2][0], res[2][1]), 0)
        return self.perturb(stringseq, -2)
    # synonym
    elif method == 4:
        cnt = 0
        while True:
            cnt += 1
            if cnt > 5:  # in case of a dead loop
                break
            randidx = randint(0, len(tokens) - 2)
            token = tokens[randidx]
            # has to be a word
            if not all(c.lower() in string.ascii_lowercase for c in token):
                continue
            res = self.pert_synonym.perturb(token)
            if res[0] == res[1]:
                continue
            fr, to = self.spans(tokens, stringseq, randidx)
            return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                    (fr, res[2][1]), 0)
        return self.perturb(stringseq, -1)
# column names (kept verbatim, since they are data keys):
# 구분 = category, 순위 = rank, 단어 = word, 횟수 = count
dic.append({
    '구분': 'trigram',
    '순위': '1~20',
    '단어': unicode(trigram[0]) + unicode(trigram[1]) + unicode(trigram[2]),
    '횟수': str(dic[19]['횟수']) + u"회 이상"  # "N times or more"
})
csv_columns = ['구분', '순위', '단어', '횟수']
csv_file = filename.split('.')[0] + "freq.csv"
WriteDictToCSV(csv_file, csv_columns, dic)

stopwords = []
stoptext = open("stopwords.txt", "r")
for sw in stoptext.readlines():
    sw = sw.decode('cp949')  # Python 2: decode the CP949-encoded stopword file
    sw = re.sub('\\n', '', sw)
    stopwords.append(sw)

measures = collocations.BigramAssocMeasures()
measures2 = collocations.TrigramAssocMeasures()


def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]


pwd = os.getcwd() + '\\document'
for path, dirs, files in os.walk(pwd):
    for file in files:
# coding=utf-8
import math
from textblob import TextBlob as tb
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# STOPWORDS
stopwords = stopwords.words('portuguese')
stopwords.append('pra')


def remove_stopwords(sentences):
    phrases = []
    for sentence in sentences:
        # generate tokens, keeping only non-stopwords
        tokens = []
        words = word_tokenize(sentence)
        for word in words:
            if word.lower() not in stopwords:
                tokens.append(word)
        phrases.append(' '.join(tokens))
    return phrases


def get_cosine(vec1, vec2):
    size = len(vec1) - 1
    numerator = sum([vec1[x] * vec2[x] for x in range(size)])
    sum1 = sum([vec1[x] ** 2 for x in range(size)])
    sum2 = sum([vec2[x] ** 2 for x in range(size)])
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json  # needed for json.loads below
import re  # needed for re.sub below
import nltk  # needed for nltk.download below
from textblob import TextBlob  # needed for TextBlob below

nltk.download('punkt')

consumerKey = "xxxx"
consumerSecret = "xxxx"
accessToken = "xxxx-xxxx"
accessSecret = "xxxxx"

stopwords = []
for i in open("stopwords.txt"):
    sword = i.rstrip('\n')
    stopwords.append(sword)

global check
check = []


class listener(StreamListener):
    x = 0
    y = 0

    def on_data(self, data):
        all_data = json.loads(data)
        tweet = all_data["text"]
        tweet = re.sub(r"http\S+", "", tweet)
        analysis = TextBlob(tweet)
        stop_words = set(stopwords)
        # remove stop words from tweet
        filtered_words = set(analysis.words.lower()) - stop_words
        for i in set(filtered_words):
            # remove @ tags