def generateWordcloud(cleanedTweets):
    # join tweets to a single string
    words = ' '.join(cleanedTweets['text'])
    # remove URLs, RTs, and twitter handles
    no_urls_no_tags = " ".join([
        word for word in words.split()
        if 'http' not in word and not word.startswith('@') and word != 'RT'
    ])
    stopwords = set(STOPWORDS)
    stopwords.add("SAP")
    wc = WordCloud(font_path='CabinSketch-Bold.ttf',
                   background_color="white",
                   max_words=30,
                   width=600,
                   height=300,
                   stopwords=stopwords).generate(no_urls_no_tags)
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(wc)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
    plt.imshow(wc)
    #plt.axis('off')
    plt.savefig('my_twitter_wordcloud_1.png', dpi=300)
    plt.show()
def word_cloud(lst):
    string = " ".join(lst)

    def grey_color_func(word, font_size, position, orientation,
                        random_state=None, **kwargs):
        return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)

    mask = np.array(Image.open("twitter_mask.png"))
    stopwords = set(STOPWORDS)
    stopwords.add("int")
    stopwords.add("ext")
    wc = WordCloud(max_words=75, mask=mask, stopwords=stopwords, margin=10,
                   random_state=1).generate(string)
    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3),
               interpolation="bilinear")
    plt.axis("off")
    plt.figure()
    return plt.show()
def clean_text(texts):  # list of strings
    import re
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    stopwords = set_stopwords()
    stopwords.add('ext')
    stopwords.add('int')
    res = []
    for text in texts:
        text = text.lower()
        text = re.sub("\'ve", " have ", text)
        text = re.sub("\'re", " are ", text)
        text = re.sub("n't", " not ", text)
        text = re.sub("\'ll", " will ", text)
        text = re.sub("cut to", " ", text)
        text = re.sub("scene shifts", " ", text)
        text = re.sub("scene", " ", text)
        text = re.sub("[^A-Za-z\n]", " ", text)
        text = re.sub("\n", " ", text)
        text = ' '.join([
            ps.stem(w) for w in word_tokenize(text)
            if (w not in stopwords) and (len(w) > 1)
        ])
        if text != '':
            res.append(text)
    return res
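# Illustrative usage sketch for clean_text (not part of the original source):
# it assumes the surrounding module provides word_tokenize (e.g. from
# nltk.tokenize) and a set_stopwords() helper returning a set of stopwords,
# and that nltk's 'punkt' tokenizer data is installed.
def _demo_clean_text():
    scripts = [
        "Scene shifts to the office. They're not ready, but they'll try.",
        "Cut to: the street outside.",
    ]
    # Returns lowercased, stemmed text with stopwords and scene directions removed.
    return clean_text(scripts)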
def process(self, inputs: ValueMap, outputs: ValueMap):
    stopwords = set()
    _languages = inputs.get_value_obj("languages")
    if _languages.is_set:
        all_stopwords = get_stopwords()
        languages: ListModel = _languages.data
        for language in languages.list_data:
            if language not in all_stopwords.fileids():
                raise KiaraProcessingException(
                    f"Invalid language: {language}. Available: {', '.join(all_stopwords.fileids())}."
                )
            stopwords.update(get_stopwords().words(language))

    _stopword_lists = inputs.get_value_obj("stopword_lists")
    if _stopword_lists.is_set:
        stopword_lists: ListModel = _stopword_lists.data
        for stopword_list in stopword_lists.list_data:
            if isinstance(stopword_list, str):
                stopwords.add(stopword_list)
            else:
                stopwords.update(stopword_list)

    outputs.set_value("stopwords_list", sorted(stopwords))
def _compute_frequencies(self, word_sent, freq, customStopWords=None):
    # freq is a collections.defaultdict(int) shared with the caller
    if customStopWords is None:
        stopwords = set(self._stopwords)
    else:
        stopwords = set(customStopWords).union(self._stopwords)
    newStopWords = ['new', 'york', 'times', 'washington', 'post', '-']
    for newWord in newStopWords:
        stopwords.add(newWord)
    for sentence in word_sent:
        for word in sentence:
            if word not in stopwords:
                freq[word] += 1
                print(word + "\t" + str(freq) + "\n\n")
    # Normalise the counts and drop words outside the cut-off band.
    # Iterate over a copy of the keys so entries can be deleted safely.
    m = float(max(freq.values()))
    for word in list(freq.keys()):
        freq[word] = freq[word] / m
        if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
            del freq[word]
    return freq
def get_term_lines(path):
    with open(path, 'r') as file:
        lines = list()
        line_start = 0
        for line in file:
            lines.append((line, line_start))
            line_start += len(line)
    term_lines = list()
    from nltk.corpus import stopwords
    #from nltk.tokenize import word_tokenize
    stopwords = set(stopwords.words('english'))
    stopwords.add("able")
    stopwords.add("other")
    stopwords.add("another")
    stopwords.add("whether")
    stopwords.add(",")
    for text, position in lines:
        term_line = list()
        text = text.lower()
        words = [word for word in text[0:-1].replace(",", " ,").split(" ")]
        for word in words:
            if word in stopwords:
                term_line.append(Term(word, -1))
            if word not in [term.text for term in term_line]:
                term_line.append(Term(word, text.find(word) + position))
        #print([term.text for term in term_line])
        term_lines.append(term_line)
    return term_lines
def get_stopwords():
    stop_words_sp = set(stopwords.words('spanish'))
    stop_words_en = set(stopwords.words('english'))
    # Use a new name for the combined set: assigning to 'stopwords' here would
    # shadow the nltk corpus and make the calls above raise UnboundLocalError.
    stop_words = stop_words_sp | stop_words_en
    stop_words.add('para')
    spanish_stemmer = SnowballStemmer('spanish')
    return set(map(spanish_stemmer.stem, stop_words))
def getStopWords(path):
    stopwords = set()
    with open(path, "r") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.add(line.replace("\r\n", "").rstrip())
    return stopwords
def load_stopwords(file):
    stopwords = set()
    with open(file, "r") as input_file:
        for line in input_file:
            if not line.strip():
                continue
            words = line.split()
            stopwords.add(words[0])
    return stopwords
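# Usage sketch (added for illustration, not from the original snippet):
# 'stopwords.txt' is a hypothetical one-word-per-line file; the returned set
# can be extended with add() before it is used to filter tokens.
def _demo_load_stopwords():
    stopwords = load_stopwords("stopwords.txt")
    stopwords.add("rt")
    return [t for t in "rt this is a test".split() if t not in stopwords]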
def tokenizeText(sample):
    doc = nlp(sample)
    stopwords = set(STOPLIST)
    stopwords.add("queryset")
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    a_lemmas = [
        lemma for lemma in lemmas
        if (lemma.isalpha() and lemma != '-PRON-') and lemma not in stopwords
        and lemma not in SYMBOLS
    ]
    return a_lemmas
def getStopWords():
    """
    Return the stop words read from the file given by stopwords_file_path.
    :return: set of stop words
    """
    stopwords = set()
    with open(stopwords_file_path) as file:
        for line in file:
            stopwords.add(line.strip())
    return stopwords
def load_twitter_stopwords(file):
    stopwords = set()
    with open(file, "r") as input_file:
        for line in input_file:
            if not line.strip():
                continue
            words = line.split(",")
            for word in words:
                stopwords.add(word)
    return stopwords
def read_stopwords():
    stopwords = set()
    f = open('stopwords.txt')
    for line in f:
        word = line.strip()
        if not word:
            continue
        stopwords.add(word)
    f.close()
    return stopwords
def obter_palavras(self):
    from nltk.corpus import stopwords
    palavras = word_tokenize(self.texto_artigo.lower())
    stopwords = set(list(punctuation))
    stopwords.add('“')
    stopwords.add('”')
    palavras = [
        palavra for palavra in palavras if palavra not in stopwords
    ]
    return palavras
def create_wordcloud(text):
    from wordcloud import WordCloud, STOPWORDS
    stopwords = set(STOPWORDS)
    additional_stopwords = [
        'one', 'see', 'yes', 'really', 'yeah', 'maybe', 'say', 'know',
        'think', 'well', 'lot', 'make', 'will', 'also', 'don', 'going',
        'go', 'something', 'everything'
    ]
    for new_word in additional_stopwords:
        stopwords.add(new_word)
    # Use the imported WordCloud class directly; the bare 'wordcloud' module
    # is not imported in this function.
    wc = WordCloud(stopwords=stopwords).generate(text)
    return wc
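# Usage sketch (illustrative only, not in the original): create_wordcloud
# returns a fitted WordCloud object, so it can be rendered with matplotlib or
# written to disk via WordCloud.to_file(); the output path is hypothetical.
def _demo_create_wordcloud():
    text = "yeah I really think the new release will make testing easier"
    wc = create_wordcloud(text)
    wc.to_file("demo_wordcloud.png")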
def read_stopwords_in_set(self):
    stopwords = set()
    try:
        with open(self.stop_words_file_path, "r") as file:
            # the stopwords file must contain a single stop word per line
            line = file.readline()
            while line:
                stopwords.add(line.strip())
                line = file.readline()
    except FileNotFoundError as fe:
        print(f"Stopwords file not found! {fe}")
    return stopwords
def obter_palavras_sem_stopwords(self):
    from nltk.corpus import stopwords
    palavras = word_tokenize(self.obter_texto_artigos())
    stopwords = set(stopwords.words('portuguese') + list(punctuation))
    stopwords.add('“')
    stopwords.add('”')
    self.palavras_sem_stopwords = [
        palavra for palavra in palavras if palavra not in stopwords
    ]
    return self.palavras_sem_stopwords
def remove_stopwords(self, cleaned_tweets, searchTerm):
    stopwords = set(STOPWORDS)
    stopwords.add(searchTerm)
    for tweetId in cleaned_tweets:
        cleanTweet = ''
        tokens = cleaned_tweets[tweetId].split()
        for i in range(len(tokens)):
            tokens[i] = tokens[i].lower()
            if tokens[i] not in stopwords:
                cleanTweet += tokens[i] + " "
        cleaned_tweets[tweetId] = cleanTweet
    return cleaned_tweets
def show_img(self, board, limit_num):
    reddit = praw.Reddit(client_id='EwAVjgascYrGIg',
                         client_secret='Z7HahaiGdEKl3e57vml1VkC0pVc',
                         user_agent='hunghunghung1231')
    # choose which subreddit to search
    subreddit = reddit.subreddit(board)
    top_subreddit = subreddit.top()  # grab most up-voted topics of all time
    top_subreddit = subreddit.hot(limit=limit_num)  # user chooses how many posts to crawl
    topics_dict = {
        "title": [],
        "score": [],
        "id": [],
        "url": [],
        "comms_num": [],
        "created": [],
        "body": []
    }
    for submission in top_subreddit:
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)
    topics_data = pd.DataFrame(topics_dict)

    def get_date(created):
        return dt.datetime.fromtimestamp(created)

    _timestamp = topics_data["created"].apply(get_date)
    topics_data = topics_data.assign(timestamp=_timestamp)
    # join all titles into a single string for the word cloud
    comment_string = ''
    for word in topics_data['title']:
        comment_string += word
    stopwords = set(stopwords_l)
    stopwords.add('https')
    stopwords.add('gif')
    wc = WordCloud(height=500, width=1000, background_color='white',
                   stopwords=stopwords).generate(comment_string)
    img = wc.to_file('img.png')
    plt.imshow(img)
    plt.axis("off")
    plt.show()
def remove_stop(reviews):
    new_reviews = {}
    stopwords = set()
    reader = csv.reader(open('snowball_stopwords.txt', 'r'))
    for row in reader:
        stopwords.add(row[0])
    for review in reviews:
        new_reviews[review] = []
        # iterate over the words of this review, not over the reviews mapping
        for word in reviews[review]:
            if word not in stopwords:
                new_reviews[review].append(word)
        print(new_reviews[review])
    return new_reviews
def load_stopwords(path):
    """
    This function loads a stopword list from the *path* file and returns a
    set of words. Lines beginning with '#' are ignored.
    """
    # Set of stopwords
    stopwords = set([])
    # For each line in the file
    for line in codecs.open(path, 'r', 'utf-8'):
        if not re.search('^#', line) and len(line.strip()) > 0:
            stopwords.add(line.strip().lower())
    # Return the set of stopwords
    return stopwords
def __init__(self, fname, verbose):
    # fixme: check format of supported file
    self.stories = []
    # collect stop words
    stopwords = set()
    fin = open('/Users/msingh/cs221/project/mctDataSet/mctest-master/data/stopwords.txt', 'r')
    for stopword in fin:
        s = stopword.lower().strip()
        stopwords.add(s)
    fin.close()
    # process stories
    fin = open(fname, 'r')
    if verbose > 0:
        print('Reading file %s: START' % (fname))
    for story in fin:
        story = story.strip()
        s = Story(stopwords)
        data = re.split('\t', story)
        s.setName(data[0])
        if verbose > 4:
            print('Reading story %s' % (data[0]))
        properties = re.split(';', data[1])
        for p in properties:
            (name, v) = re.split(': ', p)
            if name == 'Author':
                if verbose > 4:
                    print('Setting Author %s for story %s' % (v, data[0]))
                s.setAuthor(v)
                continue
            if name == 'Work Time(s)' or name == 'Work Time':
                s.setTime(v)
                continue
        s.setStory(data[2])
        index = 3
        while True:
            s.setQuestion(data[index], data[index + 1:index + 5])
            index += 5
            if index + 5 > len(data):
                break
        self.stories.append(s)
    fin.close()
    if verbose > 0:
        print('Reading file %s: DONE' % (fname))
def show_wordcloud(self, data, title='High', size=5, name='wc', file='X',
                   makefile=False, show=True):
    stopwords = set(STOPWORDS)
    stopwords.add('shoe')
    if file != 'X':
        # open the mask image passed in via 'file' (not the undefined name X)
        masking = np.array(Image.open(file))
        wordcloud = WordCloud(background_color='white',
                              stopwords=stopwords,
                              max_words=100,
                              max_font_size=70,
                              scale=1,
                              random_state=2,
                              mask=masking).generate(data)
    else:
        wordcloud = WordCloud(background_color='white',
                              stopwords=stopwords,
                              max_words=100,
                              max_font_size=70,
                              scale=1,
                              random_state=2).generate(data)
    if show == True:
        fig = plt.figure(1, figsize=(5 + size, 5 + size))
        plt.axis('off')
        if title:
            fig.suptitle(title, fontsize=20)
            fig.subplots_adjust(top=2.3)
        plt.imshow(wordcloud)
        plt.show()
    if makefile == True:
        nn = '{}.jpg'.format(name)
        wordcloud.to_file(nn)
def nube_de_palabras(text):
    stopwords = set(STOPWORDS)
    stopwords.add("queryset")
    stopwords.add("'")
    plt.figure(figsize=(20, 5))
    wordcloud = WordCloud(background_color='white',
                          stopwords=stopwords).generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    image = io.BytesIO()
    plt.savefig(image, format='png')
    image.seek(0)  # rewind the data
    string = base64.b64encode(image.read())
    image_64 = 'data:image/png;base64,' + urllib.parse.quote(string)
    return image_64
def fnCreate_WordCloud(i, text):
    if len(text) > 5:
        stopwords = set(STOPWORDS)
        stopwords.add("said")
        d = path.dirname(__file__)
        parent_dir = path.abspath(d + "/../")
        d = path.join(parent_dir, r'nlp\static\images\img')
        # lower max_font_size
        wordcloud = WordCloud(background_color="white",
                              max_font_size=45,
                              stopwords=stopwords).generate(text)
        ## The PIL way (if you don't have matplotlib)
        image = wordcloud.to_image()
        # store default colored image
        filename = str(i) + '.png'
        image.save(path.join(d, filename), "PNG")
def write_crawl_results(self, the_path, my_query, the_cnt_in):
    # Use fetch_urls to get URLs, then pass each one to my_scraper.
    import pandas as pd
    from nltk.stem import PorterStemmer
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    stopwords.add('e')
    my_stem = PorterStemmer()
    query_df = pd.DataFrame()
    for q in my_query:
        the_urls_list = self.fetch_urls(q, the_cnt_in)
        for word in the_urls_list:
            tmp_txt = self.my_scraper(word)
            if len(tmp_txt) != 0:
                try:
                    body_stem = [my_stem.stem(word) for word in tmp_txt]
                    query_df = query_df.append(
                        {
                            'body basic': tmp_txt,
                            'body stem': body_stem,
                            'label': q
                        },
                        ignore_index=True)
                except:
                    pass
    print(query_df)
def create_wordcloud(yt_comments):
    # Prepare data for the word cloud
    current_time = time.time()
    image_path = 'static/wordcloud_images/wordcloud' + str(current_time) + '.png'
    comment_words = ""
    for words in yt_comments:
        comment_words = comment_words + words + ' '
    # adding movie script specific stopwords
    stopwords = set(STOPWORDS)
    stopwords.add("movie")
    stopwords.add("film")
    stopwords.add("trailer")
    wordcloud = WordCloud(width=400,
                          height=400,
                          background_color='cyan',
                          stopwords=stopwords,
                          min_font_size=10).generate(comment_words)
    # plot the WordCloud image
    plt.figure(figsize=(8, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    # plt.tight_layout(pad=0)
    plt.savefig(image_path)
    return image_path
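# Usage sketch (illustrative only): this create_wordcloud variant expects an
# iterable of comment strings and returns the path of the saved PNG; it
# assumes matplotlib/WordCloud/time are imported and that the
# 'static/wordcloud_images/' directory already exists.
def _demo_youtube_wordcloud():
    yt_comments = ["Great trailer!", "The movie looks amazing", "Cannot wait"]
    return create_wordcloud(yt_comments)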
def generate_wordcloud(source, keyword, text):
    T = Tokenizer()
    stopwords = set(STOPWORDS)
    stopwords.add(keyword.split('@')[-1])
    for w in T.stopwords:
        stopwords.add(w)
    wc_path = ""
    try:
        # Generate a word cloud image
        wordcloud = WordCloud(background_color="white",
                              max_words=50,
                              width=400,
                              height=400,
                              stopwords=stopwords).generate(text)
        wc_path = os.path.join(wordcloud_dir, "%s_%s.jpg" % (source, keyword))
        wordcloud.to_file(wc_path)
    except Exception as ex:
        print("generate_wordcloud", str(ex))
    return wc_path
from __future__ import division
import json
import nltk
from sklearn.feature_extraction import DictVectorizer
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
stopwords.add('DT')
stopwords.add('CD')

import sys
import os
nltk.data.path.append(os.getcwd() + "/../downloads/nltk_data/")
sys.path.append(os.getcwd() + "/Code/")
import string
import re
import subprocess as sp
import numpy as np
from informative_prior_logistic_regresssion_sw import InformativePriorLogisticRegressionWeight
from eval.entity_level_evaluation import load_gold
from eval.entity_level_evaluation import load_dictionary
import operator
import matplotlib.pyplot as plt
"""
Model to train
"""


def train_informative_logreg(_X, _Q, w0, b0, C=5):
    """
    _X: size (Nmentions by Nfeat)
    _Q: vector length Nmentions. for each, P(z_i=1|x, y)
    model: for example a LogisticRegression object.
print "step 2" # words = [] # words_count = [] #for f in filtered_words: # for t in f : # temp = t.lower() # if temp != 'much' and temp != 'last' and temp != 'next' and temp != 'green' : # if pos_tag(word_tokenize(temp))[0][1] == 'JJ' : # text += " " # text += temp print "step 3" # read the mask image # taken from # http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg alice_mask = np.array(Image.open(path.join(d, "names/MG1.png"))) stopwords = set(STOPWORDS) stopwords.add("said") wc = WordCloud(background_color="white", max_words=3000, mask=alice_mask, stopwords=stopwords) # generate word cloud wc.generate(show_text) # store to file wc.to_file(path.join(d, "Mcgee.png"))
    top_three = np.argsort(W[idx])[::-1][:3]
    return top_three


if __name__ == '__main__':
    content = get_content()
    stopwords = set(stopwords.words('english'))
    for word in [
            'also', 'would', 'could', 'saw', 'report', 'bfro', 'like', 'said',
            'YEAR', 'SEASON', 'MONTH', 'STATE', 'COUNTY', 'LOCATION',
            'DETAILS', 'TOWN', 'NEAREST', 'ROAD', 'OBSERVED', 'NOTICED',
            'OTHER', 'WITNESSES', 'STORIES', 'TIME', 'CONDITIONS',
            'ENVIRONMENT'
    ]:
        stopwords.add(word)
    vectorizer = CountVectorizer(stop_words=stopwords)
    td_mat = vectorizer.fit_transform(content)
    V = td_mat.toarray()
    feature_names = vectorizer.get_feature_names()
    k = 5
    nmf = NMF(n_components=k)
    nmf.fit(V)
    W = nmf.transform(V)
    H = nmf.components_
    err = nmf.reconstruction_err_
    # top words per topic
    top_words = get_top_words(H, feature_names)
    wordcloud = WordCloud(background_color=None, mode='RGBA',
                          random_state=42).generate_from_frequencies(pairs)
    print(counts)
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(filename, bbox_inches='tight')


def tokenize_text(stopwords, text):
    '''split text, lemmatize words, remove stopwords'''
    lemma = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    return [lemma.lemmatize(w.lower()) for w in tokenizer.tokenize(text)
            if w.lower() not in stopwords]


if __name__ == '__main__':
    stopwords = set(stopwords.words('english'))
    stopwords.add('using')
    stopwords.add('dis')
    new_tokenizer = partial(tokenize_text, stopwords)
    fin, fout = 'paper-titles.txt', 'wordcloud.png'
    with open(fin) as f:
        titles = [new_tokenizer(line) for line in f]
    make_wordcloud(fout, titles)
def load_stopwords(path):
    stopwords = set([])
    with open(path, "r") as f:
        for line in f.readlines():
            stopwords.add(line.rstrip())
    return stopwords
    para = re.sub('<footref.*?/>', '', para)
    para = re.sub('</?signpost>', '', para)
    para = re.sub('</?description>', '', para)
    para = re.sub('</?blockquote>', '', para)
    para = re.sub('</?bookref.*?>', '', para)
    return textwrap.wrap(para, width)


sections = OrderedDict()
custom = json.load(open('fotw_custom.json'))
parser = etree.XMLParser(resolve_entities=False)
tree = etree.parse('fotw.xml', parser=parser)
root = tree.getroot()
stopwords = set(stopwords.words('english'))
for w in ['turn', 'wish', 'want', 'turning', 'rather', 'would', 'along',
          'upon', 'another']:
    stopwords.add(w)

for sect_elem in root.findall('.//section[@class="numbered"]')[1:]:
    #for sect_elem in root.findall('.//section[@id="sect%s"]' % 181):
    sect_id = sect_elem.find('.//title').text
    sect_paras = []
    choices = []
    combat = {}
    enemies = []
    rnt_found = False
    ac_found = False
    stats_found = False
    undead_found = False
    sommerswerd_found = False
    immune_to_mindblast_found = False
    illustration_found = False