def wordcloud(topics, k):
    """Render one grey-scale tag-cloud PNG per topic.

    :param topics: iterable of ``(label, freqs)`` pairs; ``freqs`` is handed
        unchanged to ``WordCloud.fit_words``.
    :param k: value inserted (through ``str``) into the ``<k>tp`` output
        directory name.
    :return: None. Each cloud is written to
        ``./intermediate_data/figures/BTM_wordcould/<k>tp/hpv.<label>.tagcloud.png``.
    """
    from wordcloud.wordcloud import WordCloud

    # The directory depends only on ``k``, so build it once. The spelling
    # "BTM_wordcould" is kept on purpose: it is the path the code has always
    # written to.
    out_dir = "./intermediate_data/figures/BTM_wordcould/" + str(k) + "tp/"

    for label, freqs in topics:
        cloud = WordCloud(color_func=grey_color_func,
                          random_state=1,
                          margin=10,
                          background_color='white').fit_words(freqs)
        # ``(label,)`` is a real one-element tuple, so a label that is itself
        # a tuple is formatted instead of raising TypeError.
        cloud.to_file(out_dir + "hpv.%s.tagcloud.png" % (label,))
def __create__(text, pic_path):
    """Build a masked word cloud from ``text`` and save it to ``pic_path``.

    The font file is picked according to the operating system, and the cloud
    shape comes from the ``back.jpg`` image found under ``jiebaSource``.
    """
    on_windows = str(platform.system()).lower() == 'windows'
    font_path = ('C:/Windows/Fonts/STFANGSO.ttf'
                 if on_windows else '/usr/share/fonts/win/msyh.ttf')

    # Read the background picture that is used as the mask.
    background = imread(jiebaSource + '/back.jpg')

    cloud = WordCloud(mask=background,
                      background_color='white',
                      max_font_size=240,
                      random_state=180,
                      font_path=font_path)
    cloud.generate(text)
    cloud.to_file(pic_path)
def wordcloud(topics=None):
    """Render one grey-scale tag-cloud PNG per topic.

    :param topics: iterable of ``(label, freqs)`` pairs; ``freqs`` is handed
        unchanged to ``WordCloud.fit_words``. ``None`` (the default) means no
        topics, so nothing is written.
    :return: None. Each cloud is written to
        ``./intermediate_data/hpv_tweets/35tp/hpv.<label>.tagcloud.png``.
    """
    from wordcloud.wordcloud import WordCloud

    # A ``None`` sentinel replaces the former shared mutable ``[]`` default.
    if topics is None:
        topics = []

    for label, freqs in topics:
        cloud = WordCloud(color_func=grey_color_func,
                          random_state=1,
                          margin=10,
                          background_color='white').fit_words(freqs)
        # Output folders used by earlier runs of this script:
        #   ./all_data/figures/adv_in_nj/
        #   ./intermediate_data/promotional/25tp/
        #   ./intermediate_data/laypeople/15tp/
        cloud.to_file("./intermediate_data/hpv_tweets/35tp/hpv.%s.tagcloud.png" % (label,))
def __init__(self,
             font_path='C:/Windows/Fonts/simkai.ttf',
             background_color="white",
             max_font_size=200,
             mask=None):
    """Remember the mask and create the underlying ``WordCloud``.

    :param font_path: font file passed to ``WordCloud``.
    :param background_color: background colour passed to ``WordCloud``.
    :param max_font_size: largest font size passed to ``WordCloud``.
    :param mask: optional value accepted by ``Image.open``; when truthy the
        image is loaded into an array and used as the cloud mask.
    """
    self.mask = mask
    # Without a mask the WordCloud receives ``mask=None``.
    self.graph = np.array(Image.open(mask)) if mask else None
    self.wc = WordCloud(font_path=font_path,
                        background_color=background_color,
                        max_font_size=max_font_size,
                        mask=self.graph,
                        collocations=False)
def gen_wordcloud(dataframe: pd.DataFrame, stopwords=None):
    """
    Display a word cloud built from the text messages of a dataframe.

    Only rows whose ``category`` column equals ``'TEXT'`` are used; their
    ``message`` values are joined with single spaces.

    :param dataframe: input dataframe with ``category`` and ``message`` columns
    :param stopwords: set of user stopwords, merged with NLTK's English list.
        Pass ``'off'`` to keep every word; ``None`` uses the NLTK list alone.
    :return: None
    """
    import nltk
    from wordcloud.wordcloud import WordCloud

    # The NLTK stop-word corpus is fetched on every call.
    nltk.download('stopwords')

    if stopwords == 'off':
        stopwords = {}
    else:
        english = nltk.corpus.stopwords.words('english')
        stopwords = english if stopwords is None else stopwords | set(english)

    text_rows = dataframe[dataframe['category'] == 'TEXT']
    text = text_rows['message'].str.cat(sep=' ')

    cloud = WordCloud(stopwords=stopwords)
    cloud.generate(text)

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
class PyCloudWords:
    """Small wrapper that builds, shows and saves a ``WordCloud``."""

    def __init__(self,
                 font_path='C:/Windows/Fonts/simkai.ttf',
                 background_color="white",
                 max_font_size=200,
                 mask=None):
        """Remember the mask and create the underlying ``WordCloud``.

        :param font_path: font file passed to ``WordCloud``.
        :param background_color: background colour passed to ``WordCloud``.
        :param max_font_size: largest font size passed to ``WordCloud``.
        :param mask: optional value accepted by ``Image.open``; when truthy
            the image is loaded into an array and used as the cloud mask.
        """
        self.mask = mask
        self.graph = None
        if mask:
            image = Image.open(mask)
            self.graph = np.array(image)
        self.wc = WordCloud(font_path=font_path,
                            background_color=background_color,
                            max_font_size=max_font_size,
                            mask=self.graph,
                            collocations=False)

    def generate(
            self,
            raw_text=None,
            frequent_dict=None,
    ):
        """Generate the cloud, display it and save it as ``my_wordcloud.png``.

        :param raw_text: non-empty ``str``; it is segmented with ``jieba.cut``
            and the pieces are joined with spaces before generation. Takes
            precedence over ``frequent_dict``.
        :param frequent_dict: non-empty ``dict`` passed to
            ``WordCloud.generate_from_frequencies``; used only when
            ``raw_text`` is not usable.
        :return: None. When neither argument is usable nothing is generated,
            shown or saved.
        """
        if raw_text and isinstance(raw_text, str):
            self.wc.generate(" ".join(jieba.cut(raw_text)))
        elif frequent_dict and isinstance(frequent_dict, dict):
            self.wc.generate_from_frequencies(frequent_dict)
        else:
            # Nothing to draw. Recolouring an empty cloud would only fail.
            return

        # Recolour from the mask image BEFORE showing/saving. Previously this
        # step sat after both ``return`` statements and so never affected a
        # generated cloud. Only 3-dimensional (colour) arrays are used, so a
        # grey-scale mask keeps the default colours as before.
        if self.mask and self.graph.ndim == 3:
            self.wc.recolor(color_func=ImageColorGenerator(self.graph))

        self._show_and_save()

    def _show_and_save(self):
        """Display the current cloud with matplotlib and write the PNG file."""
        plt.imshow(self.wc)
        plt.axis("off")
        plt.show()
        self.wc.to_file("my_wordcloud.png")
def plot_words_cloud():
    '''
    Draw a word cloud of the comments in ``tomato_com['comment']``.

    The comments are joined, segmented with ``jieba.cut_for_search``, reduced
    to words longer than one character that are not stop words, counted, and
    rendered inside the shape (and with the colours) of the mask picture.

    :return: None
    '''
    tomato_str = ' '.join(tomato_com['comment'])

    # ``set.add`` returns None, and ``generate_from_frequencies`` ignores the
    # ``stopwords`` argument anyway, so stop words must be filtered out before
    # counting. A copy is used so the shared STOPWORDS set is not mutated.
    stop_words = set(STOPWORDS) | {'苟利国'}

    words_list = [
        word for word in jieba.cut_for_search(tomato_str)
        if len(word) > 1 and word.lower() not in stop_words
    ]

    back_color = imread('/Users/afa/myFiles/tmp/灰姑娘.png')  # parse the mask picture
    wc = WordCloud(
        background_color='white',  # background colour
        max_words=200,  # maximum number of words
        mask=back_color,  # cloud shape; width and height are ignored when set
        max_font_size=300,  # largest font size
        stopwords=stop_words,  # built-in stop words plus '苟利国'
        font_path="/Library/Fonts/Songti.ttc",
        random_state=42,
        # width=1000,
        # height=860
    )

    tomato_count = Counter(words_list)
    wc.generate_from_frequencies(tomato_count)

    # Take the word colours from the coloured mask picture.
    image_colors = ImageColorGenerator(back_color)

    # Draw the cloud.
    plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis('off')
    plt.show()
    return
def generate_word_cloud(world_file):
    """Create a word cloud image from a UTF-8 text file.

    The text is segmented with ``jieba.cut(..., cut_all=True)``, drawn inside
    the shape of ``love.jpg`` on a black background and recoloured from that
    picture.

    :param world_file: path of the text file to read.
    :return: None. The image is saved next to the input, with the input's
        extension replaced by ``.png``.
    """
    import os

    # The context manager closes the file handle, which used to leak.
    with codecs.open(world_file, encoding='utf-8') as source:
        text = source.read()

    # ``jieba.cut`` returns a one-shot generator; it is consumed exactly once
    # here. The former ``Counter`` built from it afterwards was always empty
    # and never used, so it has been dropped.
    wl_space_split = " ".join(jieba.cut(text, cut_all=True))

    background = np.array(Image.open("love.jpg"))
    my_wordcloud = WordCloud(mask=background, background_color="black")
    my_wordcloud.generate(wl_space_split)

    image_colors = ImageColorGenerator(background)
    my_wordcloud.recolor(color_func=image_colors)

    # ``splitext`` strips only the final extension, so paths such as
    # "./data/words.txt" no longer collapse to ".png".
    my_wordcloud.to_file(os.path.splitext(world_file)[0] + ".png")
def wordcloud_plot(search_term, tweets_dataframe, save_path):
    """Save a word cloud of the ``Tweet`` column and return ``True``.

    :param search_term: used as the prefix of the output file name.
    :param tweets_dataframe: dataframe with a ``Tweet`` column; missing values
        are dropped.
    :param save_path: directory part of the output path.
    :return: ``True`` after ``<save_path>/<search_term>_wordcloud.png`` has
        been written.
    """
    tweet_texts = tweets_dataframe["Tweet"].dropna().values

    # Upper-case the joined tweets, then tokenise and lemmatise with TextBlob.
    blob = TextBlob(" ".join(tweet_texts).upper())
    lemmas = blob.words.lemmatize()

    # English stop words plus 'thi' (presumably a lemmatiser artefact of
    # "this" - confirm).
    ignored = set(stopwords.words('english'))
    ignored.add('thi')

    # Keep words longer than two characters that are not stop words.
    kept = [w for w in lemmas if not (len(w) <= 2 or w.lower() in ignored)]

    cloud = WordCloud(
        font_path=
        "/Users/jamesashford/Documents/Projects/Hackathons/Oxford Hack 2020/OxHack-2020/TCARS/rsc/swiss_911_ultra_compressed_bt.ttf",
        mode="RGBA",
        background_color=None,
        colormap="Blues",
        width=1000,
        height=600,
        max_words=2000)
    cloud.generate(" ".join(kept))

    cloud.to_file(f"{save_path}/{search_term}_wordcloud.png")
    return True
def generate_word_cloud(world_file):
    """Create ``love.png``, a word cloud of a UTF-8 text file.

    The text is segmented with ``jieba.cut(..., cut_all=True)``, drawn inside
    the shape of ``love.jpg`` on a black background and recoloured from that
    picture.

    :param world_file: path of the text file to read.
    :return: None. The image is always written to ``love.png``.
    """
    font = r'C:\Windows\Fonts\Microsoft YaHei UI\MSYHBD.TTC'
    # Alternative considered earlier:
    # os.path.join(os.path.dirname(__file__), "DroidSansFallbackFull.ttf")

    # The context manager closes the file handle, which used to leak.
    with codecs.open(world_file, encoding='utf-8') as source:
        text = source.read()

    wl_space_split = " ".join(jieba.cut(text, cut_all=True))

    backgroup_mask = np.array(Image.open("love.jpg"))
    my_wordcloud = WordCloud(font_path=font,
                             mask=backgroup_mask,
                             background_color="black",
                             max_font_size=100,
                             random_state=42)
    my_wordcloud.generate(wl_space_split)

    # Colour the words from the mask picture.
    image_colors = ImageColorGenerator(backgroup_mask)
    my_wordcloud.recolor(color_func=image_colors)

    my_wordcloud.to_file("love.png")
def gen_wordcloud(file_name):
    """Plot word frequencies and a word cloud for a text file.

    The file is lower-cased and split on newlines and single spaces. Tokens of
    length one or less, NLTK English stop words and the custom stop words
    below are discarded.

    :param file_name: path of the text file to read.
    :return: the list of remaining tokens, in file order.
    """
    with open(file_name, 'r') as handle:
        lines = handle.read().lower().split("\n")

    tokens = [token for line in lines for token in line.split(" ")]

    # (Stemming of the tokens was tried at this point and is disabled.)

    custom_stopwords = {
        'http', 'https', "the", "say", "\'trump", 'co', "trump", "u00eda",
        "uf0339", "udd25", "u201d", "u26a1", "u00e9", "u2019re",
        'realdonaldtrump', 'ud', 'nhttps', 'ude', 'potus', "udc47", "n27",
        "n19", 'udd', 'president', 'amp', 'ryan', "'gop", "u2019s", "Trump",
        "ndr", "donald", "'s", "'n", "gop", "'w", "u2026", "u2019t",
        "'u2019s", "u2019m", "ude02", "ud83c", "ufe0f", "ud83d", "u0627",
        "u2764", "ude0d", "u0644"
    }
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    words2 = [
        token for token in tokens
        if len(token) > 1 and token not in english_stopwords
        and token not in custom_stopwords
    ]

    # Frequency plot of the 35 most common tokens.
    nltk.FreqDist(words2).plot(35)

    text2 = ' '.join(words2)
    print(text2)

    cloud = WordCloud(max_font_size=40).generate(text2)
    plt.figure()
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
    return words2
def on_buttonpress():
    """Rebuild every chart of the dashboard from the current tweets.

    When ``text_input.value`` is non-empty this fills, in order: the emoji bar
    chart (``p``), the top-word bar chart (``p1``), the word-cloud image
    (``fig``), the sentiment bar chart (``sent``), the emotion wedge chart
    (``plot_events``) and an HTML ``Div`` with the most positive/negative
    tweets and the LDA topics, which is appended to ``column2``. With an empty
    input nothing happens.

    All figures, data sources, hover tools, ``t`` and ``topic_num`` are names
    that are not defined inside this function.
    """
    if text_input.value != "":
        # NOTE(review): ``tweets`` is never read below; the code iterates over
        # ``t`` instead, presumably a global filled by get_tweets() - confirm.
        tweets = get_tweets()

        # ---- Emoji bar chart -------------------------------------------
        allemojis = []
        ef = []
        # ``t`` is indexed as t[i][j]['emoji'] / ['text'], i.e. a sequence of
        # sequences of dict-like tweet records.
        for i in range(len(t)):
            rr = len(t[i])
            for j in range(rr):
                allemojis = "".join(t[i][j]['emoji'])
                emoji_list = emoji.emoji_lis(allemojis)
                if emoji_list != []:
                    ef.append(emoji_list)
        # Flatten the per-tweet emoji lists into one list of emoji.
        em = []
        for k in range(len(ef)):
            for l in range(len(ef[k])):
                em.append(ef[k][l]['emoji'])
        emoji_series = pd.Series(em)
        # Frequency table with columns 'emoji' and 'Count'.
        emojis = pd.DataFrame(
            emoji_series.value_counts()).reset_index().rename(
                columns={
                    'index': 'emoji',
                    0: 'Count'
                })
        # NOTE(review): range(1, len(emojis)) is one element shorter than the
        # frame, so the last row gets no Rank; int(x) below would fail on it
        # if that row survives head(10) - confirm intended.
        emojis['Rank'] = pd.Series(range(1, len(emojis)))
        emojis = emojis.head(10)
        emojis['Rank'] = emojis['Rank'].apply(lambda x: int(x))
        source_emoji.data = dict(emoji=emojis['emoji'],
                                 Count=emojis['Count'],
                                 Rank=emojis['Rank'],
                                 color=Paired[10])
        # The emoji themselves are drawn as text labels on the bars.
        labels = LabelSet(x="Rank",
                          y="Count",
                          text="emoji",
                          level='glyph',
                          render_mode='canvas',
                          source=source_emoji,
                          x_offset=-16,
                          y_offset=-14,
                          text_font_size="23pt")
        p.vbar(x="Rank",
               top="Count",
               width=0.95,
               source=source_emoji,
               color="color")
        p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
        p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
        p.y_range.start = 0
        p.x_range.start = 0
        p.xaxis[0].ticker.desired_num_ticks = 10
        p.add_layout(labels)
        p.xaxis.minor_tick_line_color = None
        p.xgrid.visible = False
        p.ygrid.visible = False
        p.xaxis.major_tick_line_color = None
        p.add_tools(hover)

        # ---- Top-word bar chart ----------------------------------------
        alltweets = []
        for j in range(len(t)):
            for i in range(len(t[j])):
                alltweets.append(t[j][i]['text'])
        # NOTE(review): tweets are joined with no separator, so the last word
        # of one tweet touches the first word of the next - confirm intended.
        combined_tweets = "".join(alltweets)
        cleaned_tweets = text_clean(combined_tweets)
        # Frequency table with columns 'Word' and 'Count'.
        ss = pd.DataFrame(
            pd.Series(cleaned_tweets).value_counts()).reset_index().rename(
                columns={
                    'index': 'Word',
                    0: 'Count'
                })
        # Same rank construction as for the emoji table above.
        ss['Rank'] = pd.Series(range(1, len(ss)))
        ss = ss.head(10)
        ss['Rank'] = ss['Rank'].apply(lambda x: int(x))
        data_text = dict(Word=list(ss.Word),
                         Count=list(ss.Count),
                         Rank=list(ss.Rank),
                         color=Category20[10])
        #labels_text = LabelSet(x="Rank", y="Count", text="Word", level='glyph', render_mode='css', source = ColumnDataSource(data_text),
        #x_offset = -7, text_font_size="10pt", y_offset = 7, angle = 45)
        #p1.hbar(y = 'Rank', height = 0.9, right = 'Count', source = ColumnDataSource(data_text), color = "maroon")
        p1.xaxis[0].ticker.desired_num_ticks = 10
        # Show the words instead of the numeric ranks on the x axis; this
        # indexes positions 0..9, so it needs at least ten distinct words.
        p1.xaxis.major_label_overrides = {
            1: list(ss.Word)[0],
            2: list(ss.Word)[1],
            3: list(ss.Word)[2],
            4: list(ss.Word)[3],
            5: list(ss.Word)[4],
            6: list(ss.Word)[5],
            7: list(ss.Word)[6],
            8: list(ss.Word)[7],
            9: list(ss.Word)[8],
            10: list(ss.Word)[9]
        }
        p1.vbar(x="Rank",
                top="Count",
                width=0.95,
                source=ColumnDataSource(data_text),
                color="color")
        #p1.add_layout(labels_text)
        p1.xaxis.minor_tick_line_color = None
        p1.xaxis.major_tick_line_color = None
        p1.xgrid.visible = False
        p1.ygrid.visible = False
        p1.add_tools(hover1)

        # ---- Word-cloud image ------------------------------------------
        # The imported ``stopwords`` name is rebound on the very next line,
        # so the NLTK corpus object is not used here.
        from nltk.corpus import stopwords
        stopwords = set(STOPWORDS)
        stopwords.add('one')
        stopwords.add('also')
        stopwords.add('twitter')
        stopwords.add('pic')
        stopwords.add('https')
        stopwords.add('bit')
        stopwords.add('ly')
        stopwords.add('via')
        stopwords.add('buff')
        wordcloud_good = WordCloud(
            colormap="Dark2",
            width=750,
            height=500,
            stopwords=stopwords,
            scale=4,
            max_words=300,
            background_color='white').generate(combined_tweets)
        # Round-trip through a PNG file to obtain an RGBA image.
        wordcloud_good.to_file('temp.png')
        word_img = Image.open('temp.png').convert('RGBA')
        xdim, ydim = word_img.size
        # Pack the four 8-bit channels of each pixel into one uint32 and flip
        # the rows vertically before handing the array to ``image_rgba``.
        img = np.empty((ydim, xdim), dtype=np.uint32)
        view = img.view(dtype=np.uint8).reshape((ydim, xdim, 4))
        view[:, :, :] = np.flipud(np.asarray(word_img))
        dim = max(xdim, ydim)  # not used below
        # NOTE(review): the cloud is 750 wide by 500 high, but it is drawn
        # with dw=500, dh=750 - confirm the swap is intended.
        fig.image_rgba(image=[img], x=0, y=0, dw=500, dh=750)

        # ---- Sentiment bar chart ---------------------------------------
        def loadLexicon(fname):
            # Read a file into a set of stripped lines. Not called anywhere
            # in this function.
            newLex = set()
            lex_conn = open(fname)
            # add every word in the file to the set
            for line in lex_conn:
                newLex.add(line.strip())  # remember to strip to remove the line-change character
            lex_conn.close()
            return newLex

        sia = SIA()
        results = []
        # One polarity-score dict per tweet, with the tweet text attached.
        for line in alltweets:
            pol_score = sia.polarity_scores(line)
            pol_score['Tweet'] = line
            results.append(pol_score)
        polarity = pd.DataFrame(results)
        # Five lowest and five highest 'compound' scores.
        most_neg = polarity.sort_values('compound', ascending=True)[0:5]
        most_pos = polarity.sort_values('compound', ascending=False)[0:5]
        # Map the compound score to -1 / 0 / 1.
        polarity['Polarity'] = 0
        polarity.loc[polarity['compound'] > 0, 'Polarity'] = 1
        polarity.loc[polarity['compound'] < 0, 'Polarity'] = -1
        # Share of each polarity class, in percent.
        perc = pd.DataFrame(
            polarity.Polarity.value_counts(normalize=True) *
            100).reset_index().rename(columns={
                'index': 'Sentiment',
                'Polarity': 'Percentage'
            })
        perc.Percentage = perc.Percentage.apply(lambda x: round(x, 2))
        perc["Percentage_Text"] = perc.Percentage.apply(
            lambda x: str(round(x, 2)) + "%")
        data_text = dict(Sentiment=list(perc.Sentiment),
                         Percentage=list(perc.Percentage),
                         color=Category20[3],
                         Percentage_Text=list(perc.Percentage_Text))
        labels = LabelSet(x="Sentiment",
                          y="Percentage",
                          text="Percentage_Text",
                          level='glyph',
                          render_mode='css',
                          source=ColumnDataSource(data_text),
                          x_offset=-10,
                          text_font_size="10pt")
        sent.xaxis[0].ticker.desired_num_ticks = 3
        sent.xaxis.major_label_overrides = {
            1: 'Positive',
            0: 'Neutral',
            -1: 'Negative'
        }
        sent.xaxis.minor_tick_line_color = None
        sent.xaxis.major_tick_line_color = None
        sent.vbar(x="Sentiment",
                  top="Percentage",
                  width=0.80,
                  source=ColumnDataSource(data_text),
                  color="color")
        sent.xgrid.visible = False
        sent.ygrid.visible = False
        sent.add_layout(labels)

        # ---- Emotion breakdown -----------------------------------------
        nrc = pd.read_csv("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
                          sep="\t",
                          header=None,
                          names=["term", "category", "flag"])
        emotions = []
        final = pd.DataFrame()  # not used below
        term = list(nrc.term)
        # For every cleaned word found in the lexicon, collect the categories
        # whose flag is 1.
        for i in range(len(cleaned_tweets)):
            if cleaned_tweets[i] in term:
                sub = nrc[nrc.term == cleaned_tweets[i]]
                s = sub[sub.flag == 1]
                if list(s.category) != []:
                    emotions.append(list(s.category))
        # Fold 'positive' into 'joy' and 'negative' into 'sadness', then
        # flatten into one list.
        emotions_clubed = []
        for i in emotions:
            for j in range(len(i)):
                if i[j] == 'positive':
                    i[j] = 'joy'
                elif i[j] == 'negative':
                    i[j] = 'sadness'
                emotions_clubed.append(i[j])
        # Percentage share of each emotion.
        radar = pd.DataFrame(
            pd.Series(emotions_clubed))[0].value_counts(normalize=True)
        radar = round(radar * 100, 2)
        radar_df = pd.DataFrame(radar).reset_index()
        # Wedge angle proportional to the share, over a full circle.
        radar_df['angle'] = radar_df[0] / radar_df[0].sum() * 2 * pi
        radar_df['color'] = Category20c[radar_df.shape[0]]
        radar_df = radar_df.rename(
            columns={
                'index': 'Emotion',
                0: 'Percentage',
                'angle': 'Angle',
                'color': 'Color'
            })
        source_events = ColumnDataSource(
            data=dict(Emotion=radar_df['Emotion'],
                      Percentage=radar_df['Percentage'],
                      Angle=radar_df['Angle'],
                      Color=radar_df['Color']))
        plot_events.wedge(x=0,
                          y=1,
                          radius=0.47,
                          start_angle=cumsum('Angle', include_zero=True),
                          end_angle=cumsum('Angle'),
                          line_color="white",
                          fill_color='Color',
                          source=source_events,
                          legend="Emotion")
        plot_events.add_tools(hover2)

        # ---- LDA topics and tweet panel --------------------------------
        stopwords = set(STOPWORDS)
        stopwords.add('one')
        stopwords.add('also')
        stopwords.add('twitter')
        stopwords.add('pic')
        stopwords.add('https')
        stopwords.add('bit')
        stopwords.add('ly')
        stopwords.add('via')
        stopwords.add('buff')
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        stop_words=stopwords)
        matrix = tf_vectorizer.fit_transform(alltweets)
        vocab = tf_vectorizer.get_feature_names()
        model = lda.LDA(n_topics=topic_num, n_iter=900)
        model.fit(matrix)
        topics = []
        # NOTE(review): the heading says "12" topics while the loop below uses
        # ``topic_num`` - confirm they agree.
        topics.append(
            "<b>Most Frequent Words used in 12 different Topics found in the search</b><br><br>"
        )
        top_words_num = 20
        topic_mixes = model.topic_word_
        for i in range(topic_num):  # for each topic
            # Indexes of the ``top_words_num`` largest weights, largest first.
            top_indexes = np.argsort(topic_mixes[i])[::-1][:top_words_num]
            my_top = ''
            for ind in top_indexes:
                my_top += vocab[ind] + ' '
            topics.append('TOPIC:' + str(i + 1) + ' --> ' + str(my_top) +
                          '<br><br>')
        # Build the HTML for the most positive / most negative tweets.
        pos_nes = []
        pos_neg = []
        pos_nes.append("<b> Top 5 most Positive Tweets </b><br><br>")
        pos = (["> " + i + '<br><br>' for i in most_pos['Tweet']])
        pos_nes.append(pos)
        pos_nes.append("<b> Top 5 most Negative Tweets </b><br><br>")
        neg = (["> " + i + '<br><br>' for i in most_neg['Tweet']])
        pos_nes.append(neg)
        # Entries are either a heading string or a list of strings; joining
        # each one yields a single string per entry.
        for i in range(len(pos_nes)):
            pos_neg.append("".join(pos_nes[i]))
        # Left half: tweets. Right half: topics.
        d = Div(
            text=
            """<div style="width: 49%; text-align: justify; float: left">"""
            + "".join(pos_neg) + "</div>" +
            """ <div style="width: 49%; text-align: justify; float: right">"""
            + "".join(topics) +
            """ </div><div style="width: 2%;text-align:justify;float:center">"""
            + " " + "</div>",
            width=1500,
            height=500)
        column2.children.append(d)
    else:
        pass
# The next three statements are the tail of a helper whose ``def`` line is
# outside this excerpt (presumably ``get_tags``, called below - confirm). They
# keep function-body indentation for that reason. The helper returns the
# ``ntags`` most common entries of ``nouns.vocab()`` as a dict.
    wcData = nouns.vocab().most_common(ntags)
    wcDict = dict(wcData)
    return wcDict
    # except Exception as e:
    #     print(e)
    #     print(text)
    #     break


# Build the frequency dict for the cloud from ``text``.
wcInput = get_tags(text, 100)
# Errors seen here in the past:
# jpype._jexception.OutOfMemoryErrorPyRaisable: java.lang.OutOfMemoryError: Java heap space
# jpype._jexception.NullPointerExceptionPyRaisable: java.lang.NullPointerException
print(wcInput)
# Same data, sorted by count in descending order.
print(sorted(wcInput.items(), key=lambda x:x[1], reverse=True))

wordcloud = WordCloud(font_path='c:/Windows/fonts/malgun.ttf',
                      relative_scaling=0.2,
                      background_color='black').generate_from_frequencies(wcInput)
plt.figure(figsize=(30, 50))
plt.imshow(wordcloud)
plt.title('Top keyword')
plt.axis('off')
plt.show()
# NOTE(review): savefig runs after show(); with an interactive backend the
# figure may already be closed, which would save an empty image - confirm.
plt.savefig('cmtWordcloud.png', dpi=400, bbox_inches='tight')
# # # # print('finished')
def make_cloud():
    """Write four word-cloud PNGs from the ``archive.pkl`` archive.

    ``split_individual("archive.pkl")`` supplies the author list and the
    combined message text. Two clouds are drawn from the text and two from the
    author frequencies, each once with the rainbow colouring and once with the
    default colouring:

    * ``wordcloud-rainbow.png`` / ``wordcloud-standard.png``
    * ``wordcloud-rainbow-authors.png`` / ``wordcloud-standard-authors.png``

    :return: None
    """
    print("Generating word cloud...")
    authors, dates, contents, bigstring = split_individual("archive.pkl")

    def rainbow_color_func(word,
                           font_size,
                           position,
                           orientation,
                           random_state=None,
                           **kwargs):
        # The hue is derived from the second position coordinate only.
        return "hsl(hue,100%,50%)".replace(
            "hue", str(int(position[1] / 1000 * 360)))

    author_counts = Counter(authors)

    # Each style gets its own instance. Previously the single ``wc`` variable
    # had already been rebound to the standard cloud when the
    # "rainbow-authors" file was written, so that file was not rainbow
    # coloured.
    rainbow = WordCloud(width=2000,
                        height=2000,
                        color_func=rainbow_color_func,
                        stopwords=None,
                        collocations=False)
    rainbow.generate(bigstring)
    rainbow.to_file("wordcloud-rainbow.png")
    rainbow.generate_from_frequencies(author_counts)
    rainbow.to_file("wordcloud-rainbow-authors.png")

    standard = WordCloud(width=2000, height=2000, stopwords=None)
    standard.generate(bigstring)
    standard.to_file("wordcloud-standard.png")
    standard.generate_from_frequencies(author_counts)
    standard.to_file("wordcloud-standard-authors.png")
api = tweepy.API(auth)

# Dump the text of every status on the timeline into the 'tweets' file.
# Context managers guarantee the file is closed; the former bare ``f.close``
# (no parentheses) never closed it. UTF-8 is explicit so tweets containing
# non-ASCII characters do not depend on the platform default encoding.
with open('tweets', 'w', encoding='utf-8') as f:
    for status in api.user_timeline():
        # The newline keeps the last word of one tweet from fusing with the
        # first word of the next one.
        f.write(api.get_status(status.id).text + '\n')
        #print(api.get_status(status.id).text.encode('utf8'))

count = 0
# Read everything back as one string, keeping the leading space.
with open('tweets', 'r', encoding='utf-8') as f:
    words = ' ' + f.read()

# Extra stop words on top of the built-in STOPWORDS.
stopwords = {'will', 'youtube', 'YouTube'}

# The cloud is drawn inside the shape of the mask picture.
logomask = imread('twitter_mask.png')

wordcloud = WordCloud(font_path='Voyager.ttf',
                      stopwords=STOPWORDS.union(stopwords),
                      background_color='black',
                      mask=logomask,
                      max_words=2000,
                      width=2400,
                      height=1400).generate(words)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# Save first, then show.
plt.savefig('./tweetcloud3.png', dpi=300)
plt.show()
# Append every record of ``entrada`` (each one joined into a string) to the
# text collected so far. A single join avoids repeated concatenation.
input_data += ''.join(''.join(reg) for reg in entrada)

# Use regexes to filter the data. Mostly this just removes URLs.
# Lines that start with http:// or https://, including their line breaks.
input_data = re.sub(r'^https?:\/\/.*[\r\n]*', '', input_data, flags=re.MULTILINE)
# Any remaining run of non-space characters starting with "http".
input_data = re.sub(r"http\S+", "", input_data)
# Runs starting with "pic" (e.g. pic.twitter.com links; it also matches any
# other text beginning with "pic").
input_data = re.sub(r"pic\S+", "", input_data)
# Runs starting with "https" (already covered by the "http" pattern above).
input_data = re.sub(r"https\S+", "", input_data)

# Generate the word cloud.
wordcloud = WordCloud(max_font_size=600, width=3200, height=1600).generate(input_data)

plt.figure(figsize=(40, 20), facecolor='k')  # set the figure size
plt.imshow(wordcloud)  # embed the image into the canvas
plt.axis("off")
plt.tight_layout(pad=0)
# Save BEFORE showing: after show() returns, an interactive backend may have
# closed the figure, and savefig would then write an empty image.
plt.savefig('wordcloud.png', facecolor='k', bbox_inches='tight')  # saved to the working directory
plt.show()

# Earlier test code, kept for reference in case more features are needed.
'''
with open('output.json', 'r') as f:
    for row in csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE):
        input_data += ''.join(row)
'''
# In[25]: from wordcloud.wordcloud import WordCloud, STOPWORDS from PIL import Image # In[26]: stopwords = set(STOPWORDS) stopwords.add('one') stopwords.add('also') mask_image = np.array(Image.open("images.png")) wordcloud_good = WordCloud(colormap="Paired", mask=mask_image, font_path=None, width=30, height=20, scale=2, max_words=1000, stopwords=stopwords) wordcloud_good.generate(good_para) plt.figure(figsize=(7, 10)) plt.imshow(wordcloud_good, interpolation="bilinear", cmap=plt.cm.autumn) plt.axis('off') plt.figure(figsize=(10, 6)) plt.show() wordcloud_good.to_file("good.png") # In[27]: stopwords = set(STOPWORDS) wordcloud_neu = WordCloud(colormap="plasma",
# Tokenise the upper-cased tweets, then singularise and lemmatise each word.
all_words = TextBlob(" ".join(tweets).upper()).words.singularize().lemmatize()

# English stop words plus 'thi' (presumably what "this" is reduced to by the
# step above - confirm).
stop_words = list(set(stopwords.words('english')))
stop_words.append('thi')

# Drop stop words and anything of two characters or fewer.
words = [
    w for w in all_words
    if not (len(w) <= 2 or w.lower() in stop_words)
]

# One long string for the cloud generator.
tweet_str = " ".join(words)

# Transparent-background cloud in shades of blue.
word_cloud = WordCloud(font_path=f"{PROJ_PATH}rsc/swiss_911_ultra_compressed_bt.ttf",
                       mode="RGBA",
                       background_color=None,
                       colormap="Blues",
                       width=1000,
                       height=1000,
                       max_words=2000)
word_cloud.generate(tweet_str)

# Write the image to the project's output folder.
save_name = f"{PROJ_PATH}output/{search_term}_wordcloud.png"
word_cloud.to_file(save_name)

# Optionally display the cloud with matplotlib.
if PLOT:
    plt.figure(figsize=(15, 10))
    plt.imshow(word_cloud)  #, interpolation='bilinear')
    plt.axis('off')
    plt.show()
import jieba
from wordcloud.wordcloud import WordCloud

# Read all collected comments as one UTF-8 string.
with open('comments/AllComments_shanqiu.txt', 'r', encoding='utf-8') as r:
    datas = r.read()

# NOTE(review): jieba is imported but the text is passed to WordCloud without
# being segmented - confirm whether segmentation was intended.
word_c = WordCloud(
    font_path='STXINWEI.TTF',
    width=1000,
    height=5000,
    margin=10,
    background_color='pink',
).generate(datas)

# Save the rendered cloud.
word_c.to_file('ciyun.jpg')