# Font used when the caller does not supply one. Kept identical to the path
# that used to be hard-coded inside wordcloud_plot so existing callers see no
# change; pass ``font_path`` explicitly on any other machine.
_DEFAULT_WORDCLOUD_FONT = (
    "/Users/jamesashford/Documents/Projects/Hackathons/Oxford Hack 2020/"
    "OxHack-2020/TCARS/rsc/swiss_911_ultra_compressed_bt.ttf"
)


def wordcloud_plot(search_term, tweets_dataframe, save_path,
                   font_path=_DEFAULT_WORDCLOUD_FONT):
    """Render a word cloud of the tweets in ``tweets_dataframe`` to a PNG.

    Args:
        search_term: Used only to build the output file name.
        tweets_dataframe: Object indexable by ``"Tweet"`` whose result
            supports ``.dropna().values`` (e.g. a pandas DataFrame).
        save_path: Directory the image is written to.
        font_path: Font file handed to ``WordCloud``. Defaults to the path
            that was previously hard-coded here.

    Returns:
        ``True`` once ``{save_path}/{search_term}_wordcloud.png`` is written.
    """
    tweets = tweets_dataframe["Tweet"].dropna().values

    # Extract and clean words. ``map(str, ...)`` keeps the join from failing
    # on the odd non-string cell that survives ``dropna``.
    all_words = TextBlob(" ".join(map(str, tweets)).upper()).words.lemmatize()

    # Stop-words as a set: membership is tested once per word below, so a
    # list would make the filter needlessly quadratic.
    stop_words = set(stopwords.words('english')) | {'thi'}

    # Remove stop words and short words (two characters or fewer).
    words = [
        w for w in all_words
        if len(w) > 2 and w.lower() not in stop_words
    ]

    # Convert into one long string
    tweet_str = " ".join(words)

    # Create word-cloud
    word_cloud = WordCloud(
        font_path=font_path,
        mode="RGBA",
        background_color=None,
        colormap="Blues",
        width=1000,
        height=600,
        max_words=2000)
    word_cloud.generate(tweet_str)

    # Save
    file_name = f"{save_path}/{search_term}_wordcloud.png"
    word_cloud.to_file(file_name)
    return True
def __create__(text, pic_path):
    """Draw ``text`` as a word cloud shaped by the background image and save it.

    The font file is chosen by operating system; the mask is read from
    ``back.jpg`` under ``jiebaSource``; the result is written to ``pic_path``.
    """
    # Pick the font that matches the running platform.
    on_windows = str(platform.system()).lower() == 'windows'
    font_path = ('C:/Windows/Fonts/STFANGSO.ttf'
                 if on_windows else '/usr/share/fonts/win/msyh.ttf')

    # Read the background picture used as the cloud mask.
    background = imread(jiebaSource + '/back.jpg')

    cloud = WordCloud(
        font_path=font_path,
        random_state=180,
        max_font_size=240,
        background_color='white',
        mask=background,
    )
    cloud.generate(text)
    cloud.to_file(pic_path)
def wordcloud(topics, k):
    """Write one grey-scale tag-cloud PNG per topic.

    Args:
        topics: Iterable of ``(label, freqs)`` pairs; ``freqs`` is passed
            straight to ``WordCloud.fit_words``.
        k: Topic count, used only to pick the ``<k>tp`` output sub-directory.

    Files are written to
    ``./intermediate_data/figures/BTM_wordcould/<k>tp/hpv.<label>.tagcloud.png``.
    The directory is created if it does not exist yet.
    """
    if not topics:
        # Nothing to render: skip the imports and leave the disk untouched.
        return

    import os
    from wordcloud.wordcloud import WordCloud

    out_dir = "./intermediate_data/figures/BTM_wordcould/" + str(k) + "tp"
    os.makedirs(out_dir, exist_ok=True)

    for label, freqs in topics:
        # Named ``cloud`` so the local does not shadow this function's name.
        cloud = WordCloud(color_func=grey_color_func,
                          random_state=1,
                          margin=10,
                          background_color='white').fit_words(freqs)
        # ``(label,)`` keeps the %-format correct even if a label is a tuple.
        cloud.to_file(out_dir + "/hpv.%s.tagcloud.png" % (label,))
def wordcloud(topics=None, output_dir="./intermediate_data/hpv_tweets/35tp"):
    """Write one grey-scale tag-cloud PNG per topic.

    Args:
        topics: Iterable of ``(label, freqs)`` pairs; ``freqs`` is passed
            straight to ``WordCloud.fit_words``. ``None`` (the default) or an
            empty collection renders nothing.
        output_dir: Directory receiving ``hpv.<label>.tagcloud.png``. The
            default is the location that was previously hard-coded; pass
            another directory instead of editing the function per data set.
            It is created if it does not exist yet.
    """
    # ``None`` replaces the former mutable ``[]`` default.
    if not topics:
        return

    import os
    from wordcloud.wordcloud import WordCloud

    os.makedirs(output_dir, exist_ok=True)

    for label, freqs in topics:
        # Named ``cloud`` so the local does not shadow this function's name.
        cloud = WordCloud(color_func=grey_color_func,
                          random_state=1,
                          margin=10,
                          background_color='white').fit_words(freqs)
        # ``(label,)`` keeps the %-format correct even if a label is a tuple.
        cloud.to_file(
            os.path.join(output_dir, "hpv.%s.tagcloud.png" % (label,)))
def generate_word_cloud(world_file):
    """Build a word cloud from a UTF-8 text file, coloured from ``love.jpg``.

    The text is segmented with ``jieba`` in full mode, laid out inside the
    shape of ``love.jpg``, recoloured from that same image, and saved next to
    the input with the extension replaced by ``.png``.

    Args:
        world_file: Path of the UTF-8 text file to visualise.
    """
    import os  # local import: only needed to derive the output name

    # Context manager so the handle is closed even if reading fails.
    with codecs.open(world_file, encoding='utf-8') as handle:
        text = handle.read()

    # The segmenter's output is consumed exactly once, here. The original
    # also fed the same, already-consumed iterator to ``Counter`` and never
    # used the result, so that dead step is dropped.
    wl_space_split = " ".join(jieba.cut(text, cut_all=True))

    with Image.open("love.jpg") as picture:
        background = np.array(picture)

    my_wordcloud = WordCloud(mask=background, background_color="black")
    my_wordcloud.generate(wl_space_split)

    # Take each word's colour from the background picture.
    image_colors = ImageColorGenerator(background)
    my_wordcloud.recolor(color_func=image_colors)

    # ``splitext`` strips only the final extension, so inputs such as
    # "./notes.txt" or "data.v2/notes.txt" no longer collapse to ".png" or
    # "data.png" the way ``split(".")[0]`` made them.
    my_wordcloud.to_file(os.path.splitext(world_file)[0] + ".png")
def make_cloud():
    """Render four word clouds from ``archive.pkl``.

    Two colour schemes (rainbow and the library default) are each applied to
    the combined message text and to the per-author counts. Output files:
    ``wordcloud-rainbow.png``, ``wordcloud-standard.png``,
    ``wordcloud-rainbow-authors.png`` and ``wordcloud-standard-authors.png``.
    """
    print("Generating word cloud...")
    authors, _dates, _contents, bigstring = split_individual("archive.pkl")
    author_counts = Counter(authors)

    def rainbow_color_func(word, font_size, position, orientation,
                           random_state=None, **kwargs):
        # Hue is derived from the second position coordinate only.
        # NOTE(review): with a 2000-pixel canvas this value can exceed 360;
        # confirm that wrap-around is intended.
        return "hsl(hue,100%,50%)".replace(
            "hue", str(int(position[1] / 1000 * 360)))

    def rainbow_cloud():
        return WordCloud(width=2000, height=2000,
                         color_func=rainbow_color_func,
                         stopwords=None, collocations=False)

    def standard_cloud():
        return WordCloud(width=2000, height=2000, stopwords=None)

    wc = rainbow_cloud()
    wc.generate(bigstring)
    wc.to_file("wordcloud-rainbow.png")

    wc = standard_cloud()
    wc.generate(bigstring)
    wc.to_file("wordcloud-standard.png")

    # Fix: this file used to be drawn with the standard-colour cloud left
    # over from the previous step, so it was never actually rainbow-coloured.
    wc = rainbow_cloud()
    wc.generate_from_frequencies(author_counts)
    wc.to_file("wordcloud-rainbow-authors.png")

    wc = standard_cloud()
    wc.generate_from_frequencies(author_counts)
    wc.to_file("wordcloud-standard-authors.png")
class PyCloudWords:
    """Generate, display and save a word cloud, optionally shaped by a mask.

    Args:
        font_path: Font file handed to ``WordCloud``.
        background_color: Canvas colour.
        max_font_size: Largest font size used for a word.
        mask: Optional path of an image whose pixels become the cloud mask.
    """

    def __init__(self,
                 font_path='C:/Windows/Fonts/simkai.ttf',
                 background_color="white",
                 max_font_size=200,
                 mask=None):
        self.mask = mask
        self.graph = None
        if mask:
            # Read the pixels eagerly and release the file handle right away.
            with Image.open(mask) as image:
                self.graph = np.array(image)
        self.wc = WordCloud(font_path=font_path,
                            background_color=background_color,
                            max_font_size=max_font_size,
                            mask=self.graph,
                            collocations=False)

    def generate(
        self,
        raw_text=None,
        frequent_dict=None,
    ):
        """Build the cloud, show it with matplotlib and save it.

        ``raw_text`` (a non-empty ``str``) takes precedence and is segmented
        with ``jieba`` first; otherwise a non-empty ``dict`` of frequencies
        in ``frequent_dict`` is used. With neither, the call does nothing.

        The image is always written to ``my_wordcloud.png``.
        """
        if raw_text and isinstance(raw_text, str):
            self.wc.generate(" ".join(jieba.cut(raw_text)))
        elif frequent_dict and isinstance(frequent_dict, dict):
            self.wc.generate_from_frequencies(frequent_dict)
        else:
            # No usable input. Previously this path tried to recolour a cloud
            # that had never been generated whenever a mask was set.
            return

        # Fix: recolouring used to sit after the early ``return`` statements,
        # so it never ran on a generated cloud. It now happens before the
        # image is shown and saved.
        self._recolor_from_mask()
        self._show_and_save()

    def _recolor_from_mask(self):
        """Colour the words from the mask image, when a colour mask is set."""
        # NOTE(review): 2-D (grey-scale) masks are skipped on the assumption
        # that ImageColorGenerator needs colour channels — confirm against
        # the wordcloud documentation.
        if self.graph is not None and self.graph.ndim == 3:
            image_color = ImageColorGenerator(self.graph)
            self.wc.recolor(color_func=image_color)

    def _show_and_save(self):
        """Display the current cloud and write it to ``my_wordcloud.png``."""
        plt.imshow(self.wc)
        plt.axis("off")
        plt.show()
        self.wc.to_file("my_wordcloud.png")
def generate_word_cloud(world_file,
                        font_path=r'C:\Windows\Fonts\Microsoft YaHei UI\MSYHBD.TTC',
                        mask_file="love.jpg",
                        output_file="love.png"):
    """Build a word cloud from a UTF-8 text file, shaped and coloured by an image.

    Args:
        world_file: Path of the UTF-8 text file to visualise.
        font_path: Font file handed to ``WordCloud``. Defaults to the Windows
            font that was previously hard-coded; pass e.g.
            ``os.path.join(os.path.dirname(__file__),
            "DroidSansFallbackFull.ttf")`` on other systems.
        mask_file: Image used both as the layout mask and as colour source.
        output_file: Where the rendered PNG is written.
    """
    # Context manager so the handle is closed even if reading fails.
    with codecs.open(world_file, encoding='utf-8') as handle:
        text = handle.read()

    # Full-mode segmentation, re-joined with spaces for WordCloud.
    wl_space_split = " ".join(jieba.cut(text, cut_all=True))

    with Image.open(mask_file) as picture:
        backgroup_mask = np.array(picture)

    my_wordcloud = WordCloud(font_path=font_path,
                             mask=backgroup_mask,
                             background_color="black",
                             max_font_size=100,
                             random_state=42)
    my_wordcloud.generate(wl_space_split)

    # Take each word's colour from the mask picture.
    image_colors = ImageColorGenerator(backgroup_mask)
    my_wordcloud.recolor(color_func=image_colors)

    my_wordcloud.to_file(output_file)
import jieba
from wordcloud.wordcloud import WordCloud

# Load the whole comment dump as UTF-8 text.
with open('comments/AllComments_shanqiu.txt', 'r', encoding='utf-8') as r:
    datas = r.read()

# Lay out a pink 1000x5000 cloud from the raw text, then write it to disk.
word_c = WordCloud(
    background_color='pink',
    margin=10,
    height=5000,
    width=1000,
    font_path='STXINWEI.TTF',
).generate(datas)
word_c.to_file('ciyun.jpg')
from wordcloud.wordcloud import WordCloud

# Load the whole source text as UTF-8.
with open('shenteng.txt', 'r', encoding='utf-8') as r:
    datas = r.read()

# Lay out a pink 1000x1000 cloud from the raw text, then write it to disk.
word_c = WordCloud(
    background_color='pink',
    margin=10,
    height=1000,
    width=1000,
    font_path='STXINWEI.TTF',
).generate(datas)
word_c.to_file('shenten_ciyun.jpg')
def on_buttonpress():
    """Rebuild every dashboard panel for the current search text.

    Does nothing when ``text_input.value`` is empty. Otherwise it fills, in
    order: the emoji bar chart (``p``), the top-word bar chart (``p1``), the
    word-cloud image (``fig``), the sentiment bar chart (``sent``), the
    emotion wedge chart (``plot_events``) and an HTML ``Div`` listing the
    most positive/negative tweets and the LDA topics, which is appended to
    ``column2``.

    All plot objects, data sources, hover tools, ``t`` and ``topic_num`` are
    names defined outside this function.
    """
    if text_input.value != "":
        # NOTE(review): the result is bound to ``tweets`` but every loop
        # below reads ``t``, which is not assigned in this function —
        # presumably a module-level variable filled by get_tweets(); confirm.
        tweets = get_tweets()

        # --- Emoji frequencies -------------------------------------------
        allemojis = []
        ef = []
        # ``t`` is indexed as t[i][j]['emoji'] / t[i][j]['text'] below.
        for i in range(len(t)):
            rr = len(t[i])
            for j in range(rr):
                allemojis = "".join(t[i][j]['emoji'])
                emoji_list = emoji.emoji_lis(allemojis)
                if emoji_list != []:
                    ef.append(emoji_list)
        # Flatten the per-tweet lists into one list of emoji characters.
        em = []
        for k in range(len(ef)):
            for l in range(len(ef[k])):
                em.append(ef[k][l]['emoji'])
        emoji_series = pd.Series(em)
        emojis = pd.DataFrame(
            emoji_series.value_counts()).reset_index().rename(
                columns={
                    'index': 'emoji',
                    0: 'Count'
                })
        # NOTE(review): range(1, len(emojis)) yields one value fewer than
        # there are rows, so the last row appears to get no rank — confirm
        # whether range(1, len(emojis) + 1) was intended.
        emojis['Rank'] = pd.Series(range(1, len(emojis)))
        emojis = emojis.head(10)
        emojis['Rank'] = emojis['Rank'].apply(lambda x: int(x))
        source_emoji.data = dict(emoji=emojis['emoji'],
                                 Count=emojis['Count'],
                                 Rank=emojis['Rank'],
                                 color=Paired[10])
        labels = LabelSet(x="Rank",
                          y="Count",
                          text="emoji",
                          level='glyph',
                          render_mode='canvas',
                          source=source_emoji,
                          x_offset=-16,
                          y_offset=-14,
                          text_font_size="23pt")
        p.vbar(x="Rank",
               top="Count",
               width=0.95,
               source=source_emoji,
               color="color")
        p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
        p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
        p.y_range.start = 0
        p.x_range.start = 0
        p.xaxis[0].ticker.desired_num_ticks = 10
        p.add_layout(labels)
        p.xaxis.minor_tick_line_color = None
        p.xgrid.visible = False
        p.ygrid.visible = False
        p.xaxis.major_tick_line_color = None
        p.add_tools(hover)

        # --- Top-10 words bar chart --------------------------------------
        alltweets = []
        for j in range(len(t)):
            for i in range(len(t[j])):
                alltweets.append(t[j][i]['text'])
        # NOTE(review): tweets are joined with no separator, so the last
        # word of one tweet runs into the first word of the next — confirm
        # whether " ".join was intended.
        combined_tweets = "".join(alltweets)
        cleaned_tweets = text_clean(combined_tweets)
        ss = pd.DataFrame(
            pd.Series(cleaned_tweets).value_counts()).reset_index().rename(
                columns={
                    'index': 'Word',
                    0: 'Count'
                })
        # NOTE(review): same one-short rank range as for the emojis above.
        ss['Rank'] = pd.Series(range(1, len(ss)))
        ss = ss.head(10)
        ss['Rank'] = ss['Rank'].apply(lambda x: int(x))
        data_text = dict(Word=list(ss.Word),
                         Count=list(ss.Count),
                         Rank=list(ss.Rank),
                         color=Category20[10])
        #labels_text = LabelSet(x="Rank", y="Count", text="Word", level='glyph', render_mode='css', source = ColumnDataSource(data_text),
        #x_offset = -7, text_font_size="10pt", y_offset = 7, angle = 45)
        #p1.hbar(y = 'Rank', height = 0.9, right = 'Count', source = ColumnDataSource(data_text), color = "maroon")
        p1.xaxis[0].ticker.desired_num_ticks = 10
        # Replace the numeric rank ticks with the words themselves. Indexes
        # 0..9 are read unconditionally, so fewer than ten distinct words
        # raises IndexError here.
        p1.xaxis.major_label_overrides = {
            1: list(ss.Word)[0],
            2: list(ss.Word)[1],
            3: list(ss.Word)[2],
            4: list(ss.Word)[3],
            5: list(ss.Word)[4],
            6: list(ss.Word)[5],
            7: list(ss.Word)[6],
            8: list(ss.Word)[7],
            9: list(ss.Word)[8],
            10: list(ss.Word)[9]
        }
        p1.vbar(x="Rank",
                top="Count",
                width=0.95,
                source=ColumnDataSource(data_text),
                color="color")
        #p1.add_layout(labels_text)
        p1.xaxis.minor_tick_line_color = None
        p1.xaxis.major_tick_line_color = None
        p1.xgrid.visible = False
        p1.ygrid.visible = False
        p1.add_tools(hover1)

        # --- Word-cloud image --------------------------------------------
        # NOTE(review): the name imported here is rebound on the very next
        # line, so this import has no visible effect in this function.
        from nltk.corpus import stopwords
        stopwords = set(STOPWORDS)
        stopwords.add('one')
        stopwords.add('also')
        stopwords.add('twitter')
        stopwords.add('pic')
        stopwords.add('https')
        stopwords.add('bit')
        stopwords.add('ly')
        stopwords.add('via')
        stopwords.add('buff')
        wordcloud_good = WordCloud(
            colormap="Dark2",
            width=750,
            height=500,
            stopwords=stopwords,
            scale=4,
            max_words=300,
            background_color='white').generate(combined_tweets)
        # Round-trip through a temporary PNG, then repack the RGBA pixels
        # into a uint32 array (flipped vertically) for fig.image_rgba.
        wordcloud_good.to_file('temp.png')
        word_img = Image.open('temp.png').convert('RGBA')
        xdim, ydim = word_img.size
        img = np.empty((ydim, xdim), dtype=np.uint32)
        view = img.view(dtype=np.uint8).reshape((ydim, xdim, 4))
        view[:, :, :] = np.flipud(np.asarray(word_img))
        dim = max(xdim, ydim)  # NOTE(review): never read afterwards
        # NOTE(review): the cloud is created 750 wide by 500 high, but it is
        # drawn with dw=500, dh=750 — confirm the two are not swapped.
        fig.image_rgba(image=[img], x=0, y=0, dw=500, dh=750)

        # --- Sentiment bar chart -----------------------------------------
        # NOTE(review): loadLexicon is defined here but never called within
        # this function.
        def loadLexicon(fname):
            newLex = set()
            lex_conn = open(fname)
            #add every word in the file to the set
            for line in lex_conn:
                newLex.add(line.strip(
                ))  # remember to strip to remove the line-change character
            lex_conn.close()
            return newLex

        # Score every tweet; each result row keeps its tweet text alongside
        # the scores returned by polarity_scores.
        sia = SIA()
        results = []
        for line in alltweets:
            pol_score = sia.polarity_scores(line)
            pol_score['Tweet'] = line
            results.append(pol_score)
        polarity = pd.DataFrame(results)
        # Five lowest and five highest 'compound' scores, shown in the Div
        # built at the end of this function.
        most_neg = polarity.sort_values('compound', ascending=True)[0:5]
        most_pos = polarity.sort_values('compound', ascending=False)[0:5]
        # Bucket the compound score into -1 / 0 / 1.
        polarity['Polarity'] = 0
        polarity.loc[polarity['compound'] > 0, 'Polarity'] = 1
        polarity.loc[polarity['compound'] < 0, 'Polarity'] = -1
        perc = pd.DataFrame(
            polarity.Polarity.value_counts(normalize=True) *
            100).reset_index().rename(columns={
                'index': 'Sentiment',
                'Polarity': 'Percentage'
            })
        perc.Percentage = perc.Percentage.apply(lambda x: round(x, 2))
        perc["Percentage_Text"] = perc.Percentage.apply(
            lambda x: str(round(x, 2)) + "%")
        data_text = dict(Sentiment=list(perc.Sentiment),
                         Percentage=list(perc.Percentage),
                         color=Category20[3],
                         Percentage_Text=list(perc.Percentage_Text))
        labels = LabelSet(x="Sentiment",
                          y="Percentage",
                          text="Percentage_Text",
                          level='glyph',
                          render_mode='css',
                          source=ColumnDataSource(data_text),
                          x_offset=-10,
                          text_font_size="10pt")
        sent.xaxis[0].ticker.desired_num_ticks = 3
        sent.xaxis.major_label_overrides = {
            1: 'Positive',
            0: 'Neutral',
            -1: 'Negative'
        }
        sent.xaxis.minor_tick_line_color = None
        sent.xaxis.major_tick_line_color = None
        sent.vbar(x="Sentiment",
                  top="Percentage",
                  width=0.80,
                  source=ColumnDataSource(data_text),
                  color="color")
        sent.xgrid.visible = False
        sent.ygrid.visible = False
        sent.add_layout(labels)

        # --- Emotion breakdown -------------------------------------------
        nrc = pd.read_csv("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
                          sep="\t",
                          header=None,
                          names=["term", "category", "flag"])
        emotions = []
        final = pd.DataFrame()  # NOTE(review): never read afterwards
        term = list(nrc.term)
        # For every cleaned word found in the lexicon, collect the
        # categories whose flag is 1.
        for i in range(len(cleaned_tweets)):
            if cleaned_tweets[i] in term:
                sub = nrc[nrc.term == cleaned_tweets[i]]
                s = sub[sub.flag == 1]
                if list(s.category) != []:
                    emotions.append(list(s.category))
        # Fold 'positive' into 'joy' and 'negative' into 'sadness', then
        # flatten.
        emotions_clubed = []
        for i in emotions:
            for j in range(len(i)):
                if i[j] == 'positive':
                    i[j] = 'joy'
                elif i[j] == 'negative':
                    i[j] = 'sadness'
                emotions_clubed.append(i[j])
        radar = pd.DataFrame(
            pd.Series(emotions_clubed))[0].value_counts(normalize=True)
        radar = round(radar * 100, 2)
        radar_df = pd.DataFrame(radar).reset_index()
        # Each emotion's share of the full circle, in radians.
        radar_df['angle'] = radar_df[0] / radar_df[0].sum() * 2 * pi
        radar_df['color'] = Category20c[radar_df.shape[0]]
        radar_df = radar_df.rename(
            columns={
                'index': 'Emotion',
                0: 'Percentage',
                'angle': 'Angle',
                'color': 'Color'
            })
        source_events = ColumnDataSource(
            data=dict(Emotion=radar_df['Emotion'],
                      Percentage=radar_df['Percentage'],
                      Angle=radar_df['Angle'],
                      Color=radar_df['Color']))
        plot_events.wedge(x=0,
                          y=1,
                          radius=0.47,
                          start_angle=cumsum('Angle', include_zero=True),
                          end_angle=cumsum('Angle'),
                          line_color="white",
                          fill_color='Color',
                          source=source_events,
                          legend="Emotion")
        plot_events.add_tools(hover2)

        # --- LDA topics --------------------------------------------------
        # Same stop-word set as for the word cloud above, rebuilt from
        # scratch.
        stopwords = set(STOPWORDS)
        stopwords.add('one')
        stopwords.add('also')
        stopwords.add('twitter')
        stopwords.add('pic')
        stopwords.add('https')
        stopwords.add('bit')
        stopwords.add('ly')
        stopwords.add('via')
        stopwords.add('buff')
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        stop_words=stopwords)
        matrix = tf_vectorizer.fit_transform(alltweets)
        vocab = tf_vectorizer.get_feature_names()
        model = lda.LDA(n_topics=topic_num, n_iter=900)
        model.fit(matrix)
        topics = []
        # NOTE(review): the heading says "12 different Topics" while the
        # loop below runs topic_num times — confirm they agree.
        topics.append(
            "<b>Most Frequent Words used in 12 different Topics found in the search</b><br><br>"
        )
        top_words_num = 20
        topic_mixes = model.topic_word_
        for i in range(topic_num):  #for each topic
            # Indexes of the top_words_num largest weights, highest first.
            top_indexes = np.argsort(topic_mixes[i])[::-1][:top_words_num]
            my_top = ''
            for ind in top_indexes:
                my_top += vocab[ind] + ' '
            topics.append('TOPIC:' + str(i + 1) + ' --> ' + str(my_top) +
                          '<br><br>')

        # --- HTML summary panel ------------------------------------------
        # pos_nes alternates heading strings and lists of tweet lines;
        # pos_neg holds each entry joined into a single string.
        pos_nes = []
        pos_neg = []
        pos_nes.append("<b> Top 5 most Positive Tweets </b><br><br>")
        pos = (["> " + i + '<br><br>' for i in most_pos['Tweet']])
        pos_nes.append(pos)
        pos_nes.append("<b> Top 5 most Negative Tweets </b><br><br>")
        neg = (["> " + i + '<br><br>' for i in most_neg['Tweet']])
        pos_nes.append(neg)
        for i in range(len(pos_nes)):
            pos_neg.append("".join(pos_nes[i]))
        # Left block: positive/negative tweets. Right block: topic lines.
        d = Div(
            text=
            """<div style="width: 49%; text-align: justify; float: left">"""
            + "".join(pos_neg) + "</div>" +
            """ <div style="width: 49%; text-align: justify; float: right">"""
            + "".join(topics) +
            """ </div><div style="width: 2%;text-align:justify;float:center">"""
            + " " + "</div>",
            width=1500,
            height=500)
        column2.children.append(d)
    else:
        # Empty search box: leave the dashboard untouched.
        pass
# Word cloud of ``good_para``, laid out inside the shape of images.png.
mask_image = np.array(Image.open("images.png"))
wordcloud_good = WordCloud(
    stopwords=stopwords,
    max_words=1000,
    scale=2,
    height=20,
    width=30,
    font_path=None,
    mask=mask_image,
    colormap="Paired",
).generate(good_para)

plt.figure(figsize=(7, 10))
plt.imshow(wordcloud_good, cmap=plt.cm.autumn, interpolation="bilinear")
plt.axis('off')
# NOTE(review): a second figure is opened here with nothing drawn on it
# before show() — confirm it is wanted.
plt.figure(figsize=(10, 6))
plt.show()
wordcloud_good.to_file("good.png")

# In[27]:

# Word cloud of ``new_para`` with the stock stop-word list and no mask.
stopwords = set(STOPWORDS)
wordcloud_neu = WordCloud(
    stopwords=stopwords,
    max_words=1000,
    scale=2,
    height=700,
    width=1100,
    font_path=None,
    colormap="plasma",
)
wordcloud_neu.generate(new_para)

plt.figure(figsize=(7, 10))
plt.imshow(wordcloud_neu, cmap=plt.cm.autumn)
plt.axis('off')
plt.show()
# Remove Stop and Short Words words = [w for w in all_words if len(w) > 2 and w.lower() not in stop_words] # Convert into one long string tweet_str = " ".join(words) # Create word-cloud word_cloud = WordCloud( font_path=f"{PROJ_PATH}rsc/swiss_911_ultra_compressed_bt.ttf", mode="RGBA", background_color=None, colormap="Blues", width=1000, height=1000, max_words=2000) word_cloud.generate(tweet_str) # Save save_name = f"{PROJ_PATH}output/{search_term}_wordcloud.png" word_cloud.to_file(save_name) # Show in matplotlib if PLOT: plt.figure(figsize=(15, 10)) plt.imshow(word_cloud) #, interpolation='bilinear') plt.axis('off') plt.show() # Get counts of each word # counts = dict(Counter(words)) # ord_counts = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True)) # print(ord_counts)