def create_wordclouds(self, text, name_of_cloud, additional_stop_list, max_words, width, height, bigram=False):
    text_nopunc = self.remove_punctuation(text, "", "")
    text_lower = text_nopunc.lower()
    stop = self.stopwords
    stop.extend(additional_stop_list)
    text_nostop = self.remove_stopword(text_lower, stop)
    tokens = wt(text_nostop)
    text_lem = self.lemmatize(tokens)
    tokens_lem = wt(text_lem)
    my_bigrams = nltk.bigrams(tokens_lem)
    if bigram:
        bigram_merged = list()
        for line in my_bigrams:
            bigram_merged.append(line[0] + ' ' + line[1])
        counts = collections.Counter(bigram_merged)
    else:
        counts = collections.Counter(tokens_lem)
    final = counts.most_common(max_words)
    max_count = max(final, key=operator.itemgetter(1))[1]
    final = [(name, count / float(max_count)) for name, count in final]
    # tags = make_tags(final, maxsize=max_word_size)
    # create_tag_image(tags, name_of_cloud+'.png', size=(width, height), layout=3,
    #                  fontname='Crimson Text', background=(255, 255, 255))
    # temp_cloud = " ".join(text for text, count in final)
    word_cloud = WordCloud(font_path="fonts/Georgia.ttf", width=width, height=height,
                           max_words=max_words, stopwords=stop)
    word_cloud.fit_words(dict(final))  # fit_words expects a {word: weight} dict in wordcloud >= 1.3
    word_cloud.to_file(name_of_cloud + ".png")
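# Editor's note: several snippets in this collection pass a list of
# (word, weight) tuples to fit_words(). That matched wordcloud releases
# before 1.3; current releases expect a mapping and call .items() on it.
# A minimal standalone sketch of the pattern above under that assumption
# (sample_text and the output file name are illustrative only):
import collections
from wordcloud import WordCloud

sample_text = "data cloud data word cloud word data word"
counts = collections.Counter(sample_text.split())
max_count = max(counts.values())
# normalize weights to [0, 1], as create_wordclouds does above
weights = {word: count / max_count for word, count in counts.most_common(50)}
wc = WordCloud(width=800, height=400, max_words=50)
wc.fit_words(weights)
wc.to_file("sample_cloud.png")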
def make_clouds(files, n_words=20):
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<->bug
    bug_to_id = json.loads(open(files.replacements).read())
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR
        t_rar = np.sqrt(pTW[i] * pWT[i])
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words]
        # dict() because zip() is a lazy iterator in Python 3 and fit_words needs a mapping
        wc.fit_words(dict(zip(top_words, t_rar[top_word_ids])))
        wc.to_file(output_d + str(i) + '.png')
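# A toy sketch of the relevance score used above: beta is a (topics x words)
# matrix; column-normalizing approximates p(topic | word), row-normalizing
# p(word | topic), and their geometric mean ranks words per topic. Here l1
# normalization is used so the factors are probabilities (the snippet above
# relies on sklearn's default, which is l2); the data is made up.
import numpy as np
from sklearn.preprocessing import normalize

beta = np.array([[4.0, 1.0, 1.0],
                 [1.0, 1.0, 8.0]])
pTW = normalize(beta, norm='l1', axis=0)  # p(topic | word), columns sum to 1
pWT = normalize(beta, norm='l1', axis=1)  # p(word | topic), rows sum to 1
t_rar = np.sqrt(pTW[0] * pWT[0])          # word scores for topic 0
print(t_rar.argsort()[::-1])              # word ids, best first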
def topic_word_cloud(nmf, topic_idx, max_words=300, figsize=(14, 8), width=2400, height=1300, ax=None):
    '''
    Create word cloud for a given topic
    INPUT:
        nmf: NMFClustering object
        topic_idx: int
        max_words: int
            Max number of words to incorporate into the word cloud
        figsize: tuple (int, int)
            Size of the figure if an axis isn't passed
        width: int
        height: int
        ax: None or matplotlib axis object
    '''
    wc = WordCloud(background_color='white', max_words=max_words, width=width, height=height)
    word_freq = nmf.topic_word_frequency(topic_idx)

    # Fit the WordCloud object to the specific topic's word frequencies
    wc.fit_words(word_freq)

    # Create the matplotlib figure and axis if they weren't passed in
    if not ax:
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
    ax.imshow(wc)
    ax.axis('off')
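# Possible usage of topic_word_cloud above, laying several topics out on one
# figure by passing an axis in; `nmf` is assumed to be an NMFClustering-like
# object exposing topic_word_frequency():
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
for topic_idx, ax in enumerate(axes.flat):
    topic_word_cloud(nmf, topic_idx, max_words=100, ax=ax)
fig.tight_layout()
plt.show()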
def get_wordcloud_img(self, interval_id):
    text_freq = self.get_word_frequencies(interval_id)
    wordcloud = WordCloud(font_path=FONT_PATH, width=self.image_width,
                          height=int(self.image_width * .75))
    # take the 100 most frequent words; fit_words expects a {word: weight}
    # mapping, so the order of the pairs no longer matters
    wordcloud.fit_words(dict(text_freq[-100:]))
    img_io = BytesIO()  # JPEG bytes need io.BytesIO (StringIO only works on Python 2)
    wordcloud.to_image().save(img_io, 'JPEG', quality=70)
    img_io.seek(0)
    return img_io
def generate_cloud(self, tags, sizeX, sizeY, filename=None):
    # tags is already the {word: weight} dict that fit_words expects;
    # zipping keys() with values() only rebuilt the same mapping as pairs
    wordcloud = WordCloud(width=sizeX, height=sizeY, relative_scaling=0.6)
    wordcloud.fit_words(tags)
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off")
    if filename is not None:
        plt.savefig(filename, facecolor='k', bbox_inches='tight')
    plt.show()
def make_cloud(docs):
    flat_doc = count_words(docs)
    from wordcloud import WordCloud
    wc = WordCloud(ranks_only=True,
                   font_path='/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf')
    wc.fit_words(flat_doc)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
def generate_wordcloud(vocab, metric, name):
    '''
    Generate a simple word cloud of text
    '''
    # build the {word: weight} mapping fit_words expects
    word_weights = {w: int(c * 100) for w, c in zip(vocab, metric)}

    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white")
    wordcloud.fit_words(word_weights)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.savefig('../../figures/' + name)
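# The int(c * 100) step above is only needed by very old wordcloud releases;
# generate_from_frequencies rescales weights by the maximum internally, so
# float metrics can be passed as-is. A minimal sketch:
from wordcloud import WordCloud

WordCloud(background_color="white").fit_words(
    {"recall": 0.91, "precision": 0.87, "f1": 0.89})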
def paint_clouds(genre, cloud_words):
    '''
    For a given genre (text), paint a word cloud of at most cloud_words (int)
    words. Call the load_frequencies function to get a frequency list with 50
    more words in it than are needed for the word cloud, in case some don't fit.
    '''
    freq_list = load_frequencies(genre, cloud_words + 50)
    wc = WordCloud(background_color="white", max_words=cloud_words,
                   max_font_size=40, random_state=42)
    wc.fit_words(freq_list)
    fig = plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.title(genre)
    plt.show()
    filename = '../data/cloud_' + genre + '.png'
    fig.savefig(filename)
def topic_time_and_cloud(df, topic, feature_names, nmf, title, source=False,
                         normalize=False, freq='W', year=True, max_words=300,
                         positivity=True, show=True):
    fig = plt.figure(figsize=(14, 8.5))
    ax1 = fig.add_axes([0.05, 0.5, 0.93, 0.41])
    article_count_by_time(df, topic=topic, source=source, normalize=normalize,
                          freq=freq, year=year, fig=fig,
                          label=topic_labels[topic[1]], show=False)
    ax1.xaxis.labelpad = -4
    plt.suptitle(title, fontsize=20)
    fig.text(0.05, 0.44, 'Author: Erich Wellinger', fontsize=10, alpha=0.7)
    fig.text(0.33, 0.8, 'github.com/ewellinger/election_analysis',
             fontsize=20, color='gray', alpha=0.5)

    outlets = [('nyt', 'NYT', '#4c72b0'), ('foxnews', 'FOX', '#c44e52'),
               ('npr', 'NPR', '#55a868'), ('guardian', 'GUA', '#8172b2'),
               ('wsj', 'WSJ', '#ccb974')]

    # Create a boolean mask for whether each document is in the topic or not
    labels_mask = topic[0][:, topic[1]]
    num_articles = labels_mask.sum()
    percent_by_source = [float(len(df.loc[(labels_mask) & (df['source'] == outlet)])) / num_articles
                         for outlet in zip(*outlets)[0]]
    normalized = [percent / np.sum(df['source'] == outlet)
                  for percent, outlet in zip(percent_by_source, zip(*outlets)[0])]
    normalized = [percent / np.sum(normalized) for percent in normalized]
    plt.title('Number of Articles in Topic: {}'.format(num_articles), x=0.4825)

    # NOTE (original author): You should incorporate the word_cloud function in here!!!
    if not positivity:
        ax2 = fig.add_axes([0.025, 0, 0.79, 0.43])
        wc = WordCloud(background_color='white', max_words=max_words,
                       width=1900, height=625)
    else:
        num_sources = 0
        for idx in xrange(len(outlets)):
            if len(df.loc[(labels_mask) & (df['source'] == outlets[idx][0])]) >= 5:
                num_sources += 1
        ax2 = fig.add_axes([0.025, 0, 0.712125 - (num_sources * 0.034425), 0.43])
        wc = WordCloud(background_color='white', max_words=max_words,
                       width=1715 - (num_sources * 83), height=625)
        ax4 = fig.add_axes([0.782125 - (num_sources * 0.034425), 0.035,
                            0.034425 + (num_sources * 0.034425), 0.375])
    word_freq = topic_word_freq(nmf.components_, topic[1], feature_names)
    wc.fit_words(word_freq)
    ax2.imshow(wc)
    ax2.axis('off')
    ax3 = fig.add_axes([0.825, 0.01, 0.15555, 0.4])
    normalized_source_barchart(df, topic, outlets, ax3)
    if positivity:
        sentiment_source_barchart(df.loc[labels_mask], outlets, ax=ax4)
        if num_sources < 3:
            ax4.set_title('')
    if show:
        plt.show()
    return ax1
def generate_wordcloud(y, vocab):
    '''
    Generate a simple word cloud of text
    '''
    ingred_counts = np.sum(y, axis=0)
    # build {ingredient: count}; the loop variable must not shadow vocab
    word_cloud_freq = {str(v): int(i) for i, v in zip(ingred_counts, vocab)}

    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white")
    wordcloud.fit_words(word_cloud_freq)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.savefig('../../figures/vocab_wordcloud.png')
def post_process():
    # with open('clda_data/out_prism', 'r') as fin:
    #     phi_prism = [np.array(ast.literal_eval(line.strip())) for line in fin]
    # phi_prism = np.array(phi_prism)
    # theta_pb = np.load('/tmp/peircebayes/avg_samples.npz')
    # theta_pb = np.load('/home/rares/Desktop/peircebayes_all_no_sampling/last_sample.npz')
    theta_pb = np.load('data/avg_samples.npz')
    phi = theta_pb['arr_1']
    print phi.shape
    vocab = pickle.load(open('data/vocab.pkl', 'r'))
    inv = dict((v, k) for k, v in vocab.iteritems())
    axis = 1
    index = list(np.ix_(*[np.arange(i) for i in phi.shape]))
    index[axis] = phi.argsort(axis)
    a = phi[index][:, -20:]
    counts = np.rint(a / np.sum(a, axis=1).reshape(-1, 1) * 1000).tolist()
    idx_l = index[axis][:, -20:].tolist()
    words = [[inv[i] for i in subl] for subl in idx_l]
    # pprint(words)
    # The PRISM comparison below depends on phi_prism, whose loading is
    # commented out above, so it is disabled too to avoid a NameError.
    # index_prism = list(np.ix_(*[np.arange(i) for i in phi_prism.shape]))
    # index_prism[axis] = phi_prism.argsort(axis)
    # a_prism = phi_prism[index_prism][:, -20:]
    # idx_l_prism = index_prism[axis][:, -20:].tolist()
    # words_prism = [[inv[i] for i in subl] for subl in idx_l_prism]
    # pprint(words_prism)
    # topic 1
    freq1 = list(reversed(zip(words[0], list(a[0, :]))))
    # topic 2
    freq2 = list(reversed(zip(words[1], list(a[1, :]))))
    # freq1_prism = list(reversed(zip(words_prism[19], list(a_prism[19, :]))))
    # freq2_prism = list(reversed(zip(words_prism[18], list(a_prism[18, :]))))
    wc = WordCloud(background_color="white", width=400, height=400,
                   random_state=1234).fit_words(freq1)
    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3))
    plt.axis("off")
    plt.savefig('data/topic_1.pdf', format='pdf')
    plt.close()
    plt.imshow(wc.fit_words(freq2).recolor(color_func=grey_color_func, random_state=3))
    plt.axis("off")
    plt.savefig('data/topic_2.pdf', format='pdf')
    plt.close()
def make_word_cloud(topic_num, max_words=1000, width=10, height=10):
    post_nmf = pickle.load(open(POST_NMF_PICKLE))
    post_tfidf = pickle.load(open(POST_TFIDF_PICKLE))
    words = np.array(post_tfidf.get_feature_names())
    freq_sum = np.sum(post_nmf.components_[topic_num])
    frequencies = [val / freq_sum for val in post_nmf.components_[topic_num]]
    word_freq = dict(zip(words, frequencies))  # fit_words needs a mapping, not a zip iterator
    wc = WordCloud(background_color='white')
    wc.fit_words(word_freq)
    # fig = plt.figure(figsize=(10, 10))
    # ax = fig.add_subplot(111)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    return word_freq
def topic_word_cloud(self, topic_num, max_words=200, figsize=None, width=2400,
                     height=1300, ax=None, mask_fname=None, inherit_color=False):
    '''
    Create word cloud for a given topic
    INPUT:
        topic_idx: int
        max_words: int (default 200)
            Max number of words to incorporate into the word cloud
        figsize: tuple (int, int)
            Size of the figure if an axis isn't passed
        width: int (default 2400)
        height: int (default 1300)
        ax: None or matplotlib axis object
        mask_fname: None or str
            None if no mask is desired, otherwise a string providing the path
            to the image being used as the mask
        inherit_color: bool, default False
            Indicates whether the wordcloud should inherit the colors from the
            image mask
    '''
    if figsize is None:
        figsize = self.figsize
    if mask_fname:
        mask = np.array(Image.open(mask_fname))
        wc = WordCloud(background_color='white', max_words=max_words, mask=mask,
                       width=width, height=height)
    else:
        wc = WordCloud(background_color='white', max_words=max_words,
                       width=width, height=height)
    word_freq = self.nmf.topic_word_frequency(topic_num)

    # Fit the WordCloud object to the specific topic's word frequencies
    wc.fit_words(word_freq)

    # Create the matplotlib figure and axis if they weren't passed in
    if not ax:
        fig = plt.figure(figsize=self.figsize)
        ax = fig.add_subplot(111)
    if mask_fname and inherit_color:
        image_colors = ImageColorGenerator(imread(mask_fname))
        plt.imshow(wc.recolor(color_func=image_colors))
        plt.axis('off')
    else:
        ax.imshow(wc)
        ax.axis('off')
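# A self-contained sketch of the mask + recolor technique used above,
# assuming a local image "mask.png" (hypothetical file): non-white regions
# define the cloud shape, and ImageColorGenerator colors each word from the
# underlying pixels.
import numpy as np
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

mask = np.array(Image.open("mask.png"))
wc = WordCloud(background_color='white', max_words=200, mask=mask)
wc.fit_words({"alpha": 1.0, "beta": 0.6, "gamma": 0.3})
wc.recolor(color_func=ImageColorGenerator(mask))
wc.to_file("masked_cloud.png")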
def make_word_cloud(cluster_to_words, colormaps):
    b64_figures = []
    # a subset font containing the commonly used kanji
    font_path = "./crover/data/font/NotoSansJP-Regular_subset.otf"
    for i in range(len(cluster_to_words)):
        wordcloud = WordCloud(font_path=font_path, background_color="white",
                              width=500, height=500,
                              colormap=colormaps[i % len(colormaps)])
        logger.info('fit word cloud')
        if len(cluster_to_words[i]) == 0:
            cluster_to_words[i] = {'ベクトル未割り当てワードなし': 1}
        wordcloud.fit_words(cluster_to_words[i])
        logger.info('save word cloud')
        # save the image to an in-memory buffer and hand it back to the HTML page
        buf = io.BytesIO()
        img = wordcloud.to_image()
        img.save(buf, 'PNG')
        # img.save(buf, 'JPEG')
        logger.info('b64 encode')
        qr_b64str = base64.b64encode(buf.getvalue()).decode("utf-8")
        b64_figures.append("data:image/png;base64,{}".format(qr_b64str))
        # b64_figures.append("data:image/jpg;base64,{}".format(qr_b64str))
    return b64_figures
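# The data-URI strings returned above can be dropped straight into <img>
# tags; a hedged usage sketch (cluster_to_words and colormaps are assumed to
# exist as in the function's caller):
figures = make_word_cloud(cluster_to_words, colormaps)
html = "".join('<img src="{}">'.format(src) for src in figures)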
def plot_top_words_with_filters(num_word_instances, stop_words, small_words, lower, more_stop_words):
    tweets = bok_tweets.text
    if lower:
        tweets = tweets.str.lower()
    if stop_words:
        tweets = tweets.apply(remove_stopwords)
    if small_words:
        tweets = tweets.str.findall(r'\w{3,}').str.join(' ')
    if len(more_stop_words) > 0:
        # precompute the extra stopword set; the original genexp reused the
        # name x for two different things
        extra_stops = {w.strip() for w in more_stop_words.split(',')}
        remove_more_stopwords = lambda text: ' '.join(
            y for y in text.split() if y not in extra_stops)
        tweets = tweets.apply(remove_more_stopwords)
    tdm_df = create_term_document_matrix2(tweets, min_df=2)
    word_frequencies = tdm_df[[x for x in tdm_df.columns if len(x) > 1]].sum()
    sorted_words = word_frequencies.sort_values(ascending=False)
    top_sorted_words = sorted_words[:num_word_instances]
    wordcloud = WordCloud(max_font_size=40)
    wordcloud.fit_words(top_sorted_words.to_dict())
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
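# Sketch of the same term-frequency path with a concrete vectorizer, since
# create_term_document_matrix2 above is project-specific; assumes
# scikit-learn >= 1.0 for get_feature_names_out().
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

tweets = pd.Series(["big data word cloud", "word cloud of tweets", "big word cloud"])
vec = CountVectorizer(min_df=2)
tdm = pd.DataFrame(vec.fit_transform(tweets).toarray(),
                   columns=vec.get_feature_names_out())
top_words = tdm.sum().sort_values(ascending=False)[:10]
WordCloud(max_font_size=40).fit_words(top_words.to_dict())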
def creating_cloud(given_dict_with_freq_words):
    # keep the 200 most frequent words (assumes the dict is ordered by
    # frequency); the original counter broke one item too late and kept 201
    dict_two_hundred = {}
    for counting, (word, freq) in enumerate(given_dict_with_freq_words.items()):
        if counting == 200:
            break
        dict_two_hundred[word] = freq
    wordcloud = WordCloud(colormap='prism', background_color='white')
    wordcloud = wordcloud.fit_words(dict_two_hundred)
    wordcloud.to_file('PrideAndPrejudice.png')
def gen_word_cloud_picture(words_stat, font_path="./demo.ttf",
                           mask_file="./data/heart.jpg",
                           word_color_img="./data/pink.jpg",
                           background_color="white"):
    # use a custom image as the cloud shape, then render the word cloud
    mask_img = imread(mask_file)
    wordcloud = WordCloud(background_color=background_color, mask=mask_img,
                          font_path=font_path)
    word_frequence = {x[0]: x[1] for x in words_stat.head(20000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    color_img = imread(word_color_img)
    mask_color = ImageColorGenerator(color_img)
    return wordcloud.recolor(color_func=mask_color)
def show(words_stat):
    # render the word cloud
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    # specify the font family, font size, and background color
    wordcloud = WordCloud(font_path='hanyiqihei.ttf', background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    # word_frequence is already the {word: count} mapping fit_words expects;
    # the round-trip through a list of tuples was redundant
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()
def draw_wordcloud(period_i, i):
    word_count = np.array(collections.Counter(period_i).most_common())
    # word-frequency dictionary
    tf = {word_count[j][0]: int(word_count[j][1]) for j in range(len(word_count))}
    coloring = np.array(Image.open("zhongxing.jpg"))  # mask image
    my_wordcloud = WordCloud(background_color="white", max_words=2000,
                             mask=coloring, max_font_size=60, random_state=42,
                             scale=2,
                             font_path=os.environ.get("FONT_PATH", "C:/Windows/Fonts/simfang.ttf"))
    my_wordcloud.fit_words(tf)
    image_colors = ImageColorGenerator(coloring)
    plt.figure(figsize=(18.5, 10.5))
    plt.imshow(my_wordcloud.recolor(color_func=image_colors))
    plt.xticks([]), plt.yticks([])  # hide the axis ticks
    plt.axis("off")
    # plt.imshow(my_wordcloud)  # this redrew the uncolored cloud over the recolored one
    plt.savefig("period_" + str(i) + "_wordcloud.jpg")
    plt.show()
def main():
    # fetch the first 10 pages of comments for the first movie
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[1]['id'], num)
        commentList.append(commentList_temp)

    # join the collected lists into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # strip punctuation with a regular expression (keep CJK characters only)
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    print(cleaned_comments)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords (quoting=3 disables quote handling entirely)
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"num": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["num"], ascending=False)

    # display as a word cloud
    d = path.dirname(__file__)
    alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))
    wordcloud = WordCloud(font_path=r'simhei.ttf', background_color="white",
                          max_font_size=80, mask=alice_mask)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    # word_frequence is already the dict fit_words expects; the tuple-list
    # round-trip that used to sit here was redundant
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    wordcloud.to_file('wwxd.png')  # save the word cloud image
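# The segment -> DataFrame -> groupby counting seen in this and the following
# snippets can be written more compactly with collections.Counter; a hedged
# equivalent, assuming jieba and a plain-text stopword file (the file and
# font names are illustrative):
import collections
import jieba
from wordcloud import WordCloud

text = "这部电影的画面很美但是剧情有点平淡"
with open("stopwords.txt", encoding="utf-8") as f:
    stop = set(f.read().split())
words = [w for w in jieba.lcut(text) if w not in stop]
freq = dict(collections.Counter(words).most_common(1000))
WordCloud(font_path="simhei.ttf", background_color="white").fit_words(freq)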
def main():
    # fetch the first page of comments for the first movie
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(1):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentList.append(commentList_temp)

    # join the collected lists into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    # with the symbols removed, this is now one plain string
    cleaned_comments = ''.join(filterdata)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords (quoting=3 disables quote handling; ~ negates the mask)
    stopwords = pd.read_csv(file_path + "stopwords.txt", index_col=False,
                            quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    bg_pic = imread(file_path + '3.jpg')
    # display as a word cloud
    if 'Windows' in platform.system():
        wordcloud = WordCloud(mask=bg_pic, font_path="simhei.ttf",
                              background_color="white", max_font_size=80)
    else:
        wordcloud = WordCloud(font_path="/Library/Fonts/Songti.ttc",
                              background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(100).values}
    wordcloud = wordcloud.fit_words(word_frequence)  # the argument must be a dict
    image_colors = ImageColorGenerator(bg_pic)

    # show the word cloud
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def character_view_picture(data):
    # backgroud_Image = plt.imread('1.jpg')  # use a background image if you have one
    picture = WordCloud(
        # mask=backgroud_Image,
        width=1024,
        height=768,
        background_color='white',
        font_path=r"C:\simhei.ttf",
        max_font_size=400,
        random_state=50)
    picture = picture.fit_words({x[0]: x[1] for x in data.head(100).values})
    plt.imshow(picture, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    base_path = path.dirname(__file__)
    picture.to_file(path.join(base_path, "yuebing.png"))
def main():
    # fetch the first 20 pages of comments for the first movie
    commentslist = []
    nowplayingmovie_list = getmovie_list()
    for i in range(20):
        num = i + 1
        commentlist_temp = get_comment(nowplayingmovie_list[0]['id'], num)
        commentslist.append(commentlist_temp)

    # join the collected lists into one string (the original iterated over an
    # undefined name, eachcomment)
    comments = ''
    for com in range(len(commentslist)):
        comments = comments + (str(commentslist[com])).strip()

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    clean_comments = ''.join(filterdata)
    # print(clean_comments)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(clean_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords; quoting=3 (not quotechar="3") disables quote handling,
    # and the isin() check must target the stopwords column
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopwords'], encoding='gb2312')
    words_df = words_df[~words_df.segment.isin(stopwords.stopwords)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # display as a word cloud
    wordcloud = WordCloud(font_path="C:/windows/fonts/simhei.ttf",
                          background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)  # already a dict
    plt.imshow(wordcloud)
    # plt.savefig("result.jpg")
    plt.axis('off')
    plt.show()
def generate_word_cloud(data, stopwords, **kwargs):
    """
    Generate a word cloud.
    :param data: word cloud data
    :param stopwords: stopword list
    :param kwargs:
    :return:
    """
    movie_name = kwargs.get('movie_name')
    if movie_name:
        data_com_X = data[data.movie == movie_name]
    else:
        data_com_X = data
    content_X = data_com_X.comment.dropna().values.tolist()

    # tokenize
    segment = []
    for line in content_X:
        try:
            segs = jieba.lcut(line)
            for seg in segs:
                if len(seg) > 1 and seg != '\r\n':
                    segment.append(seg)
        except Exception as e:
            # print(line)
            continue

    # drop stopwords
    words_df = pd.DataFrame({'segment': segment})
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({'计数': np.size})
    words_stat = words_stat.reset_index().sort_values(by=['计数'], ascending=False)
    # print(words_stat.head())

    # word cloud
    word_cloud = WordCloud(font_path='./data/simhei.ttf',
                           background_color='white', max_font_size=80)
    words_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    print(words_frequence)
    word_cloud = word_cloud.fit_words(words_frequence)
    plt.imshow(word_cloud)
    return True
def make_wordcloud():
    for folder in os.listdir('LRC'):
        os.chdir(r'E:\代码\python\CloudSpider')
        # read each file in turn, tokenize into all_words, then build
        # all_words_new by filtering against the stopword list
        all_words = []
        outstr = ''
        for filename in os.listdir('LRC/' + folder):
            with open('LRC/' + folder + '/' + filename, encoding='utf-8') as f:
                lyrics = f.read()
            data = jieba.cut(lyrics)
            all_words.extend(set(data))
        for word in all_words:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        all_words_new = outstr.split(" ")  # back to a list

        # count the words in all_words_new and sort by frequency
        count = Counter(all_words_new)
        result = sorted(count.items(), key=lambda x: x[1], reverse=True)
        # filter in one pass; removing from a list while iterating skips items
        result = [r for r in result if r[0] not in ('', '\ufeff', '\n')]
        # print(result[0:20])

        # render the word cloud
        word_dic = dict(count.items())
        # let matplotlib display Chinese
        mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font
        mpl.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box
        color_mask = imread('bg_love.jpg')  # background image
        cloud = WordCloud(
            font_path='msyh.ttc',  # point this at a font file on your machine
            width=600,
            height=480,
            background_color='black',
            mask=color_mask,
            max_words=350,
            max_font_size=150)
        word_cloud = cloud.fit_words(word_dic)
        os.chdir('word_picture')
        word_cloud.to_file(folder + '.jpg')
def wcfigure(wordsdf, path=r'd:\test.jpg'):
    # draw a word cloud from a DataFrame and save it to the given path;
    # a Chinese font must be loaded or the characters come out garbled
    wordcloud = WordCloud(font_path='c:\\windows\\fonts\\STCAIYUN.TTF',
                          background_color="white", margin=5,
                          width=1800, height=1000)
    # print segStat.head(100).itertuples(index=False)
    # (a stray wdcounts.head(2000) expression did nothing and referenced an
    # undefined name, so it was dropped)
    wordcloud = wordcloud.fit_words(wordsdf.itertuples(index=False))
    plt.figure(num=None, figsize=(25, 16), dpi=8, facecolor='w', edgecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(path)
    plt.show()
    plt.close()
def main():
    commentList = []
    # this collection loop was commented out while the code below referenced
    # an undefined eachCommentList; re-enabling it and using commentList
    # makes the function self-contained
    for i in range(15):
        pagenum = i + 1
        commentList_temp = getComment(nowplaying_list[0]['id'], pagenum)
        commentList.append(commentList_temp)

    comments = ''
    for i in range(len(commentList)):
        comments = comments + (str(commentList[i])).strip()

    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='GBK')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)  # already a dict
    plt.imshow(wordcloud)
    pylab.show()
def main():
    # fetch the first 10 pages of comments for the movie
    commentList = []
    # NowPlayingMovie_list = getNowPlayingMovie_list()
    movieId = getMovieIdByName()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(movieId, num)
        # commentList_temp = getCommentsById('27133303', num)
        commentList.append(commentList_temp)

    # join the collected lists into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords (quoting=3 disables quote handling)
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # display as a word cloud
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80, width=800, height=400)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    # word_frequence is already the dict fit_words expects; the tuple-list
    # rebuild that used to sit here was never used
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def one_movie():
    comment_list = []
    nowplaying_movie_list = getNowPlayingMovie_list()
    for i in range(10):  # first 10 pages
        num = i + 1
        # index 0 is the currently showing film ("The Eight Hundred")
        comment_list_temp = getCommentsById(nowplaying_movie_list[0]['id'], num)
        comment_list.append(comment_list_temp)
    # print(comment_list)

    # join the collected lists into one string
    comments = ''
    for j in range(len(comment_list)):
        comments = comments + (str(comment_list[j])).strip()

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filter_data = re.findall(pattern, comments)
    cleaned_comments = ''.join(filter_data)

    # tokenize with jieba and rank words by TextRank weight
    result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
    keywords = {}
    for k in result:
        keywords[k[0]] = k[1]  # turn the (word, weight) pairs into a dict
    print('before delete stopword:', keywords)

    # read the stopwords into a set
    stopwords = set()
    f = open('./movie_stopwords.txt', encoding='utf-8')
    while True:
        word = f.readline()
        if word == '':
            break
        stopwords.add(word[:-1])
    print(stopwords)
    keywords = {x: keywords[x] for x in keywords if x not in stopwords}
    print('after delete stopword:', keywords)

    # draw the word cloud
    wordcloud = WordCloud(font_path='simhei.ttf', background_color='white',
                          max_font_size=80, stopwords=stopwords)
    word_frequence = keywords
    myword = wordcloud.fit_words(word_frequence)
    plt.axis('off')
    plt.imshow(myword)
    # save first; calling show() first leaves an empty canvas behind
    plt.savefig('movie_comments_from_douban.png', dpi=300, bbox_inches='tight')
    plt.show()
def main():
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]["id"], num)
        commentList.append(commentList_temp)

    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep='\t', names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    words_stat = words_df.groupby(by=['segment'])['segment'].agg({'计数': numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    wordcloud = WordCloud(font_path='simhei.ttf', background_color='white',
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    print(word_frequence)
    word_frequence_list = []
    for key in word_frequence:
        temp = (key, word_frequence[key])
        word_frequence_list.append(temp)
    wordcloud = wordcloud.fit_words(dict(word_frequence_list))
    plt.imshow(wordcloud)
    plt.savefig("result.jpg")
def draw_wc(self, words, stopword=None, title=''):
    if stopword:
        data = self.make_df(words, stopword)
    else:
        data = self.make_df(words)
    bg_pic = imread('source/luhan.jpg')
    wordcloud = WordCloud(background_color='black', max_font_size=110,
                          mask=bg_pic, min_font_size=10, mode='RGBA',
                          font_path='source/SimHei.ttf')
    word_frequence = {x[0]: x[1] for x in data.values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.title(title, fontsize=16)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def word_count():
    """
    Draw the WordCloud.
    :return:
    """
    df_all_words = pd.DataFrame({"all_words": all_word_list})
    words_count = df_all_words.groupby(by=["all_words"])["all_words"].agg({"count": np.size})
    words_count = words_count.reset_index().sort_values(by=["count"], ascending=False)
    matplotlib.rcParams["figure.figsize"] = (10.0, 5.0)
    word_cloud = WordCloud(font_path="simhei.ttf", background_color="white",
                           max_font_size=80)
    word_frequency = {x[0]: x[1] for x in words_count.head(100).values}
    word_cloud = word_cloud.fit_words(word_frequency)
    plt.imshow(word_cloud)
    plt.show()
def plot_wordcloud(words_count):
    image = imread(blackground_pic)
    wordcloud = WordCloud(background_color="white", mask=image,
                          font_path='./data/simhei.ttf', max_words=5000, scale=1.5)
    wordcloud = wordcloud.fit_words(words_count)
    plt.figure(figsize=(10, 6), dpi=100)
    plt.axis("off")
    wordcloud.to_file('./data/wordcloud.png')
    plt.imshow(wordcloud)
    # plt.show()
    plt.close()
def get_word_list():
    with open("dan_mu.txt", encoding="utf-8") as f:
        word = f.read()
        word_list = word.split()

    # tokenize each entry with lcut()
    data_cut = [jieba.lcut(x) for x in word_list]
    print(data_cut)

    # 3. read the stopwords
    with open(r"cn_stopwords.txt", encoding="utf-8") as f:
        stop = f.read()
    stop = stop.split()
    stop = [" ", "道", "说道", "说"] + stop

    # 4. final tokens after removing stopwords
    s_data_cut = pd.Series(data_cut)
    all_words_after = s_data_cut.apply(lambda x: [i for i in x if i not in stop])

    # 5. count word frequencies
    all_words = []
    for i in all_words_after:
        all_words.extend(i)
    word_count = pd.Series(all_words).value_counts()

    # 6. draw the word cloud
    # 1) load the background image
    back_picture = imread("kakaxi.jpg")
    # 2) configure the word cloud
    wc = WordCloud(font_path="FZNiNSJW.TTF", background_color="white",
                   max_words=2000, mask=back_picture, max_font_size=200,
                   random_state=42)
    wc2 = wc.fit_words(word_count)
    # 3) plot it
    plt.figure(figsize=(16, 8))
    plt.imshow(wc2)
    plt.axis("off")
    plt.show()
    wc.to_file("ciyun.png")
def drawWordCloud(self, word_text, filename, dict_type=False, background_image='image/tom2.jpeg'):
    """
    :param word_text:
    :param filename:
    :param dict_type:
    :param background_image: image that gives the word cloud its shape
    :return:
    """
    mask = Image.open(BASE_DIR + background_image)
    mask = np.array(mask)
    my_wordcloud = WordCloud(
        background_color='white',    # background color
        mask=mask,                   # shape mask image
        max_words=2000,              # maximum number of words shown
        stopwords=STOPWORDS,         # stopword set
        font_path=self.system_font,  # font; without one, Chinese will not render
        max_font_size=50,            # largest font size
        random_state=30,             # number of random layout/color states
        scale=1.3)
    if not dict_type:
        my_wordcloud = my_wordcloud.generate(word_text)
    else:
        my_wordcloud = my_wordcloud.fit_words(word_text)
    image_colors = ImageColorGenerator(mask)
    my_wordcloud.recolor(color_func=image_colors)
    # display the image
    plt.imshow(my_wordcloud)
    plt.axis("off")
    # save the image
    if not self.from_web:
        my_wordcloud.to_file(filename=self.image_path + filename + '.jpg')
        print("result file path:", self.image_path + filename + '.jpg')
        plt.show()
    else:
        my_wordcloud.to_file(filename=self.web_image_bash_path + filename + '.jpg')
        print("result file path:", self.web_image_bash_path + filename + '.jpg')
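# The dict_type switch above reflects the two WordCloud entry points:
# generate() takes raw text and does its own tokenization and counting,
# while fit_words() takes precomputed {word: weight}. A minimal contrast:
from wordcloud import WordCloud

wc = WordCloud(background_color='white')
wc.generate("to be or not to be")                            # raw text in
wc.fit_words({"to": 2.0, "be": 2.0, "or": 1.0, "not": 1.0})  # weights in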
def word_segmentation():
    data = pd.read_csv(filepath, encoding='utf-8').fillna('')
    image = imread(os.path.join(PROJECT_ROOT, LAGOU.BACKGROUND))
    df = pd.read_csv(os.path.join(PROJECT_ROOT, LAGOU.STOPWORDS),
                     encoding='utf8', index_col=False)
    jieba.load_userdict(os.path.join(PROJECT_ROOT, LAGOU.USERDICT))  # add extra vocabulary
    stopwords = list(df['stopword'].unique())
    # add extra stopwords
    stopwords.extend([u'技术', u'高端', u'职位', u'企业', ',', '工程师', '类', '实施', 'IT'])

    # job-demand and skill vocabulary
    word_list = flatten([list(jieba.cut(row['firstType'])) for _, row in data.iterrows()])
    word_list.extend(flatten([list(jieba.cut(row['secondType'])) for _, row in data.iterrows()]))
    word_list.extend(flatten([list(jieba.cut(row['thirdType'])) for _, row in data.iterrows()]))
    word_list = [word for word in word_list if word not in stopwords]
    words_count = pd.value_counts(word_list).to_dict()
    print pd.value_counts(word_list)
    wordcloud = WordCloud(background_color="white", mask=image,
                          font_path=LAGOU.TTF, max_words=5000, scale=1.5)
    wordcloud = wordcloud.fit_words(words_count)
    plt.figure(figsize=(10, 6), dpi=100)
    plt.axis("off")
    wordcloud.to_file(filedir + 'data_wordcloud.png')
    plt.imshow(wordcloud)
    plt.show()
    plt.close()
def main():
    # fetch the first 10 pages of comments for the first movie
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentList.append(commentList_temp)

    # join the collected lists into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords (quoting=3 disables quote handling)
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # display as a word cloud
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    word_frequence_list = []
    for key in word_frequence:
        temp = (key, word_frequence[key])
        word_frequence_list.append(temp)
    wordcloud = wordcloud.fit_words(dict(word_frequence_list))  # fit_words needs a dict
    plt.imshow(wordcloud)
def show(words_stat):
    import matplotlib.pyplot as plt
    # %matplotlib inline
    import matplotlib
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    from wordcloud import WordCloud  # word cloud package

    # specify the font family, font size, and background color
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    word_frequence_list = []
    for key in word_frequence:
        temp = (key, word_frequence[key])
        word_frequence_list.append(temp)
    wordcloud = wordcloud.fit_words(dict(word_frequence_list))  # fit_words needs a dict
    plt.imshow(wordcloud)  # was "plt.im show", a typo
def analyze_data_and_generate_word_cloud():
    df_allwords_clean_dist = pd.DataFrame({'allwords': allwords_clean_dist})
    word_count = df_allwords_clean_dist.allwords.value_counts().reset_index()
    word_count.columns = ['word', 'count']
    wc = WordCloud(width=1024, height=768, background_color='white',
                   font_path="simhei.ttf", max_font_size=400, random_state=50)
    wc = wc.fit_words({x[0]: x[1] for x in word_count.head(100).values})
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    wc.to_file(path.join(path.dirname(__file__), "yuebing.png"))
def description(job_list, filename):
    comments = ''
    for k in range(len(job_list)):
        comments = comments + (str(job_list[k])).strip()
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    result = jieba.analyse.extract_tags(cleaned_comments, topK=100, withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    word_frequence = keywords
    myword = wordcloud.fit_words(word_frequence)
    myword.to_file('wordCloud/' + filename + '.png')
def statistics(words_df, moviename):
    words_stat = words_df.groupby(by=["segment"])["segment"].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
    print(words_stat)
    wordcloud = WordCloud(
        font_path="C:\\Windows\\Fonts\\msyh.ttc",
        background_color="white",
        max_font_size=150,
        width=1000,
        height=860,
        margin=2,
    )
    word_frequence = {x[0]: x[1] for x in words_stat.head(500).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show(block=False)
    img_name = "./" + moviename + ".jpg"
    wordcloud.to_file(img_name)
def make_wordclouds(self, commons):
    d = path.dirname(__file__)
    # note: joining with an absolute path discards d entirely
    msk = np.array(Image.open(path.join(
        d, "C:/Users/27438/PycharmProjects/huzhou/tss/asset/me.jpg")))
    # specify the font family, font size, and colors
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="#DDDDDD",
                          max_font_size=250, width=1920, height=1080, mask=msk)
    word_frequence = {x[0]: x[1] for x in self.data_clear(commons).head(200).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def word(words):
    """
    Generate the word-cloud image.
    :param words:
    :type words:
    :return:
    :rtype:
    """
    words_count = words.groupby(by=['word_cloud'])['word_cloud'].agg(["count"])
    words_count = words_count.reset_index().sort_values(by=["count"], ascending=False)
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    # background_color=None gives a transparent canvas only in RGBA mode
    wordcloud = WordCloud(font_path="./data/simhei.ttf", background_color=None,
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    wordcloud.to_file('./data/wordcloud.png')
    url = qiniu_picture()
    return url
def main():
    # fetch the first 10 pages of comments for the first movie
    commentList = []
    movieList = getLatestShowMovieList()
    for i in range(10):
        commentListTmp = getCommentsById(movieList[0]['id'], i + 1)
        commentList.append(commentListTmp)

    # join the collected lists into one string
    comments = ''
    for comm in commentList:
        comments = comments + str(comm)

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterData = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterData)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words = pd.DataFrame({'segment': segment})

    # # drop stopwords
    # stopwords = pd.read_csv("D:\Program\PythonCrawler\DouBan_Movie\stopword.txt",
    #                         index_col=False, quoting=3, sep="\t", names=['stopword'],
    #                         encoding='utf-8')  # quoting=3 disables quote handling
    # words = words[~words.segment.isin(stopwords.stopword)]

    # count word frequencies
    wordsFreq = words.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    wordsFreq = wordsFreq.reset_index().sort_values(by=["计数"], ascending=False)

    # display as a word cloud
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in wordsFreq.head(1000).values}
    word_frequence_list = []
    for key in word_frequence:
        temp = (key, word_frequence[key])
        word_frequence_list.append(temp)
    wordcloud = wordcloud.fit_words(dict(word_frequence_list))  # fit_words needs a dict
    plt.imshow(wordcloud)
print("--- LDA trained : %s minutes ---" % round(((time.time() - start_lda_time) / 60), 2))

#################################
##### Display WordCloud #########
#################################
curr_topic = 0
wc = WordCloud(background_color="black", max_words=2000, max_font_size=40,
               width=120, height=120, random_state=42)
for line in final_topics:
    line = line[1]
    scores = [float(x.split("*")[0]) for x in line.split(" + ")]
    words = [x.split("*")[1] for x in line.split(" + ")]
    freqs = []
    for word, score in zip(words, scores):
        freqs.append((word, score))
    elements = wc.fit_words(dict(freqs))  # fit_words expects a mapping
    fig = plt.figure()
    plt.imshow(elements)
    plt.axis("off")
    fig.savefig('images/topic' + str(curr_topic))
    curr_topic += 1
plt.show()

program = os.path.basename(sys.argv[0])
logging.info("finished running %s" % program)
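# Instead of parsing the "score*word + ..." strings that gensim prints, the
# per-topic weights can be read off the model directly; a hedged sketch
# assuming a recent gensim and a trained LdaModel named lda:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def topic_cloud(lda, topic_id, topn=30):
    # show_topic returns (word, probability) pairs for one topic
    freqs = dict(lda.show_topic(topic_id, topn=topn))
    wc = WordCloud(background_color="black", max_words=2000).fit_words(freqs)
    plt.imshow(wc)
    plt.axis("off")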
cur.execute(sql)
conn.commit()

blockSize = 10000
record = []
sql = u'%s,' * 207
sql = u'insert into TB_BEER_STEM_PERCENT values(' + sql
sql = sql[:-1] + u')'
for i in dfBeer.index.values:
    record.append((i,) + tuple(dfBeer.loc[i]))
    if i and not i % blockSize:
        # print sql
        cur.executemany(sql, record)
        record = []
        conn.commit()
if len(record):
    cur.executemany(sql, record)
    record = []
    conn.commit()

# Generate a word cloud
beer_mask = imread("/home/bobbruno/BeerApp/Analysis/beer-glass-mask.png", as_grey=True)
beer_mask.shape
pylab.rcParams['figure.figsize'] = (30.0, 40.0)
wc = WordCloud(width=800, height=500, background_color='white', ranks_only=False,
               # font_path='/usr/share/fonts/truetype/msttcorefonts/Verdana.ttf',
               color_func=my_color_func, mask=beer_mask, prefer_horizontal=0.1)
wc.fit_words(counts)
wc2 = ndimage.rotate(wc, -60, cval=255)
plt.imshow(wc2)
plt.tight_layout()
plt.axis("off")
plt.show()
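# ndimage.rotate() above works because a fitted WordCloud converts to an RGB
# numpy array; the array form is also available explicitly via to_array().
# A standalone sketch with toy weights:
from scipy import ndimage
from wordcloud import WordCloud

wc = WordCloud(width=400, height=200, background_color='white')
wc.fit_words({"ale": 3.0, "stout": 2.0, "lager": 1.0})
rotated = ndimage.rotate(wc.to_array(), -60, cval=255)  # white fill at the corners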
def ouput_word_cloud_by_frequency(frequency_list=None, outputFileName='word_cloud.png',
                                  maskFile=None, font_path='../resource/fonts/simhei.ttf',
                                  background_color='white', max_words=2000):
    pic_mask = np.array(Image.open(maskFile))
    wc = WordCloud(font_path=font_path, background_color=background_color,
                   max_words=max_words, mask=pic_mask)
    wc.fit_words(frequency_list)
    wc.to_file(outputFileName)
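# Possible invocation of the helper above (file names are hypothetical; the
# mask must be a real image because maskFile has no usable default):
ouput_word_cloud_by_frequency(frequency_list={"spring": 10.0, "boot": 6.0, "cloud": 3.0},
                              outputFileName="word_cloud.png",
                              maskFile="mask.png")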
def print_top_tokens(model, feature_names, n_top_words, category):
    """
    Prints top words for category to text file
    """
    with open('results_m4w_4.txt', 'wb') as fid:
        for topic_idx, topic in enumerate(model.components_):
            fid.write('\n')
            fid.write(category)
            fid.write('\n')
            fid.write("Topic #%d:" % topic_idx)
            fid.write(' '.join([feature_names[i]
                                for i in topic.argsort()[:-n_top_words - 1:-1]]))
            fid.write('\n')


def topic_word_freq(topics, idx, feature_names):
    """
    Calculates word frequencies
    Returns iterator of tuples
    """
    freq_sum = np.sum(topics[idx])
    frequencies = [val / freq_sum for val in topics[idx]]
    return zip(feature_names, frequencies)


def get_data():
    """
    Reads pandas dataframe from .pkl
    Returns dataframe
    """
    with open('dataframe_for_eda.pkl', 'rb') as fid:
        df = cPickle.load(fid)
    return df


def reduce_dimensions(total_mat, n_topics):
    """
    Calculates and returns nmf
    Input is data matrix, shape (n_samples, n_features)
    returns W array, shape (n_samples, n_components)
    """
    nmf = NMF(n_components=n_topics, random_state=42, alpha=.2, l1_ratio=0.5)
    nmf.fit(total_mat)
    X = nmf.transform(total_mat)
    w = nmf.components_
    return nmf


def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(70, 100)


if __name__ == '__main__':
    df = get_data()
    mask_path = False
    # Use tfidf features for NMF
    for category in df.category_code.unique().tolist():
        if category == 'mis':
            # df_cat = df.ix[df['category_code']==category, :]
            text_mat, text_features = md.tfidf_matrix(
                df.ix[df['category_code'] == category, 'total_text'])
            # Fit NMF
            n_samples = text_mat.shape[0]
            n_features = text_mat.shape[1]
            n_topics = 10
            n_top_words = 500
            print category
            print 'Fitting the NMF model with tf-idf features'
            nmf = reduce_dimensions(text_mat, n_topics)
            # print_top_tokens(nmf, text_features, n_top_words, category)
            word_freq = topic_word_freq(nmf.components_, 2, text_features)
            wc = WordCloud(stopwords=tp.custom_stop_words(), background_color='black',
                           max_words=n_top_words, width=2000, height=1800)
            wc.fit_words(word_freq)
            plt.figure()
            plt.imshow(wc)
            # wc.recolor(color_func=grey_color_func, random_state=3)
            # wc.to_file('background.png')
            plt.axis('off')
def unknowncoll(filename='unknownwords.p', stem=False):
    """
    Word cloud from sentiment analysis.

    Finds the bi-collocation of unknown words (words without sentiment) and
    displays the 10 most common words based on frequency in a word-cloud,
    colored green for words seen mostly in positive sentiments and red for
    the opposite. Comparison is made on all comments concatenated.

    -> filename: name of the file to load unknown words from
    -> stem: stem the words
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    subreddits = scraper.load_data('sub-reddits.txt')
    fullcomment = []
    print 'building comment'
    for name, data in subreddits.items():
        for sub_id, sub in data.items():
            fullcomment += [fixer(comment, True, stem).split()
                            for comment in sub.comments]
    print 'getting unknowns'
    unknownwords = unknownsent(filename)
    # flatten the comment structure
    fullcomment = [word for comment in fullcomment for word in comment]
    basefinder = BigramCollocationFinder.from_words(fullcomment)
    count = 0
    for unknown, unknownscore in unknownwords:
        finder = copy.copy(basefinder)
        print '\n' + unknown
        # only bigrams that contain the unknown word
        finder.apply_ngram_filter(lambda w1, w2: unknown != w1 and unknown != w2)
        wordcloud = WordCloud()
        wordcloud.font_path = r'C:\Windows\Fonts\comic.ttf'
        # trick the wordcloud to accept custom input
        wordcloud.generate('generate')
        colls = finder.score_ngrams(bigram_measures.raw_freq)
        colls = colls[:10]
        maximum = colls[1][1]
        # generate the tuple (word, score) for the wordcloud
        cloudwords = [(word, score) for ((word, _), score) in colls if word != unknown]
        cloudwords += [(word, score) for ((_, word), score) in colls if word != unknown]
        # normalize the scores
        cloudwords = [(word, score / maximum) for (word, score) in cloudwords]
        # tricking part 2.
        wordcloud.fit_words(cloudwords)
        wordcloud.to_image()
        if unknownscore > 0:
            wordcloud = wordcloud.recolor(color_func=green_color_func, random_state=3)
        else:
            wordcloud = wordcloud.recolor(color_func=red_color_func, random_state=3)
        count += 1
        plt.figure(count)
        plt.title(unknown)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig('plots/' + unknown + '.png', bbox_inches='tight')
        plt.close()
from wordcloud import WordCloud
import matplotlib.pyplot as plt

line = ('0.475*director + 0.302*band + 0.169*school + 0.014*student + '
        '0.011*ingredient + 0.000*aicf + 0.000*development + 0.000*foundation + '
        '0.000*parent + 0.000*life')
scores = [x.split("*")[0] for x in line.split(" + ")]
words = [x.split("*")[1] for x in line.split(" + ")]
freqs = []
w = []
curr_topic = 0
for word, score in zip(words, scores):
    freqs.append((word, float(score)))
    w.append(word)

wc = WordCloud()
cloud = wc.fit_words(dict(freqs))  # fit_words expects a mapping
plt.imshow(cloud)
plt.axis("off")
plt.show()
segStat = segmentDF.groupby(by=["segment"])["segment"].agg({
    "计数": numpy.size
}).reset_index().sort(columns=["计数"], ascending=False)
segStat.head(100)

# draw the word cloud
# http://www.lfd.uci.edu/~gohlke/pythonlibs/
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wordcloud = WordCloud(font_path='D:\\simhei.ttf', background_color="black")
words = segStat.set_index('segment').to_dict()
wordcloud = wordcloud.fit_words(words['计数'])
plt.imshow(wordcloud)
plt.close()
# remove words with just one occurrence
word_counts2 = defaultdict(lambda: 0)
total_count = float(total_count)
for word in word_counts:
    if word_counts[word] > 1:
        word_counts2[word] = word_counts[word] / total_count
word_counts2 = sorted(word_counts2.items(), key=itemgetter(1), reverse=True)

print "Creating wordcloud in wordcloud.png.."
print word_counts2
wordcloud = WordCloud(font_path='OpenSans-Bold.ttf', background_color='black',
                      width=1920, height=1080)
wordcloud.fit_words(word_counts2)
wordcloud.to_file('./wordcloud.png')

# generate email heatmap
print email_datetime
cols = ["Midnight", "6 AM", "Noon", "6 PM"]
rows = ['M', 'Tu', 'W', 'Th', 'F', 'Sa', 'Su']
plt.pcolor(email_datetime)
plt.xticks(np.arange(0, 24, 6) + 0.5, cols)
plt.yticks(np.arange(0, 7) + 0.5, rows)
plt.colorbar()
plt.title("When Are the Most Emails Being Received?")
plt.xlabel("Time of Day")
plt.ylabel("Day of Week")
plt.savefig("./heatmap.png", dpi=300)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = path.dirname(__file__)
with open('haizi.txt', 'r') as poet:
    s = poet.read()
seg_list = [x for x in jieba.cut(s)
            if len(x) > 1 and x not in [u'一个', u'一只', u'一样', u'一直', u'一种']]

alice_coloring = np.array(Image.open(path.join(d, "haizi.jpg")))
font = "/Library/Fonts/Lantinghei.ttc"
wc = WordCloud(background_color="white", font_path=font, mask=alice_coloring,
               max_font_size=80, random_state=42, scale=1.5)

# generate word cloud; a Counter is already the {word: count} mapping that
# fit_words expects, so there is no need to call .items()
wc.fit_words(Counter(seg_list))

# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()

# recolor wordcloud and show
# we could also give color_func=image_colors directly in the constructor
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
count = float(count)
print "Sum: %f" % count
words = []

# get the words from the whitelist and calculate their frequencies
sql = """SELECT word, count FROM word_whitelist ORDER BY `count` DESC"""
for word, frequency in query(sql):
    words.append((word, float(frequency) / count))

print "Creating cloud."
from scipy.misc import imread
mask = imread(MASK_FILE)

# generate the word cloud. This takes a while because the library is not parallelized.
wordcloud = WordCloud(font_path="/usr/share/fonts/truetype/msttcorefonts/Georgia.ttf",
                      ranks_only=True, max_words=len(words), mask=mask,
                      background_color="white")
wordcloud.fit_words(words)

print "Creating LOW RES image."
wordcloud.to_file(LOW_RES)

# now let's beef up the scale
wordcloud.scale = 12
print "Creating HI RES image."
img = wordcloud.to_image()
img.save(HIGH_RES, dpi=(100000, 100000))
# the source begins mid-statement; the "segStat = segmentDF" prefix below is
# reconstructed from the parallel snippet above
segStat = segmentDF.groupby(by="segment")['segment'].agg({
    'count': numpy.size}).reset_index().sort(columns=['count'], ascending=False)
segStat.to_csv('segStat.csv', encoding='utf-8')
print 'save segStat to segStat.csv...'
# segStat = pandas.read_csv('D:\\python\\code\\movie\\comments\\segStat.csv')
stopwords = pandas.read_csv(
    'D:\\python\\code\\movie\\comments\\stop.txt',
    error_bad_lines=False,
)
fSegStat = segStat[~segStat.segment.isin(stopwords.stopword)]
fSegStat.to_csv('fSegStat.csv', encoding='utf-8')
print 'save fSegStat to fSegStat.csv...'
# fSegStat = pandas.read_csv('D:\\python\\code\\movie\\comments\\fSegStat.csv', encoding='utf-8')
wordcloud = WordCloud(
    font_path='D:\\python\\simhei.ttf',
    background_color='black'
)
# fSegStat_file = pandas.read_csv('D:\\python\\code\\movie\\comments\\fSegStat.csv')
wordcloud = wordcloud.fit_words(fSegStat.itertuples(index=False))
plt.imshow(wordcloud)
plt.show()
plt.close()
__author__ = 'sandip'

import os
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

MODELS_DIR = "models"

final_topics = open("lda_topics.txt", 'r', encoding='ascii')
curr_topic = 0
lines = ""
for line in final_topics:
    lines = line
lines = lines.strip("[").strip("]")
for line in lines.split(","):
    scores = [float(x.split("*")[0].replace("'", "")) for x in line.split(" + ")]
    words = [x.split("*")[1] for x in line.split(" + ")]
    freqs = []
    for word, score in zip(words, scores):
        freqs.append((word, score))
    # set.add() returns None, so its result must not be passed straight to
    # the stopwords argument
    stopwords = set(STOPWORDS)
    stopwords.add("said")
    wc = WordCloud(background_color="white", max_words=2000, stopwords=stopwords)
    elements = wc.fit_words(dict(freqs))  # fit_words expects a mapping
    plt.imshow(elements)
    plt.axis("off")
    plt.show()
    curr_topic += 1
final_topics.close()