def test_unicode_stopwords():
    """Unicode and native-str stopwords must produce identical word counts."""
    cloud_from_unicode = WordCloud(stopwords=[u'Beautiful'])
    try:
        counts_unicode = cloud_from_unicode.process_text(unicode(THIS))
    except NameError:
        # Python 3 has no `unicode` builtin; THIS is passed unchanged.
        counts_unicode = cloud_from_unicode.process_text(THIS)

    cloud_from_str = WordCloud(stopwords=['Beautiful'])
    counts_str = cloud_from_str.process_text(str(THIS))

    assert counts_unicode == counts_str
def test_unicode_stopwords():
    """Check that stopword type (unicode vs. str) does not change the counts."""
    unicode_side = WordCloud(stopwords=[u'Beautiful'])
    try:
        unicode_result = unicode_side.process_text(unicode(THIS))
    except NameError:
        # Raised on Python 3, where `unicode` is not defined.
        unicode_result = unicode_side.process_text(THIS)

    str_side = WordCloud(stopwords=['Beautiful'])
    str_result = str_side.process_text(str(THIS))

    assert_true(unicode_result == str_result)
def test_process_text_default_patterns():
    """Check how ``min_word_length`` affects one-character tokens."""
    min_two_counts = WordCloud(
        stopwords=set(), include_numbers=True, min_word_length=2
    ).process_text(THIS)
    min_one_counts = WordCloud(
        stopwords=set(), include_numbers=True, min_word_length=1
    ).process_text(THIS)

    # Single-character tokens are absent with a minimum length of two ...
    for token in ("a", "3"):
        assert token not in min_two_counts
    # ... and present once the minimum length is one.
    for token in ("a", "3"):
        assert token in min_one_counts
def generate(text, mask, filtered_words):
    """Render a word cloud for *text* and return it as an in-memory PNG.

    Args:
        text: Text the cloud is generated from.
        mask: Forwarded to ``WordCloud`` as ``mask``.
        filtered_words: Forwarded to ``WordCloud`` as ``stopwords``.

    Returns:
        A ``BytesIO`` holding the PNG data, rewound to offset 0.
    """
    wc = WordCloud(max_words=4000, mask=mask, repeat=False,
                   stopwords=filtered_words)
    # generate() tokenizes the text itself; the separate process_text() call
    # that used to precede it only computed a result that was thrown away.
    wc.generate(text)

    buffer = BytesIO()
    wc.to_image().save(buffer, 'png')
    buffer.seek(0)
    return buffer
def generate_wordcloud(data: list):
    """Generate a word cloud image and print the 50 most frequent words.

    Args:
        data: Sequence where ``data[5]`` is an iterable of strings to join
            into the cloud text and ``data[0]`` is used in the output name.

    Returns:
        The name of the saved image, ``'<data[0]>_wordcloud.png'``.
    """
    text = ' '.join(data[5])
    stopwords = {
        'be', 'am', 'are', 'is', 'was', 'were', 'being', 'been', 'can',
        'could', 'dare', 'do', 'does', 'did', 'have', 'has', 'had', 'having',
        'may', 'might', 'must', 'need', 'ought', 'shall', 'should', 'will',
        'would', 'he', 'she', 'they', 'i', 'me', 'my', 'mine', 'you', 'yours',
        'their', 'a',
    }
    wc = WordCloud(
        background_color='white',
        # Raw string: the backslashes are path separators, not escapes.
        font_path=r'C:\Windows\Fonts\Arial.TTF',
        max_words=2000,
        max_font_size=150,
        random_state=30,
        stopwords=stopwords,
    )

    # Tokenize once and reuse the counts for both the cloud and the report.
    word_counts = wc.process_text(text)
    wc.generate_from_frequencies(word_counts)
    ranked = sorted(word_counts.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])

    # NOTE: the earlier recolor step passed the string 'white' to
    # ImageColorGenerator, which needs an image array; no background image is
    # loaded in this function, so the cloud keeps its default colors.

    img_name = '{}_wordcloud.png'.format(data[0])
    # Draw the cloud before saving; saving first wrote an empty figure, and
    # the object to draw is `wc` (the name `wordcloud` is not defined here).
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(img_name, dpi=180)
    plt.show()
    return img_name
def gera_word_cloud(cod_assunto, nome_assunto):
    """Build and save a word cloud for the documents of one subject code.

    Selects up to 2000 rows of the module-level ``df_amostra`` whose
    ``cd_assunto_nivel_3`` equals ``cod_assunto``, runs ``processa_texto`` on
    each ``texto_processado`` value in a 7-process pool, and writes the
    resulting cloud to a PNG named after the code and subject name.

    Args:
        cod_assunto: Subject code to filter on; also part of the file name.
        nome_assunto: Subject name used in the output file name.
    """
    docs = df_amostra[df_amostra['cd_assunto_nivel_3'] == cod_assunto].head(2000)

    # try/finally releases the workers even when processa_texto raises. The
    # results stay in a plain list, so no column is assigned on a filtered
    # slice of df_amostra.
    pool = mp.Pool(7)
    try:
        textos_processados = pool.map(
            processa_texto, list(docs['texto_processado']))
    finally:
        pool.close()
        pool.join()

    # A single join replaces the per-row string concatenation.
    texto_docs = ' '.join(textos_processados)

    wc = WordCloud(background_color="white",
                   max_words=200,
                   width=1024,
                   height=350,
                   stopwords=stopwords_processadas,
                   collocations=False,
                   colormap="twilight",
                   normalize_plurals=False)
    frequencias = wc.process_text(texto_docs)
    wc.generate_from_frequencies(frequencias)
    wc.to_file(
        "/media/DATA/classificadorDeAssuntos/Dados/Resultados/word_clouds/word_cloud_"
        + str(cod_assunto) + "_" + nome_assunto + ".png")
def plot_wordcloud(data, selected_job):
    """Build a word cloud from the qualifications of one job category.

    Args:
        data: DataFrame with ``Category``, ``Basic_Qualifications`` and
            ``Preferred_Qualifications`` columns.
        selected_job: Category value whose rows are used.

    Returns:
        Tuple ``(image, frequencies)``: the rendered cloud image and the
        word-frequency dict it was built from.
    """
    selected_data = data.loc[data.Category == selected_job].reset_index(
        drop=True)

    # Join columns and rows with an explicit space. Concatenating them
    # directly glues the last word of one text to the first word of the next
    # and counts the result as a single bogus token.
    all_qualifications = (selected_data.Basic_Qualifications + ' '
                          + selected_data.Preferred_Qualifications)
    selected_qualifications = ' '.join(all_qualifications)

    my_stopwords = {
        'and', 'experience', 'e', 'g', 'in', 'a', 'years', 'of', 'with',
        'ability', 'to', 'such', 'as', 'working', 'the', 'related', 'field',
        'or', 'work', 'for', 'using', 'etc', 'other', 'At', 'least',
        'similar', 'equivalent', 's', 'on', 'M', 'one', 'degree', 'knowledge',
        'building', 'strong', 'skill', 'skills', 'relevant', 'advanced', 'R',
        'demonstrated', 'tools', 'proficiency', 'environment', 'technical',
        'engineering', 'an', 'Amazon', 'i', 'Minimum', 'education',
        'reporting', 'highly', 'is', 'including', 'detail', 'this', 'role',
        'Meets', 'exceeds', 'project', 'able'
    }
    wc = WordCloud(background_color='white',
                   min_font_size=8,
                   prefer_horizontal=1,
                   stopwords=my_stopwords)

    # Tokenize once and feed the counts to the layout step.
    frequencies = wc.process_text(selected_qualifications)
    wc.generate_from_frequencies(frequencies)
    return wc.to_image(), frequencies
def create_wordcloud(df):
    """Generate a word cloud of subway station names.

    Segments every value of ``df['station']`` with jieba, renders the cloud
    inside the shape of ``rocket.jpg`` using that image's colors, prints the
    50 most frequent words and saves the cloud to a JPEG file.

    Args:
        df: DataFrame with a ``station`` column of strings.
    """
    # Word segmentation; a single join replaces repeated string concatenation.
    text = ' '.join(
        ' '.join(jieba.cut(line, cut_all=False)) for line in df['station'])

    backgroud_Image = plt.imread('rocket.jpg')
    wc = WordCloud(
        background_color='white',
        mask=backgroud_Image,
        # Raw string: the backslashes are path separators, not escapes.
        font_path=r'C:\Windows\Fonts\华康俪金黑W8.TTF',
        max_words=1000,
        max_font_size=150,
        min_font_size=15,
        prefer_horizontal=1,
        random_state=50,
    )

    # Tokenize once; the same counts drive the layout and the report below.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)
    wc.recolor(color_func=ImageColorGenerator(backgroud_Image))

    # Show which words are the most frequent.
    ranked = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])

    plt.imshow(wc)
    plt.axis('off')
    wc.to_file("地铁名词云.jpg")
    print('生成词云成功!')
def get_wc(df: pd.DataFrame, field: str) -> (dict, WordCloud):
    """Return a dict with the word frequencies and the word cloud."""
    # One accent-stripped line per row of the selected column.
    words = "".join(
        remove_accents(str(row)) + "\n" for row in df[field].values)

    # Custom stopwords used in addition to the ones shipped with wordcloud.
    # Prepositions are removed because they add little information.
    prepositions = {
        "el", "para", "en", "de", "la", "del", "nan",
        "los", "las", "se", "con", "al", "es", "lo",
    }
    # Escaped HTML leftovers.
    escaped_html = {"nbsp", "li", "br", "aacute", "hr", "col"}
    stopwords = (prepositions | escaped_html).union(STOPWORDS)

    wc = WordCloud(
        width=800,
        height=800,
        stopwords=stopwords,
        background_color='white',
        collocations=False,  # avoid collocations (e.g. "en casa", "en venta")
    )
    freq = wc.process_text(words)
    # To print the frequencies in order, sort freq.items() by count,
    # descending.
    wc.fit_words(freq)
    return freq, wc
def test_generate_from_frequencies():
    """generate_from_frequencies() accepts a dict and returns a WordCloud."""
    cloud = WordCloud(max_words=50)
    frequencies = cloud.process_text(THIS)
    generated = cloud.generate_from_frequencies(frequencies)
    assert_true(isinstance(generated, WordCloud))
def test_process_text():
    """process_text() must return a dict."""
    counts = WordCloud(max_words=50).process_text(THIS)
    # Only the return type is checked here.
    assert_true(isinstance(counts, dict))
def creat_wordcloud(df):
    """Generate a word cloud from the ``title`` column of *df*.

    Each title is segmented with jieba. The cloud is shaped and colored by
    ``data/image.jpg``, the 50 most frequent words are printed, and the
    result is saved as a JPEG.

    Args:
        df: DataFrame with a ``title`` column of strings.
    """
    # A single join replaces the repeated string concatenation.
    text = ' '.join(
        ' '.join(jieba.cut(line, cut_all=False)) for line in df['title'])

    background_Image = plt.imread('data/image.jpg')
    wc = WordCloud(background_color='white',
                   mask=background_Image,
                   font_path='msyh.ttc',
                   max_words=1000,
                   max_font_size=150,
                   min_font_size=15,
                   prefer_horizontal=1,
                   random_state=50)

    # Tokenize once and reuse the counts for the layout and the report.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)
    wc.recolor(color_func=ImageColorGenerator(background_Image))

    ranked = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])

    plt.imshow(wc)
    plt.axis('off')
    wc.to_file('商家标题词云.jpg')
    print('生成词云成功')
def make_wordcloud(readtext, imagename):
    """Show a word cloud for a UTF-8 text file and print its top 50 words.

    Args:
        readtext: Path of the text file (read as ``utf-8-sig``).
        imagename: Passed to ``WordCloud`` as ``mask``.
    """
    # The context manager closes the file as soon as it has been read.
    with open(readtext, encoding='utf-8-sig') as source:
        text_from_file_with_apath = source.read()

    # jieba Chinese word segmentation (full mode), space separated.
    wl_space_split = " ".join(
        jieba.cut(text_from_file_with_apath, cut_all=True))

    # Stopwords.
    stopwords = {'https:imgur', 'https', 'imgur', 'jpg', 'com', 'dcard',
                 'tw', 'www', 'http', 'png'}

    # max_font_size: largest font size used in the cloud.
    my_wordcloud = WordCloud(max_font_size=35,
                             mask=imagename,
                             stopwords=stopwords,
                             font_path='C:/Windows/Fonts/MSYH.TTC')

    # Tokenize once; the counts both build the cloud and are printed so that
    # further stopwords can be picked from the most frequent words.
    process_word = my_wordcloud.process_text(wl_space_split)
    my_wordcloud.generate_from_frequencies(process_word)
    ranked = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])

    plt.imshow(my_wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
def ana_result():
    """Build a word cloud from column 1 of the comments CSV and save it.

    The cloud is shaped and colored by the background image and written to
    ``result.jpg`` in the same hard-coded directory.
    """
    csv_path = r'E:\pytotal\bilibllcomments\result.csv'
    df = pd.read_csv(csv_path, header=None)

    # Segment with jieba. Rows are joined with a space so that the last token
    # of one row is not glued to the first token of the next.
    text = ' '.join(' '.join(jieba.cut(line)) for line in df[1])

    # Generate the word cloud.
    bgi = pt.imread(r'E:\pytotal\bilibllcomments\佩奇.jpg')
    wc = WordCloud(
        background_color='white',
        mask=bgi,
        max_words=2000,
        max_font_size=80,
        random_state=30,
    )
    wc.generate_from_text(text)
    # A sorted frequency list used to be computed here but was never used.
    wc.recolor(color_func=ImageColorGenerator(bgi))

    pt.imshow(wc)
    pt.axis('off')
    wc.to_file(r'E:\pytotal\bilibllcomments\result.jpg')
    print('done')
def create_wordcloud(pandas_series, max_words=2000, max_font_size=None, filepath=None):
    """Build a word cloud of the "Complaint Type" values, shaped by a mask.

    Args:
        pandas_series: Object supporting ``sample(frac=1)["Complaint Type"]``.
        max_words: Forwarded to ``WordCloud``.
        max_font_size: Forwarded to ``WordCloud``.
        filepath: When truthy, the cloud is written to this path.
    """
    silhouette = np.array(Image.open("images/NYC_silhouette.png"))
    silhouette[silhouette > 0] = 255

    # The WordCloud library uses bigrams; without shuffling the data we would
    # see noise like: Noise Residential, Residential Noise
    shuffled = pandas_series.sample(frac=1)["Complaint Type"]

    cloud = WordCloud(
        width=3000,
        height=2000,
        max_words=max_words,
        max_font_size=max_font_size,
        background_color='black',
        stopwords=STOPWORDS,
        random_state=1,
        mask=silhouette,
        contour_width=3,
        contour_color='white',
    )

    joined_text = " ".join(shuffled.astype(str).values)
    frequencies = cloud.process_text(joined_text)
    cloud.generate_from_frequencies(frequencies)

    if filepath:
        cloud.to_file(filepath)
def wc_chinese():
    """Render a word cloud for ``langchao2.txt`` shaped by ``circle.jpg``.

    Prints the 50 most frequent words (useful for tuning the stopwords),
    recolors the cloud from the background image, saves the figure as a PNG
    and shows it. Paths are resolved against the module-level ``d``.
    """
    # The context manager closes the file as soon as it has been read.
    with open(path.join(d, 'langchao2.txt'), encoding='UTF-8-SIG') as source:
        text = source.read()

    # Raw string: the backslashes are path separators, not escapes.
    font_path = r'C:\Windows\Fonts\STXIHEI.TTF'
    background_Image = np.array(Image.open(path.join(d, "circle.jpg")))
    img_colors = ImageColorGenerator(background_Image)
    stopwords = set()
    wc = WordCloud(
        font_path=font_path,
        margin=2,
        mask=background_Image,
        scale=2,
        max_words=200,
        min_font_size=4,
        max_font_size=100,
        stopwords=stopwords,
        random_state=42,
        background_color='white',
    )

    # Tokenize once; the counts build the cloud and give the ranking below.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)

    # Ranked word list; adjust the stopwords based on it.
    ranked = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])  # the 50 most frequent words

    wc.recolor(color_func=img_colors)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('浪潮basic2.png', dpi=200)
    plt.show()
def test_process_text():
    """process_text() must return a dict."""
    counts = WordCloud(max_words=50).process_text(THIS)
    # Only the return type is checked here.
    assert isinstance(counts, dict)
def test_generate_from_frequencies():
    """generate_from_frequencies() accepts a dict and returns a WordCloud."""
    cloud = WordCloud(max_words=50)
    frequencies = cloud.process_text(THIS)
    generated = cloud.generate_from_frequencies(frequencies)
    assert isinstance(generated, WordCloud)
def generate_from_str(text_str, output_name):
    """Compute word frequencies for *text_str* and dump them as JSON.

    The frequencies are printed and written to
    ``./wordcloud_json/<output_name>_<YYYY-MM-DD>.json``.

    Args:
        text_str: Text to count words in. No segmentation is applied here;
            presumably the caller passes already-segmented text (TODO confirm).
        output_name: Prefix of the JSON file name.
    """
    now = time.strftime('%Y-%m-%d', time.localtime())
    wc = WordCloud(
        font_path='./fonts/msyh.ttc',
        background_color='white',
        width=4961,  # A3
        height=3508,
        max_words=1000,
        color_func=colorFunc)

    print('计算词频...')
    freqs = wc.process_text(text_str)
    print(freqs)

    print('生成词频json...')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./wordcloud_json', exist_ok=True)
    json_path = './wordcloud_json/' + output_name + '_' + now + '.json'
    with open(json_path, 'w', encoding='utf-8') as jsonf:
        json.dump(freqs, jsonf, ensure_ascii=False)
def create_wordcloud(df, save_name=None):
    """Create a word cloud based on an enrichment array.

    Every row of ``df`` is cleaned with ``_cleanup_term_name`` and the
    results are joined into one large string, which is passed to the
    ``wordcloud`` package to generate the cloud.

    Parameters
    ----------
    df : pd.DataFrame
        Must have column 'term_name'.
    save_name : str, optional
        When given, the cloud is plotted and saved as ``<save_name>.png``.

    Returns
    -------
    WordCloud
        The generated cloud with three extra attributes: ``plot`` (a bound
        method that draws and optionally saves the figure), ``word_dict``
        (the word counts) and ``data`` (the counts as a DataFrame sorted by
        ``counts``, descending).
    """
    data = df.apply(_cleanup_term_name, axis=1)
    text = ' '.join(data)

    # Generate a word cloud image
    wc = WordCloud(margin=0, background_color=None, mode='RGBA',
                   width=800, height=600, collocations=True,
                   stopwords=basic_words)
    # Tokenize once and build the cloud from those counts instead of letting
    # generate() tokenize the same text a second time.
    word_dict = wc.process_text(text)
    wordcloud = wc.generate_from_frequencies(word_dict)

    def plot(self, save_name=None, figsize=(8, 5)):
        """Draw the cloud; save it as ``<save_name>.png`` when a name is given."""
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
        ax.imshow(self, interpolation='bilinear')
        plt.xticks([])
        plt.yticks([])
        plt.axis("off")
        if save_name is not None:
            plt.savefig('{}.png'.format(save_name), bbox_inches='tight',
                        dpi=150)
            plt.title(save_name)
        return fig

    wordcloud.plot = types.MethodType(plot, wordcloud)
    if save_name is not None:
        wordcloud.plot(save_name)

    wordcloud.word_dict = word_dict
    df1 = pd.DataFrame(list(word_dict.items()), columns=['words', 'counts'])
    df1.sort_values('counts', ascending=False, inplace=True)
    wordcloud.data = df1
    return wordcloud
def test_generate_from_frequencies():
    """generate_from_frequencies() takes a 'dict_items' argument."""
    cloud = WordCloud(max_words=50)
    frequency_items = cloud.process_text(THIS).items()
    generated = cloud.generate_from_frequencies(frequency_items)
    assert_true(isinstance(generated, WordCloud))
def get_comments_wordcloud(self):
    """Interactively build a word cloud from the comments of one series.

    Prompts for a series name until it matches a row of the ``media`` table,
    segments that series' comment data with jieba, renders a cloud masked and
    colored by ``词云相关数据/<name>.jpeg``, prints the 50 most frequent words,
    saves the cloud as ``<name>.png`` next to the source image, and overlays
    the two images.

    Returns:
        True once the overlay has been produced.
    """
    print('请输入需要生产词云的番剧名称:')
    while True:
        fanOperaname = str(input())
        # The name comes straight from the user, so it is passed as a bound
        # parameter instead of being interpolated into the SQL text.
        # NOTE(review): assumes the driver uses the %s paramstyle
        # (MySQL/PostgreSQL drivers); sqlite3 would need "?" - confirm.
        sql = "select media_id from media where media_name = %s"
        self.cursor.execute(sql, (fanOperaname,))
        temp = self.cursor.fetchall()
        if temp:
            result = temp[0][0]
            break
        print('输入番剧名称有错,请重新输入:')

    image_stem = '词云相关数据/' + str(fanOperaname)
    image_path = image_stem + '.jpeg'
    data = self.get_data(result)
    # Precise-mode segmentation with jieba; returns a list.
    data = jieba.lcut(data, cut_all=False)
    text = '。'.join(data)
    # Load the stopwords.
    stop_words = self.load_stopwords()
    # A Chinese font is required, otherwise the cloud shows garbled glyphs.
    font_path = 'SourceHanSansCN-Regular.ttf'
    # Background image used as the mask of the cloud.
    background_image = numpy.array(Image.open(image_path))
    # Take colors from the background image: each region colors its words.
    img_colors = ImageColorGenerator(background_image)
    stopwords = set(stop_words)
    wc = WordCloud(
        font_path=font_path,
        margin=2,
        mask=background_image,
        scale=2,
        max_words=400,
        min_font_size=4,
        stopwords=stopwords,
        random_state=42,
        background_color='white',
        max_font_size=100,
    )

    # Tokenize once; the counts build the cloud and give the ranking below.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)
    ranked = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])

    wc.recolor(color_func=img_colors)
    # Built from the stem rather than split('.')[0], which would cut the
    # name short for titles that contain a dot.
    ciyun_image = image_stem + '.png'
    wc.to_file(ciyun_image)
    # Overlay the word-cloud image on the original image.
    self.imageOverlay(image_path, ciyun_image)
    return True
def word_cloud(self, show_plot=True):
    """Return word frequencies of ``self.text``; optionally display the cloud.

    Args:
        show_plot: When truthy, the generated cloud is shown with matplotlib.

    Returns:
        The frequency dict produced by ``WordCloud.process_text``.
    """
    cloud_builder = WordCloud()
    frequencies = cloud_builder.process_text(' '.join(self.text))
    if not show_plot:
        return frequencies

    rendered = cloud_builder.generate_from_frequencies(frequencies)
    plt.figure()
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    return frequencies
def analysis9(data):
    """Render a word cloud of the article titles in ``data['title']``.

    Titles are stripped of punctuation, segmented with jieba and drawn
    inside the shape of ``tiger.png`` with that image's colors. The 50 most
    frequent words are printed so that unwanted ones can be added to the
    stopwords. The figure is saved as a PNG and shown.

    Args:
        data: DataFrame with a ``title`` column of strings.
    """
    jieba.load_userdict("userdict.txt")
    jieba.add_word('区块链')

    # Characters to strip from every title; defined once, outside the loop.
    symbol_to_replace = '[!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    # Titles are joined with a space so that the last token of one title is
    # not glued to the first token of the next.
    text = ' '.join(
        ' '.join(jieba.cut(re.sub(symbol_to_replace, '', title), cut_all=False))
        for title in data['title'].values)

    # NOTE: inside a function "__file__" is never in locals(), so this
    # resolves to the current working directory; kept as is on purpose.
    d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
    background_Image = np.array(Image.open(path.join(d, "tiger.png")))
    # Raw string: the backslashes are path separators, not escapes.
    font_path = r'C:\Windows\Fonts\SourceHanSansCN-Regular.otf'

    # Stopwords: run the frequency ranking below first, then pick the words
    # to add here.
    stopwords = {
        '如何', '怎么', '一个', '什么', '为什么', '还是', '我们', '为何', '可能', '不是',
        '没有', '哪些', '成为', '可以', '背后', '到底', '就是', '这么', '不要', '怎样',
        '为了', '能否', '你们', '还有', '这样', '这个', '真的', '那些',
    }
    wc = WordCloud(
        background_color='black',
        font_path=font_path,
        mask=background_Image,
        stopwords=stopwords,
        max_words=200,
        margin=2,
        max_font_size=100,
        random_state=42,
        scale=2,
    )

    # Tokenize once; the counts build the cloud and give the ranking below.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)
    ranked = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])  # top 50 words, used to choose further stopwords

    img_colors = ImageColorGenerator(background_Image)
    wc.recolor(color_func=img_colors)  # colors follow the image colors
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()  # trim the blank margins automatically
    plt.savefig('huxiu5.png', dpi=200)
    plt.show()
def generate_wordclouds(candidate):
    """Render total/positive/negative word clouds for one candidate.

    For each of the three texts a cloud shaped by ``brazil_mask.png`` is
    generated, recolored by the matching word scores, shown and saved under
    ``../imagens``. When ``candidate['type']`` is ``'csv'`` the word
    frequencies of the total text are also written to a CSV file.

    Args:
        candidate: Mapping with the keys ``text``, ``text_positive``,
            ``text_negative``, ``score_palavras``, ``score_palavras_pos``,
            ``score_palavras_neg``, ``type`` and ``candidate``.
    """
    texts = {
        'total': candidate['text'],
        'positive': candidate['text_positive'],
        'negative': candidate['text_negative']
    }
    scores = {
        'total': candidate['score_palavras'],
        'positive': candidate['score_palavras_pos'],
        'negative': candidate['score_palavras_neg']
    }
    brazil_mask = np.array(Image.open("brazil_mask.png"))

    # One color function per text kind; each word's score selects its color.
    colors = {
        label: SimpleGroupedColorFunc(
            {p[0]: color_to_use[math.floor(p[1] * 9.99)] for p in word_scores},
            default_color)
        for label, word_scores in scores.items()
    }

    wordcloud = WordCloud(width=1200,
                          height=1200,
                          background_color="white",
                          mask=brazil_mask,
                          stopwords=stopwords_pt,
                          font_path='/Library/Fonts/Times New Roman.ttf')
    source_prefix = 'pyspark' if candidate['type'] == 'csv' else 'azure'

    for label, text in texts.items():
        wordcloud_total = wordcloud.generate(text)
        wordcloud_total.recolor(color_func=colors[label])
        plt.axis("off")
        plt.imshow(wordcloud_total, interpolation="bilinear")
        plt.show()
        wordcloud_total.to_file('../imagens/{}_{}_{}.png'.format(
            source_prefix, candidate['candidate'], label))

    # Generate the CSV.
    if candidate['type'] == 'csv':
        freq = wordcloud.process_text(texts['total'])
        csv_path = '../data/frequency/pyspark_frequency_{}.csv'.format(
            candidate['candidate'])
        # newline='' lets the csv module control line endings; without it
        # every row is followed by a blank line on Windows.
        with open(csv_path, 'w', encoding='utf-8', newline='') as f:
            csv_out = csv.writer(f)
            csv_out.writerow(['Word', 'Count'])
            csv_out.writerows(freq.items())
def xnet_input_to_communities_wordcloud(input_file, output_file, minYear=minYear, minKCore=minKCore):
    """Draw one word cloud per community of a filtered xnet graph.

    Vertices older than ``minYear`` or below ``minKCore`` are removed and
    only the giant weakly connected component is kept. For each of the most
    frequent communities, word counts of its abstracts are divided by the log
    of the counts over all abstracts, and the resulting cloud is drawn in its
    own subplot. The figure is saved to ``output_file``.

    Args:
        input_file: Path of the xnet file to load.
        output_file: Path the figure is saved to.
        minYear: Vertices with a smaller ``year`` are dropped.
        minKCore: Vertices with a smaller ``KCore`` are dropped.
    """
    graph = xn.xnet2igraph(input_file)

    too_old = np.array(graph.vs["year"]) < minYear
    too_peripheral = np.array(graph.vs["KCore"]) < minKCore
    verticesToDelete = np.where(np.logical_or(too_old, too_peripheral))[0]
    graph.delete_vertices(verticesToDelete)
    graph = graph.clusters(mode="WEAK").giant()

    topCommunities = sortByFrequency(graph.vs["Community"])[0:maxCommunities]
    fig = plt.figure(figsize=(20, 5 * math.ceil(len(topCommunities) / 2)))

    # Reference frequencies computed over every abstract in the graph.
    globalCounts = WordCloud(max_words=maxAllWords).process_text(
        "\n".join(graph.vs["paper_abstract"]))

    # Mask with a 10-pixel frame set to 255 on all four edges.
    frameMask = np.zeros((500, 1000), dtype='B')
    frameMask[:10, :] = 255
    frameMask[-10:, :] = 255
    frameMask[:, :10] = 255
    frameMask[:, -10:] = 255

    for position, community in enumerate(topCommunities):
        if position < len(_styleColors):
            communityColor = _styleColors[position]
        else:
            communityColor = "#aaaaaa"

        memberAbstracts = [
            vertex["paper_abstract"] for vertex in graph.vs
            if vertex["Community"] == community
        ]
        abstracts = "\n".join(memberAbstracts)

        plt.subplot(math.ceil(len(topCommunities) / 2), 2, position + 1)
        wc = WordCloud(background_color="white",
                       max_words=maxInternalWords,
                       width=1000,
                       height=500,
                       mask=frameMask,
                       contour_width=10,
                       contour_color=communityColor,
                       random_state=3,
                       color_func=generateColorFunction(communityColor))

        relativeFrequencies = {}
        for key, frequency in wc.process_text(abstracts).items():
            if key in globalCounts:
                relativeFrequencies[key] = frequency / math.log(
                    globalCounts[key] + 1)

        wc.generate_from_frequencies(relativeFrequencies)
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close(fig)
def topWord(self, top=100):
    """Return the ``top`` most frequent words of ``self.sentense``.

    Args:
        top: Maximum number of words to return.

    Returns:
        Dict mapping word to count for the most frequent words, in
        descending count order. When fewer than ``top`` words exist, all of
        them are returned and a hint is printed.
    """
    wc = WordCloud()
    result = wc.process_text(self.sentense)
    ranked = sorted(result.items(), key=lambda item: item[1], reverse=True)

    # Slicing replaces the index loop wrapped in a bare `except`, which also
    # hid unrelated errors. max() keeps a non-positive `top` empty, as the
    # range()-based loop did.
    top_dict = dict(ranked[:max(top, 0)])
    if len(ranked) < top:
        print('not enough %s words, please set top parameter down'%(top))
    print(top_dict)
    return top_dict
def show_worldcloud(self, stopwords):
    """Show a word cloud of the text of ``self.tweets``.

    The computed word frequencies are stored in ``self.words``.

    Args:
        stopwords: Extra words to exclude, added to the package defaults.
    """
    stopwords = set(STOPWORDS) | set(stopwords)
    wc = WordCloud(width=800, height=400, stopwords=stopwords,
                   collocations=False)
    a = " ".join(x.text for x in self.tweets)
    self.words = wc.process_text(a)
    wc.generate_from_frequencies(self.words)

    # The default figure size only applies to figures created after it is
    # set, so it has to be assigned before imshow() creates the figure.
    plot.rcParams["figure.figsize"] = (20, 10)
    plot.imshow(wc)
    plot.axis("off")
    plot.show()
def get_wordcloud_from_wordlist(wordlist, background_image='background', slow_connection_mode=False):
    """Build a masked word cloud from a list of words and save it as a PNG.

    Args:
        wordlist: Iterable of words; joined with spaces before counting.
        background_image: Image opened with PIL and used as the mask.
        slow_connection_mode: When truthy, pastel colors are used and the
            image is shrunk to 400x400 with an 8-color palette before saving.

    Returns:
        Tuple ``(wordcloud, words)``: the cloud and its frequency dict.
    """
    from PIL import Image

    font_file = "/usr/share/fonts/opentype/noto/NotoSansCJK-Medium.ttc"

    # Stop word configuration
    excluded = {
        'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'ない', 'くれる', 'やる', 'くださる',
        'そう', 'せる', 'した', 'して', 'て', 'に', 'を', 'は', 'の', 'が', 'と', 'た',
        'し', 'で', 'も', 'な', 'い', 'か', 'こと', 'これ', 'それ', 'ここ', 'もの', 'ので',
        'よう', 'いい', '思う', '人', '気', '何', '私', '僕', '自分', 'やつ', 'さん', 'くん',
        'ちゃん', '今日', '今', 'とき', 'まだ', 'もう', 'みたい',
    }

    mask_pixels = np.array(Image.open(background_image))
    pastel_palette = ["hsl({}, 25%, 66%)".format(hue)
                      for hue in (0, 60, 120, 180)]

    def pastel_color_func(word, font_size, position, orientation,
                          random_state=None, **kwargs):
        """Pick one of the four pastel colors at random."""
        import random
        index = random.randint(0, 3)
        return pastel_palette[index]

    # Image-derived colors are only built when they are going to be used.
    if slow_connection_mode:
        chosen_color_func = pastel_color_func
    else:
        chosen_color_func = ImageColorGenerator(mask_pixels)

    wordcloud = WordCloud(
        regexp=r"\w[\w']*",
        normalize_plurals=False,
        background_color="white",
        font_path=font_file,
        mask=mask_pixels,
        color_func=chosen_color_func,
        scale=1.5,
        stopwords=excluded,
    )

    words = wordcloud.process_text(' '.join(wordlist))
    wordcloud.generate_from_frequencies(words)

    if not slow_connection_mode:
        wordcloud.to_file("/tmp/wordcloud.png")
        return wordcloud, words

    rendered = wordcloud.to_image()
    shrunk = rendered.resize((400, 400), resample=Image.BOX)
    paletted = shrunk.convert(mode="P", palette=Image.ADAPTIVE, colors=8)
    paletted.save('/tmp/wordcloud.png')
    return wordcloud, words
def wc_english():
    """Render a grey-toned word cloud of ``legend1900.txt``.

    The cloud is shaped by ``mask1900.jpg``, recolored with the module-level
    ``grey_color_func``, shown with matplotlib and saved as a PNG.
    """
    # Base directory for the data files.
    # NOTE: inside a function "__file__" is never in locals(), so this
    # resolves to the current working directory; kept as is on purpose.
    d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

    # Read the text; the context manager closes the file afterwards.
    with open(path.join(d, r'world_cloud\file\legend1900.txt')) as source:
        text = source.read()

    # Read the background image.
    background_Image = np.array(
        Image.open(path.join(d, r'world_cloud\mask1900.jpg')))

    # English stopwords: the ones shipped with wordcloud plus 'one', which
    # would otherwise appear among the frequent words.
    stopwords = set(STOPWORDS)
    stopwords.add('one')

    wc = WordCloud(
        margin=2,  # page margin
        mask=background_Image,  # background image
        scale=2,  # scale factor
        max_words=200,  # maximum number of words
        min_font_size=4,  # smallest font size
        max_font_size=150,  # largest font size
        stopwords=stopwords,
        random_state=42,
        background_color='white',  # background color
        colormap='Blues')

    # Tokenize once and build the cloud from those counts; sort
    # process_word.items() by count to review candidates for new stopwords.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)

    # Color the words with the grey color function.
    wc.recolor(color_func=grey_color_func)

    # Display the image with bilinear interpolation.
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()

    # Store the image.
    wc.to_file(d + r'\world_cloud\1900pro1.png')
    plt.show()
def wc_chinese():
    """Render a word cloud of ``langchao.txt`` shaped by ``circle.jpg``.

    The text is segmented with jieba in precise mode, drawn with the colors
    of the background image, saved as a PNG and shown.
    """
    # NOTE: inside a function "__file__" is never in locals(), so this
    # resolves to the current working directory; kept as is on purpose.
    d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

    # The file is read once as UTF-8-SIG; the earlier chardet probe computed
    # a result that was never used. The context manager closes the file.
    with open(path.join(d, r'langchao.txt'), encoding='UTF-8-SIG') as source:
        raw_text = source.read()

    # cut_all=False selects precise mode. The segmented text replaces the
    # raw text; appending it (`text += ...`) fed the unsegmented original to
    # the word counter as well.
    text = ' '.join(jieba.cut(raw_text, cut_all=False))

    # Chinese font. Raw string: the backslashes are path separators.
    font_path = r'C:\Windows\Fonts\STXIHEI.TTF'
    # Background image and the colors taken from it.
    background_Image = np.array(Image.open(path.join(d, 'circle.jpg')))
    img_colors = ImageColorGenerator(background_Image)

    # Chinese stopwords.
    stopwords = {
        '但是', '一个', '自己', '因此', '没有', '很多', '可以', '这个', '虽然', '因为',
        '这样', '已经', '现在', '一些', '比如', '不是', '当然', '可能', '如果', '就是',
        '同时', '比如', '这些', '必须', '由于', '而且', '并且', '他们'
    }
    wc = WordCloud(font_path=font_path,
                   margin=2,
                   mask=background_Image,
                   scale=2,
                   max_words=200,
                   min_font_size=4,
                   max_font_size=100,
                   stopwords=stopwords,
                   random_state=42,
                   background_color='white')

    # Tokenize once; sort process_word.items() by count to review the most
    # frequent words when adjusting the stopwords.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)

    # Color the words from the background image.
    wc.recolor(color_func=img_colors)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('浪潮basic01.png', dpi=200)
    plt.show()
def writeFreq(text, outFile, words):
    """
    Writes frequencies of words into the specified file

    Args:
        text: Text whose words are counted.
        outFile: Writable text file object; one ``word,count`` line per word.
        words: Extra words to exclude in addition to ``STOPWORDS``.
    """
    excludewords = set(STOPWORDS).union(words)
    wordcloud = WordCloud(max_words=NUM_OF_WORDS, stopwords=excludewords)
    freqList = wordcloud.process_text(text)

    # process_text() may return a dict. Iterating a dict yields only the
    # words, so item[0] and item[1] would be its first two characters; use
    # items() when available. A sequence of (word, count) pairs still works.
    pairs = freqList.items() if hasattr(freqList, 'items') else freqList
    for word, count in pairs:
        outFile.write(word + ',' + str(count) + '\n')
def analysis_09(data):
    """Render a word cloud of the article titles in ``data['title']``.

    Titles are stripped of punctuation, segmented with jieba and drawn
    inside the shape of ``tiger.png`` with that image's colors. The 50 most
    frequent words are printed so that unwanted ones can be added to the
    stopwords. The figure is saved as a PNG and shown.

    Args:
        data: DataFrame with a ``title`` column of strings.
    """
    jieba.load_userdict('userdict.txt')
    jieba.add_word('区块链')

    # Characters to strip from every title; defined once, outside the loop.
    symbol_to_replace = '[!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    # Titles are joined with a space so that the last token of one title is
    # not glued to the first token of the next.
    text = ' '.join(
        ' '.join(jieba.cut(re.sub(symbol_to_replace, '', title), cut_all=False))
        for title in data['title'].values)

    # NOTE: inside a function '__file__' is never in locals(), so this
    # resolves to the current working directory; kept as is on purpose.
    d = path.dirname(__file__) if '__file__' in locals() else os.getcwd()
    background_Image = np.array(Image.open(path.join(d, "tiger.png")))
    # Raw string: the backslashes are path separators, not escapes.
    font_path = r'C:\Windows\Fonts\STFANGSO.TTF'

    # Stopwords: run the frequency ranking below first, then pick the words
    # to add here.
    stopwords = {
        '如何', '怎么', '一个', '什么', '为什么', '还是', '我们', '为何', '可能', '不是',
        '没有', '哪些', '成为', '可以', '背后', '到底', '就是', '这么', '不要', '怎样',
        '为了', '能否', '你们', '还有', '这样', '这个', '真的', '那些'
    }
    wc = WordCloud(background_color='black',
                   font_path=font_path,
                   mask=background_Image,
                   stopwords=stopwords,
                   max_words=200,
                   margin=2,
                   max_font_size=100,
                   random_state=42,
                   scale=2)

    # Tokenize once; the counts build the cloud and give the ranking below.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)

    # Sort the frequency dict.
    ranked = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])

    img_colors = ImageColorGenerator(background_Image)
    wc.recolor(color_func=img_colors)  # colors follow the image colors
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('词云图.png', dpi=200)
    plt.show()
def wc_english_improve_03():
    """Render a grey-on-black word cloud of ``legend1900.txt``.

    The cloud is shaped by ``mask1900.jpg``, recolored with random grey
    levels, saved as a PNG and shown.
    """
    # NOTE: inside a function '__file__' is never in locals(), so this
    # resolves to the current working directory; kept as is on purpose.
    d = path.dirname(__file__) if '__file__' in locals() else os.getcwd()

    # The context manager closes the file as soon as it has been read.
    with open(path.join(d, 'legend1900.txt')) as source:
        text = source.read()

    # Read the background image.
    background_Image = np.array(Image.open(path.join(d, 'mask1900.jpg')))

    # English stopwords filter out unwanted words such as a, an, the;
    # 'one' is added to the defaults.
    stopwords = set(STOPWORDS)
    stopwords.add('one')

    wc = WordCloud(margin=2,
                   mask=background_Image,
                   scale=2,
                   max_words=200,
                   min_font_size=4,
                   max_font_size=150,
                   stopwords=stopwords,
                   random_state=42,
                   background_color='black')

    # process_text() returns a dict of tokens and their counts. Tokenize
    # once and build the cloud from it; sort process_word.items() by count to
    # review candidates for new stopwords.
    process_word = wc.process_text(text)
    wc.generate_from_frequencies(process_word)

    def grey_color_func(word,
                        font_size,
                        position,
                        orientation,
                        random_state=None,
                        **kwargs):
        """Return a grey HSL color with a random lightness of 50-100%."""
        return "hsl(0, 0%%, %d%%)" % random.randint(50, 100)

    wc.recolor(color_func=grey_color_func)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('1900_basic_04.png', dpi=200)
    plt.show()
__author__ = "Nick"

from os import path

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Read the source text; the context manager closes the file afterwards.
with open("Alice.txt") as source:
    text = source.read()

# set.add() returns None, so passing STOPWORDS.add("said") handed None to
# WordCloud and silently modified the shared STOPWORDS set. Build the
# extended set explicitly instead.
wc = WordCloud(background_color="white", max_words=100,
               stopwords=STOPWORDS | {"said"})

# Tokenize once; the same counts build the cloud and are printed.
frequencies = wc.process_text(text=text)
wc.generate_from_frequencies(frequencies)
print(frequencies)

plt.imshow(wc)
plt.axis("off")
plt.show()
seed = int(sys.argv[0]) except: seed = 0 wc = WordCloud(max_words=10000, stopwords=stopwords, margin=5, random_state=seed, height=height, max_font_size=max_font_size, width=width, prefer_horizontal=0.75) freqs = wc.process_text(text) freqs = [freqs[1::2], freqs[::2]] count = [0, 0] for i in range(min([len(x) for x in freqs])): if((len(freqs[0][i][0]) > len(freqs[1][i][0]) and count[0] > count[1]) or (len(freqs[0][i][0]) < len(freqs[1][i][0]) and count[0] < count[1])): freqs[0][i], freqs[1][i] = freqs[1][i], freqs[0][i] print("Swapped %d" % i, freqs[0][i], freqs[1][i], count)
# Build a word cloud from column 2 of music_message.csv, shaped and colored
# by job.jpg, and print the 50 most frequent words.
df = pd.read_csv('music_message.csv', header=None)

# Rows are joined with a space so that the last token of one row is not
# glued to the first token of the next.
text = ' '.join(' '.join(jieba.cut(line, cut_all=False)) for line in df[2])

backgroud_Image = plt.imread('job.jpg')
stopwords = {
    '封面', 'none介绍', '介绍', '歌单', '歌曲', '我们', '自己', '没有', '就是', '可以',
    '知道', '一起', '不是', '因为', '什么', '时候', '还是', '如果', '不要', '那些',
    '那么', '那个', '所有', '一样', '一直', '不会', '现在', '他们', '这样', '最后',
    '这个', '只是', '有些', '其实', '开始', '曾经', '所以', '不能', '你们', '已经',
    '后来', '一切', '一定', '这些', '一些', '只有', '还有',
}
wc = WordCloud(
    background_color='white',
    mask=backgroud_Image,
    # Raw string: the backslashes are path separators, not escapes.
    font_path=r'C:\Windows\Fonts\STZHONGS.TTF',
    max_words=2000,
    max_font_size=150,
    random_state=30,
    stopwords=stopwords,
)

# Tokenize once; the counts build the cloud and show which frequent words
# carry no information and should become stopwords.
process_word = wc.process_text(text)
wc.generate_from_frequencies(process_word)
sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
print(sort[:50])

img_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func=img_colors)
plt.imshow(wc)
plt.axis('off')
wc.to_file("活着.jpg")
print('生成词云成功!')
def test_stopwords_lowercasing():
    """Capitalized stopwords must still remove the matching word."""
    wc = WordCloud(stopwords=["Beautiful"])
    processed = wc.process_text(THIS)
    # When process_text() returns a dict, iterating it yields the words
    # themselves; taking item[0] would then collect only first letters and
    # the assertion could never fail. A sequence of (word, count) pairs is
    # still unpacked as before.
    if isinstance(processed, dict):
        words = list(processed)
    else:
        words = [word for word, _ in processed]
    assert_true("Beautiful" not in words)
def test_include_numbers():
    """With ``include_numbers=True`` the token '14' must be counted."""
    counts = WordCloud(include_numbers=True).process_text(THIS)
    assert '14' in counts.keys()
def test_min_word_length():
    """With ``min_word_length=5`` the shortest counted word has 5 characters."""
    counts = WordCloud(min_word_length=5).process_text(THIS)
    shortest = min([len(word) for word in counts.keys()])
    assert shortest == 5
def test_process_text_regexp_parameter():
    """Word processing must be influenced by the ``regexp`` parameter."""
    five_char_cloud = WordCloud(max_words=50, regexp=r'\w{5}')
    counts = five_char_cloud.process_text(THIS)
    # 'than' has four characters, so a five-character pattern cannot yield it.
    assert_false('than' in counts)