def main():
    """Fetch a web page and render its visible text as a word cloud PNG.

    The page URL comes from the required ``--url`` command-line option.  The
    image is written to ``wordcloud.png`` in the directory of this script.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--url', metavar='URL', required=True, help='input the url')
    args = parser.parse_args()
    url = args.url
    output_file = path.join(path.dirname(__file__), 'wordcloud.png')

    # A timeout keeps the script from hanging on an unresponsive host, and
    # raise_for_status() avoids building a cloud out of an HTTP error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    origin_text = response.text

    # Remove <script> and <style> elements so their source is not counted as text.
    tag_flags = re.I | re.M | re.DOTALL
    origin_text = re.sub(r'<script.*?>.*?</script>', '', origin_text, flags=tag_flags)
    origin_text = re.sub(r'<style.*?>.*?</style>', '', origin_text, flags=tag_flags)

    doc = html.fromstring(origin_text)
    fragments = (piece.strip() for piece in doc.xpath('//body//text()'))
    text = ' '.join(piece for piece in fragments if piece)

    # Keep tokens that are non-empty, not purely numeric and not stop words.
    words = []
    for token in jieba.cut(text):
        token = token.strip()
        if token and not token.isdigit() and token not in stopwords:
            words.append(token)
    seg = ' '.join(words)

    wordcloud = WordCloud(font_path='simhei.ttf', background_color='black',
                          margin=5, width=1800, height=800)
    wordcloud = wordcloud.generate(seg)
    image = wordcloud.to_image()
    with open(output_file, 'wb') as f:
        image.save(f, format='png')
def main():
    """Build one word cloud from every file under ``data2`` and display it."""
    reader = WordReader()

    # Collect the text returned for each file, then concatenate once.
    pieces = []
    for root, dirs, files in os.walk('data2'):
        for name in files:
            pieces.append(reader.word_reader(os.path.join(root, name)))
    wcount = ''.join(pieces)

    back_coloring = np.array(Image.open("./sky.png"))
    cloud_options = dict(
        background_color="white",  # background colour
        max_words=1000,            # maximum number of words shown in the cloud
        mask=back_coloring,        # image used as the mask
        max_font_size=150,         # largest font size
        random_state=42,
    )
    wc = WordCloud(**cloud_options)
    wc.generate(wcount)

    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
def create_word_cloud(filename):
    """Render the text of ``<filename>.txt`` as a word cloud.

    The cloud is shown in a matplotlib window and then saved as
    ``signature.png`` in the current working directory.

    Args:
        filename: path of the input text file without the ``.txt`` suffix;
            the file is read as UTF-8.
    """
    # Read the file content. The context manager closes the handle, which the
    # previous bare open(...).read() left open.
    with open("{}.txt".format(filename), encoding='utf-8') as source:
        text = source.read()

    # To segment Chinese text first, run it through jieba
    # (" ".join(jieba.cut(text, cut_all=True))) and pass that string to
    # generate() instead of `text`.

    # Configure the word cloud.
    wc = WordCloud(
        # background colour
        background_color="white",
        # maximum number of words displayed
        max_words=2000,
        # The font must exist on this machine: Windows keeps fonts under
        # C:\Windows\Fonts\; on macOS /System/Library/Fonts/PingFang.ttc is an option.
        font_path='C:\\Windows\\Fonts\\simfang.ttf',
        height=500,
        width=500,
        # largest font size
        max_font_size=60,
        # value passed through as WordCloud's random_state
        random_state=30,
    )
    myword = wc.generate(text)  # build the cloud

    # Display the cloud.
    plt.imshow(myword)
    plt.axis("off")
    plt.show()
    wc.to_file('signature.png')  # save the cloud image
def _default_ellipse_mask():
    """Return the default 900x1200 RGBA mask: 255 outside a centred ellipse, 0 inside."""
    rows, cols = np.ogrid[:900, :1200]
    # Same ellipse test as the former per-pixel loop, evaluated for all pixels at once.
    outside = (rows - 449.5) ** 2 / (430 ** 2) + (cols - 599.5) ** 2 / (580 ** 2) > 1
    channel = np.where(outside, 255, 0).astype(np.uint8)
    mask = np.zeros((900, 1200, 4), dtype=np.uint8)
    mask[:, :, 0] = channel
    mask[:, :, 1] = channel
    mask[:, :, 2] = channel
    mask[:, :, 3] = 255  # alpha channel
    return mask


def genwordcloud(texts,mask=None,font_path=None,background_color='white'):
    '''Generate a word cloud image.

    parameter
    ----------
    texts: text handed to WordCloud.generate
    mask: mask image, opened with PIL's Image.open and converted to an array;
          when None a 900*1200 RGBA ellipse mask is generated
    font_path: font to use; the Android default font DroidSansFallback.ttf is suggested
    background_color: background colour passed to WordCloud

    return
    -------
    img: image that can be saved directly with img.save('test.png'),
         or None when the wordcloud package cannot be imported
    '''
    from PIL import Image
    try:
        from wordcloud import WordCloud
    except ImportError:
        # Only a failed import is treated as "package not installed"; a bare
        # except here used to hide every other kind of error as well.
        print('wordcloud need install wordcloud package.')
        return None
    if mask is None:
        mask = _default_ellipse_mask()
    else:
        mask = np.array(Image.open(mask))
    wordcloud = WordCloud(background_color=background_color, font_path=font_path, mask=mask)
    wordcloud.generate(texts)
    img = wordcloud.to_image()
    return img
def generate_image(words, image):
    """Lay out a masked word cloud and build a colour generator from the same image.

    Returns a tuple ``(cloud, colours)``: the WordCloud generated from the
    ``words`` frequencies and an ImageColorGenerator built from the image array.
    """
    mask_pixels = np.array(image)
    font_file = os.path.join(CUR_DIR, 'fonts/simhei.ttf')
    cloud = WordCloud(
        font_path=font_file,
        background_color='white',
        max_words=MAX_WORDS,
        mask=mask_pixels,
    )
    cloud.generate_from_frequencies(words)
    return cloud, ImageColorGenerator(mask_pixels)
def create_word_cloud(ballots, chart_directory, image_name, mask_file, stop_words, word_counts=None):
    """
    Generates word clouds from the feedback of the given ballots.

    One PNG named ``<image_name>-<word_count>.png`` is saved into
    ``chart_directory`` for every entry of ``word_counts``.

    :param ballots: iterable of objects with a ``feedback`` string attribute
    :param chart_directory: directory the images are written to
    :param image_name: file name prefix of the images
    :param mask_file: optional image used as mask and colour source
    :param stop_words: extra words to exclude, in addition to STOPWORDS
    :param word_counts: word limits to render; defaults to [25, 50, 100, 1000]
    """
    if word_counts is None:
        word_counts = [25, 50, 100, 1000]

    text = ''.join(ballot.feedback for ballot in ballots)

    # Build a new set. The previous in-place "|=" permanently added the
    # caller's stop words to the shared module-level STOPWORDS.
    all_stop_words = STOPWORDS | set(stop_words)

    # The mask does not depend on the word count, so it is loaded once.
    if mask_file:
        color_mask = imread(mask_file)
        image_colors = ImageColorGenerator(color_mask)

    for word_count in word_counts:
        if mask_file:
            wc = WordCloud(background_color="white", max_words=word_count, mask=color_mask,
                           stopwords=all_stop_words, color_func=image_colors,
                           max_font_size=80, random_state=42)
        else:
            wc = WordCloud(background_color="white", max_words=word_count,
                           stopwords=all_stop_words, max_font_size=80, random_state=42)
        wc.generate(text)
        plt.imshow(wc)
        plt.axis("off")
        image_name_with_count = '{0}-{1}.png'.format(image_name, str(word_count))
        logger.info('...creating word cloud {0}'.format(image_name_with_count))
        save_location = os.path.join(chart_directory, image_name_with_count)
        plt.savefig(save_location)
        plt.close()
def cloudplot(person):
    """Render a word cloud of one person's e-mail text and return it as a PNG response.

    Args:
        person: name in which every '+' stands for a space.

    Returns:
        The result of ``send_file`` for the in-memory image, served with
        mimetype ``image/png``.
    """
    person = re.sub(r'\+', ' ', person)
    text = GetTextRange(Emails, person)
    text = rmBoring(rmNonAlpha(text)).decode('ascii', 'ignore')
    plt.clf()
    d = path.dirname(path.abspath(__file__))
    hilcolor = np.array(Image.open(path.join(d, "static/img/hillarylogo.jpg")))
    # set.add() returns None, so the former `stopwords=STOPWORDS.add("said")`
    # passed None and permanently changed the shared STOPWORDS set. A new set
    # gives the same exclusions without the side effect.
    wc = WordCloud(background_color="white", max_words=150, mask=hilcolor,
                   stopwords=STOPWORDS | {"said"}, max_font_size=80,
                   random_state=42, relative_scaling=0.5)
    wc.generate(text)
    # Colour the words from the logo image.
    image_colors = ImageColorGenerator(hilcolor)
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    fig = plt.gcf()
    # Save the figure into an in-memory buffer and rewind it for sending.
    img = StringIO.StringIO()
    fig.savefig(img)
    img.seek(0)
    return send_file(img, mimetype='image/png')
def main():
    """Show a word cloud of themes, each weighted by the square root of its story count.

    Every entry of the computed layout is printed before the image is shown.
    """
    width, height = 1000, 500
    themes = lib.datastats.themes_with_usage()
    # items() and print(...) behave the same on Python 2 and 3, unlike the
    # former iteritems() / print statement which only run on Python 2.
    data = {th: len(tobj.stories) ** 0.5 for th, tobj in themes.items()}
    mask = np.array(Image.open("ellipse1000x500.png"))
    wordcloud = WordCloud(
        font_path='Helvetica.ttf',
        max_words=5000,
        max_font_size=20,
        prefer_horizontal=1.0,
        width=width,
        height=height,
        scale=2,
        mask=mask,
        relative_scaling=1.0,
    ).fit_words(data)
    for rec in wordcloud.layout_:
        print(rec)
    image = wordcloud.to_image()
    image.show()
def create_wc(words_in):
    """Build a grey-scale WordCloud from word frequencies.

    Parameters
    ----------
    words_in : list of tuple
        Words to plot, with their corresponding frequencies.

    Returns
    -------
    wc : WordCloud() object
        Wordcloud definition.
    """
    # Settings of the cloud: transparent background, horizontal words only
    cloud_settings = dict(
        background_color=None,
        mode='RGBA',
        width=800,
        height=400,
        prefer_horizontal=1,
        relative_scaling=0.5,
        min_font_size=25,
        max_font_size=80,
    )
    wc = WordCloud(**cloud_settings)
    wc.generate_from_frequencies(words_in)

    # Switch the colour scheme to grey
    wc.recolor(color_func=_grey_color_func, random_state=3)

    return wc
def create_wordcloud(frequencies, stop_words):
    """Generate a word cloud from word frequencies and open it in an image viewer.

    Args:
        frequencies: word frequencies handed to
            ``WordCloud.generate_from_frequencies``; must not be empty.
        stop_words: stop words passed to the WordCloud constructor.

    Raises:
        ValueError: if ``frequencies`` is empty.
    """
    if len(frequencies) == 0:
        # The exception used to be constructed but never raised, so empty
        # input fell through to the generator.
        raise ValueError("No history is found.")
    logger.debug("word frequencies count = %s", len(frequencies))
    logger.debug("stop words = %s", pformat(stop_words))
    wordcloud = WordCloud(background_color="black", width=900, height=600,
                          stopwords=stop_words).generate_from_frequencies(frequencies)
    image = wordcloud.to_image()
    image.show()
def twittersearch():
    """Search recent English tweets about DevOps and return them as an HTML page.

    The page lists every tweet (author, creation time, text) and embeds a
    word cloud of all tweet texts. The cloud is saved under ``static/`` with
    a random file name.

    Returns:
        The HTML document as a string.
    """
    # Imported locally so the function does not depend on the file's import block.
    from html import escape

    api = twitter.Api(
        consumer_key=twitter_consumer_key,
        consumer_secret=twitter_consumer_secret,
        access_token_key=twitter_access_token_key,
        access_token_secret=twitter_access_token_secret
    )
    search = api.GetSearch(term='DevOps', lang='en', result_type='recent', count=100, max_id='')

    list_items = []
    tweet_texts = []
    for t in search:
        # Tweet content is untrusted input: escape it before it becomes HTML.
        list_items.append(
            '<li>' + escape(t.user.screen_name) + ' (' + escape(t.created_at) + ') : '
            + escape(t.text) + '</li>'
        )
        tweet_texts.append(t.text)

    # Join with a space so the last word of one tweet and the first word of
    # the next are not fused into a single token.
    to_wordcloud = ' '.join(tweet_texts)

    to_display = '<html><body bgcolor="#0033FF"><font color="white"> <h2>Tweets about DevOps.... or <a href="'+my_url+'">click here to go back</a></h2>'
    to_display += '<ol>'
    to_display += ''.join(list_items)

    # Generate a word cloud image
    wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(to_wordcloud)
    wcimage = wordcloud.to_image()
    random_name = 'static/'+str(random.randint(0,888888))+'-image-wc.png'
    wcimage.save('./'+random_name)

    to_display += '</ol></font>'
    to_display += '<center><img src="/'+random_name+'" width="80%" height="80%"></center></body></html>'
    return to_display
def wcloud(wf, color, save_as=None):
    """Draw a word cloud from word frequencies using a colour function.

    Parameters
    ----------
    wf : list
        (token, value) tuples
    color : function
        from `wc_colors.py`
    save_as : str
        filename; when given, the figure is saved there

    Returns
    -------
    None
    """
    cloud = WordCloud(
        background_color=None,
        mode='RGBA',
        width=2400,
        height=1600,
        relative_scaling=0.5,
        font_path='/Library/Fonts/Futura.ttc',
    )
    cloud.generate_from_frequencies(wf)

    plt.figure()
    recolored = cloud.recolor(color_func=color, random_state=42)
    plt.imshow(recolored)
    plt.axis("off")

    if not save_as:
        return
    plt.savefig(save_as, dpi=300, transparent=True)
def draw_wordCloud():
    '''
    Draw the word cloud image.
    :return:
    '''
    # Read wordList and turn it into one string, each word followed by a space
    global wordList
    cut_text = "".join(word + " " for word in wordList)

    # Build the word cloud
    os.chdir(r"D:\STUDYING\MyProjects\pycharm\music163_EasonComments")
    d = path.dirname(__file__)  # directory containing the current file
    color_mask = imread("Eason.jpg")  # read the background picture
    plt.imshow(color_mask)
    cloud_options = dict(
        font_path=path.join(d, 'simsun.ttc'),
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    cloud = WordCloud(**cloud_options)
    word_cloud = cloud.generate(cut_text)  # generate the cloud

    # Show it
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
def make_word_cloud(text, save_path, background_color='black'):
    """Render a single-colour word cloud and save it below ``app/``.

    Args:
        text: either a string, or a list of ``[(word, count), ...]`` in which
            case every word is repeated ``count`` times.
        save_path: output path relative to the ``app`` directory.
        background_color: background colour of the image.
    """
    from wordcloud import WordCloud

    def col_fun(word, *args, **kw):
        # Every word is drawn in the same dark grey.
        return '#333'

    if isinstance(text, str):
        big_string = text
    else:
        big_string = ''.join((item[0] + ' ') * item[1] for item in text)

    # generate() is called once; the second, identical call that used to
    # follow only repeated the whole layout computation.
    wc = WordCloud(background_color=background_color,
                   color_func=col_fun,
                   max_words=10000,
                   height=200,
                   width=700,
                   font_path='app/static/fonts/NanumScript.ttc').generate(big_string)
    wc.to_file('app/%s' % save_path)
def run():
    """Build an image-coloured word cloud from ``words2.txt``.

    The text is segmented with jieba, single-character tokens are dropped,
    the cloud is laid out inside ``bg.jpg``, coloured from that image, shown
    with matplotlib and saved as ``words_result3.png``.
    """
    # The context manager closes the file, which the previous
    # open(...).read() left open.
    with open(u'words2.txt', 'r') as source:
        f = source.read()

    # Keep only tokens longer than one character.
    a = [w for w in jieba.cut(f) if len(w) > 1]
    text = r' '.join(a)

    bg = np.array(Image.open('bg.jpg'))
    wordcloud = WordCloud(
        background_color='white',
        mask=bg,
        font_path='C:/Windows/Fonts/simkai.ttf',
    ).generate(text)
    image_colors = ImageColorGenerator(bg)
    plt.imshow(wordcloud.recolor(color_func=image_colors))
    plt.axis('off')
    plt.show()
    wordcloud.to_file('words_result3.png')
    return
def test_coloring_black_works():
    # An all-zero (black) image must be usable as the colour source.
    black_image = np.zeros((50, 50, 3))
    black_colors = ImageColorGenerator(black_image)
    cloud = WordCloud(
        width=50,
        height=50,
        random_state=42,
        color_func=black_colors,
        min_font_size=1,
    )
    cloud.generate(THIS)
def test_repeat():
    short_text = "Some short text"

    # Without repeat, three distinct words give three layout entries.
    plain = WordCloud(stopwords=[]).generate(short_text)
    assert_equal(len(plain.layout_), 3)

    repeated = WordCloud(max_words=50, stopwords=[], repeat=True).generate(short_text)
    # multiple of word count larger than max_words
    assert_equal(len(repeated.layout_), 51)
    # relative scaling doesn't work well with repeat
    assert_equal(repeated.relative_scaling, 0)
    # all frequencies are 1
    assert_equal(len(repeated.words_), 3)
    assert_array_equal(list(repeated.words_.values()), 1)
    assert_array_equal([entry[0][1] for entry in repeated.layout_], 1)

    repetition_text = "Some short text with text"
    weighted = WordCloud(max_words=52, stopwords=[], repeat=True)
    weighted.generate(repetition_text)
    assert_equal(len(weighted.words_), 4)
    # normalized frequencies
    assert_equal(weighted.words_['text'], 1)
    assert_equal(weighted.words_['with'], .5)
    assert_equal(len(weighted.layout_), weighted.max_words)
    # check that frequencies are sorted
    frequencies = [entry[0][1] for entry in weighted.layout_]
    assert_true(np.all(np.diff(frequencies) <= 0))
def test_process_text():
    # process_text() must hand back a dict
    cloud = WordCloud(max_words=50)
    processed = cloud.process_text(THIS)

    # verify the return type
    assert_true(isinstance(processed, dict))
def test_generate_from_frequencies():
    # generate_from_frequencies() must accept the dict produced by
    # process_text() and return a WordCloud
    cloud = WordCloud(max_words=50)
    frequencies = cloud.process_text(THIS)
    generated = cloud.generate_from_frequencies(frequencies)

    assert_true(isinstance(generated, WordCloud))
def create_wordclouds(self, text, name_of_cloud, additional_stop_list, max_words, width, height, bigram = False):
    """Build a word cloud of the most common words (or bigrams) and save it as PNG.

    The text is stripped of punctuation, lower-cased, filtered against the
    stop words, tokenised and lemmatised. The ``max_words`` most common
    tokens, or adjacent token pairs when ``bigram`` is true, are weighted
    relative to the most frequent one and written to ``<name_of_cloud>.png``.

    Raises:
        ValueError: if no token is left after preprocessing.
    """
    text_nopunc = self.remove_punctuation(text, "", "")
    text_lower = text_nopunc.lower()

    # Work on a copy. Extending self.stopwords in place made the instance's
    # stop list grow with every call.
    stop = list(self.stopwords) + list(additional_stop_list)

    text_nostop = self.remove_stopword(text_lower, stop)
    tokens = wt(text_nostop)
    text_lem = self.lemmatize(tokens)
    tokens_lem = wt(text_lem)

    if bigram:
        counts = collections.Counter(
            pair[0] + ' ' + pair[1] for pair in nltk.bigrams(tokens_lem)
        )
    else:
        counts = collections.Counter(tokens_lem)

    final = counts.most_common(max_words)
    if not final:
        raise ValueError("no words left to build a word cloud from")
    # most_common() is sorted by descending count, so the first entry is the maximum.
    max_count = final[0][1]
    final = [(name, count / float(max_count)) for name, count in final]

    word_cloud = WordCloud(font_path="fonts/Georgia.ttf", width=width, height=height,
                           max_words=max_words, stopwords=stop)
    word_cloud.fit_words(final)
    word_cloud.to_file(name_of_cloud + ".png")
def make_clouds(files, n_words=20):
    """Write one word-cloud PNG per topic of a saved LDA model.

    Args:
        files: object with ``model`` (path of the saved LdaModel) and
            ``replacements`` (path of a JSON name -> id map) attributes.
        n_words: number of top-scoring words drawn per topic.

    The images are written to ``../browser/clouds/<model name>/<topic>.png``.
    """
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)

    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<->bug; the file is closed by the
    # context manager instead of being left open
    with open(files.replacements) as replacements:
        bug_to_id = json.load(replacements)
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR: per-word score sqrt(pTW * pWT) for this topic
        t_rar = np.sqrt(pTW[i] * pWT[i])
        # indices of the n_words highest scores, best first
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        # map ids back to their original names where a replacement exists
        top_words = [id_to_bug.get(word, word) for word in top_words]
        wc.fit_words(zip(top_words, t_rar[top_word_ids]))
        wc.to_file(output_d + str(i) + '.png')
def get_tagcloud(self, tags, tag_limit=None):
    """Render the most frequent tags as a word cloud and return it base64-encoded.

    Args:
        tags: list of dicts with ``tag_name`` and ``count`` keys.
        tag_limit: maximum number of tags to draw; all tags when falsy.

    Returns:
        The result of ``encode_file_to_base64`` for the written PNG, using
        the ``data:image/png;base64,`` prefix.
    """
    tag_limit = tag_limit or len(tags)
    tags = sorted(tags, key=lambda kv: -kv['count'])[:tag_limit]  # Get top X tags
    tag_dict = {t['tag_name']: t['count'] for t in tags}

    # Fallback height derived from the number of tags. Floor division keeps
    # it an integer on Python 3 too, where "/" would produce a float.
    default_height = 30 * len(tags) // 2 + 10

    # Generate a word cloud image
    wordcloud = WordCloud(
        background_color='white',
        min_font_size=10,
        max_font_size=60,
        width=self.tagcloud_width,
        height=self.tagcloud_height or default_height,
        font_path=os.path.sep.join([settings.STATIC_ROOT, 'fonts', 'OpenSans-Regular.ttf'])
    ).generate_from_frequencies(tag_dict)

    # Split the count range into one band per colour, highest band first.
    tag_counts = [t['count'] for t in tags]
    step = (float(max(tag_counts))) / len(self.color_selection)
    thresholds = list(reversed([int(round(i * step)) for i in range(len(self.color_selection))]))

    def get_color(word, font_size, position, orientation, random_state=None, **kwargs):
        # Pick the colour of the first band whose threshold the word's count reaches.
        index = next((i for i, t in enumerate(thresholds) if tag_dict[word] >= t), 0)
        return self.color_selection[index]

    wordcloud.recolor(color_func=get_color)
    image = wordcloud.to_image()
    filepath = self.get_write_to_path(ext="png")
    image.save(filepath)
    return encode_file_to_base64(filepath, "data:image/png;base64,")
def draw_tag_cloud(users_tokens):
    """Show a word cloud of the users' token frequencies, shaped and coloured by pics/trump.png."""
    from PIL import Image
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, ImageColorGenerator

    mask_pixels = np.array(Image.open("pics/trump.png"))
    freq_pairs = get_full_frequencies(users_tokens).items()

    cloud = WordCloud(
        max_words=2000,
        mask=mask_pixels,
        max_font_size=40,
        random_state=42,
    )
    cloud.generate_from_frequencies(freq_pairs)

    # Recolour the cloud from the mask image and display it; the colour
    # function could also be given to the constructor directly.
    palette = ImageColorGenerator(mask_pixels)
    recolored = cloud.recolor(color_func=palette)
    plt.imshow(recolored)
    plt.axis("off")
    plt.show()
def wordCloud(text_array,name,keyword=""):
    """Render a word cloud for the given texts and describe it as HTML.

    Args:
        text_array: list of text strings.
        name: base name of the PNG written below ``static/tool/img/`` and
            suffix of the CSS class of the list links.
        keyword: optional filter. When not empty, its second
            space-separated token is used and only texts containing that
            token are kept. NOTE(review): a keyword without a space raises
            IndexError, as before - confirm the expected format with callers.

    Returns:
        Tuple ``(img_tag, list_html)``: an ``<img>`` tag with the image
        inlined as base64, and ``<li>`` entries for up to 50 words in layout
        order, each in the colour it has in the cloud.
    """
    import base64

    # Compare by value: "is not" tests object identity, which is not
    # guaranteed for equal strings.
    if keyword != "":
        keyword = keyword.split(" ")[1]
        text_array = [text for text in text_array if keyword in text]

    cloud_text = "".join(text + " " for text in text_array)

    m_stopwords = ['police', 'traffic', 'sir']
    STOPWORDS.update(m_stopwords)

    image_mask = os.path.join(BASE_DIR, 'static/tool/img/nebula.png')
    coloring = imread(image_mask)
    wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", mask=coloring,
                          ranks_only=True, max_words=50).generate(cloud_text)
    filename = os.path.join(BASE_DIR, 'static/tool/img/'+name+'.png')
    image_colors = ImageColorGenerator(coloring)
    wordcloud.recolor(color_func=image_colors)
    wordcloud.to_file(filename)

    # base64 module instead of the Python-2-only str.encode('base64'); the
    # file is closed by the context manager.
    with open(filename, 'rb') as image_file:
        data_uri = base64.b64encode(image_file.read()).decode('ascii')
    img_tag = '<img src="data:image/png;base64,{0}" style="height:400px;">'.format(data_uri)

    # One list entry per laid-out word, at most 50. The previous index loop
    # stopped one short and dropped the last word whenever fewer than 51
    # words were laid out.
    list_items = []
    for rank, lo in enumerate(wordcloud.layout_[:50], 1):
        word = lo[0][0]
        # The last layout field is the colour as "rgb(r, g, b)"; strip the
        # "rgb(" prefix and ")" suffix and convert to #rrggbb.
        color_num = [int(x) for x in lo[-1][4:-1].split(',')]
        color_hex = '#%02x%02x%02x' % tuple(color_num)
        list_items.append(
            '<li class="list-group-item" ><a class="cloud-key-'+name+'" href="#" style="color:'+color_hex+'">'
            + "#"+str(rank)+" "+word+'</a></li>'
        )
    list_html = "".join(list_items)
    return (img_tag,list_html)
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
    """
    calculate and show hot words of Job Description (JD)

    For every Excel file in ``jd_dir`` the '详情描述' column is joined into
    one text, the 30 highest-weighted keywords are extracted with jieba,
    printed, and displayed as a circular word cloud.

    :param jd_dir: directory containing the Excel files
    :return: None; the process exits with status 1 when ``jd_dir`` is missing or empty
    """
    if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
        print('Error! No valid content in {0}'.format(jd_dir))
        # Non-zero status: this is an error exit, which status 0 used to hide.
        sys.exit(1)

    import matplotlib.pyplot as plt

    # File name up to the first '.' -> full path
    jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

    # Stop words, user dictionary and mask do not depend on the file, so
    # they are set up once instead of once per file.
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)
    # Circular mask: 255 outside a radius-130 circle centred in a 300x300 grid.
    x, y = np.ogrid[:300, :300]
    mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
    mask = 255 * mask.astype(int)

    for v in jd_and_dir.values():
        text = "".join(pd.read_excel(v)['详情描述'])
        hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
        frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
        print(frequencies)

        wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300,
                              background_color="white", repeat=False, mask=mask)
        wordcloud.generate_from_frequencies(frequencies)

        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
def generate_wc(self, background_color='#ffffff'):
    """Generate the word cloud of ``self.text`` and write it to ``self.img_file``.

    Any exception raised while building or saving the cloud is stored in
    ``self.error_in_wordcloud_gen``; on success that attribute is set to None
    and ``self.font_name`` is reset to None.
    """
    self.get_exclude_words()
    try:
        # Other masks can be extracted from
        # Font-Awesome (http://minimaxir.com/2016/05/wordclouds/)
        mask_path = os.path.join(self.curdir, self.wordcloud_mask)
        mask_pixels = np.array(Image.open(mask_path))

        # Download font or use the default one
        font_path = get_font(self.font_name)
        if self.allow_font_change:
            logger.info('Using {} font'.format(font_path))

        cloud = WordCloud(
            width=self.width,
            height=self.height,
            font_path=font_path,
            colormap=self.cmap,
            stopwords=self.exclude_words,
            background_color=background_color,
            mode='RGBA',
            mask=mask_pixels,
        )
        cloud.generate(self.text)

        self.make_img_file()
        cloud.to_file(self.img_file)
        self.error_in_wordcloud_gen = None
        self.font_name = None  # reset to default
    except Exception as err:
        self.error_in_wordcloud_gen = err
def generate_word_cloud(img_bg_path,top_words_with_freq,font_path,to_save_img_path,background_color = 'white'):
    """Draw a masked word cloud from word frequencies, show it and save it to a file."""
    # Read the background image used as mask
    img_bg = imread(img_bg_path)

    # Settings of the word cloud object
    settings = dict(
        font_path=font_path,                # font
        background_color=background_color,  # background colour of the image
        max_words=500,                      # at most 500 words are displayed
        mask=img_bg,                        # background image mask
        max_font_size=50,                   # largest font size
        random_state=30,                    # passed through as random_state
        width=1000,                         # image width
        margin=5,                           # spacing between words
        height=700,                         # image height
    )
    wc = WordCloud(**settings)

    # Fill the cloud from top_words_with_freq
    wc.generate_from_frequencies(top_words_with_freq)

    # Draw the cloud with matplotlib and display it
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

    # For a background image with vivid colours, ImageColorGenerator(img_bg)
    # passed to wc.recolor(color_func=...) gives a cloud in similar tones.

    # Save the cloud as an image file
    wc.to_file(to_save_img_path)
def generate_cloud():
    """Build a masked word cloud from ``messages.txt`` and save it as ``masked.jpg``.

    Input text, mask image (``mask.png``) and output file all live in the
    directory of this script.
    """
    d = path.dirname(__file__)
    # The content used to be read into an unused variable while generate()
    # was given the undefined name `text`; read it into `text` and close the
    # file afterwards.
    with open(path.join(d, 'messages.txt')) as messages:
        text = messages.read()
    group_mask = misc.imread(path.join(d, "mask.png"), flatten=True)
    wc = WordCloud(background_color="white", max_words=2000, mask=group_mask)
    wc.generate(text)
    wc.to_file(path.join(d, "masked.jpg"))
def topic_word_cloud(nmf, topic_idx, max_words=300, figsize=(14, 8), width=2400, height=1300, ax=None):
    '''
    Draw the word cloud of one topic
    INPUT:
        nmf: NMFClustering object
        topic_idx: int
        max_words: int
            Upper limit of words shown in the word cloud
        figsize: tuple (int, int)
            Figure size, used only when no axis is passed
        width: int
        height: int
        ax: None or matplotlib axis object
    '''
    cloud = WordCloud(
        background_color='white',
        max_words=max_words,
        width=width,
        height=height,
    )
    # Fit the cloud to the word frequencies of the requested topic
    cloud.fit_words(nmf.topic_word_frequency(topic_idx))

    # Without a caller-supplied axis, make a new figure with a single axis
    if not ax:
        ax = plt.figure(figsize=figsize).add_subplot(111)
    ax.imshow(cloud)
    ax.axis('off')
def make_cloud(words, image, size=10, filename='figures/cloud.png', max_words=200, horizontal=0.8):
    """Draw a logo-shaped, logo-coloured word cloud from ``words`` and save the figure."""
    # Drop the 'RT', 'amp' and 'lt' tokens from the whitespace-split text
    my_stopwords = ['RT', 'amp', 'lt']
    kept = [word for word in words.split() if word not in my_stopwords]
    words_no_urls = ' '.join(kept)

    # Default stop words plus the same three extra tokens
    stopwords = STOPWORDS.copy()
    stopwords.update(("RT", 'amp', 'lt'))

    # The logo serves as both mask and colour source
    logo = imread(image)
    image_colors = ImageColorGenerator(logo)

    # Lay out the cloud
    wc = WordCloud(
        stopwords=stopwords,
        mask=logo,
        color_func=image_colors,
        scale=0.8,
        max_words=max_words,
        background_color='white',
        random_state=42,
        prefer_horizontal=horizontal,
    )
    wc.generate(words_no_urls)

    # Plot and save
    plt.figure(figsize=(size, size))
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(filename)
full_texts.append(tmp)

# write full_texts in txt file, one tweet per line; the context manager
# closes the file even if writing fails
temp_texts = [text + '\n' for text in full_texts]
with open('tweets with tag-{}.txt'.format(tag), 'w', encoding='utf-8') as fout:
    fout.writelines(temp_texts)
print('==== Tweets clawed and saved at root directory ====\n')

# create wordcloud for visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud

## assign font and container attributes
font_path = 'c:\\windows\\fonts\\Roboto-Regular.ttf'
wordcloud = WordCloud(font_path=font_path, width=800, height=800)

## create wordcloud
long_text = ''.join(full_texts)
# Remove the tag from the lower-cased text. The tag is lower-cased too, so it
# can match, and escaped, so it is taken literally rather than as a regex.
long_text = re.sub(re.escape(tag.lower()), '', long_text.lower())
wordcloud = wordcloud.generate(long_text)
fig = plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis("off")

## save it on png
fig.savefig('wordcloud with tag-{}.png'.format(tag))
print('==== WordCloud from tweets generated and saved at root directory ====')
# 1. Read the text data from the xls workbook: the cells of each row are
#    joined with commas and the rows are appended one after another.
data = xlrd.open_workbook('data.xls')
table = data.sheets()[0]
nrows = table.nrows
text = ''.join(','.join(table.row_values(row)) for row in range(nrows))

# 2. Segment with jieba (precise mode by default). A custom dictionary can be
#    added with jieba.load_userdict(file_name), where file_name is a file-like
#    object or the path of the dictionary. Its format matches the default
#    dict.txt: one word per line, made of word, frequency (optional) and part
#    of speech (optional), separated by spaces and in that order.
cut_text = jieba.cut(text)
# The tokens must be joined with a separator to form the string from which
# the cloud is drawn.
result = "/".join(cut_text)
print(result)

# 3. Generate the word cloud. WordCloud does not support Chinese by default,
#    so a previously downloaded Chinese font is required. Without a custom
#    background image the pixel size must be given; the default background is
#    black; a uniform text colour is possible with mode='RGBA' and colormap='pink'.
d = path.dirname(__file__)
round_coloring = imread(path.join(d, "maid.png"))
cloud_options = dict(
    font_path="Yahei.ttf",
    background_color='white',
    width=800,
    mask=round_coloring,
    height=600,
    max_font_size=50,
    max_words=150,
)
wc = WordCloud(**cloud_options)
wc.generate(result)
# Saved at the configured pixel size; sharper than the on-screen display below.
wc.to_file("wordcloud.png")

# 4. Show the picture
plt.figure("词云图")  # name of the figure
plt.imshow(wc)        # show the cloud as an image
plt.axis("off")       # hide the axes
plt.show()
"之下", "一只", "一半", "这个", "便是", "倘若", "突然", "只是", "不敢", "他们", "我们", "见到", "声音", "心想", "如此", "只见", "之中", "不能", "一个", "知道", "什么", "不想", "不是", "甚么", "一声", "咱们", "别人", "一句", "不知" ] # 初始化自定义背景图片 bg_img = "fuyao.jpg" #注图片背景ps成白色 image = Image.open(bg_img) graph = np.array(image) # wordcloud配置 wc = WordCloud( font_path="simhei.ttf", # 设置字体 background_color='white', # 背景颜色 width=image.size[0], # 设置宽,我这里设置和背景图片宽一样 height=image.size[1], # 设置高,我这里设置和背景图片高一样 max_font_size=70, min_font_size=10, # 字体最大/最小值 stopwords=no_name, # 设置停用词,不在词云图中显示 max_words=2000, # 设置最大显示的字数 mode='RGBA') wc.generate(result) # 绘制文字的颜色以背景图颜色为参考 image_color = ImageColorGenerator(graph) # 从背景图片生成颜色值 wc.recolor(color_func=image_color) # 保存图片的名字 img_name = filename[:filename.rfind("."):] + "_词云图" + ".png" # 生成图片 wc.to_file(img_name) # 4、显示图片 plt.figure("词云图") # 指定所绘图名称
detokenized_data.append(t)

dataset['clean_text'] = detokenized_data
documents = dataset['clean_text']

"""# 6. Perform Exploratory Analysis To verify whether the preprocessing happened correctly, we’ll make a word cloud using the wordcloud package to get a visual representation of most common words. It is key to understanding the data and ensuring we are on the right track, and if any more preprocessing is necessary before training the model. """

# Bring in the word cloud class
from wordcloud import WordCloud

# Concatenate all processed documents into one comma-separated string.
long_string = ','.join(documents.values)

# Configure the cloud
cloud_options = dict(
    background_color="white",
    max_words=1000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud = WordCloud(**cloud_options)

# Lay out the words
wordcloud.generate(long_string)

# Render the cloud as an image
wordcloud.to_image()

"""# 7. Create Document-Term Matrix This is the first step towards topic modeling. We need to represent each and every term and document as a vector.We will use sklearn's TfidfVectorizer to create a document-term matrix using only 1000 terms (words) from our corpus. """

# set variables
no_terms = 1000

# NMF uses tf-idf Vectorizer
def generate_ldavis(lda, topic_count, word_count):
    """Print the topics of an LDA model and save a pyLDAvis visualisation.

    Args:
        lda: trained LDA model.
        topic_count: number used in the output file name
            ``visualization_all_<topic_count>.html``.
        word_count: number of words printed per topic.

    Uses the module-level ``doc_term_matrix`` and ``dictionary``.
    """
    print("Latent Dirichlet Allocation......")
    print(lda.print_topics(-1, word_count))
    print_time()
    lda_vis = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary)
    pyLDAvis.save_html(lda_vis, 'visualization_all_' + str(topic_count) + '.html')


start_time = datetime.now()
warnings.filterwarnings("ignore", category=DeprecationWarning)
print("Start time of program: " + str(start_time))

tokenizer = RegexpTokenizer(r'\w+')
stopWords = set(stopwords.words('english'))
wordcloud = WordCloud()
Lda = gensim.models.ldamodel.LdaModel

# Decode the raw CSV bytes as UTF-8 and write the text to the helper file
# 'clear', which is then loaded with pandas.
filename = "../data/fake_or_real_news.csv"
with open(filename, 'rb') as f:
    lines = f.read()
new = str(lines, 'utf-8')
# Write the copy explicitly as UTF-8. With the platform default encoding the
# write can fail, or produce a file pandas cannot read back, on systems whose
# default is not UTF-8.
with open('clear', 'w', encoding='utf-8') as f2:
    f2.write(new)
df = pd.read_csv("clear")
df = df.set_index('Unnamed: 0')

bigram = gensim.models.phrases.Phrases(df.text)
df['text_tokens'] = df.text.apply(process_text)
doc_clean = df.text_tokens
frequency = defaultdict(int)
# Apply the model to the corpus and materialise the per-document results.
lda_corpus = lda[corpus]
lda_corpus
lda_docs = list(lda_corpus)
lda_docs[0:5]
len(lda_docs)

from wordcloud import WordCloud
import matplotlib.colors as mcolors

# One colour per topic; more colours: 'mcolors.XKCD_COLORS'
cols = list(mcolors.TABLEAU_COLORS.values())


def _topic_color(*args, **kwargs):
    # Reads the loop variable `i` at call time, so every cloud is drawn in
    # the colour of the topic currently being generated.
    return cols[i]


cloud = WordCloud(
    stopwords=stop,
    background_color='white',
    width=2500,
    height=1800,
    max_words=10,
    colormap='tab10',
    color_func=_topic_color,
    prefer_horizontal=1.0,
)

topics = lda.show_topics(formatted=False)

fig, axes = plt.subplots(1, 2, figsize=(10, 20), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    current_axes = plt.gca()
    current_axes.imshow(cloud)
    current_axes.set_title('Topic ' + str(i), fontdict=dict(size=16))
    current_axes.axis('off')
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
# "import PIL" alone does not load the Image submodule; importing PIL.Image
# makes PIL.Image.open available regardless of what other packages import.
import PIL.Image
import numpy as np

# Read the source text; the context manager closes the file.
with open("ci.txt", encoding='utf8') as source:
    file_text = source.read()

# Segment in full mode and join the tokens with spaces for WordCloud.
dict_list = jieba.cut(file_text, cut_all=True)
di = " ".join(dict_list)

# Image used as the mask of the cloud.
alice_img = np.array(
    PIL.Image.open("C:\\Users\\Administrator\\Desktop\\timg.jpg"))

wc = WordCloud(width=1920, height=1080, background_color="#fff", margin=2,
               mask=alice_img,
               font_path="C:\\Windows\\Fonts\\msyhbd.ttf").generate(di)

plt.imshow(wc)
plt.axis("off")
plt.show()
# Extra stop words on top of the existing set
stopwords.update((
    "also", "told", "one", "last", "new", "say", "year",
    "will", "yes", "no", "although", "first", "day",
))

# One word cloud per news category
wordcloud1, wordcloud2, wordcloud3, wordcloud4, wordcloud5 = (
    WordCloud(stopwords=stopwords).generate(category_text)
    for category_text in (
        politics_text,
        film_text,
        football_text,
        business_text,
        technology_text,
    )
)

# Show the generated images with matplotlib
import matplotlib.pyplot as plt

for shown_cloud in (wordcloud1, wordcloud2):
    plt.imshow(shown_cloud, interpolation='bilinear')
    plt.axis("off")
plt.imshow(wordcloud3, interpolation='bilinear')
data_pname = data[['pname']] data_requirement = data[['requirement']] data_workplace = data[['workplace']] data_nature = data[['nature']] # pandas 保存txt去除空格 data_pname.to_csv('static/wordcloud_pname.txt', header=None, index=False, sep=" ") file_pname = open('static/wordcloud_pname.txt', 'r', 1, encoding='utf8').read() mytext_pname = " ".join(jieba.cut(file_pname)) wordcloud = WordCloud(font_path='static/SimSun.ttf', background_color="white", width=1000, height=860, margin=2).generate(mytext_pname) wordcloud.to_file('static/img/wordcloud_pname.png') # pandas 保存txt去除空格 data_requirement.to_csv('static/wordcloud_requirement.txt', header=None, index=False, sep=" ") file_requirement = open('static/wordcloud_requirement.txt', 'r', 1, encoding='utf8').read() mytext_requirement = " ".join(jieba.cut(file_requirement)) wordcloud = WordCloud(font_path='static/SimSun.ttf',
import numpy as np
from collections import Counter

# Read the text data; the context manager closes the file.
with open('20190910_101329.csv', "r", encoding="utf-8") as source:
    text = source.read()

jieba.set_dictionary("jieba_dict/dict.txt.big")  # use the Traditional Chinese dictionary

# Load the stop words, one per line, into the `stops` list.
with open("jieba_dict/stopWord_cloud.txt", "r", encoding="utf-8-sig") as f:
    stops = f.read().split("\n")
# A set gives constant-time membership tests in the filter below.
stop_set = set(stops)

# Keep every segmented term that is not a stop word.
terms = [t for t in jieba.cut(text, cut_all=False) if t not in stop_set]
diction = Counter(terms)

font = "msyh.ttc"
mask = np.array(Image.open("Coins.png"))  # shape of the word cloud
# The background colour is black by default; use white instead. An earlier
# WordCloud(font_path=font) that was immediately overwritten has been dropped.
wordcloud = WordCloud(background_color="white", mask=mask, font_path=font)
wordcloud.generate_from_frequencies(frequencies=diction)  # build the word cloud

# Draw the picture
plt.figure(figsize=(6, 6))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

wordcloud.to_file("news_Wordcloud.png")
def write():
    """Render the per-user dashboard page in Streamlit.

    Asks for a username and loads ``<username>.csv``. When the file can be
    read, the page plays the theme song and shows three columns: a
    week/weekday heatmap of the scores, a word cloud built from all journal
    sentences, and a keyword search over those sentences.

    The imports are local so that importing this module does not load the
    plotting stack for every page.
    """
    import streamlit as st
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sea
    from wordcloud import WordCloud

    username = st.text_input("Enter username here:")
    if not username:
        return

    # NOTE(review): the username is used verbatim as a file name, so a value
    # containing path separators can reach files outside this directory.
    try:
        df = pd.read_csv(username + ".csv")
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; pandas' empty-file and
        # parser errors derive from ValueError.
        st.write("username doesn't exist!")
        return

    # plays song on the beach
    # eventually I should let this personalize
    with open('songonthebeach.ogg', 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/ogg')

    df = df.dropna()
    df.columns = ["score", "sentence", "date"]
    score = df["score"]

    # recent = the most recent score
    try:
        # Positional lookup: after dropna() the index labels can have gaps, so
        # the label-based score[len(score) - 1] could miss or pick another row.
        recent = score.iloc[-1]
    except IndexError:
        st.write("Your username exists but we didn't save your score. Sorry about that! Please insert your journal entry again and press save my score again to save it officially. This is a known bug thart occurs when a username is first created, but not after!")
    else:
        if recent == 0:
            st.write("Days like these come, and it's perfectly fine to be upset when difficulties arise. What you should remember is that days like these pass too, and that even when these times are dark, you still have friends, family, external resources to reach out too. Check out the resources tab for ways you can improve now.")

    # below are placeholders for personalized notes. should add functionality for this l8r
    # if recent == 2:
    #     st.write("You're doing well today. I hope you keep up the progress.")
    # if recent == 1:
    #     st.write("You're not feeling so great today, and that's okay. Know I'll always care about you.")
    # code where if the last five have been super happy play Photograph

    # st.beta_columns was renamed to st.columns in later Streamlit releases;
    # use whichever one this installation provides.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2, col3 = make_columns(3)

    # need to make this graph look better. should add a time slider too.
    # would be cool if when a person hovers over a point they see the journal entry for it.
    with col1:
        fig, ax = plt.subplots()
        # Copy first so the new columns are not written into a slice of df.
        df2 = df[["score", "date"]].copy()
        df2["date"] = pd.to_datetime(df2["date"])
        df2['week_num'] = df2['date'].dt.strftime("%W")
        df2['day_num'] = df2['date'].dt.weekday
        df_wide = df2.pivot_table(index='week_num', columns='day_num', values='score')
        sea.heatmap(df_wide, ax=ax)
        st.pyplot(fig)

    # have to take down this labelled_journal_entries csv before we release.
    # If not, we release a lot of people's personal data.
    # need to add a slider for time here. maybe for mood too.
    with col2:
        wordcloud2 = WordCloud(background_color='white').generate(' '.join(df['sentence']))
        fig, ax = plt.subplots()
        ax.imshow(wordcloud2)
        ax.axis("off")
        st.pyplot(fig)

    # need to add sentiment-dependent emojis to output searches
    with col3:
        word = st.text_input("Input word you want to search for")
        if len(word) > 2:
            entries = ' '.join(df['sentence'])
            # Show the last sentence that contains the word (a blank if none).
            match = " "
            for sentence in entries.split('.'):
                if word in sentence:
                    match = sentence
            st.markdown(match)
if word_polarity > 0.25: positive.append(word) else: neutral.append(word) positive_count = {} positive_count2 = {} for word in positive: positive_count[word] = positive_count.get(word, 0) + 1 for word, count in positive_count.items(): if count < 2: continue else: positive_count2[word] = count word_cloud = WordCloud().generate_from_frequencies(positive_count2) plt.imshow(word_cloud, interpolation = 'bilinear') plt.show() neutral_count = {} neutral_count2 = {} for word in neutral: neutral_count[word] = neutral_count.get(word, 0) + 1 for word, count in neutral_count.items(): if count < 2: continue else: neutral_count2[word] = count word_cloud = WordCloud().generate_from_frequencies(neutral_count2) plt.imshow(word_cloud, interpolation = 'bilinear')
(WIDTH, HEIGHT, RESOLUTION) = (3840, 2160, 500) ############################################################################## # Read artists file ############################################################################## data = pd.read_csv(stp.DATA_PATH + stp.USR + '_cln.csv', parse_dates=[3]) songs = sorted(data.get('Song').unique()) songCount = data.groupby('Song').size().sort_values(ascending=False) ############################################################################## # Wordcloud ############################################################################## wordcloudDef = WordCloud(width=WIDTH, height=HEIGHT, max_words=2000, relative_scaling=1, min_font_size=12, background_color='Black', colormap='Purples', font_path=stp.FONT) wordcloud = wordcloudDef.generate_from_frequencies(songCount) ax1 = plt.axes(frameon=False) plt.figure(figsize=(20, 20 * (HEIGHT / WIDTH)), facecolor='k') plt.imshow(wordcloud, interpolation='bilinear') plt.tight_layout(pad=0) plt.axis("off") plt.savefig(stp.IMG_PATH + '/SNG_WDC.png', dpi=RESOLUTION, facecolor='k', edgecolor='w', orientation='portrait', papertype=None,
# Read the input text; the context manager closes the file afterwards.
with open(path.join(d, 'stopwords.txt'), encoding='utf-8') as text_file:
    text = text_file.read()

# For Chinese input, segment with jieba first (Chinese is hard to tokenize).
# text = processChinese(text)

# read the mask / color image
# taken from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
# Set the background image.
back_coloring = np.array(Image.open('/Users/xxxx1/Pictures/alice.jpeg'))

wc = WordCloud(font_path="/System/Library/Fonts/STHeiti Light.ttc",
               background_color="white",  # background colour
               max_words=2000,  # maximum number of words shown in the cloud
               mask=back_coloring,  # background image used as the mask
               # max_font_size=100,  # largest font size
               # random_state=42,
               )

# Build the cloud. generate() takes the whole text (Chinese is hard to
# segment); alternatively compute the word frequencies yourself and call
# generate_from_frequencies().
wc.generate(text)
# wc.generate_from_frequencies(txt_freq)
# txt_freq example: [('词a', 100),('词b', 90),('词c', 80)]

# Derive the colour values from the background image.
image_colors = ImageColorGenerator(back_coloring)

# plt.figure(figsize=(8,6), dpi=800)
# The code below displays the image.
# plt.imshow(wc)
# plt.axis("off")
keiyou_count = keiyou_count + 1 keiyou_list.append(nodes.surface) else: pass nodes = nodes.next text = "" + " ".join(meishi_list) + " ".join(doushi_list) + " ".join( keiyou_list) # In[27]: #WCの下ごしらえ stop_words = ["https", "co", "てる", "する", "そう", "すぎ", "いい", "さん", "こと"] fpath = "/Library/Fonts//ヒラギノ丸ゴ ProN W4.ttc" wc = WordCloud(font_path=fpath, background_color="white", max_words=2000, collocations=False, stopwords=set(stop_words)) # WordCloudの実行 wc.generate(text) wc.to_file("word_cloud.png") plt.figure(figsize=(15, 12)) plt.imshow(wc) plt.axis("off") plt.show()
# Omit unnecessary words if token.base_form not in [ "こと", "よう", "そう", "これ", "それ", "本田", "カードバトル", "pepsi", "フォロー", "jpn", "バトル", "コイン", "ペプシ", "フォロー", "RT", "ジャパン", "コーラ", "カード", "プレゼント", "毎日", "挑戦", "ケース", "記念" ]: words_count[token.base_form] += 1 words.append(token.base_form) return words_count, words with open('data/twitter_list.txt', 'r') as f: reader = csv.reader(f, delimiter='\t') texts = [] for row in reader: if (len(row) > 0): text = row[0].split('http') texts.append(text[0]) words_count, words = counter(texts) text = ' '.join(words) # My font in fontbook fpath = "~/Library/Fonts/Ricty-Bold.ttf" wordcloud = WordCloud(background_color="White", font_path=fpath, width=900, height=500).generate(text) wordcloud.to_file("./wordcloud.png")
# Work done to increase the number of Korean words Python can recognise.
# word.txt holds words that are likely to appear in the reviews.
with open('d://project//word.txt', 'r', encoding='utf-8') as file:
    word = file.read().split(' ')  # split word.txt on spaces
for i in word:  # take the entries one at a time
    # Remove each entry literally. re.sub() treated the entry as a regular
    # expression, so an entry containing ".", "?", "(" ... matched other text
    # or raised re.error.
    text = text.replace(i, '')
print(text)
# Words that are frequent in ordinary sentences cannot all be handled by hand,
# so the loop above replaces every one of them with ''.

# Draw the word cloud.
wordcloud = WordCloud(
    font_path='d://Windows//Fonts//gulim',  # font
    stopwords=STOPWORDS,  # words to leave out of the cloud
    max_words=1000,  # maximum number of words drawn
    background_color='white',  # background colour
    max_font_size=100,  # largest font size
    min_font_size=1,  # smallest font size
    mask=usa_mask,  # shape of the cloud
    colormap='jet').generate(text).to_file('d://project//digimon_cloud.png')
# The image is written to the project folder under the name given above.

plt.figure(figsize=(15, 15))  # width x height: 15 x 15
plt.imshow(wordcloud, interpolation='bilinear')  # smoothing applied to the text
plt.axis("off")

#%%
# Q. Print only the reviews whose rating is 6 or higher from the La La Land review txt.
with open("d:\\data\\lalaland.txt", encoding="UTF8") as stev:
    stev2 = stev.readlines()  # list with one entry per line of the file
filtered_sent.append(review) from sklearn.feature_extraction.text import TfidfVectorizer tf = TfidfVectorizer() text_tf = tf.fit_transform(filtered_sent) feature_names = tf.get_feature_names() dense = text_tf.todense() denselist = dense.tolist() df = pd.DataFrame(denselist, columns=feature_names) #plotting wordcloud on TFIDF from wordcloud import WordCloud import matplotlib.pyplot as plt cloud = ' '.join(df) wordcloud = WordCloud(background_color='black', width=1800, height=1400).generate(cloud) plt.imshow(wordcloud) ##Importing positive words to plot positive word cloud with open("E:\\Assignment\\11) Text mining\\positive-words.txt", "r") as pos: poswords = pos.read().split("\n") poswords = poswords[36:] pos_words = ' '.join([w for w in df if w in poswords]) cloud_pos = WordCloud(background_color='black', width=1800, height=1400).generate(pos_words) plt.imshow(cloud_pos)
# Polarity histogram (its title, if any, is set before this excerpt).
plt.xlabel('Polarity')
plt.ylabel('Rate')
plt.grid(True)
plt.hist(polarity, bins=5)
plt.axis([-1.00, 1.00, 0, 60])
plt.show()

# Subjectivity histogram, same binning and axis limits.
plt.title('Sub Histogram')
plt.xlabel('Subjectivity')
plt.ylabel('Rate')
plt.grid(True)
plt.hist(subjectivity, bins=5)
plt.axis([-1.00, 1.00, 0, 60])
plt.show()

# Polarity against subjectivity.
plt.scatter(polarity, subjectivity)
plt.show()

# One comma-separated string holding the text of every tweet.
all_tweets = ', '.join(entry['text'] for entry in tweetData)
tb = TextBlob(all_tweets)
#filtered_words = []
print(all_tweets)
print(str(tweet))

# Word cloud over the combined tweet text.
wordcloud = WordCloud().generate(all_tweets)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
#generate_from_text(text)
""" import os from os import path from wordcloud import WordCloud # get data directory (using getcwd() is needed to support running example in generated IPython notebook) d = path.dirname(__file__) if "__file__" in locals() else os.getcwd() # Read the whole text. #text = open(path.join(d, 'constitution.txt')).read() text = open(path.join(d, 'garda.csv')).read() # Generate a word cloud image wordcloud = WordCloud().generate(text) # Display the generated image: # the matplotlib way: import matplotlib.pyplot as plt plt.imshow(wordcloud, interpolation='bilinear') plt.axis("off") # lower max_font_size wordcloud = WordCloud(max_font_size=40).generate(text) plt.figure() plt.imshow(wordcloud, interpolation="bilinear") plt.axis("off") plt.savefig('cloud.png') plt.show()
from wordcloud import WordCloud
import jieba

# Read the raw bytes inside a context manager so the file gets closed.
with open("fulian3.txt", "rb") as source_file:
    text = source_file.read()

# jieba segmentation (full mode)
wordlist = jieba.cut(text, cut_all=True)
wl = " ".join(wordlist)
# print(wl)  # print the segmented text

# Write the segmented text to a text file
# fenciTxt = open("fenciHou.txt","w+")
# fenciTxt.writelines(wl)
# fenciTxt.close()

# Configure the word cloud
wc = WordCloud(
    background_color="black",  # background colour
    # mask = "图片",  # background image
    max_words=2000,  # maximum number of words shown
    # stopwords = "",  # stop words
    # Font with Chinese glyphs so the cloud can render them (the original
    # author notes that the library's default font does not support Chinese).
    font_path="fangsong_GB2312.ttf",
    max_font_size=50,  # largest font size
    random_state=30,  # fixed random state for the generator
)
myword = wc.generate(wl)  # build the cloud

# Show the word cloud
plt.imshow(myword)
plt.axis("off")
plt.show()
# Strip every run of digits from the text.
text = re.sub("[0-9]+", '', text)
logger.info('Words')
print(text)

#####################
# Make Word Cloud
#####################
# Add the extra stop words to the shared STOPWORDS set, then work on a copy.
extra_stopwords = EXTRA_STOPWORDS
STOPWORDS.update(extra_stopwords)
stopwords = set(STOPWORDS)

wc = WordCloud(
    background_color="white",
    max_words=2000,
    stopwords=stopwords,
    mode="RGBA",
    colormap='BuPu',
)
# generate word cloud
wc.generate(text)
# store to file
wc.to_file(FILENAME)

# Send the saved picture to the Telegram channel.
n = utils.Notify()
telegram_payload = {
    'chat_id': '@whalepoolbtcfeed',
    'message': KEYWORD + ' related google trends for the last 7 days',
    'picture': FILENAME
}
n.telegram(telegram_payload)
print('Saved: ' + FILENAME)
from wordcloud import WordCloud
import matplotlib.pyplot as plt

votes = getVoteData()
# One word cloud (and one top-20 listing) per distinct value of 'Vote'.
for name, group in votes.groupby('Vote'):
    print(name)
    speeches = group['Discurso']
    # Join once instead of growing a string inside a loop.
    concatSpeeches = ''.join(' ' + sanitizeString(str(speech)) for speech in speeches)
    speech_words = concatSpeeches.split()
    nWord = len(speech_words)
    mostCommonWords = Counter(speech_words).most_common()
    # Slice instead of indexing 0..19: a group with fewer than 20 distinct
    # words used to raise IndexError.
    for common_word, occurrences in mostCommonWords[:20]:
        print(common_word, occurrences / nWord)
    # NOTE(review): str.replace() also removes 'nao' / 'sim' inside longer
    # words; confirm that is acceptable or filter whole words instead.
    wordcloud = WordCloud(max_font_size=40, relative_scaling=.5, background_color='white', max_words=50).generate(
        concatSpeeches.replace('nao', '').replace('sim', ''))
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    # str() so a non-string group key does not break the file name.
    plt.savefig('Temp/WordCloud_' + str(name) + '.png')
    plt.close()
print('불필요한 키워드 제거중...')
# Drop every unwanted keyword that appears as a space-delimited word.
for d in rmkeys:
    data = data.replace(' ' + d + ' ', ' ')

# Blank out markup characters, URL prefixes and board signatures, in this
# order (the specific URLs come before the bare "https://" prefix).
for junk in (
        '>',
        '<',
        'https://www.youtube.com/watch?v=',
        'https://youtu.be/',
        '- dc official App',
        '- 라하마갤 와주는데스http://gall.dcinside.com/loudhouse',
        'https://gall.dcinside.com/mgallery/board/view/?id=',
        'https://gall.dcinside.com/board/lists/?id=',
        'https://',
):
    data = data.replace(junk, ' ')

print('워드클라우드 생성 중...')
wc_title = WordCloud(
    font_path='font.otf',
    width=2000,
    height=1800,
    background_color='white',
    collocations=False,
    max_words=2000,
).generate(data)

print('이미지 저장 중...')
wc_title.to_file('wordcloud.png')

# (word, weight) pairs from the cloud, heaviest first.
hk = sorted(wc_title.words_.items(), key=(lambda x: x[1]), reverse=True)
#hotkey = hk[0][0] + ', ' + hk[1][0] + ', ' + hk[2][0] + ', ' + hk[3][0] + ', ' + hk[4][0]
#print('핵심 키워드:', hotkey)
#pkeys = ''
#for s in hk: pkeys += s[0] + '\n'

# One str() rendering of each pair per line.
keys = ''.join(str(k) + '\n' for k in hk)
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from matplotlib.colors import LinearSegmentedColormap as lsc

matplotlib.rcParams['figure.figsize'] = (16.0, 9.0)

# Read the text inside a context manager so the file gets closed.
with open("spec.txt") as script_file:
    script = script_file.read()
stopwords = set(STOPWORDS)
bond = np.array(Image.open("images.jpg"))  # image used as the cloud's mask

# Custom colormap interpolated between these four colours.
colors = ["#000000", "#0060A8", "#484848", "#FFF200"]
cmap = lsc.from_list("mycmap", colors)

wc = WordCloud(background_color="white", stopwords=stopwords, mask=bond,
               width=1987, height=787, colormap=cmap)
wc.generate(script)

plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the whole file as text; the context manager closes it afterwards.
with open('debate.csv', 'r') as debate_file:
    text = debate_file.read()

wordcloud = WordCloud(max_font_size=100, width=1520, height=535).generate(text)

plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
Counter(words).most_common()
# Every word counted more than three times is treated as a stop word.
stop_words = [word for word, count in Counter(
    words).most_common() if count > 3]

# Recreate document-term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean.transcript)
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_stop.index = data_clean.index

# Pickle it for later use; the context manager flushes and closes the file.
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("dtm_stop.pkl")

wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

plt.rcParams['figure.figsize'] = [16, 6]

names = ['bird-and-whale', 'chick-little', 'goldilocks',
         'threepigs', 'petitchaperonrouge', 'uglyduckling']

# Create subplots for each comedian
# for index, name in enumerate(data.columns):
#     wc.generate(data_clean.transcript[name])
#
#     plt.subplot(3, 4, index+1)
#     plt.imshow(wc, interpolation="bilinear")
#     plt.axis("off")
#     plt.title(names[index])
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import random
import tkinter as tk  # the module is named "tkinter"; "TKinter" cannot be imported


def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random grey as an HSL colour string.

    Passed to ``recolor()`` as ``color_func`` below. Every argument is
    ignored; the lightness is drawn from the module-level ``random``
    generator, between 10% and 50% inclusive.
    """
    return "hsl(0, 0%%, %d%%)" % random.randint(10, 50)


# Read the text inside a context manager so the file gets closed.
with open('/Users/Tristan/Downloads/const.txt') as f:
    text = f.read()

font_path = '/Users/Tristan/books/datasets/Open_Sans_Condensed/' + 'Open_Sans_Condensed/OpenSansCondensed-Light.ttf'

stopWords = set(stopwords.words('english'))

# w = WordCloud()
w = WordCloud(stopwords=stopWords, background_color='white', min_font_size=14,
              max_words=1000, font_path=font_path, normalize_plurals=True)
wordcloud = w.generate(text)
wordcloud.recolor(color_func=grey_color_func)

# plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('/Users/Tristan/books/datasets/output/wordclouds/cloud2.png')
plt.close()
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Read the document inside a context manager so the file gets closed.
with open('Ev_Scan.txt', 'r') as f:
    document = f.read()
print(len(document))  # number of characters read

wordcloud = WordCloud(width=600, height=480, colormap="Oranges_r").generate(document)

# Show the cloud edge to edge: no axes, no margins, no padding.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.tight_layout(pad=0)
plt.show()
# because the interactive session continues into the self check. # Loading the Text from pathlib import Path text = Path('RomeoAndJuliet.txt').read_text() # Loading the Mask Image that Specifies the Word Cloud’s Shape import imageio mask_image = imageio.imread('mask_heart.png') # Configuring the WordCloud Object from wordcloud import WordCloud wordcloud = WordCloud(width=1000, height=1000, colormap='prism', mask=mask_image, background_color='white') # Generating the Word Cloud wordcloud = wordcloud.generate(text) # Saving the Word Cloud as an Image File wordcloud = wordcloud.to_file('RomeoAndJulietHeart.png') %matplotlib import matplotlib.pyplot as plt plt.imshow(wordcloud) # Section 12.3.2 Self Check snippets