def gen_wordcloud(text, filename):
    # 1). Boost words that jieba tends to split incorrectly:
    # jieba.suggest_freq(('微博'), True)
    # jieba.suggest_freq(('热搜'), True)
    # 2). Key step: segmenting Chinese text -- jieba, lcut
    result = jieba.lcut(text)
    # print(result)
    # Draw the word cloud
    # 3). Open the mask image and read its pixel data
    imgObj = Image.open('./doc/wordcloud.jpg')
    img_mask = np.array(imgObj)
    # print(img_mask)
    # 4). Create the word-cloud object and set its attributes
    wcObj = wordcloud.WordCloud(
        mask=img_mask,                # shape the words are filled into
        background_color="snow",      # background color
        font_path="/usr/share/fonts/wqy-zenhei/wqy-zenhei.ttc",  # Chinese text needs a CJK font (fc-list :lang=zh)
        min_font_size=5,              # smallest font size in the image
        max_font_size=50,             # largest font size in the image
        width=1000,                   # image width
        height=1000,                  # image height
    )
    # 5). Generate the image.
    # generate() only handles strings, so the segmented words are joined with a comma separator.
    wcObj.generate(",".join(result))
    wcObj.to_file(filename)
    print("生成图片%s成功......." % filename)
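# A minimal sketch expanding on the commented-out suggest_freq() hints in gen_wordcloud
# above: jieba sometimes splits terms such as '微博' or '热搜' into single characters,
# and jieba.suggest_freq(word, tune=True) raises their frequency so lcut() keeps them
# whole. The sample sentence below is only an illustration.
import jieba

jieba.suggest_freq('微博', tune=True)
jieba.suggest_freq('热搜', tune=True)
print(jieba.lcut('今天的微博热搜'))  # '微博' and '热搜' should now stay intact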
def generate_ciyun():
    try:
        # Join the pre-segmented news content into one string
        string = " ".join(get_news_content())
        print(string)
        # string = " ".join(txtlist)
        # Build and configure the word-cloud object w
        w = wordcloud.WordCloud(width=1000, height=700,
                                background_color='black',
                                font_path='/Library/Fonts/华文行楷.ttf')
        # Feed the text into the word cloud via generate()
        w.generate(string)
        # If a previous word-cloud file exists, delete it
        my_file = 'key_word_ciyun.png'
        if os.path.exists(my_file):
            # Either of the following calls removes the file.
            os.remove(my_file)
            # os.unlink(my_file)
        else:
            print('no such file:%s' % my_file)
        # Export the word-cloud image to the current folder
        w.to_file('key_word_ciyun.png')
    except Exception as e:
        print(e)
def create_word_cloud(f):
    f = remove_stop_words(f)
    cut_text = nltk.word_tokenize(f)
    cut_text = " ".join(cut_text)
    wc = wordcloud.WordCloud(max_words=100, width=2000, height=1200)
    wc.generate(cut_text)
    wc.to_file("WordCloud.jpg")
def draw_wordcloud(self):
    # texts = self.find_freq()
    texts = self.remove_stopword()
    wcloud = wordcloud.WordCloud('./data/D2Coding.ttf',
                                 relative_scaling=0.2,
                                 background_color='white').generate(" ".join(texts))
    plt.figure(figsize=(12, 12))
    plt.imshow(wcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
def get_and_show_keywords(total):
    keywords_list = []
    for item in total['keywords']:
        keywords_list.append(item)
    keywords_list = '|'.join(keywords_list)
    wc = wordcloud.WordCloud(background_color='black', max_words=3000,
                             scale=1.5).generate(keywords_list)
    plt.figure(figsize=(14, 8))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def wordcloudshow(keywordsfrequency):
    # imread is assumed to come from imageio or matplotlib.pyplot; its import is not shown in this snippet.
    background_image = imread("heart.jpg")
    wordcloudobject = wordcloud.WordCloud(
        font_path="/library/fonts/microsoft/simsun.ttf",
        mask=background_image,
        background_color="white",
        max_font_size=300,
        random_state=30).generate_from_frequencies(keywordsfrequency, 200)
    # Recolor the words using the colors of the mask image
    image_colors = ImageColorGenerator(background_image)
    wordcloudobject.recolor(color_func=image_colors)
    plt.imshow(wordcloudobject)
    plt.axis("off")
    plt.show()
def createWordCloud(data, font_path, mask_path, background_color, max_words):
    # "-1" is used as a sentinel for "not specified"
    if background_color == "-1":
        background_color = "white"
    if max_words == "-1":
        max_words = 300
    if font_path == "-1" and mask_path == "-1":
        w = wordcloud.WordCloud(background_color=background_color,
                                max_words=int(max_words))
    elif font_path == "-1" and mask_path != "-1":
        mask = imread(mask_path)
        w = wordcloud.WordCloud(background_color=background_color, mask=mask,
                                max_words=int(max_words))
    elif font_path != "-1" and mask_path == "-1":
        w = wordcloud.WordCloud(background_color=background_color,
                                font_path=font_path, max_words=int(max_words))
    else:
        mask = imread(mask_path)
        w = wordcloud.WordCloud(background_color=background_color, mask=mask,
                                font_path=font_path, max_words=int(max_words))
    w.generate(data['DM'])
    timec = time.ctime()
    timec = timec.replace(":", "-")
    save_path = ("【" + data['ownerName'] + "】" + data['videoName'] +
                 " @BV:" + data['bvid'] + " @时间:" + str(timec) +
                 "(词量:" + str(max_words) + " ,分词模式:" + data['mode'] + ")" + ".png")
    # Strip characters that are not allowed in Windows file names
    blacklist = r"[\/\\\:\*\?\"\<\>\|]"
    save_path = ".\\" + re.sub(blacklist, "", save_path)
    w.to_file(save_path)
def wordcloud_plot(self, txtfile, imgfile, para):
    # txtfile = "./text/我的孤独是一座花园.txt"
    # imgfile = "./images/动物/1225574.png"
    self.bct = barchart()
    self.bct.wordfreqsum(txtfile)
    self.wfreq = self.bct.wfreq  # word-frequency dictionary
    txt = open(txtfile, encoding='UTF-8').read()
    txtlist = jieba.lcut(txt)
    string = " ".join(txtlist)
    para['stopwords'] = jieba.lcut(para['stopwords'])
    mk = imageio.imread(imgfile)
    # Configure the word-cloud parameters
    self.wc = wordcloud.WordCloud(
        width=para["width"],
        height=para['height'],
        background_color='white',
        font_path=para['font_path'],
        mask=mk,
        max_words=para['number'],
        scale=para['scale'],
        stopwords=para['stopwords'],
        contour_width=para['contour_width'],
        relative_scaling=para['relative_scaling'],
        colormap=para['colormap'])  # matplotlib colormap
    if para['swf'] == 0:
        self.wc.generate(string)  # generate first, recolor later if needed
    else:
        print(self.wfreq)
        self.wc.generate_from_frequencies(self.wfreq)
    # Clear the previous image
    self.axes.clear()
    if para['tc'] == 0:
        self.axes.imshow(self.wc)
    else:
        image_colors = ImageColorGenerator(mk)
        wc_color = self.wc.recolor(color_func=image_colors)
        self.axes.imshow(wc_color)
    self.draw()
def buildTags2WordCloud(self, tags):
    text = {}
    for (word, flag), weight in tags:
        # Collect words by part-of-speech tag: n = noun, ns = place name, vn = verbal noun
        if 'n' == flag:
            self.n_s.append(word)
        elif 'ns' == flag:
            self.ns_s.append(word)
        elif 'vn' == flag:
            self.vn_s.append(word)
        text[word] = weight
    w = wordcloud.WordCloud(
        font_path='/usr/share/fonts/truetype/SIMKAI.TTF',
        width=1000,
        height=700,
        background_color='white',
        color_func=self.__color_fun__
    )
    w.generate_from_frequencies(text)
    from spider.io import settings
    cloud_img = settings.FILE_ROOT_PATH + str(hash(self.user_url)) + '.png'
    w.to_file(cloud_img)
    return cloud_img
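# buildTags2WordCloud above relies on self.__color_fun__, which is not shown in this
# snippet. As a rough sketch of the callback shape the wordcloud library expects
# (the hue/saturation values below are illustrative assumptions, not the original
# implementation): the callable is invoked with keyword arguments such as word,
# font_size, position, orientation, font_path and random_state, and returns a
# PIL-compatible color string.
import random


def example_color_func(word=None, font_size=None, position=None,
                       orientation=None, font_path=None, random_state=None):
    # Random hue, fixed saturation and lightness (illustrative values).
    return "hsl({}, 80%, 45%)".format(random.randint(0, 255))

# Usage sketch: wordcloud.WordCloud(color_func=example_color_func, ...)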
import json

import jieba
from wordcloud import wordcloud

if __name__ == '__main__':
    key = '1'
    with open('pure_json_data\\' + key + '.json', 'r', encoding="utf-8") as f:
        data = json.loads(f.read())
    with open('other_data\\a', 'r', encoding="utf-8") as sf:
        stopWords = sf.read().split("\n")
    all_contents = ''
    for i in data:  # put all comments together
        for content in i['content']:
            all_contents += content
    word = jieba.cut(all_contents)  # use the jieba API to split the text into words
    presentive = []
    for i in word:
        if i not in stopWords:
            presentive.append(i)
    wc = wordcloud.WordCloud(font_path='msyh', width=1920, height=1080,
                             background_color='white')
    wc.generate(" ".join(presentive))
    wc.to_file('pic\\' + key + '_period(word).png')
import jieba
import wordcloud
from matplotlib import pyplot as plt

# 1. read the text
test = open("test.txt", 'r', encoding='utf-8').read()
print(test)
# 2. segment the words
cut_test = jieba.cut(test)
# 3. join them with spaces
res = ' '.join(cut_test)
# 4. generate the word cloud
wc = wordcloud.WordCloud(
    font_path='迷你简太极.ttf',
    background_color='white',      # background color
    width=1000,
    height=600,
    max_font_size=50,              # maximum font size
    min_font_size=10,
    mask=plt.imread('cloud.jpg'),  # mask/background image
    max_words=1000)
wc.generate(res)
wc.to_file('wc_res.png')
# 5. display the image
plt.figure('wc_res.png')  # window title for the figure
plt.imshow(wc)
plt.axis('off')  # hide the axes
plt.show()
# Word-frequency statistics
word_counts = collections.Counter(object_list)   # count the segmented words
word_counts_top10 = word_counts.most_common(15)  # take the 15 most frequent words
print(word_counts_top10)                         # print them as a sanity check
word_counts_top10 = str(word_counts_top10)

# Word-frequency display
mask = np.array(Image.open('background.jpg'))    # mask image for the cloud
img_colors = ImageColorGenerator(mask)           # color generator built from the mask image
wc = wordcloud.WordCloud(
    font_path='simfang.ttf',    # font
    mask=mask,                  # mask image
    max_words=200,              # maximum number of words shown
    max_font_size=180,          # maximum font size
    background_color='white',
    width=640,
    height=480,
    scale=0.6,
    colormap='binary',
)
wc.generate_from_frequencies(word_counts)  # build the cloud from the frequency dict
# wc.recolor(color_func=img_colors)        # recolor using the mask image's colors
plt.imshow(wc)    # show the word cloud
plt.axis('off')   # hide the axes
plt.show()        # display the figure
wc.to_file('wordcloud.png')
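# object_list, consumed by collections.Counter() in the snippet above, is not defined
# there; it is presumably the list of segmented words with stop words already removed.
# A minimal sketch of how it could be built (the input file name and the tiny stop-word
# set are assumptions for illustration only):
import collections
import jieba

raw_text = open('comments.txt', encoding='utf-8').read()  # hypothetical input file
stop_words = {'的', '了', '是', ' ', '\n'}                  # tiny illustrative stop-word set
object_list = [w for w in jieba.cut(raw_text) if w not in stop_words]
word_counts = collections.Counter(object_list)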
from selenium import webdriver
import sys
import numpy as np
from PIL import Image
import time
from wordcloud import wordcloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

driver = webdriver.Chrome()
driver.get('https://www.youtube.com/watch?v=9eX-HRFwCnU')
driver.execute_script('window.scrollTo(1, 500);')
# wait so the comments have time to load
time.sleep(10)
driver.execute_script('window.scrollTo(1, 3000);')
comment_div = driver.find_element_by_xpath('//*[@id="contents"]')
comments = comment_div.find_elements_by_xpath('//*[@id="content-text"]')

mimg = np.array(Image.open("C:/Users/Sony/Downloads/images (1).jpg"))
mcolour = ImageColorGenerator(mimg)

# join the comments with spaces so words from adjacent comments do not merge
text = " ".join(r.text for r in comments)
wordcloud1 = wordcloud.WordCloud(stopwords=set(STOPWORDS), background_color="white",
                                 mask=mimg, contour_width=2, contour_color='yellow',
                                 max_font_size=40)
wordcloud1.generate(text)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud1.recolor(color_func=mcolour), interpolation='bilinear')
plt.axis("off")
plt.show()
import jieba
from PIL import Image
from wordcloud import wordcloud
import matplotlib.pyplot as plt
import numpy as np

# English word cloud
wc = wordcloud.WordCloud()
words = wc.generate("Choose a life of action, not one of ostentation.")
wc.to_file("./picture/英文词云.png")

# Chinese word cloud
wc = wordcloud.WordCloud(font_path='C:/Windows/Fonts/simhei.ttf')
text = "今天是个好日子"
cut_text = jieba.cut(text)   # segment the text
cuted = ' '.join(cut_text)   # put spaces between the words
words = wc.generate(cuted)
wc.to_file("./picture/中文词云.png")

# Word cloud with a custom shape
text = open("./Data/微博评论数据女排20191230.csv", 'r', encoding='UTF-8').read()
words_cuted = jieba.cut(text)
results = " ".join(words_cuted)
wc = wordcloud.WordCloud(mask=np.array(Image.open("./picture/china.jpg")),
                         font_path="C:\\Windows\\Fonts\\msyh.ttc",
                         background_color='white').generate(results)
# Show the word cloud
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
wc.to_file("./picture/形状词云.png")
word_list = []
for sent in resl:
    word_list.append(sent)

# Words that are too frequent (>200) or too rare (<10) are dropped as noise
stop_list = []
stop = collections.Counter(word_list)
for x in stop.keys():
    if stop[x] > 200:
        stop_list.append(x)
    elif stop[x] < 10:
        stop_list.append(x)
for st in stop_list:
    stop.pop(st)

# Generate the word cloud
cloud = wordcloud.WordCloud(
    font_path="./q.ttf",
    background_color='black',
    max_words=400,
    # largest font size; if not set it defaults to the image height
    max_font_size=100,
    # canvas width and height; they have no effect when a mask is set
    width=600,
    height=400,
    margin=2,
    # probability of laying words out horizontally (default 0.9, i.e. 0.1 vertically)
    prefer_horizontal=0.8)
wc = cloud.generate_from_frequencies(stop)
# news = ts.guba_sina(show_content=True)
# print(news.ix[3])

plt.imshow(wc)
plt.axis('off')  # hide the axes
# plt.figure(dpi = 600)
# image_colors = ImageColorGenerator(color_mask)   # color_mask is not defined in this snippet
# plt.imshow(wc.recolor(color_func=image_colors))  # recolor using the mask colors
plt.show()
import jieba                           # Chinese word segmentation
import numpy as np                     # matrix operations
from matplotlib import pyplot as plt   # plotting / data visualization
from wordcloud import wordcloud        # word cloud
from PIL import Image                  # image handling
import sqlite3

# Read all movie introductions from the database
conn = sqlite3.connect('movie250.db')
cs = conn.cursor()
sql = 'select introduction from movie'
data = cs.execute(sql)
text = ''
for item in data:
    text = text + item[0]
    # print(item)
# print(text)
cs.close()
conn.close()

# Segment the text and join the words with spaces
cuts = jieba.cut(text)
strs = ' '.join(cuts)
print(len(strs))

img = Image.open(r'kk2.jpeg')
img_arr = np.array(img)  # convert the image to an array
wc = wordcloud.WordCloud(background_color='white',
                         mask=img_arr,
                         font_path='msyh.ttc').generate_from_text(strs)

fig = plt.figure(1)
plt.imshow(wc)
plt.axis('off')
plt.show()
# parts = re.split('[^\w\u4e00-\u9fff]+', comment)   # keep only Chinese and English characters
# for ele in parts:
#     if len(ele) > 0:
#         sentence.append(ele)
##########################################################################
with open('pure_json_data\\all_comments.json', 'r', encoding="utf-8") as f:
    data_all = json.loads(f.read())
for content in data_all:
    sentences = re.split('[^\w\u4e00-\u9fff]+', content)
    for ele in sentences:
        if len(ele) > 0:
            sentence.append(ele)
print(sentence)
###########################################################################
# Keep only the fragments that contain at least one emotion word
presentive = []
for i in sentence:
    if any(word in i for word in emotionwords):
        # for word in emotionwords:
        #     if word in sentence:
        presentive.append(i)
        # break
print(presentive)

wc = wordcloud.WordCloud(font_path='msyh', width=3840, height=2160,
                         background_color='white')
wc.generate(" ".join(presentive))
wc.to_file('pic\\' + key + '(sentence)-keywords.png')
# Color callback used as color_func below; the parameter names follow the keyword
# arguments wordcloud passes to color_func (the def line itself is assumed, the
# original excerpt shows only the body).
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    h = randint(0, 48)
    s = int(100.0 * 255.0 / 255.0)
    l = int(100.0 * float(randint(60, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(h, s, l)


# Stop-word setup (not actually used)
stopWORD = []
stopWORD.append("国家")
stopWORD.append("中国")

# Word-cloud configuration
w = wordcloud.WordCloud(
    width=3000, height=2100,
    scale=4,
    background_color="white",
    color_func=random_color_func,
    stopwords=stopWORD,
    mask=abel_mask,
    max_words=100,
    font_path="msyh.ttc")
w.generate(txt)
w.to_file("职位词频2.jpg")

# Blend the base image with the word cloud
img1 = Image.open("a.jpg")
img2 = Image.open("职位词频2.jpg")
img_1 = img1.resize(img2.size)
img = Image.blend(img_1, img2, 0.9)
img.save("职位词频3.png")
# w.recolor(color_func=image_colors)
returning = begining.flow_R()
content = returning[0]
Num = returning[1]
print(type(content))

# Concatenate every element of content into one string
contents = ''
for x in range(len(content)):
    contents = str(contents) + str(content[x])

# Extract the top-10 keywords with TF-IDF
tfidf = jieba.analyse.extract_tags(contents, topK=10, withWeight=False)

# The return value of this call is discarded; it does not affect the cloud below.
wordcloud.random_color_func(word=None, font_size=None, position=None,
                            orientation=None, font_path=None, random_state=None)

image1 = PIL.Image.open('C:\\Users\\Administrator\\Desktop\\pac\\1017\\ciyun\\item.JPG')
MASK = np.array(image1)
WC = wordcloud.WordCloud(font_path='STFANGSO.TTF',
                         max_words=2000,
                         mask=MASK,
                         height=400,
                         width=400,
                         background_color='white',
                         repeat=False,
                         mode='RGBA')
# Strip punctuation from the keyword list, re-segment, and join with spaces
st1 = re.sub('[,。、“”‘ ’]', '', str(tfidf))
conten = ' '.join(jieba.lcut(st1))
con = WC.generate(conten)
plt.imshow(con)
WC.to_file('C:\\Users\\Administrator\\Desktop\\pac\\1017\\ciyun\\test.png')
plt.axis("off")
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

# Create a function to get polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

# Create two new columns
df['Subjectivity'] = df['Tweets'].apply(getSubjectivity)
df['Polarity'] = df['Tweets'].apply(getPolarity)
df

# Plot the Word Cloud
allWords = ' '.join([twts for twts in df['Tweets']])
wordCloud = wordcloud.WordCloud(width=500, height=300, random_state=21,
                                max_font_size=119).generate(allWords)
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Create a function to compute the negative, neutral and positive analysis
def getAnalysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

df['Analysis'] = df['Polarity'].apply(getAnalysis)
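# A small usage sketch of the getAnalysis()/TextBlob pairing above, outside the
# DataFrame pipeline (the sample sentence is illustrative only):
from textblob import TextBlob

sample = "I love this library"
print(getAnalysis(TextBlob(sample).sentiment.polarity))  # expected: 'Positive'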
data = []
with open('/tmp/passwd') as f:
    for line in f:
        result1 = re.split(r'\s|:|/', line)
        # Keep items that are non-empty and contain neither whitespace nor digits
        result2 = [
            item for item in result1
            if not re.findall(r'\s+|\d+', item) and item
        ]
        # print(result2)
        data.extend(result2)

# 2). Open the mask image and read its pixel data
imgObj = Image.open('./doc/wordcloud.jpg')
img_mask = np.array(imgObj)
# print(img_mask)

# 3). Create the word-cloud object and set its attributes
wcObj = wordcloud.WordCloud(
    mask=img_mask,
    background_color="snow",
    min_font_size=5,
    max_font_size=50,
    width=1000,
    height=1000,
)

# 4). Generate the image.
# generate() only handles strings, so the words are joined with a comma separator.
wcObj.generate(",".join(data))
wcObj.to_file('doc/wcObj.png')
videoid = input("Please Enter Video ID: ") outputformat = "dataframe" data = ytc.get_comments(googleapikey, videoid, outputformat) cs = "result" data.to_csv(cs, index=True) df = pd.read_csv(cs) title_words = list(df["textDisplay"].apply(lambda x: x.split())) title_words = [x for y in title_words for x in y] wc = wordcloud.WordCloud(width=1200, height=500, collocations=False, background_color="Azure", colormap="viridis").generate(" ".join(title_words)) plt.figure(figsize=(15, 10)) plt.imshow(wc, interpolation="nearest") _ = plt.axis("off") plt.show() if __name__ == "__main__": import pandas as pd dataset = pd.read_csv(cs) X = dataset['textDisplay'] X.to_csv("comments.csv", index=True) re = pd.read_csv("comments.csv")