def data_analysis_report(data_name, keyword):
    """Build the analysis report for one collection and store it.

    Runs every analysis in turn (event trend, gender split, location
    distribution, top voted / top commented entries, keyword statistics),
    assembles the results into one document, inserts that document into the
    ``data_report`` collection and returns it.

    :param data_name: name of the source collection; also used to build the
        report ``_id`` (``data_name + "_report"``).
    :param keyword: keyword passed to the trend, vote, comment and
        word-cloud queries.
    :return: the report document that was inserted.

    Any exception raised by an analysis step or by the insert propagates to
    the caller; the ``data_report`` connection is closed either way.
    """
    # NOTE(review): a lookup of an already stored report used to sit here but
    # is disabled, so every call recomputes and re-inserts the report under the
    # same ``_id`` -- confirm how insertmongoDB treats an existing ``_id``.
    mongo = MongoDBUtils("data_report")
    try:
        # Event trend for the keyword.
        event_data = Event(data_name).trend(keyword)

        # Gender split of the authors.
        gender_mongo = Gender(data_name)
        gender_data = {
            "female": gender_mongo.female(),
            "male": gender_mongo.male(),
            "unknowmale": gender_mongo.unknowmale(),
        }

        # Geographic distribution of the entries.
        location_data = Location(data_name).analysis()

        # Top entries by vote count; the second argument is passed as 20
        # (presumably a result limit like Comment.top50's -- TODO confirm).
        vote_data = Vote(data_name).top50(keyword, 20)

        # Top entries by comment count, limited to 20 results.
        comment_data = Comment(data_name).top50(keyword, 20)

        # Keyword statistics. Call order matters: keywordcloud() collects the
        # keywords, wordcount() computes the top-20 table that wordpie()
        # reads, and get_data() reads the collected keywords. The first three
        # calls are made for their side effects (image files) only.
        word_mongo = Word(data_name)
        word_mongo.keywordcloud(keyword)
        word_mongo.wordcount()
        word_mongo.wordpie()
        word_data = word_mongo.get_data()

        data = {
            "event_data": event_data,
            "gender_data": gender_data,
            "location_data": location_data,
            "vote_data": vote_data,
            "comment_data": comment_data,
            "word_data": word_data,
            "_id": data_name + "_report",
            "created_time": int(time.time()),
        }
        mongo.insertmongoDB(data)
    finally:
        # Release the report connection even when a step above fails.
        mongo.close()
    return data
class Comment:
    """Query helper returning the most commented documents of a collection."""

    def __init__(self, collectionName):
        """Open a connection to ``collectionName`` and remember its name."""
        self.mongo = MongoDBUtils(collectionName)
        self.data_name = collectionName

    def top50(self, keyword, limitsize):
        """Return the documents matching ``keyword`` with the most comments.

        The keyword is used as a case-insensitive ``$regex``. For the
        ``zhihu_icu`` collection it is matched against ``question.title``;
        for every other collection it is matched against ``_id``. Results are
        sorted by ``comment_count`` descending.

        :param keyword: regular expression to search for.
        :param limitsize: maximum number of documents to return.
        :return: list of matching documents.

        The connection is closed before returning, also when the query fails.
        """
        if self.data_name == "zhihu_icu":
            field = "question.title"
        else:
            field = "_id"
        query = {field: {"$regex": keyword, "$options": "i"}}

        try:
            cursor = self.mongo.searchByDocSortLimit(
                query, "comment_count", -1, limitsize
            )
            # Materialise the result while the connection is still open; a
            # lazy cursor must not be read after close().
            return list(cursor)
        finally:
            self.mongo.close()
class Gender:
    """Counts the documents of a collection by the author's gender code.

    The code is read from the ``author.gender`` field: ``0`` is queried for
    female, ``1`` for male and ``-1`` for unknown.
    """

    def __init__(self, collectionName):
        """Remember the collection name and open a connection to it."""
        self.data_name = collectionName
        self.mongo = MongoDBUtils(collectionName)

    def _count_then_close(self, gender_code):
        """Count documents whose ``author.gender`` equals ``gender_code``.

        The connection is closed after the count has been taken.
        NOTE(review): every public method closes the same connection, so a
        second call on one instance relies on MongoDBUtils tolerating use
        after close() -- confirm.
        """
        total = self.mongo.searchByDoc({"author.gender": gender_code}).count()
        self.mongo.close()
        return total

    def female(self):
        """Return the number of documents written by female authors."""
        return self._count_then_close(0)

    def male(self):
        """Return the number of documents written by male authors."""
        return self._count_then_close(1)

    def unknowmale(self):
        """Return the number of documents whose author gender is unknown."""
        return self._count_then_close(-1)
class Word:
    """Keyword statistics and charts for one collection.

    Typical call order: ``keywordcloud()`` collects the keywords and the
    stopword set, ``wordcount()`` derives the top-20 table, ``wordpie()``
    plots that table and ``get_data()`` returns the top-100 table.

    NOTE(review): the image directory, font and stopword file are hard-coded
    absolute Windows paths.
    """

    # Stopwords that are always applied, in addition to the stopword file.
    _EXTRA_STOPWORDS = ("游戏", "手机", "没有", "时候", "可能", "快递",
                        "有点", "东西", "女人", "不能", "觉得", "看到")
    # GBK-encoded stopword list, one word per line.
    _STOPWORDS_FILE = r'D:\Django\NegativeInternet\app\analysisData\common_class\chineseStopWords.txt'

    def __init__(self, collectionName):
        """Open the collection and load the word-cloud mask image."""
        self.data_name = collectionName
        self.mongo = MongoDBUtils(collectionName)
        self.path = r"D:\Django\NegativeInternet\app\analysisData\common_class\images"
        self.font = r"D:\Django\NegativeInternet\app\analysisData\msyh.ttc"
        self.alice_mask = np.array(Image.open(self.path + r'\e.jpg'))
        # Empty defaults so the statistics methods do not fail with an
        # AttributeError when called before keywordcloud() / wordcount().
        self.stopwords = set()
        self.keywords = ""
        self.top20 = {}

    @staticmethod
    def _read_stopwords(path, encoding='gbk'):
        """Return the non-blank lines of ``path`` as a set of stripped words."""
        with open(path, 'r', encoding=encoding) as handle:
            stripped = (line.strip() for line in handle)
            return {word for word in stripped if word}

    def _filtered_keywords(self):
        """Return the collected keywords, in order, without stopwords."""
        return [word for word in self.keywords.split()
                if word not in self.stopwords]

    def keywordcloud(self, keyword):
        """Collect the keywords of matching documents and render a word cloud.

        Documents are matched with a case-insensitive ``$regex`` on
        ``question.title`` for the ``zhihu_icu`` collection and on ``_id``
        otherwise. The ``keywords`` value of every matching document is
        appended to ``self.keywords`` (separated by spaces), and
        ``self.stopwords`` is rebuilt from the built-in list plus the
        stopword file. The cloud is shown and written to
        ``word_cloud_<data_name>.png`` in the image directory, replacing an
        existing file.

        :param keyword: regular expression to search for.
        :return: None
        """
        if self.data_name == "zhihu_icu":
            field = "question.title"
        else:
            field = "_id"

        try:
            curInfo = self.mongo.searchByDoc(
                {field: {"$regex": keyword, "$options": "i"}}
            )
            # Lines read from the file end with a newline; they are stripped
            # so that they can actually match a keyword.
            self.stopwords = (set(self._EXTRA_STOPWORDS)
                              | self._read_stopwords(self._STOPWORDS_FILE))
            # presumably each "keywords" value is a space-separated string of
            # words -- TODO confirm against the stored documents.
            found = (data.get("keywords") for data in curInfo)
            self.keywords = "".join(" " + k for k in found if k)
        finally:
            # All documents have been read (or the query failed); release
            # the connection either way.
            self.mongo.close()

        wc = WordCloud(
            background_color='white',
            width=1000,
            height=800,
            font_path=self.font,
            mask=self.alice_mask,
            stopwords=self.stopwords
        )
        wc.generate_from_text(self.keywords)
        plt.imshow(wc)
        plt.axis("off")
        plt.figure()
        plt.show()

        target = self.path + r"\word_cloud_" + self.data_name + ".png"
        if os.path.exists(target):
            os.remove(target)
        wc.to_file(target)

    def wordcount(self):
        """Compute the 20 most frequent keywords and plot them as a bar chart.

        Stores the table (word -> count) in ``self.top20``, prints it, and
        saves the chart as ``word_count_<data_name>.png`` in the image
        directory before showing it.

        :return: None
        """
        self.top20 = dict(Counter(self._filtered_keywords()).most_common(20))
        print(self.top20)
        label = list(self.top20.keys())
        y = list(self.top20.values())
        idx = np.arange(len(y))

        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
        plt.rcParams['axes.unicode_minus'] = False  # render the minus sign
        plt.barh(idx, y)
        plt.yticks(idx + 0.4, label)
        plt.xlabel(u'出现次数', fontsize=20, labelpad=5)
        plt.ylabel(u'关键词', fontsize=20, labelpad=5)

        # The extension is spelled out so that the existence check looks at
        # the file savefig really writes (it appends ".png" by default) and a
        # dot inside data_name is not mistaken for a file format.
        target = self.path + r'\word_count_' + self.data_name + '.png'
        if os.path.exists(target):
            os.remove(target)
        plt.savefig(target)
        plt.show()

    def wordpie(self):
        """Plot ``self.top20`` as bars on a polar axis.

        Saves the chart as ``word_pie_<data_name>.png`` in the image
        directory before showing it. Does nothing when ``self.top20`` is
        empty.

        :return: None
        """
        N = len(self.top20)
        if N == 0:
            # Nothing to draw; also avoids dividing the circle by zero.
            return
        label = list(self.top20.keys())
        radii = list(self.top20.values())
        # linspace yields exactly N angles; a float-step arange can be off
        # by one element.
        theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
        width = np.pi / 6

        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
        plt.rcParams['axes.unicode_minus'] = False  # render the minus sign
        # Start from a fresh figure so the polar axes never share a canvas
        # with an earlier chart when show() does not block and close it.
        plt.figure()
        ax = plt.subplot(111, projection='polar')
        bars = ax.bar(theta, radii, width=width, bottom=0.0)
        plt.xticks(theta + np.pi / 12, label)
        for r, bar in zip(radii, bars):
            bar.set_facecolor(plt.cm.viridis(r / 10))
            bar.set_alpha(0.5)

        target = self.path + r'\word_pie_' + self.data_name + '.png'
        if os.path.exists(target):
            os.remove(target)
        plt.savefig(target)
        plt.show()

    def get_data(self):
        """Return the 100 most frequent keywords, most frequent first.

        :return: list of ``{"name": word, "value": count}`` dictionaries.
        """
        top100 = Counter(self._filtered_keywords()).most_common(100)
        return [{"name": word, "value": count} for word, count in top100]