def __init__(self,collectionName):
    """Store the collection name and prepare the word-cloud resources.

    :param collectionName: name handed to ``MongoDBUtils`` and kept as
        ``self.data_name``.

    Besides the database helper, this records the image output directory
    and the font file, and loads ``e.jpg`` from the image directory into
    a NumPy array kept as ``self.alice_mask``.
    """
    images_dir = r"D:\Django\NegativeInternet\app\analysisData\common_class\images"
    font_file = r"D:\Django\NegativeInternet\app\analysisData\msyh.ttc"

    self.data_name = collectionName
    self.mongo = MongoDBUtils(collectionName)
    self.path = images_dir
    self.font = font_file

    # The mask picture lives next to the generated images.
    mask_picture = Image.open(self.path + r'\e.jpg')
    self.alice_mask = np.array(mask_picture)
def parse(self, response):
    """Yield one detail request per distinct author id in ``zhihu_paris``.

    :param response: the response that triggered this callback; it is not
        read here.
    :return: generator of ``Request`` objects built from
        ``self.start_url`` and handled by ``self.parse_detail``.
    """
    store = MongoDBUtils("zhihu_paris")
    author_ids = store.distinctID("author.id")

    # The id "0" is skipped (presumably anonymous authors - confirm).
    wanted_ids = (author_id for author_id in author_ids if author_id != "0")
    for author_id in wanted_ids:
        detail_url = self.start_url.format(author_id)
        yield Request(
            detail_url,
            callback=self.parse_detail,
            dont_filter=True,
        )
class Comment:
    """Query the documents of one collection ranked by ``comment_count``."""

    def __init__(self, collectionName):
        """Open a ``MongoDBUtils`` helper for ``collectionName``."""
        self.mongo = MongoDBUtils(collectionName)
        self.data_name = collectionName

    def top50(self, keyword, limitsize):
        """Return the documents matching ``keyword``, ranked by comments.

        :param keyword: regular expression, matched case-insensitively
            against ``question.title`` for the ``zhihu_icu`` collection and
            against ``_id`` for every other collection.
        :param limitsize: forwarded to ``searchByDocSortLimit`` (presumably
            the maximum number of documents - confirm against the helper).
        :return: list of the documents produced by the helper.

        Despite the method name, the number of results is governed by
        ``limitsize``, not fixed at 50.
        """
        field = "question.title" if self.data_name == "zhihu_icu" else "_id"
        query = {field: {"$regex": keyword, "$options": "i"}}
        cursor = self.mongo.searchByDocSortLimit(
            query, "comment_count", -1, limitsize)
        try:
            # Read every row BEFORE closing: the result may be a lazy
            # cursor, and consuming it after close() either fails or
            # silently re-opens a connection that is then never closed.
            return list(cursor)
        finally:
            self.mongo.close()
class Event:
    """Aggregate per-question totals for the documents matching a keyword."""

    def __init__(self, collectionName):
        """Open a ``MongoDBUtils`` helper for ``collectionName``."""
        self.mongo = MongoDBUtils(collectionName)
        self.data_name = collectionName

    def trend(self, keyword):
        """Summarise every question that has a document matching ``keyword``.

        :param keyword: regular expression, matched case-insensitively
            against ``question.title`` for ``zhihu_icu`` and against
            ``_id`` for every other collection.
        :return: list of dicts with ``title``, ``created_time``,
            ``comment_count`` and ``voteup_count``; the two counts are
            summed over all documents sharing the question id. Questions
            appear in the order they are first seen in the search result.
        """
        field = "question.title" if self.data_name == "zhihu_icu" else "_id"
        matches = self.mongo.searchByDocSortLimit(
            {field: {"$regex": keyword, "$options": "i"}},
            "voteup_count", -1, 500)

        # dict.fromkeys de-duplicates in one pass while keeping first-seen
        # order; documents without a question sub-document are skipped
        # instead of raising AttributeError.
        question_ids = list(dict.fromkeys(
            doc["question"].get("id") for doc in matches if doc.get("question")
        ))

        top = []
        for question_id in question_ids:
            # Materialise once: the first row and the totals are both
            # needed, and indexing a cursor would issue extra queries.
            answers = list(self.mongo.searchByDoc({"question.id": question_id}))
            if not answers:
                continue
            question = answers[0].get("question")
            top.append({
                "title": question.get("title"),
                "created_time": question.get("created"),
                # A missing or None counter contributes 0 instead of
                # raising TypeError.
                "comment_count": sum(a.get("comment_count") or 0 for a in answers),
                "voteup_count": sum(a.get("voteup_count") or 0 for a in answers),
            })
        return top
def post(self, request):
    """Return the analysis report for a dataset, generating it if absent.

    :param request: request whose ``data`` carries ``data_name`` (the
        collection name) and ``keywords`` (the search keyword).
    :return: ``Response`` whose ``data`` entry is the stored report
        document, or a freshly generated one from
        ``data_analysis_report`` when no stored report is found.
    """
    payload = request.data
    data_name = payload.get("data_name")
    keyword = payload.get("keywords")

    mongo = MongoDBUtils("data_report")
    try:
        report = mongo.searchByDoc({"_id": data_name + "_report"})[0]
    except IndexError:
        # Nothing stored yet: indexing an empty result raises instead of
        # yielding an empty document, so treat it as "no report".
        report = None
    finally:
        mongo.close()

    if report:
        return Response({"data": report})
    return Response({"data": data_analysis_report(data_name, keyword)})
def analysis(self):
    """Count the locations of this collection's authors by region.

    Every distinct ``author.id`` except ``"0"`` is looked up in the
    ``zhihu_user`` collection, and each entry of the user's ``location``
    list is mapped to a region name:

    * the name itself, or the name with ``"市"`` appended, is looked up in
      ``city_province``;
    * otherwise every entry of ``province_list`` contained in the name is
      recorded, followed by ``"其他"``.

    Users that are not stored, or that have no location, are skipped.

    :return: list of ``{"name": region, "value": count}`` dicts.
    """
    author_ids = self.mongo.distinctID("author.id")
    user_store = MongoDBUtils("zhihu_user")
    regions = []
    try:
        for uid in author_ids:
            if uid == "0":
                continue
            try:
                user = user_store.searchByDoc({"_id": uid})[0]
            except IndexError:
                # No profile stored for this author.
                continue
            if not user:
                continue
            # "location" may be absent or None; treat both as empty.
            for place in user.get("location") or []:
                name = place.get("name")
                if not name:
                    continue
                if name in city_province:
                    regions.append(city_province.get(name))
                elif name + "市" in city_province:
                    regions.append(city_province.get(name + "市"))
                else:
                    regions.extend(
                        reg for reg in province_list if reg in name)
                    # NOTE(review): "其他" is recorded for every name with
                    # no city match, even when a province matched above.
                    # The original nesting is ambiguous - confirm intent.
                    regions.append("其他")
    finally:
        user_store.close()

    return [
        {"name": region, "value": count}
        for region, count in Counter(regions).items()
    ]
class Gender:
    """Count the documents of one collection by ``author.gender`` code."""

    def __init__(self,collectionName):
        """Open a ``MongoDBUtils`` helper for ``collectionName``."""
        self.data_name = collectionName
        self.mongo = MongoDBUtils(collectionName)

    def _count_by_gender(self, code):
        """Count documents whose ``author.gender`` equals ``code``, then close."""
        matched = self.mongo.searchByDoc({"author.gender": code})
        total = matched.count()
        self.mongo.close()
        return total

    def female(self):
        """Return the number of documents written by female authors (code 0)."""
        return self._count_by_gender(0)

    def male(self):
        """Return the number of documents written by male authors (code 1)."""
        return self._count_by_gender(1)

    def unknowmale(self):
        """Return the number of documents whose author gender is unknown (code -1)."""
        return self._count_by_gender(-1)
def data_analysis_report(data_name, keyword):
    """Build the analysis report for one collection and store it.

    :param data_name: collection name; also used to build the report id
        ``data_name + "_report"``.
    :param keyword: keyword forwarded to the event, vote, comment and
        word-cloud analyses.
    :return: the report dict that was inserted into ``data_report``.
    """
    # Event trend.
    event_data = Event(data_name).trend(keyword)

    # Gender ratio.
    gender = Gender(data_name)
    gender_data = {
        "female": gender.female(),
        "male": gender.male(),
        "unknowmale": gender.unknowmale(),
    }

    # Regional distribution.
    location_data = Location(data_name).analysis()

    # Top 20 documents by votes and by comments.
    vote_data = Vote(data_name).top50(keyword, 20)
    comment_data = Comment(data_name).top50(keyword, 20)

    # The three chart calls are made for their side effects only; the
    # report stores just the word frequencies from get_data().
    words = Word(data_name)
    words.keywordcloud(keyword)
    words.wordcount()
    words.wordpie()
    word_data = words.get_data()

    data = {
        "event_data": event_data,
        "gender_data": gender_data,
        "location_data": location_data,
        "vote_data": vote_data,
        "comment_data": comment_data,
        "word_data": word_data,
        "_id": data_name + "_report",
        "created_time": int(time.time()),
    }

    # Open the report collection only when it is needed and always
    # release it, even if the insert fails.
    mongo = MongoDBUtils("data_report")
    try:
        mongo.insertmongoDB(data)
    finally:
        mongo.close()
    return data
class Word:
    """Keyword statistics and charts for one collection.

    ``keywordcloud`` collects the keywords and stop words, ``wordcount``
    derives ``top20`` from them, and ``wordpie`` plots ``top20``; call them
    in that order. ``get_data`` only needs ``keywordcloud``.
    """

    def __init__(self,collectionName):
        """Open the collection helper and load the word-cloud resources."""
        self.data_name = collectionName
        self.mongo = MongoDBUtils(collectionName)
        self.path = r"D:\Django\NegativeInternet\app\analysisData\common_class\images"
        self.font = r"D:\Django\NegativeInternet\app\analysisData\msyh.ttc"
        self.alice_mask = np.array(Image.open(self.path+r'\e.jpg'))
        # Empty defaults so the other methods do not fail with
        # AttributeError when called before keywordcloud()/wordcount().
        self.keywords = ""
        self.stopwords = set()
        self.top20 = {}

    def _filtered_keywords(self):
        """Return the collected keywords, in order, without stop words."""
        return [w for w in self.keywords.split() if w not in self.stopwords]

    def keywordcloud(self,keyword):
        """Collect the keywords of matching documents and save a word cloud.

        :param keyword: regular expression, matched case-insensitively
            against ``question.title`` for ``zhihu_icu`` and against
            ``_id`` for every other collection.
        :return: None. The image is written to
            ``word_cloud_<data_name>.png`` in ``self.path``, and
            ``self.keywords`` / ``self.stopwords`` are populated.
        """
        field = "question.title" if self.data_name == "zhihu_icu" else "_id"
        matches = self.mongo.searchByDoc(
            {field: {"$regex": keyword, "$options": "i"}})

        stopwords = {"游戏","手机","没有","时候","可能","快递","有点","东西","女人","不能","觉得","看到"}
        with open(r'D:\Django\NegativeInternet\app\analysisData\common_class\chineseStopWords.txt','r',encoding='gbk') as handle:
            # Lines keep their trailing newline; without stripping, no
            # stop word from the file would ever match a keyword.
            stopwords.update(line.strip() for line in handle)
        stopwords.discard("")
        self.stopwords = stopwords

        try:
            per_doc = (doc.get("keywords") for doc in matches)
            self.keywords = " ".join(k for k in per_doc if k)
        finally:
            # The result has been read completely; release the handle
            # even if rendering fails below.
            self.mongo.close()

        wc = WordCloud(
            background_color='white',
            width=1000,
            height=800,
            font_path=self.font,
            mask=self.alice_mask,
            stopwords=self.stopwords
        )
        wc.generate_from_text(self.keywords)
        plt.imshow(wc)
        plt.axis("off")
        plt.figure()
        # NOTE(review): plt.show() may block under an interactive backend;
        # confirm this is wanted when called from the web view.
        plt.show()

        target = self.path+r"\word_cloud_"+self.data_name+".png"
        if os.path.exists(target):
            os.remove(target)
        wc.to_file(target)

    def wordcount(self):
        """Plot the 20 most frequent keywords as a horizontal bar chart.

        :return: None. The counts are stored in ``self.top20`` and the
            figure is saved as ``word_count_<data_name>`` in ``self.path``.
        """
        self.top20 = dict(Counter(self._filtered_keywords()).most_common(20))
        label = list(self.top20.keys())
        y = list(self.top20.values())
        idx = np.arange(len(y))

        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
        plt.rcParams['axes.unicode_minus'] = False  # render the minus sign
        plt.barh(idx, y)
        plt.yticks(idx + 0.4, label)
        plt.xlabel(u'出现次数', fontsize=20, labelpad=5)
        plt.ylabel(u'关键词', fontsize=20, labelpad=5)

        # Raw string: '\w' is not a valid escape sequence.
        target = self.path+r'\word_count_'+self.data_name
        # NOTE(review): savefig appears to append a default extension when
        # the name has none, so this check likely never matches - confirm.
        if os.path.exists(target):
            os.remove(target)
        plt.savefig(target)
        plt.show()

    def wordpie(self):
        """Plot ``self.top20`` as bars on a polar axis.

        :return: None. The figure is saved as ``word_pie_<data_name>`` in
            ``self.path``. Nothing is drawn when ``top20`` is empty.
        """
        N = len(self.top20)
        if N == 0:
            # Nothing to plot; the angle step would divide by zero.
            return
        label = list(self.top20.keys())
        radii = list(self.top20.values())
        # linspace yields exactly N angles; arange with a float step can
        # yield N + 1 through rounding and break the bar() call.
        theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
        width = np.pi / 6

        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
        plt.rcParams['axes.unicode_minus'] = False  # render the minus sign
        ax = plt.subplot(111, projection='polar')
        bars = ax.bar(theta, radii, width=width, bottom=0.0)
        plt.xticks(theta + np.pi / 12, label)
        for r, bar in zip(radii, bars):
            bar.set_facecolor(plt.cm.viridis(r / 10))
            bar.set_alpha(0.5)

        target = self.path+r'\word_pie_'+self.data_name
        # NOTE(review): same extension caveat as in wordcount - confirm.
        if os.path.exists(target):
            os.remove(target)
        plt.savefig(target)
        plt.show()

    def get_data(self):
        """Return the 100 most frequent keywords.

        :return: list of ``{"name": word, "value": count}`` dicts, most
            frequent first, with stop words excluded.
        """
        ranked = Counter(self._filtered_keywords()).most_common(100)
        return [{"name": word, "value": count} for word, count in ranked]
def __init__(self,collectionName):
    """Remember the collection name and open its ``MongoDBUtils`` helper.

    :param collectionName: name passed to ``MongoDBUtils`` and stored as
        ``self.data_name``.
    """
    helper = MongoDBUtils(collectionName)
    self.mongo = helper
    self.data_name = collectionName