Esempio n. 1
0
 def __init__(self, collectionName):
     """Set up the Mongo handle, the asset locations and the word-cloud mask.

     :param collectionName: collection name; handed to ``MongoDBUtils`` and
         also stored as ``data_name``
     """
     images_dir = r"D:\Django\NegativeInternet\app\analysisData\common_class\images"
     self.data_name = collectionName
     self.mongo = MongoDBUtils(collectionName)
     # Directory that holds the mask picture (e.jpg).
     self.path = images_dir
     # Font file path kept for later use by the owning class.
     self.font = r"D:\Django\NegativeInternet\app\analysisData\msyh.ttc"
     # The mask picture is read once and stored as a numpy array.
     mask_image = Image.open(images_dir + r'\e.jpg')
     self.alice_mask = np.array(mask_image)
Esempio n. 2
0
 def parse(self, response):
     """Yield one detail request per distinct author id in ``zhihu_paris``.

     The ``response`` argument is not inspected; the author ids come from
     MongoDB. The id ``"0"`` is skipped (presumably a placeholder for authors
     without a real id -- confirm against the stored data).

     :param response: the response that triggered this callback (unused)
     :return: generator of ``Request`` objects handled by ``parse_detail``
     """
     store = MongoDBUtils("zhihu_paris")
     author_ids = store.distinctID("author.id")
     for author_id in (uid for uid in author_ids if uid != "0"):
         detail_url = self.start_url.format(author_id)
         yield Request(detail_url, callback=self.parse_detail, dont_filter=True)
Esempio n. 3
0
class Comment:
    """Look up the most-commented documents of one MongoDB collection."""

    def __init__(self, collectionName):
        """Open a ``MongoDBUtils`` handle on ``collectionName`` and remember the name."""
        self.mongo = MongoDBUtils(collectionName)
        self.data_name = collectionName

    def top50(self, keyword, limitsize):
        """Return the documents matching ``keyword`` with the highest ``comment_count``.

        Despite the method name, the number of documents is set by ``limitsize``.

        :param keyword: regular expression, matched case-insensitively against
            ``question.title`` for the ``zhihu_icu`` collection and against
            ``_id`` for every other collection
        :param limitsize: maximum number of documents to return
        :return: list of matching documents, sorted by ``comment_count`` descending
        """
        field = "question.title" if self.data_name == "zhihu_icu" else "_id"
        query = {field: {"$regex": keyword, "$options": "i"}}
        try:
            cursor = self.mongo.searchByDocSortLimit(query, "comment_count", -1, limitsize)
            # Materialise the result BEFORE closing the handle: if the helper
            # returns a lazy cursor, reading it after close() may fail.
            return list(cursor)
        finally:
            self.mongo.close()
Esempio n. 4
0
class Event:
    """Aggregate per-question engagement figures for one MongoDB collection."""

    def __init__(self, collectionName):
        """Open a ``MongoDBUtils`` handle on ``collectionName`` and remember the name."""
        self.mongo = MongoDBUtils(collectionName)
        self.data_name = collectionName

    def trend(self, keyword):
        """Summarise comments and up-votes per question for documents matching ``keyword``.

        The 500 matching documents with the highest ``voteup_count`` are
        fetched, their distinct question ids collected in first-seen order,
        and every document of each question is then summed up.

        :param keyword: regular expression, matched case-insensitively against
            ``question.title`` for the ``zhihu_icu`` collection and against
            ``_id`` for every other collection
        :return: list of dicts with the keys ``title``, ``created_time``,
            ``comment_count`` and ``voteup_count``, one per question
        """
        field = "question.title" if self.data_name == "zhihu_icu" else "_id"
        matches = self.mongo.searchByDocSortLimit(
            {field: {"$regex": keyword, "$options": "i"}},
            "voteup_count", -1, 500)

        # dict keys de-duplicate in O(1) per item and keep first-seen order
        # (assumes question ids are hashable, e.g. ints or strings).
        question_ids = {}
        for match in matches:
            question_ids.setdefault(match.get("question").get("id"), None)

        top = []
        for question_id in question_ids:
            # Read the result once so it is not indexed and then iterated
            # separately.
            docs = list(self.mongo.searchByDoc({"question.id": question_id}))
            if not docs:
                # Nothing stored for this question: skip it instead of
                # failing on docs[0].
                continue
            question = docs[0].get("question")
            top.append({
                "title": question.get("title"),
                "created_time": question.get("created"),
                # A missing or None count contributes 0 instead of raising.
                "comment_count": sum(doc.get("comment_count") or 0 for doc in docs),
                "voteup_count": sum(doc.get("voteup_count") or 0 for doc in docs),
            })
        return top
Esempio n. 5
0
 def post(self, request):
     """
     Return the analysis report of a data set, generating it when none is stored.

     :param request: request whose ``data`` carries ``data_name`` (data set /
         collection name) and ``keywords`` (the search keyword)
     :return: ``Response`` wrapping the report under the ``"data"`` key
     """
     data = request.data
     data_name = data.get("data_name")
     keyword = data.get("keywords")
     mongo = MongoDBUtils("data_report")
     try:
         cached = mongo.searchByDoc({"_id": data_name + "_report"})[0]
     except IndexError:
         # No stored report yet: indexing an empty result raises instead of
         # returning an empty document, so treat it as a cache miss.
         cached = None
     finally:
         # The lookup handle is only needed for this one query.
         mongo.close()
     # An empty document (or the legacy ``[{}]`` marker) also counts as a miss.
     if cached and cached != [{}]:
         return Response({"data": cached})
     return Response({"data": data_analysis_report(data_name, keyword)})
Esempio n. 6
0
    def analysis(self):
        """Count the authors of this collection per region.

        Every distinct ``author.id`` (except ``"0"``) is looked up in the
        ``zhihu_user`` collection and each of the user's location names is
        mapped as follows:

        * a name found in ``city_province`` (as is, or with "市" appended) maps
          to the value stored there;
        * otherwise every entry of ``province_list`` contained in the name is
          counted;
        * "其他" is counted only when nothing matched (it used to be added
          even after a province match).

        :return: list of ``{"name": region, "value": count}`` dicts
        """
        author_ids = self.mongo.distinctID("author.id")
        location_list = []
        local_mongo = MongoDBUtils("zhihu_user")
        try:
            for uid in author_ids:
                if uid == "0":
                    continue
                try:
                    user = local_mongo.searchByDoc({"_id": uid})[0]
                except IndexError:
                    # No user document stored for this author id.
                    continue
                if not user:
                    continue
                # A missing or None "location" is treated like an empty list.
                for entry in user.get("location") or []:
                    # A missing name falls through to the "其他" bucket below.
                    name = entry.get("name") or ""
                    if name in city_province:
                        location_list.append(city_province.get(name))
                    elif name + "市" in city_province:
                        location_list.append(city_province.get(name + "市"))
                    else:
                        matched = [reg for reg in province_list if reg in name]
                        location_list.extend(matched or ["其他"])
        finally:
            local_mongo.close()

        return [
            {"name": region, "value": count}
            for region, count in Counter(location_list).items()
        ]
Esempio n. 7
0
class Gender:
    """Count the documents of one collection by the author's gender code."""

    def __init__(self, collectionName):
        """Open a ``MongoDBUtils`` handle on ``collectionName`` and remember the name."""
        self.mongo = MongoDBUtils(collectionName)
        self.data_name = collectionName

    def _count_by_gender(self, code):
        """Count documents whose ``author.gender`` equals ``code``, then close the handle."""
        total = self.mongo.searchByDoc({"author.gender": code}).count()
        self.mongo.close()
        return total

    def female(self):
        """Return the number of documents written by female authors (code 0)."""
        return self._count_by_gender(0)

    def male(self):
        """Return the number of documents written by male authors (code 1)."""
        return self._count_by_gender(1)

    def unknowmale(self):
        """Return the number of documents whose author gender is unknown (code -1)."""
        return self._count_by_gender(-1)
Esempio n. 8
0
def data_analysis_report(data_name, keyword):
    """Build the full analysis report for a data set and store it in ``data_report``.

    Runs the event trend, gender split, location distribution, top-voted and
    top-commented lookups and the word statistics, then inserts the combined
    document under the id ``<data_name>_report``.

    :param data_name: name of the collection to analyse
    :param keyword: keyword passed on to the individual analyses
    :return: the report dict that was inserted
    """
    mongo = MongoDBUtils("data_report")
    try:
        # Event trend.
        event_data = Event(data_name).trend(keyword)

        # Gender split.
        gender = Gender(data_name)
        gender_data = {
            "female": gender.female(),
            "male": gender.male(),
            "unknowmale": gender.unknowmale(),
        }

        # Regional distribution of the authors.
        location_data = Location(data_name).analysis()

        # Most up-voted / most commented documents. The methods are named
        # top50, but only the first 20 are requested here.
        vote_data = Vote(data_name).top50(keyword, 20)
        comment_data = Comment(data_name).top50(keyword, 20)

        # Word statistics. The three chart methods are called for their side
        # effects: they write images and prepare the state get_data() reads.
        word = Word(data_name)
        word.keywordcloud(keyword)
        word.wordcount()
        word.wordpie()
        word_data = word.get_data()

        data = {
            "event_data": event_data,
            "gender_data": gender_data,
            "location_data": location_data,
            "vote_data": vote_data,
            "comment_data": comment_data,
            "word_data": word_data,
            "_id": data_name + "_report",
            "created_time": int(time.time()),
        }
        mongo.insertmongoDB(data)
    finally:
        # Release the report handle even when one of the analyses fails.
        mongo.close()
    return data
Esempio n. 9
0
class Word:
    """Keyword statistics and charts for one MongoDB collection.

    ``keywordcloud`` is meant to run first: it loads the stop words and
    collects the keyword text that ``wordcount`` and ``get_data`` work on.
    ``wordpie`` charts the top-20 table computed by ``wordcount``.
    """

    def __init__(self,collectionName):
        """Open the collection and load the word-cloud mask image.

        :param collectionName: collection name; also used in the image file names
        """
        self.data_name = collectionName
        self.mongo = MongoDBUtils(collectionName)
        self.path = r"D:\Django\NegativeInternet\app\analysisData\common_class\images"
        self.font = r"D:\Django\NegativeInternet\app\analysisData\msyh.ttc"
        self.alice_mask = np.array(Image.open(self.path+r'\e.jpg'))
        # Defaults so the statistics methods do not raise AttributeError when
        # they are called before keywordcloud() / wordcount().
        self.stopwords = set()
        self.keywords = ""
        self.top20 = {}

    def _image_path(self, prefix):
        """Return ``<images dir>\\<prefix><data_name>.png``."""
        return self.path + "\\" + prefix + self.data_name + ".png"

    def _filtered_keywords(self):
        """Return the collected keywords, in order, without the stop words."""
        return [word for word in self.keywords.split() if word not in self.stopwords]

    def keywordcloud(self,keyword):
        """
        Render the keyword cloud image for documents matching ``keyword``.

        Also stores the stop-word set in ``self.stopwords`` and the collected
        keyword text in ``self.keywords`` for the other methods.

        :param keyword: regular expression, matched case-insensitively against
            ``question.title`` for ``zhihu_icu`` and against ``_id`` otherwise
        :return: None; the image is written to ``word_cloud_<data_name>.png``
        """
        stopwords = {"游戏","手机","没有","时候","可能","快递","有点","东西","女人","不能","觉得","看到"}
        with open(r'D:\Django\NegativeInternet\app\analysisData\common_class\chineseStopWords.txt','r',encoding='gbk') as handle:
            for line in handle:
                # Strip the line break: with it attached the word could never
                # equal a keyword, so the file's stop words had no effect.
                word = line.strip()
                if word:
                    stopwords.add(word)
        self.stopwords = stopwords

        field = "question.title" if self.data_name == "zhihu_icu" else "_id"
        try:
            docs = self.mongo.searchByDoc({field: {"$regex": keyword, "$options": "i"}})
            found = (doc.get("keywords") for doc in docs)
            # Every keyword string is prefixed with one space, as before.
            self.keywords = "".join(" " + text for text in found if text)
        finally:
            self.mongo.close()

        wc = WordCloud(
            background_color='white',
            width=1000,
            height=800,
            font_path=self.font,
            mask=self.alice_mask,
            stopwords=self.stopwords
        )
        wc.generate_from_text(self.keywords)

        # Draw on a dedicated figure and close it afterwards, so figures do
        # not pile up and later charts start from a clean canvas.
        fig = plt.figure()
        plt.imshow(wc)
        plt.axis("off")
        plt.show()
        plt.close(fig)
        wc.to_file(self._image_path("word_cloud_"))

    def wordcount(self):
        """
        Chart the 20 most frequent keywords as a horizontal bar chart.

        :return: None; the table is stored in ``self.top20`` and the chart is
            written to ``word_count_<data_name>.png``
        """
        self.top20 = dict(Counter(self._filtered_keywords()).most_common(20))
        label = list(self.top20)
        y = list(self.top20.values())
        idx = np.arange(len(y))
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
        plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
        fig = plt.figure()
        plt.barh(idx, y)
        plt.yticks(idx + 0.4, label)
        plt.xlabel(u'出现次数', fontsize=20, labelpad=5)
        plt.ylabel(u'关键词', fontsize=20, labelpad=5)
        # The explicit ".png" names the file savefig already produced by
        # default; an existing file is simply overwritten.
        plt.savefig(self._image_path("word_count_"))
        plt.show()
        plt.close(fig)

    def wordpie(self):
        """
        Chart the top-20 keywords from ``wordcount`` as bars on a polar axis.

        :return: None; the chart is written to ``word_pie_<data_name>.png``
        """
        label = list(self.top20)
        radii = list(self.top20.values())
        # linspace yields exactly one angle per bar; arange with a float step
        # can return one element too many and divides by zero for an empty
        # table.
        theta = np.linspace(0.0, 2 * np.pi, len(radii), endpoint=False)
        width = np.pi / 6
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
        plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
        fig = plt.figure()
        ax = plt.subplot(111, projection='polar')
        bars = ax.bar(theta, radii, width=width, bottom=0.0)
        plt.xticks(theta + np.pi / 12, label)
        for r, bar in zip(radii, bars):
            bar.set_facecolor(plt.cm.viridis(r / 10))
            bar.set_alpha(0.5)
        plt.savefig(self._image_path("word_pie_"))
        plt.show()
        plt.close(fig)

    def get_data(self):
        """
        Return the 100 most frequent keywords with their counts.

        :return: list of ``{"name": keyword, "value": count}`` dicts, most
            frequent first
        """
        top100 = Counter(self._filtered_keywords()).most_common(100)
        return [{"name": word, "value": count} for word, count in top100]
Esempio n. 10
0
 def __init__(self, collectionName):
     """Remember the collection name and open a ``MongoDBUtils`` handle on it.

     :param collectionName: collection name; handed to ``MongoDBUtils`` and
         also stored as ``data_name``
     """
     handle = MongoDBUtils(collectionName)
     self.mongo = handle
     self.data_name = collectionName