Example #1
 def __init__(self, mainwindow):
     super().__init__()
     client = MongoDB.get_client()
     db = client[MONGO_DB]
     self.collection = db['topic']
     self.weibo_collection = db['weibo']
     self.ltp = Ltp.get_object()
     self.area = ''
     self.keys = []
     self.setupUi(mainwindow)
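
This constructor (and the ones that follow) obtains its database handle through MongoDB.get_client(); the wrapper class itself is not part of this listing. A minimal sketch of such a helper, assuming pymongo and hypothetical MONGO_URI / MONGO_DB settings:

import pymongo

MONGO_URI = 'mongodb://localhost:27017'  # assumed connection string
MONGO_DB = 'weibo'                       # assumed database name

class MongoDB:
    _client = None

    @classmethod
    def get_client(cls):
        # build one MongoClient lazily and share it between all callers
        if cls._client is None:
            cls._client = pymongo.MongoClient(MONGO_URI)
        return cls._client
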
Example #2
 def __init__(self, parent=None):
     super().__init__(parent)
     self.parent = parent
     db = MongoDB.get_client()[MONGO_DB]
     self.topic_collection = db['topic']
     self.weibo_collection = db['weibo']
     self.attention = False
     self.emotion = False
     self._time = False
     self.ltp = Ltp.get_object()
     self.setupUi()
Example #3
 def __init__(self, id, parent=None):
     super().__init__(parent)
     self.parent = parent
     self.ltp = Ltp.get_object()
     client = MongoDB.get_client()
     db = client[MONGO_DB]
     self.topic_collection = db['topic']
     self.weibo_collection = db['weibo']
     self.comment_collection = db['comment']
     self.id = id
     self.setupUi()
Example #4
 def run(self):
     ltp = Ltp.get_object()
     tdt = Tdt.get_object()
     model = Text2Vec.get_object()
     weibo_collection = MongoDB.get_client()[MONGO_DB]['weibo']
     count1 = 0  # number of weibos processed so far
     weibo_set = weibo_collection.find().sort('posted_at',
                                              pymongo.ASCENDING)
     for weibo in weibo_set:
         # assign the weibo to a topic (or start a new one) via Single-Pass clustering
         tdt.single_pass(weibo, 'topic', ltp, model)
         # persist the updated document; the trailing True enables upsert
         weibo_collection.update_one({'_id': weibo['_id']},
                                     {'$set': weibo}, True)
         count1 += 1
         self.count.emit(count1)  # report progress to the UI
     self.stop.emit(True)  # signal that the pass over the collection has finished
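
The count and stop attributes used with .emit() indicate that run() belongs to a Qt worker thread. A hedged sketch of how such a worker might be declared and wired up, assuming PyQt5; the signal and slot names here are illustrative, not taken from the project:

from PyQt5.QtCore import QThread, pyqtSignal

class ClusterWorker(QThread):
    count = pyqtSignal(int)   # emitted after each weibo has been clustered
    stop = pyqtSignal(bool)   # emitted once the whole collection has been processed

    def run(self):
        # the loop from Example #4 would go here
        self.stop.emit(True)

# typical wiring on the UI side:
# worker = ClusterWorker()
# worker.count.connect(progress_bar.setValue)  # e.g. a QProgressBar
# worker.stop.connect(lambda done: print('clustering finished'))
# worker.start()
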
Example #5
 def __init__(self):
     super().__init__()
     self.ltp = Ltp.get_object()
Example #6
    def single_pass(self, weibo, topic_table, ltp=None, text2vec=None):
        """
        Single-Pass聚类算法,微博weibo属于话题集topic_set某话题,则加入话题并更新话题,否则,自成一个话题加入话题库
        :param ltp: Ltp类实例
        :param text2vec: Text2Vec类实例
        :param topic_table: str, mongoDB话题库名
        :param weibo:dict, 微博数据
        :return:
        """
        if 'if_topic' in weibo and weibo['if_topic']:
            return
        if not ltp:
            ltp = Ltp.get_object()
        if not text2vec:
            model = Text2Vec.get_object()
        else:
            model = text2vec
        content = weibo['content']
        parser = ltp.text_parser(content)
        vector = model.text2dict(list(parser[0:3]))  # weibo split into [title, body, hashtags]
        entity = parser[3]  # named entities
        topic_collection = MongoDB.get_client()[MONGO_DB][topic_table]
        topic_set = topic_collection.find()
        similarities = []  # (similarity score, topic _id) for each surviving topic

        for topic in topic_set:
            keydict = topic['keywords']
            vector2 = {}  # truncated keyword vector of the topic
            count = 0
            for key, value in keydict.items():
                if len(vector2) > len(vector):
                    break
                vector2[key] = value
                count += value
            similar_score = model.similarity(vector2, vector)  # similarity between the weibo and this topic

            if similar_score < 0.4:  # low similarity: the weibo does not join this topic; check whether it is stale
                time_gap = (self.get_timestamp(weibo['posted_at']) -
                            self.get_timestamp(topic['latest_time'])) / 86400  # gap in days
                if topic['text_num'] < 5 and time_gap > 60:  # fewer than 5 posts and no update for two months: drop the topic
                    topic_collection.delete_one({'_id': topic['_id']})
                else:
                    similarities.append((similar_score, topic['_id']))
            else:
                similarities.append((similar_score, topic['_id']))

        if similarities:
            score, best_id = max(similarities, key=lambda item: item[0])
        else:
            score, best_id = 0.0, None

        if score >= 0.5:  # the weibo joins the closest topic; update that topic
            topic = topic_collection.find_one({'_id': best_id})
            keywords = topic['keywords']
            text_num = topic['text_num']
            topic['text_id_list'].append(weibo['id'])
            topic['text_list'].append(weibo['content'])
            ltp.netag_dict_merge(topic['entity'], entity)
            self.dict_combine(keywords, vector, text_num)
            topic['keywords'] = dict(
                sorted(keywords.items(),
                       key=lambda item: item[1],
                       reverse=True))
            topic['heat'] += weibo['comment_count'] + sqrt(
                weibo['forward_count'] + weibo['like_count'])
            topic['text_num'] += 1
            if weibo['posted_at'] < topic['start_time']:
                topic['start_time'] = weibo['posted_at']
            elif weibo['posted_at'] > topic['latest_time']:
                topic['latest_time'] = weibo['posted_at']
            topic['central_time'] = self.datetime_update(
                topic['central_time'], weibo['posted_at'], text_num)
            topic_collection.update_one({'_id': topic['_id']}, {'$set': topic},
                                        True)
        else:  # the weibo starts a new topic of its own
            one_topic = {
                'entity': {},
                'keywords': {},
                'text_id_list': [],
                'text_list': [],
                'text_num': 1,
                'heat': 0,
                'start_time': None,
                'latest_time': None,
                'central_time': None,
            }
            one_topic['text_id_list'].append(weibo['id'])
            one_topic['text_list'].append(weibo['content'])
            one_topic['entity'] = entity
            one_topic['heat'] = weibo['comment_count'] + sqrt(
                weibo['forward_count'] + weibo['like_count'])
            one_topic['start_time'] = one_topic['latest_time'] = one_topic[
                'central_time'] = weibo['posted_at']
            one_topic['keywords'] = dict(
                sorted(vector.items(), key=lambda item: item[1], reverse=True))
            topic_collection.insert_one(one_topic)
        weibo['if_topic'] = True
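
Example #6 implements the Single-Pass strategy its docstring describes: compare each incoming weibo with every stored topic, merge it into the closest topic above a threshold, otherwise open a new topic. A stripped-down, self-contained sketch of the same idea, where whitespace tokenisation and a plain cosine similarity stand in for the Ltp / Text2Vec helpers and the 0.5 threshold is only illustrative:

from collections import Counter
from math import sqrt

def cosine(a, b):
    # cosine similarity between two sparse term-frequency dicts
    dot = sum(w * b.get(t, 0) for t, w in a.items())
    norm = sqrt(sum(w * w for w in a.values())) * sqrt(sum(w * w for w in b.values()))
    return dot / norm if norm else 0.0

def single_pass(texts, threshold=0.5):
    topics = []  # each topic: {'keywords': Counter, 'texts': [...]}
    for text in texts:
        vector = Counter(text.split())  # stand-in for Ltp segmentation + Text2Vec.text2dict
        best_topic, best_score = None, 0.0
        for topic in topics:
            score = cosine(vector, topic['keywords'])
            if score > best_score:
                best_topic, best_score = topic, score
        if best_topic is not None and best_score >= threshold:
            best_topic['keywords'].update(vector)  # merge the text into the closest topic
            best_topic['texts'].append(text)
        else:
            topics.append({'keywords': Counter(vector), 'texts': [text]})  # new topic
    return topics

Keeping the per-topic keyword counts in memory instead of MongoDB keeps the sketch short; the real method additionally maintains heat, time and named-entity statistics for each topic.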