def __init__(self, mainwindow):
    """Bind the MongoDB collections and the Ltp object, then build the UI.

    :param mainwindow: object passed straight through to ``setupUi``.
    """
    super().__init__()
    database = MongoDB.get_client()[MONGO_DB]
    self.collection = database['topic']
    self.weibo_collection = database['weibo']
    self.ltp = Ltp.get_object()
    # Both start empty; they are filled in elsewhere in the class.
    self.area, self.keys = '', []
    self.setupUi(mainwindow)
def __init__(self, parent=None):
    """Store the parent, bind the MongoDB collections and build the UI.

    :param parent: optional parent object, forwarded to the base class and
        kept on ``self.parent``.
    """
    super().__init__(parent)
    self.parent = parent
    database = MongoDB.get_client()[MONGO_DB]
    self.topic_collection = database['topic']
    self.weibo_collection = database['weibo']
    # All three flags start switched off.
    self.attention = self.emotion = self._time = False
    self.ltp = Ltp.get_object()
    self.setupUi()
def __init__(self, id, parent=None):
    """Store the given id and parent, bind the MongoDB collections, build the UI.

    :param id: identifier kept on ``self.id``.
    :param parent: optional parent object, forwarded to the base class and
        kept on ``self.parent``.
    """
    super().__init__(parent)
    self.parent = parent
    self.ltp = Ltp.get_object()
    database = MongoDB.get_client()[MONGO_DB]
    self.topic_collection = database['topic']
    self.weibo_collection = database['weibo']
    self.comment_collection = database['comment']
    self.id = id
    self.setupUi()
def run(self):
    """Cluster every stored weibo into topics, oldest first.

    Each weibo is passed to ``Tdt.single_pass`` against the ``'topic'``
    collection and then written back to the ``weibo`` collection. After each
    weibo the running total is emitted on ``self.count``; once the cursor is
    exhausted ``True`` is emitted on ``self.stop``.
    """
    ltp = Ltp.get_object()
    tdt = Tdt.get_object()
    model = Text2Vec.get_object()
    weibo_collection = MongoDB.get_client()[MONGO_DB]['weibo']

    ordered_weibos = weibo_collection.find().sort('posted_at', pymongo.ASCENDING)
    for processed, weibo in enumerate(ordered_weibos, start=1):
        tdt.single_pass(weibo, 'topic', ltp, model)
        # Persist whatever single_pass changed on the weibo dict.
        weibo_collection.update_one(
            {'_id': weibo['_id']}, {'$set': weibo}, upsert=True)
        self.count.emit(processed)

    self.stop.emit(True)
def __init__(self):
    """Initialise the base class and keep the object from ``Ltp.get_object()``."""
    super().__init__()
    ltp_object = Ltp.get_object()
    self.ltp = ltp_object
def single_pass(self, weibo, topic_table, ltp=None, text2vec=None):
    """Assign one weibo to a topic with the Single-Pass clustering algorithm.

    The weibo is compared with every topic stored in ``topic_table``. If the
    best similarity reaches the join threshold (0.5), the weibo is merged
    into that topic and the topic is updated. Otherwise the weibo becomes a
    new topic of its own. While scanning, a topic that scores below 0.4, has
    fewer than 5 texts and whose ``latest_time`` is more than 60 days older
    than the weibo is deleted.

    A weibo whose ``if_topic`` flag is already truthy is skipped. Otherwise
    the flag is set to ``True`` on the dict before returning.

    :param weibo: dict, weibo record. Reads ``content``, ``id``,
        ``posted_at``, ``comment_count``, ``forward_count`` and
        ``like_count``; mutated in place (``if_topic``).
    :param topic_table: str, name of the MongoDB topic collection.
    :param ltp: Ltp instance; ``Ltp.get_object()`` is used when ``None``.
    :param text2vec: Text2Vec instance; ``Text2Vec.get_object()`` is used
        when ``None``.
    :return: None
    """
    if weibo.get('if_topic'):
        return

    if ltp is None:
        ltp = Ltp.get_object()
    model = Text2Vec.get_object() if text2vec is None else text2vec

    join_threshold = 0.5    # minimum similarity for joining a topic
    stale_threshold = 0.4   # below this a topic is a candidate for removal
    stale_min_texts = 5
    stale_days = 60

    parser = ltp.text_parser(weibo['content'])
    # Weibo segments: [title, body, hashtag]
    vector = model.text2dict(list(parser[0:3]))
    entity = parser[3]  # named entities

    topic_collection = MongoDB.get_client()[MONGO_DB][topic_table]

    # Keep the best-matching topic document itself. The first topic wins on
    # ties. Looking it up again by position would depend on the collection
    # returning documents in the same order across two separate queries.
    best_topic = None
    best_score = 0.0
    for topic in topic_collection.find():
        # Compare against at most len(vector) + 1 topic keywords, taken in
        # stored order (keywords are saved sorted by weight, descending).
        topic_vector = {}
        for key, value in topic['keywords'].items():
            if len(topic_vector) > len(vector):
                break
            topic_vector[key] = value

        similar_score = model.similarity(topic_vector, vector)

        if similar_score < stale_threshold:
            # Low similarity: the weibo does not belong to this topic, so
            # check whether the topic should be eliminated.
            # Presumably get_timestamp returns seconds, making this a
            # difference in days -- confirm against the helper.
            idle_days = (self.get_timestamp(weibo['posted_at'])
                         - self.get_timestamp(topic['latest_time'])) / 86400
            if topic['text_num'] < stale_min_texts and idle_days > stale_days:
                topic_collection.delete_one({'_id': topic['_id']})
                continue

        if best_topic is None or similar_score > best_score:
            best_topic = topic
            best_score = similar_score

    heat = weibo['comment_count'] + sqrt(
        weibo['forward_count'] + weibo['like_count'])
    posted_at = weibo['posted_at']

    if best_topic is not None and best_score >= join_threshold:
        # The weibo joins the topic; update the topic.
        topic = best_topic
        keywords = topic['keywords']
        previous_text_num = topic['text_num']

        topic['text_id_list'].append(weibo['id'])
        topic['text_list'].append(weibo['content'])
        # Both helpers are expected to update their first argument in place.
        ltp.netag_dict_merge(topic['entity'], entity)
        self.dict_combine(keywords, vector, previous_text_num)
        topic['keywords'] = dict(
            sorted(keywords.items(), key=lambda item: item[1], reverse=True))
        topic['heat'] += heat
        topic['text_num'] += 1

        if posted_at < topic['start_time']:
            topic['start_time'] = posted_at
        elif posted_at > topic['latest_time']:
            topic['latest_time'] = posted_at
        topic['central_time'] = self.datetime_update(
            topic['central_time'], posted_at, previous_text_num)

        topic_collection.update_one(
            {'_id': topic['_id']}, {'$set': topic}, upsert=True)
    else:
        # The weibo forms a new topic of its own.
        topic_collection.insert_one({
            'entity': entity,
            'keywords': dict(
                sorted(vector.items(), key=lambda item: item[1], reverse=True)),
            'text_id_list': [weibo['id']],
            'text_list': [weibo['content']],
            'text_num': 1,
            'heat': heat,
            'start_time': posted_at,
            'latest_time': posted_at,
            'central_time': posted_at,
        })

    weibo['if_topic'] = True