def run(self):
    """Worker-thread entry point: score the sentiment of every comment of one topic.

    Reads the topic identified by ``self.id``, walks its ``text_id_list``,
    scores each not-yet-scored comment with ``EmotionAnalysis`` and writes the
    score back into the ``comment`` collection.  Progress is reported through
    the Qt signals ``self.count`` (percentage) and ``self.stop`` (done flag).
    """
    # NOTE(review): uses get_object2() while sibling code uses get_object() —
    # presumably a second singleton variant; confirm against the Ltp class.
    ltp = Ltp.get_object2()
    ltp.load_dict(ALL_DICT_PATH)
    analyzer = EmotionAnalysis(ltp)
    topic_collection = MongoDB.get_client()[MONGO_DB]['topic']
    comment_collection = MongoDB.get_client()[MONGO_DB]['comment']
    topic = topic_collection.find_one({'_id': ObjectId(self.id)})
    count1 = 0  # number of weibo posts processed so far (progress numerator)
    for weibo_id in topic['text_id_list']:
        # Two schemas: the 'weibo' database keys comments by post id directly;
        # other databases go through a 'mid' mapping collection first.
        if MONGO_DB == 'weibo':
            comments = comment_collection.find({'id': weibo_id})
        else:
            data = MongoDB.get_client()[MONGO_DB]['mid'].find_one(
                {'id': weibo_id})
            if not data:
                continue
            comments = comment_collection.find({'mid': data['mid']})
        for data in comments:
            # Skip comments that already carry a non-empty score.
            if 'score' in data and data['score']:
                continue
            content = data['content'].strip()
            if not content:
                continue
            data['score'] = analyzer.sent_sentiment_score(
                data['content'].strip())
            # Third positional argument is pymongo's upsert=True.
            comment_collection.update_one({'_id': data['_id']},
                                          {'$set': data}, True)
        count1 += 1
        # Percentage progress; assumes topic['text_num'] > 0 — TODO confirm.
        self.count.emit(round((count1 / topic['text_num']) * 100))
    self.stop.emit(True)
def __init__(self, mainwindow):
    """Wire up MongoDB collections, the shared Ltp instance and the UI."""
    super().__init__()
    mongo_db = MongoDB.get_client()[MONGO_DB]
    self.collection = mongo_db['topic']
    self.weibo_collection = mongo_db['weibo']
    self.ltp = Ltp.get_object()
    # Filter state: selected area and keyword list start out empty.
    self.area = ''
    self.keys = []
    self.setupUi(mainwindow)
def __init__(self, parent=None):
    """Bind MongoDB collections, reset view toggles and build the UI."""
    super().__init__(parent)
    self.parent = parent
    database = MongoDB.get_client()[MONGO_DB]
    self.topic_collection = database['topic']
    self.weibo_collection = database['weibo']
    # Display toggles: attention/emotion/time views all start disabled.
    self.attention = False
    self.emotion = False
    self._time = False
    self.ltp = Ltp.get_object()
    self.setupUi()
def __init__(self, id, parent=None):
    """Store the topic id and bind the topic/weibo/comment collections.

    NOTE(review): parameter ``id`` shadows the builtin; kept as-is because it
    is part of the public signature.
    """
    super().__init__(parent)
    self.parent = parent
    self.ltp = Ltp.get_object()
    database = MongoDB.get_client()[MONGO_DB]
    self.topic_collection = database['topic']
    self.weibo_collection = database['weibo']
    self.comment_collection = database['comment']
    self.id = id
    self.setupUi()
def run(self):
    """Worker-thread entry point: cluster every weibo post into topics.

    Iterates the ``weibo`` collection in chronological order, feeds each post
    to ``Tdt.single_pass`` (which mutates ``weibo`` in place, e.g. setting its
    topic flag) and persists the mutated document.  Emits ``self.count`` with
    the running count after each post and ``self.stop`` when finished.
    """
    ltp = Ltp.get_object()
    tdt = Tdt.get_object()
    model = Text2Vec.get_object()
    weibo_collection = MongoDB.get_client()[MONGO_DB]['weibo']
    count1 = 0  # posts processed so far
    # Chronological order matters: single-pass clustering is order-dependent.
    weibo_set = weibo_collection.find().sort('posted_at', pymongo.ASCENDING)
    for weibo in weibo_set:
        tdt.single_pass(weibo, 'topic', ltp, model)
        # Persist in-place mutations made by single_pass (upsert=True).
        weibo_collection.update_one({'_id': weibo['_id']}, {'$set': weibo},
                                    True)
        count1 += 1
        self.count.emit(count1)
    self.stop.emit(True)
# NOTE(review): fragment — ``fr1`` (text output file) and ``root`` (parsed XML
# tree) are defined before this chunk; presumably fr1 holds the training
# sentences matching these labels — verify against the preceding lines.
fr2 = codecs.open('train_label.txt', 'a', 'utf-8')
for child in root:
    for sentence in child.xpath('.//sentence'):
        text = str(sentence.xpath('.//text()')[0]).strip()
        label = sentence.attrib.get('polarity')
        # Map corpus polarity tags to numeric labels: POS->1, NEG->-1, else 0.
        if label:
            if label == 'POS':
                label = '1'
            elif label == 'NEG':
                label = '-1'
            else:
                label = '0'
        if text:
            # NOTE(review): if 'polarity' is absent, label is None here and
            # ``label + '\n'`` would raise TypeError — confirm the corpus
            # always carries the attribute.
            fr1.write(text + '\n')
            fr2.write(label + '\n')
        text = None
        label = None
fr1.close()
fr2.close()


if __name__ == '__main__':
    # Manual smoke test: score one sample sentence.
    ltp = Ltp(4)
    ltp.load_dict(ALL_DICT_PATH)
    analyzer = EmotionAnalysis(ltp)
    sent = '苹果说,用户从即日起可以预定新款iPad,有关产品将于3月16日开始率先在美国、澳大利亚、加拿大、法国、中国香港和新加坡等10多个国家和地区率先上市。'
    print(analyzer.sent_sentiment_score(sent.strip()))
def __init__(self):
    """Initialise the base class and grab the shared Ltp singleton."""
    super().__init__()
    self.ltp = Ltp.get_object()
def single_pass(self, weibo, topic_table, ltp=None, text2vec=None):
    """
    Single-pass clustering: if ``weibo`` matches an existing topic in the
    topic collection, merge it into that topic and update the topic's
    statistics; otherwise the post becomes a new topic of its own.

    :param ltp: Ltp instance (defaults to the shared singleton)
    :param text2vec: Text2Vec instance (defaults to the shared singleton)
    :param topic_table: str, name of the MongoDB topic collection
    :param weibo: dict, one weibo document; mutated in place
                  (``weibo['if_topic']`` is set when it starts a new topic)
    :return: None
    """
    # Already clustered into a topic — nothing to do.
    if 'if_topic' in weibo and weibo['if_topic']:
        return
    if not ltp:
        ltp = Ltp.get_object()
    if not text2vec:
        model = Text2Vec.get_object()
    else:
        model = text2vec
    content = weibo['content']
    parser = ltp.text_parser(content)
    # Post split into [title, body, hashtag] -> weighted term dict.
    vector = model.text2dict(list(parser[0:3]))
    entity = parser[3]  # named entities
    topic_collection = MongoDB.get_client()[MONGO_DB][topic_table]
    topic_set = topic_collection.find()
    similiratiy = []  # similarity of this post to every surviving topic
    for topic in topic_set:
        # if cls > 0 and cls != topic['cls'] :
        #     continue
        keydict = topic['keywords']
        # Truncate the topic keyword dict to roughly the post vector's size
        # (keywords are stored sorted by weight, so this keeps the top terms).
        vector2 = {}
        count = 0
        for key, value in keydict.items():
            if len(vector2) > len(vector):
                break
            vector2[key] = value
            count += value
        similar_score = model.similarity(vector2, vector)  # cosine-style score
        if similar_score < 0.4:
            # Low similarity: consider evicting stale, tiny topics.
            # Age gap between the post and the topic's last update, in days.
            time_gip = (self.get_timestamp(weibo['posted_at']) -
                        self.get_timestamp(topic['latest_time'])) / 86400
            if topic['text_num'] < 5 and time_gip > 60:
                # Fewer than 5 posts and idle for 2 months: drop the topic.
                # NOTE(review): deleted topics get no similarity entry, which
                # keeps ``similiratiy`` aligned with the remaining documents
                # only if a later find() returns them in the same order —
                # see the skip=index lookup below; verify.
                topic_collection.delete_one({'_id': topic['_id']})
            else:
                similiratiy.append(similar_score)
        else:
            similiratiy.append(similar_score)
    try:
        score = max(similiratiy)
    except:
        # NOTE(review): bare except; only ValueError from an empty list is
        # expected — max(similiratiy, default=0.0) would be the targeted form.
        score = 0.0
    if score >= 0.5:  # merge the post into the best-matching topic
        index = similiratiy.index(score)
        # NOTE(review): relies on find_one(skip=index) returning documents in
        # the same (unspecified) natural order as the earlier find() — fragile;
        # verify against pymongo's ordering guarantees.
        topic = topic_collection.find_one(skip=index)
        keywords = topic['keywords']
        text_num = topic['text_num']
        topic['text_id_list'].append(weibo['id'])
        topic['text_list'].append(weibo['content'])
        # Merge the post's named entities into the topic's entity dict.
        ltp.netag_dict_merge(topic['entity'], entity)
        # Fold the post's term weights into the topic keywords (weighted by
        # the topic's current size), then re-sort by weight descending.
        self.dict_combine(keywords, vector, text_num)
        topic['keywords'] = dict(
            sorted(keywords.items(), key=lambda item: item[1], reverse=True))
        # Heat: comments count linearly, forwards/likes under a square root.
        topic['heat'] += weibo['comment_count'] + sqrt(
            weibo['forward_count'] + weibo['like_count'])
        topic['text_num'] += 1
        # Widen the topic's time window to include this post.
        if weibo['posted_at'] < topic['start_time']:
            topic['start_time'] = weibo['posted_at']
        elif weibo['posted_at'] > topic['latest_time']:
            topic['latest_time'] = weibo['posted_at']
        topic['central_time'] = self.datetime_update(
            topic['central_time'], weibo['posted_at'], text_num)
        topic_collection.update_one({'_id': topic['_id']}, {'$set': topic},
                                    True)
    else:  # the post becomes a brand-new topic
        one_topic = {
            'entity': {},
            'keywords': {},
            'text_id_list': [],
            'text_list': [],
            'text_num': 1,
            'heat': 0,
            'start_time': None,
            'latest_time': None,
            'central_time': None,
            # 'cls': cls
        }
        one_topic['text_id_list'].append(weibo['id'])
        one_topic['text_list'].append(weibo['content'])
        one_topic['entity'] = entity
        one_topic['heat'] = weibo['comment_count'] + sqrt(
            weibo['forward_count'] + weibo['like_count'])
        one_topic['start_time'] = one_topic['latest_time'] = one_topic[
            'central_time'] = weibo['posted_at']
        one_topic['keywords'] = dict(
            sorted(vector.items(), key=lambda item: item[1], reverse=True))
        topic_collection.insert_one(one_topic)
        # Mark the post so it is never clustered twice.
        weibo['if_topic'] = True
# NOTE(review): the lines down to ``weibo['if_topic'] = True`` duplicate the
# tail of single_pass — this looks like chunk overlap from extraction; they
# belong inside that method, not at module level. Verify before applying.
one_topic['start_time'] = one_topic['latest_time'] = one_topic[
    'central_time'] = weibo['posted_at']
one_topic['keywords'] = dict(
    sorted(vector.items(), key=lambda item: item[1], reverse=True))
topic_collection.insert_one(one_topic)
weibo['if_topic'] = True


if __name__ == '__main__':
    # Path of the executing file and its data directory (Windows separators).
    dirname, filename = os.path.split(os.path.abspath(__file__))
    path = dirname + '\\data\\'
    """single-pass"""
    # filename = r'data\weibo.cut'
    # fr = codecs.open(filename, 'r', 'utf-8')
    # lines = fr.readlines()
    # data = []
    # for line in lines:
    #     data.append(line)
    # (tfidf_words, tfidf_weight) = tfidf_calculate(data)
    # fr.close()
    # Manual smoke test: build an Ltp with stopwords and a bare Tdt instance.
    ltp = Ltp(3)
    ltp.create_stopwordslist(STOPWORDS_DIR)
    # parser = ltp.text_parser('#成都七中实验学校食品安全问题# 哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈')
    # parser1 = ltp.text_parser('【网曝成都七中实验学校#给学生吃变质食品# 官方再次回应:已对8名负责人展开调查】据成都市温江区人民政府新闻办公室官方微博@金温江:温江区公安分局目前正在对掌握的成都七中实验学校负责食品安全的8名责任人开展全面深入的调查。区市场监管局对投诉反映的19个批次的食材进行了抽样,对所有冻库及库房内食材进行了查封,对新进食材进行全程监管。区市场监管局、区教育局举一反三,已组织开展全区大中小学和幼儿园食堂食品安全的专项检查,切实保障学生的身体健康。温江区委、区政府将依法依规对成都七中实验学校食品安全问题进行认真彻查,严肃处理相关责任人,及时公布调查处理结果。')
    # model = Text2Vec()
    # score = model.similarity(list(parser[0:3]), list(parser1[0:3]))
    # print(score)
    ttt = Tdt()
    # ttt.single_pass(ltp)