def cluster_week_hot(self, day_hot, hot_value=None, article_count=None, vip_count=None, negative_emotion_count=None, weight=None):
    '''
    @summary: Cluster a daily hot item into the 7-day (weekly) hot list.
    ---------
    @param day_hot: daily hot info (ES _source style dict)
    @param hot_value: heat of a single article; when not None this daily hot
        is an update of a hot already clustered into the weekly list, so only
        this one article's heat is added instead of the whole daily heat
    @param article_count: article-count delta (defaults to day_hot's count)
    @param vip_count: mainstream-media-count delta (defaults to day_hot's)
    @param negative_emotion_count: negative-sentiment delta (defaults to day_hot's)
    @param weight: relevance-weight delta (currently unused — see disabled line)
    ---------
    @result: id of the weekly hot this item was clustered into
    '''
    article_text = day_hot.get("TITLE")  # + day_hot.get("CONTENT")
    release_time = day_hot.get("RELEASE_TIME")

    article_text = tools.del_html_tag(article_text)
    hots = self._get_week_hots(article_text, release_time)

    # Find the most similar weekly hot.
    similar_hot = None
    max_similarity = 0
    for hot_info in hots:
        hot = hot_info.get('_source')
        hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
        hot_text = tools.del_html_tag(hot_text)
        temp_similarity = compare_text(article_text, hot_text)
        if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
            similar_hot = hot
            max_similarity = temp_similarity
        break  # hots are sorted by match score, so the first is the most similar

    if similar_hot:  # found a similar weekly hot
        if similar_hot["ID"] != day_hot["ID"]:  # avoid merging an item with itself
            data = {}
            # Update heat and article count.
            data['HOT'] = similar_hot['HOT'] + (hot_value or day_hot.get('HOT'))
            data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + (
                article_count or day_hot.get('ARTICLE_COUNT'))
            # Update mainstream-media and negative-sentiment counts.
            data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                vip_count or day_hot.get('VIP_COUNT'))
            # BUG FIX: the original fallback read hot.get('NEGATIVE_EMOTION_COUNT'),
            # where `hot` is the stale loop variable from the search above (i.e.
            # the matched hot itself, doubling its own count; NameError when hots
            # is empty). The fallback must come from day_hot like the other counters.
            data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                negative_emotion_count or day_hot.get('NEGATIVE_EMOTION_COUNT'))
            # Update relevance (disabled).
            # data['WEIGHT'] = similar_hot['WEIGHT'] + (weight or day_hot['WEIGHT'])

            # Append this daily hot's id unless this call is just a heat update
            # (in that case the id is already recorded).
            if not hot_value:
                data["HOT_DAY_IDS"] = similar_hot['HOT_DAY_IDS'] + ',' + day_hot['ID']

            # Persist the updated weekly hot.
            self._es.update_by_id("tab_iopm_hot_week_info",
                                  data_id=similar_hot.get("ID"),
                                  data=data)

        # Return the weekly hot id.
        return similar_hot.get("ID")
    else:
        # No similar hot found: promote this daily hot to a new weekly hot.
        hot_info = deepcopy(day_hot)

        # Attach matched event types.
        del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
        text = hot_info['TITLE'] + del_tag_content
        contain_event_ids = self._event_filter.find_contain_event(text)
        hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

        hot_info['HOT_DAY_IDS'] = day_hot.get("ID")
        self._es.add('tab_iopm_hot_week_info', hot_info, data_id=hot_info['ID'])

        # Return the weekly hot id.
        return hot_info['ID']
def main():
    '''
    Paged batch clustering: walk tab_iopm_article_info page by page, assign
    each article to the most similar hot (creating new hots as needed),
    buffering results in cluster_buffer and flushing them periodically.
    '''
    rownum = 0
    sql = 'select count(*) from tab_iopm_article_info'
    result = db.find(sql)
    articles_count = result[0][0]
    deal_count = 0
    while articles_count:
        # Fetch one page of articles.
        # NOTE: the original built a release_time-filtered variant of this
        # query first and unconditionally overwrote it (dead code) — removed.
        sql = '''
            select * from (select rownum r, id, title from tab_iopm_article_info where rownum <= %d and info_type != 3)
            where r > %d
        ''' % (rownum + PAGE_SIZE, rownum)
        articles = db.find(sql)
        if not articles:
            deal_cluster_buffer()  # flush whatever is still buffered
            break

        rownum += PAGE_SIZE

        # Fetch current hots as mutable lists: [id, title, article_count].
        sql = 'select id, title, hot from tab_iopm_hot_info'
        hots = [list(hot) for hot in db.find(sql)]

        # Current max hot id, used to mint ids for new clusters.
        sql = 'select max(id) from tab_iopm_hot_info'
        result = db.find(sql)
        max_hot_id = result[0][0] if result[0][0] else 0

        for article in articles:
            # Most similar hot so far; similarity is in the range 0~1.
            max_similar = {
                'similarity': 0,
                'hot_id': -1,
                'article_id': -1,
                'hot_title': '',
                'article_count': 0,
                'hot_pos': -1
            }
            article_id = article[1]
            article_text = article[2]
            for i, hot in enumerate(hots):
                hot_id = hot[0]
                hot_text = hot[1]
                similarity = compare_text(hot_text, article_text)
                # Remember the best match.
                if similarity > max_similar['similarity']:
                    max_similar['similarity'] = similarity
                    max_similar['hot_id'] = hot_id
                    max_similar['article_id'] = article_id
                    # Keep the shorter of the two titles as the hot's title.
                    max_similar['hot_title'] = article_text if len(
                        hot_text) > len(article_text) else hot_text
                    max_similar['hot_pos'] = i  # index used to update title/count below

            # The article found a cluster it belongs to.
            if max_similar['similarity'] >= SIMILARITY:
                # Buffer the hot/article association.
                if max_similar['hot_id'] not in cluster_buffer:
                    cluster_buffer[max_similar['hot_id']] = {
                        'title': '',
                        'article_ids': [],
                        'article_count': 0
                    }
                # Update the in-memory hot first ...
                hots[max_similar['hot_pos']][1] = max_similar['hot_title']  # hot title
                hots[max_similar['hot_pos']][2] += 1  # hot article count
                # ... then cache it. BUG FIX: the original cached
                # max_similar['article_count'], which is never written and is
                # always 0; store the real running count instead, matching the
                # incremental variant of this script.
                cluster_buffer[max_similar['hot_id']]['title'] = max_similar['hot_title']
                cluster_buffer[max_similar['hot_id']]['article_count'] = hots[
                    max_similar['hot_pos']][2]
                cluster_buffer[max_similar['hot_id']]['article_ids'].append(
                    max_similar['article_id'])
            else:
                # No hot is similar enough: the article starts a new cluster.
                max_hot_id += 1
                hots.append([max_hot_id, article_text, 1])  # 1 = article count
                # The article trivially matches its own cluster, so cache the
                # article/cluster association immediately.
                cluster_buffer[max_hot_id] = {
                    'title': article_text,
                    'article_ids': [article_id],
                    'article_count': 1
                }

            deal_count += 1
            tools.print_loading('正在聚类分析 已完成 %d/%d' % (deal_count, articles_count))

            # Flush to the database when the buffer grows too large.
            if len(cluster_buffer) > CLUSTER_BUFFER_ZISE:
                deal_cluster_buffer()
                # Reload hots: flushing may have changed them in the db.
                sql = 'select id, title, hot from tab_iopm_hot_info'
                hots = [list(hot) for hot in db.find(sql)]
def main(): db = OracleDB() # 查文章 sql = ''' select * from (select rownum r, id, title from tab_iopm_article_info where rownum >= 1) where r <= 100000 ''' articles = db.find(sql) # 查热点 sql = 'select id, title from tab_iopm_hot_info' hots = db.find(sql) for article in articles: max_similar = { 'similarity': 0, 'hot_id': -1, 'article_id': -1, 'hot_title': '' } # 最相似的文章 similarity表示相似度(0~1) article_id = article[1] article_text = article[2] for hot in hots: hot_id = hot[0] hot_text = hot[1] similarity = compare_text(hot_text, article_text) # print(''' # article_text %s # hot_text %s # similarity %s # '''%(article_text, hot_text, similarity)) if similarity > max_similar['similarity']: max_similar['similarity'] = similarity max_similar['hot_id'] = hot_id max_similar['article_id'] = article_id max_similar['hot_title'] = article_text if len(hot_text) > len( article_text) else hot_text if max_similar['similarity'] > SIMILARITY: sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % ( max_similar['hot_id'], max_similar['article_id']) db.update(sql) sql = "update tab_iopm_hot_info set hot = hot + 1, title = '%s' where id = %s" % ( max_similar['hot_title'], max_similar['hot_id']) db.update(sql) else: sql = 'select sequence.nextval from dual' hot_id = db.find(sql)[0][0] sql = "insert into tab_iopm_hot_info (id, title, hot) values (%s, '%s', 1)" % ( hot_id, article_text) db.add(sql) sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % ( hot_id, article_id) db.update(sql) sql = 'select id, title from tab_iopm_hot_info' hots = db.find(sql)
def get_hot_id(self, article_info, positions, weight_factor):
    '''
    @summary: Cluster an article into today's hot list (weighted variant).
    ---------
    @param article_info: article info (ES _source style dict)
    @param positions: position info stored on a newly created hot
    @param weight_factor: multiplier applied to heat / weight values
    ---------
    @result: id of the hot this article was clustered into
    '''
    # weight_factor = 1
    article_text = article_info.get("TITLE")  # + article_info.get("CONTENT")
    release_time = article_info.get("RELEASE_TIME")

    article_text = tools.del_html_tag(article_text)
    hots = self._get_today_hots(article_text, release_time)

    # Find the most similar hot.
    similar_hot = None
    max_similarity = 0
    for hot_info in hots:
        hot = hot_info.get('_source')
        hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
        hot_text = tools.del_html_tag(hot_text)
        temp_similarity = compare_text(article_text, hot_text)
        if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
            similar_hot = hot
            max_similarity = temp_similarity
        break  # hots are sorted by match score, so the first is the most similar

    if similar_hot:  # found a similar hot
        if similar_hot["ID"] != article_info["ID"]:  # avoid merging an article with itself
            data = {}
            # Update heat and article count.
            data['HOT'] = similar_hot['HOT'] + INFO_WEIGHT.get(
                article_info["INFO_TYPE"], 0) * weight_factor
            data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + 1
            # Update mainstream-media and negative-sentiment counts.
            data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + article_info["IS_VIP"]
            data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                1 if article_info['EMOTION'] == 2 else 0)

            weight_temp = 0  # difference between the weight before and after the update
            # Update relevance via the related-sort service.
            if similar_hot['CLUES_IDS']:
                url = IOPM_SERVICE_ADDRESS + 'related_sort'
                data_args = {
                    'hot_id': similar_hot['ID'],  # hot id
                    'hot_value': data['HOT'],  # heat value
                    'clues_ids': similar_hot['CLUES_IDS'],  # clue ids matched by related articles
                    'article_count': data['ARTICLE_COUNT'],  # total article count
                    'vip_count': data["VIP_COUNT"],  # mainstream media count
                    'negative_emotion_count': data["NEGATIVE_EMOTION_COUNT"],  # negative sentiment count
                    'zero_ids': article_info['ZERO_ID']
                }
                result = tools.get_json_by_requests(url, data=data_args)
                # BUG FIX: get_json_by_requests can return a falsy value on a
                # failed request; the original then crashed on result.get(...).
                # Guard like the unweighted variant of this method does.
                if result:
                    weight_temp = similar_hot['WEIGHT'] - result.get('weight', 0)
                    data['WEIGHT'] = result.get('weight', 0) * weight_factor

            # Persist the updated hot.
            self._es.update_by_id("tab_iopm_hot_info",
                                  data_id=similar_hot.get("ID"),
                                  data=data)

            # Sync the 7-day hot list.
            self._hot_week_sync.cluster_week_hot(
                similar_hot,
                hot_value=INFO_WEIGHT.get(article_info["INFO_TYPE"], 0),
                article_count=1,
                vip_count=article_info["IS_VIP"],
                negative_emotion_count=1 if article_info['EMOTION'] == 2 else 0,
                weight=weight_temp)

        # Return the hot id.
        return similar_hot.get("ID")
    else:
        # Promote this article to a new hot.
        hot_info = deepcopy(article_info)
        hot_info.pop('HOT_ID')  # the hot table has no hot_id column

        # User-behaviour counters default to zero.
        hot_info['ACCEPT_COUNT'] = 0
        hot_info['UNACCEPT_COUNT'] = 0
        hot_info['WATCH_COUNT'] = 0

        # Other values.
        hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
        hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info['EMOTION'] == 2 else 0
        hot_info['HOT'] = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) * weight_factor
        hot_info['ID'] = article_info.get("ID")
        hot_info['ARTICLE_COUNT'] = 1
        # Keywords; TODO could reuse the tokens produced during the similarity step.
        hot_info['HOT_KEYWORDS'] = ','.join(
            self._cut_text.cut_for_keyword(article_info["TITLE"]))
        hot_info['POSITIONS'] = positions
        hot_info['EVENT_IDS'] = ''  # event types (daily hots: not needed TODO | weekly hots set this)

        self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

        # Sync the 7-day hot list.
        self._hot_week_sync.cluster_week_hot(hot_info)

        # Return the hot id.
        return hot_info['ID']
def main():
    '''
    Incremental clustering loop: poll for newly recorded articles and assign
    each to the most similar recent hot (creating new hots as needed),
    buffering results in cluster_buffer and flushing once per poll cycle.
    '''
    deal_count = 0
    record_time = tools.get_current_date()  # e.g. 2017-11-07 08:09:11
    while True:
        # Fetch articles recorded at or after the current checkpoint.
        sql = '''
            select id, title, record_time from tab_iopm_article_info where record_time >= to_date('%s', 'yyyy-mm-dd hh24:mi:ss')
        ''' % (record_time)
        articles = db.find(sql)
        if not articles:
            deal_cluster_buffer()  # flush pending updates while idle
            print('''
                sql 未查到数据 %s
                等待新数据...
                ''' % sql)
            time.sleep(10)
            continue

        # Fetch hots from the last day as mutable lists: [id, title, article_count].
        # BUG FIX: the original kept db.find's rows as tuples, so the in-place
        # updates below (hots[pos][1] = ..., hots[pos][2] += 1) would raise
        # TypeError on a db-sourced row; convert like the paged variant does.
        sql = 'select id, title, hot from tab_iopm_hot_info_test where record_time >= sysdate-1'
        hots = [list(hot) for hot in db.find(sql)]

        # Current max hot id, used to mint ids for new clusters.
        sql = 'select max(id) from tab_iopm_hot_info_test'
        result = db.find(sql)
        max_hot_id = result[0][0] if result[0][0] else 0

        for article in articles:
            # Most similar hot so far; similarity is in the range 0~1.
            max_similar = {
                'similarity': 0,
                'hot_id': -1,
                'article_id': -1,
                'hot_title': '',
                'article_count': 0,
                'hot_pos': -1
            }
            article_id = article[0]
            # BUG FIX: the original used title[:title.find('-')], which chops
            # the LAST character off any title that contains no '-' (find
            # returns -1). partition() keeps the full title in that case and
            # still strips everything after the first '-' otherwise.
            article_title = article[1].partition('-')[0] if article[1] else ''
            # article_content = article[2]
            temp_record_time = article[2]
            article_text = article_title  # + article_content
            if not article_text:
                continue

            # Advance the checkpoint to the max record_time seen so far.
            if temp_record_time > record_time:
                record_time = temp_record_time

            for i, hot in enumerate(hots):
                hot_id = hot[0]
                hot_text = hot[1]
                similarity = compare_text(hot_text, article_text)
                # Remember the best match.
                if similarity > max_similar['similarity']:
                    max_similar['similarity'] = similarity
                    max_similar['hot_id'] = hot_id
                    max_similar['article_id'] = article_id
                    # Keep the shorter of the two titles as the hot title.
                    max_similar['hot_title'] = article_title if len(
                        hot_text) > len(article_title) else hot_text
                    max_similar['hot_pos'] = i  # index used to update title/count below

            # The article found a cluster it belongs to.
            if max_similar['similarity'] >= SIMILARITY:
                # Buffer the hot/article association.
                if max_similar['hot_id'] not in cluster_buffer:
                    cluster_buffer[max_similar['hot_id']] = {
                        'title': '',
                        'article_ids': [],
                        'article_count': 0
                    }
                hots[max_similar['hot_pos']][1] = max_similar['hot_title']  # hot title
                hots[max_similar['hot_pos']][2] += 1  # hot article count
                cluster_buffer[max_similar['hot_id']]['title'] = max_similar['hot_title']
                cluster_buffer[max_similar['hot_id']]['article_count'] = hots[
                    max_similar['hot_pos']][2]
                cluster_buffer[max_similar['hot_id']]['article_ids'].append(
                    max_similar['article_id'])
            else:
                # No hot is similar enough: the article starts a new cluster.
                max_hot_id += 1
                hots.append([max_hot_id, article_title, 1])  # 1 = article count
                # The article trivially matches its own cluster, so cache the
                # article/cluster association immediately.
                cluster_buffer[max_hot_id] = {
                    'title': article_title,
                    'article_ids': [article_id],
                    'article_count': 1
                }

            deal_count += 1
            tools.print_loading('正在聚类分析 已完成 %d' % (deal_count))

        # Flush the buffer after each poll cycle.
        deal_cluster_buffer()
def deal_news(self):
    '''
    @summary: Consume rows from tab_news_csr_result and cluster each into
        tab_news_csr_hot, polling forever and checkpointing progress by
        csr_res_id so a restart resumes where it left off.
    ---------
    ---------
    @result: never returns (infinite polling loop)
    '''
    while True:
        # Query for rows with csr_res_id greater than the checkpoint, ascending.
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "csr_res_id": {  # fetch ids greater than this csr_res_id
                                "gt": self._current_csr_res_id
                            }
                        }
                    }
                }
            },
            "_source": ["csr_res_id", "csr_content", "start_time"],
            "sort": [{
                "csr_res_id": "asc"
            }]
        }

        news_json = self._es.search('tab_news_csr_result', body)
        news_list = news_json.get('hits', {}).get('hits', [])
        if not news_list:
            # Nothing new — sleep and poll again.
            log.debug(
                'tab_news_csr_result 表中无大于%s的csr_res_id\nsleep %s...' %
                (self._current_csr_res_id, SLEEP_TIME))
            tools.delay_time(SLEEP_TIME)
            continue

        for news_info in news_list:
            news = news_info.get('_source')
            csr_res_id = news.get('csr_res_id')
            csr_content = news.get('csr_content')
            start_time = news.get('start_time')
            log.debug('''
                处理 tab_news_csr_result
                csr_res_id  %s
                start_time  %s
                csr_content %s
                ''' % (csr_res_id, start_time, csr_content))

            # Look for a similar, same-day hot.
            similar_hot = None
            hots = self._get_same_day_hots(csr_content, start_time)

            # Compare against candidates; accept the first above threshold.
            for hot_info in hots:
                hot = hot_info.get('_source')
                hot_text = hot.get('csr_content')
                temp_similarity = compare_text(csr_content, hot_text)
                if temp_similarity > MIN_SIMILARITY:
                    similar_hot = hot
                break  # hots are sorted by match score, so the first is the most similar

            # If a similar hot exists, append this csr_res_id and bump its
            # heat; otherwise record this item as a new hot.
            if similar_hot:  # found the hot this news belongs to
                log.debug('找到所属热点:%s' % similar_hot.get('csr_content'))
                data = {}
                # Bump heat and append the article id.
                # NOTE(review): string concatenation implies csr_res_id and
                # csr_res_ids are strings — confirm against the index mapping.
                data["hot"] = similar_hot["hot"] + 1
                data["csr_res_ids"] = similar_hot[
                    "csr_res_ids"] + ',' + csr_res_id
                # Persist the updated hot.
                self._es.update_by_id("tab_news_csr_hot",
                                      data_id=similar_hot.get("hot_id"),
                                      data=data)
            else:  # no similar hot: this news becomes a new hot
                log.debug('无所属热点')
                hot_info = {
                    'hot_id': csr_res_id,
                    'hot': 1,
                    'start_time': start_time,
                    'csr_res_ids': csr_res_id,
                    'csr_content': csr_content
                }
                self._es.add('tab_news_csr_hot', hot_info, data_id=csr_res_id)

            # Checkpoint the current id so a restart resumes from here.
            self._current_csr_res_id = csr_res_id
            self._save_current_id()
def get_hot_id(self, article_info):
    '''
    @summary: Cluster an article into today's hot list (unweighted variant:
        heat is incremented by 1 per article instead of by INFO_WEIGHT).
    ---------
    @param article_info: article info (ES _source style dict)
    ---------
    @result: id of the hot this article was clustered into
    '''
    article_text = article_info.get(
        "TITLE")  # + article_info.get("CONTENT")
    release_time = article_info.get("RELEASE_TIME")

    article_text = tools.del_html_tag(article_text)
    hots = self._get_today_hots(article_text, release_time)

    # Find the most similar hot.
    similar_hot = None
    max_similarity = 0
    for hot_info in hots:
        hot = hot_info.get('_source')
        hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
        hot_text = tools.del_html_tag(hot_text)
        temp_similarity = compare_text(article_text, hot_text)
        if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
            similar_hot = hot
            max_similarity = temp_similarity
        break  # hots are sorted by match score, so the first is the most similar

    if similar_hot:  # found a similar hot
        if similar_hot["ID"] != article_info["ID"]:  # avoid merging an article with itself
            data = {}
            # Bump the hot's heat.
            data["HOT"] = similar_hot["HOT"] + 1
            # Update mainstream-media and negative-sentiment counts.
            data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                1 if article_info["IS_VIP"] else 0)
            data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                'NEGATIVE_EMOTION_COUNT'] + (1 if article_info['EMOTION'] ==
                                             2 else 0)

            # Update relevance via the related-sort service.
            if similar_hot['CLUES_IDS']:
                url = IOPM_SERVICE_ADDRESS + 'related_sort'
                data_args = {
                    'hot_id': similar_hot['ID'],  # hot id
                    'hot_value': data['HOT'],  # heat value
                    # NOTE(review): key is 'clues_id' here but 'clues_ids' in
                    # the weighted variant of this method — confirm which one
                    # the service actually expects.
                    'clues_id': similar_hot['CLUES_IDS'],  # clue ids matched by related articles
                    # NOTE(review): article_count is sent as data['HOT'] (the
                    # heat value), not a real article count — verify this is
                    # intended; the weighted variant sends ARTICLE_COUNT.
                    'article_count': data['HOT'],
                    'vip_count': data["VIP_COUNT"],  # mainstream media count
                    'negative_emotion_count':
                    data["NEGATIVE_EMOTION_COUNT"],  # negative sentiment count
                    'zero_ids': article_info['ZERO_ID']
                }
                result = tools.get_json_by_requests(url, data=data_args)
                # Guard: the request may fail and return a falsy result.
                if result:
                    data['WEIGHT'] = result.get('weight', 0)

            # Persist the updated hot.
            self._es.update_by_id("tab_iopm_hot_info",
                                  data_id=similar_hot.get("ID"),
                                  data=data)
        # Return the hot id.
        return similar_hot.get("ID")
    else:
        # Promote this article to a new hot.
        hot_info = deepcopy(article_info)
        hot_info.pop('HOT_ID')  # the hot table has no hot_id column
        # User-behaviour counters default to zero.
        hot_info['ACCEPT_COUNT'] = 0
        hot_info['UNACCEPT_COUNT'] = 0
        hot_info['WATCH_COUNT'] = 0
        # Other values.
        hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
        hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info[
            'EMOTION'] == 2 else 0
        hot_info['HOT'] = 1
        hot_info['ID'] = article_info.get("ID")
        self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])
        # Return the hot id.
        return hot_info['ID']