class ElasticSearchServer():
    '''
    @summary: Thin wrapper that owns one ES client and forwards searches to it
    '''

    def __init__(self, host='192.168.60.40'):
        '''
        @summary: Create the underlying ES client
        ---------
        @param host: address handed to ES(); the default is the address that
                     used to be hard-coded here (pass 'localhost' for a local
                     node)
        ---------
        @result:
        '''
        self._es = ES(host)

    def search(self, table, body):
        '''
        @summary: Run a query
        ---------
        @param table: index/table name forwarded to the client
        @param body: query body forwarded to the client
        ---------
        @result: whatever the client's search() returns, unchanged
        '''
        return self._es.search(table, body)
class HotWeekSync():
    '''
    @summary: Clusters daily hot topics into the 7-day hot table
              (tab_iopm_hot_week_info)
    '''

    def __init__(self):
        self._es = ES()
        self._event_filter = EventFilter()
        self._event_filter.start()

    def _get_week_hots(self, text, release_time):
        '''
        @summary: Query the weekly hot whose TITLE best matches text
        ---------
        @param text: title text to match
        @param release_time: upper bound of the RELEASE_TIME window
        ---------
        @result: list of raw hits ("size" is 1, so at most one)
        '''
        # Lower bound of the window; presumably 7 days before release_time
        # -- TODO confirm against tools.get_before_date.
        before_week = tools.get_before_date(release_time, -7)

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {
                                "gte": before_week,
                                "lte": release_time
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            # share of query terms that has to match
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))
                        }
                    }
                }
            },
            "_source": [
                "ID",
                "TITLE",
                "HOT",
                "ARTICLE_COUNT",
                "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT",
                "HOT_DAY_IDS",
                "WEIGHT"
            ],
        }

        # No explicit sort: results come back ordered by match score.
        hots = self._es.search('tab_iopm_hot_week_info', body)

        return hots.get('hits', {}).get('hits', [])

    @staticmethod
    def _merge_counters(similar_hot, day_hot, hot_value=None,
                        article_count=None, vip_count=None,
                        negative_emotion_count=None):
        '''
        @summary: Build the update document for a weekly hot
        ---------
        @param similar_hot: the matched weekly hot (_source dict)
        @param day_hot: the daily hot being merged in
        @param hot_value: heat increment; None means day_hot is a new daily
                          hot and its own counters are used
        @param article_count: article increment, None -> day_hot value
        @param vip_count: VIP increment, None -> day_hot value
        @param negative_emotion_count: negative increment, None -> day_hot value
        ---------
        @result: dict of fields to write back
        '''

        def pick(override, key):
            # "is None" rather than "or": an explicit 0 (e.g. a non-VIP
            # article) must add nothing instead of re-adding the daily total.
            return override if override is not None else day_hot.get(key)

        data = {
            'HOT': similar_hot['HOT'] + pick(hot_value, 'HOT'),
            'ARTICLE_COUNT': similar_hot['ARTICLE_COUNT'] + pick(
                article_count, 'ARTICLE_COUNT'),
            'VIP_COUNT': similar_hot['VIP_COUNT'] + pick(
                vip_count, 'VIP_COUNT'),
            # Falls back to the DAILY hot's count; the previous code read it
            # from the matched weekly hot and so doubled the weekly value.
            'NEGATIVE_EMOTION_COUNT': similar_hot['NEGATIVE_EMOTION_COUNT']
            + pick(negative_emotion_count, 'NEGATIVE_EMOTION_COUNT'),
        }

        # Only a daily hot seen for the first time is appended to the id list.
        if hot_value is None:
            data['HOT_DAY_IDS'] = similar_hot['HOT_DAY_IDS'] + ',' + day_hot['ID']

        return data

    def cluster_week_hot(self,
                         day_hot,
                         hot_value=None,
                         article_count=None,
                         vip_count=None,
                         negative_emotion_count=None,
                         weight=None):
        '''
        @summary: Merge a daily hot into the weekly hots, or add it as a new one
        ---------
        @param day_hot: daily hot info
        @param hot_value: heat of a single article. When given, day_hot is an
                          already-clustered daily hot being updated, so only
                          this increment is added to the weekly heat
        @param article_count: article increment for an update
        @param vip_count: mainstream-media increment for an update
        @param negative_emotion_count: negative-article increment for an update
        @param weight: currently unused (the WEIGHT update is disabled)
        ---------
        @result: ID of the weekly hot that day_hot belongs to
        '''
        release_time = day_hot.get("RELEASE_TIME")
        article_text = tools.del_html_tag(day_hot.get("TITLE"))

        hots = self._get_week_hots(article_text, release_time)

        # Find the most similar weekly hot.
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = tools.del_html_tag(hot.get('TITLE'))

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            # Hits are ordered by score, so the first one is the best
            # candidate and the rest need no comparison.
            break

        if similar_hot:
            # Skip the update when the match is the same record, so the same
            # item is not counted repeatedly.
            if similar_hot["ID"] != day_hot["ID"]:
                data = self._merge_counters(
                    similar_hot, day_hot, hot_value, article_count,
                    vip_count, negative_emotion_count)

                self._es.update_by_id("tab_iopm_hot_week_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            return similar_hot.get("ID")

        # No similar weekly hot: store this daily hot as a new weekly hot.
        hot_info = deepcopy(day_hot)

        # Tag the new hot with the events found in title + content.
        del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
        text = hot_info['TITLE'] + del_tag_content
        contain_event_ids = self._event_filter.find_contain_event(text)
        hot_info['EVENT_IDS'] = ','.join(contain_event_ids)
        hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

        self._es.add('tab_iopm_hot_week_info', hot_info,
                     data_id=hot_info['ID'])

        return hot_info['ID']
class ArticleSync():
    '''
    @summary: Pulls articles of one table from the data-pool index, enriches
              them and stores them in tab_iopm_article_info
    '''

    # Emotion service code -> local code.
    # Service: 0 negative, 1 neutral, 2 positive.
    # Local:   1 positive, 2 negative, 3 neutral.
    _EMOTION_MAP = {0: 2, 1: 3, 2: 1}

    def __init__(self, table):
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        self._record_time = tools.get_json(
            tools.read_file(self._sync_time_file)) or {}

        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._yqtj_es = ES(YQTJ)
        self._data_pool_es = ES(DATA_POOL)
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._province_filter = ProvinceFilter()

        self._table = table
        self._per_record_time_key = '{table}_record_time'.format(
            table=self._table)

        self._vip_checked.start()
        self._compare_keywords.start()

    def get_article_info(self):
        '''
        @summary: Blank article structure
        ---------
        ---------
        @result: new dict holding every article field with its empty value
        '''
        article_info = {
            "EMOTION": None,
            "HOST": "",
            "AUTHOR": "",
            "URL": "",
            "WEBSITE_NAME": "",
            "ACCOUNT": "",
            "REVIEW_COUNT": None,
            "KEYWORDS_COUNT": None,
            "RELEASE_TIME": "",
            "CONTENT": "",
            "ID": None,
            "UUID": "",
            "WEIGHT": None,
            "CLUES_IDS": "",
            "UP_COUNT": None,
            "INTERACTION_COUNT": None,
            "RECORD_TIME": None,
            "COMMENT_COUNT": None,
            "IS_VIP": None,
            "INFO_TYPE": None,
            "HOT_ID": None,
            "KEYWORD_CLUES_ID": "",
            "MAY_INVALID": None,
            "TITLE": "",
            "KEYWORDS": "",
            "TRANSMIT_COUNT": None,
            "ZERO_ID": None,
            "FIRST_ID": None,
            "SECOND_ID": None,
            "SUMMARY": "",
            "WORD_CLOUD": "",
            "IMAGE_URL": ""
        }

        return article_info

    def get_article_clues_src(self):
        '''
        @summary: Blank row of the article <-> clue link table
        ---------
        ---------
        @result: new dict with empty CLUES_ID, ARTICLE_ID and ID
        '''
        article_clues_src = {"CLUES_ID": "", "ARTICLE_ID": "", "ID": ""}

        return article_clues_src

    def get_per_record_time(self):
        '''
        @summary: record_time checkpoint saved by the previous run
        ---------
        ---------
        @result: stored value for this table, or None when there is none
        '''
        per_record_time = self._record_time.get(self._per_record_time_key)

        return per_record_time

    def record_now_record_time(self, record_time):
        '''
        @summary: Persist the record_time checkpoint for this table
        ---------
        @param record_time: value to store
        ---------
        @result:
        '''
        self._record_time[self._per_record_time_key] = record_time
        tools.write_file(self._sync_time_file,
                         tools.dumps_json(self._record_time))

    def get_article(self):
        '''
        @summary: Fetch the next batch (up to 1500) of articles, oldest
                  record_time first. Paging uses record_time so there is
                  always data; ideally it would use release_time (TODO)
        ---------
        ---------
        @result: list of raw hits
        '''
        log.debug("取代做种子集...")

        per_record_time = self.get_per_record_time()
        today_time = tools.get_current_date("%Y-%m-%d")

        if per_record_time:
            # Resume after the checkpoint, limited to the last 30 days.
            min_day_ago = tools.get_before_date(
                today_time,
                -30,
                current_date_format='%Y-%m-%d',
                return_date_format='%Y-%m-%d')
            body = {
                "size": 1500,
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            "record_time": {
                                                "gt": per_record_time
                                            }
                                        }
                                    },
                                    {
                                        "range": {
                                            "release_time": {
                                                "gte": min_day_ago + ' 00:00:00',
                                                "lte": today_time + ' 23:59:59'
                                            }
                                        }
                                    }
                                ]
                            }
                        }
                    }
                },
                "sort": [{
                    "record_time": "asc"
                }]
            }

        else:
            # First run, no checkpoint: start from three days ago. This bound
            # was referenced without ever being computed, which raised
            # NameError on every first run.
            three_day_ago = tools.get_before_date(
                today_time,
                -3,
                current_date_format='%Y-%m-%d',
                return_date_format='%Y-%m-%d')
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "release_time": {
                                    "gte": three_day_ago + ' 00:00:00',
                                    "lte": today_time + ' 23:59:59'
                                }
                            }
                        }
                    }
                },
                "size": 1500,
                "sort": [{
                    "record_time": "asc"
                }]
            }

        log.debug(self._table + " => " + tools.dumps_json(body))

        article = self._data_pool_es.search(self._table, body)
        return article.get('hits', {}).get('hits', [])

    def deal_article(self, article_list):
        '''
        @summary: Enrich articles (keywords, region, summary, emotion, VIP,
                  weight, hot) and bulk-store the new ones. Articles that
                  already exist only get their interaction counters refreshed
        ---------
        @param article_list: list of article dicts; each is modified in place
        ---------
        @result:
        '''
        article_infos = []

        for article_info in article_list:
            # Interaction volume = likes + reposts + reviews + comments.
            article_info['INTERACTION_COUNT'] = (
                (article_info['UP_COUNT'] or 0)
                + (article_info['TRANSMIT_COUNT'] or 0)
                + (article_info['REVIEW_COUNT'] or 0)
                + (article_info['COMMENT_COUNT'] or 0))

            # Already stored: refresh the counters only and move on.
            if self._yqtj_es.get('tab_iopm_article_info', article_info["ID"]):
                log.debug('%s 已存在' % article_info['TITLE'])
                data = {
                    "INTERACTION_COUNT": article_info['INTERACTION_COUNT'],
                    "UP_COUNT": article_info['UP_COUNT'],
                    "TRANSMIT_COUNT": article_info['TRANSMIT_COUNT'],
                    "REVIEW_COUNT": article_info['REVIEW_COUNT'],
                    "COMMENT_COUNT": article_info['COMMENT_COUNT']
                }

                self._yqtj_es.update_by_id("tab_iopm_article_info",
                                           data_id=article_info.get("ID"),
                                           data=data)
                continue

            # Plain text used for matching: title + tag-free content.
            del_tag_content = tools.del_html_tag(article_info['CONTENT'])
            text = article_info['TITLE'] + del_tag_content

            # Region filter: without a local place name the weight is scaled
            # down (final weight = weight * weight_factor).
            contain_airs = ','.join(
                self._province_filter.find_contain_air(text))
            weight_factor = 1
            if not contain_airs and PROVINCE:
                weight_factor = 0.01

            # Clue keyword matching.
            (keywords, clues_ids, zero_ids, first_ids, second_ids,
             keyword_clues) = self._compare_keywords.get_contained_keys(text)

            article_info['KEYWORDS'] = (keywords + ',' + contain_airs
                                        if keywords else contain_airs)
            article_info['KEYWORDS'] = ','.join(
                set(article_info['KEYWORDS'].split(',')))
            article_info['CLUES_IDS'] = clues_ids
            article_info['ZERO_ID'] = zero_ids
            article_info['FIRST_ID'] = first_ids
            article_info['SECOND_ID'] = second_ids
            article_info['KEYWORDS_COUNT'] = len(keyword_clues)
            article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

            # Word cloud.
            word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
            article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

            # Summary, generated only when the article has none.
            if not article_info['SUMMARY']:
                article_info['SUMMARY'] = self._summary.get_summary(
                    del_tag_content)

            # Emotion: translate the service code; anything unknown is neutral.
            emotion = self._emotion.get_emotion(article_info['SUMMARY'])
            article_info['EMOTION'] = self._EMOTION_MAP.get(emotion, 3)

            # Mainstream media: append its category ids to the matched ones.
            is_vip, zero_id, first_id, second_id = self._vip_checked.is_vip(
                article_info['HOST'], article_info['WEBSITE_NAME'])
            article_info["IS_VIP"] = is_vip
            if is_vip:
                article_info['ZERO_ID'] = (
                    article_info['ZERO_ID'] + ',' + zero_id
                    if article_info['ZERO_ID'] else zero_id)
                article_info['FIRST_ID'] = (
                    article_info['FIRST_ID'] + ',' + first_id
                    if article_info['FIRST_ID'] else first_id)
                article_info['SECOND_ID'] = (
                    article_info['SECOND_ID'] + ',' + second_id
                    if article_info['SECOND_ID'] else second_id)

            # Relevance weight from the ranking service.
            url = IOPM_SERVICE_ADDRESS + 'related_sort'
            data = {
                'article_id': article_info['ID'],
                'clues_ids': article_info['CLUES_IDS'],
                'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
                'vip_count': article_info['IS_VIP'],
                'negative_emotion_count':
                1 if article_info['EMOTION'] == 2 else 0,
                'zero_ids': article_info['ZERO_ID']
            }

            result = tools.get_json_by_requests(url, data=data)
            # An empty response counts as weight 0 instead of aborting the
            # batch (everything collected so far would be lost).
            article_info['WEIGHT'] = (result or {}).get('weight',
                                                        0) * weight_factor

            # Hot clustering; weibo posts (INFO_TYPE 3) use the start of the
            # summary as title.
            if article_info['INFO_TYPE'] == 3:
                article_info['TITLE'] = article_info['SUMMARY'][:30]

            article_info['HOT_ID'] = self._hot_sync.get_hot_id(
                article_info, contain_airs, weight_factor)

            # The "二级分类" slot logs first_ids (matched categories); it used
            # to print first_id, the VIP lookup result.
            log.debug('''
                title %s
                release_time %s
                record_time %s
                url %s
                匹配的关键字:%s
                线索id %s
                一级分类 %s
                二级分类 %s
                三级分类 %s
                关键词-线索 %s
                地域 %s
                ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                       article_info['RECORD_TIME'], article_info["URL"],
                       keywords, clues_ids, zero_ids, first_ids, second_ids,
                       keyword_clues, contain_airs))

            article_infos.append(article_info)

        # Bulk insert of all new articles.
        print('article批量入库 size = %s' % len(article_infos))
        self._yqtj_es.add_batch(article_infos, "ID", 'tab_iopm_article_info')
class HotSync():
    '''
    @summary: Clusters articles into daily hots (tab_iopm_hot_info) and keeps
              the 7-day hots in step through HotWeekSync
    '''

    def __init__(self):
        self._es = ES()
        self._hot_week_sync = HotWeekSync()
        self._cut_text = CutText()
        self._cut_text.set_stop_words('utils/stop_words.txt')

    @staticmethod
    def _release_day(release_time):
        '''
        @summary: Date part of a "date time" string
        ---------
        @param release_time: e.g. '2018-01-02 03:04:05'
        ---------
        @result: text before the first space; the whole string when there is
                 no space (slicing with find() used to drop the last
                 character in that case)
        '''
        return release_time.split(' ', 1)[0]

    def _get_today_hots(self, text, release_time):
        '''
        @summary: Query the hot released on the same day whose TITLE best
                  matches text
        ---------
        @param text: title text to match
        @param release_time: release time of the article
        ---------
        @result: list of raw hits ("size" is 1, so at most one)
        '''
        release_day = self._release_day(release_time)

        # No "_source" filter: the whole hot document is returned.
        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # released on the same day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            # share of query terms that has to match
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))
                        }
                    }
                }
            }
        }

        # No explicit sort: results come back ordered by match score.
        hots = self._es.search('tab_iopm_hot_info', body)

        return hots.get('hits', {}).get('hits', [])

    def get_hot_id(self, article_info, positions, weight_factor):
        '''
        @summary: Attach the article to a similar daily hot, or create one
        ---------
        @param article_info: article dict
        @param positions: stored as POSITIONS on a newly created hot
        @param weight_factor: multiplier applied to heat and weight
        ---------
        @result: ID of the hot the article belongs to
        '''
        release_time = article_info.get("RELEASE_TIME")
        article_text = tools.del_html_tag(article_info.get("TITLE"))

        hots = self._get_today_hots(article_text, release_time)

        # Find the most similar hot.
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = tools.del_html_tag(hot.get('TITLE'))

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            # Hits are ordered by score, so the first one is the best
            # candidate and the rest need no comparison.
            break

        if similar_hot:
            # Skip the update when the hot is this very article, so it is not
            # counted repeatedly.
            if similar_hot["ID"] != article_info["ID"]:
                heat = INFO_WEIGHT.get(article_info["INFO_TYPE"],
                                       0) * weight_factor
                vip_increment = 1 if article_info["IS_VIP"] else 0
                negative_increment = 1 if article_info['EMOTION'] == 2 else 0

                data = {}
                data['HOT'] = similar_hot['HOT'] + heat
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + 1
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + vip_increment
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + negative_increment

                weight_temp = 0  # weight before the update minus weight after

                # Recompute the relevance when the hot is linked to clues.
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],
                        'hot_value': data['HOT'],
                        'clues_ids': similar_hot['CLUES_IDS'],
                        'article_count': data['ARTICLE_COUNT'],
                        'vip_count': data["VIP_COUNT"],
                        'negative_emotion_count':
                        data["NEGATIVE_EMOTION_COUNT"],
                        'zero_ids': article_info['ZERO_ID']
                    }

                    result = tools.get_json_by_requests(url, data=data_args)
                    # Keep the stored weight when the service returns nothing.
                    if result:
                        new_weight = result.get('weight', 0)
                        weight_temp = similar_hot['WEIGHT'] - new_weight
                        data['WEIGHT'] = new_weight * weight_factor

                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

                # Sync the 7-day hot with the same (weighted) heat that was
                # added to the daily hot above.
                self._hot_week_sync.cluster_week_hot(
                    similar_hot,
                    hot_value=heat,
                    article_count=1,
                    vip_count=vip_increment,
                    negative_emotion_count=negative_increment,
                    weight=weight_temp)

            return similar_hot.get("ID")

        # No similar hot: this article becomes a new hot.
        hot_info = deepcopy(article_info)
        hot_info.pop('HOT_ID', None)  # the hot table has no HOT_ID column

        # User-behaviour counters start at zero.
        hot_info['ACCEPT_COUNT'] = 0
        hot_info['UNACCEPT_COUNT'] = 0
        hot_info['WATCH_COUNT'] = 0

        hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
        hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info[
            'EMOTION'] == 2 else 0
        hot_info['HOT'] = INFO_WEIGHT.get(article_info["INFO_TYPE"],
                                          0) * weight_factor
        hot_info['ID'] = article_info.get("ID")
        hot_info['ARTICLE_COUNT'] = 1
        # TODO: the title was already tokenised for the similarity check;
        # reuse that result to save time.
        hot_info['HOT_KEYWORDS'] = ','.join(
            self._cut_text.cut_for_keyword(article_info["TITLE"]))
        hot_info['POSITIONS'] = positions
        hot_info['EVENT_IDS'] = ''  # event types are filled on weekly hots only

        self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

        # Sync the 7-day hot.
        self._hot_week_sync.cluster_week_hot(hot_info)

        return hot_info['ID']
class UpdateWeight():
    '''
    @summary: Recomputes the WEIGHT of stored articles and hots through the
              ranking service
    '''

    def __init__(self):
        self._yqtj_es = ES(YQTJ)

    def get_articles(self, table, record_time, release_time_begin,
                     release_time_end):
        '''
        @summary: Fetch up to 1500 records, oldest RECORD_TIME first
        ---------
        @param table: table to query
        @param record_time: only records with RECORD_TIME greater than this
        @param release_time_begin: inclusive lower RELEASE_TIME bound
        @param release_time_end: inclusive upper RELEASE_TIME bound
        ---------
        @result: list of raw hits
        '''
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "range": {
                                        "RECORD_TIME": {
                                            "gt": record_time
                                        }
                                    }
                                },
                                {
                                    "range": {
                                        "RELEASE_TIME": {
                                            "gte": release_time_begin,
                                            "lte": release_time_end
                                        }
                                    }
                                }
                            ]
                        }
                    }
                }
            },
            "size": 1500,
            "sort": [{
                "RECORD_TIME": "asc"
            }]
        }

        print(tools.dumps_json(body))

        article = self._yqtj_es.search(table, body)
        return article.get('hits', {}).get('hits', [])

    def update_article_weight(self, articles):
        '''
        @summary: Recompute WEIGHT of every article whose weight is not 0
        ---------
        @param articles: raw hits as returned by get_articles
        ---------
        @result: (release_time, record_time) of the last article that was
                 updated successfully; ('', '') when none was
        '''
        release_time = ''
        # Initialised so an empty or all-skipped batch returns cleanly
        # instead of raising UnboundLocalError.
        record_time = ''

        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'article_id': article_info['ID'],
                'clues_ids': article_info['CLUES_IDS'],
                'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
                'vip_count': article_info['IS_VIP'],
                # a count (1 when EMOTION is 2 = negative), not the raw
                # emotion code
                'negative_emotion_count':
                1 if article_info['EMOTION'] == 2 else 0,
                'zero_ids': article_info['ZERO_ID']
            }
            print(article_info["TITLE"])
            print(article_info["RELEASE_TIME"])

            # NOTE(review): other callers in this file post to
            # IOPM_SERVICE_ADDRESS + 'related_sort' -- confirm that the bare
            # address is intended here.
            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            # weight_factor (region) is not applied here
            weight = result.get('weight', 0)
            tools.print_one_line("修改相关度 %s -> %s" %
                                 (article_info['WEIGHT'], weight))

            if self._yqtj_es.update_by_id('tab_iopm_article_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                release_time = article_info["RELEASE_TIME"]
                record_time = article_info["RECORD_TIME"]

        return release_time, record_time

    def update_hot_weight(self, articles):
        '''
        @summary: Recompute WEIGHT of every hot whose weight is not 0
        ---------
        @param articles: raw hits (hots) as returned by get_articles
        ---------
        @result: RECORD_TIME of the last hot that was updated successfully;
                 '' when none was
        '''
        # Initialised so an empty or all-skipped batch returns cleanly
        # instead of raising UnboundLocalError.
        record_time = ''

        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'hot_id': article_info['ID'],
                'hot_value': article_info['HOT'],
                'clues_ids': article_info['CLUES_IDS'],
                'article_count': article_info['ARTICLE_COUNT'],
                'vip_count': article_info["VIP_COUNT"],
                'negative_emotion_count':
                article_info["NEGATIVE_EMOTION_COUNT"],
                'zero_ids': article_info['ZERO_ID']
            }
            print('''
                release_time %s
                record_time %s
                ''' % (article_info["RELEASE_TIME"],
                       article_info["RECORD_TIME"]))

            # NOTE(review): other callers in this file post to
            # IOPM_SERVICE_ADDRESS + 'related_sort' -- confirm that the bare
            # address is intended here.
            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            # weight_factor (region) is not applied here
            weight = result.get('weight', 0)
            tools.print_one_line("修改相关度 %s -> %s" %
                                 (article_info['WEIGHT'], weight))

            if self._yqtj_es.update_by_id('tab_iopm_hot_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                record_time = article_info['RECORD_TIME']

        return record_time
class NewsCluster():
    '''
    @summary: Clusters rows of tab_news_csr_result into per-day hots
              (tab_news_csr_hot), resuming from the last processed id
    '''

    def __init__(self):
        self._es = ES()

        # Resume point; 0 when the checkpoint file holds nothing.
        saved_id = tools.read_file(STO_CURRENT_ID_FILE)
        self._current_csr_res_id = int(saved_id) if saved_id else 0

    @staticmethod
    def _day_bounds(start_time):
        '''
        @summary: First and last second of the day start_time falls on
        ---------
        @param start_time: e.g. '2018-01-02 03:04:05'
        ---------
        @result: (day + ' 00:00:00', day + ' 23:59:59'); day is the text
                 before the first space, or the whole string without one
        '''
        news_day_time = start_time.split(' ', 1)[0]
        return news_day_time + ' 00:00:00', news_day_time + ' 23:59:59'

    def _get_same_day_hots(self, text, start_time):
        '''
        @summary: Query hots of the same day whose csr_content matches text
        ---------
        @param text: news content to match
        @param start_time: start time of the news
        ---------
        @result: list of raw hits
        '''
        # The upper bound used to be ' 59:59:59', which is not a valid time.
        day_begin, day_end = self._day_bounds(start_time)

        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "start_time": {
                                "gte": day_begin,
                                'lte': day_end
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["csr_content"],
                            "operator": "or",
                            # share of query terms that has to match
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))
                        }
                    }
                }
            },
            "_source": ["hot_id", "csr_res_ids", "csr_content", 'hot'],
            "highlight": {
                "fields": {
                    "csr_content": {}
                }
            }
        }

        # No explicit sort: results come back ordered by match score.
        hots = self._es.search('tab_news_csr_hot', body)

        return hots.get('hits', {}).get('hits', [])

    def _save_current_id(self):
        '''
        @summary: Persist the id reached so far so the next run continues
                  from it
        ---------
        ---------
        @result:
        '''
        tools.write_file(STO_CURRENT_ID_FILE, str(self._current_csr_res_id))

    def deal_news(self):
        '''
        @summary: Endless loop: read tab_news_csr_result rows beyond the
                  checkpoint and attach each one to a similar hot of the same
                  day, or make it a new hot
        ---------
        ---------
        @result: never returns
        '''
        while True:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "csr_res_id": {  # rows beyond the checkpoint
                                    "gt": self._current_csr_res_id
                                }
                            }
                        }
                    }
                },
                "_source": ["csr_res_id", "csr_content", "start_time"],
                "sort": [{
                    "csr_res_id": "asc"
                }]
            }

            news_json = self._es.search('tab_news_csr_result', body)
            news_list = news_json.get('hits', {}).get('hits', [])

            if not news_list:
                log.debug(
                    'tab_news_csr_result 表中无大于%s的csr_res_id\nsleep %s...'
                    % (self._current_csr_res_id, SLEEP_TIME))
                tools.delay_time(SLEEP_TIME)
                continue

            for news_info in news_list:
                news = news_info.get('_source')
                csr_res_id = news.get('csr_res_id')
                csr_content = news.get('csr_content')
                start_time = news.get('start_time')

                log.debug('''
                    处理 tab_news_csr_result
                    csr_res_id %s
                    start_time %s
                    csr_content %s
                    ''' % (csr_res_id, start_time, csr_content))

                # Look for a similar hot.
                similar_hot = None
                hots = self._get_same_day_hots(csr_content, start_time)

                for hot_info in hots:
                    hot = hot_info.get('_source')
                    hot_text = hot.get('csr_content')

                    temp_similarity = compare_text(csr_content, hot_text)
                    if temp_similarity > MIN_SIMILARITY:
                        similar_hot = hot

                    # Hits are ordered by score, so the first one is the best
                    # candidate and the rest need no comparison.
                    break

                if similar_hot:
                    # Similar hot found: bump its heat and append this id.
                    log.debug('找到所属热点:%s' % similar_hot.get('csr_content'))

                    data = {}
                    data["hot"] = similar_hot["hot"] + 1
                    # str() so numeric ids (the id is also used as a numeric
                    # range bound) do not break the concatenation.
                    data["csr_res_ids"] = str(
                        similar_hot["csr_res_ids"]) + ',' + str(csr_res_id)

                    self._es.update_by_id("tab_news_csr_hot",
                                          data_id=similar_hot.get("hot_id"),
                                          data=data)

                else:
                    # Nothing similar: this news becomes a hot itself.
                    log.debug('无所属热点')

                    hot_info = {
                        'hot_id': csr_res_id,
                        'hot': 1,
                        'start_time': start_time,
                        'csr_res_ids': csr_res_id,
                        'csr_content': csr_content
                    }
                    self._es.add('tab_news_csr_hot',
                                 hot_info,
                                 data_id=csr_res_id)

                # Checkpoint after every processed row.
                self._current_csr_res_id = csr_res_id
                self._save_current_id()
class HotSync():
    '''
    @summary: Clusters articles into daily hots (tab_iopm_hot_info); here HOT
              is a plain article counter
    '''

    def __init__(self):
        self._es = ES()

    @staticmethod
    def _release_day(release_time):
        '''
        @summary: Date part of a "date time" string
        ---------
        @param release_time: e.g. '2018-01-02 03:04:05'
        ---------
        @result: text before the first space; the whole string when there is
                 no space (slicing with find() used to drop the last
                 character in that case)
        '''
        return release_time.split(' ', 1)[0]

    def _get_today_hots(self, text, release_time):
        '''
        @summary: Query hots released on the same day whose TITLE matches text
        ---------
        @param text: title text to match
        @param release_time: release time of the article
        ---------
        @result: list of raw hits
        '''
        release_day = self._release_day(release_time)

        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # released on the same day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            # share of query terms that has to match
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))
                        }
                    }
                }
            },
            "_source": [
                "ID", "TITLE", "CONTENT", "HOT", "CLUES_IDS", "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT"
            ],
            "highlight": {
                "fields": {
                    "TITLE": {}
                }
            }
        }

        # No explicit sort: results come back ordered by match score.
        hots = self._es.search('tab_iopm_hot_info', body)

        return hots.get('hits', {}).get('hits', [])

    def get_hot_id(self, article_info):
        '''
        @summary: Attach the article to a similar daily hot, or create one
        ---------
        @param article_info: article dict
        ---------
        @result: ID of the hot the article belongs to
        '''
        release_time = article_info.get("RELEASE_TIME")
        article_text = tools.del_html_tag(article_info.get("TITLE"))

        hots = self._get_today_hots(article_text, release_time)

        # Find the most similar hot.
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = tools.del_html_tag(hot.get('TITLE'))

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            # Hits are ordered by score, so the first one is the best
            # candidate and the rest need no comparison.
            break

        if similar_hot:
            # Skip the update when the hot is this very article, so it is not
            # counted repeatedly.
            if similar_hot["ID"] != article_info["ID"]:
                data = {}

                data["HOT"] = similar_hot["HOT"] + 1

                # Mainstream-media and negative-article counters.
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    1 if article_info["IS_VIP"] else 0)
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + (
                        1 if article_info['EMOTION'] == 2 else 0)

                # Recompute the relevance when the hot is linked to clues.
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],
                        'hot_value': data['HOT'],
                        # 'clues_ids' is the key every other caller of
                        # related_sort in this file sends; this one used to
                        # send 'clues_id'.
                        'clues_ids': similar_hot['CLUES_IDS'],
                        'article_count': data['HOT'],
                        'vip_count': data["VIP_COUNT"],
                        'negative_emotion_count':
                        data["NEGATIVE_EMOTION_COUNT"],
                        'zero_ids': article_info['ZERO_ID']
                    }

                    result = tools.get_json_by_requests(url, data=data_args)
                    if result:
                        data['WEIGHT'] = result.get('weight', 0)

                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            return similar_hot.get("ID")

        # No similar hot: this article becomes a new hot.
        hot_info = deepcopy(article_info)
        hot_info.pop('HOT_ID', None)  # the hot table has no HOT_ID column

        # User-behaviour counters start at zero.
        hot_info['ACCEPT_COUNT'] = 0
        hot_info['UNACCEPT_COUNT'] = 0
        hot_info['WATCH_COUNT'] = 0

        hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
        hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info[
            'EMOTION'] == 2 else 0
        hot_info['HOT'] = 1
        hot_info['ID'] = article_info.get("ID")

        self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

        return hot_info['ID']