class SyncES():
    def __init__(self):
        self._es = ES()
        self._db = OracleDB()
        self._max_id = tools.read_file(STO_MAX_ID_FILE)
        # The file stores a dict literal mapping table name -> last exported id
        self._max_id = self._max_id and eval(self._max_id) or {}

    def get_data(self, sql):
        return self._db.find(sql, to_json=True)

    def export_to_es(self, table, data, data_id):
        self._es.add(table=table, data=data, data_id=data_id)

    def sync_data(self, table, step=20):
        '''
        @summary: the id column must be set as the primary key first
        ---------
        @param table: table to export
        @param step: batch size; rows are exported step rows at a time
        ---------
        @result:
        '''
        max_id = self._max_id.get(table, 0)
        self._db.set_primary_key(table)

        while True:
            # Note: Oracle applies rownum before order by; strictly ordered
            # pagination would need the order by inside a subquery.
            inner_sql = 'select * from %s where id > %d and rownum <= %d order by id' % (
                table, max_id, step)
            datas = self.get_data(inner_sql)
            if not datas:
                self.close()
                break

            for data in datas:
                data_id = data['ID']
                data = tools.dumps_json(data)
                print(data)
                print(data_id)
                max_id = data_id
                self.export_to_es(table, data, data_id)

            self._max_id[table] = max_id

    def close(self):
        tools.write_file(STO_MAX_ID_FILE, str(self._max_id))
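
# A minimal driver for SyncES, assuming the project's ES, OracleDB, tools and
# STO_MAX_ID_FILE are importable and the target table has a numeric id column.
# The table name below is illustrative, not the project's real schema.
if __name__ == '__main__':
    sync_es = SyncES()
    # Resumes from the max id recorded in STO_MAX_ID_FILE by the previous run
    sync_es.sync_data('TAB_IOPM_ARTICLE_INFO', step=100)  # hypothetical table name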
"praise_count": 0, "uuid": "31ea4d35-a8ce-377f-bb6d-9846ab607aaa", "author": "", "play_count": 0, "release_time": "2017-12-21 11:47:17", "time_length": 0, "summary": "", "domain": "sina.com.cn", "comment_count": 0, "site_name": "新浪" } content_info = { "title": "十九大代表带你学报告——钟佰均谈脱贫攻坚", "domain": "qq.com", "uuid": "4f73c1bf-6b67-3b32-9ddc-3d141de72f05", "summary": "", "image_url": "http://vpic.video.qq.com/89220912/l051527zwgx_160_90_3.jpg", "url": "http://v.qq.com/x/page/l051527zwgx.html?ptag=iqiyi.news", "praise_count": 0, "play_count": 0, "release_time": "2017-12-21", "time_length": 0, "author": "", "content": "", "site_name": "爱奇艺", "comment_count": 0, "record_time": "2017-12-21 11:59:17" } es.add('video_news', content_info, content_info['uuid'])
class HotWeekSync():
    def __init__(self):
        self._es = ES()
        self._event_filter = EventFilter()
        self._event_filter.start()

    def _get_week_hots(self, text, release_time):
        before_week = tools.get_before_date(release_time, -7)

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # news published within the past week
                                "gte": before_week,
                                "lte": release_time
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))  # proportion of query keywords that must match
                        }
                    }
                }
            },
            "_source": [
                "ID",
                "TITLE",
                # "CONTENT",
                "HOT",
                "ARTICLE_COUNT",
                "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT",
                "HOT_DAY_IDS",
                "WEIGHT"
            ],
            # "highlight": {
            #     "fields": {
            #         "TITLE": {}
            #     }
            # }
        }

        # Results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_week_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def cluster_week_hot(self, day_hot, hot_value=None, article_count=None,
                         vip_count=None, negative_emotion_count=None, weight=None):
        '''
        @summary: clustering
        ---------
        @param day_hot: a daily hot record
        @param hot_value: heat of a single article (when not None, the daily hot
            was merely updated and the weekly hot has already absorbed it once,
            so only this article's heat should be added)
        @param article_count:
        @param vip_count:
        @param negative_emotion_count:
        @param weight:
        ---------
        @result:
        '''
        article_text = day_hot.get("TITLE")  # + day_hot.get("CONTENT")
        release_time = day_hot.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)
        hots = self._get_week_hots(article_text, release_time)

        # Find the most similar hot
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first hit is the most similar; no need to scan further

        if similar_hot:  # a similar hot exists
            if similar_hot["ID"] != day_hot["ID"]:  # avoid merging a record with itself more than once
                data = {}

                # Update the hot's heat and article count
                data['HOT'] = similar_hot['HOT'] + (hot_value or day_hot.get('HOT'))
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + (
                    article_count or day_hot.get('ARTICLE_COUNT'))

                # Update mainstream-media and negative-sentiment counts
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    vip_count or day_hot.get('VIP_COUNT'))
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                    negative_emotion_count or day_hot.get('NEGATIVE_EMOTION_COUNT'))

                # Update relevance
                # data['WEIGHT'] = similar_hot['WEIGHT'] + (weight or day_hot['WEIGHT'])

                # Update hot_day_ids
                if not hot_value:
                    data["HOT_DAY_IDS"] = similar_hot['HOT_DAY_IDS'] + ',' + day_hot['ID']

                # Update the hot
                self._es.update_by_id("tab_iopm_hot_week_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # Return the hot id
            return similar_hot.get("ID")

        else:  # add this record as a new hot
            hot_info = deepcopy(day_hot)

            # Work out the event types
            del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
            text = hot_info['TITLE'] + del_tag_content
            contain_event_ids = self._event_filter.find_contain_event(text)
            hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

            hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

            self._es.add('tab_iopm_hot_week_info', hot_info, data_id=hot_info['ID'])

            # Return the hot id
            return hot_info['ID']
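
# A hedged sketch of the hot_value contract documented above (all field values
# are illustrative, not real data): a newly created daily hot folds its full
# counters into the weekly cluster, while an updated daily hot passes only the
# delta contributed by the triggering article, since the weekly hot has
# already absorbed the rest once.
week_sync = HotWeekSync()

day_hot = {  # minimal daily hot record, fields as read by cluster_week_hot
    'ID': '1024', 'TITLE': '...', 'CONTENT': '',
    'RELEASE_TIME': '2017-12-21 11:47:17',
    'HOT': 3, 'ARTICLE_COUNT': 3, 'VIP_COUNT': 1, 'NEGATIVE_EMOTION_COUNT': 0,
}

# New daily hot: add its full counters to the weekly cluster
week_sync.cluster_week_hot(day_hot)

# Updated daily hot: add only the increment of the one new article
week_sync.cluster_week_hot(day_hot, hot_value=2, article_count=1,
                           vip_count=1, negative_emotion_count=0, weight=0)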
class HotSync():
    def __init__(self):
        self._es = ES()
        self._hot_week_sync = HotWeekSync()
        self._cut_text = CutText()
        self._cut_text.set_stop_words('utils/stop_words.txt')

    def _get_today_hots(self, text, release_time):
        release_day = release_time[:release_time.find(' ')]

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # news published the same day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))  # proportion of query keywords that must match
                        }
                    }
                }
            }  # ,
            # "_source": [
            #     "ID",
            #     "TITLE",
            #     # "CONTENT",
            #     "RELEASE_TIME",
            #     "WEIGHT",
            #     "HOT",
            #     "ARTICLE_COUNT",
            #     "CLUES_IDS",
            #     "VIP_COUNT",
            #     "NEGATIVE_EMOTION_COUNT"
            # ],
            # "highlight": {
            #     "fields": {
            #         "TITLE": {}
            #     }
            # }
        }

        # Results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def get_hot_id(self, article_info, positions, weight_factor):
        '''
        @summary: clustering
        ---------
        @param article_info:
        ---------
        @result:
        '''
        # weight_factor = 1
        article_text = article_info.get("TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)
        hots = self._get_today_hots(article_text, release_time)

        # Find the most similar hot
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first hit is the most similar; no need to scan further

        if similar_hot:  # a similar hot exists
            if similar_hot["ID"] != article_info["ID"]:  # avoid merging a record with itself more than once
                data = {}

                # Update the hot's heat and article count
                data['HOT'] = similar_hot['HOT'] + INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) * weight_factor
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + 1

                # Update mainstream-media and negative-sentiment counts
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + article_info["IS_VIP"]
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                    1 if article_info['EMOTION'] == 2 else 0)

                weight_temp = 0  # records the weight delta between before and after the update

                # Update relevance
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],  # article id
                        'hot_value': data['HOT'],  # heat value
                        'clues_ids': similar_hot['CLUES_IDS'],  # clue ids matched by related records
                        'article_count': data['ARTICLE_COUNT'],  # total article count
                        'vip_count': data["VIP_COUNT"],  # mainstream-media count
                        'negative_emotion_count': data["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                        'zero_ids': article_info['ZERO_ID']
                    }

                    result = tools.get_json_by_requests(url, data=data_args) or {}
                    weight_temp = similar_hot['WEIGHT'] - result.get('weight', 0)
                    data['WEIGHT'] = result.get('weight', 0) * weight_factor

                # Update the hot
                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

                # Sync to the 7-day hots
                self._hot_week_sync.cluster_week_hot(
                    similar_hot,
                    hot_value=INFO_WEIGHT.get(article_info["INFO_TYPE"], 0),
                    article_count=1,
                    vip_count=article_info["IS_VIP"],
                    negative_emotion_count=1 if article_info['EMOTION'] == 2 else 0,
                    weight=weight_temp)

            # Return the hot id
            return similar_hot.get("ID")

        else:  # add this record as a new hot
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID')  # the hot table has no hot_id column

            # User-behaviour counters default to zero
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            # Other values
            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info['EMOTION'] == 2 else 0
            hot_info['HOT'] = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) * weight_factor
            hot_info['ID'] = article_info.get("ID")
            hot_info['ARTICLE_COUNT'] = 1
            hot_info['HOT_KEYWORDS'] = ','.join(
                self._cut_text.cut_for_keyword(article_info["TITLE"]))  # keywords; could be sped up by reusing the tokens from the similarity step TODO
            hot_info['POSITIONS'] = positions
            hot_info['EVENT_IDS'] = ''  # event types (not needed for daily hots TODO | already added for weekly hots)

            self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

            # Sync to the 7-day hots
            self._hot_week_sync.cluster_week_hot(hot_info)

            # Return the hot id
            return hot_info['ID']
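
# A hedged usage sketch of the daily clustering above. The article_info dict
# carries only the fields get_hot_id actually reads; every value here is
# illustrative, not real pipeline output.
hot_sync = HotSync()

article_info = {
    'ID': 2048, 'HOT_ID': None, 'TITLE': '...', 'CONTENT': '',
    'RELEASE_TIME': '2017-12-21 11:47:17', 'INFO_TYPE': 1,
    'IS_VIP': 1, 'EMOTION': 2, 'ZERO_ID': '',
}

hot_id = hot_sync.get_hot_id(article_info, positions='', weight_factor=1)
print('clustered into hot %s' % hot_id)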
class ExportData():
    INSERT = 1
    UPDATE = 2
    EXCEPTION = 3

    def __init__(self, source_table='', aim_table='', key_map='', unique_key=None,
                 unique_key_mapping_source_key=None, update_read_status=True,
                 condition={'read_status': 0}, datas=[], callback='',
                 sync_to_es=False):
        '''
        @summary: initialisation
        ---------
        @param source_table: source table (MongoDB)
        @param aim_table: target table
        @param key_map: mapping from target-table keys to source-table keys, e.g.:
            key_map = {
                'aim_key1': 'str_source_key2',   # target key = source key's value, as str
                'aim_key2': 'int_source_key3',   # target key = source key's value, as int
                'aim_key3': 'date_source_key4',  # target key = source key's value, as date
                'aim_key4': 'vint_id',           # target key = literal value, as int
                'aim_key5': 'vstr_name',         # target key = literal value, as str
                'aim_key6': 'vdate_name',        # target key = literal value, as date
                'aim_key7': 'sint_select id from xxx',    # target key = result of the sql query, as int
                'aim_key8': 'sstr_select name from xxx',  # target key = result of the sql query, as str
                'aim_key9': 'clob_key8',         # target key = source key's value, as clob
                'aim_key10': 'clob_key8'         # target key = source key's value, as str
            }
        @param unique_key: unique key; the target database deduplicates on it
        @param unique_key_mapping_source_key: source-table key corresponding to the
            target table's unique key; when not None, existing rows in the target
            table are updated, e.g.:
            unique_key_mapping_source_key = {
                'url': 'str_url'  # target key = source key's value, as str
            }
        @param condition: which rows to export; defaults to read_status = 0
        @param datas: data to export, as [{...}, {...}] or {}; used to import a
            json array directly into the target table. When empty, MongoDB data
            is exported instead.
        @param callback: called once per exported batch as
            callback(execute_type, sql, data_json), where execute_type is one of
            ExportData.INSERT / ExportData.UPDATE / ExportData.EXCEPTION and sql
            is the executed statement
        ---------
        @result:
        '''
        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._update_read_status = update_read_status
        self._condition = condition
        self._mongodb = MongoDB() if self._source_table else ''
        self._datas = datas
        self._sync_to_es = sync_to_es
        self._callback = callback
        self._is_oracle = False
        self._is_set_unique_key = False
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key

    def export_to_oracle(self, source_table='', aim_table='', key_map='',
                         unique_key=None, unique_key_mapping_source_key=None,
                         update_read_status=True, condition={'read_status': 0},
                         datas=[], callback='', sync_to_es=False):
        if aim_table:
            if self._aim_table != aim_table:
                self._is_set_unique_key = False

            self._es = ES() if sync_to_es else None
            self._mongodb = MongoDB() if source_table else ''
            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()

    def export_to_mysql(self, source_table='', aim_table='', key_map='',
                        unique_key=None, unique_key_mapping_source_key=None,
                        update_read_status=True, condition={'read_status': 0},
                        datas=[], callback=''):
        if self._aim_table != aim_table:
            self._is_set_unique_key = False

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

        self._aim_db = MysqlDB()

        return self.__export()
    def make_sql(self, data):
        '''
        @summary:
        ---------
        @param data: data dict
        ---------
        @result: returns (insert_sql, update_sql, data_json) when
            unique_key_mapping_source_key is not None, otherwise
            (insert_sql, data_json)
        '''
        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # Split each source key into its type and key parts
        # (a source key carries both, e.g. 'str_title' -> type 'str', key 'title')
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        insert_sql = 'insert into ' + self._aim_table + " (" + ', '.join(aim_keys) + ") values ("
        update_sql = 'update ' + self._aim_table + " set "

        data_json = {}  # used when importing into ES
        values = []
        for i in range(len(keys)):
            if (value_types[i] != 'vint' and value_types[i] != 'vstr' and
                    value_types[i] != 'vdate' and value_types[i] != 'sint' and
                    value_types[i] != 'sstr') and (not data[keys[i]] and data[keys[i]] != 0):
                values.append('null')
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = None

            elif value_types[i] == 'str':
                # Escape single quotes as doubled single quotes, otherwise the
                # insert_sql would be syntactically invalid
                values.append(str(data[keys[i]]).replace("'", "''"))
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'clob':
                text = str(data[keys[i]]).replace("'", "''")
                if not text:
                    insert_sql += "'%s', "
                    values.append(text)
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = None
                else:
                    # Long text is cut into 1000-char chunks concatenated as
                    # to_clob('...') || to_clob('...')
                    values_ = tools.cut_string(text, 1000)
                    clob_text = ''
                    for value in values_:
                        clob_text += "to_clob('%s') || " % value
                    clob_text = clob_text[:-len(' || ')]
                    values.append(clob_text)
                    insert_sql += "%s, "
                    update_sql += aim_keys[i] + " = %s, " % values[-1]
                    data_json[aim_keys[i].upper()] = data[keys[i]]

            elif value_types[i] == 'int':
                if isinstance(data[keys[i]], int) or isinstance(
                        data[keys[i]], float) or isinstance(data[keys[i]], str):
                    values.append(data[keys[i]])
                elif isinstance(data[keys[i]], bool):
                    values.append(data[keys[i]] and 1 or 0)
                else:  # _id (MongoDB ObjectId): use the last 6 hex digits as an int
                    values.append(int(str(data[keys[i]])[-6:], 16))
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = eval(
                    values[-1]) if isinstance(values[-1], str) else values[-1]

            elif value_types[i] == 'date':
                values.append(data[keys[i]].replace('年', '-').replace(
                    '月', '-').replace('日', ''))
                if self._is_oracle:
                    # Trim the format to 'yyyy-mm-dd' when the value is a bare date
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vint':
                if tools.get_english_words(keys[i]):
                    # The "value" is an expression (e.g. a sequence); evaluate it via dual
                    sql = 'select %s from dual' % keys[i]
                    value = self._aim_db.find(sql)[0][0]
                    values.append(value)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    values.append(keys[i])
                    data_json[aim_keys[i].upper()] = eval(values[-1])
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]

            elif value_types[i] == 'vstr':
                values.append(keys[i])
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vdate':
                values.append(keys[i])
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
"to_date('%s','{}'), ".format(format_date) update_sql += aim_keys[i] + "= to_date('%s','%s'), " % ( values[-1], format_date) data_json[aim_keys[i].upper()] = values[-1] else: insert_sql += "'%s', " update_sql += aim_keys[i] + " = '%s', " % values[-1] data_json[aim_keys[i].upper()] = values[-1] elif value_types[i] == 'sint': value = self._aim_db.find(keys[i], fetch_one=True)[0] values.append(value) insert_sql += '%s, ' update_sql += aim_keys[i] + " = %s, " % value data_json[aim_keys[i].upper()] = values[-1] elif value_types[i] == 'sstr': value = self._aim_db.find(keys[i], fetch_one=True)[0] values.append(value) insert_sql += "'%s', " update_sql += aim_keys[i] + " = '%s', " % value data_json[aim_keys[i].upper()] = values[-1] else: error_msg = '%s不符合key_map规定格式' % value_types[i] raise (Exception(error_msg)) insert_sql = insert_sql[:-2] + ")" insert_sql = insert_sql % tuple(values) # tools.print(data_json) # log.debug(insert_sql) if self._unique_key_mapping_source_key: # aim_key = tuple(self._unique_key_mapping_source_key.keys())[0] # value = tuple(self._unique_key_mapping_source_key.values())[0] # temp_var = value.split('_', 1) # source_key_types = temp_var[0] # source_key = temp_var[1] # if source_key_types == 'str': # update_sql = update_sql[:-2] + " where %s = '%s'" %(aim_key, data[source_key]) # elif source_key_types == 'int': # update_sql = update_sql[:-2] + " where %s = %s" %(aim_key, data[source_key]) # # log.debug(update_sql) return insert_sql, update_sql[:-2], data_json else: return insert_sql, data_json # @tools.run_safe_model(__name__) def __export(self): if self._unique_key and not self._is_set_unique_key: self._aim_db.set_unique_key(self._aim_table, self._unique_key) self._is_set_unique_key = True datas = self._mongodb.find( self._source_table, condition=self._condition) if self._mongodb else ( self._datas if isinstance(self._datas, list) else [self._datas]) for data in datas: if self._unique_key_mapping_source_key: insert_sql, update_sql, data_json = self.make_sql(data) else: insert_sql, data_json = self.make_sql(data) # tools.write_file(self._aim_table + '.txt', insert_sql, 'w+') def exception_callfunc(e): if 'ORA-00001' in str(e): if self._update_read_status: self._mongodb.update(self._source_table, data, {'read_status': 1}) else: log.error(insert_sql) execute_type = ExportData.EXCEPTION sql = '' # log.debug(insert_sql) if self._aim_db.add(insert_sql, exception_callfunc): self._export_count += 1 sql = insert_sql execute_type = ExportData.INSERT if self._update_read_status: self._mongodb.update(self._source_table, data, {'read_status': 1}) elif self._unique_key_mapping_source_key: # 取id字段 aim_key = tuple(self._unique_key_mapping_source_key.keys())[0] value = tuple(self._unique_key_mapping_source_key.values())[0] temp_var = value.split('_', 1) source_key_types = temp_var[0] source_key = temp_var[1] select_sql = 'select id from ' + self._aim_table if source_key_types == 'str': select_sql = select_sql + " where %s = '%s'" % ( aim_key, data[source_key]) elif source_key_types == 'int': select_sql = select_sql + " where %s = %s" % ( aim_key, data[source_key]) data_id = self._aim_db.find(select_sql) if data_id: data_id = data_id[0][0] else: continue #拼接update语句 update_sql += " where id = %s" % data_id log.debug(update_sql) # 删除 update 里面 id= xxx 的条件,保证更新后的数据 ID不变 id_info = ''.join( tools.get_info(update_sql, [' id .*?,', ' ID .*?,'])) update_sql = update_sql.replace(id_info, '') # 修改data_json 里的ID if "ID" in data_json.keys(): data_json["ID"] = data_id # 更新 if 
                if self._aim_db.update(update_sql):
                    self._update_count += 1
                    sql = update_sql
                    execute_type = ExportData.UPDATE
                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})

            # Sync to ES
            if self._sync_to_es and execute_type != ExportData.EXCEPTION:
                self._es.add(table=self._aim_table, data=data_json,
                             data_id=data_json.get('ID'))

            if self._callback:
                self._callback(execute_type, sql, data_json)

        log.debug('''
            exported %s rows
            updated  %s rows
            ''' % (self._export_count, self._update_count))

        return self._export_count + self._update_count

    def close(self):
        self._aim_db.close()
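
# A hedged usage sketch of the key_map mini-DSL documented in __init__. The
# collection, table and field names are assumptions, not the project's real
# schema. Note 'int__id' splits on the first '_' into type 'int' and key
# '_id', which is how a MongoDB ObjectId is folded into a numeric id.
export = ExportData()
export.export_to_oracle(
    source_table='news',                  # hypothetical MongoDB collection
    aim_table='TAB_NEWS_INFO',            # hypothetical Oracle table
    key_map={
        'id':           'int__id',        # last 6 hex digits of the ObjectId
        'title':        'str_title',
        'url':          'str_url',
        'content':      'clob_content',
        'release_time': 'date_release_time',
        'read_count':   'vint_0',         # literal 0
    },
    unique_key_mapping_source_key={'url': 'str_url'},  # update rows that already exist
    sync_to_es=True)
export.close()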
class NewsCluster():
    def __init__(self):
        self._es = ES()
        self._current_csr_res_id = tools.read_file(STO_CURRENT_ID_FILE)
        self._current_csr_res_id = self._current_csr_res_id and int(
            self._current_csr_res_id) or 0

    def _get_same_day_hots(self, text, start_time):
        news_day_time = start_time[:start_time.find(' ')]
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "start_time": {
                                "gte": news_day_time + ' 00:00:00',
                                "lte": news_day_time + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["csr_content"],
                            "operator": "or",
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))  # proportion of query keywords that must match
                        }
                    }
                }
            },
            "_source": ["hot_id", "csr_res_ids", "csr_content", 'hot'],
            "highlight": {
                "fields": {
                    "csr_content": {}
                }
            }
        }

        # Results are sorted by match score by default
        hots = self._es.search('tab_news_csr_hot', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def _save_current_id(self):
        '''
        @summary: persist the current position so the next run resumes from it
        ---------
        ---------
        @result:
        '''
        tools.write_file(STO_CURRENT_ID_FILE, str(self._current_csr_res_id))

    def deal_news(self):
        '''
        @summary: fetch records from tab_news_csr_result
        ---------
        ---------
        @result:
        '''
        while True:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "csr_res_id": {  # fetch records with csr_res_id greater than the current one
                                    "gt": self._current_csr_res_id
                                }
                            }
                        }
                    }
                },
                "_source": ["csr_res_id", "csr_content", "start_time"],
                "sort": [{"csr_res_id": "asc"}]
            }

            news_json = self._es.search('tab_news_csr_result', body)
            news_list = news_json.get('hits', {}).get('hits', [])

            if not news_list:
                log.debug(
                    'tab_news_csr_result has no csr_res_id greater than %s\nsleep %s...'
                    % (self._current_csr_res_id, SLEEP_TIME))
                tools.delay_time(SLEEP_TIME)
                continue

            for news_info in news_list:
                news = news_info.get('_source')
                csr_res_id = news.get('csr_res_id')
                csr_content = news.get('csr_content')
                start_time = news.get('start_time')

                log.debug('''
                    processing tab_news_csr_result
                    csr_res_id  %s
                    start_time  %s
                    csr_content %s
                    ''' % (csr_res_id, start_time, csr_content))

                # Find similar articles
                similar_hot = None
                hots = self._get_same_day_hots(csr_content, start_time)

                # Walk the candidates and compare similarity
                for hot_info in hots:
                    hot = hot_info.get('_source')
                    hot_text = hot.get('csr_content')
                    temp_similarity = compare_text(csr_content, hot_text)
                    if temp_similarity > MIN_SIMILARITY:
                        similar_hot = hot

                    break  # hots are sorted by match score, so the first hit is the most similar; no need to scan further

                # If a similar article exists, append this csr_res_id and bump hot;
                # otherwise this record becomes a new hot
                if similar_hot:  # a similar hot exists
                    log.debug('belongs to hot: %s' % similar_hot.get('csr_content'))

                    data = {}
                    # Bump the hot's heat and append this article's id
                    data["hot"] = similar_hot["hot"] + 1
                    data["csr_res_ids"] = similar_hot["csr_res_ids"] + ',' + csr_res_id

                    # Update the hot
                    self._es.update_by_id("tab_news_csr_hot",
                                          data_id=similar_hot.get("hot_id"),
                                          data=data)

                else:  # no similar hot found; this article becomes a new hot
                    log.debug('no matching hot')
                    hot_info = {
                        'hot_id': csr_res_id,
                        'hot': 1,
                        'start_time': start_time,
                        'csr_res_ids': csr_res_id,
                        'csr_content': csr_content
                    }
                    self._es.add('tab_news_csr_hot', hot_info, data_id=csr_res_id)

                # Save the current position
                self._current_csr_res_id = csr_res_id
                self._save_current_id()
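
# NewsCluster is a long-lived worker; a minimal launch sketch, assuming
# STO_CURRENT_ID_FILE, SLEEP_TIME and the ES/tools dependencies are configured:
if __name__ == '__main__':
    # Resumes from the csr_res_id checkpointed in STO_CURRENT_ID_FILE and polls
    # tab_news_csr_result forever, sleeping SLEEP_TIME between empty polls.
    NewsCluster().deal_news()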
class WechatService():
    _todo_accounts = collections.deque()
    _rownum = 1
    _is_done = False  # whether a full pass over the accounts has completed

    def __init__(self):
        self._db = OracleDB()
        self._es = ES()
        self.__load_todo_account()

    def __load_todo_account(self):
        if not WechatService._todo_accounts:
            sql = '''
                select *
                  from (select rownum r, t.id, t.domain, t.biz
                          from TAB_IOPM_SITE t
                         where t.biz is not null and rownum < {size})
                 where r >= {rownum}
            '''.format(rownum=WechatService._rownum,
                       size=WechatService._rownum + SIZE)
            results = self._db.find(sql)
            if not results:
                WechatService._is_done = True
                WechatService._rownum = 1
                self.__load_todo_account()
            else:
                WechatService._todo_accounts = collections.deque(results)  # convert to a queue
                WechatService._rownum += SIZE

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: returns (id, biz, is_done), where is_done says whether a full
            pass over the accounts has completed
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        next_account_info = WechatService._todo_accounts.popleft()
        next_account_id = next_account_info[1]  # row layout is (rownum, id, domain, biz)
        next_account_biz = next_account_info[3]

        next_account = next_account_id, next_account_biz, WechatService._is_done

        # Reset the _is_done flag
        WechatService._is_done = False

        return next_account

    def is_exist(self, table, data_id):
        if self._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''
        log.debug('''
            -----article info-----
            %s''' % tools.dumps_json(article_info))

        self._es.add('wechat_article', article_info,
                     article_info.get('article_id'))

    def add_account_info(self, account_info):
        log.debug('''
            -----account info-----
            %s''' % tools.dumps_json(account_info))

        self._es.add('wechat_account', account_info, account_info.get('__biz'))
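
# A hedged sketch of the intended round-robin consumption of accounts. The
# crawl call is a placeholder, not a function this project defines.
service = WechatService()
while True:
    account_id, biz, is_done = service.get_next_account()
    if is_done:
        break  # one full pass over TAB_IOPM_SITE has finished
    # crawl_account(biz) would go here (hypothetical crawler entry point)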
class HotSync():
    def __init__(self):
        self._es = ES()

    def _get_today_hots(self, text, release_time):
        release_day = release_time[:release_time.find(' ')]
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # news published the same day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))  # proportion of query keywords that must match
                        }
                    }
                }
            },
            "_source": [
                "ID", "TITLE", "CONTENT", "HOT", "CLUES_IDS", "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT"
            ],
            "highlight": {
                "fields": {
                    "TITLE": {}
                }
            }
        }

        # Results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def get_hot_id(self, article_info):
        article_text = article_info.get("TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)
        hots = self._get_today_hots(article_text, release_time)

        # Find the most similar hot
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first hit is the most similar; no need to scan further

        if similar_hot:  # a similar hot exists
            if similar_hot["ID"] != article_info["ID"]:  # avoid merging a record with itself more than once
                data = {}

                # Bump the hot's heat
                data["HOT"] = similar_hot["HOT"] + 1

                # Update mainstream-media and negative-sentiment counts
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    1 if article_info["IS_VIP"] else 0)
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                    1 if article_info['EMOTION'] == 2 else 0)

                # Update relevance
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],  # article id
                        'hot_value': data['HOT'],  # heat value
                        'clues_id': similar_hot['CLUES_IDS'],  # clue ids matched by related records
                        'article_count': data['HOT'],  # total article count
                        'vip_count': data["VIP_COUNT"],  # mainstream-media count
                        'negative_emotion_count': data["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                        'zero_ids': article_info['ZERO_ID']
                    }
                    result = tools.get_json_by_requests(url, data=data_args)
                    if result:
                        data['WEIGHT'] = result.get('weight', 0)

                # Update the hot
                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # Return the hot id
            return similar_hot.get("ID")

        else:  # add this record as a new hot
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID')  # the hot table has no hot_id column

            # User-behaviour counters default to zero
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            # Other values
            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info['EMOTION'] == 2 else 0
            hot_info['HOT'] = 1
            hot_info['ID'] = article_info.get("ID")

            self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

            # Return the hot id
            return hot_info['ID']