Python ES.add Exemples, db.elastic_search.ES.add Python Exemples

Exemple #1

0

Afficher le fichier

class SyncES():
    def __init__(self):
        self._es = ES()
        self._db = OracleDB()

        self._max_id = tools.read_file(STO_MAX_ID_FILE)
        self._max_id = self._max_id and eval(self._max_id) or {}

    def get_data(self, sql):
        return self._db.find(sql, to_json=True)

    def export_to_es(self, table, data, data_id):
        self._es.add(table=table, data=data, data_id=data_id)

    def sync_data(self, table, step=20):
        '''
        @summary: 需要先把id设为主键
        ---------
        @param sql:
        @param table:
        @param is_step: 分批导 0 位一次导入， 适合数据量不多情况。速度快
        ---------
        @result:
        '''

        max_id = self._max_id.get(table, 0)
        self._db.set_primary_key(table)

        while True:
            inner_sql = 'select * from %s where id > %d and rownum <= %d order by id' % (
                table, max_id, step)
            datas = sync_es.get_data(inner_sql)

            if not datas:
                self.close()
                break

            for data in datas:
                data_id = data['ID']
                data = tools.dumps_json(data)
                print(data)
                print(data_id)
                # print(data)

                max_id = data_id

                self.export_to_es(table, data, data_id)

        self._max_id[table] = max_id

    def close(self):
        tools.write_file(STO_MAX_ID_FILE, str(self._max_id))

Exemple #2

0

Afficher le fichier

    "praise_count": 0,
    "uuid": "31ea4d35-a8ce-377f-bb6d-9846ab607aaa",
    "author": "",
    "play_count": 0,
    "release_time": "2017-12-21 11:47:17",
    "time_length": 0,
    "summary": "",
    "domain": "sina.com.cn",
    "comment_count": 0,
    "site_name": "新浪"
}

content_info = {
    "title": "十九大代表带你学报告——钟佰均谈脱贫攻坚",
    "domain": "qq.com",
    "uuid": "4f73c1bf-6b67-3b32-9ddc-3d141de72f05",
    "summary": "",
    "image_url": "http://vpic.video.qq.com/89220912/l051527zwgx_160_90_3.jpg",
    "url": "http://v.qq.com/x/page/l051527zwgx.html?ptag=iqiyi.news",
    "praise_count": 0,
    "play_count": 0,
    "release_time": "2017-12-21",
    "time_length": 0,
    "author": "",
    "content": "",
    "site_name": "爱奇艺",
    "comment_count": 0,
    "record_time": "2017-12-21 11:59:17"
}
es.add('video_news', content_info, content_info['uuid'])

Exemple #3

0

Afficher le fichier

class HotWeekSync():
    def __init__(self):
        self._es = ES()
        self._event_filter = EventFilter()
        self._event_filter.start()

    def _get_week_hots(self, text, release_time):
        before_week = tools.get_before_date(release_time, -7)

        body = {
            "size":
            1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # 当日发布的新闻
                                "gte": before_week,
                                "lte": release_time
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query":
                            text,
                            "fields": ["TITLE"],
                            "operator":
                            "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY *
                                                            100))  # 匹配到的关键词占比
                        }
                    }
                }
            },
            "_source": [
                "ID",
                "TITLE",
                # "CONTENT",
                "HOT",
                "ARTICLE_COUNT",
                "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT",
                "HOT_DAY_IDS",
                "WEIGHT"
            ],
            # "highlight": {
            #       "fields": {
            #           "TITLE": {}
            #       }
            # }
        }

        # 默认按照匹配分数排序
        hots = self._es.search('tab_iopm_hot_week_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def cluster_week_hot(self,
                         day_hot,
                         hot_value=None,
                         article_count=None,
                         vip_count=None,
                         negative_emotion_count=None,
                         weight=None):
        '''
        @summary: 聚类
        ---------
        @param hot:每日热点信息
        @param hot_value: 一条舆情的热度 （不为空时表示该条每日热点为更新热点，那么7日热点已经聚过此热点， 热度应该只加该条舆情的热度）
        @param article_count:
        @param vip_count:
        @param negative_emotion_count:
        @param weight:
        ---------
        @result:
        '''

        article_text = day_hot.get("TITLE")  # + hot.get("CONTENT")
        release_time = day_hot.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_week_hots(article_text, release_time)

        # 找最相似的热点
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  #hots 按照匹配值排序后，第一个肯定是最相似的，无需向后比较

        if similar_hot:  # 找到相似的热点
            if similar_hot["ID"] != day_hot["ID"]:  # 防止同一个舆情 比较多次
                data = {}

                # 更新热点的热度与文章数
                data['HOT'] = similar_hot['HOT'] + (hot_value
                                                    or day_hot.get('HOT'))
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + (
                    article_count or day_hot.get('ARTICLE_COUNT'))

                # 更新主流媒体数量及负面舆情数量
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    vip_count or day_hot.get('VIP_COUNT'))
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + (
                        negative_emotion_count
                        or hot.get('NEGATIVE_EMOTION_COUNT'))

                # 更新相关度
                # data['WEIGHT'] = similar_hot['WEIGHT'] + (weight or day_hot['WEIGHT'])

                # 更新 hot_day_ids
                if not hot_value:
                    data["HOT_DAY_IDS"] = similar_hot[
                        'HOT_DAY_IDS'] + ',' + day_hot['ID']

                # 更新热点
                self._es.update_by_id("tab_iopm_hot_week_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # 返回热点id
            return similar_hot.get("ID")
        else:
            # 将该舆情添加为热点
            hot_info = deepcopy(day_hot)

            # 处理事件类型
            del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
            text = hot_info['TITLE'] + del_tag_content
            contain_event_ids = self._event_filter.find_contain_event(text)
            hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

            hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

            self._es.add('tab_iopm_hot_week_info',
                         hot_info,
                         data_id=hot_info['ID'])

            # 返回热点id
            return hot_info['ID']

Exemple #4

0

Afficher le fichier

Fichier : hot_sync.py Projet : striver-ing/iopm-sync

class HotSync():
    def __init__(self):
        self._es = ES()
        self._hot_week_sync = HotWeekSync()
        self._cut_text = CutText()
        self._cut_text.set_stop_words('utils/stop_words.txt')

    def _get_today_hots(self, text, release_time):
        release_day = release_time[:release_time.find(' ')]

        body = {
        "size":1,
          "query": {
            "filtered": {
              "filter": {
                "range": {
                   "RELEASE_TIME": { # 当日发布的新闻
                        "gte": release_day + ' 00:00:00',
                        "lte": release_day + ' 23:59:59'
                    }
                }
              },
              "query": {
                "multi_match": {
                    "query": text,
                    "fields": [
                        "TITLE"
                    ],
                    "operator": "or",
                    "minimum_should_match": "{percent}%".format(percent = int(MIN_SIMILARITY * 100)) # 匹配到的关键词占比
                }
              }
            }
          }#,
          # "_source": [
          #       "ID",
          #       "TITLE",
          #       # "CONTENT",
          #       "RELEASE_TIME",
          #       "WEIGHT",
          #       "HOT",
          #       "ARTICLE_COUNT",
          #       "CLUES_IDS",
          #       "VIP_COUNT",
          #       "NEGATIVE_EMOTION_COUNT"
          # ],
          # "highlight": {
          #       "fields": {
          #           "TITLE": {}
          #       }
          # }
        }

        # 默认按照匹配分数排序
        hots = self._es.search('tab_iopm_hot_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])


    def get_hot_id(self, article_info, positions, weight_factor):
        '''
        @summary: 聚类
        ---------
        @param article_info:
        ---------
        @result:
        '''
        # weight_factor = 1

        article_text = article_info.get("TITLE")# + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_today_hots(article_text, release_time)

        # 找最相似的热点
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')# + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break #hots 按照匹配值排序后，第一个肯定是最相似的，无需向后比较

        if similar_hot:# 找到相似的热点
            if similar_hot["ID"] != article_info["ID"]: # 防止同一个舆情 比较多次
                data = {}

                # 更新热点的热度与文章数
                data['HOT'] = similar_hot['HOT'] + INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) *  weight_factor
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + 1

                # 更新主流媒体数量及负面舆情数量
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + article_info["IS_VIP"]
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (1 if article_info['EMOTION'] == 2 else 0)

                weight_temp = 0 # 记录更新前后的差值
                # 更新相关度
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'], # 文章id
                        'hot_value' :data['HOT'], # 热度值
                        'clues_ids': similar_hot['CLUES_IDS'],  #相关舆情匹配到的线索id
                        'article_count' : data['ARTICLE_COUNT'], # 文章总数
                        'vip_count': data["VIP_COUNT"],   # 主流媒体数
                        'negative_emotion_count': data["NEGATIVE_EMOTION_COUNT"],  # 负面情感数
                        'zero_ids':article_info['ZERO_ID']
                    }

                    result = tools.get_json_by_requests(url, data = data_args)
                    weight_temp = similar_hot['WEIGHT'] - result.get('weight', 0)
                    data['WEIGHT'] = result.get('weight', 0) * weight_factor

                # 更新热点
                self._es.update_by_id("tab_iopm_hot_info", data_id = similar_hot.get("ID"), data = data)
                # 同步7日热点
                self._hot_week_sync.cluster_week_hot(similar_hot, hot_value = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0), article_count = 1, vip_count = article_info["IS_VIP"], negative_emotion_count = 1 if article_info['EMOTION'] == 2 else 0, weight = weight_temp)


            # 返回热点id
            return similar_hot.get("ID")
        else:
            # 将该舆情添加为热点
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID') # 热点表中无hot_id

            # 默认用户行为数量为零
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            # 其他值
            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info['EMOTION'] == 2 else 0

            hot_info['HOT'] = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) * weight_factor
            hot_info['ID'] = article_info.get("ID")
            hot_info['ARTICLE_COUNT'] = 1
            hot_info['HOT_KEYWORDS'] = ','.join(self._cut_text.cut_for_keyword(article_info["TITLE"]))  # 关键词 可优化速度  在比较相似度时已经分词了 TODO
            hot_info['POSITIONS'] = positions
            hot_info['EVENT_IDS'] = ''  # 事件类型（每日热点不需要 TODO | 每周热点已加）

            self._es.add('tab_iopm_hot_info', hot_info, data_id = hot_info['ID'])
            # 同步7日热点
            self._hot_week_sync.cluster_week_hot(hot_info)

            # 返回热点id
            return hot_info['ID']

Exemple #5

0

Afficher le fichier

class ExportData():
    INSERT = 1
    UPDATE = 2
    EXCEPTION = 3

    def __init__(self,
                 source_table='',
                 aim_table='',
                 key_map='',
                 unique_key=None,
                 unique_key_mapping_source_key=None,
                 update_read_status=True,
                 condition={'read_status': 0},
                 datas=[],
                 callback='',
                 sync_to_es=False):
        '''
        @summary: 初始化
        ---------
        @param source_table: 源table mongo数据库
        @param aim_table:    目标table
        @param key_map:      目标table 和 源table 的键的映射
        eg: key_map = {
            'aim_key1' : 'str_source_key2',          # 目标键 = 源键对应的值         类型为str
            'aim_key2' : 'int_source_key3',          # 目标键 = 源键对应的值         类型为int
            'aim_key3' : 'date_source_key4',         # 目标键 = 源键对应的值         类型为date
            'aim_key4' : 'vint_id',                  # 目标键 = 值                   类型为int
            'aim_key5' : 'vstr_name',                # 目标键 = 值                   类型为str
            'aim_key6' : 'vdate_name',               # 目标键 = 值                   类型为date
            'aim_key7' : 'sint_select id from xxx'   # 目标键 = 值为sql 查询出的结果 类型为int
            'aim_key8' : 'sstr_select name from xxx' # 目标键 = 值为sql 查询出的结果 类型为str
            'aim_key9' : 'clob_key8'                 # 目标键 = 源键对应的值         类型为clob
            'aim_key10' : 'clob_key8'                # 目标键 = 源键对应的值         类型为str
        }

        @param unique_key:    唯一的key 目标数据库根据该key去重
        @param unique_key_mapping_source_key: 目标表中唯一的key所对应的源表中的key 该值不为空时 更新目标表中已有的数据
         eg: unique_key_mapping_source_key = {
            'url':'str_url'                         # 目标键 = 源键对应的值         类型为str
         }
        @param condition:    导出满足什么样条件的数据 默认是read_status = 0 的
        @param datas:   要导出的数据，格式为[{...},{...}] 或者 {}用于直接将json数组导入到目标表，为空时默认导出mongodb的数据
        @param callback 导出数据的回调，导出一组，执行一次，callback(execute_type, sql) execute_type为执行类型（ExportData.INSERT、ExportData.UPDATE、ExportData.EXCEPTION）
        sql 为执行的语句
        ---------
        @result:
        '''

        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._update_read_status = update_read_status
        self._condition = condition

        self._mongodb = MongoDB() if self._source_table else ''
        self._datas = datas
        self._sync_to_es = sync_to_es
        self._callback = callback

        self._is_oracle = False
        self._is_set_unique_key = False
        self._is_set_unique_key = False
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key

    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition={'read_status': 0},
                         datas=[],
                         callback='',
                         sync_to_es=False):
        if aim_table:
            if self._aim_table != aim_table:
                self._is_set_unique_key = False
                self._es = ES() if sync_to_es else ''
                self._mongodb = MongoDB() if source_table else ''

            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es
            self._es = None

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()

    def export_to_mysql(self,
                        source_table='',
                        aim_table='',
                        key_map='',
                        unique_key=None,
                        unique_key_mapping_source_key=None,
                        update_read_status=True,
                        condition={'read_status': 0},
                        datas=[],
                        callback=''):
        if self._aim_table != aim_table:
            self._is_set_unique_key = False

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

        self._aim_db = MysqlDB()
        return self.__export()

    def make_sql(self, data):
        '''
        @summary:
        ---------
        @param data: 数据字典
        ---------
        @result: 当unique_key_mapping_source_key不为空时返回insert_sql, update_sql 否则返回insert_sql
        '''
        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # 取源key值 对应的type 和 key （源key包含type 和 key 信息）
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        insert_sql = 'insert into ' + self._aim_table + " (" + ', '.join(
            aim_keys) + ") values ("
        update_sql = 'update ' + self._aim_table + " set "
        data_json = {}  # 导入到es中用
        values = []
        for i in range(len(keys)):
            if (value_types[i] != 'vint' and value_types[i] != 'vstr'
                    and value_types[i] != 'vdate' and value_types[i] != 'sint'
                    and value_types[i] != 'sstr') and (not data[keys[i]]
                                                       and data[keys[i]] != 0):
                values.append('null')
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = None

            elif value_types[i] == 'str':
                values.append(
                    str(data[keys[i]]).replace("'", "''")
                )  # if isinstance(data[keys[i]], str) else data[keys[i]])  # 将单引号替换成两个单引号 否者insert_sql语句语法出错
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'clob':
                text = str(data[keys[i]]).replace("'", "''")
                if not text:
                    insert_sql += "'%s', "
                    values.append(text)
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = None
                else:
                    values_ = tools.cut_string(text, 1000)

                    clob_text = ''
                    for value in values_:
                        clob_text += "to_clob('%s') || " % value

                    clob_text = clob_text[:-len(' || ')]
                    values.append(clob_text)
                    insert_sql += "%s, "
                    update_sql += aim_keys[i] + " = %s, " % values[-1]
                    data_json[aim_keys[i].upper()] = data[keys[i]]

            elif value_types[i] == 'int':
                if isinstance(data[keys[i]], int) or isinstance(
                        data[keys[i]], float) or isinstance(
                            data[keys[i]], str):
                    values.append(data[keys[i]])
                elif isinstance(data[keys[i]], bool):
                    values.append(data[keys[i]] and 1 or 0)
                else:  # _id
                    values.append(int(str(data[keys[i]])[-6:], 16))

                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = eval(
                    values[-1]) if isinstance(values[-1], str) else values[-1]

            elif value_types[i] == 'date':
                values.append(data[keys[i]].replace('年', '-').replace(
                    '月', '-').replace('日', ''))
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vint':
                if tools.get_english_words(keys[i]):
                    sql = 'select %s from dual' % keys[i]
                    value = self._aim_db.find(sql)[0][0]
                    values.append(value)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    values.append(keys[i])
                    data_json[aim_keys[i].upper()] = eval(values[-1])

                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]

            elif value_types[i] == 'vstr':
                values.append(keys[i])
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vdate':
                values.append(keys[i])
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sint':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % value
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sstr':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % value
                data_json[aim_keys[i].upper()] = values[-1]

            else:
                error_msg = '%s不符合key_map规定格式' % value_types[i]
                raise (Exception(error_msg))

        insert_sql = insert_sql[:-2] + ")"
        insert_sql = insert_sql % tuple(values)
        # tools.print(data_json)

        # log.debug(insert_sql)
        if self._unique_key_mapping_source_key:
            # aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]

            # value = tuple(self._unique_key_mapping_source_key.values())[0]
            # temp_var = value.split('_', 1)

            # source_key_types = temp_var[0]
            # source_key = temp_var[1]

            # if source_key_types == 'str':
            #     update_sql = update_sql[:-2] + " where %s = '%s'" %(aim_key, data[source_key])
            # elif source_key_types == 'int':
            #     update_sql = update_sql[:-2] + " where %s = %s" %(aim_key, data[source_key])

            # # log.debug(update_sql)

            return insert_sql, update_sql[:-2], data_json
        else:
            return insert_sql, data_json

    # @tools.run_safe_model(__name__)
    def __export(self):
        if self._unique_key and not self._is_set_unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)
            self._is_set_unique_key = True

        datas = self._mongodb.find(
            self._source_table,
            condition=self._condition) if self._mongodb else (
                self._datas
                if isinstance(self._datas, list) else [self._datas])
        for data in datas:
            if self._unique_key_mapping_source_key:
                insert_sql, update_sql, data_json = self.make_sql(data)
            else:
                insert_sql, data_json = self.make_sql(data)

            # tools.write_file(self._aim_table + '.txt', insert_sql, 'w+')
            def exception_callfunc(e):
                if 'ORA-00001' in str(e):
                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})
                else:
                    log.error(insert_sql)

            execute_type = ExportData.EXCEPTION
            sql = ''
            # log.debug(insert_sql)
            if self._aim_db.add(insert_sql, exception_callfunc):
                self._export_count += 1
                sql = insert_sql
                execute_type = ExportData.INSERT

                if self._update_read_status:
                    self._mongodb.update(self._source_table, data,
                                         {'read_status': 1})

            elif self._unique_key_mapping_source_key:
                # 取id字段
                aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]

                value = tuple(self._unique_key_mapping_source_key.values())[0]
                temp_var = value.split('_', 1)

                source_key_types = temp_var[0]
                source_key = temp_var[1]

                select_sql = 'select id from ' + self._aim_table
                if source_key_types == 'str':
                    select_sql = select_sql + " where %s = '%s'" % (
                        aim_key, data[source_key])
                elif source_key_types == 'int':
                    select_sql = select_sql + " where %s = %s" % (
                        aim_key, data[source_key])

                data_id = self._aim_db.find(select_sql)
                if data_id:
                    data_id = data_id[0][0]
                else:
                    continue

                #拼接update语句
                update_sql += " where id = %s" % data_id
                log.debug(update_sql)

                # 删除 update 里面 id= xxx 的条件，保证更新后的数据 ID不变
                id_info = ''.join(
                    tools.get_info(update_sql, [' id .*?,', ' ID .*?,']))
                update_sql = update_sql.replace(id_info, '')

                # 修改data_json 里的ID
                if "ID" in data_json.keys():
                    data_json["ID"] = data_id

                # 更新
                if self._aim_db.update(update_sql):
                    self._update_count += 1
                    sql = update_sql
                    execute_type = ExportData.UPDATE

                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})

            # 同步到ES
            if self._sync_to_es and execute_type != ExportData.EXCEPTION:
                self._es.add(table=self._aim_table,
                             data=data_json,
                             data_id=data_json.get('ID'))

            if self._callback:
                self._callback(execute_type, sql, data_json)

        log.debug('''
            共导出%s条数据
            共更新%s条数据
            ''' % (self._export_count, self._update_count))

        return self._export_count + self._update_count

    def close(self):
        self._aim_db.close()

Exemple #6

0

Afficher le fichier

class NewsCluster():
    def __init__(self):
        self._es = ES()
        self._current_csr_res_id = tools.read_file(STO_CURRENT_ID_FILE)
        self._current_csr_res_id = self._current_csr_res_id and int(
            self._current_csr_res_id) or 0

    def _get_same_day_hots(self, text, start_time):
        news_day_time = start_time[:start_time.find(' ')]
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "start_time": {
                                "gte": news_day_time + ' 00:00:00',
                                'lte': news_day_time + ' 59:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query":
                            text,
                            "fields": ["csr_content"],
                            "operator":
                            "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY *
                                                            100))  # 匹配到的关键词占比
                        }
                    }
                }
            },
            "_source": ["hot_id", "csr_res_ids", "csr_content", 'hot'],
            "highlight": {
                "fields": {
                    "csr_content": {}
                }
            }
        }

        # 默认按照匹配分数排序
        hots = self._es.search('tab_news_csr_hot', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def _save_current_id(self):
        '''
        @summary: 保存做到的id， 下次接着做
        ---------
        ---------
        @result:
        '''

        tools.write_file(STO_CURRENT_ID_FILE, str(self._current_csr_res_id))

    def deal_news(self):
        '''
        @summary: 取tab_news_csr_result信息
        ---------
        ---------
        @result:
        '''
        while True:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "csr_res_id": {  # 查询大于该csr_res_id 的信息
                                    "gt": self._current_csr_res_id
                                }
                            }
                        }
                    }
                },
                "_source": ["csr_res_id", "csr_content", "start_time"],
                "sort": [{
                    "csr_res_id": "asc"
                }]
            }

            news_json = self._es.search('tab_news_csr_result', body)
            news_list = news_json.get('hits', {}).get('hits', [])

            if not news_list:
                log.debug(
                    'tab_news_csr_result 表中无大于%s的csr_res_id\nsleep %s...' %
                    (self._current_csr_res_id, SLEEP_TIME))
                tools.delay_time(SLEEP_TIME)
                continue

            for news_info in news_list:
                news = news_info.get('_source')
                csr_res_id = news.get('csr_res_id')
                csr_content = news.get('csr_content')
                start_time = news.get('start_time')

                log.debug('''
                    处理 tab_news_csr_result
                    csr_res_id  %s
                    start_time  %s
                    csr_content %s
                    ''' % (csr_res_id, start_time, csr_content))

                # 找相似文章
                similar_hot = None
                hots = self._get_same_day_hots(csr_content, start_time)

                # 遍历相似的文章，比较相似度
                for hot_info in hots:
                    hot = hot_info.get('_source')
                    hot_text = hot.get('csr_content')

                    temp_similarity = compare_text(csr_content, hot_text)
                    if temp_similarity > MIN_SIMILARITY:
                        similar_hot = hot

                    break  #hots 按照匹配值排序后，第一个肯定是最相似的，无需向后比较

                # 如果找到相似的文章，追加csr_res_id和hot值， 否则将该条信息最为新的热点
                if similar_hot:  # 找到相似的热点
                    log.debug('找到所属热点：%s' % similar_hot.get('csr_content'))

                    data = {}

                    # 更新热点的热度及追加文章的id
                    data["hot"] = similar_hot["hot"] + 1
                    data["csr_res_ids"] = similar_hot[
                        "csr_res_ids"] + ',' + csr_res_id

                    # 更新热点
                    self._es.update_by_id("tab_news_csr_hot",
                                          data_id=similar_hot.get("hot_id"),
                                          data=data)

                else:  # 没有找到相似的热点， 将当前文章作为热点
                    log.debug('无所属热点')

                    hot_info = {
                        'hot_id': csr_res_id,
                        'hot': 1,
                        'start_time': start_time,
                        'csr_res_ids': csr_res_id,
                        'csr_content': csr_content
                    }
                    self._es.add('tab_news_csr_hot',
                                 hot_info,
                                 data_id=csr_res_id)

                # 保存当前的id
                self._current_csr_res_id = csr_res_id
                self._save_current_id()

Exemple #7

0

Afficher le fichier

Fichier : wechat_service.py Projet : xunux/wechat-spider-1

class WechatService():
    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False

    def __init__(self):
        self._db = OracleDB()
        self._es = ES()
        self.__load_todo_account()

    def __load_todo_account(self):
        if not WechatService._todo_accounts:
            sql = '''
                select *
                   from (select rownum r, t.id, t.domain, t.biz
                           from TAB_IOPM_SITE t
                          where t.biz is not null and rownum < {size})
                  where r >= {rownum}
                '''.format(rownum=WechatService._rownum,
                           size=WechatService._rownum + SIZE)

            results = self._db.find(sql)
            if not results:
                WechatService._is_done = True
                WechatService._rownum = 1
                self.__load_todo_account()
            else:
                WechatService._todo_accounts = collections.deque(
                    results)  #  转为队列
                WechatService._rownum += SIZE

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: 返回biz, 是否已做完一圈 (biz, True)
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        next_account_info = WechatService._todo_accounts.popleft()
        next_account_id = next_account_info[2]
        next_account_biz = next_account_info[3]

        next_account = next_account_id, next_account_biz, WechatService._is_done
        # 重置_is_done 状态
        WechatService._is_done = False

        return next_account

    def is_exist(self, table, data_id):
        if self._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''

        log.debug('''
            -----文章信息-----
            %s''' % tools.dumps_json(article_info))

        self._es.add('wechat_article', article_info,
                     article_info.get('article_id'))

    def add_account_info(self, account_info):
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))

        self._es.add('wechat_account', account_info, account_info.get('__biz'))

Exemple #8

0

Afficher le fichier

class HotSync():
    def __init__(self):
        self._es = ES()

    def _get_today_hots(self, text, release_time):
        release_day = release_time[:release_time.find(' ')]

        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # 当日发布的新闻
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query":
                            text,
                            "fields": ["TITLE"],
                            "operator":
                            "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY *
                                                            100))  # 匹配到的关键词占比
                        }
                    }
                }
            },
            "_source": [
                "ID", "TITLE", "CONTENT", "HOT", "CLUES_IDS", "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT"
            ],
            "highlight": {
                "fields": {
                    "TITLE": {}
                }
            }
        }

        # 默认按照匹配分数排序
        hots = self._es.search('tab_iopm_hot_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def get_hot_id(self, article_info):
        article_text = article_info.get(
            "TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_today_hots(article_text, release_time)

        # 找最相似的热点
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  #hots 按照匹配值排序后，第一个肯定是最相似的，无需向后比较

        if similar_hot:  # 找到相似的热点
            if similar_hot["ID"] != article_info["ID"]:  # 防止同一个舆情 比较多次
                data = {}

                # 更新热点的热度
                data["HOT"] = similar_hot["HOT"] + 1

                # 更新主流媒体数量及负面舆情数量
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    1 if article_info["IS_VIP"] else 0)
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + (1 if article_info['EMOTION']
                                                 == 2 else 0)

                # 更新相关度
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],  # 文章id
                        'hot_value': data['HOT'],  # 热度值
                        'clues_id': similar_hot['CLUES_IDS'],  #相关舆情匹配到的线索id
                        'article_count': data['HOT'],  # 文章总数
                        'vip_count': data["VIP_COUNT"],  # 主流媒体数
                        'negative_emotion_count':
                        data["NEGATIVE_EMOTION_COUNT"],  # 负面情感数
                        'zero_ids': article_info['ZERO_ID']
                    }

                    result = tools.get_json_by_requests(url, data=data_args)
                    if result:
                        data['WEIGHT'] = result.get('weight', 0)

                # 更新热点
                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # 返回热点id
            return similar_hot.get("ID")
        else:
            # 将该舆情添加为热点
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID')  # 热点表中无hot_id

            # 默认用户行为数量为零
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            # 其他值
            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info[
                'EMOTION'] == 2 else 0

            hot_info['HOT'] = 1
            hot_info['ID'] = article_info.get("ID")

            self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

            # 返回热点id
            return hot_info['ID']