Ejemplos de ES en Python, ejemplos de db.elastic_search.ES en Python

Ejemplo n.º 1

0

Mostrar archivo

    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition=None,
                         datas=None,
                         callback='',
                         sync_to_es=False):
        '''
        @summary: Store the export settings on the instance, select Oracle as
                  the target database and run the export
        ---------
        @param source_table: source table name; a MongoDB client is only
                             created when this is non-empty
        @param aim_table: target table name; when empty, the settings from the
                          previous call are left untouched
        @param key_map: stored as-is for the export step
        @param unique_key: stored as-is for the export step
        @param unique_key_mapping_source_key: stored as-is for the export step
        @param update_read_status: forced to False when datas is supplied
        @param condition: defaults to {'read_status': 0}
        @param datas: rows supplied directly by the caller; defaults to []
        @param callback: stored as-is for the export step
        @param sync_to_es: stored as-is for the export step
        ---------
        @result: whatever the private export step returns
        '''
        # Build fresh default objects on every call: a mutable default in the
        # signature would be shared between calls (and between instances,
        # because it is stored on self below).
        if condition is None:
            condition = {'read_status': 0}
        if datas is None:
            datas = []

        if aim_table:
            if self._aim_table != aim_table:
                # Target changed: reset unique-key state and rebuild clients
                self._is_set_unique_key = False
                self._es = ES() if sync_to_es else ''
                self._mongodb = MongoDB() if source_table else ''

            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es
            # NOTE(review): this discards the ES client created above when the
            # target table changed - confirm whether that is intended.
            self._es = None

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()

Ejemplo n.º 2

0

Mostrar archivo

class SyncES():
    '''
    @summary: Copies rows from Oracle tables into Elasticsearch in batches,
              remembering the highest synced id per table in STO_MAX_ID_FILE
    '''

    def __init__(self):
        self._es = ES()
        self._db = OracleDB()

        # NOTE(security): the checkpoint file is parsed with eval(); anyone
        # able to write that file can execute code here. It is written by
        # close() as str(dict) - consider a safe literal parser instead.
        raw_max_id = tools.read_file(STO_MAX_ID_FILE)
        stored_max_id = eval(raw_max_id) if raw_max_id else None
        self._max_id = stored_max_id or {}

    def get_data(self, sql):
        '''
        @summary: Run sql against Oracle
        ---------
        @param sql: query text
        ---------
        @result: result of OracleDB.find called with to_json=True
        '''
        return self._db.find(sql, to_json=True)

    def export_to_es(self, table, data, data_id):
        '''
        @summary: Add one document to Elasticsearch
        ---------
        @param table: target table name passed to ES.add
        @param data: document payload
        @param data_id: document id
        ---------
        @result:
        '''
        self._es.add(table=table, data=data, data_id=data_id)

    def sync_data(self, table, step=20):
        '''
        @summary: Sync every row of table whose id is above the saved
                  checkpoint; id must be set as the primary key first
        ---------
        @param table: Oracle table name, also used as the ES table name
        @param step: number of rows fetched per batch
        ---------
        @result:
        '''

        max_id = self._max_id.get(table, 0)
        self._db.set_primary_key(table)

        while True:
            # Sort inside the sub-query and limit outside: Oracle assigns
            # ROWNUM before ORDER BY, so limiting in the same query block
            # would pick an arbitrary batch and could skip rows.
            inner_sql = (
                'select * from (select * from %s where id > %d order by id) '
                'where rownum <= %d' % (table, max_id, step))
            datas = self.get_data(inner_sql)

            if not datas:
                break

            for data in datas:
                data_id = data['ID']
                data = tools.dumps_json(data)
                print(data)
                print(data_id)

                max_id = data_id

                self.export_to_es(table, data, data_id)

        # Record the checkpoint BEFORE persisting it, otherwise the file never
        # contains the progress made by this run.
        self._max_id[table] = max_id
        self.close()

    def close(self):
        '''
        @summary: Persist the per-table checkpoints to STO_MAX_ID_FILE
        '''
        tools.write_file(STO_MAX_ID_FILE, str(self._max_id))

Ejemplo n.º 3

0

Mostrar archivo

Archivo: article_sync.py Proyecto: Boris-code/csr

 def __init__(self, table):
     '''
     @summary: Load the saved sync times and create the helper objects
     ---------
     @param table: table name; also used to build the record-time key
     ---------
     @result:
     '''
     # Saved sync times; an empty or unreadable file yields an empty dict
     saved_times = tools.read_file(SYNC_TIME_FILE)
     self._record_time = tools.get_json(saved_times) or {}

     # Helper objects, created in the same order as before
     self._compare_keywords = CompareKeywords()
     self._summary = Summary()
     self._emotion = Emotion()
     self._word_cloud = WordCloud()
     self._es = ES()
     self._hot_sync = HotSync()
     self._vip_checked = VipChecked()

     self._table = table
     # Key under which this table's last record time is stored
     record_time_key = '{table}_record_time'.format(table=table)
     self._per_record_time_key = record_time_key

Ejemplo n.º 4

0

Mostrar archivo

Archivo: update_es.py Proyecto: striver-ing/iopm-sync

def main():
    '''
    @summary: Reset WEIGHT to 0 in tab_iopm_article_info for a fixed list of
              article ids
    '''
    # Created but not used below; the ids used to come from an Oracle query
    # on TAB_IOPM_USER_ACTION, which is currently replaced by a fixed list.
    oracle_client = OracleDB()
    es_client = ES()

    target_ids = [8888515, 8888293, 8891299]
    for target_id in target_ids:
        # A fresh body per update, as before
        weight_reset = {"WEIGHT": 0}

        print(target_id)
        es_client.update_by_id('tab_iopm_article_info', target_id,
                               weight_reset)

Ejemplo n.º 5

0

Mostrar archivo

Archivo: elastic_search_server.py Proyecto: striver-ing/iopm-service

class ElasticSearchServer():
    '''
    @summary: Thin search facade over an ES client
    '''

    def __init__(self, address='192.168.60.40'):
        '''
        @summary: Create the ES client
        ---------
        @param address: value passed as the first argument to ES; defaults to
                        the previously hard-coded host so existing callers are
                        unaffected (use e.g. 'localhost' for a local node)
        ---------
        @result:
        '''
        self._es = ES(address)

    def search(self, table, body):
        '''
        @summary: Run a search
        ---------
        @param table: table name passed to ES.search
        @param body: query body passed to ES.search
        ---------
        @result: whatever ES.search returns
        '''
        return self._es.search(table, body)

Ejemplo n.º 6

0

Mostrar archivo

Archivo: article_sync.py Proyecto: BoragoCode/iopm-sync

    def __init__(self, table):
        '''
        @summary: Load this table's saved sync times, create the helper
                  objects and start the two helpers that expose start()
        ---------
        @param table: table name; used for the sync-time file name and the
                      record-time key
        ---------
        @result:
        '''
        # One sync-time file per table
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        saved_times = tools.read_file(self._sync_time_file)
        self._record_time = tools.get_json(saved_times) or {}

        # Helper objects, created in the same order as before
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._yqtj_es = ES(YQTJ)
        self._data_pool_es = ES(DATA_POOL)
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._province_filter = ProvinceFilter()

        self._table = table
        # Key under which this table's last record time is stored
        record_time_key = '{table}_record_time'.format(table=table)
        self._per_record_time_key = record_time_key

        self._vip_checked.start()
        self._compare_keywords.start()

Ejemplo n.º 7

0

Mostrar archivo

class SyncArtice(threading.Thread):
    '''
    @summary: Background thread that moves articles from the Redis set
              'news:news_article' into the ES table 'news_article'
    '''

    def __init__(self):
        super(SyncArtice, self).__init__()

        self._es = ES()
        self._redis = RedisDB()
        self._sync_count = 0

    def run(self):
        '''
        @summary: Loop forever: fetch up to SYNC_STEP items, index them, wait
                  one delay unit, repeat. Errors are logged and the loop
                  continues.
        '''
        is_show_tip = False  # print the "no data" tip only once per idle spell
        while True:
            try:
                datas = self.get_data_from_redis(SYNC_STEP)
                if not datas:
                    if not is_show_tip:
                        print('\n{time} 无数据 休眠...    '.format(
                            time=tools.get_current_date()))
                        is_show_tip = True
                elif self.add_data_to_es(datas):
                    is_show_tip = False
                    self._sync_count += len(datas)
                    tools.print_one_line('已同步 %d 条数据' % self._sync_count)
                tools.delay_time(1)
            except Exception as e:
                log.error(e)
                # Back off after a failure too; otherwise a persistent error
                # (e.g. Redis or ES unreachable) skips the delay above and the
                # loop spins, flooding the log.
                tools.delay_time(1)

    def get_data_from_redis(self, count):
        '''
        @summary: Fetch items from Redis and keep those with a 19-character
                  release_time
        ---------
        @param count: passed to RedisDB.sget as count
        ---------
        @result: list of dicts
        '''
        datas = self._redis.sget('news:news_article', count=count)
        return_datas = []
        for data in datas:
            # NOTE(security): eval() executes whatever is stored in Redis;
            # prefer a safe parser if the producer's format allows it.
            data = eval(data)
            release_time = data.get('release_time')
            if release_time and len(release_time) == 19:
                return_datas.append(data)

        return return_datas

    def add_data_to_es(self, datas):
        '''
        @summary: Bulk-add datas to the 'news_article' table keyed by 'uuid'
        ---------
        @param datas: list of dicts
        ---------
        @result: whatever ES.add_batch returns
        '''
        return self._es.add_batch(datas,
                                  primary_key='uuid',
                                  table='news_article')

Ejemplo n.º 8

0

Mostrar archivo

class HotWeekSync():
    def __init__(self):
        '''
        @summary: Create the ES client and start the event filter
        '''
        self._es = ES()

        event_filter = EventFilter()
        self._event_filter = event_filter
        event_filter.start()

    def _get_week_hots(self, text, release_time):
        '''
        @summary: Search tab_iopm_hot_week_info for the best title match whose
                  RELEASE_TIME lies between tools.get_before_date(
                  release_time, -7) and release_time
        ---------
        @param text: text matched against the TITLE field
        @param release_time: upper bound of the RELEASE_TIME range
        ---------
        @result: list of hits (at most one, since size is 1)
        '''
        window_start = tools.get_before_date(release_time, -7)

        # Only hots released inside the window are candidates
        time_filter = {
            "range": {
                "RELEASE_TIME": {
                    "gte": window_start,
                    "lte": release_time
                }
            }
        }

        # Share of query terms that must match
        min_match = "{percent}%".format(percent=int(MIN_SIMILARITY * 100))
        title_query = {
            "multi_match": {
                "query": text,
                "fields": ["TITLE"],
                "operator": "or",
                "minimum_should_match": min_match
            }
        }

        # Fields needed by the caller; CONTENT is deliberately not fetched
        wanted_fields = [
            "ID", "TITLE", "HOT", "ARTICLE_COUNT", "VIP_COUNT",
            "NEGATIVE_EMOTION_COUNT", "HOT_DAY_IDS", "WEIGHT"
        ]

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": time_filter,
                    "query": title_query
                }
            },
            "_source": wanted_fields,
        }

        # No explicit sort: results come back in the default match-score order
        found = self._es.search('tab_iopm_hot_week_info', body)
        return found.get('hits', {}).get('hits', [])

    def cluster_week_hot(self,
                         day_hot,
                         hot_value=None,
                         article_count=None,
                         vip_count=None,
                         negative_emotion_count=None,
                         weight=None):
        '''
        @summary: Cluster a daily hot into the weekly hots: merge it into the
                  most similar weekly hot, or add it as a new weekly hot
        ---------
        @param day_hot: daily hot info (dict)
        @param hot_value: heat of a single article. When not None, the daily
                          hot is an update that was already clustered into the
                          weekly hot, so only the explicit increments are
                          added and HOT_DAY_IDS is left alone
        @param article_count: explicit article-count increment
        @param vip_count: explicit VIP-count increment
        @param negative_emotion_count: explicit negative-count increment
        @param weight: currently unused
        ---------
        @result: id of the weekly hot the daily hot belongs to
        '''

        def increment(explicit, field):
            # An explicit 0 is a legitimate increment and must not fall back
            # to the daily hot's total; only None means "not supplied".
            if explicit is not None:
                return explicit
            return day_hot.get(field) or 0

        article_text = day_hot.get("TITLE")  # + day_hot.get("CONTENT")
        release_time = day_hot.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_week_hots(article_text, release_time)

        # Hits are sorted by match score, so only the first one is compared
        similar_hot = None
        if hots:
            candidate = hots[0].get('_source')
            candidate_text = tools.del_html_tag(candidate.get('TITLE'))

            similarity = compare_text(article_text, candidate_text)
            if similarity > MIN_SIMILARITY and similarity > 0:
                similar_hot = candidate

        if similar_hot:  # found a similar weekly hot
            # Skip when it is the same item, so it is not counted twice
            if similar_hot["ID"] != day_hot["ID"]:
                data = {}

                # Heat and article count
                data['HOT'] = similar_hot['HOT'] + increment(hot_value, 'HOT')
                data["ARTICLE_COUNT"] = similar_hot[
                    "ARTICLE_COUNT"] + increment(article_count,
                                                 'ARTICLE_COUNT')

                # VIP (mainstream media) count and negative count. The
                # fallback reads day_hot, not the matched weekly hot.
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + increment(
                    vip_count, 'VIP_COUNT')
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + increment(
                        negative_emotion_count, 'NEGATIVE_EMOTION_COUNT')

                # Record the daily hot id only the first time it is clustered
                if hot_value is None:
                    # %s formatting tolerates non-string ids
                    data["HOT_DAY_IDS"] = '%s,%s' % (
                        similar_hot['HOT_DAY_IDS'], day_hot['ID'])

                self._es.update_by_id("tab_iopm_hot_week_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            return similar_hot.get("ID")
        else:
            # No similar weekly hot: add this daily hot as a new one
            hot_info = deepcopy(day_hot)

            # Event types found in title + tag-stripped content
            del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
            text = hot_info['TITLE'] + del_tag_content
            contain_event_ids = self._event_filter.find_contain_event(text)
            hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

            hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

            self._es.add('tab_iopm_hot_week_info',
                         hot_info,
                         data_id=hot_info['ID'])

            return hot_info['ID']

Ejemplo n.º 9

0

Mostrar archivo

Archivo: article_sync.py Proyecto: striver-ing/iopm-sync

class ArticleSync():
    def __init__(self, table):
        '''
        @summary: Load this table's saved sync times, create the helper
                  objects and start the two helpers that expose start()
        ---------
        @param table: table name; used for the sync-time file name and the
                      record-time key
        ---------
        @result:
        '''
        # One sync-time file per table
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        saved_times = tools.read_file(self._sync_time_file)
        self._record_time = tools.get_json(saved_times) or {}

        # Helper objects, created in the same order as before.
        # An EventFilter is intentionally not created here.
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._yqtj_es = ES(YQTJ)
        self._data_pool_es = ES(DATA_POOL)
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._province_filter = ProvinceFilter()

        self._table = table
        # Key under which this table's last record time is stored
        record_time_key = '{table}_record_time'.format(table=table)
        self._per_record_time_key = record_time_key

        self._vip_checked.start()
        self._compare_keywords.start()

    def get_article_info(self):
        '''
        @summary: Build an empty article record (text fields '', all other
                  fields None)
        ---------
        ---------
        @result: a new dict on every call
        '''

        # (field, default) pairs; the order is the key order of the result
        field_defaults = (
            ("EMOTION", None),
            ("HOST", ""),
            ("AUTHOR", ""),
            ("URL", ""),
            ("WEBSITE_NAME", ""),
            ("ACCOUNT", ""),
            ("REVIEW_COUNT", None),
            ("KEYWORDS_COUNT", None),
            ("RELEASE_TIME", ""),
            ("CONTENT", ""),
            ("ID", None),
            ("UUID", ""),
            ("WEIGHT", None),
            ("CLUES_IDS", ""),
            ("UP_COUNT", None),
            ("INTERACTION_COUNT", None),
            ("RECORD_TIME", None),
            ("COMMENT_COUNT", None),
            ("IS_VIP", None),
            ("INFO_TYPE", None),
            ("HOT_ID", None),
            ("KEYWORD_CLUES_ID", ""),
            ("MAY_INVALID", None),
            ("TITLE", ""),
            ("KEYWORDS", ""),
            ("TRANSMIT_COUNT", None),
            ("ZERO_ID", None),
            ("FIRST_ID", None),
            ("SECOND_ID", None),
            ("SUMMARY", ""),
            ("WORD_CLOUD", ""),
            ("IMAGE_URL", ""),
        )

        return dict(field_defaults)

    def get_article_clues_src(self):
        '''
        @summary: Build an empty clue/article link record
        ---------
        ---------
        @result: a new dict with CLUES_ID, ARTICLE_ID and ID set to ''
        '''
        return {"CLUES_ID": "", "ARTICLE_ID": "", "ID": ""}

    def get_per_record_time(self):
        per_record_time = self._record_time.get(self._per_record_time_key)
        return per_record_time

    def record_now_record_time(self, record_time):
        '''
        @summary: Remember record_time for this table and write all saved
                  record times to the sync-time file
        ---------
        @param record_time: value to store under this table's key
        ---------
        @result:
        '''
        self._record_time[self._per_record_time_key] = record_time

        serialized = tools.dumps_json(self._record_time)
        tools.write_file(self._sync_time_file, serialized)

    def get_article(self):
        '''
        @summary: Fetch up to 1500 articles from the data pool, oldest
                  record_time first. With a saved checkpoint, only articles
                  recorded after it are returned; in both cases release_time
                  must fall within the last 30 days.
                  Selection currently uses record_time to guarantee data;
                  it should eventually use release_time (TODO)
        ---------
        ---------
        @result: list of hits ([] when the response has none)
        '''
        log.debug("取代做种子集...")

        per_record_time = self.get_per_record_time()
        today_time = tools.get_current_date("%Y-%m-%d")
        min_day_ago = tools.get_before_date(today_time,
                                            -30,
                                            current_date_format='%Y-%m-%d',
                                            return_date_format='%Y-%m-%d')

        # release_time window shared by both queries: 30 days ago .. today
        release_window = {
            "range": {
                "release_time": {
                    "gte": min_day_ago + ' 00:00:00',
                    "lte": today_time + ' 23:59:59'
                }
            }
        }
        oldest_first = [{"record_time": "asc"}]

        if per_record_time:
            # Incremental run: only articles recorded after the checkpoint
            body = {
                "size": 1500,
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            "record_time": {
                                                "gt": per_record_time
                                            }
                                        }
                                    },
                                    release_window
                                ]
                            }
                        }
                    }
                },
                "sort": oldest_first
            }

        else:
            # First run (no checkpoint). This branch used to reference an
            # undefined name `three_day_ago` and raised NameError; it now uses
            # the same 30-day lower bound as the incremental branch.
            # NOTE(review): the old comment said "three days ago" - shorten
            # the window here if a 3-day first run is really wanted.
            body = {
                "query": {
                    "filtered": {
                        "filter": release_window
                    }
                },
                "size": 1500,
                "sort": oldest_first
            }

        log.debug(self._table + " => " + tools.dumps_json(body))

        article = self._data_pool_es.search(self._table, body)
        return article.get('hits', {}).get('hits', [])

    def deal_article(self, article_list):
        '''
        @summary: Process articles. An article already present in
                  tab_iopm_article_info only gets its counters refreshed; a
                  new one is enriched (keywords, word cloud, summary, emotion,
                  VIP flag, weight, hot id) and all new ones are bulk-added
        ---------
        @param article_list: iterable of article dicts; each is modified in
                             place
        ---------
        @result:
        '''

        def join_ids(existing, extra):
            # Append extra to a comma separated id string (or start one)
            return existing + ',' + extra if existing else extra

        # Maps the emotion service's code to ours (1 positive, 2 negative,
        # 3 neutral; service: 0 negative, 1 neutral, 2 positive).
        emotion_map = {0: 2, 1: 3, 2: 1}

        article_infos = []
        for article_info in article_list:
            # Interaction count: sum of the four counters, None counted as 0
            article_info['INTERACTION_COUNT'] = sum(
                article_info[key] or 0
                for key in ('UP_COUNT', 'TRANSMIT_COUNT', 'REVIEW_COUNT',
                            'COMMENT_COUNT'))

            # Already stored: refresh the counters only
            if self._yqtj_es.get('tab_iopm_article_info', article_info["ID"]):
                log.debug('%s 已存在' % article_info['TITLE'])
                data = {
                    "INTERACTION_COUNT": article_info['INTERACTION_COUNT'],
                    "UP_COUNT": article_info['UP_COUNT'],
                    "TRANSMIT_COUNT": article_info['TRANSMIT_COUNT'],
                    "REVIEW_COUNT": article_info['REVIEW_COUNT'],
                    "COMMENT_COUNT": article_info['COMMENT_COUNT']
                }

                self._yqtj_es.update_by_id("tab_iopm_article_info",
                                           data_id=article_info.get("ID"),
                                           data=data)
                continue

            # Title + tag-stripped content
            del_tag_content = tools.del_html_tag(article_info['CONTENT'])
            text = article_info['TITLE'] + del_tag_content

            # Region filter: when PROVINCE is set and no local place name is
            # found, the article is kept but its weight is scaled down
            # (final weight = weight * weight_factor)
            contain_airs = ','.join(
                self._province_filter.find_contain_air(text))
            weight_factor = 1
            if not contain_airs and PROVINCE:
                weight_factor = 0.01

            # Clue keyword matching
            (keywords, clues_ids, zero_ids, first_ids, second_ids,
             keyword_clues) = self._compare_keywords.get_contained_keys(text)

            all_keywords = join_ids(keywords, contain_airs)
            # De-duplicate; set iteration order is arbitrary
            article_info['KEYWORDS'] = ','.join(set(all_keywords.split(',')))
            article_info['CLUES_IDS'] = clues_ids
            article_info['ZERO_ID'] = zero_ids
            article_info['FIRST_ID'] = first_ids
            article_info['SECOND_ID'] = second_ids
            article_info['KEYWORDS_COUNT'] = len(keyword_clues)
            article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

            # Word cloud
            word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
            article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

            # Summary, only when the article does not bring one
            if not article_info['SUMMARY']:
                article_info['SUMMARY'] = self._summary.get_summary(
                    del_tag_content)

            # Emotion; any unknown code is treated as neutral
            emotion = self._emotion.get_emotion(article_info['SUMMARY'])
            article_info['EMOTION'] = emotion_map.get(emotion, 3)

            # Mainstream media (VIP): add its category ids to the article's
            is_vip, zero_id, first_id, second_id = self._vip_checked.is_vip(
                article_info['HOST'], article_info['WEBSITE_NAME'])
            article_info["IS_VIP"] = is_vip
            if is_vip:
                article_info['ZERO_ID'] = join_ids(article_info['ZERO_ID'],
                                                   zero_id)
                article_info['FIRST_ID'] = join_ids(article_info['FIRST_ID'],
                                                    first_id)
                article_info['SECOND_ID'] = join_ids(
                    article_info['SECOND_ID'], second_id)

            # Relevance weight from the related_sort service
            url = IOPM_SERVICE_ADDRESS + 'related_sort'
            data = {
                'article_id': article_info['ID'],
                'clues_ids': article_info['CLUES_IDS'],
                'may_invalid': 0,  # possibly invalid (weibo with @ or #)
                'vip_count': article_info['IS_VIP'],
                'negative_emotion_count':
                1 if article_info['EMOTION'] == 2 else 0,
                'zero_ids': article_info['ZERO_ID']
            }

            # A failed request must not abort the batch: treat an empty
            # response as weight 0
            result = tools.get_json_by_requests(url, data=data) or {}
            article_info['WEIGHT'] = result.get('weight', 0) * weight_factor

            # Hot clustering; INFO_TYPE 3 (weibo) uses the first 30
            # characters of the summary as its title
            if article_info['INFO_TYPE'] == 3:
                article_info['TITLE'] = article_info['SUMMARY'][:30]

            article_info['HOT_ID'] = self._hot_sync.get_hot_id(
                article_info, contain_airs, weight_factor)

            # The third category value logged is first_ids (from keyword
            # matching), not first_id from the VIP check
            log.debug('''
                title         %s
                release_time  %s
                record_time   %s
                url           %s
                匹配的关键字：%s
                线索id        %s
                一级分类      %s
                二级分类      %s
                三级分类      %s
                关键词-线索   %s
                地域          %s
                ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                       article_info['RECORD_TIME'], article_info["URL"],
                       keywords, clues_ids, zero_ids, first_ids, second_ids,
                       keyword_clues, contain_airs))

            article_infos.append(article_info)

        # Bulk-add the new articles
        print('article批量入库 size = %s' % len(article_infos))
        self._yqtj_es.add_batch(article_infos, "ID", 'tab_iopm_article_info')

Ejemplo n.º 10

0

Mostrar archivo

Archivo: hot_sync.py Proyecto: striver-ing/iopm-sync

 def __init__(self):
     '''
     @summary: Create the ES client, the weekly hot sync and a text cutter
               loaded with 'utils/stop_words.txt'
     '''
     self._es = ES()
     self._hot_week_sync = HotWeekSync()

     text_cutter = CutText()
     self._cut_text = text_cutter
     text_cutter.set_stop_words('utils/stop_words.txt')

Ejemplo n.º 11

0

Mostrar archivo

Archivo: hot_sync.py Proyecto: striver-ing/iopm-sync

class HotSync():
    def __init__(self):
        '''
        @summary: Create the ES client, the weekly hot sync and a text cutter
                  loaded with 'utils/stop_words.txt'
        '''
        self._es = ES()
        self._hot_week_sync = HotWeekSync()

        text_cutter = CutText()
        self._cut_text = text_cutter
        text_cutter.set_stop_words('utils/stop_words.txt')

    def _get_today_hots(self, text, release_time):
        '''
        @summary: Search tab_iopm_hot_info for the best title match among the
                  hots released on the same day as release_time
        ---------
        @param text: text matched against the TITLE field
        @param release_time: 'date time' string; only the part before the
                             first space is used
        ---------
        @result: list of hits (at most one, since size is 1)
        '''
        # Take the date part. split() also copes with a value that has no
        # time part; slicing at find(' ') == -1 would drop the last character.
        release_day = release_time.split(' ', 1)[0]

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # released on that day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            # share of query terms that must match
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))
                        }
                    }
                }
            }
            # No "_source" filter is sent, so whole documents are requested
        }

        # No explicit sort: results come back in the default match-score order
        hots = self._es.search('tab_iopm_hot_info', body)

        return hots.get('hits', {}).get('hits', [])


    def get_hot_id(self, article_info, positions, weight_factor):
        '''
        @summary: Cluster an article into the daily hots: merge it into the
                  most similar hot of the same day, or add it as a new hot.
                  The weekly hots are synced in both cases
        ---------
        @param article_info: article dict
        @param positions: stored as POSITIONS on a newly created hot
        @param weight_factor: multiplier applied to the heat and weight
                              written to the daily hot
        ---------
        @result: id of the daily hot the article belongs to
        '''
        article_text = article_info.get("TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_today_hots(article_text, release_time)

        # Hits are sorted by match score, so only the first one is compared
        similar_hot = None
        if hots:
            candidate = hots[0].get('_source')
            candidate_text = tools.del_html_tag(candidate.get('TITLE'))

            similarity = compare_text(article_text, candidate_text)
            if similarity > MIN_SIMILARITY and similarity > 0:
                similar_hot = candidate

        if similar_hot:  # found a similar hot
            # Skip when it is the same item, so it is not counted twice
            if similar_hot["ID"] != article_info["ID"]:
                info_heat = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0)
                negative = 1 if article_info['EMOTION'] == 2 else 0

                data = {}

                # Heat and article count
                data['HOT'] = similar_hot['HOT'] + info_heat * weight_factor
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + 1

                # VIP (mainstream media) count and negative count
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + article_info[
                    "IS_VIP"]
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + negative

                weight_temp = 0  # weight before minus weight after the update
                # Recompute the relevance weight when the hot matched clues
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],
                        'hot_value': data['HOT'],
                        'clues_ids': similar_hot['CLUES_IDS'],
                        'article_count': data['ARTICLE_COUNT'],
                        'vip_count': data["VIP_COUNT"],
                        'negative_emotion_count':
                        data["NEGATIVE_EMOTION_COUNT"],
                        'zero_ids': article_info['ZERO_ID']
                    }

                    # A failed request must not abort clustering: treat an
                    # empty response as weight 0
                    result = tools.get_json_by_requests(
                        url, data=data_args) or {}
                    new_weight = result.get('weight', 0)
                    weight_temp = similar_hot['WEIGHT'] - new_weight
                    data['WEIGHT'] = new_weight * weight_factor

                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)
                # Sync the weekly hots with this article's increments
                self._hot_week_sync.cluster_week_hot(
                    similar_hot,
                    hot_value=info_heat,
                    article_count=1,
                    vip_count=article_info["IS_VIP"],
                    negative_emotion_count=negative,
                    weight=weight_temp)

            return similar_hot.get("ID")
        else:
            # No similar hot: add this article as a new daily hot
            hot_info = deepcopy(article_info)
            # The hot table has no HOT_ID; tolerate articles without the key
            hot_info.pop('HOT_ID', None)

            # User action counters start at zero
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info[
                'EMOTION'] == 2 else 0

            hot_info['HOT'] = INFO_WEIGHT.get(article_info["INFO_TYPE"],
                                              0) * weight_factor
            hot_info['ID'] = article_info.get("ID")
            hot_info['ARTICLE_COUNT'] = 1
            # TODO: the title was already cut during the similarity
            # comparison; reuse that result to save time
            hot_info['HOT_KEYWORDS'] = ','.join(
                self._cut_text.cut_for_keyword(article_info["TITLE"]))
            hot_info['POSITIONS'] = positions
            # Event types are filled in for weekly hots only
            hot_info['EVENT_IDS'] = ''

            self._es.add('tab_iopm_hot_info',
                         hot_info,
                         data_id=hot_info['ID'])
            # Sync the weekly hots
            self._hot_week_sync.cluster_week_hot(hot_info)

            return hot_info['ID']

Ejemplo n.º 12

0

Mostrar archivo

 def __init__(self):
     '''
     @summary: Create the ES client (default arguments) used by this object
     '''
     self._es = ES()

Ejemplo n.º 13

0

Mostrar archivo

Archivo: weibo_article_parser.py Proyecto: striver-ing/mms

sys.path.append('../../')

import init
import base.constance as Constance
import base.base_parser as base_parser
import video_info.parsers.base_parser as self_base_parser
import utils.tools as tools
from utils.log import log
import datetime
from db.elastic_search import ES
import random

# Numeric id of this site - presumably the key used for it in storage;
# confirm against the callers
SITE_ID = 3
# Site display name (Sina Weibo)
NAME = '新浪微博'

# Module-level ES client, created at import time
es = ES()


def get_release_time(mblog):
    try:
        release_time = mblog['created_at']
        data = tools.time.time()
        ltime = tools.time.localtime(data)
        timeStr = tools.time.strftime("%Y-%m-%d", ltime)
        if tools.re.compile('今天').findall(release_time):
            release_time = release_time.replace('今天', '%s' % timeStr)
        elif tools.re.compile('昨天').findall(release_time):
            today = datetime.date.today()
            yesterday = today - datetime.timedelta(days=1)
            release_time = release_time.replace('昨天', '%s' % yesterday)
        elif '小时前' in release_time:

Ejemplo n.º 14

0

Mostrar archivo

Archivo: statistics_clues_msg.py Proyecto: striver-ing/iopm-sync

@summary:
---------
@author: Boris
'''

import sys
import os
sys.path.append('../')

from db.oracledb import OracleDB
from db.elastic_search import ES
import utils.tools as tools
from utils.log import log

# Module-level database clients, created once at import time
oracledb = OracleDB()
esdb = ES()


###########【拆分词组相关】################
def match_keys(keys_list):
    '''
    @summary: 解析乘积关系的词组
    ---------
    @param keys_list: 词组列表
    ---------
    @result:
    '''

    list_size = len(keys_list)

    if list_size < 2:

Ejemplo n.º 15

0

Mostrar archivo

Archivo: article_sync.py Proyecto: Boris-code/csr

class ArticleSync():
    def __init__(self, table):
        '''
        @summary: Load the persisted sync times and create the helper objects
        ---------
        @param table: name of the table whose articles are synchronised
        ---------
        @result:
        '''
        # Previously saved record times; fall back to an empty dict when
        # nothing usable could be read from SYNC_TIME_FILE.
        saved_times = tools.get_json(tools.read_file(SYNC_TIME_FILE))
        self._record_time = saved_times or {}

        # Helpers used while enriching articles
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._es = ES()
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()

        self._table = table
        # Key under which this table's last processed record time is stored
        key_template = '{table}_record_time'
        self._per_record_time_key = key_template.format(table=self._table)

    def get_article_info(self):
        '''
        @summary: Build an empty article record (the article structure)
        ---------
        ---------
        @result: a new dict with every article field set to its default;
                 RECORD_TIME is filled with the current date
        '''

        unset, blank = None, ""

        # (field, default) pairs, kept in the record's field order
        field_defaults = (
            ("EMOTION", unset),
            ("HOST", blank),
            ("AUTHOR", blank),
            ("URL", blank),
            ("WEBSITE_NAME", blank),
            ("ACCOUNT", blank),
            ("REVIEW_COUNT", unset),
            ("KEYWORDS_COUNT", unset),
            ("RELEASE_TIME", blank),
            ("CONTENT", blank),
            ("ID", unset),
            ("UUID", blank),
            ("WEIGHT", unset),
            ("CLUES_IDS", blank),
            ("UP_COUNT", unset),
            ("INTERACTION_COUNT", unset),
            ("RECORD_TIME", tools.get_current_date()),
            ("COMMENT_COUNT", unset),
            ("IS_VIP", unset),
            ("INFO_TYPE", unset),
            ("HOT_ID", unset),
            ("KEYWORD_CLUES_ID", blank),
            ("MAY_INVALID", unset),
            ("TITLE", blank),
            ("KEYWORDS", blank),
            ("TRANSMIT_COUNT", unset),
            ("ZERO_ID", unset),
            ("FIRST_ID", unset),
            ("SECOND_ID", unset),
            ("SUMMARY", blank),
            ("WORD_CLOUD", blank),
            ("IMAGE_URL", blank),
        )

        return dict(field_defaults)

    def get_article_clues_src(self):
        '''
        @summary: Build an empty row for the article <-> clue link table
        ---------
        ---------
        @result: a new dict with CLUES_ID, ARTICLE_ID and ID set to ""
        '''
        return dict(CLUES_ID="", ARTICLE_ID="", ID="")

    def get_per_record_time(self):
        per_record_time = self._record_time.get(self._per_record_time_key)

        return per_record_time

    def record_now_record_time(self, record_time):
        '''
        @summary: Remember record_time for this table and persist all record times
        ---------
        @param record_time: record time to store under this table's key
        ---------
        @result:
        '''
        self._record_time.update({self._per_record_time_key: record_time})

        # The whole mapping is rewritten to SYNC_TIME_FILE as json
        serialized = tools.dumps_json(self._record_time)
        tools.write_file(SYNC_TIME_FILE, serialized)

    def get_article(self):
        '''
        @summary: Query today's articles through the `_sql` HTTP endpoint.
        Rows are restricted to today's release_time and, when a record time
        was saved before, to rows whose record_time is newer than it.
        ---------
        ---------
        @result: the list found under hits.hits of the response ([] if absent)
        '''

        per_record_time = self.get_per_record_time()
        today_time = tools.get_current_date('%Y-%m-%d')

        # Both queries share the same "released today" condition and ordering
        released_today = ("release_time >= '{today_time} 00:00:00' and "
                          "release_time <= '{today_time} 23:59:59' "
                          "order by record_time")
        if per_record_time:
            template = ("select * from {table} where "
                        "record_time > '{record_time}' and " + released_today)
        else:
            template = "select * from {table} where " + released_today

        sql = template.format(table=self._table,
                              record_time=per_record_time,
                              today_time=today_time)

        url = 'http://{address}/_sql?sql={sql}'.format(address=ADDRESS,
                                                       sql=sql)
        log.debug(url)

        response = tools.get_json_by_requests(url)
        outer_hits = response.get('hits', {})
        return outer_hits.get('hits', [])

    def deal_article(self, article_list):
        '''
        @summary: Enrich every article (interaction count, matched clues,
        emotion, vip flag, weight, word cloud, summary, hot id) and store the
        enriched articles in 'tab_iopm_article_info'. The article/clue link
        rows of each article are stored in 'tab_iopm_article_clues_src'.
        ---------
        @param article_list: iterable of article dicts; every dict is updated
                             in place
        ---------
        @result:
        '''
        article_infos = []
        # Fill in the remaining information
        for article_info in article_list:
            # Interaction count: missing (None) counters are treated as 0
            article_info['INTERACTION_COUNT'] = (
                (article_info['UP_COUNT'] or 0) +
                (article_info['TRANSMIT_COUNT'] or 0) +
                (article_info['REVIEW_COUNT'] or 0) +
                (article_info['COMMENT_COUNT'] or 0))

            # Match clue keywords against title + tag-stripped content
            del_tag_content = tools.del_html_tag(article_info['CONTENT'])
            text = article_info['TITLE'] + del_tag_content
            keywords, clues_ids, zero_ids, first_id, second_ids, keyword_clues = self._compare_keywords.get_contained_keys(
                text)

            article_info['KEYWORDS'] = keywords
            article_info['CLUES_IDS'] = clues_ids
            article_info['ZERO_ID'] = zero_ids
            article_info['FIRST_ID'] = first_id
            article_info['SECOND_ID'] = second_ids
            article_info['KEYWORDS_COUNT'] = len(keyword_clues)
            article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

            # Link table between clues and articles
            article_clues_srcs = []
            if clues_ids:
                for clues_id in clues_ids.split(','):
                    article_clues_src = self.get_article_clues_src()
                    article_clues_src['ID'] = tools.get_uuid(
                        clues_id, article_info['ID'])
                    article_clues_src['CLUES_ID'] = clues_id
                    article_clues_src['ARTICLE_ID'] = article_info['ID']

                    article_clues_srcs.append(article_clues_src)

                # Store the link rows once, after all of them are built
                # (previously the growing list was re-sent on every iteration).
                self._es.add_batch(article_clues_srcs, "ID",
                                   'tab_iopm_article_clues_src')

            # Emotion analysis. Local codes: 1 positive, 2 negative,
            # 3 neutral. Baidu codes: 0 negative, 1 neutral, 2 positive.
            emotion = self._emotion.get_emotion(del_tag_content)
            if emotion == 0:
                emotion = 2
            elif emotion == 1:
                emotion = 3
            elif emotion == 2:
                emotion = 1
            else:
                # Anything unexpected is treated as neutral
                emotion = 3

            article_info['EMOTION'] = emotion

            # Mainstream media: checked by URL first, then by website name
            is_vip = self._vip_checked.is_vip(
                article_info['URL']) or self._vip_checked.is_vip(
                    article_info['WEBSITE_NAME'])
            article_info["IS_VIP"] = is_vip

            # Relevance weight: only requested when at least one clue matched
            if article_info['CLUES_IDS']:
                url = IOPM_SERVICE_ADDRESS + 'related_sort'
                data = {
                    'article_id': article_info['ID'],  # article id
                    'clues_ids': article_info['CLUES_IDS'],  # clue ids
                    'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
                    'vip_count': article_info['IS_VIP'],  # mainstream media count
                    'negative_emotion_count':
                    1 if article_info['EMOTION'] == 2 else 0,  # negative emotion count
                    'zero_ids': article_info['ZERO_ID']
                }

                result = tools.get_json_by_requests(url, data=data)
                article_info['WEIGHT'] = result.get('weight', 0)
            else:
                article_info['WEIGHT'] = 0

            # Word cloud
            word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
            article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

            # Summary: only generated when the article has none
            if not article_info['SUMMARY']:
                article_info['SUMMARY'] = self._summary.get_summary(
                    del_tag_content)

            # Similar articles / hot topics
            if article_info['INFO_TYPE'] == 3:  # weibo
                # The first 30 characters of the summary are used as the title
                article_info['TITLE'] = article_info['SUMMARY'][:30]

            article_info['HOT_ID'] = self._hot_sync.get_hot_id(article_info)

            log.debug('''
                title         %s
                release_time  %s
                url           %s
                匹配的关键字：%s
                线索id        %s
                一级分类      %s
                二级分类      %s
                三级分类      %s
                关键词-线索   %s
                ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                       article_info["URL"], keywords, clues_ids, zero_ids,
                       first_id, second_ids, keyword_clues))

            article_infos.append(article_info)

        # Store the enriched articles
        print('article入库')
        self._es.add_batch(article_infos, "ID", 'tab_iopm_article_info')

Ejemplo n.º 16

0

Mostrar archivo

Archivo: wechat_service.py Proyecto: yunsite/wechat-spider-1

class WechatService():
    """Account scheduling state and storage helpers for the WeChat crawler.

    Every attribute below is defined on the class, so it is shared by all
    instances; the methods read and write them through ``WechatService.``.
    """
    _db = OracleDB()
    _es = ES()
    _redisdb = RedisDB()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    # Accounts waiting to be handed out by get_next_account
    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # one full round has been finished
    _is_all_done = False  # messages published today by all accounts have been crawled

    # wechat_sogou availability; the timestamp is refreshed when the service
    # reports a verification code and after every retry
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # wechat_public_platform availability; the timestamp is refreshed when
    # the service reports an error and after every retry
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        """Nothing to set up per instance; all state lives on the class."""
        pass

    def __load_todo_account(self):
        '''
        @summary: Read accounts from the redis key 'wechat:account' (count=1)
        and append them to the shared todo queue
        ---------
        ---------
        @result:
        '''
        fetched = WechatService._redisdb.sget('wechat:account', count=1)

        for raw_account in fetched:
            # NOTE(review): eval() runs whatever text is stored in redis;
            # if the entries are plain literals ast.literal_eval would be a
            # safer choice - confirm the stored format before switching.
            WechatService._todo_accounts.append(eval(raw_account))

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: Check whether the account has newly published articles.
        Sogou WeChat is asked first; when it gives no result or answers with
        a verification code, the WeChat public platform is asked instead.
        A disabled service is only retried once more than TIME_INTERVAL has
        passed since its recorded timestamp.
        ---------
        @param account_id: passed to the sogou check as account_id
        @param account_name: passed to the sogou check as account
        @param __biz: passed to the public platform check
        ---------
        @result: the result code of the last service asked, or '' when no
                 service was asked
        '''

        service = WechatService
        now = tools.get_current_timestamp

        result = ''

        # ---- sogou wechat ----
        if service._wechat_sogou_enable:
            result = service._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            # UPDATE / NOT_UPDATE / ERROR need no bookkeeping; only a
            # verification code marks the service as blocked.
            if result == constance.VERIFICATION_CODE:
                service._wechat_sogou_enable = False
                service._wechat_sogou_last_unenable_time = now()

        elif now() - service._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            # Disabled, but the waiting interval is over: try again
            result = service._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                service._wechat_sogou_enable = True

            # Restart the waiting interval whatever the outcome was
            service._wechat_sogou_last_unenable_time = now()

        # ---- wechat public platform, used when sogou gave nothing usable ----
        if not result or result == constance.VERIFICATION_CODE:
            if service._wechat_public_platform_enable:
                result = service._wechat_public_platform.is_have_new_article(
                    __biz)
                # Only an error marks the platform as blocked
                if result == constance.ERROR:
                    service._wechat_public_platform_enable = False
                    service._wechat_public_platform_last_unenable_time = now()

            elif now() - service._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
                # Disabled, but the waiting interval is over: try again
                result = service._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    service._wechat_public_platform_enable = True

                # Restart the waiting interval whatever the outcome was
                service._wechat_public_platform_last_unenable_time = now()

        return result

    def get_next_account(self):
        '''
        @summary: Take the next account from the todo queue (reloading the
        queue first when it is empty) and set its spider_status to 602 in
        TAB_IOPM_SITE
        ---------
        ---------
        @result: (account_id, biz), or None when no account is available
        '''

        if not WechatService._todo_accounts:
            self.__load_todo_account()

        if not WechatService._todo_accounts:
            return None

        # Each queued entry has five fields; only the id and biz are used
        entry = WechatService._todo_accounts.popleft()
        _oracle_id, account_id, _account_name, _last_release_time, biz = entry

        sql = "update TAB_IOPM_SITE t set t.spider_status=602 where t.biz = '%s'" % (
            biz)
        WechatService._db.update(sql)

        return account_id, biz

    def update_account_article_num(self, __biz):
        '''
        @summary: Count the account's articles in the 'wechat_article' index
        (today and in total) and write the numbers, together with
        spider_status=603, to TAB_IOPM_SITE
        ---------
        @param __biz: account biz the articles are matched on
        ---------
        @result:
        '''
        es = WechatService._es

        # Articles recorded today
        today_body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte":
                                tools.get_current_date('%Y-%m-%d') +
                                ' 00:00:00',
                                "lte":
                                tools.get_current_date('%Y-%m-%d') +
                                ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        today_hits = es.search('wechat_article', today_body).get('hits', {})
        today_msg = today_hits.get('total', 0)

        # All articles ever recorded
        total_body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        total_hits = es.search('wechat_article', total_body).get('hits', {})
        total_msg = total_hits.get('total', 0)

        # total_msg is only written when it is non-zero
        if not total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        '''
        @summary: Tell whether a document with data_id can be fetched from table
        ---------
        @param table: used both as the table and as the doc_type of the lookup
        @param data_id: id of the document
        ---------
        @result: True when the lookup returns something truthy, else False
        '''
        found = WechatService._es.get(table, data_id=data_id, doc_type=table)
        return bool(found)

    def add_article_info(self, article_info):
        '''
        @summary: Log the main fields of an article and store it in 'wechat_article'
        ---------
        @param article_info: article dict; must contain title, release_time,
                             author, account and url. article_id (if present)
                             is used as the document id
        ---------
        @result:
        '''

        logged_fields = ('title', 'release_time', 'author', 'account', 'url')
        logged_values = tuple(article_info[field] for field in logged_fields)

        log.debug('''
            -----文章信息-----
            标题     %s
            发布时间 %s
            作者     %s
            公众号   %s
            url      %s
            ''' % logged_values)

        article_id = article_info.get('article_id')
        WechatService._es.add('wechat_article', article_info, article_id)

    def add_account_info(self, account_info):
        '''
        @summary: Log an account as json and store it in 'wechat_account'
        ---------
        @param account_info: account dict; its '__biz' value (if present) is
                             used as the document id
        ---------
        @result:
        '''
        account_json = tools.dumps_json(account_info)
        log.debug('''
            -----公众号信息-----
            %s''' % account_json)

        biz = account_info.get('__biz')
        WechatService._es.add('wechat_account', account_info, biz)

Ejemplo n.º 17

0

Mostrar archivo

 def __init__(self):
     """Create the ``ES`` client, constructed with ``YQTJ``."""
     self._yqtj_es = ES(YQTJ)

Ejemplo n.º 18

0

Mostrar archivo

class UpdateWeight():
    """Recompute the WEIGHT of stored articles and hot topics.

    Documents are read from and written back through an ``ES`` client
    constructed with ``YQTJ``.
    """
    def __init__(self):
        """Create the ``ES`` client used for searching and updating."""
        self._yqtj_es = ES(YQTJ)

    def get_articles(self, table, record_time, release_time_begin,
                     release_time_end):
        '''
        @summary: Search up to 1500 documents, oldest RECORD_TIME first
        ---------
        @param table: table to search
        @param record_time: only documents with RECORD_TIME greater than this
        @param release_time_begin: lower bound of RELEASE_TIME (inclusive)
        @param release_time_end: upper bound of RELEASE_TIME (inclusive)
        ---------
        @result: the list found under hits.hits of the response ([] if absent)
        '''
        # RECORD_TIME strictly newer than the given record_time
        newer_than_record_time = {
            "range": {
                "RECORD_TIME": {
                    "gt": record_time
                }
            }
        }
        # RELEASE_TIME inside the requested window
        released_in_window = {
            "range": {
                "RELEASE_TIME": {
                    "gte": release_time_begin,
                    "lte": release_time_end
                }
            }
        }

        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [newer_than_record_time,
                                     released_in_window]
                        }
                    }
                }
            },
            "size": 1500,
            "sort": [{
                "RECORD_TIME": "asc"
            }]
        }

        print(tools.dumps_json(body))

        response = self._yqtj_es.search(table, body)
        outer_hits = response.get('hits', {})
        return outer_hits.get('hits', [])

    def update_article_weight(self, articles):
        release_time = ''
        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'article_id': article_info['ID'],  # 文章id
                'clues_ids': article_info['CLUES_IDS'],  # 线索ids
                'may_invalid': 0,  #是否可能无效（微博包含@ 或者#）
                'vip_count': article_info['IS_VIP'],  # 主流媒体数
                'negative_emotion_count': article_info['EMOTION'],  # 负面情感数
                'zero_ids': article_info['ZERO_ID']
            }
            print(article_info["TITLE"])
            print(article_info["RELEASE_TIME"])

            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            weight = result.get('weight', 0)  # * weight_factor 没有考虑到地域
            tools.print_one_line("修改相关度 %s -> %s" %
                                 (article_info['WEIGHT'], weight))

            if self._yqtj_es.update_by_id('tab_iopm_article_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                release_time, record_time = article_info[
                    "RELEASE_TIME"], article_info["RECORD_TIME"]

        return release_time, record_time

    def update_hot_weight(self, articles):
        release_time = ''
        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'hot_id': article_info['ID'],  # 文章id
                'hot_value': article_info['HOT'],  # 热度值
                'clues_ids': article_info['CLUES_IDS'],  #相关舆情匹配到的线索id
                'article_count': article_info['ARTICLE_COUNT'],  # 文章总数
                'vip_count': article_info["VIP_COUNT"],  # 主流媒体数
                'negative_emotion_count':
                article_info["NEGATIVE_EMOTION_COUNT"],  # 负面情感数
                'zero_ids': article_info['ZERO_ID']
            }
            print('''
                release_time %s
                record_time  %s
                ''' %
                  (article_info["RELEASE_TIME"], article_info["RECORD_TIME"]))

            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            weight = result.get('weight', 0)  # * weight_factor 没有考虑到地域
            tools.print_one_line("修改相关度 %s -> %s" %
                                 (article_info['WEIGHT'], weight))

            if self._yqtj_es.update_by_id('tab_iopm_hot_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                record_time = article_info['RECORD_TIME']

        return record_time

Ejemplo n.º 19

0

Mostrar archivo

 def __init__(self):
     """Create the ES client and restore the last processed csr_res_id.

     The id is read from STO_CURRENT_ID_FILE; when nothing is read the
     position starts at 0.
     """
     self._es = ES()
     self._current_csr_res_id = tools.read_file(STO_CURRENT_ID_FILE)
     self._current_csr_res_id = (int(self._current_csr_res_id)
                                 if self._current_csr_res_id else 0)

Ejemplo n.º 20

0

Mostrar archivo

class NewsCluster():
    def __init__(self):
        '''
        @summary: Create the ES client and restore the last processed
        csr_res_id from STO_CURRENT_ID_FILE (0 when nothing is read)
        ---------
        ---------
        @result:
        '''
        self._es = ES()
        self._current_csr_res_id = tools.read_file(STO_CURRENT_ID_FILE)
        self._current_csr_res_id = (int(self._current_csr_res_id)
                                    if self._current_csr_res_id else 0)

    def _get_same_day_hots(self, text, start_time):
        '''
        @summary: Search 'tab_news_csr_hot' for hot topics of the same day
        whose csr_content is similar to text
        ---------
        @param text: content to match against csr_content
        @param start_time: time string whose part before the first space is
                           the day, e.g. '2018-01-01 12:00:00'
        ---------
        @result: the list found under hits.hits of the response ([] if
                 absent), in the default order (by match score)
        '''
        # Day part of the time string. split() keeps the whole string when
        # it contains no space; slicing with find() dropped the last
        # character in that case.
        news_day_time = start_time.split(' ', 1)[0]
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "start_time": {
                                "gte": news_day_time + ' 00:00:00',
                                # End of the day (was the invalid ' 59:59:59')
                                'lte': news_day_time + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query":
                            text,
                            "fields": ["csr_content"],
                            "operator":
                            "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY *
                                                            100))  # share of keywords that must match
                        }
                    }
                }
            },
            "_source": ["hot_id", "csr_res_ids", "csr_content", 'hot'],
            "highlight": {
                "fields": {
                    "csr_content": {}
                }
            }
        }

        # Sorted by match score by default
        hots = self._es.search('tab_news_csr_hot', body)

        return hots.get('hits', {}).get('hits', [])

    def _save_current_id(self):
        '''
        @summary: Write the id reached so far to STO_CURRENT_ID_FILE so the
        next run can continue from it
        ---------
        ---------
        @result:
        '''

        current_id_text = str(self._current_csr_res_id)
        tools.write_file(STO_CURRENT_ID_FILE, current_id_text)

    def deal_news(self):
        '''
        @summary: Endlessly read news from 'tab_news_csr_result' with a
        csr_res_id greater than the last processed one and attach each of
        them to a hot topic in 'tab_news_csr_hot': a similar hot topic of the
        same day gets its hot value increased and the news id appended,
        otherwise the news becomes a new hot topic. The processed id is saved
        after every news item.
        ---------
        ---------
        @result: never returns
        '''
        while True:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "csr_res_id": {  # only ids greater than the current one
                                    "gt": self._current_csr_res_id
                                }
                            }
                        }
                    }
                },
                "_source": ["csr_res_id", "csr_content", "start_time"],
                "sort": [{
                    "csr_res_id": "asc"
                }]
            }

            news_json = self._es.search('tab_news_csr_result', body)
            news_list = news_json.get('hits', {}).get('hits', [])

            if not news_list:
                # Nothing new yet: wait and poll again
                log.debug(
                    'tab_news_csr_result 表中无大于%s的csr_res_id\nsleep %s...' %
                    (self._current_csr_res_id, SLEEP_TIME))
                tools.delay_time(SLEEP_TIME)
                continue

            for news_info in news_list:
                news = news_info.get('_source')
                csr_res_id = news.get('csr_res_id')
                csr_content = news.get('csr_content')
                start_time = news.get('start_time')

                log.debug('''
                    处理 tab_news_csr_result
                    csr_res_id  %s
                    start_time  %s
                    csr_content %s
                    ''' % (csr_res_id, start_time, csr_content))

                # Look for a similar hot topic. The hits are sorted by match
                # score, so only the first one has to be compared.
                similar_hot = None
                hots = self._get_same_day_hots(csr_content, start_time)
                if hots:
                    best_hot = hots[0].get('_source')
                    similarity = compare_text(csr_content,
                                              best_hot.get('csr_content'))
                    if similarity > MIN_SIMILARITY:
                        similar_hot = best_hot

                if similar_hot:  # a similar hot topic exists
                    log.debug('找到所属热点：%s' % similar_hot.get('csr_content'))

                    # Raise the hot value and append the id of this news.
                    # The ids are joined by formatting, so it also works when
                    # they are numbers (plain '+' raised TypeError then).
                    data = {
                        "hot": similar_hot["hot"] + 1,
                        "csr_res_ids": '%s,%s' % (similar_hot["csr_res_ids"],
                                                  csr_res_id)
                    }

                    self._es.update_by_id("tab_news_csr_hot",
                                          data_id=similar_hot.get("hot_id"),
                                          data=data)

                else:  # no similar hot topic: this news becomes one
                    log.debug('无所属热点')

                    hot_info = {
                        'hot_id': csr_res_id,
                        'hot': 1,
                        'start_time': start_time,
                        'csr_res_ids': csr_res_id,
                        'csr_content': csr_content
                    }
                    self._es.add('tab_news_csr_hot',
                                 hot_info,
                                 data_id=csr_res_id)

                # Remember how far we got
                self._current_csr_res_id = csr_res_id
                self._save_current_id()

Ejemplo n.º 21

0

Mostrar archivo

Archivo: wechat_service.py Proyecto: xunux/wechat-spider-1

class WechatService():
    """Hands out WeChat accounts read from TAB_IOPM_SITE and stores crawl results."""
    # Account rows still to be handed out; defined on the class, so shared
    # by all instances
    _todo_accounts = collections.deque()
    # rownum at which the next page of accounts starts
    _rownum = 1

    # Set to True when a page query returns no rows; reset to False by
    # get_next_account
    _is_done = False

    def __init__(self):
        """Create the database and ES clients and fill the account queue if it is empty."""
        self._db = OracleDB()
        self._es = ES()
        self.__load_todo_account()

    def __load_todo_account(self):
        '''
        @summary: Fill the shared todo queue with the next page of accounts
        (rows of TAB_IOPM_SITE that have a biz). When the page is empty the
        end of the table was reached: _is_done is set and loading restarts
        from the first page. Does nothing when the queue still has entries.
        ---------
        ---------
        @result: None. The queue stays empty when the table has no matching
                 rows at all (this case used to recurse without end).
        '''
        if WechatService._todo_accounts:
            return

        # At most two queries: the current page and, after wrapping around,
        # the first page.
        for _ in range(2):
            sql = '''
                select *
                   from (select rownum r, t.id, t.domain, t.biz
                           from TAB_IOPM_SITE t
                          where t.biz is not null and rownum < {size})
                  where r >= {rownum}
                '''.format(rownum=WechatService._rownum,
                           size=WechatService._rownum + SIZE)

            results = self._db.find(sql)
            if results:
                WechatService._todo_accounts = collections.deque(
                    results)  # turn the rows into a queue
                WechatService._rownum += SIZE
                return

            # Empty page: one round over the table is finished
            WechatService._is_done = True
            if WechatService._rownum == 1:
                # Even the first page is empty, nothing can be loaded
                return
            WechatService._rownum = 1

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: 返回biz, 是否已做完一圈 (biz, True)
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        next_account_info = WechatService._todo_accounts.popleft()
        next_account_id = next_account_info[2]
        next_account_biz = next_account_info[3]

        next_account = next_account_id, next_account_biz, WechatService._is_done
        # 重置_is_done 状态
        WechatService._is_done = False

        return next_account

    def is_exist(self, table, data_id):
        if self._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary: Log an article as json and store it in 'wechat_article'
        ---------
        @param article_info: article dict; its 'article_id' value (if
                             present) is used as the document id
        ---------
        @result:
        '''

        article_json = tools.dumps_json(article_info)
        log.debug('''
            -----文章信息-----
            %s''' % article_json)

        article_id = article_info.get('article_id')
        self._es.add('wechat_article', article_info, article_id)

    def add_account_info(self, account_info):
        '''
        @summary: Log an account as json and store it in 'wechat_account'
        ---------
        @param account_info: account dict; its '__biz' value (if present) is
                             used as the document id
        ---------
        @result:
        '''
        account_json = tools.dumps_json(account_info)
        log.debug('''
            -----公众号信息-----
            %s''' % account_json)

        biz = account_info.get('__biz')
        self._es.add('wechat_account', account_info, biz)

Ejemplo n.º 22

0

Mostrar archivo

Archivo: wechat_service.py Proyecto: xunux/wechat-spider-1

 def __init__(self):
     """Create the database and ES clients, then load the todo accounts."""
     self._db = OracleDB()
     self._es = ES()
     self.__load_todo_account()

Ejemplo n.º 23

0

Mostrar archivo

 def __init__(self):
     """Create the ES client and start an event filter."""
     self._es = ES()
     self._event_filter = EventFilter()
     # presumably EventFilter is a thread or background worker — TODO confirm
     self._event_filter.start()

Ejemplo n.º 24

0

Mostrar archivo

Archivo: elastic_search_server.py Proyecto: striver-ing/iopm-service

 def __init__(self):
     """Create the ES client, constructed with a hard-coded address."""
     # presumably the Elasticsearch host — TODO confirm, and consider moving it to configuration
     self._es = ES('192.168.60.40')

Ejemplo n.º 25

0

Mostrar archivo

    def __init__(self):
        '''
        @summary: Create the ES and database clients and restore the saved
        max ids from STO_MAX_ID_FILE ({} when nothing usable is stored)
        ---------
        ---------
        @result:
        '''
        self._es = ES()
        self._db = OracleDB()

        self._max_id = tools.read_file(STO_MAX_ID_FILE)
        # NOTE(review): eval() runs whatever text is in the file; if the file
        # only ever holds a literal, ast.literal_eval would be the safer
        # choice - confirm the stored format before switching.
        self._max_id = (eval(self._max_id) or {}) if self._max_id else {}

Ejemplo n.º 26

0

Mostrar archivo

class ExportData():
    """Export records from MongoDB (or in-memory dicts) into Oracle or MySQL.

    Every record is turned into an ``insert`` statement (plus an ``update``
    statement when ``unique_key_mapping_source_key`` is configured) by
    ``make_sql``. Successfully written records can additionally be pushed to
    ES and reported through a callback.
    """

    INSERT = 1
    UPDATE = 2
    EXCEPTION = 3

    # key_map prefixes whose value does not come from the source record, so
    # the "empty source value -> null" shortcut must not touch ``data`` for them.
    _LITERAL_TYPES = ('vint', 'vstr', 'vdate', 'sint', 'sstr')

    def __init__(self,
                 source_table='',
                 aim_table='',
                 key_map='',
                 unique_key=None,
                 unique_key_mapping_source_key=None,
                 update_read_status=True,
                 condition={'read_status': 0},
                 datas=[],
                 callback='',
                 sync_to_es=False):
        '''
        @summary: Initialise the exporter.
        ---------
        @param source_table: source table in MongoDB; when empty no MongoDB
            client is created and ``datas`` is exported instead
        @param aim_table:    target table
        @param key_map:      mapping of target column -> "<type>_<source>"
        eg: key_map = {
            'aim_key1' : 'str_source_key2',          # value of source key, rendered as a quoted string
            'aim_key2' : 'int_source_key3',          # value of source key, rendered as a number
            'aim_key3' : 'date_source_key4',         # value of source key, rendered as a date
            'aim_key4' : 'vint_id',                  # literal value, rendered as a number
            'aim_key5' : 'vstr_name',                # literal value, rendered as a quoted string
            'aim_key6' : 'vdate_name',               # literal value, rendered as a date
            'aim_key7' : 'sint_select id from xxx'   # result of the sql, rendered as a number
            'aim_key8' : 'sstr_select name from xxx' # result of the sql, rendered as a quoted string
            'aim_key9' : 'clob_key8'                 # value of source key, rendered as to_clob(...) chunks
        }

        @param unique_key:    unique key of the target table, used for
            de-duplication (passed to ``set_unique_key`` of the target db)
        @param unique_key_mapping_source_key: target unique key -> typed
            source key; when set, rows that cannot be inserted are updated
         eg: unique_key_mapping_source_key = {
            'url':'str_url'
         }
        @param update_read_status: mark exported MongoDB records with
            ``read_status = 1``
        @param condition:    MongoDB filter for the records to export
            (default ``read_status = 0``); the default dict is never mutated
        @param datas:   records to export directly, ``[{...}, {...}]`` or a
            single ``{...}``; used when there is no MongoDB source
        @param callback: called once per processed record as
            ``callback(execute_type, sql, data_json)`` where execute_type is
            ExportData.INSERT, ExportData.UPDATE or ExportData.EXCEPTION
        @param sync_to_es: also push every written record to ES
        ---------
        @result:
        '''

        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._update_read_status = update_read_status
        self._condition = condition

        self._mongodb = MongoDB() if self._source_table else ''
        self._datas = datas
        self._sync_to_es = sync_to_es
        self._callback = callback

        # Both clients are created on demand; initialising them here keeps
        # close() and the ES sync from failing with AttributeError.
        self._es = None
        self._aim_db = None

        self._is_oracle = False
        self._is_set_unique_key = False
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key

    def _apply_export_settings(self, source_table, aim_table, key_map,
                               unique_key, unique_key_mapping_source_key,
                               update_read_status, condition, datas,
                               callback):
        """Store the per-call export settings and reset the counters."""
        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        # In-memory records have no read_status to update.
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition={'read_status': 0},
                         datas=[],
                         callback='',
                         sync_to_es=False):
        """Export to Oracle and return the number of inserted + updated rows.

        When ``aim_table`` is given the arguments replace the settings passed
        to the constructor; otherwise the constructor settings are used as is.
        Parameters have the same meaning as in ``__init__``.
        """
        if aim_table:
            if self._aim_table != aim_table:
                # A different target table needs its unique key set again.
                self._is_set_unique_key = False
                self._mongodb = MongoDB() if source_table else ''

            self._apply_export_settings(
                source_table, aim_table, key_map, unique_key,
                unique_key_mapping_source_key, update_read_status, condition,
                datas, callback)
            # The ES client itself is created lazily by __export, so it is no
            # longer thrown away right after being built.
            self._sync_to_es = sync_to_es

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()

    def export_to_mysql(self,
                        source_table='',
                        aim_table='',
                        key_map='',
                        unique_key=None,
                        unique_key_mapping_source_key=None,
                        update_read_status=True,
                        condition={'read_status': 0},
                        datas=[],
                        callback=''):
        """Export to MySQL and return the number of inserted + updated rows.

        The arguments always replace the current settings. Parameters have
        the same meaning as in ``__init__``.
        """
        if self._aim_table != aim_table:
            self._is_set_unique_key = False

        self._apply_export_settings(
            source_table, aim_table, key_map, unique_key,
            unique_key_mapping_source_key, update_read_status, condition,
            datas, callback)

        self._aim_db = MysqlDB()
        # Without this reset an earlier export_to_oracle() call would make
        # make_sql keep emitting Oracle-only to_date(...) expressions.
        self._is_oracle = False
        return self.__export()

    def _date_sql(self, aim_key, value):
        """Return the (insert, update) SQL fragments for one date value."""
        if self._is_oracle:
            # Values of at most 10 characters (date only) get a format mask of
            # the same length; longer values get the full date-time mask.
            format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(value)
                                                  if len(value) <= 10 else None]
            insert_part = "to_date('%s','{}'), ".format(format_date)
            update_part = aim_key + "= to_date('%s','%s'), " % (value,
                                                                format_date)
        else:
            insert_part = "'%s', "
            update_part = aim_key + " = '%s', " % value
        return insert_part, update_part

    def make_sql(self, data):
        '''
        @summary: Build the SQL statements for one record.
        ---------
        @param data: record dict
        ---------
        @result: (insert_sql, update_sql, data_json) when
            unique_key_mapping_source_key is set, otherwise
            (insert_sql, data_json). update_sql has no where clause yet;
            data_json maps upper-cased target keys to values and is what gets
            sent to ES.
        '''
        aim_keys = tuple(self._key_map.keys())

        # Each source key carries both the value type and the key/literal:
        # "<type>_<key>". All keys are parsed before any SQL is built.
        specs = []
        for aim_key, source_key in self._key_map.items():
            parts = source_key.split('_', 1)
            specs.append((aim_key, parts[0], parts[1]))

        insert_sql = 'insert into ' + self._aim_table + " (" + ', '.join(
            aim_keys) + ") values ("
        update_sql = 'update ' + self._aim_table + " set "
        data_json = {}  # what gets indexed into ES
        values = []

        for aim_key, value_type, key in specs:
            json_key = aim_key.upper()

            if value_type not in self._LITERAL_TYPES and (
                    not data[key] and data[key] != 0):
                # Empty source value (but not the number 0) -> SQL null.
                values.append('null')
                insert_sql += '%s, '
                update_sql += aim_key + " = %s, " % values[-1]
                data_json[json_key] = None

            elif value_type == 'str':
                # Double the single quotes so the statement stays valid.
                values.append(str(data[key]).replace("'", "''"))
                insert_sql += "'%s', "
                update_sql += aim_key + " = '%s', " % values[-1]
                data_json[json_key] = values[-1]

            elif value_type == 'clob':
                text = str(data[key]).replace("'", "''")
                if not text:
                    insert_sql += "'%s', "
                    values.append(text)
                    update_sql += aim_key + " = '%s', " % values[-1]
                    data_json[json_key] = None
                else:
                    # The text is passed through tools.cut_string(text, 1000)
                    # and every returned piece gets its own to_clob(...).
                    chunks = tools.cut_string(text, 1000)
                    clob_text = ' || '.join(
                        "to_clob('%s')" % chunk for chunk in chunks)
                    values.append(clob_text)
                    insert_sql += "%s, "
                    update_sql += aim_key + " = %s, " % values[-1]
                    data_json[json_key] = data[key]

            elif value_type == 'int':
                raw = data[key]
                if isinstance(raw, bool):
                    # bool must be tested before int: it is an int subclass
                    # and would otherwise be rendered as True/False.
                    values.append(1 if raw else 0)
                elif isinstance(raw, (int, float, str)):
                    values.append(raw)
                else:  # _id: use the last 6 hex digits as a number
                    values.append(int(str(raw)[-6:], 16))

                insert_sql += '%s, '
                update_sql += aim_key + " = %s, " % values[-1]
                # NOTE(review): eval() on a string taken from the record runs
                # arbitrary expressions; replace it if records are untrusted.
                data_json[json_key] = eval(
                    values[-1]) if isinstance(values[-1], str) else values[-1]

            elif value_type == 'date':
                values.append(data[key].replace('年', '-').replace(
                    '月', '-').replace('日', ''))
                insert_part, update_part = self._date_sql(aim_key, values[-1])
                insert_sql += insert_part
                update_sql += update_part
                data_json[json_key] = values[-1]

            elif value_type == 'vint':
                if tools.get_english_words(key):
                    # The literal is handed to the database for evaluation
                    # when tools.get_english_words() reports something.
                    sql = 'select %s from dual' % key
                    value = self._aim_db.find(sql)[0][0]
                    values.append(value)
                    data_json[json_key] = values[-1]
                else:
                    values.append(key)
                    # NOTE(review): eval() of the key_map literal.
                    data_json[json_key] = eval(values[-1])

                insert_sql += '%s, '
                update_sql += aim_key + " = %s, " % values[-1]

            elif value_type == 'vstr':
                values.append(key)
                insert_sql += "'%s', "
                update_sql += aim_key + " = '%s', " % values[-1]
                data_json[json_key] = values[-1]

            elif value_type == 'vdate':
                values.append(key)
                insert_part, update_part = self._date_sql(aim_key, values[-1])
                insert_sql += insert_part
                update_sql += update_part
                data_json[json_key] = values[-1]

            elif value_type == 'sint':
                value = self._aim_db.find(key, fetch_one=True)[0]
                values.append(value)
                insert_sql += '%s, '
                update_sql += aim_key + " = %s, " % value
                data_json[json_key] = values[-1]

            elif value_type == 'sstr':
                value = self._aim_db.find(key, fetch_one=True)[0]
                values.append(value)
                insert_sql += "'%s', "
                update_sql += aim_key + " = '%s', " % value
                data_json[json_key] = values[-1]

            else:
                error_msg = '%s不符合key_map规定格式' % value_type
                raise (Exception(error_msg))

        # Drop the trailing ", " and fill the placeholders.
        insert_sql = insert_sql[:-2] + ")"
        insert_sql = insert_sql % tuple(values)

        if self._unique_key_mapping_source_key:
            return insert_sql, update_sql[:-2], data_json
        else:
            return insert_sql, data_json

    def _mark_as_read(self, data):
        """Flag a source record as exported when read-status tracking is on."""
        # Without a MongoDB client (in-memory datas) there is nothing to flag.
        if self._update_read_status and self._mongodb:
            self._mongodb.update(self._source_table, data,
                                 {'read_status': 1})

    # @tools.run_safe_model(__name__)
    def __export(self):
        """Write all pending records and return inserted + updated count."""
        if self._unique_key and not self._is_set_unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)
            self._is_set_unique_key = True

        # Create the ES client only when a sync was actually requested.
        if self._sync_to_es and not self._es:
            self._es = ES()

        if self._mongodb:
            datas = self._mongodb.find(
                self._source_table, condition=self._condition)
        elif isinstance(self._datas, list):
            datas = self._datas
        else:
            datas = [self._datas]

        for data in datas:
            if self._unique_key_mapping_source_key:
                insert_sql, update_sql, data_json = self.make_sql(data)
            else:
                insert_sql, data_json = self.make_sql(data)

            # Defaults bind the current record so the handler never sees a
            # later iteration's values.
            def exception_callfunc(e, data=data, insert_sql=insert_sql):
                if 'ORA-00001' in str(e):
                    # Error text contains ORA-00001: still mark the source
                    # record as read (presumably a duplicate-key insert).
                    self._mark_as_read(data)
                else:
                    log.error(insert_sql)

            execute_type = ExportData.EXCEPTION
            sql = ''
            if self._aim_db.add(insert_sql, exception_callfunc):
                self._export_count += 1
                sql = insert_sql
                execute_type = ExportData.INSERT

                self._mark_as_read(data)

            elif self._unique_key_mapping_source_key:
                # Look up the id of the row that is already in the target.
                aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]

                value = tuple(self._unique_key_mapping_source_key.values())[0]
                temp_var = value.split('_', 1)

                source_key_types = temp_var[0]
                source_key = temp_var[1]

                select_sql = 'select id from ' + self._aim_table
                if source_key_types == 'str':
                    # Escape quotes the same way make_sql does.
                    select_sql = select_sql + " where %s = '%s'" % (
                        aim_key, str(data[source_key]).replace("'", "''"))
                elif source_key_types == 'int':
                    select_sql = select_sql + " where %s = %s" % (
                        aim_key, data[source_key])
                # NOTE(review): any other type leaves the select without a
                # where clause, so the first id of the table is used.

                data_id = self._aim_db.find(select_sql)
                if data_id:
                    data_id = data_id[0][0]
                else:
                    continue

                # Complete the update statement.
                update_sql += " where id = %s" % data_id
                log.debug(update_sql)

                # Remove the "id = xxx," assignment so the row keeps its id.
                id_info = ''.join(
                    tools.get_info(update_sql, [' id .*?,', ' ID .*?,']))
                update_sql = update_sql.replace(id_info, '')

                # Keep the ES document id in line with the existing row.
                if "ID" in data_json.keys():
                    data_json["ID"] = data_id

                if self._aim_db.update(update_sql):
                    self._update_count += 1
                    sql = update_sql
                    execute_type = ExportData.UPDATE

                    self._mark_as_read(data)

            # Sync to ES.
            if self._sync_to_es and execute_type != ExportData.EXCEPTION:
                self._es.add(table=self._aim_table,
                             data=data_json,
                             data_id=data_json.get('ID'))

            if self._callback:
                self._callback(execute_type, sql, data_json)

        log.debug('''
            共导出%s条数据
            共更新%s条数据
            ''' % (self._export_count, self._update_count))

        return self._export_count + self._update_count

    def close(self):
        """Close the target database connection, if one was opened."""
        if self._aim_db is not None:
            self._aim_db.close()

Ejemplo n.º 27

0

Mostrar archivo

import sys

sys.path.append('..')
import init
import base.constance as Constance
import utils.tools as tools
from utils.log import log
from db.mongodb import MongoDB
from db.elastic_search import ES

# Module-level clients, created once at import time. `db` is the MongoDB
# handle used by remove_table() and reset_table(); where `es` is used is not
# visible in this excerpt.
db = MongoDB()
es = ES()


def remove_table(tab_list):
    """Call ``db.delete`` once for every table name in ``tab_list``."""
    for table_name in tab_list:
        db.delete(table_name)


def reset_table(tab_list):
    """Call ``db.update`` with the two status dicts for every listed table.

    NOTE(review): presumably records with status 3 are set back to status 0
    (filter first, new values second) — confirm against ``MongoDB.update``.
    """
    for table_name in tab_list:
        db.update(table_name, {'status': 3}, {'status': 0})


def add_url(table,
            site_id='',
            url='',
            depth=0,
            remark='',
            status=Constance.TODO,
            title='',

Ejemplo n.º 28

0

Mostrar archivo

Archivo: wechat_service.py Proyecto: tyhg001/wechat-spider

class WechatService():
    '''
    Hands out WeChat official accounts to crawl and stores crawl results.

    All state lives in class attributes, so every instance shares the same
    clients, the same account queue and the same availability flags.
    NOTE(review): the clients below are created when the class body is
    executed, i.e. at import time.
    '''
    _db = OracleDB()
    _es = ES()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    # Accounts waiting to be handed out, and the rownum the next batch query
    # starts from.
    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # one full round over the accounts has finished
    _is_all_done = False  # everything the accounts published today has been crawled

    # wechat_sogou availability flag and the timestamp recorded when it was
    # last found blocked / last retried (see is_have_new_article)
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # wechat_public_platform availability flag and the matching timestamp
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        '''Nothing to set up: all state is held on the class.'''
        pass

    def __load_todo_account(self):
        '''
        Refill ``_todo_accounts`` with the next batch of rows from
        TAB_IOPM_SITE when the queue is empty.

        Rows are fetched in windows of SIZE rownums starting at ``_rownum``.
        An empty window either ends a round (``_is_done``) or, when it is the
        first window, switches to the query without the today_msg filter
        (``_is_all_done``); in both cases the method calls itself again.

        NOTE(review): if the first window is still empty after
        ``_is_all_done`` is set, nothing changes between calls and the
        recursion never terminates (RecursionError) — confirm the table can
        never be empty for these filters.
        '''
        if not WechatService._todo_accounts:
            sql = ''
            if not WechatService._is_all_done:
                sql = '''
                    select *
                       from (select rownum r, t.id, t.domain, t.biz, t.name
                               from TAB_IOPM_SITE t
                              where t.biz is not null and mointor_status = 701 and (today_msg is null or today_msg = 0) and rownum < {size})
                      where r >= {rownum}
                    '''.format(rownum=WechatService._rownum,
                               size=WechatService._rownum + SIZE)
            else:  # everything published today has already been crawled
                sql = '''
                    select *
                       from (select rownum r, t.id, t.domain, t.biz, t.name
                               from TAB_IOPM_SITE t
                              where t.biz is not null and mointor_status = 701 and rownum < {size})
                      where r >= {rownum}
                    '''.format(rownum=WechatService._rownum,
                               size=WechatService._rownum + SIZE)

            print(sql)
            results = WechatService._db.find(sql)
            if not results:
                if WechatService._rownum == 1:
                    # Everything published today has been crawled; the
                    # crawler rests and resumes tomorrow.
                    WechatService._is_all_done = True  # lets WeichatAction set its sleep time
                    # Load the accounts for the next day.
                    self.__load_todo_account()

                else:
                    # Ran past the last row: the round is over, start again
                    # from the first window.
                    WechatService._is_done = True
                    WechatService._rownum = 1
                    self.__load_todo_account()

            else:
                WechatService._todo_accounts = collections.deque(
                    results)  # turn the rows into a queue
                WechatService._rownum += SIZE

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: Check whether the account has newly published articles.
            Sogou WeChat is asked first (or retried once its block is older
            than TIME_INTERVAL); when that yields nothing or a verification
            code, the WeChat public platform is asked in the same way.
        ---------
        @param account_id: passed to the Sogou check as account_id
        @param account_name: passed to the Sogou check as account
        @param __biz: passed to the public-platform check
            NOTE(review): a double-underscore parameter inside a class is
            name-mangled, so callers outside the class cannot pass it by
            keyword.
        ---------
        @result: the value returned by the last check that ran (compared
            against constance.UPDATE / NOT_UPDATE / ERROR /
            VERIFICATION_CODE), or '' when no check ran
        '''

        result = ''
        if WechatService._wechat_sogou_enable:  # Sogou WeChat is available
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # New articles were published: crawl them.
                pass

            elif result == constance.NOT_UPDATE:
                # No new articles.
                pass

            elif result == constance.ERROR:
                pass

            elif result == constance.VERIFICATION_CODE:
                # Blocked, the request failed: record when it happened.
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
                )

        # Sogou WeChat has been disabled for longer than TIME_INTERVAL
        # (24 hours according to the original comment), so retry it.
        elif tools.get_current_timestamp(
        ) - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:  # Sogou is disabled but has rested long enough for one more try
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # Sogou WeChat works again.
                WechatService._wechat_sogou_enable = True

            elif result == constance.NOT_UPDATE:
                pass

            elif result == constance.ERROR:
                pass

            elif result == constance.VERIFICATION_CODE:
                pass

            # Refresh the timestamp so the next retry waits a full interval.
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
            )

        # If Sogou WeChat gave no usable answer, ask the WeChat public
        # platform whether new articles exist.
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:  # the public platform is available
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # New articles were published: crawl them.
                    pass

                elif result == constance.NOT_UPDATE:
                    # No new articles.
                    pass

                elif result == constance.ERROR:
                    # Blocked, the request failed: record when it happened.
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                    )

            elif tools.get_current_timestamp(
            ) - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:  # the platform is disabled but has rested long enough for one more try
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # New articles were found: the platform works again.
                    WechatService._wechat_public_platform_enable = True

                elif result == constance.NOT_UPDATE:
                    # No new articles.
                    pass

                elif result == constance.ERROR:
                    # Still blocked; the flag stays off.
                    pass

                # Refresh the timestamp so the next retry waits a full interval.
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                )

        return result

    def get_next_account(self):
        '''
        @summary: Pop accounts from the queue until one should be crawled.
        ---------
        ---------
        @result: tuple (row[2], biz, is_done, is_all_done) for the chosen
            account; both flags are reset to False before returning.
            NOTE(review): the first element is named next_account_id but is
            taken from index 2, which is the ``domain`` column in the SELECT
            of __load_todo_account — confirm this is intended.
        '''

        while True:
            if not WechatService._todo_accounts:
                self.__load_todo_account()

            next_account_info = WechatService._todo_accounts.popleft()
            next_account_id = next_account_info[2]
            next_account_biz = next_account_info[3]
            next_account_name = next_account_info[4]

            next_account = next_account_id, next_account_biz, WechatService._is_done, WechatService._is_all_done

            if not WechatService._wechat_sogou_enable:
                log.debug('搜狗微信不可用')

            if not WechatService._wechat_public_platform_enable:
                log.debug('微信公众平台不可用')

            # Checking for new articles is switched off: take this account.
            if not CHECK_NEW_ARTICLE:
                break

            # Neither Sogou WeChat nor the public platform is usable: take it.
            if not WechatService._wechat_sogou_enable and not WechatService._wechat_public_platform_enable:
                break

            # With a 1-in-5 chance skip the check and crawl directly through
            # the WeChat client, so Sogou is not queried often enough to
            # trigger verification codes.
            if random.randint(1, 5) == 1:
                log.debug('跳出 防止搜狗微信被封')
                break

            # Check whether the account published anything today.
            result = self.is_have_new_article(next_account_id,
                                              next_account_name,
                                              next_account_biz)
            if result == constance.UPDATE:
                break
            elif result == constance.NOT_UPDATE:
                if WechatService._is_done:  # a whole round found no updates: stop here to avoid looping forever
                    break
                else:
                    # tools.delay_time(5)
                    continue
            elif result == constance.ERROR:
                break
            elif result == constance.VERIFICATION_CODE:
                break
            else:  # the update check is unavailable: crawl through the client directly
                break

        # Reset the _is_done and _is_all_done flags.
        WechatService._is_done = False
        WechatService._is_all_done = False

        return next_account

    def update_account_article_num(self, __biz):
        '''
        Count the account's articles in ES and write the counts back to
        TAB_IOPM_SITE (today_msg always, total_msg only when it is non-zero).

        NOTE(review): the update statement is built by string formatting, so
        ``__biz`` must not contain quotes; the "filtered" query form is
        presumably tied to an old Elasticsearch version — confirm.
        '''
        # Query ES for the counts.
        # Today:
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte":
                                tools.get_current_date('%Y-%m-%d') +
                                ' 00:00:00',
                                "lte":
                                tools.get_current_date('%Y-%m-%d') +
                                ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # Total number of articles ever recorded:
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        '''Return True when the ES lookup for data_id in table is truthy.'''
        if WechatService._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary: Log the main fields of an article and add it to ES under
            'wechat_article'.
        ---------
        @param article_info: dict; 'title', 'release_time', 'author',
            'account' and 'url' must be present (they are read with []),
            'article_id' is used as the third argument to the ES add
        ---------
        @result: None
        '''

        log.debug('''
            -----文章信息-----
            标题     %s
            发布时间 %s
            作者     %s
            公众号   %s
            url      %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))

        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        '''Log an account record and add it to ES under 'wechat_account',
        passing its '__biz' entry as the third argument.'''
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))

Ejemplo n.º 29

0

Mostrar archivo

    def __init__(self):
        """Initialise the base class, the ES and Redis clients and the counter."""
        super(SyncArtice, self).__init__()

        self._es = ES()
        self._redis = RedisDB()
        # Starts at zero; where it is incremented is not visible in this excerpt.
        self._sync_count = 0

Ejemplo n.º 30

0

Mostrar archivo

class HotSync():
    '''
    @summary: Attach articles to hot topics stored in tab_iopm_hot_info.
    An article either bumps the counters of a sufficiently similar hot topic
    published on the same day, or becomes a new hot topic itself.
    '''

    def __init__(self):
        self._es = ES()

    @staticmethod
    def _release_day(release_time):
        '''
        @summary: Return the date part of a "<date> <time>" string
        ---------
        @param release_time: e.g. "2018-05-06 12:30:00"; a value without a
            space (date only) is returned unchanged
        ---------
        @result: everything before the first space
        '''
        # str.find() returns -1 when there is no space, so slicing with it
        # would silently drop the last character of a date-only value.
        return release_time.partition(' ')[0]

    def _get_today_hots(self, text, release_time):
        '''
        @summary: Search the hot topics released on the same day whose title matches text
        ---------
        @param text: text matched against the TITLE field
        @param release_time: release time of the article, "<date> <time>"
        ---------
        @result: list of ES hits (each carrying _source), in the order ES returns them
        '''
        release_day = self._release_day(release_time)

        # NOTE(review): the "filtered" query only exists in old Elasticsearch
        # releases - confirm the cluster version before upgrading.
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # news released on that day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            # share of the keywords that has to match
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))
                        }
                    }
                }
            },
            "_source": [
                "ID", "TITLE", "CONTENT", "HOT", "CLUES_IDS", "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT"
            ],
            "highlight": {
                "fields": {
                    "TITLE": {}
                }
            }
        }

        # No explicit sort: results come back ordered by match score.
        hots = self._es.search('tab_iopm_hot_info', body)

        return hots.get('hits', {}).get('hits', [])

    @staticmethod
    def _pick_similar_hot(article_text, hots):
        '''
        @summary: Return the _source of the best hit when it is similar enough, else None
        ---------
        @param article_text: cleaned article title
        @param hots: hits as returned by _get_today_hots
        ---------
        @result: hot topic dict or None
        '''
        if not hots:
            return None

        # Hits are sorted by match score, so only the first one is compared.
        hot = hots[0].get('_source')
        hot_text = tools.del_html_tag(hot.get('TITLE'))

        similarity = compare_text(article_text, hot_text)
        if similarity > max(MIN_SIMILARITY, 0):
            return hot
        return None

    def _bump_hot(self, similar_hot, article_info):
        '''
        @summary: Add one article to an existing hot topic and store the new counters
        ---------
        @param similar_hot: _source of the matched hot topic
        @param article_info: the article being attached
        ---------
        @result:
        '''
        data = {}

        # Heat of the hot topic
        data["HOT"] = similar_hot["HOT"] + 1

        # Mainstream-media count and negative-sentiment count
        data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
            1 if article_info["IS_VIP"] else 0)
        data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
            'NEGATIVE_EMOTION_COUNT'] + (1 if article_info['EMOTION'] == 2
                                         else 0)

        # Relevance weight is only recomputed when the hot topic has clues
        if similar_hot['CLUES_IDS']:
            url = IOPM_SERVICE_ADDRESS + 'related_sort'
            data_args = {
                'hot_id': similar_hot['ID'],  # hot topic id
                'hot_value': data['HOT'],  # heat value
                'clues_id': similar_hot['CLUES_IDS'],  # clue ids matched by the hot topic
                'article_count': data['HOT'],  # total number of articles
                'vip_count': data["VIP_COUNT"],  # mainstream-media count
                'negative_emotion_count':
                data["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                'zero_ids': article_info['ZERO_ID']
            }

            result = tools.get_json_by_requests(url, data=data_args)
            if result:
                data['WEIGHT'] = result.get('weight', 0)

        self._es.update_by_id("tab_iopm_hot_info",
                              data_id=similar_hot.get("ID"),
                              data=data)

    def _add_hot(self, article_info):
        '''
        @summary: Store a copy of the article as a brand new hot topic
        ---------
        @param article_info: the article to promote
        ---------
        @result: id of the new hot topic (the article id)
        '''
        hot_info = deepcopy(article_info)
        # The hot table has no hot_id column; tolerate articles that never had one.
        hot_info.pop('HOT_ID', None)

        # User-action counters start at zero
        hot_info['ACCEPT_COUNT'] = 0
        hot_info['UNACCEPT_COUNT'] = 0
        hot_info['WATCH_COUNT'] = 0

        # Counters derived from the article itself
        hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
        hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info[
            'EMOTION'] == 2 else 0

        hot_info['HOT'] = 1
        hot_info['ID'] = article_info.get("ID")

        self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

        return hot_info['ID']

    def get_hot_id(self, article_info):
        '''
        @summary: Find or create the hot topic an article belongs to
        ---------
        @param article_info: article dict; reads TITLE, RELEASE_TIME, ID,
            IS_VIP, EMOTION (2 is counted as negative), ZERO_ID and drops
            HOT_ID when present
        ---------
        @result: id of the matched or newly created hot topic
        '''
        article_text = tools.del_html_tag(article_info.get("TITLE"))
        release_time = article_info.get("RELEASE_TIME")

        hots = self._get_today_hots(article_text, release_time)
        similar_hot = self._pick_similar_hot(article_text, hots)

        if not similar_hot:
            # Nothing similar enough: the article becomes a hot topic itself
            return self._add_hot(article_info)

        # Skip the update when the article is the hot topic itself, so the
        # same article is not counted more than once.
        if similar_hot["ID"] != article_info["ID"]:
            self._bump_hot(similar_hot, article_info)

        return similar_hot.get("ID")