Ejemplo n.º 1
0
class ProvinceFilter():
    """Find which known place names of a province occur in a text.

    The candidate names are loaded from Oracle once, when the instance is
    created.
    """

    def __init__(self, province_name=PROVINCE):
        """Load the place names to match against.

        @param province_name: province to restrict matching to. A falsy value
            means "whole country": every province name is loaded instead.
        """
        self._province_airs = []
        self._db = OracleDB()

        if not province_name:
            # Nationwide: every province name is a candidate.
            for row in self.load_province():
                self._province_airs.append(row[0])
        else:
            # The province itself plus each of its areas. Town names are not
            # loaded here (load_province_town is not called).
            self._province_airs.append(province_name)
            province_id = self.load_province_id(province_name)
            if province_id:
                for row in self.load_province_air(province_id):
                    self._province_airs.append(row[0])

        print(self._province_airs)

    def load_province_id(self, province_name):
        """Return the id of the first returned province whose name contains
        province_name, or None when the query yields nothing."""
        sql = "select t.id from TAB_MANAGE_PROVINCE_INFO t where t.province_name like '%{province_name}%'".format(
            province_name=province_name)
        rows = self._db.find(sql)
        if rows:
            province_id = rows[0][0]
        else:
            province_id = None

        if not province_id:
            log.debug('TAB_MANAGE_PROVINCE_INFO 无 %s 省份' % province_name)

        return province_id

    def load_province(self):
        """Return the rows of all province names."""
        return self._db.find(
            "select province_name from TAB_MANAGE_PROVINCE_INFO")

    def load_province_air(self, province_id):
        """Return the area-name rows that belong to province_id."""
        sql = "select t.area_name from TAB_MANAGE_AREA_INFO t where t.province_id = %s" % province_id
        return self._db.find(sql)

    def load_province_town(self, province_id):
        """Return the town-name rows that belong to province_id."""
        sql = "select t.town_name from TAB_MANAGE_TOWN_INFO t where t.province_id = %s" % province_id
        return self._db.find(sql)

    def find_contain_air(self, text):
        """Return the distinct loaded place names that occur in text.

        The order of the returned list is not defined.
        """
        matched = [name for name in self._province_airs if name in text]
        return list(set(matched))
Ejemplo n.º 2
0
class VipChecked(Singleton):
    """Check whether a piece of text refers to a VIP site.

    VIP site names come from the ``keyword2`` column of TAB_IOPM_CLUES rows
    whose ``zero_id`` is 7; each value is a comma-separated list of names.
    """

    def __init__(self):
        super(VipChecked, self).__init__()
        # Load only when this object has not been populated yet. Presumably
        # Singleton returns the same object on every construction, so
        # __init__ can run more than once -- TODO confirm.
        if not hasattr(self, '_vip_sites'):
            self._vip_sites = set()
            self._oracledb = OracleDB()
            self.load_vip_site()

    def load_vip_site(self):
        """Read the VIP site names from Oracle into ``self._vip_sites``."""
        sql = 'select to_char(t.keyword2) from TAB_IOPM_CLUES t where zero_id = 7'
        for row in self._oracledb.find(sql) or []:
            names = row[0]
            if not names:
                # keyword2 is NULL/empty for this clue: nothing to split.
                continue
            # Skip the empty pieces produced by leading/trailing/double commas.
            self._vip_sites.update(name for name in names.split(',') if name)

    def is_vip(self, content):
        """Return 1 if content matches a VIP site name, else 0.

        A match means one string contains the other, in either direction.
        Empty or None content never matches.
        """
        if not content:
            return 0

        return int(any((site in content) or (content in site)
                       for site in self._vip_sites))
Ejemplo n.º 3
0
class Keywords():
    """Expose the clue keywords stored in TAB_IOPM_CLUES as a flat list."""

    def __init__(self):
        self._oracledb = OracleDB()
        self._clues = self.get_clues()

    def get_clues(self):
        """Return the clue rows, excluding zero_id 7.

        Each row is (id, keyword2, keyword3, zero_id, first_id, second_id).
        """
        sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.zero_id, FIRST_ID, second_id  from TAB_IOPM_CLUES t where zero_id != 7'  # 7 is the "propagation channel" category
        return self._oracledb.find(sql)

    def get_keywords(self):
        """Return every formatted keyword2 entry of every clue.

        '&' inside an entry is replaced by a space, so '总理&主席' becomes
        the single string '总理 主席'.
        """
        keywords = []

        for clue in self._clues or []:
            # Only keyword2 (column 1) is used; the other columns are ignored.
            for key in format_keywords(clue[1]):
                keywords.append(key.replace('&', ' '))

        return keywords
def get_clues():
    """Export the clues of TAB_IOPM_CLUES (zero_id != 7) and return them.

    Writes one '"<name>","<include keywords>"' line per clue to
    clues/clues.csv, then runs the shell command ``start clues\\``
    (presumably to open the output folder on Windows).

    @result: dict with "message", "status" and "data"; "data" holds one dict
        per clue with the keys 线索id, 包含, 不包含 and 线索.
    """

    def clean(text):
        # Normalise one keyword column. A value ending in ',' gets its double
        # quotes turned into '“', every '、' removed and the trailing comma
        # dropped; any other value just loses its double quotes.
        if not text:
            return ""
        if text[-1] == ',':
            return text.replace('"', '“').replace('、', '')[:-1]
        return text.replace('"', '')

    db = OracleDB()
    sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.name  from TAB_IOPM_CLUES t where zero_id != 7'  # 7 is the "propagation channel" category
    results = db.find(sql)

    clues_json = {
        "message": "查询成功",
        "status": 1,
        "data": []
    }

    # `with` closes the CSV even when a malformed row raises midway.
    with open('clues/clues.csv', 'w+', encoding='utf8') as csv_file:
        csv_file.write("线索,关键词\n")

        for result in results:
            print(result)
            data = {
                "线索id": result[0] if result[0] else "",
                "包含": clean(result[1]),
                "不包含": clean(result[2]),
                "线索": result[3] if result[3] else ""
            }

            print(data)
            clues_json["data"].append(data)
            csv_file.write('"%s","%s"\n' % (data['线索'], data['包含']))

    os.system('start clues\\')

    return clues_json
Ejemplo n.º 5
0
class CompareKeywords():
    """Match a text against the clue keywords stored in TAB_IOPM_CLUES."""

    def __init__(self):
        self._oracledb = OracleDB()
        self._clues = self.get_clues()

    def get_clues(self):
        """Return the clue rows, excluding zero_id 7.

        Each row is (id, keyword2, keyword3, zero_id, first_id, second_id).
        """
        sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.zero_id, FIRST_ID, second_id  from TAB_IOPM_CLUES t where zero_id != 7'  # 7 is the "propagation channel" category
        return self._oracledb.find(sql)

    def get_contained_keys(self, text):
        '''
        @summary: find the clue keywords that occur in text
        ---------
        @param text: the text to compare against
        ---------
        @result: tuple of (keywords, clue ids, zero ids, first ids,
                 second ids, keyword_clues). The first five are de-duplicated,
                 comma-joined strings in no defined order; keyword_clues maps
                 each matched unit keyword to the id of the last clue that
                 matched it.
        '''
        matched_keys = []
        matched_clue_ids = []
        matched_zero_ids = []
        matched_first_ids = []
        matched_second_ids = []
        keyword_clues = {}

        for clue in self._clues:
            clue_id = clue[0]
            key2 = clue[1]
            zero_id, first_id, second_id = clue[3], clue[4], clue[5]

            # e.g. ['新闻节目', '总理&主席', 'the xi factor']
            for key in format_keywords(key2):
                # An entry such as 总理&主席 matches only if every unit is
                # present in the text.
                units = key.split('&')
                if not all(unit in text for unit in units):
                    continue

                matched_keys.extend(units)
                matched_clue_ids.append(str(clue_id))
                matched_zero_ids.append(str(zero_id))
                matched_first_ids.append(str(first_id))
                matched_second_ids.append(str(second_id))
                keyword_clues.update(dict.fromkeys(units, clue_id))

        joined = tuple(','.join(set(values)) for values in (
            matched_keys, matched_clue_ids, matched_zero_ids,
            matched_first_ids, matched_second_ids))
        return joined + (keyword_clues,)
Ejemplo n.º 6
0
class SyncES():
    """Incrementally copy rows of Oracle tables into Elasticsearch.

    The highest id already exported per table is kept in ``self._max_id`` and
    persisted to STO_MAX_ID_FILE so a later run can resume from there.
    """

    def __init__(self):
        import ast  # local import: only needed to parse the checkpoint file

        self._es = ES()
        self._db = OracleDB()

        # The checkpoint file holds str() of a {table: max_id} dict (see
        # close()). literal_eval parses that without executing arbitrary
        # code, which eval() on file content would do.
        saved = tools.read_file(STO_MAX_ID_FILE)
        parsed = ast.literal_eval(saved) if saved else None
        self._max_id = parsed or {}

    def get_data(self, sql):
        """Run sql on Oracle and return the rows in to_json form."""
        return self._db.find(sql, to_json=True)

    def export_to_es(self, table, data, data_id):
        """Add one document to the ES index named after table."""
        self._es.add(table=table, data=data, data_id=data_id)

    def sync_data(self, table, step=20):
        '''
        @summary: export every row of table whose id is above the saved
                  checkpoint. The id column is set as primary key first.
        ---------
        @param table: table to export; also used as the ES table name
        @param step: number of rows fetched per batch
        ---------
        @result:
        '''

        max_id = self._max_id.get(table, 0)
        self._db.set_primary_key(table)

        while True:
            # Order inside the subquery and limit outside it: Oracle applies
            # ROWNUM before ORDER BY in a single query block, so filtering and
            # ordering together could return an arbitrary batch and skip rows.
            inner_sql = 'select * from (select * from %s where id > %d order by id) where rownum <= %d' % (
                table, max_id, step)
            datas = self.get_data(inner_sql)

            if not datas:
                break

            for data in datas:
                data_id = data['ID']
                data = tools.dumps_json(data)
                print(data)
                print(data_id)

                max_id = data_id

                self.export_to_es(table, data, data_id)

        # Record the new checkpoint before persisting it; writing the file
        # first would save the stale value.
        self._max_id[table] = max_id
        self.close()

    def close(self):
        """Persist the per-table checkpoints to STO_MAX_ID_FILE."""
        tools.write_file(STO_MAX_ID_FILE, str(self._max_id))
def main():
    """Run the weibo user spider for the active search tasks.

    Tasks are the TAB_MVMS_SEARCH_INFO rows with search_type 702 whose
    monitoring window contains the current date. Returns without starting the
    spider when there are none.
    """
    oracle = OracleDB()
    mongo = MongoDB()

    task_sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time  and search_type = 702'
    tasks = oracle.find(task_sql, fetch_one=False)
    if not tasks:
        log.debug('无任务 结束')
        return

    def begin_callback():
        log.info('\n********** WWA_weibo_user begin **********')
        mongo.delete('WWA_weibo_user_urls')

    def end_callback():
        # Export the collected content to Oracle.
        key_map = {
            'id': 'int__id',
            'name': 'str_name',
            'sex': 'int_sex',
            'summary': 'str_summary',
            'fans_count': 'int_fans_count',
            'blog_verified': 'str_blog_verified',
            'is_verified': 'int_is_verified',
            'account_url': 'str_url',
            'follow_count': 'int_follow_count',
            'image_url': 'str_image_url',
            'monitor_status': 'vint_401',
            'SEARCH_TYPE': 'vint_702',
            'region': 'str_area'
        }

        exporter = ExportData('WWA_weibo_user_info', 'tab_mvms_weibo_info',
                              key_map, 'account_url')
        exporter.export_to_oracle()
        log.info('\n********** WWA_weibo_user end **********')

    # Configure the spider.
    spider = Spider(
        tab_urls='WWA_weibo_user_urls',
        tab_site='WWA_site_info',
        tab_content='WWA_weibo_user_info',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params={'result_list': tasks},
    )

    # Register the parser and run.
    spider.add_parser(weibo_user_parser)
    spider.start()
Ejemplo n.º 8
0
class EventFilter(threading.Thread):
    """Detect which event types a text mentions, using a keyword table that
    the thread reloads from Oracle every hour once started."""

    def __init__(self):
        super(EventFilter, self).__init__()

        self._db = OracleDB()
        self._event_knowledges = self.load_event_knowledges()

    def run(self):
        """Reload the event keywords every 60 * 60 seconds, forever."""
        one_hour = 60 * 60
        while True:
            tools.delay_time(one_hour)
            print('更新事件知识库...')
            self._event_knowledges = self.load_event_knowledges()
            print('更新事件知识库完毕')

    def load_event_knowledges(self):
        '''
        @summary: load the (keyword, type) rows of the event knowledge table.
        Type codes:
        801 current affairs and politics
        802 society and people's livelihood
        803 education reform
        804 healthcare
        805 science and technology
        806 ideology (none)
        807 policies and regulations
        808 economy (none)
        809 ecological civilization
        810 sports (none)
        811 emergencies and safety (none)
        ---------
        ---------
        @result: rows of (keyword, type)
        '''
        return self._db.find(
            'select t.keyword, t.type from TAB_IOPM_EVENT_KNOWLEDEGE t')

    def find_contain_event(self, text):
        """Return the distinct event types, as strings, whose keyword occurs
        in text. The order of the list is not defined."""
        matched_types = {
            str(row[1])
            for row in self._event_knowledges
            if row[0] in text
        }
        return list(matched_types)
Ejemplo n.º 9
0
def main():
    """Dump every clue of TAB_IOPM_CLUES as JSON to ./clues.txt.

    The JSON is also printed. Each entry carries clues_id, keyword1,
    keyword2, keyword3 and zero_id.
    """

    def clean(text):
        # Normalise one keyword column. A value ending in ',' gets its double
        # quotes turned into '“', every '、' removed and the trailing comma
        # dropped; any other value just loses its double quotes.
        if not text:
            return ""
        if text[-1] == ',':
            return text.replace('"', '“').replace('、', '')[:-1]
        return text.replace('"', '')

    db = OracleDB()
    sql = 'select t.id clues_id,to_char(t.keyword1),to_char(t.keyword2),to_char(t.keyword3),t.zero_id  from TAB_IOPM_CLUES t'
    rows = db.find(sql)

    clues_json = {"message": "查询成功", "status": 1, "data": []}

    for row in rows:
        clues_json["data"].append({
            "clues_id": row[0] if row[0] else "",
            "keyword1": clean(row[1]),
            "keyword2": clean(row[2]),
            "keyword3": clean(row[3]),
            "zero_id": row[4] if row[4] else ""
        })

    clues_json = tools.dumps_json(clues_json)
    print(clues_json)

    tools.write_file('./clues.txt', clues_json)
def main():
    """Run the app-store search spider for the active search keywords.

    Keywords come from the TAB_MVMS_SEARCH_INFO rows with search_type 703
    whose monitoring window contains the current date; each row holds a
    comma-separated list. Returns without starting the spider when there are
    no such rows.
    """
    mongo = MongoDB()
    oracle = OracleDB()

    def begin_callback():
        mongo.delete('WWA_search_app_urls')
        log.info('\n********** wwa begin **********')

    def end_callback():
        log.info('\n********** wwa end **********')
        export_data.main()

    rows = oracle.find(
        'select keyword from TAB_MVMS_SEARCH_INFO where  MONITOR_START_TIME <= sysdate AND MONITOR_END_TIME >= sysdate and search_type=703'
    )
    if not rows:
        log.debug('无任务 结束')
        return

    # Flatten the comma-separated keyword lists of all rows.
    keywords = []
    for row in rows:
        keywords += row[0].split(',')

    # Configure the spider.
    spider = Spider(
        tab_urls='WWA_search_app_urls',
        tab_site='WWA_search_app_site_info',
        tab_content='WWA_search_app_content_info',
        content_unique_key='title',
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params={'keywords': keywords},
    )

    # Register the parsers in the original order, then run.
    spider.add_parser(yingyongbao_parser)
    spider.add_parser(android_market_parser)
    spider.add_parser(baidu_mobile_assistant_parser)
    spider.add_parser(mobile360_assistant_parser)
    spider.start()
Ejemplo n.º 11
0
def main():
    """Run the wechat account spider for the active search tasks.

    Tasks are the TAB_MVMS_SEARCH_INFO rows with search_type 701 whose
    monitoring window contains the current date, as (keyword, monitor_type)
    rows. Returns without starting the spider when there are none.
    """
    task_sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time  and search_type = 701'
    tasks = OracleDB().find(task_sql)  # [(keys, monitor_type), ...]
    if not tasks:
        log.debug('无任务 结束')
        return

    def begin_callback():
        log.info('\n********** WWA_wechat_account begin **********')
        MongoDB().delete('WWA_wechat_account_url', {})

    def end_callback():
        log.info('\n********** WWA_wechat_account end **********')
        export_data.account_main()

    # Configure the spider.
    spider = Spider(
        tab_urls='WWA_wechat_account_url',
        tab_site='WWA_wechat_site_info',
        tab_content='WWA_wechat_official_accounts',
        content_unique_key='account_id',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params={'result_list': tasks},
    )

    # Register the parser and run.
    spider.add_parser(wechat_account_parser)
    spider.start()
Ejemplo n.º 12
0
def main():
    """Run the wechat article spider over the monitored accounts.

    Accounts are the (account_id, monitor_type) rows of TAB_MVMS_WECHAT_INFO
    with monitor_status 402. Returns without starting the spider when there
    are none.
    """
    account_sql = 'select t.account_id, t.monitor_type from TAB_MVMS_WECHAT_INFO t where monitor_status = 402'
    accounts = OracleDB().find(account_sql)
    if not accounts:
        log.debug('无任务 结束')
        return

    def begin_callback():
        log.info('\n********** WWA_wechat_article begin **********')
        MongoDB().delete('WWA_wechat_article_url', {})

    def end_callback():
        log.info('\n********** WWA_wechat_article end **********')
        export_data.article_main()

    # Configure the spider; the account rows are handed to the parser as-is.
    spider = Spider(
        tab_urls='WWA_wechat_article_url',
        tab_site='WWA_wechat_site_info',
        tab_content='WWA_wechat_article',
        content_unique_key='title',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params=accounts,
    )

    # Register the parser and run.
    spider.add_parser(wechat_article_parser)
    spider.start()
Ejemplo n.º 13
0
def main():
    """Run the iqiyi search spider over every program in TAB_MMS_PROGRAM.

    Each program row is joined with its channel name and its type-2
    dictionary name and passed to the parser unchanged.
    """
    oracle = OracleDB()

    sql = '''
        select t.program_id, c.chan_name, program_name, d.name, t.image_url, t.official_blog
          from TAB_MMS_PROGRAM t
          left join tab_mam_chan c
            on c.chan_id = t.chan_id
          left join tab_mms_dictionary d
            on t.type = d.id
           and d.type = 2
    '''
    programs = oracle.find(sql)

    def begin_callback():
        log.info('\n********** news begin **********')

    def end_callback():
        log.info('\n********** news end **********')

    # Configure the spider.
    spider = Spider(
        tab_urls='mms_urls',
        begin_callback=begin_callback,
        end_callback=end_callback,
        delete_tab_urls=True,
        parser_params=programs,
    )

    # Only the iqiyi search parser is registered.
    spider.add_parser(iqiyi_search_parser)

    spider.start()
Ejemplo n.º 14
0
def main():
    """Run the weibo article spider over the monitored weibo accounts.

    Accounts are the (ID, monitor_type) rows of TAB_MVMS_WEIBO_INFO with
    monitor_status 402. Returns without starting the spider when there are
    none.
    """
    oracle = OracleDB()
    mongo = MongoDB()

    account_sql = 'select t.ID, t.monitor_type from TAB_MVMS_WEIBO_INFO t where monitor_status = 402'
    accounts = oracle.find(account_sql, fetch_one=False)
    if not accounts:
        log.debug('无任务 结束')
        return

    def begin_callback():
        log.info('\n********** WWA_weibo_info begin **********')
        mongo.delete('WWA_weibo_info_urls')

    def end_callback():
        # Export the collected content to Oracle.
        key_map = {
            'id': 'int__id',
            'release_time': 'date_release_time',
            'come_from': 'str_come_from',
            'content': 'clob_content',
            'image_url': 'str_image_url',
            'video_url': 'str_video_url',
            'transpond_count': 'int_transpond_count',
            'praise_count': 'int_praise_count',
            'check_status': 'vint_301',
            'weibo_id': 'int_weibo_id',
            'article_url': 'str_url',
            'violate_status': 'int_violate_id',
            'sensitive_id': 'int_sensitive_id',
            'record_time': 'date_record_time',
            'SEXY_IMAGE_STATUS': 'str_sexy_image_status'
        }
        export_condition = {'read_status': 0, "image_pron_status": 2}

        exporter = ExportData('WWA_weibo_info_info',
                              'tab_mvms_weibo_article_info',
                              key_map,
                              unique_key='ARTICLE_url',
                              condition=export_condition)
        exporter.export_to_oracle()
        log.info('\n********** WWA_weibo_info end **********')

    # Configure the spider; the account rows are handed to the parser as-is.
    spider = Spider(
        tab_urls='WWA_weibo_info_urls',
        tab_site='WWA_site_info',
        tab_content='WWA_weibo_info_info',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params=accounts,
    )

    # Register the parser and run.
    spider.add_parser(weibo_info_parser)
    spider.start()
Ejemplo n.º 15
0
class CheckNewArticle():
    """Detect wechat accounts that published new articles and queue them for
    crawling in the redis set 'wechat:account'."""

    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._wechat_sogo = WechatSogou()

    def get_wait_check_account(self):
        '''
        @summary: select the accounts that should be checked for new articles
        ---------
        @param :
        ---------
        @result: rows of (id, domain, name, last_article_release_time, biz),
                 with the release time formatted as 'yyyy-mm-dd hh24:mi:ss'
        '''
        # Accounts that finished crawling (spider_status 603) and whose last
        # article is at least two hours old (or unknown) are checked again.
        two_hours_ago = tools.timestamp_to_date(
            tools.get_current_timestamp() - 60 * 60 * 2)
        sql = '''
            select t.id,
                   t.domain,
                   t.name,
                   to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                   t.biz
              from TAB_IOPM_SITE t
             where t.biz is not null
               and mointor_status = 701
               and t.spider_status = 603
               and (t.last_article_release_time is null or
                   t.last_article_release_time <=
                   to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
        '''.format(two_hours_ago)

        accounts = self._oracledb.find(sql)

        # No finished account and no queued task in redis: accounts that are
        # not in status 603 may be lost tasks, so hand them out again.
        if not accounts and not self._redisdb.sget_count('wechat:account'):
            sql = '''
                select t.id,
                       t.domain,
                       t.name,
                       to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                       t.biz
                  from TAB_IOPM_SITE t
                 where t.biz is not null
                   and mointor_status = 701
                   and t.spider_status != 603
            '''

            accounts = self._oracledb.find(sql)

        return accounts

    def check_new_article(self, account):
        """Queue account for crawling if it published a newer article today.

        @param account: one row from get_wait_check_account():
            (oracle id, account id, account name, last release time, biz)

        When the latest release time is today or later and newer than the
        stored one, the site row is set to spider_status 601 with the new
        release time and the account tuple is added to 'wechat:account'.
        """
        oracle_id, account_id, account_name, last_article_release_time, biz = account

        article_release_time = self._wechat_sogo.get_article_release_time(
            account_id=account_id, account=account_name)
        print(article_release_time)
        if article_release_time:
            # A missing stored time compares as older than any real time.
            last_article_release_time = last_article_release_time or ''
            if article_release_time >= tools.get_current_date(
                    '%Y-%m-%d'
            ) and article_release_time > last_article_release_time:
                print('{} 有新文章发布,等待抓取。 发布时间:{}'.format(account_name,
                                                       article_release_time))

                sql = '''
                    update TAB_IOPM_SITE t set t.spider_status = 601,
                     t.last_article_release_time =
                           to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
                     where id = {}
                '''.format(article_release_time, oracle_id)

                # This method runs in several threads, so each call uses its
                # own connection; finally closes it even if the update fails.
                oracledb = OracleDB()
                try:
                    oracledb.update(sql)
                finally:
                    oracledb.close()

                # Add to redis, which serves as the wechat spider's task pool.
                data = (oracle_id, account_id, account_name,
                        last_article_release_time, biz)
                self._redisdb.sadd('wechat:account', data)
Ejemplo n.º 16
0
class TaskManager():
    """Move news crawl tasks from Oracle into redis and report on them."""

    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: number of pending urls in the redis url table
        ---------
        ---------
        @result: the count reported by redis
        '''
        pending = self._redisdb.zget_count(self._news_url_table)
        return pending

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary: count the urls seen at every depth
        ---------
        @param total_depth: number of depths to report (exclusive upper
               bound); depths are labelled from 1 in the result
        ---------
        @result: dict with one '第N层url数' entry per depth plus '总url数'
        '''
        report = {}
        running_total = 0

        for depth in range(total_depth):
            count = self._redisdb.sget_count(
                self._news_urls_dupefilter + str(depth))
            report['第%s层url数' % (depth + 1)] = count
            running_total += count

        report['总url数'] = running_total
        return report

    def get_task_from_oracle(self):
        """Read the monitored sites page by page and return them as url tasks.

        @result: list of url dicts with the keys site_id, url, depth, remark
                 and retry_times
        """
        tasks = []
        offset = 0

        while True:
            # Fetch one page of tasks.
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            page = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not page:
                break

            # Turn every row into a url task; column 0 (the site id) is
            # not used.
            for row in page:
                name, position, url, depth = row[1], row[2], row[3], row[4]
                tasks.append({
                    'site_id': 1,
                    'url': url,
                    'depth': 0,
                    'remark': {
                        'website_name': name,
                        'website_position': position,
                        'website_url': url,
                        'website_domain': tools.get_domain(url),
                        'spider_depth': depth
                    },
                    'retry_times': 0
                })

        return tasks

    def add_task_to_redis(self, tasks):
        """Queue every task whose url has not been seen before.

        Tasks without a url are skipped.
        """
        for task in tasks:
            url = task.get('url')
            if not url:
                continue

            url_id = tools.get_sha1(url)
            if not self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                continue

            self._redisdb.zadd(self._news_url_table, task, prioritys=0)
            # Set used for the per-depth url statistics.
            self._redisdb.sadd('news:news_urls_dupefilter0', url_id)

    def clear_task(self):
        """Delete the url fingerprint set and the per-depth statistic sets."""
        # '' is the fingerprint set itself; '0'..'4' are the per-depth sets.
        for suffix in ('', '0', '1', '2', '3', '4'):
            self._redisdb.sdelete('news:news_urls_dupefilter' + suffix)
Ejemplo n.º 17
0
class WechatSogou():
    """Look up wechat official accounts through the sogou weixin search and
    fill in the missing biz values of TAB_IOPM_SITE."""

    def __init__(self):
        self._db = OracleDB()

    @staticmethod
    def _escape_sql_text(value):
        """Return value as text that is safe between single quotes in SQL.

        Single quotes are doubled, the standard way to write a quote inside a
        SQL text literal. Non-string values are converted with str().
        """
        return str(value).replace("'", "''")

    def deal_null_biz(self):
        """Search every classify-2 site without a biz and store what is found.

        Waits 60 seconds after each account.
        """
        sql = 'select id, name, domain from TAB_IOPM_SITE t where classify = 2 and t.biz is null'
        accounts_info = self._db.find(sql)

        for row in accounts_info:
            print(row)
            _id = row[0]
            account = row[1]
            account_id = row[2]

            account_info = self.get_account_info(account_id, account)
            log.debug(tools.dumps_json(account_info))

            if account_info.get('__biz'):
                # Keep the values already stored; fill gaps from the search.
                account = account or account_info.get('account')
                account_id = account_id or account_info.get('account_id')
                biz = account_info.get('__biz') or ''

                # The values come from scraped pages, so escape quotes before
                # embedding them in the statement.
                sql = "update TAB_IOPM_SITE set name = '%s', domain = '%s', biz = '%s' where id = %s" % (
                    self._escape_sql_text(account),
                    self._escape_sql_text(account_id),
                    self._escape_sql_text(biz), _id)
                log.debug(sql)
                self._db.update(sql)

            elif not account_info.get('check_info'):
                log.debug('查无此公众号 :%s'% account)

            tools.delay_time(60)

    def get_account_info(self, account_id = '', account = ''):
        """Search sogou for an account and return what was found.

        @param account_id: wechat account id; used as the search keyword when
            given
        @param account: account name; used when account_id is empty
        @result: {'account', 'account_id', '__biz'} for a result whose name or
            id equals the keyword (case-insensitive; the last such result
            wins). Otherwise {'check_info': ...} holding the text captured
            from the page's seccodeInput field, if any.
        """
        keyword = account_id or account  # the account id takes priority
        keyword = keyword.lower()

        log.debug('search keywords ' + keyword)

        headers = {
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Cookie": "IPLOC=CN1100; ld=4yllllllll2zj$kYlllllVo3$xklllllWT89eyllll9lllllRklll5@@@@@@@@@@; SUV=00E3555B7B7CC4C55A0AA8195254D871; CXID=150E3ABE3C35F9E55217835F7720E719; ABTEST=8|1510801558|v1; LSTMV=418%2C28; LCLKINT=2070; ad=8kllllllll2zRlPflllllVoSynYlllllWT89eyllllwlllll9Cxlw@@@@@@@@@@@; SUID=C5C47C7B1508990A000000005A0AA818; weixinIndexVisited=1; JSESSIONID=aaa-1KvS1lhung8pB9v8v; sct=20; PHPSESSID=k3c9psast34njs32vjm3pas3l1; SUIR=E8E851562D28732A6B711C802DECBC6F; seccodeErrorCount=1|Tue, 28 Nov 2017 11:11:05 GMT; SNUID=A1A0181864613C6A610582E26446EC9A; successCount=1|Tue, 28 Nov 2017 11:11:22 GMT",
            "Host": "weixin.sogou.com"
        }

        proxies = ip_proxies.get_proxies()
        headers["User-Agent"] = ip_proxies.get_user_agent()

        url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_='%(keyword)
        html, request = tools.get_html_by_requests(url, headers = headers, proxies = proxies)

        # One block per account in the result list.
        regex = '<!-- a -->(.*?)<!-- z -->'
        account_blocks = tools.get_info(html, regex)

        # Placeholder text of the seccodeInput field, when the page has one.
        regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
        check_info = tools.get_info(html, regex, fetch_one = True)
        if check_info:
            log.debug('''取公众号列表 : %s
                         url : %s
                      '''%(check_info, url)
                      )

        account_info = {'check_info' : check_info}

        for account_block in account_blocks:
            regex = '<a.*?account_name.*?>(.*?)</a>'
            account = tools.get_info(account_block, regex, fetch_one = True)
            account = tools.del_html_tag(account)

            regex = '<label name="em_weixinhao">(.*?)</label>'
            account_id = tools.get_info(account_block, regex, fetch_one = True)

            regex = '<a.*?account_name.*?href="(.*?)">'
            account_url = tools.get_info(account_block, regex, fetch_one = True)
            account_url = account_url.replace('&amp;',"&")

            biz = ''
            if account.lower() == keyword or account_id.lower() == keyword:
                # Fetch the account page to read its biz value.
                # NOTE(review): these headers are built but never passed to
                # the request below -- confirm whether that is intended.
                headers = {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "zh-CN,zh;q=0.8",
                    "Host": "mp.weixin.qq.com",
                    "Connection": "keep-alive",
                    "Referer": "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%E6%B3%B8%E5%B7%9E%E7%94%B5%E8%A7%86%E5%8F%B0&ie=utf8&_sug_=n&_sug_type_=",
                    "Cookie": "RK=XbmCLga7Pm; pgv_pvi=9492080640; noticeLoginFlag=1; ua_id=D8NYmIGpieSNub9rAAAAAGNz-Z1l4qe4x5WdelXsnmk=; xid=f3e1fb8a5fe8452b1d60a4059706017a; openid2ticket_opcqcjrNnRf62olc2Aj4PIU2hq9E=iNiYDe6xyIQ59zJxdOH0fmku4sXhFTq299CHyxYNJH8=; mm_lang=zh_CN; uin=o0564773807; skey=@Q46eRUFUE; pt2gguin=o0564773807; ptisp=cnc; ptcz=8deaf5ec9f0b3c27516ab6b735a6f3af99bc3517b922f52917b0ed5c6d82002f; o_cookie=564773807; pgv_info=ssid=s5664129956; pgv_pvid=8949522462; pac_uid=1_564773807; sig=h017174242e513ba3ec2450e63ac7a82981b57f85995f81aa47747b23e28ab077954627089b9d7fc947; pgv_si=s7924323328",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Cache-Control": "max-age=0",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
                    "Upgrade-Insecure-Requests": "1"
                }

                proxies = ip_proxies.get_proxies()
                headers["User-Agent"] = ip_proxies.get_user_agent()

                html, request = tools.get_html_by_requests(account_url, proxies = proxies)
                print(html)
                regex = 'var biz = "(.*?)"'

                biz = tools.get_info(html, regex, fetch_one = True)

                log.debug('''
                    公众号名称          %s
                    公众号账号          %s
                    账号url             %s
                    __biz               %s
                    '''%(account, account_id, account_url, biz))

                account_info = {
                    'account' : account,
                    'account_id' : account_id,
                    '__biz' : biz,
                }

        return account_info
def add_anchor_info(table,
                    site_id,
                    title='',
                    name='',
                    image_url='',
                    room_id='',
                    room_url='',
                    video_path='',
                    watched_count='',
                    fans_count='',
                    sex='',
                    age='',
                    address='',
                    live_view=1,
                    watched_count_url=''):
    '''
    @summary: Save one live-stream anchor record, tagging it with a task id
              when the anchor name contains an active violation keyword.
    ---------
    @param table: table name
    @param site_id: site id
    @param title: title stored with the record
    @param name: anchor name (also the text scanned for violation keywords)
    @param image_url: cover image url
    @param room_id: room id
    @param room_url: url of the room page
    @param video_path: url of the room video stream
    @param watched_count: audience count
    @param fans_count: fans count
    @param sex: sex
    @param age: age
    @param address: anchor location (city)
    @param live_view: live status (0 not live, 1 live)
    @param watched_count_url: url that serves the real-time audience count
    ---------
    @result: None
    '''

    # Look up the violation rules whose monitoring window contains "now".
    from db.oracledb import OracleDB
    oracle_db = OracleDB()

    sql = 'select t.name, t.keyword, t.task_id from tab_nbsp_violate_knowledge t where t.monitor_start_time <= sysdate and sysdate <= t.monitor_end_time'
    # Each row is (rule name, comma separated keywords, task id).
    results = oracle_db.find(sql)

    task_id = 0
    violate_content = ''
    anchor_name = name or ''

    # No early exit across rules: when several rules match, the last one wins
    # (same as the original nested loop).
    for _rule_name, keywords, rule_task_id in results or []:
        keyword_list = (keywords or '').split(',')
        # Blank keywords (NULL column, "a,,b", trailing comma) are skipped,
        # otherwise an empty string would match every anchor name.
        if any(keyword.strip() and keyword in anchor_name
               for keyword in keyword_list):
            task_id = rule_task_id
            violate_content = name

    anchor_info_dict = {
        'site_id': site_id,
        'title': title,
        'task_id': task_id,
        'violate_content': violate_content,
        'name': name,
        'image_url': image_url,
        'sex': sex,
        'age': age,
        'address': address,
        'fans_count': fans_count,
        'watched_count': watched_count,
        'room_id': room_id,
        'room_url': room_url,
        'video_path': video_path,
        'live_view': live_view,
        'record_time': tools.get_current_date(),
        'watched_count_url': watched_count_url,
        'read_status': 0,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }

    if not db.add(table, anchor_info_dict):
        # Insert was rejected: fall back to updating the record matched by
        # room_id.
        # NOTE(review): '_id' is presumably injected by the db layer during
        # the insert attempt, and the image-review fields are presumably
        # dropped so the stored review result is not reset - confirm.
        # pop(..., None) keeps this path from raising KeyError when a key is
        # absent (e.g. the insert failed before '_id' was assigned).
        for key in ('_id', 'sexy_image_status', 'sexy_image_url',
                    'image_pron_status'):
            anchor_info_dict.pop(key, None)
        db.update(table, {'room_id': room_id}, anchor_info_dict)
def add_WWA_search_app_info(table,
                            site_id,
                            url,
                            title='',
                            summary='',
                            update_info='',
                            score='',
                            author='',
                            app_url='',
                            image_url='',
                            software_size='',
                            tag='',
                            platform='',
                            download_count='',
                            release_time='',
                            language='',
                            sensitive_id='',
                            read_status=0):
    '''
    @summary: Save one app record, but only when its title, summary, update
              info or author contains a configured search keyword
              (TAB_MVMS_SEARCH_INFO, search_type = 703).
              record_time is filled in with the current date.
    ---------
    @param table: table name
    @param site_id: site id
    @param url: url of the original page
    @param title: title
    @param summary: summary
    @param update_info: update information
    @param score: score
    @param author: author
    @param app_url: download url of the app
    @param image_url: image urls (several urls are comma separated)
    @param software_size: size
    @param tag: version
    @param platform: platform (ios / android)
    @param download_count: download count
    @param release_time: release time
    @param language: '中文' is stored as 601, '英文' as 602, anything else as 603
    @param sensitive_id: sensitive info ids (several ids are comma separated)
    @param read_status: read status (0 unread, 1 read)
    ---------
    @result: None
    '''

    # Filter out apps that match none of the configured keywords.
    from db.oracledb import OracleDB
    oracle_db = OracleDB()

    sql = 'select keyword from TAB_MVMS_SEARCH_INFO t where search_type = 703'
    # Each row holds one comma separated keyword string.
    results = oracle_db.find(sql)

    text_content = title + summary + update_info + author

    is_useful = False
    for row in results or []:
        keywords = (row[0] or '').split(',')
        # Blank keywords are ignored: '' is a substring of every text and
        # would let every app through the filter.
        if any(keyword.strip() and keyword in text_content
               for keyword in keywords):
            is_useful = True
            break

    if not is_useful:
        return

    language = {'中文': 601, '英文': 602}.get(language, 603)

    title = tools.del_html_tag(title)

    gameApp_info_dict = {
        'site_id': site_id,
        'url': url,
        'summary': tools.del_html_tag(summary, except_line_break=True),
        'title': title,
        'update_info': tools.del_html_tag(update_info, except_line_break=True),
        'score': score,
        'author': author,
        'app_url': app_url,
        'image_url': image_url,
        'software_size': software_size,
        'tag': tag,
        'platform': platform,
        'download_count': download_count,
        'release_time': release_time,
        'record_time': tools.get_current_date(),
        'language': language,
        'sensitive_id': sensitive_id,
        # Honour the caller's value; it used to be hard-coded to 0, which
        # silently ignored the parameter (the default is still 0).
        'read_status': read_status,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }
    db.add(table, gameApp_info_dict)
Ejemplo n.º 20
0
def main():
    '''
    @summary: Attach every article to its most similar hot topic.
              For each article the title is compared with every hot title;
              when the best similarity exceeds SIMILARITY the article is
              linked to that hot (whose counter is incremented and whose
              title becomes the shorter of the two texts), otherwise a new
              hot is created from the article title.
    ---------
    ---------
    @result: None
    '''
    db = OracleDB()

    def sql_text(value):
        # Double single quotes so a title containing ' stays inside the SQL
        # string literal instead of breaking (or injecting into) the statement.
        return str(value).replace("'", "''")

    # Load the articles (rows are: rownum, id, title).
    sql = '''
        select *
          from (select rownum r, id, title
                  from tab_iopm_article_info
                 where rownum >= 1)
         where r <= 100000
    '''
    articles = db.find(sql)

    # Load the hot topics (rows are: id, title).
    sql = 'select id, title from tab_iopm_hot_info'
    hots = db.find(sql)

    for article in articles or []:
        article_id = article[1]
        article_text = article[2]

        # Best match so far; similarity is expected in the range 0~1.
        best_similarity = 0
        best_hot_id = -1
        best_hot_title = ''

        for hot in hots or []:
            hot_id = hot[0]
            hot_text = hot[1]

            similarity = compare_text(hot_text, article_text)
            if similarity > best_similarity:
                best_similarity = similarity
                best_hot_id = hot_id
                # Keep the shorter of the two texts as the hot title.
                best_hot_title = article_text if len(hot_text) > len(
                    article_text) else hot_text

        if best_similarity > SIMILARITY:
            sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % (
                best_hot_id, article_id)
            db.update(sql)
            sql = "update tab_iopm_hot_info set hot = hot + 1, title = '%s' where id = %s" % (
                sql_text(best_hot_title), best_hot_id)
            db.update(sql)

        else:
            # No hot is similar enough: the article starts a new hot.
            sql = 'select sequence.nextval from dual'
            hot_id = db.find(sql)[0][0]
            sql = "insert into tab_iopm_hot_info (id, title, hot) values (%s, '%s', 1)" % (
                hot_id, sql_text(article_text))
            db.add(sql)
            sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % (
                hot_id, article_id)
            db.update(sql)

        # Reload the hots so the next article sees the row just added/updated.
        sql = 'select id, title from tab_iopm_hot_info'
        hots = db.find(sql)
Ejemplo n.º 21
0
def main():
    '''
    @summary: Run the VA search tasks whose monitoring window contains "now".
              Tasks are reset to 501 (todo) and their keywords to 601 (todo);
              then, task by task, one spider is started per pending keyword.
              Callbacks move the keyword through 602 (doing) / 603 (done) and
              the task through 502 (doing) / 503 (done); when the last keyword
              of a task is done the collected data is exported to oracle.
    ---------
    ---------
    @result: None (returns once no task with status 501 is left)
    '''
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    db = OracleDB()

    # Reset the tasks inside their monitoring window to "todo".
    sql = 'update tab_ivms_task_info t set t.task_status = 501 where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
    db.update(sql)

    # Reset the keywords of those tasks to "todo".
    sql = 'update tab_ivms_task_keyword k set k.finish_status = 601 where k.task_id in (select t.task_id from tab_ivms_task_info t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time)'
    db.update(sql)

    while True:
        # Fetch the next task to do.
        log.debug('查询任务...')

        sql = 'select t.task_id from TAB_IVMS_TASK_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and t.task_status = 501'
        result = db.find(sql, fetch_one=True)
        if not result:
            break

        task_id = result[0]
        dispatched = False  # becomes True once a spider was started for this task

        while True:
            # Wait while a keyword of this task is still being processed.
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 602' % task_id
            do_task = db.find(sql, fetch_one=True)
            if do_task:
                time.sleep(search_task_sleep_time)
                continue

            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 601' % task_id
            result = db.find(sql, fetch_one=True)
            if not result:
                break

            keyword_id = result[0]
            task_id = result[1]
            search_keyword1 = []
            search_keyword2 = result[2].split(',') if result[2] else []
            search_keyword3 = result[3].split(',') if result[3] else []

            # task_id / keyword_id are bound as default arguments so each
            # callback keeps the ids of the keyword it was created for, even
            # if the loops have rebound those names by the time it runs.
            # NOTE(review): this matters if Spider invokes the callbacks from
            # another thread after start() has returned - confirm.
            def begin_callback(task_id=task_id, keyword_id=keyword_id):
                log.info('\n********** VA begin **********')
                # Task status -> doing.
                sql = 'update TAB_IVMS_TASK_INFO set task_status = 502 where task_id = %d' % task_id
                db.update(sql)

                # Keyword status -> doing.
                sql = 'update tab_ivms_task_keyword set finish_status = 602 where id = %d' % keyword_id
                db.update(sql)

            def end_callback(task_id=task_id, keyword_id=keyword_id):
                # Keyword status -> done.
                sql = 'update tab_ivms_task_keyword set finish_status = 603 where id = %d' % keyword_id
                db.update(sql)

                # When no keyword of the task is left to do, export the data
                # and mark the task as done.
                sql = 'select t.* from tab_ivms_task_keyword t where task_id = %d and finish_status = 601' % task_id
                results = db.find(sql)
                if not results:
                    # Export the collected data.
                    key_map = {
                        'program_id': 'vint_sequence.nextval',
                        'search_type': 'int_search_type',
                        'program_name': 'str_title',
                        'program_url': 'str_url',
                        'release_date': 'date_release_time',
                        'image_url': 'str_image_url',
                        'program_content': 'str_content',
                        'task_id': 'vint_%d' % task_id,
                        'keyword': 'str_keyword',
                        'keyword_count': 'int_keyword_count',
                        'check_status': 'vint_202'
                    }

                    export = ExportData('VA_content_info',
                                        'tab_ivms_program_info', key_map,
                                        'program_url')
                    export.export_to_oracle()

                    # Task status -> done.
                    sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
                    db.update(sql)
                    log.info('\n********** VA end **********')

            # Configure the spider.
            spider = Spider(tab_urls='VA_urls',
                            tab_site='VA_site_info',
                            tab_content='VA_content_info',
                            parser_count=1,
                            begin_callback=begin_callback,
                            end_callback=end_callback,
                            search_keyword1=search_keyword1,
                            search_keyword2=search_keyword2,
                            search_keyword3=search_keyword3)

            # Register the parsers.
            spider.add_parser(baidu_parser)
            spider.add_parser(magnet_parser)
            spider.add_parser(netdisk_parser)
            spider.add_parser(weibo_parser)
            spider.add_parser(wechat_parser)
            spider.add_parser(soubaidupan_parser)
            spider.add_parser(douban_parser)

            spider.start()
            dispatched = True

            time.sleep(search_task_sleep_time)

        if not dispatched:
            # The task had no pending keyword at all, so no callback will ever
            # move it out of 501; without this the outer loop would select the
            # same task again and spin on the database forever.
            sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
            db.update(sql)
Ejemplo n.º 22
0
class WechatService():
    '''
    Hands out wechat accounts (rows of TAB_IOPM_SITE that have a biz) one at
    a time, page by page, and stores crawled articles / accounts in ES.
    The paging state lives on the class, so every instance shares it.
    '''
    # Accounts of the current page that were not handed out yet.
    _todo_accounts = collections.deque()
    # rownum of the first row of the next page to load (1-based).
    _rownum = 1

    # True once paging ran past the last row, i.e. a full round was finished.
    _is_done = False

    def __init__(self):
        self._db = OracleDB()
        self._es = ES()
        self.__load_todo_account()

    def __load_todo_account(self):
        '''
        @summary: Load the next page of accounts when the queue is empty.
                  When paging ran past the last row, _is_done is set and
                  loading restarts from the first row.
        ---------
        ---------
        @result: None (the queue stays empty when the table has no account)
        '''
        if WechatService._todo_accounts:
            return

        while True:
            start = WechatService._rownum
            sql = '''
                select *
                   from (select rownum r, t.id, t.domain, t.biz
                           from TAB_IOPM_SITE t
                          where t.biz is not null and rownum < {size})
                  where r >= {rownum}
                '''.format(rownum=start, size=start + SIZE)

            results = self._db.find(sql)
            if results:
                WechatService._todo_accounts = collections.deque(
                    results)  # turn the page into a queue
                WechatService._rownum += SIZE
                return

            WechatService._is_done = True
            if start == 1:
                # Even the first page is empty: there is nothing to wrap
                # around to. Stop here; retrying used to recurse forever.
                return
            WechatService._rownum = 1

    def get_next_account(self):
        '''
        @summary: Pop the next account from the queue, loading a new page
                  first when the queue is empty.
        ---------
        ---------
        @result: (domain, biz, is_done) - columns domain and biz of the row,
                 and whether a full round over all accounts was just
                 completed, e.g. ('xxx', 'biz', True)
        @raise IndexError: no account with a biz exists
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        if not WechatService._todo_accounts:
            raise IndexError('no wechat account with biz in TAB_IOPM_SITE')

        # Row layout follows the select list: (r, id, domain, biz).
        next_account_info = WechatService._todo_accounts.popleft()
        next_account_domain = next_account_info[2]
        next_account_biz = next_account_info[3]

        next_account = (next_account_domain, next_account_biz,
                        WechatService._is_done)
        # Reset the flag so "round finished" is reported only once.
        WechatService._is_done = False

        return next_account

    def is_exist(self, table, data_id):
        '''
        @summary: Tell whether ES returns something for data_id in table
                  (table is used both as index and as doc_type).
        ---------
        @param table: table name
        @param data_id: id of the document
        ---------
        @result: True / False
        '''
        return bool(self._es.get(table, data_id=data_id, doc_type=table))

    def add_article_info(self, article_info):
        '''
        @summary: Log the article and add it to 'wechat_article' in ES,
                  keyed by its 'article_id'.
        ---------
        @param article_info: dict describing the article
        ---------
        @result: None
        '''

        log.debug('''
            -----文章信息-----
            %s''' % tools.dumps_json(article_info))

        self._es.add('wechat_article', article_info,
                     article_info.get('article_id'))

    def add_account_info(self, account_info):
        '''
        @summary: Log the account and add it to 'wechat_account' in ES,
                  keyed by its '__biz'.
        ---------
        @param account_info: dict describing the account
        ---------
        @result: None
        '''
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))

        self._es.add('wechat_account', account_info, account_info.get('__biz'))