def main():
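    # Build a site -> priority_code lookup from the config_site table.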
    config_site = DbHelper.fetchmany(stmt='SELECT priority_code, site FROM config_site')
    config_site = {row['site']: row['priority_code'] for row in config_site}

    batch = 10
    site = 'qichacha_web'
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'

    '''
        parse_status_success: 1
        es_status: 1     -> success
        es_status: other -> unprocessed
    '''
    parse_status_success = 1
    es_status_success = 1
    select_company = " SELECT id, company_name, data_gsxx FROM qichacha_weixin_company where parse_status = %s and es_status !=%s limit %s "

    while True:

        items = DbHelper.fetchmany(stmt=select_company, data=(parse_status_success, es_status_success, batch))

        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item, config_site=config_site, site=site,
                                   index=index, doc_type=doc_type, es_status_success=es_status_success)
            except ConvertException as e:
                logging.getLogger().exception(e)
                raise e
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(10)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
def main():
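    # Move raw xysj_weixin crawl results into the common data table and mark them parsed.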
    site = 'xysj_weixin'
    batch = 100
    data_table = 'common_data_all'

    while True:
        stime = time.time()
        items = DbHelper.fetchmany(
            "select id, crawl_time, src_list from xysj_weixin_company where crawl_status = 1 and "
            " parse_status = 0  limit %s ", (batch, ))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, crawl_time, src_list = item[0], item[1], item[2]
            try:
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'info': src_list})
                logging.getLogger().info(" data stored ")
                DbHelper.execute(
                    "update xysj_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(" parse status updated ")

            except Exception as err:
                logging.getLogger().exception(err)
                continue

        logging.getLogger().info(
            " batch-parsing round finished, total cost: %s " %
            (time.time() - stime))
def main():
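    # Store the four raw Guangdong registry source pages (gongshang, qiye, other, judicial) into the common data table.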
    site = 'guangdong_weixin'
    batch = 100
    data_table = 'common_data_all'

    while True:

        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.src_gongshang, b.src_qiye, b.src_other, b.src_judicial from (
                select id from gs_guangdong_company where status = %s and parse_status = %s limit %s
            )as a left join (select id, update_time, src_gongshang, src_qiye, src_other, src_judicial from gs_guangdong_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, [15, 0, batch])
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = item['src_gongshang'], item[
                'src_qiye'], item['src_other'], item['src_judicial']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " %
                                         (item[0], ))
                '''
                1. Store the company data
                '''
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={
                                           u'gsgs': gsgs,
                                           u'qygs': qygs,
                                           u'bmgs': bmgs,
                                           u'sfgs': sfgs
                                       })
                logging.getLogger().info(" data stored ")
                '''
                2. Update the related fields, i.e. parse_status and data_table_name
                '''
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s where id = %s",
                    [2, company_id])
                continue

        logging.getLogger().info(
            " batch-parsing round finished, total cost: %s " %
            (time.time() - stime))
def main():
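    # Export parsed xizhi_web companies to the drdata_qyk index (the es_* naming suggests an Elasticsearch sink) until the table is drained.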
    retrieved_config_site = DbHelper.fetchmany(
        stmt='SELECT priority_code, site FROM config_site')
    config_site = {row['site']: row['priority_code']
                   for row in retrieved_config_site}

    batch, site, index, doc_type = 10, 'xizhi_web', 'drdata_qyk', 'BusinessInfo'
    #     parse_status_success: 3
    #     es_status: 1     -> success
    #     es_status: other -> unprocessed
    parse_status_success, es_status_success = 3, 1
    select_company = " SELECT id, company_name FROM xizhi_web_company where parse_status = %s and es_status !=%s limit %s "

    while True:

        items = DbHelper.fetchmany(stmt=select_company,
                                   data=(parse_status_success,
                                         es_status_success, batch))

        logging.info('batch begins, size: %s' % (len(items), ))
        batch_begin_time = time.time()

        if len(items) == 0:
            logging.info("no data on condition found in db")
            break

        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item,
                                   config_site=config_site,
                                   site=site,
                                   index=index,
                                   doc_type=doc_type,
                                   es_status_success=es_status_success)
            except ConvertException as e:
                logging.getLogger().exception(e)
                raise e
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(10)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))

        logging.info('batch ends, size: %s, cost: %s' %
                     (len(items), time.time() - batch_begin_time))
def main():
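    # Continuously index chuanzhong_web_company rows whose es_status is neither 'success' nor 'already exists'.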
    config_site = DbHelper.fetchmany(
        stmt='SELECT priority_code, site FROM config_site')
    config_site = {row['site']: row['priority_code'] for row in config_site}

    site = 'czzx_web'
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'
    '''
        es_status: 13 -> success
        es_status: 14 -> already exists
    '''
    es_status_success = 13
    es_status_exists = 14
    fetch_rows_limit = 2
    select_company = " select id,company_name,es_status from chuanzhong_web_company where es_status != %s and es_status != %s limit %s"

    while True:
        logging.info("round starts")
        items = DbHelper.fetchmany(stmt=select_company,
                                   data=(es_status_success, es_status_exists,
                                         fetch_rows_limit))

        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item,
                                   config_site=config_site,
                                   site=site,
                                   index=index,
                                   doc_type=doc_type,
                                   es_status_success=es_status_success,
                                   es_status_exists=es_status_exists)
            except ConvertException as ce:
                logging.getLogger().exception(ce)
                raise ce
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(60)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
        logging.info("round ends")
def main():
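    # Store raw qichacha_weixin detail pages into the common data table and mark them parsed.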
    site = 'qichacha_weixin'
    batch = 100
    data_table = 'common_data_all'
    while True:

        stime = time.time()

        sql = '''
            select b.id, b.update_time, b.data from (
                select id from qichacha_weixin_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, data from qichacha_weixin_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time, data = item['id'], item[
                'update_time'], item['data']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " %
                                         (item[0], ))
                '''
                1. Store the company data
                '''
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={u'detail': data})
                logging.getLogger().info(" data stored ")
                '''
                2. Mark the row as parsed and record the data-table name
                '''
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")

            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " batch-parsing round finished, total cost: %s " %
            (time.time() - stime))
def main():
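    # Parse chuanzhong_web_company detail pages (ids >= 809439) and store the result under the '基本信息' (basic info) key.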
    site = 'czzx_web'
    batch = 100
    data_table = 'common_data_all'
    while True:

        logging.getLogger().info(" Batch begins ")
        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.data, b.data_table_name from (
                select id from chuanzhong_web_company where id >= 809439 and parse_status = %s  limit %s
            ) as a left join (select id, update_time, data, data_table_name from chuanzhong_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time, data = item['id'], item[
                'update_time'], item['data']
            try:
                '''
                Store the company data
                '''
                result = json.dumps(parse(data),
                                    ensure_ascii=False,
                                    encoding='utf-8')
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={'基本信息': result})
                logging.getLogger().info(" data inserted ")
                '''
                Update the related fields, i.e. parse_status and data_table_name
                '''
                DbHelper.execute(
                    "update chuanzhong_web_company set parse_status = %s, data_table_name= %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update chuanzhong_web_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " batch-parsing round finished, total cost: %s " %
            (time.time() - stime))
def main():
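    # Walk wenshu_web result lists and re-crawl the detail page of every company-related court case.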
    batch = 1
    parse_status_success = 1
    crawl_status_success = 1

    while True:
        items = DbHelper.fetchmany(
            "SELECT id, web_data FROM wenshu_web WHERE parse_status != %s limit %s",
            (parse_status_success, batch))

        for item in items:

            case_list = json.loads(item['web_data'])

            for case in case_list:

                if 'Count' in case:
                    continue
                case_name = case[u'案件名称']

                if u'公司' not in case_name:
                    continue

                logging.info('starts to handle id: %s, case id: %s ' %
                             (item['id'], case[u'文书ID']))

                url = 'http://wenshu.court.gov.cn/content/content?DocID=%s' % (
                    case[u'文书ID'], )
                headers = None
                proxies = proxy.get_proxy("WenshuDetail")
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=15)
                if response.status_code != 200:
                    logging.info('case-fetch fails')
                    time.sleep(10)
                    continue
                content = response.text

                DbHelper.execute(
                    "INSERT INTO wenshu_web_detail(doc_id, summary, detail, crawl_status) VALUES (%s, %s, %s, %s)",
                    (case[u'文书ID'], json.dumps(case), content,
                     crawl_status_success))
                logging.info('case inserted')
                time.sleep(3)

            DbHelper.execute(
                'UPDATE wenshu_web SET parse_status = %s WHERE id = %s ',
                (parse_status_success, item['id']))
def main():
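    # Parse the four Shanghai registry source pages with IcWebParser and store each section separately.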
    site = 'shanghai_web'
    batch = 100
    data_table = 'common_data_all'

    while True:

        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.src_gsgs, b.src_qygs, b.src_bmgs, b.src_sfgs from (
                select id from gs_shanghai_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, src_gsgs, src_qygs, src_bmgs, src_sfgs from gs_shanghai_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (15, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = item['src_gsgs'], item['src_qygs'], item['src_bmgs'], item['src_sfgs']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (item[0],))

                result = IcWebParser().parsing([etree.HTML(text=gsgs),
                                                etree.HTML(text=qygs),
                                                etree.HTML(text=bmgs),
                                                etree.HTML(text=sfgs)])
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={u'gsgs': json.dumps(result[u'工商公示信息']),
                                             u'qygs': json.dumps(result[u'企业公示信息']),
                                             u'bmgs': json.dumps(result[u'其他部门公示信息']),
                                             u'sfgs': json.dumps(result[u'司法协助公示信息'])})
                logging.getLogger().info(" data inserted ")

                DbHelper.execute("update gs_shanghai_company set parse_status = %s, data_table_name= %s where id = %s",
                                 (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update gs_shanghai_company set parse_status = %s where id = %s", (2, company_id))
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
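    # Store raw tianyancha_web detail pages into the common data table and mark them parsed.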
    site = "tianyancha_web"
    batch = 200
    while True:

        logging.getLogger().info(" Batch begins ")
        stime = time.time()
        sql = '''
            select b.id, b.crawl_time, b.src_detail from (
                select id from tianyancha_web_company where crawl_status = %s and parse_status = %s  limit %s
            ) as a left join (select id, crawl_time, src_detail from tianyancha_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            logging.getLogger().info(" begin to parse company-id : %s " %
                                     (item[0], ))
            data_table = "common_data_all"

            try:
                # parse html page
                company_id, crawl_time, src_detail = item[0], item[1], item[2]
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'detail': src_detail})
                logging.getLogger().info(" data stored ")
                # update parse status
                DbHelper.execute(
                    "update tianyancha_web_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, item[0]))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update tianyancha_web_company set parse_status = %s  where id = %s ",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " batch-parsing round finished, total cost: %s " %
            (time.time() - stime))
def main():
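    # Store qixin_weixin homepage, basic-info and change-info source pages into the common data table.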
    batch = 100
    site = 'qixin_weixin'
    data_table = 'common_data_all'
    while True:

        stime = time.time()
        sql = '''
            select b.id, b.crawl_time, b.src_homepageinfo, b.src_basicinfo, b.src_changeinfo from (
                select id from qixin_weixin_company where crawl_status = %s and parse_status = %s  limit %s
            ) as a left join (select id, crawl_time, src_homepageinfo,src_basicinfo,src_changeinfo from qixin_weixin_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (7, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, crawl_time = item['id'], item['crawl_time']
            pageinfo, basicinfo, changeinfo = item['src_homepageinfo'], item['src_basicinfo'], item['src_changeinfo']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (company_id,))

                '''
                1. Store the company data
                '''
                StoreHelper.store_data(data_table=data_table, company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'pageinfo': pageinfo, u'basicinfo': basicinfo, u'changeinfo': changeinfo})
                logging.getLogger().info(" data stored ")
                '''
                2. Mark the row as parsed and record the data-table name
                '''
                DbHelper.execute("update qixin_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                                 (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update qixin_weixin_company set parse_status = %s where id = %s", [2, company_id])
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
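    # Parse qycxb_web basic_info pages and write the JSON result back into the parsed_content column.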
    batch = 100
    while True:

        stime = time.time()
        logging.getLogger().info('Batch begins ')
        sql = '''
            select b.id, b.update_time, b.basic_info from (
                select id from qycxb_web_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, basic_info from qycxb_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time, basic_info = item[0], item[1], item[2]
            logging.getLogger().info(" begin to parse company-id : %s " %
                                     (company_id, ))

            try:
                # parse html page
                detail = parse(basic_info)
                # persist parsed company data into database
                DbHelper.execute(
                    "update qycxb_web_company set parse_status = %s, parsed_content =%s  where id = %s",
                    (1, json.dumps(detail), company_id))
                logging.getLogger().info(
                    " parse status updated, and parsed content inserted ")
            except (ParseException, Exception) as err:
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                logging.getLogger().exception(err)
                DbHelper.execute(
                    "update qycxb_web_company set parse_status = %s  where id = %s ",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " batch-parsing round finished, total cost: %s " %
            (time.time() - stime))
def main():
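    # Parse xizhi_web detail pages and store the JSON result; parse_status 3 marks success here, 2 marks failure.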
    site = "xizhi_web"
    batch = 100
    while True:

        begin = time.time()
        sql = '''
            select b.id, b.update_time, b.data from (
                select id from xizhi_web_company where status = %s and  parse_status = %s limit %s
            ) as a left join (select id, update_time, data from xizhi_web_company ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            data_table = "common_data_all"
            company_id, crawl_time, data = item['id'], item['update_time'], item['data']
            logging.getLogger().info(" begin to parse company-id : %s " % (company_id,))

            try:
                # parse html page
                detail = parse(data)
                # persist parsed company data into database
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'data': json.dumps(detail)})
                logging.getLogger().info(" data stored ")
                # update parse status
                DbHelper.execute("update xizhi_web_company set parse_status = %s, data_table_name =%s  where id = %s",
                                 (3, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update xizhi_web_company set parse_status = %s  where id = %s ", (2, company_id))
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - begin))
def main():
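    # Index court-announcement rows from gonggao_data into drdata_qyk/RmfyggMessage until es_status marks them done.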
    index = 'drdata_qyk'
    doc_type = 'RmfyggMessage'
    site = 'http://rmfygg.court.gov.cn/psca/lgnot/bulletin/page/'
    '''
    es_status 0 -> not imported
    es_status 1 -> imported
    es_status 2 -> already exists
    '''
    es_status_success = 1
    es_status_exists = 2
    fetch_rows_limit = 1

    stmt = 'select id,old_data,web_data from gonggao_data where es_status != %s and es_status != %s limit %s'
    while True:
        logging.info("round starts")
        items = DbHelper.fetchmany(stmt=stmt,
                                   data=(es_status_success, es_status_exists,
                                         fetch_rows_limit))
        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item,
                                   index=index,
                                   doc_type=doc_type,
                                   site=site,
                                   es_status_success=es_status_success,
                                   es_status_exists=es_status_exists)
            except ConvertException as ce:
                logging.getLogger().exception(ce)
                raise ce
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(60)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
        logging.info("round ends")
def main():
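    # Export parsed companies from several provincial registry tables, finishing once every table is drained.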
    config_site = DbHelper.fetchmany(
        stmt='SELECT priority_code, site FROM config_site')
    config_site = {row['site']: row['priority_code'] for row in config_site}

    batch = 10
    site = None
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'
    '''
        parse_status_success: 1
        es_status: 2     -> success
        es_status: other -> unprocessed
    '''
    parse_status_success = 1
    es_status_success = 2
    select_company = " SELECT id, company_name FROM %s where parse_status = %s and etl_status !=%s limit %s "

    configs = [
        {
            'site': 'shanghai_web',
            'table': 'gs_shanghai_company',
            'finished': False
        },
        {
            'site': 'fujian_web',
            'table': 'gs_fujian_company',
            'finished': False
        },
        {
            'site': 'hebei_web',
            'table': 'gs_hebei_company',
            'finished': False
        },
        {
            'site': 'hunan_web',
            'table': 'gs_hunan_company',
            'finished': False
        },
        {
            'site': 'yunnan_web',
            'table': 'gs_yunnan_company',
            'finished': False
        },
    ]
    while True:

        if all(conf['finished'] for conf in configs):
            logging.info(" es etl finished, process is going to close ")
            break

        for conf in configs:

            if conf['finished']:
                continue

            # items = DbHelper.fetchmany(stmt=select_company, data=(conf['table'], parse_status_success, es_status_success, batch))
            items = DbHelper.fetchmany(stmt=select_company %
                                       (conf['table'], parse_status_success,
                                        es_status_success, batch))
            if len(items) == 0:
                conf['finished'] = True

            for item in items:
                time_begin = time.time()
                try:
                    handle_single_item(item,
                                       config_site=config_site,
                                       site=conf['site'],
                                       table=conf['table'],
                                       index=index,
                                       doc_type=doc_type,
                                       es_status_success=es_status_success)
                except ConvertException as e:
                    logging.getLogger().exception(e)
                    raise e
                except Exception as e:
                    logging.getLogger().exception(e)
                    time.sleep(10)
                logging.info('cost: {0:f}'.format(time.time() - time_begin))