Example #1
def handle_single_item(item, **kwargs):
    id, old_data, web_data = item['id'], item['old_data'], item['web_data']
    logging.info("starts to process gonggao_dta id: %s" % (id))
    convert_list = GonggaoWebEsConvert.convert(old_data, web_data)
    for convert in convert_list:
        company_id_es = generate_company_id(str(id) + convert['CmpName'])
        status = check_existence(company_id_es,
                                 index=kwargs['index'],
                                 doc_type=kwargs['doc_type'])
        convert['Meta'] = {
            'Source': kwargs['site'],
            'Time': str(datetime.datetime.now().date())
        }
        if status == 0:  # not yet in ES
            logging.info("starts to index id : %s,company : %s,doc_id : %s" %
                         (id, convert['CmpName'], company_id_es))
            EsHelper.es.index(index=kwargs['index'],
                              doc_type=kwargs['doc_type'],
                              id=company_id_es,
                              body=json.dumps(convert,
                                              ensure_ascii=False,
                                              encoding="utf-8"))
            logging.info("inserted into es")
        elif status == 1:  # already in ES
            DbHelper.execute(
                "UPDATE gonggao_data set es_status = %s WHERE id = %s ",
                (kwargs['es_status_exists'], id))
            logging.info("company exists in es")
            return
        DbHelper.execute(
            "UPDATE gonggao_data set es_status = %s WHERE id = %s ",
            (kwargs['es_status_success'], id))
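
The helpers generate_company_id and check_existence are not shown in this example. The following is only a hedged sketch, assuming a hash-based document id and a plain exists() lookup against the same EsHelper client used above; the 0/1 return codes mirror how they are tested in the loop.

# Hypothetical sketch: generate_company_id is assumed to derive a stable ES
# document id from its input string; check_existence is assumed to map
# missing/present onto the 0/1 codes tested above. Not the real implementation.
import hashlib

def generate_company_id(text):
    if isinstance(text, unicode):  # Python 2: hash the UTF-8 bytes
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()

def check_existence(doc_id, index, doc_type):
    return 1 if EsHelper.es.exists(index=index, doc_type=doc_type, id=doc_id) else 0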
def main():
    config_site = DbHelper.fetchmany(stmt='SELECT priority_code, site FROM config_site')
    config_site = {row['site']: row['priority_code'] for row in config_site}

    batch = 10
    site = 'qichacha_web'
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'

    '''
        parse_status_success: 1
        es_status 1: success
        es_status other: unprocessed
    '''
    parse_status_success = 1
    es_status_success = 1
    select_company = " SELECT id, company_name, data_gsxx FROM qichacha_weixin_company where parse_status = %s and es_status !=%s limit %s "

    while True:

        items = DbHelper.fetchmany(stmt=select_company, data=(parse_status_success, es_status_success, batch))

        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item, config_site=config_site, site=site,
                                   index=index, doc_type=doc_type, es_status_success=es_status_success)
            except ConvertException as e:
                logging.getLogger().exception(e)
                raise e
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(10)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
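
ConvertException is referenced but never defined in these snippets; judging by how the loop above re-raises it (fatal) while merely backing off on any other error, it is presumably a plain project exception raised by the *EsConvert.convert() helpers. A minimal sketch, assuming exactly that:

# Hypothetical: a simple project exception type raised when a record cannot be
# converted; re-raised above to stop the process instead of retrying.
class ConvertException(Exception):
    pass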
def main():
    site = 'xysj_weixin'
    batch = 100
    data_table = 'common_data_all'

    while True:
        stime = time.time()
        items = DbHelper.fetchmany(
            "select id, crawl_time, src_list from xysj_weixin_company where crawl_status = 1 and "
            " parse_status = 0  limit %s ", (batch, ))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, crawl_time, src_list = item[0], item[1], item[2]
            try:
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'info': src_list})
                logging.getLogger().info(" data stored ")
                DbHelper.execute(
                    "update xysj_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(" parse status updated ")

            except Exception as err:
                logging.getLogger().exception(err)
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
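
StoreHelper.store_data is not shown either. Its call sites, together with the later SELECT against common_data_all (company_id, site, key_desc, value), suggest it writes one row per payload key into the shared data table. A hedged sketch under that assumption (the crawl_time column and the exact schema are guesses):

# Hypothetical sketch of StoreHelper.store_data, inferred from the call sites:
# each key in `data` becomes one (key_desc, value) row in the shared table.
def store_data(data_table, company_id, site, crawl_time, data):
    stmt = ("INSERT INTO " + data_table +
            " (company_id, site, crawl_time, key_desc, value)"
            " VALUES (%s, %s, %s, %s, %s)")
    for key_desc, value in data.items():
        DbHelper.execute(stmt, (company_id, site, crawl_time, key_desc, value))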
def handle_single_item(item, **kwargs):
    company_id, company_name, parsed_content = item['id'], item['company_name'].strip(), item['data_gsxx']

    company_id_es = ETLTools.generate_company_id(company_name)
    logging.info("starts to process company: %s, id: %s, the id of es document is: %s "
                 % (company_name, company_id, company_id_es))
    status = EsHelper.check(index=kwargs['index'], doc_type=kwargs['doc_type'],
                            doc_id=company_id_es, site=kwargs['site'], config_site=kwargs['config_site'])

    if status == 2:  # data already exists
        logging.info("company exists in es")
        return

    if parsed_content is None:
        select_company_parsed_content = 'SELECT value from common_data_all  where company_id = %s and site = %s and key_desc = %s '
        parsed_content = DbHelper.fetchone(select_company_parsed_content, data=(company_id, kwargs['site'], 'data_gsxx'))['value']
    converted = Qichacha_WeiXin_EsConvert.convert(parsed_content)
    converted['CmpName'] = company_name.strip()
    converted['Meta'] = {'Source': kwargs['site'], 'Time': str(datetime.datetime.now().date())}

    if status == 0:  # not present in ES

        EsHelper.es.index(index=kwargs['index'],
                          doc_type=kwargs['doc_type'],
                          id=company_id_es,
                          body=json.dumps(converted, ensure_ascii=False, encoding="utf-8"))
        logging.info("inserted into es")
    else:  # status == 1: exists in ES but from a lower-priority source, so update it
        EsHelper.es.update(index=kwargs['index'], doc_type=kwargs['doc_type'], id=company_id_es,
                           body=json.dumps({'doc': converted}, ensure_ascii=False, encoding="utf-8"))
        logging.info("updated in es")

    DbHelper.execute("UPDATE qichacha_weixin_company set es_status = %s WHERE id = %s ", (kwargs['es_status_success'], company_id))
Example #5
def main():
    site = 'guangdong_weixin'
    batch = 100
    data_table = 'common_data_all'

    while True:

        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.src_gongshang, b.src_qiye, b.src_other, b.src_judicial from (
                select id from gs_guangdong_company where status = %s and parse_status = %s limit %s
            )as a left join (select id, update_time, src_gongshang, src_qiye, src_other, src_judicial from gs_guangdong_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, [15, 0, batch])
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = item['src_gongshang'], item[
                'src_qiye'], item['src_other'], item['src_judicial']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " %
                                         (item[0], ))
                '''
                1. store the company data
                '''
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={
                                           u'gsgs': gsgs,
                                           u'qygs': qygs,
                                           u'bmgs': bmgs,
                                           u'sfgs': sfgs
                                       })
                logging.getLogger().info(" data stored ")
                '''
                2. update the bookkeeping fields, i.e. parse_status and data_table_name
                '''
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s where id = %s",
                    [2, company_id])
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
Example #6
def main():
    site = 'qichacha_weixin'
    batch = 100
    data_table = 'common_data_all'
    while True:

        stime = time.time()

        sql = '''
            select b.id, b.update_time, b.data from (
                select id from qichacha_weixin_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, data from qichacha_weixin_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time, data = item['id'], item[
                'update_time'], item['data']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " %
                                         (item[0], ))
                '''
                1. store the company data
                '''
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={u'detail': data})
                logging.getLogger().info(" data stored ")
                '''
                2. mark the record as parsed and record the data_table_name
                '''
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                    [1, data_table, company_id])
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")

            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
    site = 'czzx_web'
    batch = 100
    data_table = 'common_data_all'
    while True:

        logging.getLogger().info(" Batch begins ")
        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.data, b.data_table_name from (
                select id from chuanzhong_web_company where id >= 809439 and parse_status = %s  limit %s
            ) as a left join (select id, update_time, data, data_table_name from chuanzhong_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time, data = item['id'], item[
                'update_time'], item['data']
            try:
                '''
                store the company data
                '''
                result = json.dumps(parse(data),
                                    ensure_ascii=False,
                                    encoding='utf-8')
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={'基本信息': result})
                logging.getLogger().info(" data inserted ")
                '''
                update the bookkeeping fields, i.e. parse_status and data_table_name
                '''
                DbHelper.execute(
                    "update chuanzhong_web_company set parse_status = %s, data_table_name= %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update chuanzhong_web_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
Example #8
def main():
    batch = 1
    parse_status_success = 1
    crawl_status_success = 1

    while True:
        items = DbHelper.fetchmany(
            "SELECT id, web_data FROM wenshu_web WHERE parse_status != %s limit %s",
            (parse_status_success, batch))

        for item in items:

            case_list = json.loads(item['web_data'])

            for case in case_list:

                if 'Count' in case.keys():
                    continue
                case_name = case[u'案件名称']

                if u'公司' not in case_name:
                    continue

                logging.info('starts to handle id: %s, case id: %s ' %
                             (item['id'], case[u'文书ID']))

                url = 'http://wenshu.court.gov.cn/content/content?DocID=%s' % (
                    case[u'文书ID'], )
                headers = None
                proxies = proxy.get_proxy("WenshuDetail")
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=15)
                if response.status_code != 200:
                    logging.info('case-fetch fails')
                    time.sleep(10)
                    continue
                content = response.text

                DbHelper.execute(
                    "INSERT INTO wenshu_web_detail(doc_id, summary, detail, crawl_status) VALUES (%s, %s, %s, %s)",
                    (case[u'文书ID'], json.dumps(case), content,
                     crawl_status_success))
                logging.info('case inserted')
                time.sleep(3)

            DbHelper.execute(
                'UPDATE wenshu_web SET parse_status = %s WHERE id = %s ',
                (parse_status_success, item['id']))
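
proxy.get_proxy is the last undefined dependency in this example; the only requirement visible above is that it return a requests-compatible proxies mapping (or None) for a named task. A sketch assuming a simple HTTP proxy-pool service (the pool URL below is made up for illustration):

# Hypothetical sketch: fetch one proxy for the given task from a pool service
# and return it in the {'http': ..., 'https': ...} form that requests expects.
def get_proxy(task_name):
    try:
        resp = requests.get('http://proxy-pool.example/get',
                            params={'task': task_name}, timeout=5)
        host_port = resp.text.strip()
        return {'http': 'http://%s' % host_port,
                'https': 'http://%s' % host_port}
    except requests.RequestException:
        return None  # fall back to a direct connection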
def main():
    site = 'shanghai_web'
    batch = 100
    data_table = 'common_data_all'

    while True:

        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.src_gsgs, b.src_qygs, b.src_bmgs, b.src_sfgs from (
                select id from gs_shanghai_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, src_gsgs, src_qygs, src_bmgs, src_sfgs from gs_shanghai_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (15, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = item['src_gsgs'], item['src_qygs'], item['src_bmgs'], item['src_sfgs']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (item[0],))

                result = IcWebParser().parsing([etree.HTML(text=gsgs),
                                                etree.HTML(text=qygs),
                                                etree.HTML(text=bmgs),
                                                etree.HTML(text=sfgs)])
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={u'gsgs': json.dumps(result[u'工商公示信息']),
                                             u'qygs': json.dumps(result[u'企业公示信息']),
                                             u'bmgs': json.dumps(result[u'其他部门公示信息']),
                                             u'sfgs': json.dumps(result[u'司法协助公示信息'])})
                logging.getLogger().info(" data inserted ")

                DbHelper.execute("update gs_shanghai_company set parse_status = %s, data_table_name= %s where id = %s",
                                 (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update gs_shanghai_company set parse_status = %s where id = %s", (2, company_id))
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
    site = "tianyancha_web"
    batch = 200
    while True:

        logging.getLogger().info(" Batch begins ")
        stime = time.time()
        sql = '''
            select b.id, b.crawl_time, b.src_detail from (
                select id from tianyancha_web_company where crawl_status = %s and parse_status = %s  limit %s
            ) as a left join (select id, crawl_time, src_detail from tianyancha_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            logging.getLogger().info(" begin to parse company-id : %s " %
                                     (item[0], ))
            data_table = "common_data_all"

            try:
                # parse html page
                company_id, crawl_time, src_detail = item[0], item[1], item[2]
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'detail': src_detail})
                logging.getLogger().info(" data stored ")
                # update parse status
                DbHelper.execute(
                    "update tianyancha_web_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, item[0]))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update tianyancha_web_company set parse_status = %s  where id = %s ",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
    retrieved_config_site = DbHelper.fetchmany(
        stmt='SELECT priority_code, site FROM config_site')
    config_site = dict(
        zip(map(lambda x: x['site'], retrieved_config_site),
            map(lambda x: x['priority_code'], retrieved_config_site)))

    batch, site, index, doc_type = 10, 'xizhi_web', 'drdata_qyk', 'BusinessInfo'
    #     parse_status_success: 3
    #     es_status 1: success
    #     es_status other: unprocessed
    parse_status_success, es_status_success = 3, 1
    select_company = " SELECT id, company_name FROM xizhi_web_company where parse_status = %s and es_status !=%s limit %s "

    while True:

        items = DbHelper.fetchmany(stmt=select_company,
                                   data=(parse_status_success,
                                         es_status_success, batch))

        logging.info('batch begins, the size is %s:' % (len(items), ))
        batch_begin_time = time.time()

        if len(items) == 0:
            logging.info("no data on condition found in db")
            break

        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item,
                                   config_site=config_site,
                                   site=site,
                                   index=index,
                                   doc_type=doc_type,
                                   es_status_success=es_status_success)
            except ConvertException as e:
                logging.getLogger().exception(e)
                raise e
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(10)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))

        logging.info('batch ends, size is: %s, costs:%s' %
                     (len(items), time.time() - batch_begin_time))
def handle_single_item(item, **kwargs):
    company_id, company_name, parsed_content = item['id'], item[
        'company_name'].strip(), item['parsed_content']
    company_id_es = ETLTools.generate_company_id(company_name)
    logging.info(
        "starts to process company: %s, id: %s, the id of es document is: %s "
        % (company_name, company_id, company_id_es))
    status = EsHelper.check(index=kwargs['index'],
                            doc_type=kwargs['doc_type'],
                            doc_id=company_id_es,
                            site=kwargs['site'],
                            config_site=kwargs['config_site'])

    if status == 2:  # data already exists
        logging.info("company exists in es")
        return

    converted = QycxbWebEsConvert.convert(parsed_content)
    converted['CmpName'] = company_name.strip()
    converted['Meta'] = {
        'Source': kwargs['site'],
        'Time': str(datetime.datetime.now().date())
    }

    if status == 0:  # not present in ES

        EsHelper.es.index(index=kwargs['index'],
                          doc_type=kwargs['doc_type'],
                          id=company_id_es,
                          body=json.dumps(converted,
                                          ensure_ascii=False,
                                          encoding="utf-8"))
        logging.info("inserted into es")
    else:  # status == 1: exists in ES but from a lower-priority source, so update it
        EsHelper.es.update(index=kwargs['index'],
                           doc_type=kwargs['doc_type'],
                           id=company_id_es,
                           body=json.dumps({'doc': converted},
                                           ensure_ascii=False,
                                           encoding="utf-8"))
        logging.info("updated in es")

    DbHelper.execute(
        "UPDATE qycxb_web_company set es_status = %s WHERE id = %s ",
        (kwargs['es_status_success'], company_id))
def main():
    batch = 100
    site = 'qixin_weixin'
    data_table = 'common_data_all'
    while True:

        stime = time.time()
        sql = '''
            select b.id, b.crawl_time, b.src_homepageinfo, b.src_basicinfo, b.src_changeinfo from (
                select id from qixin_weixin_company where crawl_status = %s and parse_status = %s  limit %s
            ) as a left join (select id, crawl_time, src_homepageinfo,src_basicinfo,src_changeinfo from qixin_weixin_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (7, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, crawl_time = item['id'], item['crawl_time']
            pageinfo, basicinfo, changeinfo = item['src_homepageinfo'], item['src_basicinfo'], item['src_changeinfo']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (company_id,))

                '''
                1. store the company data
                '''
                StoreHelper.store_data(data_table=data_table, company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'pageinfo': pageinfo, u'basicinfo': basicinfo, u'changeinfo': changeinfo})
                logging.getLogger().info(" data stored ")
                '''
                2. mark the record as parsed and record the data_table_name
                '''
                DbHelper.execute("update qixin_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                                 (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update qixin_weixin_company set parse_status = %s where id = %s", [2, company_id])
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
    batch = 100
    while True:

        stime = time.time()
        logging.getLogger().info('Batch begins ')
        sql = '''
            select b.id, b.update_time, b.basic_info from (
                select id from qycxb_web_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, basic_info from qycxb_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time, basic_info = item[0], item[1], item[2]
            logging.getLogger().info(" begin to parse company-id : %s " %
                                     (company_id, ))

            try:
                # parse html page
                detail = parse(basic_info)
                # persist parsed company data into database
                DbHelper.execute(
                    "update qycxb_web_company set parse_status = %s, parsed_content =%s  where id = %s",
                    (1, json.dumps(detail), company_id))
                logging.getLogger().info(
                    " parse status updated, and parsed content inserted ")
            except (ParseException, Exception) as err:
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                logging.getLogger().exception(err)
                DbHelper.execute(
                    "update qycxb_web_company set parse_status = %s  where id = %s ",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
    config_site = DbHelper.fetchmany(
        stmt='SELECT priority_code, site FROM config_site')
    config_site = dict(
        zip(map(lambda x: x['site'], config_site),
            map(lambda x: x['priority_code'], config_site)))

    site = 'czzx_web'
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'
    '''
        es_status 13: success
        es_status 14: already exists
    '''
    es_status_success = 13
    es_status_exists = 14
    fetch_rows_limit = 2
    select_company = " select id,company_name,es_status from chuanzhong_web_company where es_status != %s and es_status != %s limit %s"

    while True:
        logging.info("round starts")
        items = DbHelper.fetchmany(stmt=select_company,
                                   data=(es_status_success, es_status_exists,
                                         fetch_rows_limit))

        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item,
                                   config_site=config_site,
                                   site=site,
                                   index=index,
                                   doc_type=doc_type,
                                   es_status_success=es_status_success,
                                   es_status_exists=es_status_exists)
            except ConvertException as ce:
                logging.getLogger().exception(ce)
                raise ce
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(60)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
        logging.info("round ends")
def main():
    site = "xizhi_web"
    batch = 100
    while True:

        begin = time.time()
        sql = '''
            select b.id, b.update_time, b.data from (
                select id from xizhi_web_company where status = %s and  parse_status = %s limit %s
            ) as a left join (select id, update_time, data from xizhi_web_company ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            data_table = "common_data_all"
            company_id, crawl_time, data = item['id'], item['update_time'], item['data']
            logging.getLogger().info(" begin to parse company-id : %s " % (company_id,))

            try:
                # parse html page
                detail = parse(data)
                # persist parsed company data into database
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'data': json.dumps(detail)})
                logging.getLogger().info(" data stored ")
                # update parse status
                DbHelper.execute("update xizhi_web_company set parse_status = %s, data_table_name =%s  where id = %s",
                                 (3, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update xizhi_web_company set parse_status = %s  where id = %s ", (2, company_id))
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - begin))
Example #17
def main():
    index = 'drdata_qyk'
    doc_type = 'RmfyggMessage'
    site = 'http://rmfygg.court.gov.cn/psca/lgnot/bulletin/page/'
    '''
    es_status 0: not yet imported
    es_status 1: imported
    es_status 2: already exists
    '''
    es_status_success = 1
    es_status_exists = 2
    fetch_rows_limit = 1

    stmt = 'select id,old_data,web_data from gonggao_data where es_status != %s and es_status != %s limit %s'
    while True:
        logging.info("round starts")
        items = DbHelper.fetchmany(stmt=stmt,
                                   data=(es_status_success, es_status_exists,
                                         fetch_rows_limit))
        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item,
                                   index=index,
                                   doc_type=doc_type,
                                   site=site,
                                   es_status_success=es_status_success,
                                   es_status_exists=es_status_exists)
            except ConvertException as ce:
                logging.getLogger().exception(ce)
                raise ce
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(60)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
        logging.info("round ends")
def main():
    config_site = DbHelper.fetchmany(
        stmt='SELECT priority_code, site FROM config_site')
    config_site = dict(
        zip(map(lambda x: x['site'], config_site),
            map(lambda x: x['priority_code'], config_site)))

    batch = 10
    site = None
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'
    '''
        parse_status_success: 1
        es_status 2: success
        es_status other: unprocessed
    '''
    parse_status_success = 1
    es_status_success = 2
    select_company = " SELECT id, company_name FROM %s where parse_status = %s and etl_status !=%s limit %s "

    configs = [
        {
            'site': 'shanghai_web',
            'table': 'gs_shanghai_company',
            'finished': False
        },
        {
            'site': 'fujian_web',
            'table': 'gs_fujian_company',
            'finished': False
        },
        {
            'site': 'hebei_web',
            'table': 'gs_hebei_company',
            'finished': False
        },
        {
            'site': 'hunan_web',
            'table': 'gs_hunan_company',
            'finished': False
        },
        {
            'site': 'yunnan_web',
            'table': 'gs_yunnan_company',
            'finished': False
        },
    ]
    while True:

        if len(filter(lambda x: not x['finished'], configs)) == 0:
            logging.info(" es etl finished, process going to closed")
            break

        for conf in configs:

            if conf['finished']:
                continue

            # items = DbHelper.fetchmany(stmt=select_company, data=(conf['table'], parse_status_success, es_status_success, batch))
            items = DbHelper.fetchmany(stmt=select_company %
                                       (conf['table'], parse_status_success,
                                        es_status_success, batch))
            if len(items) == 0:
                conf['finished'] = True

            for item in items:
                time_begin = time.time()
                try:
                    handle_single_item(item,
                                       config_site=config_site,
                                       site=conf['site'],
                                       table=conf['table'],
                                       index=index,
                                       doc_type=doc_type,
                                       es_status_success=es_status_success)
                except ConvertException as e:
                    logging.getLogger().exception(e)
                    raise e
                except Exception as e:
                    logging.getLogger().exception(e)
                    time.sleep(10)
                logging.info('cost: {0:f}'.format(time.time() - time_begin))
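
Note that select_company interpolates the table name with % before the query is executed: DB-API placeholders can only bind values, never identifiers such as table or column names. A slightly safer variant (a sketch, assuming the same DbHelper.fetchmany interface used above) keeps the identifier substitution and the value binding as two separate steps:

# Sketch only: format the table name into the SQL text, but keep the values as
# bound parameters instead of %-formatting them into the statement as well.
select_company_template = ("SELECT id, company_name FROM {table}"
                           " WHERE parse_status = %s AND etl_status != %s LIMIT %s")
stmt = select_company_template.format(table=conf['table'])
items = DbHelper.fetchmany(stmt=stmt,
                           data=(parse_status_success, es_status_success, batch))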
Example #19
def process():
    # get un-crawled company-keyword list
    keywords_response = requests.get(
        'http://10.51.1.201:3352/getKeywords',
        params={
            "data":
            json.dumps({
                "field": "SoftwareCopyrightStatus",
                "status": 0
            },
                       ensure_ascii=False)
        })
    if keywords_response.status_code != 200:
        time.sleep(10)
        return
    else:
        companies = json.loads(keywords_response.text)
    logging.getLogger("Crawler").info(
        "Get Companies From ES, And Size is : %s" % (len(companies), ))

    for company in companies:

        cmpid, cmpname = company["Id"], company['CmpName']
        logging.getLogger("Crawler").info(
            "Begins to Crawl Info, the CmpName:%s, CmpId:%s " %
            (cmpname, cmpid))
        url_template = "http://www.ccopyright.com.cn/cpcc/RRegisterAction.do?method=list&no=fck&sql_name=&sql_regnum=&sql_author=%s&curPage=1"
        url = url_template % (quote(cmpname.replace(u"・", u"").encode("gbk")))
        headers = None
        proxies = proxy.get_proxy("SoftWareCopyright")

        crawling_response = requests.get(url=url,
                                         headers=headers,
                                         proxies=proxies,
                                         timeout=15)
        if crawling_response.status_code != 200:
            time.sleep(10)
            continue
        content = crawling_response.text
        try:
            parsed_content = parse(content)
        except Exception as e:
            logging.getLogger("Crawler").exception(
                "Exceptions occurs when parsing crawled page")
            logging.getLogger("Crawler").exception(e)
            continue
        if len(parsed_content) == 0:
            logging.getLogger("Crawler").info("No software copyright found")
        else:
            DbHelper.execute(
                "INSERT INTO software_copyright(cmpid, src) VALUES(%s, %s)",
                (cmpid, content))
            parse_status = 1
            DbHelper.execute(
                "UPDATE software_copyright SET parse_status = %s, parsed_content = %s where cmpid = %s ",
                (parse_status, json.dumps(parsed_content), cmpid))
            logging.getLogger("Crawler").info("Page Parsed Successfully")

        notify_response = requests.get(
            url=url,
            params={
                "data":
                json.dumps([
                    {
                        "Id": cmpid,
                        "field": "SoftwareCopyrightStatus",
                        "status": 1
                    },
                ],
                           ensure_ascii=False)
            })

        if notify_response.status_code != 200:
            logging.getLogger("Crawler").info(
                "Action, which notify es, fails.")
        logging.getLogger("Crawler").info("Info Crawled Successfully")
        time.sleep(2)
Example #20
def process(cookie, pagenum):
    url = 'http://webdata.cqccms.com.cn/webdata/query/CCCCerti.do'
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Content-Length':
        '110',
        'Content-Type':
        'application/x-www-form-urlencoded',
        'Cookie':
        cookie,
        'Host':
        'webdata.cqccms.com.cn',
        'Origin':
        'http://webdata.cqccms.com.cn',
        'Referer':
        'http://webdata.cqccms.com.cn/webdata/query/CCCCerti.do;jsessionid=qxkxYRZYCCtHGGd17y3J5TlsJqNvSGLGTt1hVcpp618JkmTfpp1T!-510284702',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'
    }
    data = {
        'keyword': u"公司".encode('GBK'),
        '_h_select_chaxuntype': 'appname',
        'chaxuntype': 'appname',
        'pageNumber': pagenum,
        'pageSize': 10,
        'sortColumns': 'null'
    }
    proxies = proxy.get_proxy("products_info")
    crawling_response = requests.post(url=url,
                                      data=data,
                                      headers=headers,
                                      proxies=proxies)
    # crawling_response = requests.post(url=url, data=data, headers=headers)
    if crawling_response.status_code != 200:
        time.sleep(10)
        return 0
    content = crawling_response.text
    print content

    parsed_content = parse(content)
    if len(parsed_content) == 0:
        logging.getLogger("Crawler").info("Nothing parsed ")
        return 0
    try:
        DbHelper.executemany(
            "INSERT INTO product(certificat_no,applicant,manufacturer,factory,product,model_specification,standard,"
            "issue_date,original_issue_date,expiry_date,`status`,status_changing_time,reason,attachment) VALUES(%s,%s,%s,%s,%s,%s,%s,"
            "%s,%s,%s,%s,%s,%s,%s)",
            data=parsed_content)
    except IntegrityError as e:
        logging.getLogger("Crawler").exception(
            "Exceptions Occurs During One Batch Process")
        logging.getLogger("Crawler").exception(e)
    except Exception as e:
        logging.getLogger("Crawler").exception(
            "Exceptions Occurs When Inserting Into DB ")
        logging.getLogger("Crawler").exception(e)
        raise e
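
Unlike the execute calls elsewhere, executemany here is fed parsed_content directly, so parse() must return a sequence of 14-element row tuples in the same column order as the INSERT. A hedged sketch of the wrapper it presumably sits on (connection details are placeholders, assuming a pymysql-style DB-API driver):

# Hypothetical sketch of the DbHelper.executemany wrapper: the SQL is sent once
# and the driver binds one parameter tuple per row.
import pymysql

def executemany(stmt, data):
    conn = pymysql.connect(host='127.0.0.1', user='etl', passwd='secret',
                           db='crawler', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.executemany(stmt, data)  # data: iterable of per-row tuples
        conn.commit()
    finally:
        conn.close()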