# Imports assumed by the scripts in this excerpt; DbHelper, StoreHelper, parse,
# proxy and ParseException are project-local helpers imported from the
# project's own packages (paths not shown here). IntegrityError is assumed to
# come from the MySQL driver in use (e.g. pymysql or MySQLdb).
import json
import logging
import time

import requests


def main():
    site = 'czzx_web'
    batch = 100
    data_table = 'common_data_all'
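    # Poll chuanzhong_web_company in batches: parse each raw page, store the
    # result via StoreHelper, and flag each row as parsed (1) or failed (2).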
    while True:

        logging.getLogger().info(" Batch begins ")
        stime = time.time()
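        # Pick up to `batch` rows that are not yet parsed (parse_status = 0),
        # starting from a fixed resume id; the inner query limits on id only,
        # and the outer join pulls back the wide columns.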
        sql = '''
            select b.id, b.update_time, b.data, b.data_table_name from (
                select id from chuanzhong_web_company where id >= 809439 and parse_status = %s  limit %s
            ) as a left join (select id, update_time, data, data_table_name from chuanzhong_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time, data = (item['id'], item['update_time'],
                                            item['data'])
            try:
                # Parse the raw page and store the company's basic information.
                result = json.dumps(parse(data),
                                    ensure_ascii=False,
                                    encoding='utf-8')
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={'基本信息': result})
                logging.getLogger().info(" data inserted ")
                # Mark the row as parsed and record where the data was stored
                # (parse_status and data_table_name).
                DbHelper.executemany(
                    "update chuanzhong_web_company set parse_status = %s, data_table_name= %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception occurred while parsing company id: %s" %
                    (company_id, ))
                DbHelper.executemany(
                    "update chuanzhong_web_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " this round of batch parsing finished, total cost: %s seconds. " %
            (time.time() - stime))
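

# Both batch parsers in this excerpt (each defined as main(), presumably in
# separate scripts) depend on a project-local DbHelper whose implementation is
# not shown. The commented-out sketch below is one minimal, hypothetical shape
# for it, assuming pymysql with a DictCursor (the first parser indexes rows by
# column name); it stays commented out so it cannot shadow the real helper, and
# the connection settings are placeholders.
#
# import pymysql
#
#
# class DbHelper(object):
#     @staticmethod
#     def _connect():
#         return pymysql.connect(host='localhost', user='crawler', password='',
#                                db='crawler', charset='utf8',
#                                cursorclass=pymysql.cursors.DictCursor)
#
#     @staticmethod
#     def fetchmany(sql, params):
#         conn = DbHelper._connect()
#         try:
#             with conn.cursor() as cursor:
#                 cursor.execute(sql, params)
#                 return cursor.fetchall()
#         finally:
#             conn.close()
#
#     @staticmethod
#     def executemany(sql, params):
#         # Callers pass a single parameter tuple, so this sketch maps
#         # executemany onto a single execute + commit.
#         conn = DbHelper._connect()
#         try:
#             with conn.cursor() as cursor:
#                 cursor.execute(sql, params)
#             conn.commit()
#         finally:
#             conn.close()
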
def main():
    batch = 100
    while True:

        stime = time.time()
        logging.getLogger().info('Batch begins ')
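        # Pick up to `batch` rows that have been crawled (status = 1, assumed
        # to mean a successful crawl) but not yet parsed (parse_status = 0).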
        sql = '''
            select b.id, b.update_time, b.basic_info from (
                select id from qycxb_web_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, basic_info from qycxb_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time, basic_info = item[0], item[1], item[2]
            logging.getLogger().info(" begin to parse company-id : %s " %
                                     (company_id, ))

            try:
                # parse html page
                detail = parse(basic_info)
                # persist parsed company data into database
                DbHelper.executemany(
                    "update qycxb_web_company set parse_status = %s, parsed_content =%s  where id = %s",
                    (1, json.dumps(detail), company_id))
                logging.getLogger().info(
                    " parse status updated, and parsed content inserted ")
            except Exception as err:  # includes ParseException raised by parse()
                logging.getLogger().info(
                    "exception occurred while parsing company id: %s" %
                    (company_id, ))
                logging.getLogger().exception(err)
                DbHelper.executemany(
                    "update qycxb_web_company set parse_status = %s  where id = %s ",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " this round of batch parsing finished, total cost: %s seconds. " %
            (time.time() - stime))
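

# ParseException is a project-local error raised by parse(); its definition is
# not part of this excerpt. A minimal hypothetical version would be:
#
# class ParseException(Exception):
#     """Raised when a crawled page cannot be parsed into the expected fields."""
#     pass


# Fetch one result page of certificate records (CCCCerti) from
# webdata.cqccms.com.cn and insert the parsed rows into the `product` table.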
def process(cookie, pagenum):
    url = 'http://webdata.cqccms.com.cn/webdata/query/CCCCerti.do'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Length': '110',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': cookie,
        'Host': 'webdata.cqccms.com.cn',
        'Origin': 'http://webdata.cqccms.com.cn',
        'Referer': 'http://webdata.cqccms.com.cn/webdata/query/CCCCerti.do;jsessionid=qxkxYRZYCCtHGGd17y3J5TlsJqNvSGLGTt1hVcpp618JkmTfpp1T!-510284702',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'
    }
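    # Search form fields: the keyword "公司" ("company") is GBK-encoded as the
    # site expects; pageNumber / pageSize drive server-side paging.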
    data = {
        'keyword': u"公司".encode('GBK'),
        '_h_select_chaxuntype': 'appname',
        'chaxuntype': 'appname',
        'pageNumber': pagenum,
        'pageSize': 10,
        'sortColumns': 'null'
    }
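    # proxy.get_proxy is a project-local helper; it is assumed to return a
    # proxies dict in the format requests expects, e.g. {'http': '...'}.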
    proxies = proxy.get_proxy("products_info")
    crawling_response = requests.post(url=url,
                                      data=data,
                                      headers=headers,
                                      proxies=proxies)
    # crawling_response = requests.post(url=url, data=data, headers=headers)
    if crawling_response.status_code != 200:
        # Back off briefly and give up on this page for now so the caller can
        # decide whether to retry it.
        time.sleep(10)
        return 0
    content = crawling_response.text
    print content
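    # parse() is expected to return rows with the 14 fields named in the
    # product INSERT below; an empty result means nothing usable on this page.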

    parsed_content = parse(content)
    if len(parsed_content) == 0:
        logging.getLogger("Crawler").info("Nothing parsed ")
        return 0
    try:
        DbHelper.executemany(
            "INSERT INTO product(certificat_no,applicant,manufacturer,factory,product,model_specification,standard,"
            "issue_date,original_issue_date,expiry_date,`status`,status_changing_time,reason,attachment) VALUES(%s,%s,%s,%s,%s,%s,%s,"
            "%s,%s,%s,%s,%s,%s,%s)",
            data=parsed_content)
    except IntegrityError as e:
        logging.getLogger("Crawler").exception(
            "Integrity error occurred during one batch insert")
        logging.getLogger("Crawler").exception(e)
    except Exception as e:
        logging.getLogger("Crawler").exception(
            "Exception occurred when inserting into DB")
        logging.getLogger("Crawler").exception(e)
        raise
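

# A hypothetical driver for process(); the real entry point, the cookie source
# and the page-count logic are not part of this excerpt, so the names and
# numbers below are illustrative only.
def crawl_all_pages(cookie, total_pages):
    for pagenum in range(1, total_pages + 1):
        try:
            process(cookie, pagenum)
        except Exception:
            # process() re-raises non-integrity DB errors; log the failure and
            # move on to the next page instead of aborting the whole run.
            logging.getLogger("Crawler").exception("page %s failed" % pagenum)
        time.sleep(1)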