# -*- coding: utf-8 -*-
# Batch parser for chuanzhong_web_company: fetches crawled rows, parses them,
# stores the result via StoreHelper, and updates the parse_status bookkeeping.
# DbHelper, StoreHelper and parse() are provided by this project's local
# modules (their import paths are not shown in the source).
import json
import logging
import time


def main():
    site = 'czzx_web'
    batch = 100
    data_table = 'common_data_all'
    while True:
        logging.getLogger().info(" Batch begins ")
        stime = time.time()
        # The inner sub-select keeps the LIMIT cheap; the self-join then
        # pulls the full payload for the selected ids.
        sql = '''
            select b.id, b.update_time, b.data, b.data_table_name
            from (
                select id from chuanzhong_web_company
                where id >= 809439 and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, data, data_table_name
                from chuanzhong_web_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time, data = (item['id'], item['update_time'],
                                            item['data'])
            try:
                # Store the parsed company information.
                result = json.dumps(parse(data), ensure_ascii=False,
                                    encoding='utf-8')
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       # '基本信息' ("basic information") is the
                                       # stored key, kept verbatim.
                                       data={'基本信息': result})
                logging.getLogger().info(" data inserted ")
                # Update the bookkeeping columns, i.e. parse_status and
                # data_table_name.
                DbHelper.executemany(
                    "update chuanzhong_web_company set parse_status = %s, "
                    "data_table_name = %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id,))
                # Mark the row as failed so it is not retried forever.
                DbHelper.executemany(
                    "update chuanzhong_web_company set parse_status = %s "
                    "where id = %s",
                    (2, company_id))
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. "
            % (time.time() - stime))
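# --- Hedged sketch, not part of the original source ---
# main() above assumes a DbHelper whose fetchmany() returns dict-like rows
# (item['id'], item['update_time'], ...) and whose executemany() runs a
# single parameterized statement. A minimal stand-in with that call surface
# might look like the following; the MySQLdb driver, the DictCursor, and the
# connection settings are all assumptions. (The qycxb parser below indexes
# rows by position, so it presumably uses a plain tuple cursor instead.)
import MySQLdb
import MySQLdb.cursors


class DbHelper(object):
    _conn = None

    @classmethod
    def _connection(cls):
        if cls._conn is None:
            cls._conn = MySQLdb.connect(
                host='localhost', user='crawler', passwd='secret',
                db='companies', charset='utf8',
                cursorclass=MySQLdb.cursors.DictCursor)
        return cls._conn

    @classmethod
    def fetchmany(cls, sql, params):
        cursor = cls._connection().cursor()
        cursor.execute(sql, params)
        rows = cursor.fetchall()
        cursor.close()
        return rows

    @classmethod
    def executemany(cls, sql, params):
        # Despite the name, the callers pass one parameter tuple at a time,
        # so this executes a single statement and commits.
        conn = cls._connection()
        cursor = conn.cursor()
        cursor.execute(sql, params)
        conn.commit()
        cursor.close()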
def main():
    batch = 100
    while True:
        stime = time.time()
        logging.getLogger().info('Batch begins ')
        # Fetch a batch of crawled-but-unparsed rows from qycxb_web_company;
        # same sub-select-then-join pattern as the chuanzhong parser above.
        sql = '''
            select b.id, b.update_time, b.basic_info
            from (
                select id from qycxb_web_company
                where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, basic_info from qycxb_web_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, update_time, basic_info = item[0], item[1], item[2]
            logging.getLogger().info(
                " begin to parse company-id : %s " % (company_id,))
            try:
                # Parse the crawled HTML page.
                detail = parse(basic_info)
                # Persist the parsed company data into the database.
                DbHelper.executemany(
                    "update qycxb_web_company set parse_status = %s, "
                    "parsed_content = %s where id = %s",
                    (1, json.dumps(detail), company_id))
                logging.getLogger().info(
                    " parse status updated, and parsed content inserted ")
            except Exception as err:
                # Exception already subsumes ParseException, so a single
                # clause is enough; mark the row as failed.
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id,))
                logging.getLogger().exception(err)
                DbHelper.executemany(
                    "update qycxb_web_company set parse_status = %s "
                    "where id = %s",
                    (2, company_id))
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. "
            % (time.time() - stime))
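# --- Hedged sketch, not part of the original source ---
# Neither batch parser configures logging before calling
# logging.getLogger().info(...); with no handler installed, Python 2's
# logging prints "No handlers could be found for logger" and drops every
# message. A typical entry point would configure the root logger once;
# the format string here is an assumption.
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s')
    main()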
# Crawler for the CQC certification search (webdata.cqccms.com.cn): posts a
# GBK-encoded keyword query for one result page, parses it, and bulk-inserts
# the rows. proxy, parse() and DbHelper come from this project's local
# modules; IntegrityError is assumed to come from the MySQLdb driver.
import requests
from MySQLdb import IntegrityError


def process(cookie, pagenum):
    url = 'http://webdata.cqccms.com.cn/webdata/query/CCCCerti.do'
    # Content-Length is deliberately omitted: requests computes it itself,
    # and a stale hard-coded value can corrupt the request body.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': cookie,
        'Host': 'webdata.cqccms.com.cn',
        'Origin': 'http://webdata.cqccms.com.cn',
        'Referer': 'http://webdata.cqccms.com.cn/webdata/query/CCCCerti.do;jsessionid=qxkxYRZYCCtHGGd17y3J5TlsJqNvSGLGTt1hVcpp618JkmTfpp1T!-510284702',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/51.0.2704.63 Safari/537.36'
    }
    # The site expects its form values in GBK.
    data = {
        'keyword': u"公司".encode('GBK'),
        '_h_select_chaxuntype': 'appname',
        'chaxuntype': 'appname',
        'pageNumber': pagenum,
        'pageSize': 10,
        'sortColumns': 'null'
    }
    proxies = proxy.get_proxy("products_info")
    crawling_response = requests.post(url=url, data=data, headers=headers,
                                      proxies=proxies)
    if crawling_response.status_code != 200:
        # Back off and skip this page; the proxy may have been blocked.
        time.sleep(10)
        return 0
    content = crawling_response.text
    logging.getLogger("Crawler").debug(content)
    parsed_content = parse(content)
    if len(parsed_content) == 0:
        logging.getLogger("Crawler").info("Nothing parsed ")
        return 0
    try:
        DbHelper.executemany(
            "INSERT INTO product(certificat_no,applicant,manufacturer,"
            "factory,product,model_specification,standard,issue_date,"
            "original_issue_date,expiry_date,`status`,status_changing_time,"
            "reason,attachment) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
            "%s,%s)",
            data=parsed_content)
    except IntegrityError as e:
        # Duplicate rows are expected when a page is re-crawled; log and
        # move on without aborting the batch.
        logging.getLogger("Crawler").exception(
            "Exceptions Occur During One Batch Process")
        logging.getLogger("Crawler").exception(e)
    except Exception as e:
        logging.getLogger("Crawler").exception(
            "Exceptions Occur When Inserting Into DB ")
        logging.getLogger("Crawler").exception(e)
        raise
    return len(parsed_content)
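# --- Hedged sketch, not part of the original source ---
# The source does not show how process() is driven. One plausible loop walks
# page numbers until several consecutive pages yield nothing, treating an
# occasional empty result as a proxy hiccup; crawl_all and its parameters
# are hypothetical names.
def crawl_all(cookie, start_page=1, delay=2, max_retries=3):
    pagenum, retries = start_page, 0
    while True:
        inserted = process(cookie, pagenum)
        if inserted:
            pagenum, retries = pagenum + 1, 0
        elif retries < max_retries:
            retries += 1          # retry the same page a few times
        else:
            break                 # repeated empty pages: assume we are done
        time.sleep(delay)         # be polite between paginated requests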