def handle_single_item(item, **kwargs):
    id, old_data, web_data = item['id'], item['old_data'], item['web_data']
    logging.info("starts to process gonggao_data id: %s" % (id,))
    convert_list = GonggaoWebEsConvert.convert(old_data, web_data)
    for convert in convert_list:
        company_id_es = generate_company_id(str(id) + convert['CmpName'])
        status = check_existence(company_id_es, index=kwargs['index'], doc_type=kwargs['doc_type'])
        convert['Meta'] = {
            'Source': kwargs['site'],
            'Time': datetime.datetime.now().date().__str__()
        }
        if status == 0:  # not yet in es
            logging.info("starts to index id : %s, company : %s, doc_id : %s"
                         % (id, convert['CmpName'], company_id_es))
            EsHelper.es.index(index=kwargs['index'], doc_type=kwargs['doc_type'], id=company_id_es,
                              body=json.dumps(convert, ensure_ascii=False, encoding="utf-8"))
            logging.info("inserted into es")
        elif status == 1:  # already in es
            DbHelper.execute("UPDATE gonggao_data set es_status = %s WHERE id = %s ",
                             (kwargs['es_status_exists'], id))
            logging.info("company exists in es")
            return
    # all announcements indexed: mark the source row as successfully loaded
    DbHelper.execute("UPDATE gonggao_data set es_status = %s WHERE id = %s ",
                     (kwargs['es_status_success'], id))
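# The helpers used above (generate_company_id, check_existence) are defined elsewhere.
# The sketch below is only a hedged illustration of the assumed contract, not the
# project's actual implementation: the ES document id is taken to be an MD5 digest of
# the key string, and the existence check is a plain lookup returning 0 (absent) or
# 1 (present), with EsHelper.es assumed to be an elasticsearch-py client.
import hashlib

def generate_company_id(key):
    # hypothetical: derive a stable document id from the announcement id + company name
    return hashlib.md5(key.encode('utf-8')).hexdigest()

def check_existence(doc_id, index, doc_type):
    # hypothetical: a bare existence check against the target index
    return 1 if EsHelper.es.exists(index=index, doc_type=doc_type, id=doc_id) else 0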
def main():
    config_site = DbHelper.fetchmany(stmt='SELECT priority_code, site FROM config_site')
    config_site = dict(zip(map(lambda x: x['site'], config_site),
                           map(lambda x: x['priority_code'], config_site)))
    batch = 10
    site = 'qichacha_web'
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'
    '''
    parse_status_success: 1
    es_status: 1      success
    es_status: other  not yet processed
    '''
    parse_status_success = 1
    es_status_success = 1
    select_company = (" SELECT id, company_name, data_gsxx FROM qichacha_weixin_company "
                      "where parse_status = %s and es_status != %s limit %s ")
    while True:
        items = DbHelper.fetchmany(stmt=select_company,
                                   data=(parse_status_success, es_status_success, batch))
        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item, config_site=config_site, site=site, index=index,
                                   doc_type=doc_type, es_status_success=es_status_success)
            except ConvertException as e:
                logging.getLogger().exception(e)
                raise e
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(10)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
def main():
    site = 'xysj_weixin'
    batch = 100
    data_table = 'common_data_all'
    while True:
        stime = time.time()
        items = DbHelper.fetchmany(
            "select id, crawl_time, src_list from xysj_weixin_company where crawl_status = 1 and "
            " parse_status = 0 limit %s ", (batch, ))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time, src_list = item[0], item[1], item[2]
            try:
                StoreHelper.store_data(data_table=data_table, company_id=company_id, site=site,
                                       crawl_time=crawl_time, data={u'info': src_list})
                logging.getLogger().info(" data stored ")
                DbHelper.execute(
                    "update xysj_weixin_company set parse_status = %s, data_table_name = %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(" parse status updated ")
            except Exception as err:
                logging.getLogger().exception(err)
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
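# StoreHelper.store_data is defined elsewhere; the sketch below only illustrates the
# assumed contract, inferred from the common_data_all lookups used by the ES loaders
# (columns company_id, site, key_desc, value, crawl_time). The real helper may differ.
def store_data(data_table, company_id, site, crawl_time, data):
    # one row per key_desc/value pair in the data dict
    for key_desc, value in data.items():
        DbHelper.execute(
            "INSERT INTO " + data_table +
            " (company_id, site, key_desc, value, crawl_time) VALUES (%s, %s, %s, %s, %s)",
            (company_id, site, key_desc, value, crawl_time))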
def handle_single_item(item, **kwargs):
    company_id, company_name, parsed_content = (item['id'], item['company_name'].strip(),
                                                item['data_gsxx'])
    company_id_es = ETLTools.generate_company_id(company_name)
    logging.info("starts to process company: %s, id: %s, the id of es document is: %s "
                 % (company_name, company_id, company_id_es))
    status = EsHelper.check(index=kwargs['index'], doc_type=kwargs['doc_type'], doc_id=company_id_es,
                            site=kwargs['site'], config_site=kwargs['config_site'])
    if status == 2:  # already indexed by an equal- or higher-priority source
        logging.info("company exists in es")
        return
    if parsed_content is None:
        select_company_parsed_content = ('SELECT value from common_data_all '
                                         'where company_id = %s and site = %s and key_desc = %s ')
        parsed_content = DbHelper.fetchone(select_company_parsed_content,
                                           data=(company_id, kwargs['site'], 'data_gsxx'))['value']
    converted = Qichacha_WeiXin_EsConvert.convert(parsed_content)
    converted['CmpName'] = company_name.strip()
    converted['Meta'] = {'Source': kwargs['site'], 'Time': datetime.datetime.now().date().__str__()}
    if status == 0:  # not yet in es
        EsHelper.es.index(index=kwargs['index'], doc_type=kwargs['doc_type'], id=company_id_es,
                          body=json.dumps(converted, ensure_ascii=False, encoding="utf-8"))
        logging.info("inserted into es")
    else:  # status == 1: present in es but from a lower-priority source, so update it
        # the update API expects the partial document under the 'doc' key
        EsHelper.es.update(index='drdata_qyk', doc_type='BusinessInfo', id=company_id_es,
                           body=json.dumps({'doc': converted}, ensure_ascii=False, encoding="utf-8"))
        logging.info("updated into es")
    DbHelper.execute("UPDATE qichacha_weixin_company set es_status = %s WHERE id = %s ",
                     (kwargs['es_status_success'], company_id))
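# EsHelper.check lives outside this file; what follows is a hedged sketch of the contract
# the caller relies on, using the priority codes loaded from config_site (a larger
# priority_code is assumed here to mean a more trusted source):
#   0 -> document absent from ES
#   1 -> document present, but written by a lower-priority source (caller updates it)
#   2 -> document present from an equal- or higher-priority source (caller skips it)
from elasticsearch.exceptions import NotFoundError

def check(index, doc_type, doc_id, site, config_site):
    try:
        doc = EsHelper.es.get(index=index, doc_type=doc_type, id=doc_id)
    except NotFoundError:
        return 0
    existing_site = doc['_source'].get('Meta', {}).get('Source')
    if config_site.get(existing_site, 0) < config_site.get(site, 0):
        return 1
    return 2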
def main():
    site = 'guangdong_weixin'
    batch = 100
    data_table = 'common_data_all'
    while True:
        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.src_gongshang, b.src_qiye, b.src_other, b.src_judicial
            from (
                select id from gs_guangdong_company where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, src_gongshang, src_qiye, src_other, src_judicial
                from gs_guangdong_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, [15, 0, batch])
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = (item['src_gongshang'], item['src_qiye'],
                                      item['src_other'], item['src_judicial'])
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (company_id, ))
                # 1. store the company data
                StoreHelper.store_data(data_table=data_table, company_id=company_id, site=site,
                                       crawl_time=update_time,
                                       data={u'gsgs': gsgs, u'qygs': qygs, u'bmgs': bmgs, u'sfgs': sfgs})
                logging.getLogger().info(" data stored ")
                # 2. update parse_status and data_table_name
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s, data_table_name = %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s where id = %s",
                    [2, company_id])
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
    site = 'qichacha_weixin'
    batch = 100
    data_table = 'common_data_all'
    while True:
        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.data
            from (
                select id from qichacha_weixin_company where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, data from qichacha_weixin_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, update_time, data = item['id'], item['update_time'], item['data']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (company_id, ))
                # 1. store the company data
                StoreHelper.store_data(data_table=data_table, company_id=company_id, site=site,
                                       crawl_time=update_time, data={u'detail': data})
                logging.getLogger().info(" data stored ")
                # 2. mark the row as parsed and record data_table_name
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s, data_table_name = %s where id = %s",
                    [1, data_table, company_id])
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
    site = 'czzx_web'
    batch = 100
    data_table = 'common_data_all'
    while True:
        logging.getLogger().info(" Batch begins ")
        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.data, b.data_table_name
            from (
                select id from chuanzhong_web_company where id >= 809439 and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, data, data_table_name from chuanzhong_web_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time, data = item['id'], item['update_time'], item['data']
            try:
                # store the parsed company data
                result = json.dumps(parse(data), ensure_ascii=False, encoding='utf-8')
                StoreHelper.store_data(data_table=data_table, company_id=company_id, site=site,
                                       crawl_time=crawl_time, data={'基本信息': result})
                logging.getLogger().info(" data inserted ")
                # update parse_status and data_table_name
                DbHelper.execute(
                    "update chuanzhong_web_company set parse_status = %s, data_table_name = %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update chuanzhong_web_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
    batch = 1
    parse_status_success = 1
    crawl_status_success = 1
    while True:
        items = DbHelper.fetchmany(
            "SELECT id, web_data FROM wenshu_web WHERE parse_status != %s limit %s",
            (parse_status_success, batch))
        for item in items:
            case_list = json.loads(item['web_data'])
            for case in case_list:
                if 'Count' in case.keys():
                    continue
                case_name = case[u'案件名称']
                if u'公司' not in case_name:
                    continue
                logging.info('starts to handle id: %s, case id: %s ' % (item['id'], case[u'文书ID']))
                url = 'http://wenshu.court.gov.cn/content/content?DocID=%s' % (case[u'文书ID'], )
                headers = None
                proxies = proxy.get_proxy("WenshuDetail")
                response = requests.get(url=url, headers=headers, proxies=proxies, timeout=15)
                if response.status_code != 200:
                    logging.info('case-fetch fails')
                    time.sleep(10)
                    continue
                content = response.text
                DbHelper.execute(
                    "INSERT INTO wenshu_web_detail(doc_id, summary, detail, crawl_status) VALUES (%s, %s, %s, %s)",
                    (case[u'文书ID'], json.dumps(case), content, crawl_status_success))
                logging.info('case inserted')
                time.sleep(3)
            DbHelper.execute(
                'UPDATE wenshu_web SET parse_status = %s WHERE id = %s ',
                (parse_status_success, item['id']))
def main():
    site = 'shanghai_web'
    batch = 100
    data_table = 'common_data_all'
    while True:
        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.src_gsgs, b.src_qygs, b.src_bmgs, b.src_sfgs
            from (
                select id from gs_shanghai_company where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, src_gsgs, src_qygs, src_bmgs, src_sfgs from gs_shanghai_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (15, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = item['src_gsgs'], item['src_qygs'], item['src_bmgs'], item['src_sfgs']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (company_id,))
                result = IcWebParser().parsing([etree.HTML(text=gsgs), etree.HTML(text=qygs),
                                                etree.HTML(text=bmgs), etree.HTML(text=sfgs)])
                StoreHelper.store_data(data_table=data_table, company_id=company_id, site=site,
                                       crawl_time=update_time,
                                       data={u'gsgs': json.dumps(result[u'工商公示信息']),
                                             u'qygs': json.dumps(result[u'企业公示信息']),
                                             u'bmgs': json.dumps(result[u'其他部门公示信息']),
                                             u'sfgs': json.dumps(result[u'司法协助公示信息'])})
                logging.getLogger().info(" data inserted ")
                DbHelper.execute(
                    "update gs_shanghai_company set parse_status = %s, data_table_name = %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute(
                    "update gs_shanghai_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main(): site = "tianyancha_web" batch = 200 while True: logging.getLogger().info(" Batch begins ") stime = time.time() sql = ''' select b.id, b.crawl_time, b.src_detail from ( select id from tianyancha_web_company where crawl_status = %s and parse_status = %s limit %s ) as a left join (select id, crawl_time, src_detail from tianyancha_web_company) as b on a.id = b.id ''' items = DbHelper.fetchmany(sql, (1, 0, batch)) if items is None or len(items) == 0: time.sleep(10) continue for item in items: logging.getLogger().info(" begin to parse company-id : %s " % (item[0], )) data_table = "common_data_all" try: # parse html page company_id, crawl_time, src_detail = item[0], item[1], item[2] StoreHelper.store_data(data_table=data_table, company_id=company_id, site=site, crawl_time=crawl_time, data={u'detail': src_detail}) logging.getLogger().info(" data stored ") # update parse status DbHelper.execute( "update tianyancha_web_company set parse_status = %s, data_table_name =%s where id = %s", (1, data_table, item[0])) logging.getLogger().info( " parse status updated, and data_table_name inserted ") except Exception, err: logging.getLogger().exception(err) logging.getLogger().info( "exception/err occurs, company id: %s" % (company_id, )) DbHelper.execute( "update tianyancha_web_company set parse_status = %s where id = %s ", (2, company_id)) continue logging.getLogger().info( " the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
    retrieved_config_site = DbHelper.fetchmany(stmt='SELECT priority_code, site FROM config_site')
    config_site = dict(zip(map(lambda x: x['site'], retrieved_config_site),
                           map(lambda x: x['priority_code'], retrieved_config_site)))
    batch, site, index, doc_type = 10, 'xizhi_web', 'drdata_qyk', 'BusinessInfo'
    # parse_status_success: 3
    # es_status: 1      success
    # es_status: other  not yet processed
    parse_status_success, es_status_success = 3, 1
    select_company = (" SELECT id, company_name FROM xizhi_web_company "
                      "where parse_status = %s and es_status != %s limit %s ")
    while True:
        items = DbHelper.fetchmany(stmt=select_company,
                                   data=(parse_status_success, es_status_success, batch))
        logging.info('batch begins, the size is %s:' % (len(items), ))
        batch_begin_time = time.time()
        if len(items) == 0:
            logging.info("no data on condition found in db")
            break
        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item, config_site=config_site, site=site, index=index,
                                   doc_type=doc_type, es_status_success=es_status_success)
            except ConvertException as e:
                logging.getLogger().exception(e)
                raise e
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(10)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
        logging.info('batch ends, size is: %s, costs:%s' % (len(items), time.time() - batch_begin_time))
def handle_single_item(item, **kwargs):
    company_id, company_name, parsed_content = (item['id'], item['company_name'].strip(),
                                                item['parsed_content'])
    company_id_es = ETLTools.generate_company_id(company_name)
    logging.info("starts to process company: %s, id: %s, the id of es document is: %s "
                 % (company_name, company_id, company_id_es))
    status = EsHelper.check(index=kwargs['index'], doc_type=kwargs['doc_type'], doc_id=company_id_es,
                            site=kwargs['site'], config_site=kwargs['config_site'])
    if status == 2:  # already indexed by an equal- or higher-priority source
        logging.info("company exists in es")
        return
    converted = QycxbWebEsConvert.convert(parsed_content)
    converted['CmpName'] = company_name.strip()
    converted['Meta'] = {
        'Source': kwargs['site'],
        'Time': datetime.datetime.now().date().__str__()
    }
    if status == 0:  # not yet in es
        EsHelper.es.index(index=kwargs['index'], doc_type=kwargs['doc_type'], id=company_id_es,
                          body=json.dumps(converted, ensure_ascii=False, encoding="utf-8"))
        logging.info("inserted into es")
    else:  # status == 1: present in es but from a lower-priority source, so update it
        # the update API expects the partial document under the 'doc' key
        EsHelper.es.update(index='drdata_qyk', doc_type='BusinessInfo', id=company_id_es,
                           body=json.dumps({'doc': converted}, ensure_ascii=False, encoding="utf-8"))
        logging.info("updated into es")
    DbHelper.execute("UPDATE qycxb_web_company set es_status = %s WHERE id = %s ",
                     (kwargs['es_status_success'], company_id))
def main():
    batch = 100
    site = 'qixin_weixin'
    data_table = 'common_data_all'
    while True:
        stime = time.time()
        sql = '''
            select b.id, b.crawl_time, b.src_homepageinfo, b.src_basicinfo, b.src_changeinfo
            from (
                select id from qixin_weixin_company where crawl_status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, crawl_time, src_homepageinfo, src_basicinfo, src_changeinfo from qixin_weixin_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (7, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time = item['id'], item['crawl_time']
            pageinfo, basicinfo, changeinfo = (item['src_homepageinfo'], item['src_basicinfo'],
                                               item['src_changeinfo'])
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (company_id,))
                # 1. store the company data
                StoreHelper.store_data(data_table=data_table, company_id=company_id, site=site,
                                       crawl_time=crawl_time,
                                       data={u'pageinfo': pageinfo, u'basicinfo': basicinfo,
                                             u'changeinfo': changeinfo})
                logging.getLogger().info(" data stored ")
                # 2. mark the row as parsed and record data_table_name
                DbHelper.execute(
                    "update qixin_weixin_company set parse_status = %s, data_table_name = %s where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception as err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute(
                    "update qixin_weixin_company set parse_status = %s where id = %s",
                    [2, company_id])
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
    batch = 100
    while True:
        stime = time.time()
        logging.getLogger().info('Batch begins ')
        sql = '''
            select b.id, b.update_time, b.basic_info
            from (
                select id from qycxb_web_company where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, basic_info from qycxb_web_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue
        for item in items:
            company_id, update_time, basic_info = item[0], item[1], item[2]
            logging.getLogger().info(" begin to parse company-id : %s " % (company_id, ))
            try:
                # parse html page
                detail = parse(basic_info)
                # persist parsed company data into database
                DbHelper.execute(
                    "update qycxb_web_company set parse_status = %s, parsed_content = %s where id = %s",
                    (1, json.dumps(detail), company_id))
                logging.getLogger().info(" parse status updated, and parsed content inserted ")
            except (ParseException, Exception) as err:
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id, ))
                logging.getLogger().exception(err)
                DbHelper.execute(
                    "update qycxb_web_company set parse_status = %s where id = %s ",
                    (2, company_id))
                continue
        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
    config_site = DbHelper.fetchmany(stmt='SELECT priority_code, site FROM config_site')
    config_site = dict(zip(map(lambda x: x['site'], config_site),
                           map(lambda x: x['priority_code'], config_site)))
    site = 'czzx_web'
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'
    '''
    es_status: 13  success
    es_status: 14  already exists
    '''
    es_status_success = 13
    es_status_exists = 14
    fetch_rows_limit = 2
    select_company = (" select id, company_name, es_status from chuanzhong_web_company "
                      "where es_status != %s and es_status != %s limit %s")
    while True:
        logging.info("round starts")
        items = DbHelper.fetchmany(stmt=select_company,
                                   data=(es_status_success, es_status_exists, fetch_rows_limit))
        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item, config_site=config_site, site=site, index=index,
                                   doc_type=doc_type, es_status_success=es_status_success,
                                   es_status_exists=es_status_exists)
            except ConvertException as ce:
                logging.getLogger().exception(ce)
                raise ce
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(60)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
        logging.info("round ends")
def main(): site = "xizhi_web" batch = 100 while True: begin = time.time() sql = ''' select b.id, b.update_time, b.data from ( select id from xizhi_web_company where status = %s and parse_status = %s limit %s ) as a left join (select id, update_time, data from xizhi_web_company ) as b on a.id = b.id ''' items = DbHelper.fetchmany(sql, (1, 0, batch)) if items is None or len(items) == 0: time.sleep(10) continue for item in items: data_table = "common_data_all" company_id, crawl_time, data = item['id'], item['update_time'], item['data'] logging.getLogger().info(" begin to parse company-id : %s " % (company_id,)) try: # parse html page detail = parse(data) # persist parsed company data into database StoreHelper.store_data(data_table=data_table, company_id=company_id, site=site, crawl_time=crawl_time, data={u'data': json.dumps(detail)}) logging.getLogger().info(" data stored ") # update parse status DbHelper.execute("update xizhi_web_company set parse_status = %s, data_table_name =%s where id = %s", (3, data_table, company_id)) logging.getLogger().info(" parse status updated, and data_table_name inserted ") except Exception, err: logging.getLogger().exception(err) logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,)) DbHelper.execute("update xizhi_web_company set parse_status = %s where id = %s ", (2, company_id)) continue logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - begin))
def main():
    index = 'drdata_qyk'
    doc_type = 'RmfyggMessage'
    site = 'http://rmfygg.court.gov.cn/psca/lgnot/bulletin/page/'
    '''
    es_status 0  not yet imported
    es_status 1  imported
    es_status 2  already exists
    '''
    es_status_success = 1
    es_status_exists = 2
    fetch_rows_limit = 1
    stmt = ('select id, old_data, web_data from gonggao_data '
            'where es_status != %s and es_status != %s limit %s')
    while True:
        logging.info("round starts")
        items = DbHelper.fetchmany(stmt=stmt,
                                   data=(es_status_success, es_status_exists, fetch_rows_limit))
        for item in items:
            time_begin = time.time()
            try:
                handle_single_item(item, index=index, doc_type=doc_type, site=site,
                                   es_status_success=es_status_success,
                                   es_status_exists=es_status_exists)
            except ConvertException as ce:
                logging.getLogger().exception(ce)
                raise ce
            except Exception as e:
                logging.getLogger().exception(e)
                time.sleep(60)
            logging.info('cost: {0:f}'.format(time.time() - time_begin))
        logging.info("round ends")
def main():
    config_site = DbHelper.fetchmany(stmt='SELECT priority_code, site FROM config_site')
    config_site = dict(zip(map(lambda x: x['site'], config_site),
                           map(lambda x: x['priority_code'], config_site)))
    batch = 10
    site = None
    index = 'drdata_qyk'
    doc_type = 'BusinessInfo'
    '''
    parse_status_success: 1
    es_status: 2      success
    es_status: other  not yet processed
    '''
    parse_status_success = 1
    es_status_success = 2
    select_company = " SELECT id, company_name FROM %s where parse_status = %s and etl_status != %s limit %s "
    configs = [
        {'site': 'shanghai_web', 'table': 'gs_shanghai_company', 'finished': False},
        {'site': 'fujian_web', 'table': 'gs_fujian_company', 'finished': False},
        {'site': 'hebei_web', 'table': 'gs_hebei_company', 'finished': False},
        {'site': 'hunan_web', 'table': 'gs_hunan_company', 'finished': False},
        {'site': 'yunnan_web', 'table': 'gs_yunnan_company', 'finished': False},
    ]
    while True:
        if len(filter(lambda x: not x['finished'], configs)) == 0:
            logging.info(" es etl finished, process is going to close")
            break
        for conf in configs:
            if conf['finished']:
                continue
            # the table name cannot be bound as a query parameter, so it is interpolated here
            items = DbHelper.fetchmany(stmt=select_company
                                       % (conf['table'], parse_status_success, es_status_success, batch))
            if len(items) == 0:
                conf['finished'] = True
            for item in items:
                time_begin = time.time()
                try:
                    handle_single_item(item, config_site=config_site, site=conf['site'],
                                       table=conf['table'], index=index, doc_type=doc_type,
                                       es_status_success=es_status_success)
                except ConvertException as e:
                    logging.getLogger().exception(e)
                    raise e
                except Exception as e:
                    logging.getLogger().exception(e)
                    time.sleep(10)
                logging.info('cost: {0:f}'.format(time.time() - time_begin))
def process():
    # get un-crawled company-keyword list
    keywords_response = requests.get(
        'http://10.51.1.201:3352/getKeywords',
        params={"data": json.dumps({"field": "SoftwareCopyrightStatus", "status": 0},
                                   ensure_ascii=False)})
    if keywords_response.status_code != 200:
        time.sleep(10)
        return
    else:
        companies = json.loads(keywords_response.text)
        logging.getLogger("Crawler").info("Get Companies From ES, And Size is : %s" % (len(companies), ))
    for company in companies:
        cmpid, cmpname = company["Id"], company['CmpName']
        logging.getLogger("Crawler").info("Begins to Crawl Info, the CmpName:%s, CmpId:%s "
                                          % (cmpname, cmpid))
        url_template = ("http://www.ccopyright.com.cn/cpcc/RRegisterAction.do?method=list&no=fck"
                        "&sql_name=&sql_regnum=&sql_author=%s&curPage=1")
        url = url_template % (quote(cmpname.replace(u"・", u"").encode("gbk")))
        headers = None
        proxies = proxy.get_proxy("SoftWareCopyright")
        crawling_response = requests.get(url=url, headers=headers, proxies=proxies, timeout=15)
        if crawling_response.status_code != 200:
            time.sleep(10)
            continue
        content = crawling_response.text
        try:
            parsed_content = parse(content)
        except Exception as e:
            logging.getLogger("Crawler").exception("Exceptions occur when parsing crawled page")
            logging.getLogger("Crawler").exception(e)
            continue
        if len(parsed_content) == 0:
            logging.getLogger("Crawler").info("No software copyright found")
        else:
            DbHelper.execute("INSERT INTO software_copyright(cmpid, src) VALUES(%s, %s)",
                             (cmpid, content))
            parse_status = 1
            DbHelper.execute(
                "UPDATE software_copyright SET parse_status = %s, parsed_content = %s where cmpid = %s ",
                (parse_status, json.dumps(parsed_content), cmpid))
            logging.getLogger("Crawler").info("Page Parsed Successfully")
        # notify the keyword service that this company has been handled
        notify_response = requests.get(
            url=url,
            params={"data": json.dumps([{"Id": cmpid, "field": "SoftwareCopyrightStatus", "status": 1}],
                                       ensure_ascii=False)})
        if notify_response.status_code != 200:
            logging.getLogger("Crawler").info("Action, which notifies es, fails.")
        logging.getLogger("Crawler").info("Info Crawled Successfully")
        time.sleep(2)
def process(cookie, pagenum):
    url = 'http://webdata.cqccms.com.cn/webdata/query/CCCCerti.do'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Length': '110',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': cookie,
        'Host': 'webdata.cqccms.com.cn',
        'Origin': 'http://webdata.cqccms.com.cn',
        'Referer': 'http://webdata.cqccms.com.cn/webdata/query/CCCCerti.do;jsessionid=qxkxYRZYCCtHGGd17y3J5TlsJqNvSGLGTt1hVcpp618JkmTfpp1T!-510284702',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'
    }
    data = {
        'keyword': u"公司".encode('GBK'),
        '_h_select_chaxuntype': 'appname',
        'chaxuntype': 'appname',
        'pageNumber': pagenum,
        'pageSize': 10,
        'sortColumns': 'null'
    }
    proxies = proxy.get_proxy("products_info")
    crawling_response = requests.post(url=url, data=data, headers=headers, proxies=proxies)
    # crawling_response = requests.post(url=url, data=data, headers=headers)
    if crawling_response.status_code != 200:
        time.sleep(10)
        # bail out on a failed request instead of parsing an error page
        return 0
    content = crawling_response.text
    print content
    parsed_content = parse(content)
    if len(parsed_content) == 0:
        logging.getLogger("Crawler").info("Nothing parsed ")
        return 0
    try:
        DbHelper.executemany(
            "INSERT INTO product(certificat_no,applicant,manufacturer,factory,product,model_specification,standard,"
            "issue_date,original_issue_date,expiry_date,`status`,status_changing_time,reason,attachment) "
            "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            data=parsed_content)
    except IntegrityError as e:
        logging.getLogger("Crawler").exception("Exceptions Occur During One Batch Process")
        logging.getLogger("Crawler").exception(e)
    except Exception as e:
        logging.getLogger("Crawler").exception("Exceptions Occur When Inserting Into DB ")
        logging.getLogger("Crawler").exception(e)
        raise e