def handle_single_item(item, **kwargs):
    """Export one qichacha_weixin company row into Elasticsearch.

    item   -- DB row with 'id', 'company_name' and 'data_gsxx' (raw payload,
              may be None).
    kwargs -- must carry: index, doc_type, site, config_site,
              es_status_success.

    Skips rows whose ES document already exists with equal/higher priority;
    otherwise inserts or updates the document, then marks the MySQL row
    exported.
    """
    company_id = item['id']
    company_name = item['company_name'].strip()
    parsed_content = item['data_gsxx']
    company_id_es = ETLTools.generate_company_id(company_name)
    logging.info(
        "starts to process company: %s, id: %s, the id of es document is: %s "
        % (company_name, company_id, company_id_es))
    status = EsHelper.check(index=kwargs['index'],
                            doc_type=kwargs['doc_type'],
                            doc_id=company_id_es,
                            site=kwargs['site'],
                            config_site=kwargs['config_site'])
    if status == 2:  # document already present with equal/higher priority
        logging.info("company exists in es")
        return
    if parsed_content is None:
        # Row carries no payload: fall back to the shared raw-data table.
        select_company_parsed_content = 'SELECT value from common_data_all where company_id = %s and site = %s and key_desc = %s '
        parsed_content = DbHelper.fetchone(
            select_company_parsed_content,
            data=(company_id, kwargs['site'], 'data_gsxx'))['value']
    converted = Qichacha_WeiXin_EsConvert.convert(parsed_content)
    converted['CmpName'] = company_name  # already stripped above
    converted['Meta'] = {
        'Source': kwargs['site'],
        'Time': datetime.datetime.now().date().__str__()
    }
    body = json.dumps(converted, ensure_ascii=False, encoding="utf-8")
    if status == 0:  # not present in ES yet -> insert
        EsHelper.es.index(index=kwargs['index'],
                          doc_type=kwargs['doc_type'],
                          id=company_id_es,
                          body=body)
        logging.info("inserted into es")
    else:  # status == 1: present but lower priority -> overwrite
        # Was hard-coded to index='drdata_qyk'/doc_type='BusinessInfo'; target
        # the same index/doc_type the existence check ran against.
        # NOTE(review): elasticsearch-py's update() normally expects
        # body={'doc': ...} -- confirm EsHelper.es accepts a bare document.
        EsHelper.es.update(index=kwargs['index'],
                           doc_type=kwargs['doc_type'],
                           id=company_id_es,
                           body=body)
        logging.info("updated into es")
    DbHelper.execute(
        "UPDATE qichacha_weixin_company set es_status = %s WHERE id = %s ",
        (kwargs['es_status_success'], company_id))
def handle_single_item(item, **kwargs):
    """Turn one gonggao_data row into ES documents and index the new ones.

    As soon as one generated document is already present in ES, the row is
    flagged with es_status_exists and processing stops; otherwise the row is
    flagged with es_status_success after all documents are handled.
    """
    row_id = item['id']
    legacy_payload = item['old_data']
    page_payload = item['web_data']
    logging.info("starts to process gonggao_dta id: %s" % (row_id))
    for doc in GonggaoWebEsConvert.convert(legacy_payload, page_payload):
        es_doc_id = generate_company_id(str(row_id) + doc['CmpName'])
        existence = check_existence(es_doc_id,
                                    index=kwargs['index'],
                                    doc_type=kwargs['doc_type'])
        doc['Meta'] = {
            'Source': kwargs['site'],
            'Time': datetime.datetime.now().date().__str__()
        }
        if existence == 0:  # document not in ES yet -> index it
            logging.info("starts to index id : %s,company : %s,doc_id : %s" %
                         (row_id, doc['CmpName'], es_doc_id))
            EsHelper.es.index(index=kwargs['index'],
                              doc_type=kwargs['doc_type'],
                              id=es_doc_id,
                              body=json.dumps(doc,
                                              ensure_ascii=False,
                                              encoding="utf-8"))
            logging.info("inserted into es")
        elif existence == 1:  # already present -> flag the row and bail out
            DbHelper.execute(
                "UPDATE gonggao_data set es_status = %s WHERE id = %s ",
                (kwargs['es_status_exists'], row_id))
            logging.info("company exists in es")
            return
    DbHelper.execute("UPDATE gonggao_data set es_status = %s WHERE id = %s ",
                     (kwargs['es_status_success'], row_id))
def main():
    """Endless batch loop: copy crawled xysj_weixin rows into common_data_all.

    Fetches crawled-but-unparsed rows in batches, stores each raw src_list
    payload via StoreHelper, then marks the row parsed and records the
    destination table.  Per-row failures are logged and skipped; runs until
    killed.
    """
    site = 'xysj_weixin'
    batch = 100
    data_table = 'common_data_all'
    logger = logging.getLogger()
    while True:
        stime = time.time()
        items = DbHelper.fetchmany(
            "select id, crawl_time, src_list from xysj_weixin_company where crawl_status = 1 and "
            " parse_status = 0 limit %s ", (batch, ))
        if not items:  # nothing pending: back off before polling again
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time, src_list = item[0], item[1], item[2]
            try:
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'info': src_list})
                logger.info(" data stored ")
                DbHelper.execute(
                    "update xysj_weixin_company set parse_status = %s, data_table_name =%s where id = %s",
                    (1, data_table, company_id))
                logger.info(" parse status updated ")
            except Exception as err:  # py3-compatible form of `except E, err`
                logger.exception(err)
                continue
        logger.info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
    """Endless batch loop: move crawled gs_guangdong rows into common_data_all.

    Each row's four public-notice payloads (gongshang/qiye/other/judicial) are
    stored under one company_id; rows that fail get parse_status = 2 and are
    skipped.  Runs until killed.
    """
    site = 'guangdong_weixin'
    batch = 100
    data_table = 'common_data_all'
    logger = logging.getLogger()
    while True:
        stime = time.time()
        # Self-join so LIMIT applies before pulling the large payload columns.
        sql = '''
            select b.id, b.update_time, b.src_gongshang, b.src_qiye, b.src_other, b.src_judicial
            from (
                select id from gs_guangdong_company
                where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, src_gongshang, src_qiye, src_other, src_judicial
                from gs_guangdong_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, [15, 0, batch])
        if not items:  # nothing pending: back off before polling again
            time.sleep(10)
            continue
        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs = item['src_gongshang']
            qygs = item['src_qiye']
            bmgs = item['src_other']
            sfgs = item['src_judicial']
            try:
                # Original logged item[0], which KeyErrors on dict rows; use
                # the id unpacked above instead.
                logger.info(" begin to parse company-id : %s " % (company_id, ))
                # 1. store the four raw payloads
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={
                                           u'gsgs': gsgs,
                                           u'qygs': qygs,
                                           u'bmgs': bmgs,
                                           u'sfgs': sfgs
                                       })
                logger.info(" data stored ")
                # 2. mark parsed and record the destination table
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s, data_table_name =%s where id = %s",
                    (1, data_table, company_id))
                logger.info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logger.exception(err)
                logger.info(
                    "exception/err occurs, company id: %s" % (company_id))
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s where id = %s",
                    [2, company_id])
                continue
        logger.info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
    """Endless batch loop: move crawled qichacha_weixin rows into common_data_all.

    Stores each row's raw `data` payload under key 'detail', then marks the
    row parsed; rows that fail get parse_status = 2.  Runs until killed.
    """
    site = 'qichacha_weixin'
    batch = 100
    data_table = 'common_data_all'
    logger = logging.getLogger()
    while True:
        stime = time.time()
        # Self-join so LIMIT applies before pulling the large payload column.
        sql = '''
            select b.id, b.update_time, b.data
            from (
                select id from qichacha_weixin_company
                where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, data from qichacha_weixin_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if not items:  # nothing pending: back off before polling again
            time.sleep(10)
            continue
        for item in items:
            company_id = item['id']
            update_time = item['update_time']
            data = item['data']
            try:
                # Original mixed item[0] with dict-key access; dict rows have
                # no integer index, so use the unpacked id consistently.
                logger.info(" begin to parse company-id : %s " % (company_id, ))
                # 1. store the raw company payload
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={u'detail': data})
                logger.info(" data stored ")
                # 2. mark parsed and record the destination table
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s, data_table_name =%s where id = %s",
                    [1, data_table, company_id])
                logger.info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logger.exception(err)
                logger.info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue
        logger.info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
    """Endless loop: fetch court-judgment detail pages for company cases.

    Reads unparsed wenshu_web rows, and for every company-related case in the
    row's JSON case list fetches the detail page through a proxy, inserts it
    into wenshu_web_detail, then marks the source row parsed.
    """
    batch = 1
    parse_status_success = 1
    crawl_status_success = 1
    while True:
        items = DbHelper.fetchmany(
            "SELECT id, web_data FROM wenshu_web WHERE parse_status != %s limit %s",
            (parse_status_success, batch))
        # Guard added: original iterated items directly, crashing on None and
        # busy-spinning on an empty result.
        if not items:
            time.sleep(10)
            continue
        for item in items:
            case_list = json.loads(item['web_data'])
            for case in case_list:
                if 'Count' in case:  # summary entry, not a real case
                    continue
                case_name = case[u'案件名称']
                if u'公司' not in case_name:  # only company-related cases
                    continue
                logging.info('starts to handle id: %s, case id: %s ' %
                             (item['id'], case[u'文书ID']))
                url = 'http://wenshu.court.gov.cn/content/content?DocID=%s' % (
                    case[u'文书ID'], )
                headers = None
                proxies = proxy.get_proxy("WenshuDetail")
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=15)
                if response.status_code != 200:
                    logging.info('case-fetch fails')
                    time.sleep(10)
                    continue
                content = response.text
                DbHelper.execute(
                    "INSERT INTO wenshu_web_detail(doc_id, summary, detail, crawl_status) VALUES (%s, %s, %s, %s)",
                    (case[u'文书ID'], json.dumps(case), content,
                     crawl_status_success))
                logging.info('case inserted')
                time.sleep(3)  # throttle requests to the court site
            DbHelper.execute(
                'UPDATE wenshu_web SET parse_status = %s WHERE id = %s ',
                (parse_status_success, item['id']))
def main():
    """Endless batch loop: parse gs_shanghai HTML pages into common_data_all.

    Parses each row's four raw HTML sections with IcWebParser, stores the
    structured result, then marks the row parsed; rows that fail get
    parse_status = 2.  Runs until killed.
    """
    site = 'shanghai_web'
    batch = 100
    data_table = 'common_data_all'
    logger = logging.getLogger()
    while True:
        stime = time.time()
        # Self-join so LIMIT applies before pulling the large payload columns.
        sql = '''
            select b.id, b.update_time, b.src_gsgs, b.src_qygs, b.src_bmgs, b.src_sfgs
            from (
                select id from gs_shanghai_company
                where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, src_gsgs, src_qygs, src_bmgs, src_sfgs
                from gs_shanghai_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (15, 0, batch))
        if not items:  # nothing pending: back off before polling again
            time.sleep(10)
            continue
        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = (item['src_gsgs'], item['src_qygs'],
                                      item['src_bmgs'], item['src_sfgs'])
            try:
                # Original logged item[0], which KeyErrors on dict rows; use
                # the id unpacked above instead.
                logger.info(" begin to parse company-id : %s " % (company_id,))
                result = IcWebParser().parsing([
                    etree.HTML(text=gsgs),
                    etree.HTML(text=qygs),
                    etree.HTML(text=bmgs),
                    etree.HTML(text=sfgs)
                ])
                StoreHelper.store_data(
                    data_table=data_table,
                    company_id=company_id,
                    site=site,
                    crawl_time=update_time,
                    data={
                        u'gsgs': json.dumps(result[u'工商公示信息']),
                        u'qygs': json.dumps(result[u'企业公示信息']),
                        u'bmgs': json.dumps(result[u'其他部门公示信息']),
                        u'sfgs': json.dumps(result[u'司法协助公示信息'])
                    })
                logger.info(" data inserted ")
                DbHelper.execute(
                    "update gs_shanghai_company set parse_status = %s, data_table_name= %s where id = %s",
                    (1, data_table, company_id))
                logger.info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logger.exception(err)
                logger.info(
                    "exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute(
                    "update gs_shanghai_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue
        logger.info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
    """Endless batch loop: move crawled tianyancha rows into common_data_all.

    Stores each row's raw src_detail payload under key 'detail', then marks
    the row parsed; rows that fail get parse_status = 2.  Runs until killed.
    """
    site = "tianyancha_web"
    batch = 200
    data_table = "common_data_all"  # loop-invariant, hoisted out of the loop
    logger = logging.getLogger()
    while True:
        logger.info(" Batch begins ")
        stime = time.time()
        # Self-join so LIMIT applies before pulling the large payload column.
        sql = '''
            select b.id, b.crawl_time, b.src_detail
            from (
                select id from tianyancha_web_company
                where crawl_status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, crawl_time, src_detail from tianyancha_web_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if not items:  # nothing pending: back off before polling again
            time.sleep(10)
            continue
        for item in items:
            # Unpack before the try-block so the except-handler never sees an
            # unbound (or stale, from a previous iteration) company_id.
            company_id, crawl_time, src_detail = item[0], item[1], item[2]
            logger.info(" begin to parse company-id : %s " % (company_id, ))
            try:
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'detail': src_detail})
                logger.info(" data stored ")
                # mark parsed and record the destination table
                DbHelper.execute(
                    "update tianyancha_web_company set parse_status = %s, data_table_name =%s where id = %s",
                    (1, data_table, company_id))
                logger.info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logger.exception(err)
                logger.info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update tianyancha_web_company set parse_status = %s where id = %s ",
                    (2, company_id))
                continue
        logger.info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def handle_single_item(item, **kwargs):
    """Export one qycxb_web company row into Elasticsearch.

    item   -- DB row with 'id', 'company_name' and 'parsed_content'.
    kwargs -- must carry: index, doc_type, site, config_site,
              es_status_success.

    Skips rows whose ES document already exists with equal/higher priority;
    otherwise inserts or updates the document, then marks the MySQL row
    exported.
    """
    company_id = item['id']
    company_name = item['company_name'].strip()
    parsed_content = item['parsed_content']
    company_id_es = ETLTools.generate_company_id(company_name)
    logging.info(
        "starts to process company: %s, id: %s, the id of es document is: %s "
        % (company_name, company_id, company_id_es))
    status = EsHelper.check(index=kwargs['index'],
                            doc_type=kwargs['doc_type'],
                            doc_id=company_id_es,
                            site=kwargs['site'],
                            config_site=kwargs['config_site'])
    if status == 2:  # document already present with equal/higher priority
        logging.info("company exists in es")
        return
    converted = QycxbWebEsConvert.convert(parsed_content)
    converted['CmpName'] = company_name  # already stripped above
    converted['Meta'] = {
        'Source': kwargs['site'],
        'Time': datetime.datetime.now().date().__str__()
    }
    body = json.dumps(converted, ensure_ascii=False, encoding="utf-8")
    if status == 0:  # not present in ES yet -> insert
        EsHelper.es.index(index=kwargs['index'],
                          doc_type=kwargs['doc_type'],
                          id=company_id_es,
                          body=body)
        logging.info("inserted into es")
    else:  # status == 1: present but lower priority -> overwrite
        # Was hard-coded to index='drdata_qyk'/doc_type='BusinessInfo'; target
        # the same index/doc_type the existence check ran against.
        # NOTE(review): elasticsearch-py's update() normally expects
        # body={'doc': ...} -- confirm EsHelper.es accepts a bare document.
        EsHelper.es.update(index=kwargs['index'],
                           doc_type=kwargs['doc_type'],
                           id=company_id_es,
                           body=body)
        logging.info("updated into es")
    DbHelper.execute(
        "UPDATE qycxb_web_company set es_status = %s WHERE id = %s ",
        (kwargs['es_status_success'], company_id))
def main():
    """Endless batch loop: move crawled qixin_weixin rows into common_data_all.

    Stores each row's three raw sections (homepage/basic/change info), then
    marks the row parsed; rows that fail get parse_status = 2.  Runs until
    killed.
    """
    batch = 100
    site = 'qixin_weixin'
    data_table = 'common_data_all'
    logger = logging.getLogger()
    while True:
        stime = time.time()
        # Self-join so LIMIT applies before pulling the large payload columns.
        sql = '''
            select b.id, b.crawl_time, b.src_homepageinfo, b.src_basicinfo, b.src_changeinfo
            from (
                select id from qixin_weixin_company
                where crawl_status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, crawl_time, src_homepageinfo,src_basicinfo,src_changeinfo
                from qixin_weixin_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (7, 0, batch))
        if not items:  # nothing pending: back off before polling again
            time.sleep(10)
            continue
        for item in items:
            company_id, crawl_time = item['id'], item['crawl_time']
            pageinfo = item['src_homepageinfo']
            basicinfo = item['src_basicinfo']
            changeinfo = item['src_changeinfo']
            try:
                logger.info(" begin to parse company-id : %s " % (company_id,))
                # 1. store the three raw payloads
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={
                                           u'pageinfo': pageinfo,
                                           u'basicinfo': basicinfo,
                                           u'changeinfo': changeinfo
                                       })
                logger.info(" data stored ")
                # 2. mark parsed and record the destination table
                DbHelper.execute(
                    "update qixin_weixin_company set parse_status = %s, data_table_name =%s where id = %s",
                    (1, data_table, company_id))
                logger.info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logger.exception(err)
                logger.info(
                    "exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute(
                    "update qixin_weixin_company set parse_status = %s where id = %s",
                    [2, company_id])
                continue
        logger.info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
    """Endless batch loop: parse xizhi_web rows into common_data_all.

    Parses each row's raw `data` payload with parse(), stores the JSON result,
    then marks the row parsed (status 3); rows that fail get parse_status = 2.
    Runs until killed.
    """
    site = "xizhi_web"
    batch = 100
    data_table = "common_data_all"  # loop-invariant, hoisted out of the loop
    logger = logging.getLogger()
    while True:
        begin = time.time()
        # Self-join so LIMIT applies before pulling the large payload column.
        sql = '''
            select b.id, b.update_time, b.data
            from (
                select id from xizhi_web_company
                where status = %s and parse_status = %s limit %s
            ) as a
            left join (
                select id, update_time, data from xizhi_web_company
            ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if not items:  # nothing pending: back off before polling again
            time.sleep(10)
            continue
        for item in items:
            company_id = item['id']
            crawl_time = item['update_time']
            data = item['data']
            logger.info(" begin to parse company-id : %s " % (company_id,))
            try:
                # parse the raw page payload
                detail = parse(data)
                # persist parsed company data into the shared table
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'data': json.dumps(detail)})
                logger.info(" data stored ")
                # mark parsed (this pipeline uses 3 for success) and record
                # the destination table
                DbHelper.execute(
                    "update xizhi_web_company set parse_status = %s, data_table_name =%s where id = %s",
                    (3, data_table, company_id))
                logger.info(
                    " parse status updated, and data_table_name inserted ")
            except Exception as err:
                logger.exception(err)
                logger.info(
                    "exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute(
                    "update xizhi_web_company set parse_status = %s where id = %s ",
                    (2, company_id))
                continue
        logger.info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - begin))
def process():
    """Run one crawl round for software-copyright records.

    Pulls company keywords from the ES helper service, scrapes
    ccopyright.com.cn for each company, persists raw page + parsed records in
    MySQL, and notifies the service of completion.  Returns early (after a
    back-off sleep) when the keyword service is unavailable.
    """
    # get un-crawled company-keyword list
    keywords_response = requests.get(
        'http://10.51.1.201:3352/getKeywords',
        params={
            "data":
            json.dumps({
                "field": "SoftwareCopyrightStatus",
                "status": 0
            },
                       ensure_ascii=False)
        })
    if keywords_response.status_code != 200:
        # keyword service unavailable: back off and let the caller retry
        time.sleep(10)
        return
    else:
        companies = json.loads(keywords_response.text)
        logging.getLogger("Crawler").info(
            "Get Companies From ES, And Size is : %s" % (len(companies), ))
    for company in companies:
        cmpid, cmpname = company["Id"], company['CmpName']
        logging.getLogger("Crawler").info(
            "Begins to Crawl Info, the CmpName:%s, CmpId:%s " %
            (cmpname, cmpid))
        url_template = "http://www.ccopyright.com.cn/cpcc/RRegisterAction.do?method=list&no=fck&sql_name=&sql_regnum=&sql_author=%s&curPage=1"
        # Target site expects a GBK-encoded query; the middle-dot character is
        # stripped from the name first.
        url = url_template % (quote(cmpname.replace(u"・", u"").encode("gbk")))
        headers = None
        proxies = proxy.get_proxy("SoftWareCopyright")
        crawling_response = requests.get(url=url,
                                         headers=headers,
                                         proxies=proxies,
                                         timeout=15)
        if crawling_response.status_code != 200:
            # fetch failed: back off and move on to the next company
            time.sleep(10)
            continue
        content = crawling_response.text
        try:
            parsed_content = parse(content)
        except Exception as e:
            logging.getLogger("Crawler").exception(
                "Exceptions occurs when parsing crawled page")
            logging.getLogger("Crawler").exception(e)
            continue
        if len(parsed_content) == 0:
            logging.getLogger("Crawler").info("No software copyright found")
        else:
            # persist the raw page, then attach the parsed records to it
            DbHelper.execute(
                "INSERT INTO software_copyright(cmpid, src) VALUES(%s, %s)",
                (cmpid, content))
            parse_status = 1
            DbHelper.execute(
                "UPDATE software_copyright SET parse_status = %s, parsed_content = %s where cmpid = %s ",
                (parse_status, json.dumps(parsed_content), cmpid))
            logging.getLogger("Crawler").info("Page Parsed Successfully")
        # NOTE(review): this reuses `url` (the ccopyright.com.cn crawl URL)
        # for the ES-notify call; it almost certainly should target a notify
        # endpoint on 10.51.1.201:3352 (the keyword service) instead -- the
        # status update likely never reaches ES. Confirm and fix.
        notify_response = requests.get(
            url=url,
            params={
                "data":
                json.dumps([
                    {
                        "Id": cmpid,
                        "field": "SoftwareCopyrightStatus",
                        "status": 1
                    },
                ],
                           ensure_ascii=False)
            })
        if notify_response.status_code != 200:
            logging.getLogger("Crawler").info(
                "Action, which notify es, fails.")
        logging.getLogger("Crawler").info("Info Crawled Successfully")
        time.sleep(2)  # throttle between companies