class ZdbPedaily_tzsj:
    """Crawler for the investment-event ("inv") section of zdb.pedaily.cn.

    Iterates 769 list pages, opens every company detail page, and writes the
    company profile plus each financing event, the main people, and the
    shareholders into MySQL through ``Util.insert2mysql``.
    """

    # XPaths tried in order for the company introduction text; the first one
    # that matches wins (mirrors the site's several possible page layouts).
    INTRO_XPATHS = (
        '//div[@class="box-fix-l"]/p/text()',
        '//div[@class="box-fix-l"]/p/span/text()',
        '//div[@class="box-fix-l"]/pre/text()',
        '//div[@class="box-fix-l"]/div/div/text()',
        '//div[@class="box-fix-l"]/div/text()',
        '//div[@id="cke_pastebin"]//text()',
        '//div[@class="box-fix-l"]/ul/text()',
    )

    def __init__(self):
        # List pages /inv/p1/ .. /inv/p769/.
        self.urls = ["https://zdb.pedaily.cn/inv/p{}/".format(i)
                     for i in range(1, 770)]
        self.util = Util()
        # NOTE(review): the Cookie value carries a "{}" placeholder for a
        # timestamp; str.format() consumes it on first use, so later
        # .format() calls are no-ops and keep the first timestamp.
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Cookie": "__uid=1452122016; __fromtype=0; ARRAffinity=197ae5372184c64aeca47f780a2e053f3a50366e2bda392cd4bfa3b38e39a929; Hm_lvt_25919c38fb62b67cfb40d17ce3348508=1564455299,1564997145,1565057017,1565061687; BAIDU_SSP_lcr=https://www.baidu.com/link?url=mXXXmWT7-LUN6gg9o-kkJIw_k0SkPj9aL3XGvS6wRVmJjG_3dfydZul0mdFS1rSa&wd=&eqid=cf1c52fe000195ab000000065d48f231; __utma=23980325.1444638820.1563415171.1565057028.1565061688.26; __utmc=23980325; __utmz=23980325.1565061688.26.11.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; Hm_lpvt_25919c38fb62b67cfb40d17ce3348508={}; __utmb=23980325.5.10.1565061688",
            "Host": "zdb.pedaily.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }

    def get_shareholder(self, id_code, detail_html):
        """Insert one row per shareholder of the company keyed by `id_code`.

        detail_html is the parsed lxml tree of the company detail page.
        """
        for si in detail_html.xpath("//table[@class=\"shareholder-info\"]/tbody/tr"):
            shareholder_name = si.xpath("./td[1]/text()")[0]
            shareholder_type = si.xpath("./td[2]/text()")[0]
            # The contribution-amount column may be absent.
            money = si.xpath("./td[3]/text()")
            shareholder_money = money[0] if money else ""
            crawl_time = self.util.get_now_time()
            # NOTE(review): values are spliced into SQL with %-formatting;
            # a quote in the scraped text would break/abuse the statement.
            sql_sharholder = "insert into INV_EVT_SHH_INF(ID,SHH_INF,SHH_TYP,SSCR_CTRB_AMT,INPT_DT) " \
                             "values('%s', '%s', '%s', '%s','%s')" % (
                                 id_code, shareholder_name, shareholder_type,
                                 shareholder_money, crawl_time)
            self.util.insert2mysql("股东信息", sql_sharholder)

    def get_main_people(self, id_code, detail_html):
        """Insert one row per "main person" (name + position) of the company."""
        for person in detail_html.xpath("//div[@class=\"business-people\"]/ul/li"):
            mp_name = person.xpath("./h3/text()")[0]
            mp_position = person.xpath("./p/text()")[0]
            crawl_time = self.util.get_now_time()
            sql_main_people = "insert into INV_EVT_MAIN_PSN_INF(ID, MAIN_PPL_NM, MAIN_PPL_POS, INPT_DT) values('%s', '%s', '%s','%s')" % (
                id_code, mp_name, mp_position, crawl_time)
            self.util.insert2mysql("主要人物", sql_main_people)

    def get_detail_info(self, detail_url):
        """Crawl one company detail page and persist it plus its events."""
        self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
        detail_res = self.util.get_req(url=detail_url, headers=self.headers)
        print(detail_res.status_code)
        if detail_res.status_code != 200:
            return
        # NOTE(review): other call sites pass response.text to get_xpath_obj;
        # passing the response object is preserved here — confirm Util
        # accepts both forms.
        detail_html = self.util.get_xpath_obj(detail_res)
        # --- company profile -------------------------------------------------
        company_name = detail_html.xpath("//h1/text()")[0]
        li = "//div[@class=\"box-fix-l\"]/div/ul/li"
        company_base = detail_html.xpath(li + "[1]/text()")[0]
        company_reg_loc = detail_html.xpath(li + "[2]/text()")[0]
        company_bound_date = detail_html.xpath(li + "[3]/text()")[0]
        company_industry = detail_html.xpath(li + "[4]/text()")[0]
        site = detail_html.xpath(li + "[@class=\"link\"]/a/text()")
        company_site = site[0] if site else ""
        company_intro = ""
        for xp in self.INTRO_XPATHS:
            found = detail_html.xpath(xp)
            if found:
                company_intro = found[0]
                break
        # --- business registration table (may be missing entirely) -----------
        if detail_html.xpath("//div[@id=\"business\"]"):
            def cell(row, col):
                # Text of cell (row, col) in the base-info table.
                return detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[%d]/td[%d]/text()" % (row, col))[0]
            legal_person = cell(1, 2)
            founded_time = cell(1, 4)
            registered_capital = cell(2, 2)
            operational_authority = cell(2, 4)
            registered_num = cell(3, 2)
            approval_date = cell(3, 4)
            organizational_code = cell(4, 2)
            creditfcode = cell(4, 4)
            identification_number = cell(5, 2)
            registration_authority = cell(5, 4)
            enterprise_type = cell(6, 2)
        else:
            legal_person = founded_time = registered_capital = ""
            operational_authority = registered_num = approval_date = ""
            organizational_code = creditfcode = identification_number = ""
            registration_authority = enterprise_type = ""
        id_code = self.util.MD5(company_name + creditfcode)
        # --- financing events: one DB row per event listed on the page -------
        for event in detail_html.xpath("//div[@class=\"list-invest\"]/ul/li"):
            href = event.xpath("./div[@class=\"view\"]/a/@href")[0]
            rz_url = href if href.startswith("http") else "https://zdb.pedaily.cn" + href
            print(rz_url)
            rz_res = self.util.get_req(url=rz_url, headers=self.headers)
            if rz_res.status_code != 200:
                continue
            rz_html = self.util.get_xpath_obj(rz_res.text)
            rz_title = rz_html.xpath("//h1/text()")[0]
            rz_info = "".join(rz_html.xpath("//div[@class=\"info\"]/ul/li//text()"))
            # Fix: the description paragraph can be absent; the old unguarded
            # [0] raised IndexError (the sibling crawler guards this case).
            intro = rz_html.xpath("//div[@id=\"desc\"]/p/text()")
            rz_intro = intro[0] if intro else ""
            crawl_time = self.util.get_now_time()
            sql_rzsj = """insert into INV_EVT_INF(ID,CMP_NM,ORG_TOT_DEPT,REG_PLC_PNT,CMP_SET_UP_TM,AFL_IDT,FORML_WEB,CMP_INTRO,LVRG_NM,LVRG_INF,LVGR_DTL,LGP_INF,SET_UP_TM,REG_CPT,OPR_RIT,REG_NBR,APRV_TM,ORG_ORG_CD_NBR,SOC_CRD_CD,TAX_PSN_RCG_NBR,REG_INSTT,ENTP_TYP,INPT_DT )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % (
                id_code, company_name, company_base, company_reg_loc,
                company_bound_date, company_industry, company_site,
                company_intro, rz_title, rz_info, rz_intro, legal_person,
                founded_time, registered_capital, operational_authority,
                registered_num, approval_date, organizational_code,
                creditfcode, identification_number, registration_authority,
                enterprise_type, crawl_time)
            self.util.insert2mysql("融资公司信息", sql_rzsj)
        self.get_main_people(id_code, detail_html)
        self.get_shareholder(id_code, detail_html)

    def get_items_list(self, res):
        """Walk one list page and crawl every detail link found on it."""
        html = self.util.get_xpath_obj(res)
        for li in html.xpath("//ul[@id=\"inv-list\"]/li"):
            time.sleep(2)  # throttle between detail requests
            href = li.xpath("./div[1]/a/@href")
            if not href:
                continue
            detail_url = "https://zdb.pedaily.cn" + href[0]
            print(detail_url)
            self.get_detail_info(detail_url)

    def run(self):
        """Entry point: crawl every configured list page in order."""
        self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
        for url in self.urls:
            print("列表页:" + url + "开始爬取")
            res = self.util.get_req(url=url, headers=self.headers)
            self.get_items_list(res)
class ZdbPedaily:
    """Crawler for the enterprise section of zdb.pedaily.cn.

    Iterates 769 list pages, opens every enterprise detail page, and writes
    the enterprise profile (with its latest financing event), the main
    people, and the shareholders into MySQL via ``Util.insert2mysql``.
    """

    # XPaths tried in order for the company introduction text.
    INTRO_XPATHS = (
        '//div[@class="box-fix-l"]/p/text()',
        '//div[@class="box-fix-l"]/p/span/text()',
        '//div[@class="box-fix-l"]/pre/text()',
        '//div[@class="box-fix-l"]/div/div/text()',
        '//div[@class="box-fix-l"]/div/text()',
        '//div[@id="cke_pastebin"]//text()',
        '//div[@class="box-fix-l"]/ul/text()',
    )

    def __init__(self):
        # List pages /enterprise/p1/ .. /enterprise/p769/.
        self.urls = [
            "https://zdb.pedaily.cn/enterprise/p{}/".format(i)
            for i in range(1, 770)
        ]
        self.util = Util()
        # NOTE(review): the Cookie holds a "{}" placeholder for a timestamp;
        # str.format() consumes it on first use.
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": "__uid=1452122016; "
                      "__utmc=23980325; "
                      "ARRAffinity=197ae5372184c64aeca47f780a2e053f3a50366e2bda392cd4bfa3b38e39a929; "
                      "BAIDU_SSP_lcr=https://www.baidu.com/link?url=LHrB83UJlUcy6-MhfY_1I-IRwU723Vl0YUkuCsVJ5MlEYZUAvU2Mv5jTfYQ2ZC0u&wd=&eqid=b0d97bf1000ba11a000000065d3018e2; "
                      "Hm_lvt_25919c38fb62b67cfb40d17ce3348508=1563415171,1563433191,1563523111; "
                      "__utma=23980325.1444638820.1563415171.1563433192.1563523112.3; "
                      "__utmz=23980325.1563523112.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; "
                      "__fromtype=1; "
                      "accesstoken=PQZUMOXSH2; "
                      "Hm_lpvt_25919c38fb62b67cfb40d17ce3348508={}; "
                      "__utmb=23980325.10.10.1563523112",
            "Host": "zdb.pedaily.cn",
            "Referer": "https://zdb.pedaily.cn/",
            # Fix: header name previously contained spaces
            # ("Upgrade - Insecure - Requests"), which is not a valid HTTP
            # header field name.
            "Upgrade-Insecure-Requests": "1",
        }

    def get_shareholder(self, id_code, detail_html):
        """Insert one row per shareholder of the enterprise `id_code`."""
        for si in detail_html.xpath("//table[@class=\"shareholder-info\"]/tbody/tr"):
            shareholder_name = si.xpath("./td[1]/text()")[0]
            shareholder_type = si.xpath("./td[2]/text()")[0]
            # The contribution-amount column may be absent.
            money = si.xpath("./td[3]/text()")
            shareholder_money = money[0] if money else ""
            crawl_time = self.util.get_now_time()
            # NOTE(review): %-spliced SQL; relies on scraped text being
            # quote-free.
            sql_sharholder = "insert into INV_EVT_ENTP_SHH_INF(ID,SHH_INF,SHH_TYP,SSCR_CTRB_AMT,INPT_DT) " \
                             "values('%s', '%s', '%s', '%s','%s')" \
                             % (id_code, shareholder_name, shareholder_type,
                                shareholder_money, crawl_time)
            self.util.insert2mysql("股东信息", sql_sharholder)

    def get_main_people(self, id_code, detail_html):
        """Insert one row per "main person" (name + position)."""
        for person in detail_html.xpath("//div[@class=\"business-people\"]/ul/li"):
            mp_name = person.xpath("./h3/text()")[0]
            mp_position = person.xpath("./p/text()")[0]
            crawl_time = self.util.get_now_time()
            sql_main_people = "insert into INV_EVT_ENTP_MAIN_PSN_INF(ID,MAIN_PPL_NM,MAIN_PPL_POS,INPT_DT) " \
                              "values('%s', '%s', '%s','%s')" % (id_code, mp_name, mp_position, crawl_time)
            self.util.insert2mysql("主要人物", sql_main_people)

    def get_detail_info(self, detail_url):
        """Crawl one enterprise detail page and persist one base-info row."""
        detail_res = self.util.get_req(url=detail_url, headers=self.headers)
        print(detail_res.status_code)
        if detail_res.status_code != 200:
            return
        # NOTE(review): other call sites pass response.text to get_xpath_obj;
        # passing the response object is preserved here — confirm Util
        # accepts both forms.
        detail_html = self.util.get_xpath_obj(detail_res)
        # --- enterprise profile ----------------------------------------------
        company_name = detail_html.xpath("//h1/text()")[0]
        li = "//div[@class=\"box-fix-l\"]/div/ul/li"
        company_base = detail_html.xpath(li + "[1]/text()")[0]
        company_reg_loc = detail_html.xpath(li + "[2]/text()")[0]
        company_bound_date = detail_html.xpath(li + "[3]/text()")[0]
        company_industry = detail_html.xpath(li + "[4]/text()")[0]
        site = detail_html.xpath(li + "[@class=\"link\"]/a/text()")
        company_site = site[0] if site else ""
        company_intro = ""
        for xp in self.INTRO_XPATHS:   # first matching layout wins
            found = detail_html.xpath(xp)
            if found:
                company_intro = found[0]
                break
        # --- business registration table (may be missing) ---------------------
        if detail_html.xpath("//div[@id=\"business\"]"):
            def cell(row, col):
                # Text of cell (row, col) in the base-info table.
                return detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[%d]/td[%d]/text()" % (row, col))[0]
            legal_person = cell(1, 2)
            founded_time = cell(1, 4)
            registered_capital = cell(2, 2)
            operational_authority = cell(2, 4)
            registered_num = cell(3, 2)
            approval_date = cell(3, 4)
            organizational_code = cell(4, 2)
            creditfcode = cell(4, 4)
            identification_number = cell(5, 2)
            registration_authority = cell(5, 4)
            enterprise_type = cell(6, 2)
        else:
            legal_person = founded_time = registered_capital = ""
            operational_authority = registered_num = approval_date = ""
            organizational_code = creditfcode = identification_number = ""
            registration_authority = enterprise_type = ""
        id_code = self.util.MD5(company_name + creditfcode)
        if detail_html.xpath("//*[@id=\"contact\"]"):
            contact = "".join(
                detail_html.xpath("//*[@id=\"contact\"]/p//text()")).replace("'", "").strip()
        else:
            contact = ""
        # --- latest financing event ------------------------------------------
        # Fix: initialize before the loop — previously these names could be
        # referenced unbound (NameError) when the financing list existed but
        # every event request failed.
        rz_title = rz_info = rz_intro = ""
        for event in detail_html.xpath("//div[@class=\"list-invest\"]/ul/li"):
            href = event.xpath("./div[@class=\"view\"]/a/@href")[0]
            rz_url = href if href.startswith("http") else "https://zdb.pedaily.cn" + href
            print(rz_url)
            self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
            rz_res = self.util.get_req(url=rz_url, headers=self.headers)
            if rz_res.status_code != 200:
                continue
            print("融资事件详情页请求成功")
            rz_html = self.util.get_xpath_obj(rz_res.text)
            rz_title = rz_html.xpath("//h1/text()")[0]
            rz_info = "".join(rz_html.xpath("//div[@class=\"info\"]/ul/li//text()"))
            intro = rz_html.xpath("//div[@id=\"desc\"]/p/text()")
            rz_intro = intro[0] if intro else ""
        crawl_time = self.util.get_now_time().replace("'", "")
        sql_qyk = """insert into INV_EVT_ENTP_BAS_INF( ID ,CMP_NM ,ORG_TOT_DEPT ,REG_PLC_PNT ,CMP_SET_UP_TM ,AFL_IDT ,FORMAL_WEB ,CMP_INTRO ,LVRG_TTL ,LVRG_INF ,LVRG_INTRO ,LGP_RPRS ,SET_UP_TM ,REG_CPT ,OPR_RIT ,REG_NBR ,APRV_TM ,ORG_ORG_CD_NBR ,SOC_CRD_CD ,TAX_PSN_RCG_NBR ,REG_INSTT ,ENTP_TYP ,CTC_MTH ,INPT_DT )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % (
            id_code, company_name, company_base, company_reg_loc,
            company_bound_date, pymysql.escape_string(company_industry),
            company_site, company_intro, rz_title, rz_info, rz_intro,
            legal_person, founded_time, registered_capital,
            operational_authority, registered_num, approval_date,
            organizational_code, creditfcode, identification_number,
            registration_authority, enterprise_type, contact, crawl_time)
        self.util.insert2mysql("融资公司信息", sql_qyk)
        self.get_main_people(id_code, detail_html)
        self.get_shareholder(id_code, detail_html)

    def get_items_list(self, res):
        """Walk one list page and crawl every enterprise detail link."""
        html = self.util.get_xpath_obj(res)
        for li in html.xpath("//ul[@id=\"enterprise-list\"]/li"):
            time.sleep(2)  # throttle between detail requests
            href = li.xpath("./div[1]/a/@href")
            if not href:
                continue
            detail_url = "https://zdb.pedaily.cn" + href[0]
            print(detail_url)
            self.get_detail_info(detail_url)

    def run(self):
        """Entry point: crawl every configured list page in order."""
        self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
        for url in self.urls:
            print("列表页:" + url + "开始爬取")
            res = self.util.get_req(url=url, headers=self.headers)
            self.get_items_list(res)
class WebapiCninfo:
    """Crawler for quarterly stock data from webapi.cninfo.com.cn.

    Flow: fetch a per-session ``codeKey`` (main), fetch the region-classified
    API catalogue, then for every regional plate fetch the company list and
    request the p_stock2332 dataset for each quarter of 2017-2019 in batches
    of 20 stock codes, persisting rows via ``Util.insert2mysql``.
    """

    # Columns returned by the p_stock2332 API. F004/F005 and F042 are not
    # part of the interface (the generated list reproduces the original
    # hand-typed column string exactly).
    _DATA_COLUMNS = (["SECCODE", "SECNAME", "STARTDATE", "ENDDATE",
                      "F001D", "F002V", "F003V"]
                     + ["F%03dN" % i for i in range(6, 92) if i != 42])

    def __init__(self):
        self.get_code_key_h = {
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
            "Cache-Control": "max-age=0",
            "Accept": "image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN",
            "Accept-Encoding": "gzip, deflate",
            "Host": "webapi.cninfo.com.cn",
            "Connection": "Keep-Alive",
            "Cookie": "cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557528,1564557544,1564557814,1564557966; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}",
        }
        self.get_loc_mess_h = {
            "Origin": "http://webapi.cninfo.com.cn",
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
            "Cache-Control": "max-age=0",
            "Accept": "*/*",
            "Accept-Language": "zh-CN",
            "mcode": "{}",
            "X-Requested-With": "XMLHttpRequest",
            "Accept-Encoding": "gzip, deflate",
            "Content-Length": "0",
            "Host": "webapi.cninfo.com.cn",
            "Connection": "Keep-Alive",
            "Pragma": "no-cache",
            "Cookie": "UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557966,1564558754,1564559126,{}; codeKey={}",
        }
        self.get_comp_name_h = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Length": "0",
            "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564622577,1564623888,1564625108,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; codeKey={}",
            "Host": "webapi.cninfo.com.cn",
            "mcode": "{}",
            "Origin": "http://webapi.cninfo.com.cn",
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        self.get_data_h = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Length": "0",
            "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; codeKey={}; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564623888,1564625108,1564625230,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}",
            "Host": "webapi.cninfo.com.cn",
            "mcode": "{}",
            "Origin": "http://webapi.cninfo.com.cn",
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        # Fix: keep pristine copies of every placeholder-bearing value.
        # str.format() consumes the "{}" placeholders, so re-formatting the
        # stored (already formatted) header, as the old code did, froze the
        # first timestamp/codeKey forever.
        self._data_cookie_tpl = self.get_data_h["Cookie"]
        self._data_mcode_tpl = self.get_data_h["mcode"]
        self._comp_cookie_tpl = self.get_comp_name_h["Cookie"]
        self._comp_mcode_tpl = self.get_comp_name_h["mcode"]
        self.get_data_d = {
            "scode": "",
            "sdate": "",
            "edate": "",
            "type": "071001",
            "@column": ",".join(self._DATA_COLUMNS),
        }
        # platetype/platecode are filled per plate in get_comp_name.
        self.get_comp_name_d = {
            "platetype": "",
            "platecode": "",
            "@orderby": "SECCODE:asc",
            "@column": "SECCODE,SECNAME",
        }
        self.session = requests.Session()
        self.util = Util()
        self.get_code_url = "http://webapi.cninfo.com.cn/api-cloud-platform/login/getVerfyCode"
        self.get_loc_url = "https://webapi.cninfo.com.cn/api/sysapi/p_sysapi1016"
        # Quarter-end dates for 2017-2019.
        self.d_date = [y + q for y in ["2017", "2018", "2019"]
                       for q in ["0331", "0630", "0930", "1231"]]

    def parse_json(self, content):
        """Return the per-plate company-list API URLs from the catalogue JSON."""
        content = self.util.get_json_obj(content)
        # records[3] is the region-classified branch of the catalogue.
        datas = content["records"][3]["children"]
        return ["http://webapi.cninfo.com.cn/{}?{}&@column=SECCODE,SECNAME"
                .format(data["API"], data["PARAM"]) for data in datas]

    def parse_data(self, data):
        """Persist one p_stock2332 JSON payload into webapi_cninfo."""
        sdate = self.get_data_d["sdate"]
        y = sdate[:4]
        quarter = {"03": "第一季度", "06": "第二季度",
                   "09": "第三季度", "12": "第四季度"}.get(sdate[4:6], "--")
        if isinstance(data, str):
            data = self.util.get_json_obj(data)
        cols = self._DATA_COLUMNS
        # Build "insert into ... values ('%s', ...)" once; the %-template is
        # then filled per record (same columns as the original hand-typed SQL).
        sql_tpl = "insert into webapi_cninfo(id,%s,y,quarter,crawl_time) values (%s)" % (
            ",".join(cols), ",".join(["'%s'"] * (len(cols) + 4)))
        for d in data["records"]:
            id_code = self.util.MD5(d["SECNAME"] + y + quarter)
            print(d["SECNAME"])
            values = ([id_code] + [d[c] for c in cols]
                      + [y, quarter, self.util.get_now_time()])
            self.util.insert2mysql(d["SECNAME"], sql_tpl % tuple(values))
            time.sleep(0.3)

    def cut_comp_code(self, scode, codekey, ts):
        """POST one batch of stock codes and hand the JSON to parse_data."""
        # NOTE(review): the URL pins sdate/edate to 20190331; the real dates
        # travel in the POST body (get_data_d) — preserved as-is.
        data_url = ("http://webapi.cninfo.com.cn/api/stock/p_stock2332?scode={}"
                    "&sdate=20190331&edate=20190331&type=071001&@column="
                    + ",".join(self._DATA_COLUMNS)).format(scode)
        stamp = self.util.get_stamp()  # one timestamp for the whole header
        # Re-fill from the pristine templates (see __init__).
        self.get_data_h["Cookie"] = self._data_cookie_tpl.format(codekey, stamp, stamp)
        self.get_data_h["mcode"] = self._data_mcode_tpl.format(
            self.util.base64_encode(ts).decode("utf-8"))
        self.get_data_d["scode"] = scode
        data = self.session.post(url=data_url, headers=self.get_data_h,
                                 data=self.get_data_d).text
        self.parse_data(data)

    def parse_comp_json(self, json_res, codekey, ts):
        """Crawl every reporting period for the companies in one plate.

        Codes are requested in batches of 20 per quarter-end date.
        """
        content = self.util.get_json_obj(json_res)
        codes = [c["SECCODE"] for c in content["records"]]
        for dd in self.d_date:
            print(dd)
            self.get_data_d["sdate"] = dd
            self.get_data_d["edate"] = dd
            # Fix: range-step batching also covers the final partial batch —
            # the old loop count was int(len/20) in BOTH branches of its
            # if/else, so any remainder (< 20 codes) was never requested.
            for start in range(0, len(codes), 20):
                time.sleep(1.5)  # throttle between batch requests
                self.cut_comp_code(",".join(codes[start:start + 20]), codekey, ts)
            time.sleep(30)  # long pause between reporting periods

    def get_comp_name(self, get_loc_res, codekey, ts):
        """For every plate URL in the catalogue, fetch its company list."""
        # NOTE(review): unlike cut_comp_code, mcode is not .decode()d here —
        # preserved as the original did; confirm Util.base64_encode's type.
        mcode = self._comp_mcode_tpl.format(self.util.base64_encode(ts))
        for url in self.parse_json(get_loc_res):
            self.get_comp_name_h["Cookie"] = self._comp_cookie_tpl.format(
                self.util.get_stamp(), self.util.get_stamp(), codekey)
            self.get_comp_name_h["mcode"] = mcode
            # Fix: assign plate codes directly — formatting them into the
            # stored "{}" template consumed the placeholder, freezing the
            # plate parameters after the first plate.
            self.get_comp_name_d["platetype"] = re.findall(r'platetype=(\d+)&', url)[0]
            self.get_comp_name_d["platecode"] = re.findall(r'platecode=(\d+)&', url)[0]
            comp_name_res = self.session.post(url=url,
                                              headers=self.get_comp_name_h,
                                              data=self.get_comp_name_d).text
            self.parse_comp_json(comp_name_res, codekey, ts)

    def main(self):
        """Entry point: obtain the session codeKey, then crawl everything."""
        self.get_code_key_h["Cookie"] = self.get_code_key_h["Cookie"].format(int(time.time()))
        # NOTE(review): verify=False disables TLS certificate validation.
        get_code_res = self.session.get(url=self.get_code_url,
                                        headers=self.get_code_key_h,
                                        verify=False)
        ts = int(time.time())  # session timestamp reused for every mcode
        codekey = re.findall(r'codeKey=(.*?);',
                             get_code_res.headers["Set-Cookie"])[0]
        # Fetch the region-classified API catalogue.
        self.get_loc_mess_h["mcode"] = self.get_loc_mess_h["mcode"].format(
            self.util.base64_encode(ts))
        self.get_loc_mess_h["Cookie"] = self.get_loc_mess_h["Cookie"].format(
            self.util.get_stamp(), self.util.get_stamp(), codekey)
        get_loc_res = self.session.post(url=self.get_loc_url,
                                        headers=self.get_loc_mess_h).text
        self.get_comp_name(get_loc_res, codekey, ts)
class WzzxbsMofocom:
    """Crawler for MOFCOM foreign-investment enterprise filing records.

    Flow: ``main`` POSTs to the list endpoint once to learn the total row
    count, then ``parse`` pages through the list (2000 rows per page),
    requests every record's detail page and dispatches to ``parse_18`` /
    ``parse_17`` depending on whether the detail table has 18 or 17 rows.
    Parsed records are stored through ``Util.insert2mysql``.
    """

    def __init__(self):
        # List endpoint (POST) and the detail-page URL template
        # (recordId + cache-busting timestamp are query parameters).
        self.url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadRecordData.action"
        self.detail_base_url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadEntpRecordDetails.action?params.recordId={}&time={}"
        # NOTE(review): the hard-coded Content-Length below is ignored in
        # practice — requests recomputes it from the actual body — but it is
        # misleading since the POST body length varies with the page number.
        self.headers = {
            "Accept": "application/json, text/javascript, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Content-Length": "169",
            "Content-Type": "application/x-www-form-urlencoded",
            "Cookie": "insert_cookie=32151754",
            "Host": "wzzxbs.mofcom.gov.cn",
            "Origin": "http://wzzxbs.mofcom.gov.cn",
            "Referer": "http://wzzxbs.mofcom.gov.cn/WebProSP/app/infoPub/entpRecord",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.detail_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "insert_cookie=32151754",
            "Host": "wzzxbs.mofcom.gov.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        # List-endpoint form data; currentPage/start are filled in by parse().
        self.data = {
            "params.entpName": "",
            "page.currentPage": "",
            "page.limit": "2000",
            "page.option": "next",
            "page.start": "",
            "page.rowCount": "",
            "listGrid.col": "1:showRecordInfo(0),2,3,4",
            "listGrid.type": "link,ro,ro,ro"
        }
        self.detail_data = {"params.recordId": "", "time": ""}
        self.util = Util()
        self.user_agent = UserAgent()

    def parse_18(self, detail_html, business_type):
        """Parse the 18-row detail-table layout and insert the record.

        :param detail_html: lxml tree of the detail page
        :param business_type: record category taken from the list row
        :return: (md5_id, item_number) — md5_id feeds the investor table's
                 cust_id; item_number is the filing number.
        """
        # Part I — filing information
        item_content = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[3]/td/text()")[0].replace(
            "\xe5", "")  # changed item
        item_date = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace(
            "\xe5", "")  # filing completion date
        item_number = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[5]/td/text()")[0].replace(
            "\xe5", "")  # filing number
        # Part II — basic information of the foreign-invested enterprise.
        # The scraped text carries private-use/garbage code points
        # (\ue07e, \u3bbe, \ue096, \xe5, \xa0, ...) which are stripped below.
        comp_name = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[7]/td/text()")[0].replace(
            "\ue07e", "").replace("\xe5", "")  # company name
        regi_addr = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()")[0].replace(
            '\u3bbe', '').replace('\ue07e', '').replace("\xe5", "").replace("\ue096", "")  # registered address
        try:
            crit_code = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[9]/td/text()"
            )[0].replace("\xe5", "")  # unified social credit code
        except IndexError:
            # Some records have no credit-code cell at all.
            crit_code = ""
        # Enterprise type: the checked radio button's label, pulled out of the
        # serialized <td> markup (re.S because the markup spans lines).
        comp_type = re.findall(
            r'checked="checked"/> (.*?) ',
            str(etree.tostring(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[10]/td")[0],
                encoding='utf-8'), 'utf-8').strip().replace("\xe5", ""),
            re.S)[0]
        operating_period = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()")[0].strip().replace("\xe5", "")  # operating period
        try:
            investment_industry = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()"
            )[0].replace("\xe5", "")  # investment industry
        except Exception:
            investment_industry = ""
        business_scope = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0].replace(
            "\xe5", "")  # business scope (duplicate \xe5 replace removed)
        # Amount cells look like "<number> <currency>"; keep the number part.
        try:
            total_investment = str.split(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0],
                " ")[0].replace("\xa0", "").replace("\xe5", "").replace("\ue07e", "")  # total investment
        except IndexError:
            total_investment = ""
        registered_capital = str.split(
            detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0],
            " ")[0].replace("\xa0", "").replace("\xe5", "").replace("\ue07e", "")  # registered capital
        try:
            legal_representative = str.split(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[16]/td/text()")[0],
                " ")[0].replace("\xa0", "").replace("\xe5", "").replace(
                "\ue07e", "").replace("\u4b72", " ")  # legal representative
        except IndexError:
            legal_representative = ""
        # Dedup key: company + type + date + number uniquely identify a filing.
        md5_id = comp_name + business_type + item_date + item_number
        cols = (self.util.MD5(item_number), business_type, item_content,
                item_date, item_number, comp_name, regi_addr, crit_code,
                comp_type, operating_period, investment_industry,
                business_scope, total_investment, registered_capital,
                pymysql.escape_string(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        s = self.get_sql(cols)
        self.util.insert2mysql(comp_name, s)
        return md5_id, item_number

    def parse_17(self, detail_html, business_type):
        """Parse the 17-row detail-table layout (no "changed item" row).

        Same contract as :meth:`parse_18`, but every row index is shifted up
        by one and ``item_content`` is always empty.
        """
        item_content = ""  # this layout has no changed-item row
        item_date = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[3]/td/text()")[0].replace(
            "\xe5", "")  # filing completion date
        item_number = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace(
            "\xe5", "")  # filing number
        # Part II — basic information of the foreign-invested enterprise
        comp_name = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[6]/td/text()")[0].replace(
            "\ue07e", "").replace("\xe5", "")  # company name
        regi_addr = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[7]/td/text()")[0].replace(
            '\u3bbe', '').replace('\ue07e', '').replace("\xe5", "").replace("\ue096", "")  # registered address
        try:
            crit_code = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()"
            )[0].replace("\xe5", "")  # unified social credit code
        except IndexError:
            crit_code = ""
        # Enterprise type from the checked radio button's label.
        comp_type = re.findall(
            r'checked="checked"/> (.*?) ',
            str(etree.tostring(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[9]/td")[0],
                encoding='utf-8'), 'utf-8').strip().replace("\xe5", ""),
            re.S)[0]
        operating_period = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[10]/td/text()")[0].strip().replace("\xe5", "")  # operating period
        try:
            investment_industry = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()"
            )[0].replace("\xe5", "")  # investment industry
        except Exception:
            investment_industry = ""
        business_scope = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()")[0].replace(
            "\xe5", "")  # business scope (duplicate \xe5 replace removed)
        try:
            total_investment = str.split(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0],
                " ")[0].replace("\xa0", "").replace("\xe5", "")  # total investment
        except IndexError:
            total_investment = ""
        registered_capital = str.split(
            detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0],
            " ")[0].replace("\xa0", "").replace("\xe5", "")  # registered capital
        try:
            legal_representative = str.split(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0],
                " ")[0].replace("\xa0", "").replace("\xd6", "").replace("\xe5", "")  # legal representative
        except IndexError:
            legal_representative = ""
        md5_id = comp_name + business_type + item_date + item_number
        cols = (self.util.MD5(item_number), business_type, item_content,
                item_date, item_number, comp_name, regi_addr, crit_code,
                comp_type, operating_period, investment_industry,
                business_scope, total_investment, registered_capital,
                pymysql.escape_string(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        self.util.insert2mysql(comp_name, self.get_sql(cols))
        return md5_id, item_number

    def get_sql(self, col_tuple):
        """Build the INSERT statement for wzzxbs_mofcom_info from a 17-tuple.

        NOTE(review): values are spliced in with %-formatting rather than
        bound parameters; Util.insert2mysql accepts only a raw SQL string, so
        injection safety rests on the selective pymysql.escape_string calls
        upstream — worth hardening if Util ever grows a parameterized API.
        """
        info_sql = """
            insert into wzzxbs_mofcom_info(
                id, business_type, item_content, item_date, item_number,
                comp_name, regi_addr, crit_code, comp_type, operating_period,
                investment_industry, business_scope, total_investment,
                registered_capital, legal_representative, cust_id, craw_time
            )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
        """ % col_tuple
        return info_sql

    def parse_invesment_info(self, detail_html, md5_id, n):
        """Part III — insert one row per investor listed in the nested table.

        :param md5_id: dedup key returned by parse_17/parse_18 (stored as cust_id)
        :param n: row index of the nested investor table (17 or 18, matching
                  the detail-table layout); the header row is skipped.
        """
        for mes in detail_html.xpath(
                "//div[@class=\"Table1\"]/table/tr[{}]/table/tr".format(n))[1:]:
            name_of_investor = str.split(mes.xpath("./td[1]/text()")[0], " ")[0] \
                .replace("\ue07e", "") \
                .replace("\xe5", "") \
                .replace("\xd6", "")
            different_countries = mes.xpath("./td[2]/text()")[0].replace("\xe5", "")
            amount_invested = str.split(mes.xpath("./td[3]/text()")[0], " ")[0] \
                .replace("\xa0", "") \
                .replace("\xd6", "") \
                .replace("\xe5", "") \
                .replace("\ue07e", "")
            investment_sql = """
                insert into wzzxbs_mofcom_investment_info(
                    id, name_of_investor, different_countries, amount_invested, cust_id, craw_time
                )values('%s', '%s', '%s', '%s', '%s', '%s')
            """ % (self.util.MD5(name_of_investor + different_countries + amount_invested),
                   pymysql.escape_string(name_of_investor),
                   different_countries,
                   amount_invested,
                   self.util.MD5(md5_id),
                   self.util.get_now_time())
            self.util.insert2mysql("投资信息|", investment_sql)

    def parse(self, num):
        """Fetch list page *num* (2000 records) and process every record.

        Both the list request and each detail request sit in their own
        retry-forever loop: ChunkedEncodingError retries immediately, any
        other failure logs and backs off 300 s — deliberate best-effort
        behavior for a long-running crawl, preserved as-is.
        """
        self.data["page.currentPage"] = str(num)
        if num:
            self.data["page.start"] = str((int(num) - 1) * 2000)
        while True:
            try:
                page_req = requests.post(url=self.url, headers=self.headers, data=self.data)
                items = self.util.get_json_obj(page_req.text)["rows"]
                page_req.close()
                for item in items:
                    business_type = item["data"][1]
                    # Record id is embedded in the showRecordInfo("...") link.
                    item_code = re.findall(r'showRecordInfo\(\"(.*?)\"\)', item["data"][0])[0]
                    detail_url = self.detail_base_url.format(item_code, self.util.get_stamp())
                    print(detail_url)
                    self.detail_data["params.recordId"] = item_code
                    self.detail_data["time"] = self.util.get_stamp()
                    while True:
                        try:
                            # NOTE(review): sends a body with GET (data=), and
                            # the same params are already in the URL — kept to
                            # match the original request exactly.
                            detail_req = requests.get(url=detail_url,
                                                      headers=self.detail_headers,
                                                      data=self.detail_data)
                            detail_html = self.util.get_xpath_obj(detail_req.text)
                            detail_req.close()
                            # 18 rows -> layout with a "changed item" row.
                            if len(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr")) == 18:
                                try:
                                    md5_id, item_number = self.parse_18(detail_html, business_type)
                                    self.parse_invesment_info(detail_html, md5_id, 18)
                                except Exception as e18:
                                    print("e18" + str(e18))
                                    print("问题在此处被捕获了")
                            else:
                                try:
                                    md5_id, item_number = self.parse_17(detail_html, business_type)
                                    # Part III — investor basic information
                                    self.parse_invesment_info(detail_html, md5_id, 17)
                                except Exception as e17:
                                    print("e17" + str(e17))
                                    print("问题在此处被捕获了")
                            break
                        except requests.exceptions.ChunkedEncodingError as e:
                            print("e" + str(e))
                        except Exception as e1:
                            print("e1" + str(e1))
                            print("==>远程关闭连接,休息等待中。。。")
                            time.sleep(300)
                    time.sleep(1.5)  # polite delay between detail requests
                break
            except requests.exceptions.ChunkedEncodingError as e2:
                print("e2" + str(e2))
            except Exception as e3:
                print("e3" + str(e3))
                print("=====>远程关闭连接,休息等待中。。。")
                time.sleep(300)

    def main(self):
        """Entry point: learn the total row count, then crawl page by page."""
        req = requests.post(url=self.url, headers=self.headers, data=self.data)  # initial request
        res_json = self.util.get_json_obj(req.text)
        self.data["page.rowCount"] = res_json["rowCount"]
        # NOTE(review): starts at page 29 — looks like a manual resume point
        # from a previous interrupted run; confirm before a fresh crawl.
        for i in range(29, int(res_json["rowCount"])):
            print("#####{}#####".format(i))
            self.parse(i)
            time.sleep(30)