class WebapiCninfo:
    """Scraper for quarterly report data on webapi.cninfo.com.cn.

    Flow (driven by ``main``):
      1. request the verification-code image to obtain the per-session
         ``codeKey`` cookie,
      2. fetch the region-classified company listing,
      3. collect every company code per plate,
      4. download report rows in batches of 20 codes for each quarter-end
         date in ``self.d_date`` and insert them into MySQL via ``Util``.
    """

    def __init__(self):
        # Headers for the verification-code request; the Cookie has one "{}"
        # placeholder that receives the current unix timestamp in main().
        self.get_code_key_h = {
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
            "Cache-Control": "max-age=0",
            "Accept": "image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN",
            "Accept-Encoding": "gzip, deflate",
            "Host": "webapi.cninfo.com.cn",
            "Connection": "Keep-Alive",
            "Cookie": "cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557528,1564557544,1564557814,1564557966; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}",
        }
        # Headers for the region-listing request; "mcode" and the Cookie
        # placeholders (stamp, stamp, codeKey) are filled once in main().
        self.get_loc_mess_h = {
            "Origin": "http://webapi.cninfo.com.cn",
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
            "Cache-Control": "max-age=0",
            "Accept": "*/*",
            "Accept-Language": "zh-CN",
            "mcode": "{}",
            "X-Requested-With": "XMLHttpRequest",
            "Accept-Encoding": "gzip, deflate",
            "Content-Length": "0",
            "Host": "webapi.cninfo.com.cn",
            "Connection": "Keep-Alive",
            "Pragma": "no-cache",
            "Cookie": "UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557966,1564558754,1564559126,{}; codeKey={}",
        }
        # Headers for the per-plate company-name listing requests.
        self.get_comp_name_h = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Length": "0",
            "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564622577,1564623888,1564625108,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; codeKey={}",
            "Host": "webapi.cninfo.com.cn",
            "mcode": "{}",
            "Origin": "http://webapi.cninfo.com.cn",
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        # Headers for the report-data requests issued by cut_comp_code().
        self.get_data_h = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Length": "0",
            "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; codeKey={}; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564623888,1564625108,1564625230,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}",
            "Host": "webapi.cninfo.com.cn",
            "mcode": "{}",
            "Origin": "http://webapi.cninfo.com.cn",
            "Referer": "http://webapi.cninfo.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        # POST body for the report-data endpoint; scode/sdate/edate are set
        # per request. NOTE(review): the field list skips F042N -- presumably
        # matching the remote schema; confirm against the API.
        self.get_data_d = {
            "scode": "",
            "sdate": "",
            "edate": "",
            "type": "071001",
            "@column": "SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,F006N,F007N,F008N,F009N,F010N,F011N,F012N"
                       ",F013N,F014N,F015N,F016N,F017N,F018N,F019N,F020N,F021N,F022N,F023N,F024N,F025N,F026N,F027N"
                       ",F028N,F029N,F030N,F031N,F032N,F033N,F034N,F035N,F036N,F037N,F038N,F039N,F040N,F041N,F043N"
                       ",F044N,F045N,F046N,F047N,F048N,F049N,F050N,F051N,F052N,F053N,F054N,F055N,F056N,F057N,F058N"
                       ",F059N,F060N,F061N,F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,F070N,F071N,F072N,F073N"
                       ",F074N,F075N,F076N,F077N,F078N,F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N,F088N"
                       ",F089N,F090N,F091N",
        }
        # POST body for the company-name listing; platetype/platecode are
        # overwritten per plate in get_comp_name().
        self.get_comp_name_d = {
            "platetype": "{}",
            "platecode": "{}",
            "@orderby": "SECCODE:asc",
            "@column": "SECCODE,SECNAME",
        }
        self.session = requests.Session()
        self.util = Util()
        self.get_code_url = "http://webapi.cninfo.com.cn/api-cloud-platform/login/getVerfyCode"
        self.get_loc_url = "https://webapi.cninfo.com.cn/api/sysapi/p_sysapi1016"
        # Quarter-end dates for 2017-2019, e.g. "20170331", "20171231", ...
        self.d_date = [i + j for i in ["2017", "2018", "2019"] for j in ["0331", "0630", "0930", "1231"]]

    def parse_json(self, content):
        """Extract the per-plate listing URLs from the region-classified JSON.

        NOTE(review): records[3] is assumed to be the node whose children
        carry the plate API/PARAM pairs -- confirm against a live response.
        """
        content = self.util.get_json_obj(content)
        datas = content["records"][3]["children"]
        return ["http://webapi.cninfo.com.cn/{}?{}&@column=SECCODE,SECNAME"
                .format(data["API"], data["PARAM"]) for data in datas]

    def parse_data(self, data):
        """Parse one batch of report rows and insert each row into MySQL."""
        y = self.get_data_d["sdate"][:4]  # report year, e.g. "2019"
        # Map the month of the report date to its quarter label ("--" if the
        # date is not a recognised quarter end).
        quarter = {
            "03": "第一季度",
            "06": "第二季度",
            "09": "第三季度",
            "12": "第四季度",
        }.get(self.get_data_d["sdate"][4:6], "--")
        if isinstance(data, str):
            data = self.util.get_json_obj(data)
        for d in data["records"]:
            # Row id: MD5 of company name + year + quarter (dedup key).
            id_code = self.util.MD5(d["SECNAME"] + y + quarter)
            print(d["SECNAME"])
            # NOTE(review): SQL built with %-interpolation, no escaping of the
            # API values -- acceptable only because the source is a fixed API;
            # consider parameterised queries.
            sql = """insert into webapi_cninfo(id, SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,
                            F006N,F007N,F008N,F009N,F010N,F011N,F012N,F013N,F014N,
                            F015N,F016N,F017N,F018N,F019N,F020N,F021N,F022N,F023N,
                            F024N,F025N,F026N,F027N,F028N,F029N,F030N,F031N,F032N,
                            F033N,F034N,F035N,F036N,F037N,F038N,F039N,F040N,F041N,
                            F043N,F044N,F045N,F046N,F047N,F048N,F049N,F050N,F051N,
                            F052N,F053N,F054N,F055N,F056N,F057N,F058N,F059N,F060N,
                            F061N,F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,
                            F070N,F071N,F072N,F073N,F074N,F075N,F076N,F077N,F078N,
                            F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N,
                            F088N,F089N,F090N,F091N,y,quarter,crawl_time)
                    values ('%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s',
                            '%s','%s','%s','%s','%s','%s')""" \
                  % (
                      id_code, d["SECCODE"], d["SECNAME"], d["STARTDATE"], d["ENDDATE"], d["F001D"],
                      d["F002V"], d["F003V"], d["F006N"], d["F007N"], d["F008N"], d["F009N"],
                      d["F010N"], d["F011N"], d["F012N"], d["F013N"], d["F014N"], d["F015N"],
                      d["F016N"], d["F017N"], d["F018N"], d["F019N"], d["F020N"], d["F021N"],
                      d["F022N"], d["F023N"], d["F024N"], d["F025N"], d["F026N"], d["F027N"],
                      d["F028N"], d["F029N"], d["F030N"], d["F031N"], d["F032N"], d["F033N"],
                      d["F034N"], d["F035N"], d["F036N"], d["F037N"], d["F038N"], d["F039N"],
                      d["F040N"], d["F041N"], d["F043N"], d["F044N"], d["F045N"], d["F046N"],
                      d["F047N"], d["F048N"], d["F049N"], d["F050N"], d["F051N"], d["F052N"],
                      d["F053N"], d["F054N"], d["F055N"], d["F056N"], d["F057N"], d["F058N"],
                      d["F059N"], d["F060N"], d["F061N"], d["F062N"], d["F063N"], d["F064N"],
                      d["F065N"], d["F066N"], d["F067N"], d["F068N"], d["F069N"], d["F070N"],
                      d["F071N"], d["F072N"], d["F073N"], d["F074N"], d["F075N"], d["F076N"],
                      d["F077N"], d["F078N"], d["F079N"], d["F080N"], d["F081N"], d["F082N"],
                      d["F083N"], d["F084N"], d["F085N"], d["F086N"], d["F087N"], d["F088N"],
                      d["F089N"], d["F090N"], d["F091N"], y, quarter, self.util.get_now_time()
                  )
            self.util.insert2mysql(d["SECNAME"], sql)
            time.sleep(0.3)

    def cut_comp_code(self, scode, codekey, ts):
        """POST one batch of up to 20 company codes and store the rows.

        scode: comma-joined company codes; codekey/ts: session credentials.
        """
        # Base URL of the data request (the dates in the query string are
        # fixed; the effective dates travel in the POST body get_data_d).
        data_url = "http://webapi.cninfo.com.cn/api/stock/p_stock2332?scode={}" \
                   "&sdate=20190331&edate=20190331&type=071001&" \
                   "@column=SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,F006N,F007N,F008N," \
                   "F009N,F010N,F011N,F012N,F013N,F014N,F015N,F016N,F017N,F018N,F019N,F020N,F021N," \
                   "F022N,F023N,F024N,F025N,F026N,F027N,F028N,F029N,F030N,F031N,F032N,F033N,F034N," \
                   "F035N,F036N,F037N,F038N,F039N,F040N,F041N,F043N,F044N,F045N,F046N,F047N,F048N," \
                   "F049N,F050N,F051N,F052N,F053N,F054N,F055N,F056N,F057N,F058N,F059N,F060N,F061N," \
                   "F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,F070N,F071N,F072N,F073N,F074N," \
                   "F075N,F076N,F077N,F078N,F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N," \
                   "F088N,F089N,F090N,F091N".format(scode)
        stamp = self.util.get_stamp()  # one timestamp reused for both cookie slots
        # NOTE(review): the templates are formatted in place -- the "{}"
        # placeholders only exist on the first call; later calls re-send the
        # first call's cookie/mcode. The codeKey is constant within a session,
        # so requests still succeed, but this is fragile.
        self.get_data_h["Cookie"] = self.get_data_h["Cookie"].format(codekey, stamp, stamp)
        self.get_data_h["mcode"] = self.get_data_h["mcode"].format(self.util.base64_encode(ts).decode("utf-8"))
        self.get_data_d["scode"] = scode
        data = self.session.post(url=data_url, headers=self.get_data_h, data=self.get_data_d).text
        self.parse_data(data)

    # Process the company listing JSON of one plate.
    def parse_comp_json(self, json_res, codekey, ts):
        """Collect company codes from a plate listing, then fetch report data
        for every quarter-end date in batches of 20 codes."""
        content = self.util.get_json_obj(json_res)
        ls_comp_code = [c["SECCODE"] for c in content["records"]]  # company codes
        # BUG FIX: the original computed len // 20 in BOTH branches of the
        # divisibility check, so when the code count was not a multiple of 20
        # the trailing partial batch was never requested. Round up instead.
        loop, remainder = divmod(len(ls_comp_code), 20)
        if remainder:
            loop += 1
        for dd in self.d_date:
            print(dd)
            self.get_data_d["sdate"] = dd
            self.get_data_d["edate"] = dd
            s = 0
            e = 20
            for _ in range(loop):
                time.sleep(1.5)
                scode = ",".join(ls_comp_code[s:e])
                s += 20
                if e < len(ls_comp_code):
                    e += 20
                else:
                    e = len(ls_comp_code)
                self.cut_comp_code(scode, codekey, ts)
        time.sleep(30)

    # Fetch all company names, one request per plate listing URL.
    def get_comp_name(self, get_loc_res, codekey, ts):
        for get_comp_name_url in self.parse_json(get_loc_res):
            # NOTE(review): Cookie/mcode templates are formatted in place, so
            # only the first iteration substitutes fresh values; later
            # iterations re-send the first values (codeKey is session-constant,
            # so the request still succeeds).
            self.get_comp_name_h["Cookie"] = self.get_comp_name_h["Cookie"] \
                .format(self.util.get_stamp(), self.util.get_stamp(), codekey)
            self.get_comp_name_h["mcode"] = self.get_comp_name_h["mcode"].format(self.util.base64_encode(ts))
            # BUG FIX: the original formatted the "{}" templates stored in
            # get_comp_name_d in place; after the first plate the placeholder
            # was consumed and every later request silently reused the first
            # plate's type/code. Assign the freshly extracted values directly.
            self.get_comp_name_d["platetype"] = re.findall(r'platetype=(\d+)&', get_comp_name_url)[0]
            self.get_comp_name_d["platecode"] = re.findall(r'platecode=(\d+)&', get_comp_name_url)[0]
            # Request the company names of this plate.
            comp_name_res = self.session.post(url=get_comp_name_url,
                                              headers=self.get_comp_name_h,
                                              data=self.get_comp_name_d).text
            self.parse_comp_json(comp_name_res, codekey, ts)

    def main(self):
        """Entry point: acquire session codeKey, then crawl all plates."""
        # Request the verification-code image; its only purpose here is the
        # Set-Cookie header carrying the session codeKey.
        self.get_code_key_h["Cookie"] = self.get_code_key_h["Cookie"].format(int(time.time()))
        get_code_res = self.session.get(url=self.get_code_url, headers=self.get_code_key_h, verify=False)
        ts = int(time.time())  # session timestamp reused for every mcode header
        codekey = re.findall(r'codeKey=(.*?);', get_code_res.headers["Set-Cookie"])[0]
        # Fetch the page listing companies classified by region.
        self.get_loc_mess_h["mcode"] = self.get_loc_mess_h["mcode"].format(self.util.base64_encode(ts))
        self.get_loc_mess_h["Cookie"] = self.get_loc_mess_h["Cookie"]\
            .format(self.util.get_stamp(), self.util.get_stamp(), codekey)
        get_loc_res = self.session.post(url=self.get_loc_url, headers=self.get_loc_mess_h).text
        # Resolve company names and crawl their report data.
        self.get_comp_name(get_loc_res, codekey, ts)
class JrjgcfSpider(scrapy.Spider):
    """Scrapy spider for financial-regulator penalty records on
    app.finchina.com: pages through the listing endpoint and yields one
    JrjgcfItem per penalty detail."""

    name = 'jrjgcf'
    allowed_domains = ['app.finchina.com']
    # First listing page (selTopRecommended is the URL-encoded category name).
    start_urls = ['https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditData_SE.action?selTopRecommended=%E9%87%91%E8%9E%8D%E7%9B%91%E7%AE%A1%E5%A4%84%E7%BD%9A&skip=1']

    def __init__(self):
        super(JrjgcfSpider, self).__init__()
        self.u = Util()
        # Header template for detail requests; the Referer carries "{}"
        # placeholders for the record id and the URL-encoded company name and
        # is formatted per request in parse() -- never format it in place.
        self.detail_headers = {
            "Host": "app.finchina.com",
            "client": "finchina",
            "system": "v4.3.1.551,13.2.3,iOS,iPhone,iPhone,iPhone11,8",
            "Accept-Encoding": "gzip;q=1.0, compress;q=0.5",
            "Accept-Language": "zh-Hans-CN;q=1.0",
            "Connection": "keep-alive",
            "Accept": "*/*",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
            "Referer": "https://app.finchina.com/finchinaAPP/f9/creditArchives/creditDetail.html?user=20191212160004_15561585051&id={}&getContent=0&token=ee7d9333-95fe-4530-b901-e05b35211cf4&companyName={}",
            "token": "0c6a8e27-d8a7-4d4a-8a78-4b89a98dcd6c",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.page = 1  # current listing page (skip= parameter)

    def parse(self, response):
        """Parse one listing page: yield a detail request per record, then a
        request for the next listing page."""
        if self.u.get_json_obj(response.body)["returncode"] == 0:
            datas = self.u.get_json_obj(response.body)["data"]
            while True:
                if len(datas):
                    for data in datas:
                        id_code = data["infoId"]
                        name = data["related"][0]["name"]
                        record_type = data["type"]  # renamed: `type` shadows the builtin
                        time.sleep(0.2)
                        # BUG FIX: the original formatted the Referer template
                        # in place, so after the first record the "{}"
                        # placeholders were gone and every later detail request
                        # carried the FIRST record's Referer/id. Build
                        # per-request headers from the pristine template.
                        headers = dict(self.detail_headers)
                        headers["Referer"] = self.detail_headers["Referer"].format(id_code, self.u.url_encode(name))
                        headers["User-Agent"] = settings.random_ua()
                        yield scrapy.Request(
                            url="https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditDataContentDetails.action?"
                                "type={}&getContent=0&id={}".format(record_type, id_code),
                            headers=headers,
                            callback=self.parse_detail)
                    self.page += 1
                    time.sleep(3)  # NOTE(review): blocking sleep inside a scrapy callback stalls the reactor
                    yield scrapy.Request(
                        url="https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditData_SE.action?"
                            "selTopRecommended=%E9%87%91%E8%9E%8D%E7%9B%91%E7%AE%A1%E5%A4%84%E7%BD%9A&skip={}".format(self.page),
                        callback=self.parse
                    )
                break
        else:
            print("响应错误!!!")

    def parse_detail(self, response):
        """Parse one penalty detail response into JrjgcfItem objects."""
        item = JrjgcfItem()
        detail_datas = self.u.get_json_obj(response.body)["data"]
        for i in detail_datas:
            print("*" * 100)
            item["pub_date"] = i["it0026_006"]        # disclosure date
            item["about_people"] = i["it0026_005"]    # party concerned
            item["handle_people"] = i["it0026_016"]   # handling authority
            item["punish_type"] = i["risk"][0]["name"]  # penalty type
            item["irregularities"] = i["it0026_009"]  # violation
            item["punish_content"] = i["it0026_011"]  # penalty content
            item["symbol_num"] = i["it0026_017"]      # document number
            item["file_url"] = i["file"][0]["fileUrl"]
            item["file_name"] = i["file"][0]["fileName"]
            print("*" * 100)
            yield item
class WzzxbsMofocom:
    """Scraper for foreign-investment enterprise filing records published on
    wzzxbs.mofcom.gov.cn (MOFCOM), inserting results into MySQL via Util.

    Detail pages come in two layouts: an 18-row table (with a "change item"
    row, parsed by parse_18) and a 17-row table (parse_17); parse() picks the
    right one by counting table rows.
    """

    def __init__(self):
        # Listing endpoint (POST, paginated, 2000 rows per page).
        self.url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadRecordData.action"
        # Detail endpoint; placeholders are the record id and a timestamp.
        self.detail_base_url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadEntpRecordDetails.action?params.recordId={}&time={}"
        # Headers for listing requests. NOTE(review): Content-Length is
        # hard-coded but requests recomputes it -- harmless, likely copied
        # from a captured browser request.
        self.headers = {
            "Accept": "application/json, text/javascript, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Content-Length": "169",
            "Content-Type": "application/x-www-form-urlencoded",
            "Cookie": "insert_cookie=32151754",
            "Host": "wzzxbs.mofcom.gov.cn",
            "Origin": "http://wzzxbs.mofcom.gov.cn",
            "Referer": "http://wzzxbs.mofcom.gov.cn/WebProSP/app/infoPub/entpRecord",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        # Headers for detail-page requests.
        self.detail_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "insert_cookie=32151754",
            "Host": "wzzxbs.mofcom.gov.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        # POST form for the listing grid; currentPage/start are set in parse().
        self.data = {
            "params.entpName": "",
            "page.currentPage": "",
            "page.limit": "2000",
            "page.option": "next",
            "page.start": "",
            "page.rowCount": "",
            "listGrid.col": "1:showRecordInfo(0),2,3,4",
            "listGrid.type": "link,ro,ro,ro"
        }
        # Form data sent with each detail request (record id + timestamp).
        self.detail_data = {"params.recordId": "", "time": ""}
        self.util = Util()
        self.user_agent = UserAgent()

    def parse_18(self, detail_html, business_type):
        """Parse the 18-row detail-table layout; insert the record into MySQL.

        Returns (md5_id, item_number). The stray-character .replace() calls
        strip artifacts of the site's custom font obfuscation.
        """
        # Section 1: filing status
        item_content = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[3]/td/text()")[0].replace(
                "\xe5", "")  # change item
        # print(item_content)
        item_date = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace(
                "\xe5", "")  # filing completion date
        # print(item_date)
        item_number = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[5]/td/text()")[0].replace(
                "\xe5", "")  # filing number
        # print(item_number)
        # Section 2: basic information of the foreign-invested enterprise
        comp_name = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[7]/td/text()")[0].replace(
                "\ue07e", "").replace("\xe5", "")  # company name
        # print(comp_name)
        regi_addr = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()")[0].replace(
                '\u3bbe', '').replace('\ue07e', '').replace("\xe5", "").replace("\ue096", "")  # registered address
        # print(regi_addr)
        try:
            crit_code = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[9]/td/text()"
            )[0].replace("\xe5", "")  # unified social credit code
        except IndexError:
            crit_code = ""
        # print(crit_code)
        # Enterprise type: the checked radio button's label, extracted from
        # the serialized cell markup.
        comp_type = re.findall(
            r'checked="checked"/> (.*?) ', str(
                etree.tostring(detail_html.xpath(
                    "//div[@class=\"Table1\"]/table[1]/tr[10]/td")[0],
                    encoding='utf-8'), 'utf-8').strip().replace("\xe5", ""), re.S)[0]
        # print(comp_type)
        operating_period = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()")[0].strip(
        ).replace("\xe5", "")  # operating period
        # print(operating_period)
        try:
            investment_industry = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()"
            )[0].replace("\xe5", "")  # investment industry
        except Exception:
            investment_industry = ""
        # print(investment_industry)
        business_scope = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0].replace(
                "\xe5", "").replace("\xe5", "")  # business scope
        # print(business_scope)
        try:
            # Total investment: first whitespace-separated token of the cell.
            total_investment = \
                str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0], " ")[0].replace(
                    "\xa0", "").replace("\xe5", "").replace("\ue07e", "")
        except IndexError:
            total_investment = ""
        # print(total_investment)
        registered_capital = str.split(
            detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0],
            " ")[0].replace("\xa0", "").replace("\xe5", "").replace("\ue07e", "")  # registered capital
        # print(registered_capital)
        try:
            legal_representative = \
                str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[16]/td/text()")[0], " ")[0].replace(
                    "\xa0", "").replace("\xe5", "").replace("\ue07e", "").replace("\u4b72", " ")  # legal representative
        except IndexError:
            legal_representative = ""
        # print(legal_representative)
        # Natural key used for the cust_id MD5 (dedup across runs).
        md5_id = comp_name + business_type + item_date + item_number
        cols = (self.util.MD5(item_number), business_type, item_content,
                item_date, item_number, comp_name, regi_addr, crit_code,
                comp_type, operating_period, investment_industry,
                business_scope, total_investment, registered_capital,
                pymysql.escape_string(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        s = self.get_sql(cols)
        self.util.insert2mysql(comp_name, s)
        return md5_id, item_number

    def parse_17(self, detail_html, business_type):
        """Parse the 17-row detail-table layout (no "change item" row --
        every field sits one row higher than in parse_18); insert the record
        into MySQL and return (md5_id, item_number)."""
        item_content = ""  # change item: absent in this layout
        item_date = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[3]/td/text()")[0].replace(
                "\xe5", "")  # filing completion date
        # print(item_date)
        item_number = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace(
                "\xe5", "")  # filing number
        # print(item_number)
        # Section 2: basic information of the foreign-invested enterprise
        comp_name = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[6]/td/text()")[0].replace(
                "\ue07e", "").replace("\xe5", "")  # company name
        # print(comp_name)
        regi_addr = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[7]/td/text()")[0].replace(
                '\u3bbe', '').replace('\ue07e', '').replace("\xe5", "").replace("\ue096", "")  # registered address
        # print(regi_addr)
        try:
            crit_code = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()"
            )[0].replace("\xe5", "")  # unified social credit code
        except IndexError:
            crit_code = ""
        # print(crit_code)
        # Enterprise type: label of the checked radio button.
        comp_type = re.findall(
            r'checked="checked"/> (.*?) ', str(
                etree.tostring(detail_html.xpath(
                    "//div[@class=\"Table1\"]/table[1]/tr[9]/td")[0],
                    encoding='utf-8'), 'utf-8')
            # .replace(" ", "").replace("<input", "").replace("\n", "")
            .strip().replace("\xe5", ""), re.S)[0]
        # print(comp_type)
        operating_period = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[10]/td/text()")[0].strip(
        ).replace("\xe5", "")  # operating period
        # print(operating_period)
        try:
            investment_industry = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()"
            )[0].replace("\xe5", "")  # investment industry
        except Exception:
            investment_industry = ""
        # print(investment_industry)
        business_scope = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()")[0].replace(
                "\xe5", "").replace("\xe5", "")  # business scope
        # print(business_scope)
        try:
            total_investment = \
                str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0], " ")[0].replace(
                    "\xa0", "").replace("\xe5", "")  # total investment
        except IndexError:
            total_investment = ""
        # print(total_investment)
        registered_capital = str.split(
            detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0],
            " ")[0].replace("\xa0", "").replace("\xe5", "")  # registered capital
        # print(registered_capital)
        try:
            legal_representative = \
                str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0], " ")[0].replace(
                    "\xa0", "").replace("\xd6", "").replace("\xe5", "")  # legal representative
        except IndexError:
            legal_representative = ""
        # print(legal_representative)
        # Natural key used for the cust_id MD5 (dedup across runs).
        md5_id = comp_name + business_type + item_date + item_number
        cols = (self.util.MD5(item_number), business_type, item_content,
                item_date, item_number, comp_name, regi_addr, crit_code,
                comp_type, operating_period, investment_industry,
                business_scope, total_investment, registered_capital,
                pymysql.escape_string(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        self.util.insert2mysql(comp_name, self.get_sql(cols))
        return md5_id, item_number

    def get_sql(self, col_tuple):
        """Build the INSERT statement for the main info table from a 17-tuple
        (see parse_17/parse_18 for the field order)."""
        info_sql = """
            insert into
            wzzxbs_mofcom_info(
                id,
                business_type,
                item_content,
                item_date,
                item_number,
                comp_name,
                regi_addr,
                crit_code,
                comp_type,
                operating_period,
                investment_industry,
                business_scope,
                total_investment,
                registered_capital,
                legal_representative,
                cust_id,
                craw_time
            )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
        """ % col_tuple
        return info_sql

    def parse_invesment_info(self, detail_html, md5_id, n):
        """Parse section 3 (investor basic info) and insert one row per
        investor; n is the row index of the investor sub-table (17 or 18,
        matching the layout variant)."""
        for mes in detail_html.xpath(
                "//div[@class=\"Table1\"]/table/tr[{}]/table/tr".format(
                    n))[1:]:
            name_of_investor = str.split(mes.xpath("./td[1]/text()")[0], " ")[0]\
                .replace("\ue07e", "")\
                .replace("\xe5", "")\
                .replace("\xd6", "")  # investor name
            # print(name_of_investor)
            different_countries = mes.xpath("./td[2]/text()")[0].replace(
                "\xe5", "")  # investor country/region
            # print(different_countries)
            amount_invested = str.split(mes.xpath("./td[3]/text()")[0], " ")[0]\
                .replace("\xa0", "")\
                .replace("\xd6", "")\
                .replace("\xe5", "")\
                .replace("\ue07e", "")  # amount invested
            # print(amount_invested)
            investment_sql = """
                insert into
                wzzxbs_mofcom_investment_info(
                    id,
                    name_of_investor,
                    different_countries,
                    amount_invested,
                    cust_id,
                    craw_time
                )values('%s', '%s', '%s', '%s', '%s', '%s')
            """ % (self.util.MD5(name_of_investor + different_countries + amount_invested),
                   pymysql.escape_string(name_of_investor), different_countries,
                   amount_invested, self.util.MD5(md5_id), self.util.get_now_time())
            self.util.insert2mysql("投资信息|", investment_sql)

    def parse(self, num):
        """Fetch listing page `num`, then fetch and parse every detail page on
        it. Both levels retry forever on exceptions (5-minute back-off on
        generic errors)."""
        self.data["page.currentPage"] = str(num)
        if num:
            self.data["page.start"] = str((int(num) - 1) * 2000)
        while True:  # retry loop for the listing request
            try:
                page_req = requests.post(url=self.url, headers=self.headers, data=self.data)
                items = self.util.get_json_obj(page_req.text)["rows"]
                page_req.close()
                for item in items:
                    business_type = item["data"][1]
                    # Record id embedded in the grid's onclick handler.
                    item_code = re.findall(r'showRecordInfo\(\"(.*?)\"\)', item["data"][0])[0]
                    detail_url = self.detail_base_url.format(
                        item_code, self.util.get_stamp())  # detail-page URL
                    print(detail_url)
                    self.detail_data["params.recordId"] = item_code
                    self.detail_data["time"] = self.util.get_stamp()
                    while True:  # retry loop for the detail request
                        try:
                            # NOTE(review): GET with a data= body is unusual;
                            # presumably the server ignores the body -- confirm.
                            detail_req = requests.get(
                                url=detail_url,
                                headers=self.detail_headers,
                                data=self.detail_data)  # detail-page request
                            detail_html = self.util.get_xpath_obj(
                                detail_req.text)
                            detail_req.close()
                            # Choose the parser by counting table rows (18-row
                            # layout has the extra "change item" row).
                            if len(
                                    detail_html.xpath(
                                        "//div[@class=\"Table1\"]/table[1]/tr")
                            ) == 18:
                                try:
                                    md5_id, item_number = self.parse_18(
                                        detail_html, business_type)
                                    self.parse_invesment_info(
                                        detail_html, md5_id, 18)
                                except Exception as e18:
                                    print("e18" + str(e18))
                                    print("问题在此处被捕获了")
                            else:
                                try:
                                    md5_id, item_number = self.parse_17(
                                        detail_html, business_type)
                                    # Section 3: investor basic information
                                    self.parse_invesment_info(
                                        detail_html, md5_id, 17)
                                except Exception as e17:
                                    print("e17" + str(e17))
                                    print("问题在此处被捕获了")
                            break
                        except requests.exceptions.ChunkedEncodingError as e:
                            print("e" + str(e))
                        except Exception as e1:
                            print("e1" + str(e1))
                            print("==>远程关闭连接,休息等待中。。。")
                            time.sleep(300)
                    time.sleep(1.5)
                break
            except requests.exceptions.ChunkedEncodingError as e2:
                print("e2" + str(e2))
            except Exception as e3:
                print("e3" + str(e3))
                print("=====>远程关闭连接,休息等待中。。。")
                time.sleep(300)

    def main(self):
        """Entry point: fetch the grid row count, then crawl pages starting
        from 29. NOTE(review): the loop upper bound is the total ROW count,
        not the page count, so it overshoots the real number of 2000-row
        pages -- verify intent (likely a resume-from-page-29 hack)."""
        req = requests.post(url=self.url, headers=self.headers, data=self.data)  # initial data request
        res_json = self.util.get_json_obj(req.text)
        self.data["page.rowCount"] = res_json["rowCount"]
        for i in range(29, int(res_json["rowCount"])):
            print("#####{}#####".format(i))
            self.parse(i)
            time.sleep(30)