class WebapiCninfo:
    def __init__(self):
        """Set up header/form templates, the HTTP session and the report dates.

        Strings containing ``{}`` are templates that the request methods fill
        in (timestamps, ``codeKey`` cookie value, ``mcode`` header).
        """
        host = "webapi.cninfo.com.cn"
        origin = "http://webapi.cninfo.com.cn"
        referer = "http://webapi.cninfo.com.cn/"
        edge_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763"
        chrome_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"

        # Headers for the request whose Set-Cookie response carries codeKey.
        self.get_code_key_h = {
            "Referer": referer,
            "User-Agent": edge_ua,
            "Cache-Control": "max-age=0",
            "Accept": "image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN",
            "Accept-Encoding": "gzip, deflate",
            "Host": host,
            "Connection": "Keep-Alive",
            "Cookie": "cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557528,1564557544,1564557814,1564557966; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}",
        }

        # Headers for the request to ``get_loc_url``.
        self.get_loc_mess_h = {
            "Origin": origin,
            "Referer": referer,
            "User-Agent": edge_ua,
            "Cache-Control": "max-age=0",
            "Accept": "*/*",
            "Accept-Language": "zh-CN",
            "mcode": "{}",
            "X-Requested-With": "XMLHttpRequest",
            "Accept-Encoding": "gzip, deflate",
            "Content-Length": "0",
            "Host": host,
            "Connection": "Keep-Alive",
            "Pragma": "no-cache",
            "Cookie": "UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557966,1564558754,1564559126,{}; codeKey={}",
        }

        # Headers for the company-list requests.
        self.get_comp_name_h = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Length": "0",
            "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564622577,1564623888,1564625108,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; codeKey={}",
            "Host": host,
            "mcode": "{}",
            "Origin": origin,
            "Referer": referer,
            "User-Agent": chrome_ua,
            "X-Requested-With": "XMLHttpRequest",
        }

        # Headers for the financial-data requests.
        self.get_data_h = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Length": "0",
            "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; codeKey={}; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564623888,1564625108,1564625230,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}",
            "Host": host,
            "mcode": "{}",
            "Origin": origin,
            "Referer": referer,
            "User-Agent": chrome_ua,
            "X-Requested-With": "XMLHttpRequest",
        }

        # Form body of the financial-data request; scode/sdate/edate are
        # filled in later by the request methods.
        data_columns = (
            "SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,F006N,F007N,F008N,F009N,F010N,F011N,F012N"
            ",F013N,F014N,F015N,F016N,F017N,F018N,F019N,F020N,F021N,F022N,F023N,F024N,F025N,F026N,F027N"
            ",F028N,F029N,F030N,F031N,F032N,F033N,F034N,F035N,F036N,F037N,F038N,F039N,F040N,F041N,F043N"
            ",F044N,F045N,F046N,F047N,F048N,F049N,F050N,F051N,F052N,F053N,F054N,F055N,F056N,F057N,F058N"
            ",F059N,F060N,F061N,F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,F070N,F071N,F072N,F073N"
            ",F074N,F075N,F076N,F077N,F078N,F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N,F088N"
            ",F089N,F090N,F091N"
        )
        self.get_data_d = {
            "scode": "",
            "sdate": "",
            "edate": "",
            "type": "071001",
            "@column": data_columns,
        }

        # Form body of the company-list request.
        self.get_comp_name_d = {
            "platetype": "{}",
            "platecode": "{}",
            "@orderby": "SECCODE:asc",
            "@column": "SECCODE,SECNAME",
        }

        self.session = requests.Session()
        self.util = Util()
        self.get_code_url = "http://webapi.cninfo.com.cn/api-cloud-platform/login/getVerfyCode"
        self.get_loc_url = "https://webapi.cninfo.com.cn/api/sysapi/p_sysapi1016"

        # Quarter-end dates (YYYYMMDD) for 2017-2019, year-major order.
        self.d_date = []
        for year in ["2017", "2018", "2019"]:
            for month_day in ["0331", "0630", "0930", "1231"]:
                self.d_date.append(year + month_day)

    def parse_json(self, content):
        """Return the company-list URLs described by the region response.

        The URLs are built from the ``API`` and ``PARAM`` fields of every
        child of the fourth entry of ``records``.
        """
        payload = self.util.get_json_obj(content)
        url_template = "http://webapi.cninfo.com.cn/{}?{}&@column=SECCODE,SECNAME"
        urls = []
        for node in payload["records"][3]["children"]:
            urls.append(url_template.format(node["API"], node["PARAM"]))
        return urls

    def parse_data(self, data):
        """Insert every record of a financial-data response into ``webapi_cninfo``.

        :param data: response payload, either already decoded or a JSON
            string (decoded with ``self.util.get_json_obj``); must contain a
            ``records`` list whose entries hold every requested column.

        The year and quarter label stored with each row are derived from
        ``self.get_data_d["sdate"]``.  Every value is quoted and escaped
        before being placed in the statement, so scraped text containing a
        quote or backslash can no longer break (or alter) the SQL.
        """
        report_date = self.get_data_d["sdate"]
        y = report_date[:4]
        quarter_labels = {
            "03": "第一季度",
            "06": "第二季度",
            "09": "第三季度",
            "12": "第四季度",
        }
        quarter = quarter_labels.get(report_date[4:6], "--")

        if isinstance(data, str):
            data = self.util.get_json_obj(data)

        # Same columns, in the same order, as the request's "@column" list:
        # F006N..F091N with F042N absent.
        fields = (
            "SECCODE", "SECNAME", "STARTDATE", "ENDDATE",
            "F001D", "F002V", "F003V",
        ) + tuple("F{:03d}N".format(n) for n in range(6, 92) if n != 42)
        column_sql = ",".join(("id",) + fields + ("y", "quarter", "crawl_time"))

        def quote(value):
            # MySQL string literal: escape backslashes first, then quotes.
            text = str(value).replace("\\", "\\\\").replace("'", "\\'")
            return "'" + text + "'"

        for d in data["records"]:
            id_code = self.util.MD5(d["SECNAME"] + y + quarter)
            print(d["SECNAME"])
            values = [id_code]
            values.extend(d[name] for name in fields)
            values.extend((y, quarter, self.util.get_now_time()))
            sql = "insert into webapi_cninfo({}) values ({})".format(
                column_sql, ",".join(quote(v) for v in values))
            self.util.insert2mysql(d["SECNAME"], sql)
            time.sleep(0.3)  # throttle consecutive inserts

    def cut_comp_code(self, scode, codekey, ts):
        """Request the financial data for one batch of stock codes and store it.

        :param scode: comma-separated stock codes.
        :param codekey: ``codeKey`` cookie value of the current session.
        :param ts: session timestamp; its base64 form is sent as ``mcode``.

        The header templates in ``self.get_data_h`` are left untouched and a
        fresh header dict is built per call.  Previously the templates were
        overwritten with their formatted result, so every call after the
        first silently reused the first call's timestamp.
        """
        # The query string now carries the same reporting date as the form
        # body; it used to be pinned to 20190331 whatever quarter was
        # requested.  20190331 remains the fallback when no date is set.
        sdate = self.get_data_d["sdate"] or "20190331"
        edate = self.get_data_d["edate"] or "20190331"
        # Base URL of the data request.
        data_url = "http://webapi.cninfo.com.cn/api/stock/p_stock2332?scode={}" \
                   "&sdate={}&edate={}&type=071001&" \
                   "@column=SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,F006N,F007N,F008N," \
                   "F009N,F010N,F011N,F012N,F013N,F014N,F015N,F016N,F017N,F018N,F019N,F020N,F021N," \
                   "F022N,F023N,F024N,F025N,F026N,F027N,F028N,F029N,F030N,F031N,F032N,F033N,F034N," \
                   "F035N,F036N,F037N,F038N,F039N,F040N,F041N,F043N,F044N,F045N,F046N,F047N,F048N," \
                   "F049N,F050N,F051N,F052N,F053N,F054N,F055N,F056N,F057N,F058N,F059N,F060N,F061N," \
                   "F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,F070N,F071N,F072N,F073N,F074N," \
                   "F075N,F076N,F077N,F078N,F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N," \
                   "F088N,F089N,F090N,F091N".format(scode, sdate, edate)
        stamp = self.util.get_stamp()  # one timestamp shared by both cookie fields

        mcode = self.util.base64_encode(ts)
        if isinstance(mcode, bytes):
            mcode = mcode.decode("utf-8")

        # Build this request's headers from the pristine templates.
        headers = dict(self.get_data_h)
        headers["Cookie"] = self.get_data_h["Cookie"].format(codekey, stamp, stamp)
        headers["mcode"] = self.get_data_h["mcode"].format(mcode)

        self.get_data_d["scode"] = scode
        data = self.session.post(url=data_url, headers=headers, data=self.get_data_d).text
        self.parse_data(data)

    # 处理公司的json数据
    def parse_comp_json(self, json_res, codekey, ts, batch_size=20,
                        batch_pause=1.5, date_pause=30):
        """Fetch the financial data of every listed company for every date.

        :param json_res: company-list response; each entry of ``records``
            must carry a ``SECCODE``.
        :param codekey: ``codeKey`` cookie value of the current session.
        :param ts: session timestamp forwarded to ``cut_comp_code``.
        :param batch_size: number of stock codes sent per request.
        :param batch_pause: seconds to sleep before each batch request.
        :param date_pause: seconds to sleep after finishing one report date.

        For every date in ``self.d_date`` the codes are requested in batches
        of ``batch_size``.  The trailing partial batch is included; the
        previous loop count was ``len // 20`` in both branches, so up to 19
        companies per list were never requested.
        """
        content = self.util.get_json_obj(json_res)
        ls_comp_code = [c["SECCODE"] for c in content["records"]]  # stock codes

        for dd in self.d_date:
            print(dd)
            self.get_data_d["sdate"] = dd
            self.get_data_d["edate"] = dd
            for start in range(0, len(ls_comp_code), batch_size):
                time.sleep(batch_pause)
                scode = ",".join(ls_comp_code[start:start + batch_size])
                self.cut_comp_code(scode, codekey, ts)
            time.sleep(date_pause)

    # 获取所有公司名称
    def get_comp_name(self, get_loc_res, codekey, ts):
        """Request every company list and hand each response to ``parse_comp_json``.

        :param get_loc_res: region response understood by ``parse_json``.
        :param codekey: ``codeKey`` cookie value of the current session.
        :param ts: session timestamp; its base64 form is sent as ``mcode``.

        Headers and form data are rebuilt from the untouched templates for
        every URL.  Previously the templates were overwritten on the first
        iteration, so all later requests re-sent the first URL's
        ``platetype``/``platecode`` and the first timestamps.
        """
        mcode = self.util.base64_encode(ts)
        # cut_comp_code decodes this value, so it may be bytes; formatting
        # bytes into a str would yield "b'...'".
        if isinstance(mcode, bytes):
            mcode = mcode.decode("utf-8")

        for get_comp_name_url in self.parse_json(get_loc_res):
            # Per-request headers.
            headers = dict(self.get_comp_name_h)
            headers["Cookie"] = self.get_comp_name_h["Cookie"].format(
                self.util.get_stamp(), self.util.get_stamp(), codekey)
            headers["mcode"] = self.get_comp_name_h["mcode"].format(mcode)

            # Per-request form data, taken from the URL's own query string.
            form = dict(self.get_comp_name_d)
            form["platetype"] = self.get_comp_name_d["platetype"].format(
                re.findall(r'platetype=(\d+)&', get_comp_name_url)[0])
            form["platecode"] = self.get_comp_name_d["platecode"].format(
                re.findall(r'platecode=(\d+)&', get_comp_name_url)[0])

            # Request the company list.
            comp_name_res = self.session.post(url=get_comp_name_url,
                                              headers=headers,
                                              data=form).text
            self.parse_comp_json(comp_name_res, codekey, ts)

    def main(self):
        """Run one crawl: obtain ``codeKey``, load the region tree, fetch companies.

        :raises IndexError: when the first response sets no ``codeKey`` cookie.
        """
        # Request the page whose Set-Cookie header carries this session's codeKey.
        code_headers = dict(self.get_code_key_h)
        code_headers["Cookie"] = self.get_code_key_h["Cookie"].format(int(time.time()))
        get_code_res = self.session.get(url=self.get_code_url, headers=code_headers, verify=False)
        ts = int(time.time())  # timestamp of this session

        found = re.findall(r'codeKey=(.*?);', get_code_res.headers["Set-Cookie"])
        if not found:
            raise IndexError("no codeKey cookie in the response from " + self.get_code_url)
        codekey = found[0]

        mcode = self.util.base64_encode(ts)
        # cut_comp_code decodes this value, so it may be bytes; formatting
        # bytes into a str would yield "b'...'".
        if isinstance(mcode, bytes):
            mcode = mcode.decode("utf-8")

        # Fetch the region-classified tree.
        loc_headers = dict(self.get_loc_mess_h)
        loc_headers["mcode"] = self.get_loc_mess_h["mcode"].format(mcode)
        loc_headers["Cookie"] = self.get_loc_mess_h["Cookie"].format(
            self.util.get_stamp(), self.util.get_stamp(), codekey)
        get_loc_res = self.session.post(url=self.get_loc_url, headers=loc_headers).text

        # Walk the company lists.
        self.get_comp_name(get_loc_res, codekey, ts)
# Example #2
# 0
class JrjgcfSpider(scrapy.Spider):
    name = 'jrjgcf'
    allowed_domains = ['app.finchina.com']
    start_urls = ['https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditData_SE.action?selTopRecommended=%E9%87%91%E8%9E%8D%E7%9B%91%E7%AE%A1%E5%A4%84%E7%BD%9A&skip=1']

    def __init__(self, *args, **kwargs):
        """Create the spider and its detail-request header template.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so the spider still constructs when spider arguments are supplied.
        """
        super(JrjgcfSpider, self).__init__(*args, **kwargs)
        self.u = Util()
        # Headers for the detail requests; "Referer" is a template with two
        # ``{}`` slots that ``parse`` fills with a record id and company name.
        self.detail_headers = {
            "Host": "app.finchina.com",
            "client": "finchina",
            "system": "v4.3.1.551,13.2.3,iOS,iPhone,iPhone,iPhone11,8",
            "Accept-Encoding": "gzip;q=1.0, compress;q=0.5",
            "Accept-Language": "zh-Hans-CN;q=1.0",
            "Connection": "keep-alive",
            "Accept": "*/*",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
            "Referer": "https://app.finchina.com/finchinaAPP/f9/creditArchives/creditDetail.html?user=20191212160004_15561585051&id={}&getContent=0&token=ee7d9333-95fe-4530-b901-e05b35211cf4&companyName={}",
            "token": "0c6a8e27-d8a7-4d4a-8a78-4b89a98dcd6c",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.page = 1  # value of the ``skip`` query parameter last requested

    def parse(self, response):
        """Yield one detail request per listed record, then the next list page.

        Changes from the previous version:

        * the Referer is formatted from the untouched template for every
          record (the template used to be overwritten by the first record,
          so every later request carried the first record's Referer);
        * the response body is decoded once instead of twice;
        * an empty ``data`` list ends pagination instead of requesting
          further pages indefinitely.
        """
        payload = self.u.get_json_obj(response.body)
        if payload["returncode"] != 0:
            print("响应错误!!!")
            return

        datas = payload["data"]
        if not datas:
            # NOTE(review): an empty page is treated as the end of the
            # listing - confirm against the API's behaviour.
            return

        referer_template = self.detail_headers["Referer"]
        for data in datas:
            id_code = data["infoId"]
            name = data["related"][0]["name"]
            record_type = data["type"]
            # NOTE(review): sleeping inside a callback blocks Scrapy's event
            # loop; kept to preserve the existing request pacing.
            time.sleep(0.2)

            headers = dict(self.detail_headers)
            headers["Referer"] = referer_template.format(id_code, self.u.url_encode(name))
            headers["User-Agent"] = settings.random_ua()

            yield scrapy.Request(
                url="https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditDataContentDetails.action?"
                    "type={}&getContent=0&id={}".format(record_type, id_code),
                headers=headers,
                callback=self.parse_detail)

        self.page += 1
        time.sleep(3)
        yield scrapy.Request(
            url="https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditData_SE.action?"
                "selTopRecommended=%E9%87%91%E8%9E%8D%E7%9B%91%E7%AE%A1%E5%A4%84%E7%BD%9A&skip={}".format(self.page),
            callback=self.parse)

    def parse_detail(self, response):
        """Yield one ``JrjgcfItem`` per entry of the detail response's ``data``.

        A fresh item is created for each entry.  Previously a single item
        object was mutated and yielded repeatedly, so any consumer that kept
        a reference saw its fields overwritten by later entries.
        """
        detail_datas = self.u.get_json_obj(response.body)["data"]
        for i in detail_datas:
            item = JrjgcfItem()
            print("*" * 100)
            item["pub_date"] = i["it0026_006"]  # disclosure date
            item["about_people"] = i["it0026_005"]  # party concerned
            item["handle_people"] = i["it0026_016"]  # handling authority
            item["punish_type"] = i["risk"][0]["name"]  # punishment type
            item["irregularities"] = i["it0026_009"]  # violations
            item["punish_content"] = i["it0026_011"]  # punishment content
            item["symbol_num"] = i["it0026_017"]  # document number
            item["file_url"] = i["file"][0]["fileUrl"]
            item["file_name"] = i["file"][0]["fileName"]
            print("*" * 100)
            yield item
# Example #3
# 0
class WzzxbsMofocom:
    def __init__(self):
        """Prepare endpoints, header sets, form templates and helper objects."""
        host = "wzzxbs.mofcom.gov.cn"
        cookie = "insert_cookie=32151754"
        chrome_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"

        # List endpoint and detail-page URL template (record id, timestamp).
        self.url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadRecordData.action"
        self.detail_base_url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadEntpRecordDetails.action?params.recordId={}&time={}"

        # Headers for the list request.
        self.headers = {
            "Accept": "application/json, text/javascript, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Content-Length": "169",
            "Content-Type": "application/x-www-form-urlencoded",
            "Cookie": cookie,
            "Host": host,
            "Origin": "http://wzzxbs.mofcom.gov.cn",
            "Referer": "http://wzzxbs.mofcom.gov.cn/WebProSP/app/infoPub/entpRecord",
            "User-Agent": chrome_ua,
            "X-Requested-With": "XMLHttpRequest"
        }

        # Headers for the detail-page request.
        self.detail_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": cookie,
            "Host": host,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": chrome_ua
        }

        # Form body of the list request; the empty page fields are filled in
        # by ``parse`` and ``main``.
        self.data = {
            "params.entpName": "",
            "page.currentPage": "",
            "page.limit": "2000",
            "page.option": "next",
            "page.start": "",
            "page.rowCount": "",
            "listGrid.col": "1:showRecordInfo(0),2,3,4",
            "listGrid.type": "link,ro,ro,ro"
        }

        # Parameters sent along with the detail request.
        self.detail_data = {"params.recordId": "", "time": ""}

        self.util = Util()
        self.user_agent = UserAgent()

    def parse_18(self, detail_html, business_type):
        """Parse a detail page with the 18-row layout and store the record.

        :param detail_html: parsed detail page supporting ``.xpath()``.
        :param business_type: business type taken from the list row.
        :returns: ``(md5_id, item_number)`` where ``md5_id`` is the
            unhashed concatenation used to derive the ``cust_id`` column.
        :raises IndexError: when a mandatory cell is missing from the page.

        All scraped text is now passed through ``pymysql.escape_string``
        before being interpolated into the INSERT statement; previously only
        the legal representative was escaped, so a quote in any other field
        produced a broken statement.
        """
        row_text = "//div[@class=\"Table1\"]/table[1]/tr[{}]/td/text()"

        def first(expr):
            return detail_html.xpath(expr)[0]

        def strip_chars(text, *chars):
            # Drop stray characters found in the page text.
            for ch in chars:
                text = text.replace(ch, "")
            return text

        # 1. Filing information
        item_content = strip_chars(
            first("//div[@class=\"Table1\"]/table/tr[3]/td/text()"), "\xe5")  # changed items
        item_date = strip_chars(first(row_text.format(4)), "\xe5")  # filing completion time
        item_number = strip_chars(first(row_text.format(5)), "\xe5")  # filing number

        # 2. Basic information of the foreign-invested enterprise
        comp_name = strip_chars(
            first("//div[@class=\"Table1\"]/table/tr[7]/td/text()"),
            "\ue07e", "\xe5")  # company name
        regi_addr = strip_chars(first(row_text.format(8)),
                                "\u3bbe", "\ue07e", "\xe5", "\ue096")  # registered address
        try:
            crit_code = strip_chars(first(row_text.format(9)), "\xe5")  # unified social credit code
        except IndexError:
            crit_code = ""
        # The enterprise type is the label following the checked checkbox.
        type_cell = first("//div[@class=\"Table1\"]/table[1]/tr[10]/td")
        type_markup = str(etree.tostring(type_cell, encoding='utf-8'),
                          'utf-8').strip().replace("\xe5", "")
        comp_type = re.findall(r'checked="checked"/> (.*?)&#13;', type_markup, re.S)[0]
        operating_period = strip_chars(first(row_text.format(11)).strip(), "\xe5")  # operating period
        try:
            investment_industry = strip_chars(first(row_text.format(12)), "\xe5")  # investment industry
        except Exception:
            investment_industry = ""
        business_scope = strip_chars(first(row_text.format(13)), "\xe5")  # business scope
        try:
            total_investment = strip_chars(
                first(row_text.format(14)).split(" ")[0],
                "\xa0", "\xe5", "\ue07e")  # total investment
        except IndexError:
            total_investment = ""
        registered_capital = strip_chars(
            first(row_text.format(15)).split(" ")[0],
            "\xa0", "\xe5", "\ue07e")  # registered capital
        try:
            legal_representative = strip_chars(
                first(row_text.format(16)).split(" ")[0],
                "\xa0", "\xe5", "\ue07e").replace("\u4b72", " ")  # legal representative
        except IndexError:
            legal_representative = ""

        md5_id = comp_name + business_type + item_date + item_number
        esc = pymysql.escape_string
        cols = (self.util.MD5(item_number), esc(business_type),
                esc(item_content), esc(item_date), esc(item_number),
                esc(comp_name), esc(regi_addr), esc(crit_code),
                esc(comp_type), esc(operating_period),
                esc(investment_industry), esc(business_scope),
                esc(total_investment), esc(registered_capital),
                esc(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        self.util.insert2mysql(comp_name, self.get_sql(cols))
        return md5_id, item_number

    def parse_17(self, detail_html, business_type):
        """Parse a detail page without the "changed items" row and store it.

        Compared with the 18-row layout every cell sits one row higher and
        the "changed items" column is stored as an empty string.

        :param detail_html: parsed detail page supporting ``.xpath()``.
        :param business_type: business type taken from the list row.
        :returns: ``(md5_id, item_number)`` where ``md5_id`` is the
            unhashed concatenation used to derive the ``cust_id`` column.
        :raises IndexError: when a mandatory cell is missing from the page.

        All scraped text is now passed through ``pymysql.escape_string``
        before being interpolated into the INSERT statement; previously only
        the legal representative was escaped, so a quote in any other field
        produced a broken statement.
        """
        row_text = "//div[@class=\"Table1\"]/table[1]/tr[{}]/td/text()"

        def first(expr):
            return detail_html.xpath(expr)[0]

        def strip_chars(text, *chars):
            # Drop stray characters found in the page text.
            for ch in chars:
                text = text.replace(ch, "")
            return text

        # 1. Filing information
        item_content = ""  # changed items (absent in this layout)
        item_date = strip_chars(first(row_text.format(3)), "\xe5")  # filing completion time
        item_number = strip_chars(first(row_text.format(4)), "\xe5")  # filing number

        # 2. Basic information of the foreign-invested enterprise
        comp_name = strip_chars(
            first("//div[@class=\"Table1\"]/table/tr[6]/td/text()"),
            "\ue07e", "\xe5")  # company name
        regi_addr = strip_chars(first(row_text.format(7)),
                                "\u3bbe", "\ue07e", "\xe5", "\ue096")  # registered address
        try:
            crit_code = strip_chars(first(row_text.format(8)), "\xe5")  # unified social credit code
        except IndexError:
            crit_code = ""
        # The enterprise type is the label following the checked checkbox.
        type_cell = first("//div[@class=\"Table1\"]/table[1]/tr[9]/td")
        type_markup = str(etree.tostring(type_cell, encoding='utf-8'),
                          'utf-8').strip().replace("\xe5", "")
        comp_type = re.findall(r'checked="checked"/> (.*?)&#13;', type_markup, re.S)[0]
        operating_period = strip_chars(first(row_text.format(10)).strip(), "\xe5")  # operating period
        try:
            investment_industry = strip_chars(first(row_text.format(11)), "\xe5")  # investment industry
        except Exception:
            investment_industry = ""
        business_scope = strip_chars(first(row_text.format(12)), "\xe5")  # business scope
        try:
            total_investment = strip_chars(
                first(row_text.format(13)).split(" ")[0],
                "\xa0", "\xe5")  # total investment
        except IndexError:
            total_investment = ""
        registered_capital = strip_chars(
            first(row_text.format(14)).split(" ")[0],
            "\xa0", "\xe5")  # registered capital
        try:
            legal_representative = strip_chars(
                first(row_text.format(15)).split(" ")[0],
                "\xa0", "\xd6", "\xe5")  # legal representative
        except IndexError:
            legal_representative = ""

        md5_id = comp_name + business_type + item_date + item_number
        esc = pymysql.escape_string
        cols = (self.util.MD5(item_number), esc(business_type),
                esc(item_content), esc(item_date), esc(item_number),
                esc(comp_name), esc(regi_addr), esc(crit_code),
                esc(comp_type), esc(operating_period),
                esc(investment_industry), esc(business_scope),
                esc(total_investment), esc(registered_capital),
                esc(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        self.util.insert2mysql(comp_name, self.get_sql(cols))
        return md5_id, item_number

    def get_sql(self, col_tuple):
        """Render the INSERT statement for ``wzzxbs_mofcom_info``.

        :param col_tuple: the 17 column values, in the order of the column
            list below; they are interpolated as quoted literals.
        :returns: the complete SQL text.
        """
        return """
                            insert into wzzxbs_mofcom_info(
                            id,
                            business_type,
                            item_content,
                            item_date,
                            item_number,
                            comp_name,
                            regi_addr,
                            crit_code,
                            comp_type,
                            operating_period,
                            investment_industry,
                            business_scope,
                            total_investment,
                            registered_capital,
                            legal_representative,
                            cust_id,
                            craw_time
                            )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                        """ % col_tuple

    def parse_invesment_info(self, detail_html, md5_id, n):
        """Store every investor row of the nested investor table.

        :param detail_html: parsed detail page supporting ``.xpath()``.
        :param md5_id: unhashed record key; its MD5 is stored as ``cust_id``.
        :param n: index of the outer table row that holds the investor table.

        The first row of the nested table is skipped.  Country and amount
        are now escaped like the investor name; they used to be
        interpolated into the statement unescaped.
        """
        def strip_chars(text, *chars):
            # Drop stray characters found in the page text.
            for ch in chars:
                text = text.replace(ch, "")
            return text

        esc = pymysql.escape_string
        investor_rows = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[{}]/table/tr".format(n))[1:]
        for mes in investor_rows:
            name_of_investor = strip_chars(
                mes.xpath("./td[1]/text()")[0].split(" ")[0],
                "\ue07e", "\xe5", "\xd6")
            different_countries = mes.xpath("./td[2]/text()")[0].replace(
                "\xe5", "")
            amount_invested = strip_chars(
                mes.xpath("./td[3]/text()")[0].split(" ")[0],
                "\xa0", "\xd6", "\xe5", "\ue07e")
            investment_sql = """
                insert into wzzxbs_mofcom_investment_info(
                id,
                name_of_investor,
                different_countries,
                amount_invested,
                cust_id,
                craw_time
                )values('%s', '%s', '%s', '%s', '%s', '%s')
            """ % (self.util.MD5(name_of_investor + different_countries +
                                 amount_invested),
                   esc(name_of_investor),
                   esc(different_countries),
                   esc(amount_invested),
                   self.util.MD5(md5_id),
                   self.util.get_now_time())
            self.util.insert2mysql("投资信息|", investment_sql)

    def parse(self, num):
        """Fetch list page ``num`` and store every record it references.

        For each row of the list response the record id is extracted, the
        detail page is downloaded and handed to ``parse_18`` (when the first
        table has 18 rows) or ``parse_17`` (otherwise), followed by
        ``parse_invesment_info``.

        Both the page request and each detail request sit in a
        ``while True`` loop: a ``ChunkedEncodingError`` is retried
        immediately, any other exception is retried after a 300-second
        sleep, and there is no retry limit.  Errors raised while parsing a
        detail page are printed and that record is skipped.
        """
        self.data["page.currentPage"] = str(num)
        if num:
            # Offset of the first row of this page (2000 rows per page).
            self.data["page.start"] = str((int(num) - 1) * 2000)
        while True:
            try:
                page_req = requests.post(url=self.url,
                                         headers=self.headers,
                                         data=self.data)
                items = self.util.get_json_obj(page_req.text)["rows"]
                page_req.close()

                for item in items:  # one list row
                    business_type = item["data"][1]
                    item_code = re.findall(r'showRecordInfo\(\"(.*?)\"\)',
                                           item["data"][0])[0]
                    detail_url = self.detail_base_url.format(
                        item_code, self.util.get_stamp())  # detail page request URL
                    print(detail_url)
                    self.detail_data["params.recordId"] = item_code
                    self.detail_data["time"] = self.util.get_stamp()
                    while True:
                        try:
                            detail_req = requests.get(
                                url=detail_url,
                                headers=self.detail_headers,
                                data=self.detail_data)  # detail page request
                            detail_html = self.util.get_xpath_obj(
                                detail_req.text)
                            detail_req.close()
                            # The row count of the first table selects the parser.
                            if len(
                                    detail_html.xpath(
                                        "//div[@class=\"Table1\"]/table[1]/tr")
                            ) == 18:
                                try:
                                    md5_id, item_number = self.parse_18(
                                        detail_html, business_type)
                                    self.parse_invesment_info(
                                        detail_html, md5_id, 18)
                                except Exception as e18:
                                    print("e18" + str(e18))
                                    print("问题在此处被捕获了")
                            else:
                                try:
                                    md5_id, item_number = self.parse_17(
                                        detail_html, business_type)
                                    # 3. Basic information of the investors
                                    self.parse_invesment_info(
                                        detail_html, md5_id, 17)
                                except Exception as e17:
                                    print("e17" + str(e17))
                                    print("问题在此处被捕获了")
                            # Detail page handled (or skipped): leave the retry loop.
                            break
                        except requests.exceptions.ChunkedEncodingError as e:
                            # Retried immediately, without sleeping.
                            print("e" + str(e))
                        except Exception as e1:
                            print("e1" + str(e1))
                            print("==>远程关闭连接,休息等待中。。。")
                            time.sleep(300)
                    time.sleep(1.5)
                # Whole page processed: leave the page retry loop.
                break
            except requests.exceptions.ChunkedEncodingError as e2:
                # Retried immediately, without sleeping.
                print("e2" + str(e2))
            except Exception as e3:
                print("e3" + str(e3))
                print("=====>远程关闭连接,休息等待中。。。")
                time.sleep(300)

    def main(self, start_page=29):
        """Crawl the record list page by page, starting at ``start_page``.

        :param start_page: first page number passed to ``parse``; the
            default keeps the previously hard-coded starting point.

        The initial request supplies ``rowCount``.  The loop now stops after
        the last page that can contain rows; it used to run up to
        ``rowCount`` itself, treating the row count as a page count and
        requesting pages far beyond the end of the data, each followed by a
        30-second pause.
        """
        req = requests.post(url=self.url, headers=self.headers,
                            data=self.data)  # initial data request
        res_json = self.util.get_json_obj(req.text)
        req.close()
        self.data["page.rowCount"] = res_json["rowCount"]

        row_count = int(res_json["rowCount"])
        page_size = int(self.data["page.limit"])
        total_pages = -(-row_count // page_size)  # ceiling division
        # NOTE(review): pages are assumed to be numbered from 1, matching
        # the page.start computation in parse() - confirm against the API.
        for i in range(start_page, total_pages + 1):
            print("#####{}#####".format(i))
            self.parse(i)
            time.sleep(30)