def get_json(self):
    """Query Baidu's dishonest-judgment-debtor card API for each company and store the result.

    Company names come from ``read_company2(self.path)``. Blank names, and
    names for which ``Item_dump(company).item_dump()`` returns a truthy
    value, are skipped. For every other name the response body is handed to
    ``Handel_json`` and the parsed list is saved through
    ``self.save_mongodb`` under the key ``"失信信息"``.

    A failed HTTP request is appended to ``log/ss_log.log`` and that company
    is skipped; nothing is raised for it.
    """
    # Imported here so the method does not rely on a module-level import.
    from urllib.parse import quote

    for raw_name in read_company2(self.path):
        company = raw_name.strip()
        if not company:
            continue
        # presumably a truthy result means the company was already stored —
        # verify against Item_dump
        if Item_dump(company).item_dump():
            continue

        item = {"company": company, "type": self.type}
        nd = int(time.time()) * 1000
        nd1 = nd + 2500000
        # The name is percent-encoded so characters such as "&", "#" or "="
        # inside it cannot break the query string apart.
        url = "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?resource_id=6899&query=%E5%A4%B1%E4%BF%A1%E8%A2%AB%E6%89%A7%E8%A1%8C%E4%BA%BA%E5%90%8D%E5%8D%95&cardNum=&iname={}&areaName=&ie=utf-8&oe=utf-8&format=json&t={}&cb=jQuery1102025075011778413225_{}&_={}".format(
            quote(company, safe=""), nd1, self.code, nd)
        headers = {"User-Agent": Rand_ua().rand_chose()}
        try:
            response = requests.get(url, headers=headers, timeout=60)
        except Exception as e:
            with open('log/ss_log.log', 'a') as f:
                now = str(datetime.datetime.now())
                f.write(now + ',' + str(e) + ',' + company + ',' + '失信信息' + '\n')
            continue

        # Called ``body`` rather than ``json`` so the json module name is not shadowed.
        body = response.content.decode()
        item["失信信息"] = Handel_json(body, company).handel_json()
        print(item)
        self.save_mongodb(item)
        # Pause for three seconds before the next company.
        time.sleep(3)
def __init__(self, company_name):
    """Start a Chrome driver that sends a randomly chosen User-Agent.

    Args:
        company_name: stored unchanged on ``self.company_name``.
    """
    ua = Rand_ua().rand_chose()
    self.cp_url = "http://wenshu.court.gov.cn/"
    self.company_name = company_name
    options = webdriver.ChromeOptions()
    # Raw string: "\p" and "\c" are not valid escape sequences, so the plain
    # literal triggers an invalid-escape warning. The runtime value is unchanged.
    path = r"E:\python开发环境\chromedriver.exe"
    options.add_argument('--user-agent={}'.format(ua))
    self.driver = webdriver.Chrome(chrome_options=options, executable_path=path)
def __init__(self, path, text_list=None, type=None, proxies=None):
    """Store the run configuration and open the MongoDB collection.

    Args:
        path: path of the company list.
        text_list: test list.
        type: company type.
        proxies: stored on ``self.proxies``.
    """
    self.proxies = proxies
    self.u = Rand_ua()

    # Local MongoDB: database "qg_ss", collection "shesu_info".
    self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    database = self.client["qg_ss"]
    self.conn = database['shesu_info']

    self.type, self.text_list, self.path = type, text_list, path

    # Template; the ``{}`` placeholder receives the ``random`` query value.
    self.captcha_url = (
        'http://zhixing.court.gov.cn/search/captcha.do'
        '?captchaId=fda97538121240b38b0c73eeac144dbe&random={}'
    )
def text_dama():
    """Manually exercise the captcha-decoding service.

    Downloads one captcha image from zhixing.court.gov.cn, passes the raw
    bytes to ``DamatuApi.decode`` and prints the result.
    """
    headers = {'User-Agent': Rand_ua().rand_chose()}
    # The ``random`` query parameter is filled with a random fraction.
    captcha_url = 'http://zhixing.court.gov.cn/search/captcha.do?captchaId=fda97538121240b38b0c73eeac144dbe&random={}'.format(
        random.randint(999999999999999, 9999999999999999) / 10000000000000000)
    # Without a timeout requests may wait forever; 60 s matches the other
    # requests made in this file.
    response = requests.get(captcha_url, headers=headers, timeout=60)
    print(type(response.content))
    # NOTE(review): the account credentials are hard-coded; consider loading
    # them from configuration instead.
    dmt = DamatuApi("469819183", "54188")
    # 42 is presumably the captcha type code expected by the service — TODO confirm.
    ret = dmt.decode(response.content, 42)
    print(ret)
def __init__(self, company):
    """Start a Chrome driver with a random User-Agent for ``company``.

    If the driver cannot be created the error is passed to
    ``Log('log/cp_log.log', e=e)`` and ``self.driver`` stays ``None``.

    Args:
        company: stored unchanged on ``self.company``.
    """
    # Plain attributes are assigned first so the instance is fully formed
    # even when the driver fails to start.
    self.cp_url = "http://wenshu.court.gov.cn/"
    self.company = company
    self.driver = None

    ua = Rand_ua().rand_chose()
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--user-agent={}'.format(ua))
        self.driver = webdriver.Chrome(chrome_options=options)
    except Exception as e:
        Log('log/cp_log.log', e=e)
def _search_company(company, captcha_url):
    """Search zhixing.court.gov.cn for ``company`` and collect the case details.

    Args:
        company: name submitted as the ``pname`` form field.
        captcha_url: URL of the captcha image to download and decode.

    Returns:
        The value returned by ``get_detail`` for the result page, or the
        string "验证码获取失败!" when the captcha could not be downloaded
        or decoded.

    Raises:
        AssertionError: the result page reports a wrong captcha, or the
            search did not answer with HTTP 200.
    """
    headers = {'User-Agent': Rand_ua().rand_chose()}
    try:
        captcha_response = requests.get(captcha_url, headers=headers, timeout=60)
        # Alternative recogniser left disabled by the original author:
        # captcha = indetify(captcha_response.content)
        # NOTE(review): the account credentials are hard-coded; consider
        # loading them from configuration instead.
        dmt = DamatuApi("469819183", "54188")
        captcha = dmt.decode(captcha_response.content, 42)
    except Exception as e:
        print(e)
        with open("log/shesu_log.log", 'a') as f:
            now = str(datetime.datetime.now())
            f.write(now + ',' + company + ',' + str(e) + '\n')
        return "验证码获取失败!"

    post_data = {
        "searchCourtName": "全国法院(包含地方各级法院)",
        "selectCourtId": 1,
        "selectCourtArrange": 1,
        "pname": company,
        "cardNum": "",
        "j_captcha": captcha,
        "captchaId": "fda97538121240b38b0c73eeac144dbe",
    }
    resp = requests.post('http://zhixing.court.gov.cn/search/newsearch', data=post_data,
                         headers=headers, timeout=60)
    print("查询++++++++++", resp.status_code)
    html = etree.HTML(resp.content.decode())

    # The page title tells whether the captcha was rejected.
    text = html.xpath("//title/text()")[0]
    print("*" * 20, text)

    # Raised explicitly instead of via ``assert`` so the check still runs
    # under ``python -O``; the exception type is unchanged for callers.
    if text == "验证码出现错误,请重新输入!" or resp.status_code != 200:
        raise AssertionError(
            "search rejected (title={!r}, status={})".format(text, resp.status_code))

    # Request the detail pages.
    return get_detail(html, captcha, headers, company)
def __init__(self, company):
    """Start a Chrome driver with a random User-Agent for ``company``.

    If the driver cannot be created the error is passed to
    ``Log('log/cp_log.log', e=e)`` and ``self.driver`` stays ``None``.

    Args:
        company: stored unchanged on ``self.company``.
    """
    # Plain attributes are assigned first so the instance is fully formed
    # even when the driver fails to start.
    self.cp_url = "http://wenshu.court.gov.cn/"
    self.company = company
    self.driver = None

    ua = Rand_ua().rand_chose()
    try:
        options = webdriver.ChromeOptions()
        # Raw string: "\p" and "\c" are not valid escape sequences, so the
        # plain literal triggers an invalid-escape warning. The runtime value
        # is unchanged.
        path = r"E:\python开发环境\chromedriver.exe"
        options.add_argument('--user-agent={}'.format(ua))
        # Disabled by the original author, who notes that image-free mode
        # did not work well:
        # options.add_argument("headless")
        # prefs = {'profile.default_content_setting_values': {'images': 2}}
        # options.add_experimental_option('prefs', prefs)
        self.driver = webdriver.Chrome(chrome_options=options, executable_path=path)
    except Exception as e:
        Log('log/cp_log.log', e=e)
        # self.display.stop()
def get_detail(self, href):
    """Download the text of the judgment document referenced by ``href``.

    Example inputs and the URL that is requested:
        /content/content?DocID=b18f2733-6f07-4d42-ab8b-d1859ce3222f&KeyWord=...
        http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=d47506cc-644b-4c51-bffe-a71c0102f2b9

    Args:
        href: link containing a ``DocID=<id>`` query parameter, or the
            sentinel string "未获取到文书id".

    Returns:
        The list of text nodes found under ``<body>`` on success;
        "未获取到文书id" when ``href`` is the sentinel or carries no DocID;
        "获取文书内容失败" when the download or parsing fails.
    """
    no_doc_id = "未获取到文书id"
    failed = "获取文书内容失败"

    if href == no_doc_id:
        return no_doc_id
    # ``[^&]+`` also accepts a DocID that is the last query parameter, which
    # a pattern requiring a trailing "&" would miss.
    match = re.search(r"DocID=([^&]+)", href)
    if match is None:
        # Every "no id" path returns the sentinel instead of an implicit None.
        return no_doc_id
    doc_id = match.group(1)

    headers = {"User-Agent": Rand_ua().rand_chose()}
    d_url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID={}".format(
        doc_id)
    print(d_url)
    # Download, decode and parse share one handler: any failure is logged
    # and reported with the same sentinel string.
    try:
        ret = requests.get(d_url, headers=headers, timeout=60)
        xml = etree.HTML(ret.content.decode())
        ws_detail = xml.xpath("//body//text()")
    except Exception as e:
        print("*" * 10, str(e))
        Log('log/cp_log.log', e=e)
        return failed
    time.sleep(2)
    return ws_detail
class ZhixingSpider:
    """Collect enforcement-case records from zhixing.court.gov.cn into MongoDB.

    For each company the spider downloads a captcha image, has it
    recognised, submits the search form, requests the detail endpoint for
    every result row and stores the collected fields under "涉诉信息".
    """

    # Keys read from each detail response, in output order: party name,
    # case number, filing date, ID card number, enforcing court, amount.
    _DETAIL_FIELDS = ("pname", "caseCode", "caseCreateTime",
                      "partyCardNum", "execCourtName", "execMoney")
    # Placeholder stored when a key is absent from a detail response.
    _MISSING = "未获取到"

    def __init__(self, path, text_list=None, type=None, proxies=None):
        """Store the run configuration and open the MongoDB collection.

        Args:
            path: path of the company list.
            text_list: test list; stored but not read by this class.
            type: company type, copied into every item.
            proxies: passed as ``proxies=`` to every ``requests`` call.
        """
        self.proxies = proxies
        self.u = Rand_ua()
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.conn = self.client["qg_ss"]['shesu_info']
        self.type = type  # company type
        self.text_list = text_list  # test list
        self.path = path  # path of the company list
        self.captcha_url = 'http://zhixing.court.gov.cn/search/captcha.do?captchaId=fda97538121240b38b0c73eeac144dbe&random={}'

    @staticmethod
    def _log_failure(company, error):
        """Append a ``time,company,error`` line to log/shesu_log.log."""
        with open("log/shesu_log.log", 'a') as f:
            now = str(datetime.datetime.now())
            f.write(now + ',' + str(company) + ',' + str(error) + '\n')

    def _new_captcha_url(self):
        """Fill the captcha URL template with a fresh random fraction."""
        return self.captcha_url.format(
            random.randint(999999999999999, 9999999999999999) / 10000000000000000)

    @retry(stop_max_attempt_number=5, stop_max_delay=60000)
    def _search_company(self, company, captcha_url):
        """Run one captcha-protected search for ``company``.

        Returns:
            The list built by ``get_detail``, or the string
            "验证码获取失败!" when the captcha could not be downloaded or
            recognised.

        Raises:
            AssertionError: the result page reports a wrong captcha, or the
                search did not answer with HTTP 200.
        """
        headers = {'User-Agent': self.u.rand_chose()}
        try:
            captcha_response = requests.get(captcha_url, headers=headers, timeout=60,
                                            proxies=self.proxies)
            captcha = indetify(captcha_response.content)
        except Exception as e:
            print(e)
            self._log_failure(company, e)
            return "验证码获取失败!"

        post_data = {
            "searchCourtName": "全国法院(包含地方各级法院)",
            "selectCourtId": 1,
            "selectCourtArrange": 1,
            "pname": company,
            "cardNum": "",
            "j_captcha": captcha,
            "captchaId": "fda97538121240b38b0c73eeac144dbe",
        }
        resp = requests.post('http://zhixing.court.gov.cn/search/newsearch', data=post_data,
                             headers=headers, timeout=60, proxies=self.proxies)
        print("查询++++++++++", resp.status_code)
        html = etree.HTML(resp.content.decode())

        # The page title tells whether the captcha was rejected.
        text = html.xpath("//title/text()")[0]
        print("*" * 20, text)

        # Captcha handling: after more than five attempts the company is
        # given up on and logged (presumably via the @retry decorator plus
        # search_company — confirm against the retry library in use).
        # Raised explicitly instead of via ``assert`` so the check still
        # runs under ``python -O``.
        if text == "验证码出现错误,请重新输入!" or resp.status_code != 200:
            raise AssertionError(
                "search rejected (title={!r}, status={})".format(text, resp.status_code))

        # Request the detail pages.
        return self.get_detail(html, captcha, headers, company=company)

    def search_company(self, company, captcha_url):
        """Search for ``company``; on any exception log it and return a placeholder.

        Returns:
            The result of ``_search_company``, or ``["未获取到信息"]`` when it
            raised. In that case the company name is also appended to
            log/except_company.csv.
        """
        try:
            return self._search_company(company, captcha_url)
        except Exception as e:
            print(e)
            self._log_failure(company, e)
            with open("log/except_company.csv", 'a') as f:
                f.write(company + '\n')
            return ["未获取到信息"]

    def get_detail(self, html, captcha, headers, company=None):
        """Fetch the detail record behind every row of a search result page.

        Args:
            html: parsed result page (must support ``xpath``).
            captcha: recognised captcha text, re-sent with each detail request.
            headers: HTTP headers reused for the detail requests.
            company: company name used only in log lines. It is optional so
                existing three-argument calls keep working.

        Returns:
            A list with one six-element list per record, in the order
            pname, caseCode, caseCreateTime, partyCardNum, execCourtName,
            execMoney. Empty when the page has no result rows.
        """
        ss_list = []
        tr_list = html.xpath("//tbody//tr")
        # More than one row is required before any result row is read.
        if len(tr_list) <= 1:
            return ss_list

        print("*" * 20)
        for tr in tr_list:
            ids = tr.xpath(".//td[@align='center']/a/@id")
            if not ids:
                continue
            case_id = ids[0]
            # Build the detail link, e.g.
            # http://zhixing.court.gov.cn/search/newdetail?id=16900266&j_captcha=pzt8&captchaId=fda97538121240b38b0c73eeac144dbe&_=1515212716230
            time_id = int(time.time()) * 1000
            detail_url = "http://zhixing.court.gov.cn/search/newdetail?id={}&j_captcha={}&captchaId=fda97538121240b38b0c73eeac144dbe&_={}".format(
                case_id, captcha, time_id)
            try:
                ret = requests.get(detail_url, headers=headers, timeout=60,
                                   proxies=self.proxies)
            except Exception as e:
                # ``company`` is a parameter here; it used to be an
                # undefined name, which raised NameError in this handler.
                self._log_failure(company, e)
                continue
            print("查看++++++++++", ret.status_code)

            # Convert the JSON body into Python objects.
            ret_dic = json.loads(ret.content.decode())
            if not isinstance(ret_dic, dict):
                # A non-object payload yields placeholders for every field.
                ret_dic = {}
            ss_list.append([ret_dic.get(key, self._MISSING) for key in self._DETAIL_FIELDS])
            time.sleep(2)
        return ss_list

    def save_mongodb(self, item):
        """Insert a copy of ``item`` into the MongoDB collection."""
        self.conn.insert_one(dict(item))
        print("保存成功!")

    def run(self):
        """Search every company from ``read_company1(self.path)`` and save the result.

        Companies for which ``Item_dump(company).item_dump()`` returns a
        truthy value are skipped.
        """
        for company in read_company1(self.path):
            if Item_dump(company).item_dump():
                continue
            item = {"company": company, "type": self.type}
            # Each search gets a captcha URL with a fresh random value.
            item["涉诉信息"] = self.search_company(company, self._new_captcha_url())
            self.save_mongodb(item)
            print(item)

    # Test mode
    def run_text(self):
        """Search every company from ``read_company2(self.path)`` and only print the result."""
        for company in read_company2(self.path):
            item = {"company": company, "type": self.type}
            # Each search gets a captcha URL with a fresh random value.
            item["涉诉信息"] = self.search_company(company, self._new_captcha_url())
            print(item)