def getShengInfo(self):
    """Fetch the court IDs of every province."""
    areacodeList = []
    # retry (switching proxy each time) until the province list is actually returned
    while len(areacodeList) <= 0:
        r = requests.get(self.href + "/court", headers=self.headers, proxies=self.proxies)
        r.encoding = "utf-8"
        tree = etree.HTML(r.text)
        areacodeList = tree.xpath('//*[@id="wrapper"]/div[4]/div/div[1]/div/span/@areacode')
        areaList = tree.xpath('//*[@id="wrapper"]/div[4]/div/div[1]/div/span/text()')
        courtCode_pattern = re.compile('<a href="/court/(\d+)" target="_blank">')
        ips = utils.getProxy()
        print("switching proxy")
        print(self.proxies)
        self.proxies["http"] = ips[0]
        self.proxies["https"] = ips[1]
        time.sleep(1)
    i = 0
    for areacode in areacodeList:
        time.sleep(3)
        href = self.href + "/court?areaCode=" + areacode
        r = requests.get(href, headers=self.headers)
        print(href)
        r.encoding = "utf-8"
        data = {}
        data["areaName"] = areaList[i]
        data["courtList"] = []
        courtCode_result = re.findall(courtCode_pattern, r.text)
        for result in courtCode_result:
            data["courtList"].append(result)
        self.areaCourtList.append(data)
        i += 1
    print("province info collected")
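# Note: utils.getProxy() is referenced throughout but not shown in this file. From its
# call sites (ips[0] fills proxies["http"], ips[1] fills proxies["https"]) it is assumed
# to return a two-element list of proxy URLs. A minimal sketch under that assumption,
# using a hard-coded placeholder pool instead of whatever real proxy source utils uses:
import random

_PROXY_POOL = [
    ("http://127.0.0.1:8080", "https://127.0.0.1:8080"),  # placeholder addresses
    ("http://127.0.0.1:8081", "https://127.0.0.1:8081"),
]

def getProxy():
    """Return [http_proxy, https_proxy] picked from the placeholder pool."""
    http_proxy, https_proxy = random.choice(_PROXY_POOL)
    return [http_proxy, https_proxy]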
def getUnitHerf(self):
    """Collect court unit codes and names from the site map, then crawl each court's hearing pages."""
    href = self.href + "/gfcms/web/unit/allUnit.do"
    r = requests.get(href, proxies=self.proxies)
    time.sleep(1)
    # "您的请求过于频繁" = "your requests are too frequent" (rate-limit page); retry with a new proxy
    while "您的请求过于频繁" in r.content:
        ips = utils.getProxy()
        print("switching proxy")
        print(self.proxies)
        self.proxies["http"] = ips[0]
        self.proxies["https"] = ips[1]
        time.sleep(1)
        r = requests.get(href, proxies=self.proxies)
    soup = BeautifulSoup(r.content)
    areas = soup.find_all('area', attrs={"shape": "circle"})
    codeName_pattern = re.compile(r"""<a href="javascript:closeME\('(.*?)','(.*?)'\)\"""", re.S)
    codeName_result = re.findall(codeName_pattern, r.content)
    for result in codeName_result:
        self.codeName[result[0]] = result[1]
    for area in areas:
        r = requests.get(self.href + area["href"], proxies=self.proxies)
        while "您的请求过于频繁" in r.content:
            ips = utils.getProxy()
            print("switching proxy")
            print(self.proxies)
            self.proxies["http"] = ips[0]
            self.proxies["https"] = ips[1]
            time.sleep(3)
            r = requests.get(self.href + area["href"], proxies=self.proxies)
        codeName_result = re.findall(codeName_pattern, r.content)
        for result in codeName_result:
            self.codeName[result[0]] = result[1]
    j = 0
    for k in self.codeName.keys():
        self.isNextFayuan = False
        self.page = 1
        self.return_time = 5
        print("court count: " + str(j) + "/" + str(len(self.codeName)))
        # print(self.codeName[k])
        j += 1
        while not self.isNextFayuan:
            self.getInfo(k, self.page)
            self.page += 1
def getYuanwen(self, href):
    """Fetch the full announcement page for one hearing and extract the handling person (承办人)."""
    s = requests.session()
    s.keep_alive = False
    while True:
        try:
            r = requests.get(self.href + href, proxies=self.proxies)
        except requests.RequestException:
            s = requests.session()
            s.keep_alive = False
            ips = utils.getProxy()
            print("request error 2: switching proxy")
            print(self.proxies)
            self.proxies["http"] = ips[0]
            self.proxies["https"] = ips[1]
            time.sleep(2)
        else:
            break
    time.sleep(2)
    # rate-limit page ("your requests are too frequent"); retry with a new proxy
    while "您的请求过于频繁" in r.content:
        ips = utils.getProxy()
        print("switching proxy")
        print(self.proxies)
        self.proxies["http"] = ips[0]
        self.proxies["https"] = ips[1]
        r = requests.get(self.href + href, proxies=self.proxies)
    soup = BeautifulSoup(r.content)
    div = soup.find("div", attrs={"class": "fy_bm_rga"})
    chengban_pattern = re.compile("<p>.*?承办人:( )?( )?(.*?)</p>")
    chengban_result = re.findall(chengban_pattern, str(div))
    chengban = ""
    try:
        chengban = chengban_result[0][2].replace(' ', '')
    except IndexError:
        pass
    return (str(div), str(chengban).strip())
def Proxy(userDN, group, role, logger):
    """Retrieve a valid user proxy for the given userDN/group/role via myproxy delegation."""
    userProxy = ''
    valid = False
    proxy = ''
    try:
        serviceCert = config.General.serviceCert
        serviceKey = config.General.serviceKey
        defaultDelegation = {
            'logger': logger,
            'credServerPath': '/data/certs/creds/',
            'myProxySvr': 'myproxy.cern.ch',
            'min_time_left': 36000,
            'serverDN': config.General.serverDN,
            'uisource': "/dev/null"
        }
        cache_area = config.General.cache_area
        getCache = re.compile('https?://([^/]*)/.*')
        myproxyAccount = getCache.findall(cache_area)[0]
        defaultDelegation['myproxyAccount'] = myproxyAccount
        defaultDelegation['server_cert'] = serviceCert
        defaultDelegation['server_key'] = serviceKey
        defaultDelegation['userDN'] = userDN
        defaultDelegation['group'] = group
        defaultDelegation['role'] = role
        valid, proxy = getProxy(defaultDelegation, logger)
    except Exception as ex:
        msg = "Error getting the user proxy"
        print(msg)
        msg += str(ex)
        msg += str(traceback.format_exc())
        logger.error(msg)
    if valid:
        userProxy = proxy
    else:
        logger.error('Did not get valid proxy.')
    logger.info("userProxy: %s" % userProxy)
    return userProxy
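# Hypothetical usage of Proxy(); the DN below is a made-up value, and config/getProxy
# are assumed to be importable from the surrounding package.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("proxy-test")
userProxy = Proxy(
    userDN="/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=jdoe/CN=000001/CN=Jane Doe",
    group="",
    role="",
    logger=logger,
)
if not userProxy:
    logger.error("proxy retrieval failed")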
def getDetailInfo(self, href):
    """Fetch a detail page and store its announcement text."""
    while True:
        try:
            r = requests.get(href, proxies=self.proxies)
        except requests.RequestException:
            ips = utils.getProxy()
            print("switching proxy")
            print(self.proxies)
            self.proxies["http"] = ips[0]
            self.proxies["https"] = ips[1]
        else:
            break
    soup = BeautifulSoup(r.content)
    yuanwen = soup.find("div", attrs={"class": "detail"})
    data = copy.deepcopy(self.data)
    data.gonggao = str(yuanwen).replace("'", "\"")
    data.created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    self.sqltemp.insertKtgg(data)
def setUp(self):
    self.vdsm = getProxy()
# Fetch one live-broadcast page by id, retrying through fresh proxies while the site
# returns a JS reload stub instead of the real content.
href = obj.href + "live/" + str(id)
print(href)
r = requests.get(href, headers=obj.headers)
time.sleep(1)
r.encoding = 'utf-8'
print(len(r.text))
s = requests.session()
s.keep_alive = False
ergodic_times = 10
while 'document.location.reload();' in r.text:
    ergodic_times -= 1
    if ergodic_times < 0:
        obj.log.log('crawler failed:id:' + str(id), 'error', 'update_err')
        break
    ips = utils.getProxy()
    print("switching proxy")
    obj.proxies["http"] = ips[0]
    obj.proxies["https"] = ips[1]
    print(obj.proxies)
    time.sleep(1)
    try:
        r.close()
        print("using proxy")
        r = requests.get(href, headers=obj.headers, proxies=obj.proxies)
    except requests.RequestException:
        ips = utils.getProxy()
        print("switching proxy")
        obj.proxies["http"] = ips[0]
        obj.proxies["https"] = ips[1]
        print(obj.proxies)
def getInfo(self, code, page):
    """Request one page of hearing announcements for the court `code` and store each row."""
    href = 'http://www.hbfy.org/gfcms/templetPro.do?templetPath=overtbegin/overtbeginPage.html'
    para = {"page": page,
            "currChannelid": "5913d1c6-a73b-4cec-923c-c63376a05752",
            "currUnitId": code,
            "siteid": "ce0b9496-6b88-4f66-8da7-ede1a989fd6e",
            "pageNum": page}
    s = requests.session()
    s.keep_alive = False
    print("requesting: " + self.codeName[code] + " page" + str(page))
    while True:
        try:
            r = requests.post(href, data=para, proxies=self.proxies)
        except requests.RequestException:
            s = requests.session()
            s.keep_alive = False
            ips = utils.getProxy()
            print("request error 1: switching proxy")
            print(self.proxies)
            time.sleep(2)
            self.proxies["http"] = ips[0]
            self.proxies["https"] = ips[1]
        else:
            break
    # rate-limit page ("your requests are too frequent"); retry with a new proxy
    while "您的请求过于频繁" in r.content:
        ips = utils.getProxy()
        print("switching proxy")
        print(self.proxies)
        self.proxies["http"] = ips[0]
        self.proxies["https"] = ips[1]
        time.sleep(3)
        r = requests.post(href, data=para, proxies=self.proxies)
    soup = BeautifulSoup(r.content)
    table = soup.find("table", attrs={"class": "zebra"})
    if table:
        trs = table.find_all("tr")
    else:
        trs = []
    # party patterns: 被告 = defendant, 原告 = plaintiff, 第三人/方 = third party
    beigao_pattern = re.compile('((被告)|(被申请(执行)?(再审)?)|(被上诉))(人)?:(.*?)(;|$)', re.S)
    yuangao_pattern = re.compile('((原告)|(申请(执行)?(再审)?)|(上诉))(人)?:(.*?)(;|$)', re.S)
    qita_pattern = re.compile('(第三)(人|方):(.*?)(;|$)', re.S)
    if len(trs) > 1:
        for i in range(1, len(trs)):
            data = copy.deepcopy(self.data)
            data.created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            tds = trs[i].find_all("td")
            a_pattern = re.compile('<a href="(.*?)".*?>.*?</a>', re.S)
            a_result = re.search(a_pattern, str(tds[0]))
            yuanwen_result = self.getYuanwen(a_result.group(1))
            data.gonggao = str(yuanwen_result[0])
            data.chengban = str(yuanwen_result[1])
            data.kaitingriqi = tds[1].string.replace('年', '-').replace('月', '-').replace("日", "")
            data.dangshiren = str(tds[2].string.strip())
            data.anyou = str(tds[3].string)
            data.fayuan = self.codeName[code]
            yuangao_result = re.findall(yuangao_pattern, str(data.dangshiren))
            beigao_result = re.findall(beigao_pattern, str(data.dangshiren))
            qita_result = re.findall(qita_pattern, str(data.dangshiren))
            try:
                data.yuangao = str(yuangao_result[0][7])
            except IndexError:
                print("no plaintiff found")
            try:
                data.qita = str(qita_result[0][2])
            except IndexError:
                print("no third party found")
            try:
                data.beigao = str(beigao_result[0][7])
            except IndexError:
                print("no defendant found")
            data.gonggao_id = str(uuid.uuid3(uuid.NAMESPACE_OID, str(data.dangshiren + "_" + data.kaitingriqi)))
            self.sqltemp.insertKtgg(data)
    else:
        self.return_time -= 1
        if self.return_time >= 0:
            print("retrying in 10 seconds")
            time.sleep(10)
            self.getInfo(code, page)
        else:
            self.isNextFayuan = True
            self.page = 1
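# Illustration only: how the party-name patterns in getInfo() are expected to pull the
# names out of a "dangshiren" (parties) string. The sample string below is made up; in
# each findall tuple, index 7 (the 8th capture group) holds the party name.
import re

yuangao_pattern = re.compile('((原告)|(申请(执行)?(再审)?)|(上诉))(人)?:(.*?)(;|$)', re.S)
beigao_pattern = re.compile('((被告)|(被申请(执行)?(再审)?)|(被上诉))(人)?:(.*?)(;|$)', re.S)

dangshiren = "原告:张三;被告:李四"  # hypothetical sample row
print(re.findall(yuangao_pattern, dangshiren)[0][7])  # -> 张三 (plaintiff)
print(re.findall(beigao_pattern, dangshiren)[0][7])   # -> 李四 (defendant)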