Example #1
 def getShengInfo(self):
     """
     Fetch the court ID list for each province.
     """
     areacodeList = []
     while len(areacodeList) <= 0:
         r = requests.get(self.href + "/court", headers=self.headers, proxies=self.proxies)
         r.encoding = "utf-8"
         tree = etree.HTML(r.text)
         areacodeList = tree.xpath('//*[@id="wrapper"]/div[4]/div/div[1]/div/span/@areacode')
         areaList = tree.xpath('//*[@id="wrapper"]/div[4]/div/div[1]/div/span/text()')
         courtCode_pattern = re.compile(r'<a href="/court/(\d+)" target="_blank">')
         ips = utils.getProxy()
         print("换代理")
         print(obj.proxies)
         self.proxies["http"] = ips[0]
         self.proxies["https"] = ips[1]
         time.sleep(1)
     i = 0
     for areacode in areacodeList:
         time.sleep(3)
         href = self.href + "/court?areaCode=" + areacode
         r = requests.get(href, headers=self.headers)
         print(href)
         r.encoding = "utf-8"
         data = {}
         data["areaName"] = areaList[i]
         data["courtList"] = []
         courtCode_result = re.findall(courtCode_pattern, r.text)
         for result in courtCode_result:
             data["courtList"].append(result)
         self.areaCourtList.append(data)
         i += 1
     print(u"省信息获取完毕")
Example #2
 def getUnitHerf(self):
     href = self.href + "/gfcms/web/unit/allUnit.do"
     r = requests.get(href, proxies=self.proxies)
     time.sleep(1)
     while "您的请求过于频繁" in r.content:
         ips = utils.getProxy()
         print("换代理")
         print(self.proxies)
         self.proxies["http"] = ips[0]
         self.proxies["https"] = ips[1]
         time.sleep(1)
         r = requests.get(href, proxies=self.proxies)
     soup = BeautifulSoup(r.content, "html.parser")
     areas = soup.find_all('area', attrs={"shape": "circle"})
     codeName_pattern = re.compile("""<a href="javascript:closeME\('(.*?)','(.*?)'\)\"""", re.S)
     codeName_result = re.findall(codeName_pattern, r.text)
     for result in codeName_result:
         self.codeName[result[0]] = result[1]
     for area in areas:
         r = requests.get(self.href + area["href"], proxies=self.proxies)
         while "您的请求过于频繁" in r.content:
             ips = utils.getProxy()
             print("换代理")
             print(self.proxies)
             self.proxies["http"] = ips[0]
             self.proxies["https"] = ips[1]
             time.sleep(3)
             r = requests.get(self.href + area["href"], proxies=self.proxies)
         codeName_result = re.findall(codeName_pattern, r.text)
         for result in codeName_result:
             self.codeName[result[0]] = result[1]
     j = 0
     for k in self.codeName.keys():
         self.isNextFayuan = False
         self.page = 1
         self.return_time = 5
         print("法院个数:" + str(j) + "/" + str(len(self.codeName)))
         # print(self.codeName[k])
         j += 1
         while not self.isNextFayuan:
             self.getInfo(k, self.page)
             self.page += 1
Example #3
    def getYuanwen(self, href):
        s = requests.session()
        s.keep_alive = False
        while True:
            try:
                r = requests.get(self.href + href, proxies=self.proxies)
            except requests.exceptions.RequestException:
                s = requests.session()
                s.keep_alive = False
                ips = utils.getProxy()
                print("请求错误2:换代理")
                print(self.proxies)
                self.proxies["http"] = ips[0]
                self.proxies["https"] = ips[1]
                time.sleep(2)
            else:
                break
        time.sleep(2)
        while "您的请求过于频繁" in r.content:
            ips = utils.getProxy()
            print("换代理")
            print(self.proxies)
            self.proxies["http"] = ips[0]
            self.proxies["https"] = ips[1]
            r = requests.get(self.href + href, proxies=self.proxies)
        soup = BeautifulSoup(r.content, "html.parser")
        div = soup.find("div", attrs={"class": "fy_bm_rga"})
        chengban_pattern = re.compile("<p>.*?承办人:( )?( )?(.*?)</p>")

        chengban_result = re.findall(chengban_pattern, str(div))
        chengban = ""
        try:
            chengban = chengban_result[0][2].replace(' ', '')
        except IndexError:
            pass
        return (str(div), str(chengban).strip())
Example #4
def Proxy(userDN, group, role, logger):
    """

    """
    userProxy = ''
    valid = False

    try:
        serviceCert = config.General.serviceCert
        serviceKey = config.General.serviceKey

        defaultDelegation = {
            'logger': logger,
            'credServerPath': '/data/certs/creds/',
            'myProxySvr': 'myproxy.cern.ch',
            'min_time_left': 36000,
            'serverDN': config.General.serverDN,
            'uisource': "/dev/null"
        }

        cache_area = config.General.cache_area
        getCache = re.compile('https?://([^/]*)/.*')
        myproxyAccount = getCache.findall(cache_area)[0]
        defaultDelegation['myproxyAccount'] = myproxyAccount

        defaultDelegation['server_cert'] = serviceCert
        defaultDelegation['server_key'] = serviceKey

        valid = False
        defaultDelegation['userDN'] = userDN
        defaultDelegation['group'] = group
        defaultDelegation['role'] = role

        valid, proxy = getProxy(defaultDelegation, logger)
    except Exception as ex:
        msg = "Error getting the user proxy"
        print(msg)
        msg += str(ex)
        msg += str(traceback.format_exc())
        logger.error(msg)
    if valid:
        userProxy = proxy
    else:
        logger.error('Did not get valid proxy.')

    logger.info("userProxy: %s" % userProxy)

    return userProxy
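
A minimal usage sketch for Proxy(), assuming config, getProxy and the credential files referenced above are already set up; the DN, group and role values here are purely illustrative:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("proxy_example")

# Illustrative arguments only; a real caller would pass an actual user DN, group and role.
userProxy = Proxy("/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=jdoe", "analysis", "user", logger)
if not userProxy:
    logger.error("proxy delegation failed")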
Example #5
 def getDetailInfo(self, href):
     while True:
         try:
             r = requests.get(href, proxies=self.proxies)
         except requests.exceptions.RequestException:
             ips = utils.getProxy()
             print("switching proxy")
             print(self.proxies)
             self.proxies["http"] = ips[0]
             self.proxies["https"] = ips[1]
         else:
             break
     soup = BeautifulSoup(r.content, "html.parser")
     yuanwen = soup.find("div", attrs={"class": "detail"})
     data = copy.deepcopy(self.data)
     data.gonggao = str(yuanwen).replace("'", "\"")
     data.created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     self.sqltemp.insertKtgg(data)
Example #6
 def setUp(self):
     self.vdsm = getProxy()
Example #7
        href = obj.href + "live/" + str(id)
        print(href)
        r = requests.get(href, headers=obj.headers)
        time.sleep(1)
        r.encoding = 'utf-8'
        print(len(r.text))
        s = requests.session()
        s.keep_alive = False

        ergodic_times = 10
        while 'document.location.reload();' in r.text:
            ergodic_times -= 1
            if ergodic_times < 0:
                obj.log.log('crawler failed:id:' + str(id), 'error', 'update_err')
                break
            ips = utils.getProxy()
            print("换代理")
            obj.proxies["http"] = ips[0]
            obj.proxies["https"] = ips[1]
            print(obj.proxies)
            time.sleep(1)
            try:
                r.close()
                print u"使用代理"
                r = requests.get(href, headers=obj.headers, proxies=obj.proxies)
            except requests.exceptions.RequestException:
                ips = utils.getProxy()
                print("换代理")
                obj.proxies["http"] = ips[0]
                obj.proxies["https"] = ips[1]
                print(obj.proxies)
Example #8
    def getInfo(self, code, page):
        href = 'http://www.hbfy.org/gfcms/templetPro.do?templetPath=overtbegin/overtbeginPage.html'
        para = {"page": page, "currChannelid": "5913d1c6-a73b-4cec-923c-c63376a05752", "currUnitId": code,
                "siteid": "ce0b9496-6b88-4f66-8da7-ede1a989fd6e", "pageNum": page}
        s = requests.session()
        s.keep_alive = False
        print("请求:  " + self.codeName[code] + "   page" + str(page))
        while True:
            try:
                r = requests.post(href, data=para, proxies=self.proxies)
            except requests.exceptions.RequestException:
                s = requests.session()
                s.keep_alive = False
                ips = utils.getProxy()
                print("请求错误1:" + "换代理")
                print(self.proxies)
                time.sleep(2)
                self.proxies["http"] = ips[0]
                self.proxies["https"] = ips[1]
            else:
                break

        while "您的请求过于频繁" in r.content:
            ips = utils.getProxy()
            print("换代理")
            print(self.proxies)
            self.proxies["http"] = ips[0]
            self.proxies["https"] = ips[1]
            time.sleep(3)
            r = requests.post(href, data=para, proxies=self.proxies)

        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.find("table", attrs={"class": "zebra"})
        if table:
            trs = table.find_all("tr")
        else:
            trs = []

        beigao_pattern = re.compile('((被告)|(被申请(执行)?(再审)?)|(被上诉))(人)?:(.*?)(;|$)', re.S)
        yuangao_patten = re.compile('((原告)|(申请(执行)?(再审)?)|(上诉))(人)?:(.*?)(;|$)', re.S)
        qita_patten = re.compile('(第三)(人|方):(.*?)(;|$)', re.S)
        if len(trs) > 1:
            for i in range(1, len(trs)):
                data = copy.deepcopy(self.data)
                data.created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                tds = trs[i].find_all("td")
                a_pattern = re.compile('<a href="(.*?)".*?>.*?</a>', re.S)
                a_result = re.search(a_pattern, str(tds[0]))
                yuanwen_result = self.getYuanwen(a_result.group(1))

                data.gonggao = str(yuanwen_result[0])
                data.chengban = str(yuanwen_result[1])
                data.kaitingriqi = tds[1].string.replace('年', '-').replace('月', '-').replace("日", "")
                data.dangshiren = str(tds[2].string.strip())
                data.anyou = str(tds[3].string)
                data.fayuan = self.codeName[code]
                yuangao_result = re.findall(yuangao_patten, str(data.dangshiren))
                beigao_result = re.findall(beigao_pattern, str(data.dangshiren))
                qita_result = re.findall(qita_patten, str(data.dangshiren))
                try:
                    data.yuangao = str(yuangao_result[0][7])
                except IndexError:
                    print("no plaintiff found")
                try:
                    data.qita = str(qita_result[0][2])
                except IndexError:
                    print("no third party found")
                try:
                    data.beigao = str(beigao_result[0][7])
                except IndexError:
                    print("no defendant found")
                data.gonggao_id = str(uuid.uuid3(uuid.NAMESPACE_OID, str(data.dangshiren + "_" + data.kaitingriqi)))
                self.sqltemp.insertKtgg(data)
        else:
            self.return_time -= 1
            if self.return_time >= 0:
                print "10秒后再请求一次"
                time.sleep(10)
                self.getInfo(code, page)
            else:
                self.isNextFayuan = True
                self.page = 1
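
Every crawler example above hand-rolls the same pattern: on a request error, or when the site returns its "您的请求过于频繁" ("requests too frequent") rate-limit page, swap self.proxies to a fresh proxy and re-issue the request. As a purely illustrative refactoring, not part of the original code, that pattern could be pulled into one helper; utils.getProxy() is assumed to behave as in the sketch after Example #1:

import time
import requests
import utils  # assumed project helper providing getProxy()

RATE_LIMIT_TEXT = "您的请求过于频繁"  # the site's "requests too frequent" message

def fetch_with_proxy_rotation(url, proxies, headers=None, retries=5, pause=2):
    """Hypothetical helper: GET url, rotating the proxies dict in place on failure."""
    for _ in range(retries):
        try:
            r = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        except requests.exceptions.RequestException:
            r = None
        if r is not None and RATE_LIMIT_TEXT not in r.text:
            return r
        # Blocked or errored: rotate to a fresh proxy and wait before retrying.
        ips = utils.getProxy()
        proxies["http"], proxies["https"] = ips[0], ips[1]
        time.sleep(pause)
    raise RuntimeError("still blocked after %d retries: %s" % (retries, url))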