Example #1
 def parse(self, response):
     log(f'{self.name} fetched proxies successfully', 'DEBUG')
     proxies_list = []
     rows = response.xpath('//tr')[1:]  # skip the table header row
     # Collect pagination links; the last one points to the next page.
     hrefs = response.xpath(
         '//ul[@class="pagination"]/li/a/@href').getall()
     for row in rows:
         anonymity = row.xpath('./td/a/text()').getall()[-1]
         if '高匿' in anonymity:  # '高匿' = "high anonymity"
             ip = row.xpath('./td/a/text()').get()
             port = row.xpath('./td/text()').get()
             proxies_list.append(f'{ip}:{port}')
     proxies_list = [[self.name, p] for p in proxies_list]
     # Verify every candidate proxy concurrently.
     with ThreadPoolExecutor(max_workers=THREADPOOL) as t:
         for item in proxies_list:
             t.submit(parse_pool, item)
     if hrefs:
         yield scrapy.Request(url=response.urljoin(hrefs[-1]),
                              callback=self.parse,
                              dont_filter=True)
     else:
         # No pagination found: restart from the site root.
         yield scrapy.Request(url='http://ip.ihuan.me/',
                              callback=self.parse,
                              dont_filter=True)
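
For context, here is a minimal sketch of the spider class a parse() method like this would sit in; the class name, start URL, and the helper import path (log, parse_pool, THREADPOOL) are assumptions inferred from the snippets, not the project's confirmed layout:

from concurrent.futures import ThreadPoolExecutor

import scrapy

# Hypothetical project helpers referenced throughout these examples.
from spidery.utils import log, parse_pool, THREADPOOL


class IhuanSpider(scrapy.Spider):
    name = 'ihuan'
    start_urls = ['http://ip.ihuan.me/']

    def parse(self, response):
        # Body as in Example #1 above.
        ...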
Example #2
 def process_exception(self, request, exception, spider):
     # The ihuan spider supplies the proxies itself, so simply retry it.
     if spider.name == 'ihuan':
         return request
     try:
         # Strip the scheme prefix and remove the dead proxy
         # from this spider's Redis set.
         p = request.meta['proxy'].split('//')[1]
         r.srem(spider.name, p)
     except Exception:
         pass
     log(f'Request failed, exception: {exception}', False)
     log(f'Request failed, URL: {request.url}', False)
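
This hook is a Scrapy downloader-middleware method, so the middleware has to be enabled in the project settings before Scrapy will call it. A minimal sketch; the module path, class name, and priority are assumptions, not the project's confirmed settings:

DOWNLOADER_MIDDLEWARES = {
    # Hypothetical path to the middleware class holding process_exception.
    'spidery.middlewares.ProxyMiddleware': 543,
}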
Example #3
 def parse(self, response):
     log(f'{self.name} fetched proxies successfully', 'DEBUG')
     proxies_list = []
     rows = response.xpath('//tbody/tr')
     for row in rows:
         # The third cell holds the anonymity level.
         anonymity = row.xpath('./td/text()')[2].get()
         if '高匿' in anonymity:  # '高匿' = "high anonymity"
             proxies_list.append(row.xpath('./td/text()')[0].get())
     proxies_list = [[self.name, p] for p in proxies_list]
     with ThreadPoolExecutor(max_workers=THREADPOOL) as t:
         for item in proxies_list:
             t.submit(parse_pool, item)
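
The XPath logic above can be checked standalone with parsel, the selector library Scrapy itself uses; the sample HTML below is invented for illustration:

from parsel import Selector

html = """
<table><tbody>
  <tr><td>1.2.3.4:8080</td><td>China</td><td>高匿</td></tr>
  <tr><td>5.6.7.8:3128</td><td>China</td><td>透明</td></tr>
</tbody></table>
"""
sel = Selector(text=html)
for row in sel.xpath('//tbody/tr'):
    anonymity = row.xpath('./td/text()')[2].get()
    if '高匿' in anonymity:                       # keep only high-anonymity rows
        print(row.xpath('./td/text()')[0].get())  # -> 1.2.3.4:8080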
Example #4
File: jisu.py Project: yuzhiyizhan/spidery
 def parse(self, response):
     log(f'{self.name} fetched proxies successfully', 'DEBUG')
     proxies_list = []
     rows = response.xpath('//tr')[5:]  # skip the header/intro rows
     for row in rows:
         ip = row.xpath('./td/text()').get()
         port = row.xpath('./td/text()')[1].get()
         proxies_list.append(f'{ip}:{port}')
     proxies_list = [[self.name, p] for p in proxies_list]
     with ThreadPoolExecutor(max_workers=THREADPOOL) as t:
         for item in proxies_list:
             t.submit(parse_pool, item)
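
The submit-in-a-loop pattern repeated across these parse() methods can also be written with executor.map; behavior is equivalent (the with-block still waits for all checks to finish), so this is purely a style choice. parse_pool, proxies_list, and THREADPOOL are taken from the snippets above:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=THREADPOOL) as t:
    t.map(parse_pool, proxies_list)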
Example #5
 def process_response(self, request, response, spider):
     if response.status == 200:
         log(f'Request succeeded, URL: {response.url}', 'DEBUG')
     # Scrapy requires this hook to return a Response or a Request;
     # falling through with None raises an exception, so always
     # hand the response back.
     return response
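
If non-200 responses should instead be recycled through another proxy, a variant could return the request itself; dont_filter keeps Scrapy's dupe filter from dropping the retry. A sketch, not the project's confirmed behavior:

 def process_response(self, request, response, spider):
     if response.status == 200:
         log(f'Request succeeded, URL: {response.url}', 'DEBUG')
         return response
     # Re-queue the request so the proxy middleware picks a fresh proxy.
     request.dont_filter = True
     return request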
Example #6
def parse_pool(proxy):
    if PROXIES_MOD == 'HTTPS':
        proxies = {'https': 'https://' + proxy}
        error = 0
        while True:
            try:
                response = requests.get(url=VERIFICATION_URL,
                                        headers=VERIFICATION_HEADERS,
                                        proxies=proxies,
                                        timeout=2)
                if response.status_code == 200:
                    log(f'Usable IP: {proxy}, retried {error} times')
                    r.sadd('https', proxy)
                    break
            except Exception:
                # Failure path: count the miss, give up after three retries.
                error += 1
                if error > 3:
                    log(f'Deleting IP: {proxy}', False)
                    requests.get(
                        url=f'http://127.0.0.1:5555/deletes?delete={proxy}')
                    break
                log(f'Retry {error} for IP: {proxy}', 'DEBUG')
    if PROXIES_MOD == 'HTTP':
        proxies = {'http': 'http://' + proxy}
        error = 0
        while True:
            try:
                response = requests.get(url=VERIFICATION_URL,
                                        headers=VERIFICATION_HEADERS,
                                        proxies=proxies,
                                        timeout=2)
                if response.status_code == 200:
                    log(f'Usable IP: {proxy}, retried {error} times')
                    r.sadd('http', proxy)
                    break
            except Exception:
                # Failure path: count the miss, give up after three retries.
                error += 1
                if error > 3:
                    log(f'Deleting IP: {proxy}', False)
                    requests.get(
                        url=f'http://127.0.0.1:5555/delete?delete={proxy}')
                    break
                log(f'Retry {error} for IP: {proxy}', 'DEBUG')
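
To exercise parse_pool by hand outside the spiders, the module-level names it relies on can be stubbed in. Everything below except the requests and redis-py APIs is an assumption carried over from the snippet:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)
PROXIES_MOD = 'HTTP'                        # assumed config values
VERIFICATION_URL = 'http://httpbin.org/ip'
VERIFICATION_HEADERS = {'User-Agent': 'Mozilla/5.0'}

parse_pool('1.2.3.4:8080')   # verifies the proxy; live ones land in the 'http' set
print(r.smembers('http'))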
Example #7
def parse_pool(proxy):
    # proxy is a [spider_name, 'ip:port'] pair.
    if PROXIES_MOD == 'HTTPS':
        proxies = {'https': 'https://' + proxy[1]}
        error = 0
        while True:
            try:
                response = requests.get(url=VERIFICATION_URL,
                                        headers=VERIFICATION_HEADERS,
                                        proxies=proxies,
                                        timeout=DOWNLOAD_TIMEOUT)
                if response.status_code == 200:
                    log(f'Usable IP: {proxy[1]}, retried {error} times, '
                        f'source: {proxy[0]}')
                    r.sadd('https', proxy[1])
                    if isinstance(REDIS_TIMEOUT, int):
                        # Expire the set so stale proxies age out.
                        r.expire('https', REDIS_TIMEOUT)
                    break
            except Exception:
                error += 1
                if error > 3:
                    log(f'Invalid IP: {proxy[1]}', False)
                    break
                log(f'Retry {error} for IP: {proxy[1]}', 'DEBUG')
    if PROXIES_MOD == 'HTTP':
        proxies = {'http': 'http://' + proxy[1]}
        error = 0
        while True:
            try:
                response = requests.get(url=VERIFICATION_URL,
                                        headers=VERIFICATION_HEADERS,
                                        proxies=proxies,
                                        timeout=DOWNLOAD_TIMEOUT)
                if response.status_code == 200:
                    log(f'Usable IP: {proxy[1]}, retried {error} times, '
                        f'source: {proxy[0]}')
                    r.sadd('http', proxy[1])
                    if isinstance(REDIS_TIMEOUT, int):
                        r.expire('http', REDIS_TIMEOUT)
                    break
            except Exception:
                error += 1
                if error > 3:
                    log(f'Invalid IP: {proxy[1]}', False)
                    break
                log(f'Retry {error} for IP: {proxy[1]}', 'DEBUG')
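
Once parse_pool has populated the Redis sets, a consumer can pull a random live proxy back out with srandmember (standard redis-py). A minimal sketch, assuming the same client r created with decode_responses=True:

proxy = r.srandmember('https')   # e.g. '1.2.3.4:8080', or None if the set is empty
if proxy:
    proxies = {'https': f'https://{proxy}'}
    # requests.get(url, proxies=proxies, timeout=5)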