def parse(self, response):
    log(f'{self.name} scraped proxies successfully', 'DEBUG')
    proxies_list = []
    # Skip the table header row.
    proxy = response.xpath('//tr')[1:]
    # The last pagination link points to the next page of results.
    urls = response.xpath('//ul[@class="pagination"]/li/a/@href').getall()[-1]
    url = response.urljoin(urls)
    for i in proxy:
        # The last <a> in the row holds the anonymity level; '高匿' means high anonymity.
        http = i.xpath('./td/a/text()').getall()[-1]
        if '高匿' in http:
            ip = i.xpath('./td/a/text()').get()
            host = i.xpath('./td/text()').get()
            proxies_list.append(ip + ':' + host)
    # Tag each proxy with the spider name so the validator knows its source.
    proxies_list = [[self.name, i] for i in proxies_list]
    with ThreadPoolExecutor(max_workers=THREADPOOL) as t:
        for i in proxies_list:
            t.submit(parse_pool, i)
    if urls:
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
    else:
        yield scrapy.Request(url='http://ip.ihuan.me/', callback=self.parse, dont_filter=True)
def process_exception(self, request, exception, spider):
    # The ihuan spider fetches the proxy lists directly, so just retry its requests.
    if 'ihuan' == spider.name:
        return request
    try:
        # Strip the scheme and drop the failing proxy from this spider's Redis set.
        p = re.split('//', request.meta['proxy'])[1]
        r.srem(spider.name, p)
    except Exception:
        # The request may not have had a proxy attached.
        pass
    log(f'Request failed, error: {exception}', False)
    log(f'Request failed, URL: {request.url}', False)
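# process_exception() above evicts a dead proxy from the spider's Redis set; the hook
# that attaches a proxy in the first place is not shown in this section. A minimal
# sketch, assuming proxies are stored as 'ip:port' members of a set named after the
# spider and that `r` is the same redis client used above:
def process_request(self, request, spider):
    proxy = r.srandmember(spider.name)      # pick a random live proxy, or None
    if proxy:
        if isinstance(proxy, bytes):
            proxy = proxy.decode()
        # Scrapy's HttpProxyMiddleware reads the proxy URL from request.meta.
        request.meta['proxy'] = 'http://' + proxy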
def parse(self, response):
    log(f'{self.name} scraped proxies successfully', 'DEBUG')
    proxies_list = []
    proxy = response.xpath('//tbody/tr')
    for i in proxy:
        # The third cell holds the anonymity level; '高匿' means high anonymity.
        http = i.xpath('./td/text()')[2].get()
        if '高匿' in http:
            # The first cell holds the proxy address.
            proxies = i.xpath('./td/text()')[0].get()
            proxies_list.append(proxies)
    # Tag each proxy with the spider name so the validator knows its source.
    proxies_list = [[self.name, i] for i in proxies_list]
    with ThreadPoolExecutor(max_workers=THREADPOOL) as t:
        for i in proxies_list:
            t.submit(parse_pool, i)
def parse(self, response):
    log(f'{self.name} scraped proxies successfully', 'DEBUG')
    proxies_list = []
    # Skip the first five header/notice rows.
    proxy = response.xpath('//tr')[5:]
    for i in proxy:
        ip = i.xpath('./td/text()').get()
        host = i.xpath('./td/text()')[1].get()
        proxies_list.append(ip + ':' + host)
    # Tag each proxy with the spider name so the validator knows its source.
    proxies_list = [[self.name, i] for i in proxies_list]
    with ThreadPoolExecutor(max_workers=THREADPOOL) as t:
        for i in proxies_list:
            t.submit(parse_pool, i)
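# The spider parse() callbacks above all finish with the same fan-out step. A small
# helper that factors it out could look like this (sketch only; submit_proxies is
# not a name taken from the repo):
def submit_proxies(name, proxies_list):
    """Validate each scraped proxy in a worker thread, tagged with its source spider."""
    with ThreadPoolExecutor(max_workers=THREADPOOL) as t:
        for p in proxies_list:
            t.submit(parse_pool, [name, p])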
def process_response(self, request, response, spider):
    if response.status == 200:
        log(f'Request succeeded, URL: {response.url}', 'DEBUG')
    # Scrapy requires process_response to return a Response (or Request) for every
    # status, so pass the response through unconditionally.
    return response
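# The two downloader-middleware hooks above (process_exception and process_response)
# only take effect once the middleware is enabled in settings.py. A minimal sketch;
# the module path and class name below are assumptions for illustration:
DOWNLOADER_MIDDLEWARES = {
    'proxy_pool.middlewares.ProxyMiddleware': 543,  # priority relative to built-ins
}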
def parse_pool(proxy):
    if PROXIES_MOD == 'HTTPS':
        proxies = {'https': 'https://' + proxy}
        error = 0
        while True:
            try:
                response = requests.get(url=VERIFICATION_URL, headers=VERIFICATION_HEADERS,
                                        proxies=proxies, timeout=2)
                if response.status_code == 200:
                    log(f'Usable IP: {proxy}, retried {error} times')
                    r.sadd('https', proxy)
                    break
                # Count a non-200 reply as a failed attempt so the loop cannot spin forever.
                error += 1
            except Exception:
                error += 1
            if error > 3:
                log(f'Deleting IP: {proxy}', False)
                requests.get(url=f'http://127.0.0.1:5555/deletes?delete={proxy}')
                break
            log(f'Retrying IP: {proxy}, attempt {error}', 'DEBUG')
    if PROXIES_MOD == 'HTTP':
        proxies = {'http': 'http://' + proxy}
        error = 0
        while True:
            try:
                response = requests.get(url=VERIFICATION_URL, headers=VERIFICATION_HEADERS,
                                        proxies=proxies, timeout=2)
                if response.status_code == 200:
                    log(f'Usable IP: {proxy}, retried {error} times')
                    r.sadd('http', proxy)
                    break
                error += 1
            except Exception:
                error += 1
            if error > 3:
                log(f'Deleting IP: {proxy}', False)
                requests.get(url=f'http://127.0.0.1:5555/delete?delete={proxy}')
                break
            log(f'Retrying IP: {proxy}, attempt {error}', 'DEBUG')
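# This variant of parse_pool() reports dead proxies to a local HTTP service on port
# 5555, which is not shown here. A purely illustrative sketch of such an endpoint
# using Flask (the framework, route name, and Redis wiring are all assumptions):
import redis
from flask import Flask, request

app = Flask(__name__)
r = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)

@app.route('/delete')
def delete_proxy():
    proxy = request.args.get('delete', '')
    if proxy:
        r.srem('http', proxy)    # remove the dead proxy from the verified sets
        r.srem('https', proxy)
    return 'ok'

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5555)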
def parse_pool(proxy):
    # proxy is a [spider_name, 'ip:port'] pair produced by the spiders above.
    if PROXIES_MOD == 'HTTPS':
        proxies = {'https': 'https://' + proxy[1]}
        error = 0
        while True:
            try:
                response = requests.get(url=VERIFICATION_URL, headers=VERIFICATION_HEADERS,
                                        proxies=proxies, timeout=DOWNLOAD_TIMEOUT)
                if response.status_code == 200:
                    log(f'Usable IP: {proxy[1]}, retried {error} times, source: {proxy[0]}')
                    r.sadd('https', proxy[1])
                    if isinstance(REDIS_TIMEOUT, int):
                        # Give the set a TTL so stale proxies age out.
                        r.expire('https', REDIS_TIMEOUT)
                    break
                # Count a non-200 reply as a failed attempt so the loop cannot spin forever.
                error += 1
            except Exception:
                error += 1
            if error > 3:
                log(f'Invalid IP: {proxy[1]}', False)
                break
            log(f'Retrying IP: {proxy[1]}, attempt {error}', 'DEBUG')
    if PROXIES_MOD == 'HTTP':
        proxies = {'http': 'http://' + proxy[1]}
        error = 0
        while True:
            try:
                response = requests.get(url=VERIFICATION_URL, headers=VERIFICATION_HEADERS,
                                        proxies=proxies, timeout=DOWNLOAD_TIMEOUT)
                if response.status_code == 200:
                    log(f'Usable IP: {proxy[1]}, retried {error} times, source: {proxy[0]}')
                    r.sadd('http', proxy[1])
                    if isinstance(REDIS_TIMEOUT, int):
                        r.expire('http', REDIS_TIMEOUT)
                    break
                error += 1
            except Exception:
                error += 1
            if error > 3:
                log(f'Invalid IP: {proxy[1]}', False)
                break
            log(f'Retrying IP: {proxy[1]}, attempt {error}', 'DEBUG')
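# parse_pool() publishes verified proxies into the Redis sets 'http' and 'https'. A
# minimal sketch of a consumer pulling one back out, assuming a redis client created
# with decode_responses=True:
import redis

r = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)

def random_proxy(scheme='https'):
    """Return one verified proxy as 'ip:port', or None if the set is empty."""
    return r.srandmember(scheme)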