class Validate:
    """Validate stored proxies by issuing plain HTTP requests through them.

    Iterates every proxy in every configured Mongo table against every
    validation URL; proxies that fail the request are deleted from their
    table, and close() promotes surviving free-IP proxies into the
    filtered table.
    """

    def __init__(self):
        self.header = {}          # extra request headers (currently unused)
        self.urls = []            # validation target URLs
        self.timeout = 4          # per-request timeout in seconds
        self.proxy = None         # Mongo handle for the table being validated
        self.tables = [FILTERED_PROXY, FREE_IP_PROXY]

    def start_requests(self):
        """Check every proxy in every table against every validation URL."""
        for table in self.tables:
            self.proxy = Mongo(table)
            proxys = self.proxy.select_proxy()
            for proxy in proxys:
                for url in self.urls:
                    # BUG FIX: proxy is a dict (read via .get below); the
                    # original concatenated it to a str, raising TypeError.
                    # Lazy %-style logging args handle any type safely.
                    logging.info("Validate: URL:%s Proxy:%s", url, proxy)
                    proxies = {'http': '%s:%s' % (proxy.get("ip"), proxy.get("port"))}
                    try:
                        # Keep the try body minimal: only the request can
                        # legitimately fail here.
                        requests.get(url, proxies=proxies, timeout=self.timeout)
                    except requests.RequestException:
                        # BUG FIX: the original bare `except:` also swallowed
                        # KeyboardInterrupt/SystemExit and any exception from
                        # success_parse itself.
                        self.error_parse(proxy)
                    else:
                        self.success_parse(proxy)

    def success_parse(self, proxy):
        """Log a proxy that responded within the timeout."""
        logging.info("Qualified:%s", proxy)

    def error_parse(self, proxy):
        """Delete a proxy whose request failed or timed out."""
        self.proxy.delete_with_ip(proxy.get("ip"))
        logging.info("Unqualified:%s it is so slowly,will deleted", proxy)

    def close(self):
        """Promote every surviving free-IP proxy into the filtered table."""
        pass_proxys = Mongo(FREE_IP_PROXY).select_proxy()
        for pass_proxy in pass_proxys:
            logging.info("Join:%s will join filtered_collection", pass_proxy)
            Mongo(FILTERED_PROXY).insert_proxy(pass_proxy)
class Validate(Spider):
    """Scrapy spider that validates stored proxies by fetching URLs through them.

    Each proxy in each configured Mongo table is used to request every
    validation URL. Slow proxies (response time >= timeout) and erroring
    proxies are deleted from their table.
    """

    name = 'base'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.header = {}          # extra request headers (currently unused)
        self.urls = []            # validation target URLs
        self.timeout = 7          # acceptable response time in seconds
        self.proxy = None         # Mongo handle for the table being validated
        self.tables = [FILTERED_PROXY, FREE_IP_PROXY]

    def start_requests(self):
        """Yield one Request per (proxy, url) pair, routed through the proxy."""
        for table in self.tables:
            self.proxy = Mongo(table)
            proxys = self.proxy.select_proxy()
            for proxy in proxys:
                for url in self.urls:
                    # Lazy %-args: proxy is a dict and str-concatenation
                    # to it raised TypeError in the original.
                    logging.info("Validate: URL:%s Proxy:%s", url, proxy)
                    yield Request(
                        url=url,
                        meta={
                            'proxy': 'http://%s:%s' % (proxy.get("ip"), proxy.get("port")),
                            'download_timeout': self.timeout,
                            # BUG FIX: the callbacks read 'proxy_info' and
                            # 'cur_time' from meta, but the original never
                            # set them — both callbacks crashed on None.
                            'proxy_info': proxy,
                            'cur_time': time.time(),
                        },
                        priority=0,
                        headers=self.header,
                        dont_filter=True,
                        callback=self.success_parse,
                        errback=self.error_parse,
                    )

    def success_parse(self, response):
        """Response arrived: keep the proxy if fast enough, else delete it."""
        proxy = response.meta.get('proxy_info')
        speed = time.time() - response.meta.get('cur_time')
        if speed >= self.timeout:
            # Too slow — remove this ip from the current table.
            logging.info("Unqualified:%s it is so slowly,will deleted", proxy)
            self.proxy.delete_with_ip(proxy.get("ip", ""))
        else:
            logging.info("Qualified:%s", proxy)

    def error_parse(self, failure):
        """Request failed (timeout/connection error): delete the proxy."""
        request = failure.request
        proxy = request.meta.get('proxy_info')
        # BUG FIX: proxy is a dict; the original's attribute assignment
        # (proxy.https = ...) raised AttributeError. Use item assignment.
        proxy['https'] = request.meta.get('https')
        self.proxy.delete_with_ip(proxy.get("ip", ""))
        logging.info("Unqualified:%s it is error,will deleted", proxy)

    def close(spider, reason):
        # Scrapy's Spider.close hook — Scrapy's own signature omits `self`,
        # so it is kept as-is for compatibility.
        # TODO: promote surviving free-IP proxies into the filtered pool
        # (per the original comment), mirroring the requests-based validator.
        pass