class Validate:
    def __init__(self):
        self.header = {}
        self.urls = []
        self.timeout = 4  # request timeout in seconds
        self.tables = [FILTERED_PROXY, FREE_IP_PROXY]

    def start_requests(self):
        for table in self.tables:
            self.proxy = Mongo(table)
            proxys = self.proxy.select_proxy()
            for proxy in proxys:
                for url in self.urls:
                    logging.info("Validate: URL:%s Proxy:%s" % (url, proxy))
                    proxies = {'http': '%s:%s' % (proxy.get("ip"), proxy.get("port"))}
                    try:
                        requests.get(url, proxies=proxies, timeout=self.timeout)
                        self.success_parse(proxy)
                    except requests.RequestException:
                        self.error_parse(proxy)

    def success_parse(self, proxy):
        logging.info("Qualified: %s" % proxy)

    def error_parse(self, proxy):
        # the request failed or timed out, so drop this IP
        self.proxy.delete_with_ip(proxy.get("ip"))
        logging.info("Unqualified: %s is too slow, will be deleted" % proxy)

    def close(self):
        # move the validated IPs into the final proxy table
        pass_proxys = Mongo(FREE_IP_PROXY).select_proxy()
        for pass_proxy in pass_proxys:
            logging.info("Join: %s will join filtered_collection" % pass_proxy)
            Mongo(FILTERED_PROXY).insert_proxy(pass_proxy)
class Validate(Spider):
    name = 'base'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.header = {}
        self.urls = []
        self.timeout = 7  # download timeout in seconds
        self.proxy = None
        self.tables = [FILTERED_PROXY, FREE_IP_PROXY]

    def start_requests(self):
        for table in self.tables:
            self.proxy = Mongo(table)
            proxys = self.proxy.select_proxy()
            for proxy in proxys:
                for url in self.urls:
                    logging.info("Validate: URL:%s Proxy:%s" % (url, proxy))
                    yield Request(url=url,
                                  meta={
                                      'proxy': 'http://%s:%s' % (proxy.get("ip"), proxy.get("port")),
                                      'download_timeout': self.timeout,
                                      'proxy_info': proxy,      # read back in the callbacks
                                      'cur_time': time.time(),  # used to measure response speed
                                  },
                                  priority=0,
                                  headers=self.header,
                                  dont_filter=True,
                                  callback=self.success_parse,
                                  errback=self.error_parse)

    def success_parse(self, response):
        # the response arrived; keep the proxy only if it was fast enough
        proxy = response.meta.get('proxy_info')
        speed = time.time() - response.meta.get('cur_time')
        if speed >= self.timeout:
            # too slow, delete this IP
            logging.info("Unqualified: %s is too slow, will be deleted" % proxy)
            self.proxy.delete_with_ip(proxy.get("ip", ""))
        else:
            logging.info("Qualified: %s" % proxy)

    def error_parse(self, failure):
        # the request errored or timed out, so drop this IP
        request = failure.request
        proxy = request.meta.get('proxy_info')
        self.proxy.delete_with_ip(proxy.get("ip", ""))
        logging.info("Unqualified: %s errored, will be deleted" % proxy)

    def close(spider, reason):
        # move the validated free IPs into the final proxy pool
        pass
def start_requests(self):
    for table in self.tables:
        self.proxy = Mongo(table)
        proxys = self.proxy.select_proxy()
        for proxy in proxys:
            for url in self.urls:
                logging.info("Validate: URL:%s Proxy:%s" % (url, proxy))
                proxies = {'http': '%s:%s' % (proxy.get("ip"), proxy.get("port"))}
                try:
                    requests.get(url, proxies=proxies, timeout=self.timeout)
                    self.success_parse(proxy)
                except requests.RequestException:
                    self.error_parse(proxy)
def start_requests(self):
    for table in self.tables:
        self.proxy = Mongo(table)
        proxys = self.proxy.select_proxy()
        for proxy in proxys:
            for url in self.urls:
                logging.info("Validate: URL:%s Proxy:%s" % (url, proxy))
                yield Request(url=url,
                              meta={
                                  'proxy': 'http://%s:%s' % (proxy.get("ip"), proxy.get("port")),
                                  'download_timeout': self.timeout,
                                  'proxy_info': proxy,      # read back in the callbacks
                                  'cur_time': time.time(),  # used to measure response speed
                              },
                              priority=0,
                              headers=self.header,
                              dont_filter=True,
                              callback=self.success_parse,
                              errback=self.error_parse)
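The base class leaves `urls` and `header` empty; a concrete validator such as the `HttpBin` class referenced in the entry script below only has to fill them in. A minimal sketch, assuming the Scrapy-based `Validate` above; the check URL and User-Agent are illustrative, not taken from the project:

# Minimal sketch of a concrete validator; the check URL and User-Agent
# here are illustrative placeholders.
class HttpBin(Validate):
    name = 'httpbin'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.urls = ['http://httpbin.org/ip']        # any stable, fast endpoint works
        self.header = {'User-Agent': 'Mozilla/5.0'}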
def insert_data(self):
    proxy = Mongo(FREE_IP_PROXY)
    proxy.insert_proxy(dict(self))
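The `dict(self)` call implies `insert_data` lives on a dict-like item class, such as a Scrapy `Item` holding the crawled fields. A hypothetical sketch; the real field set may differ:

# Hypothetical item class that insert_data could live on; the ip/port
# field names mirror the keys read by the validators above.
import scrapy

class IPItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()

# dict(self) in insert_data would then yield e.g. {'ip': '1.2.3.4', 'port': '8080'}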
def get_random_ip(self):
    proxy = Mongo(FILTERED_PROXY).obtain_fasted_proxy()
    return proxy
def get_ips(num=1):
    proxy = Mongo(FILTERED_PROXY)
    proxys = proxy.obtain_proxy(num)
    return json.dumps(proxys)
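For illustration, here is how `get_ips` might be called; the decoded shape is an assumption based on the `ip`/`port` keys used throughout:

# Usage sketch: get_ips returns a JSON string of up to `num` validated proxies.
payload = get_ips(num=3)
for p in json.loads(payload):  # assumed shape: [{'ip': '1.2.3.4', 'port': '8080'}, ...]
    print('%s:%s' % (p.get('ip'), p.get('port')))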
def close(self):
    # move the validated IPs into the final proxy table
    pass_proxys = Mongo(FREE_IP_PROXY).select_proxy()
    for pass_proxy in pass_proxys:
        logging.info("Join: %s will join filtered_collection" % pass_proxy)
        Mongo(FILTERED_PROXY).insert_proxy(pass_proxy)
def init_log():
    # create the log directory if it does not already exist
    if not os.path.exists(LOG_PATH):
        os.mkdir(LOG_PATH)
    logging.basicConfig(filename=LOG_FILENAME, format=LOG_FORMAT, level=LOG_LEVEL)


if __name__ == "__main__":
    # initialise logging
    init_log()
    logging.info("Begin proxy crawl")
    spiders = [IP181, SixIp, KuaiDaiLi, XiCiDaiLi]
    validates = [Baidu, HttpBin]
    while True:
        # start from an empty free-IP table on every round
        ipproxy = Mongo(FREE_IP_PROXY)
        ipproxy.drop_collections()
        # crawl the free-proxy sites
        for spider in spiders:
            scrapydo.run_spider(spider)
        # validate what was crawled
        for validate in validates:
            va = validate()
            va.start_requests()
            va.close()
        # repeat every two hours
        time.sleep(7200)
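The `Mongo` wrapper itself is not shown in this section. A minimal sketch consistent with the calls made above (`select_proxy`, `insert_proxy`, `delete_with_ip`, `obtain_proxy`, `obtain_fasted_proxy`, `drop_collections`), assuming pymongo, a local MongoDB, and a database named `proxy`; the database name and ordering logic are assumptions:

# Minimal sketch of the Mongo wrapper used above; pymongo and a local
# MongoDB are assumed, and only the methods this section calls are stubbed.
import pymongo

class Mongo:
    def __init__(self, table):
        client = pymongo.MongoClient('mongodb://localhost:27017/')
        self.collection = client['proxy'][table]

    def select_proxy(self):
        # all stored proxy documents, minus Mongo's internal _id
        return list(self.collection.find({}, {'_id': 0}))

    def insert_proxy(self, proxy):
        self.collection.insert_one(dict(proxy))

    def delete_with_ip(self, ip):
        self.collection.delete_many({'ip': ip})

    def obtain_proxy(self, num):
        return list(self.collection.find({}, {'_id': 0}).limit(num))

    def obtain_fasted_proxy(self):
        # a real "fastest" ordering would need a stored speed field;
        # this sketch just returns the first match
        return self.collection.find_one({}, {'_id': 0})

    def drop_collections(self):
        self.collection.drop()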