Example 1

A synchronous validator: it loads the stored proxies from MongoDB, sends a test request through each one with requests, keeps proxies that respond, and deletes those that fail.

import logging

import requests

# Mongo, FILTERED_PROXY and FREE_IP_PROXY are project-level helpers/constants;
# their import paths depend on the surrounding project (see the sketch after
# this example).


class Validate:
    def __init__(self):
        self.header = {}
        self.urls = []
        self.timeout = 4  # request timeout in seconds
        self.tables = [FILTERED_PROXY, FREE_IP_PROXY]

    def start_requests(self):
        for table in self.tables:
            self.proxy = Mongo(table)
            proxys = self.proxy.select_proxy()
            for proxy in proxys:
                for url in self.urls:
                    logging.info("Validate:" + "URL:" + url + "Proxy:" + proxy)
                    proxies = {'http': '%s:%s' % (proxy.get("ip"), proxy.get("port"))}
                    try:
                        requests.get(url, proxies=proxies, timeout=self.timeout)
                        self.success_parse(proxy)
                    except requests.RequestException:
                        self.error_parse(proxy)

    def success_parse(self, proxy):
        logging.info("Qualified:" + proxy)

    def error_parse(self, proxy):
        # the request failed: remove this proxy from the current table
        self.proxy.delete_with_ip(proxy.get("ip"))
        logging.info("Unqualified: %s failed or timed out, will be deleted", proxy)

    def close(self):
        # move the crawled IPs into the final IP table
        pass_proxys = Mongo(FREE_IP_PROXY).select_proxy()
        for pass_proxy in pass_proxys:
            logging.info("Join:" + pass_proxy + "will join filtered_collection")
            Mongo(FILTERED_PROXY).insert_proxy(pass_proxy)
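
Neither example shows the Mongo wrapper or the FILTERED_PROXY / FREE_IP_PROXY constants they depend on. The sketch below is one plausible shape for them, assuming pymongo, a local MongoDB instance, and proxies stored as documents with ip and port fields; the database and collection names are assumptions, and only the three method names come from the examples themselves.

from pymongo import MongoClient

FILTERED_PROXY = 'filtered_proxy'  # assumed collection names
FREE_IP_PROXY = 'free_ip_proxy'


class Mongo:
    """Thin wrapper around one MongoDB collection of proxy documents (assumed)."""

    def __init__(self, table):
        client = MongoClient('mongodb://localhost:27017')  # assumed local instance
        self.collection = client['proxy_pool'][table]      # assumed database name

    def select_proxy(self):
        # all stored proxies as plain dicts, without the MongoDB _id field
        return list(self.collection.find({}, {'_id': 0}))

    def insert_proxy(self, proxy):
        # copy so insert_one does not add an _id key to the caller's dict
        self.collection.insert_one(dict(proxy))

    def delete_with_ip(self, ip):
        self.collection.delete_many({'ip': ip})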
Example 2

The same validator written as a Scrapy spider: start_requests yields one Request per (proxy, URL) pair, routes it through the proxy via Request.meta, and the success and error callbacks decide whether the proxy is kept or deleted.

import logging
import time

from scrapy import Request, Spider

# Mongo, FILTERED_PROXY and FREE_IP_PROXY are the same project-level
# helpers/constants as in Example 1.


class Validate(Spider):
    name = 'base'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.header = {}
        self.urls = []
        self.timeout = 7  # request timeout in seconds
        self.proxy = None
        self.tables = [FILTERED_PROXY, FREE_IP_PROXY]

    def start_requests(self):
        for table in self.tables:
            self.proxy = Mongo(table)  # note: the async callbacks use whichever table was opened last
            proxys = self.proxy.select_proxy()
            for proxy in proxys:
                for url in self.urls:
                    logging.info("Validate:" + "URL:" + url + "Proxy:" + proxy)
                    yield Request(url=url,
                                  meta={
                                      'proxy':
                                      'http://%s:%s' %
                                      (proxy.get("ip"), proxy.get("port")),
                                      'download_timeout':
                                      self.timeout,
                                  },
                                  priority=0,
                                  headers=self.header,
                                  dont_filter=True,
                                  callback=self.success_parse,
                                  errback=self.error_parse)

    def success_parse(self, response):
        # response succeeded: keep the proxy only if it answered quickly enough
        proxy = response.meta.get('proxy_info')
        speed = time.time() - response.meta.get('cur_time')
        if speed >= self.timeout:
            # too slow: delete this IP
            logging.info("Unqualified: %s is too slow, will be deleted", proxy)
            self.proxy.delete_with_ip(proxy.get("ip", ""))
        else:
            logging.info("Qualified: %s", proxy)

    def error_parse(self, failure):
        # the request errored or timed out: delete the proxy
        request = failure.request
        proxy = request.meta.get('proxy_info')
        proxy['https'] = request.meta.get('https')  # dict item ('https' may be absent from meta)
        self.proxy.delete_with_ip(proxy.get("ip", ""))
        logging.info("Unqualified: %s errored, will be deleted", proxy)

    def closed(self, reason):
        # Scrapy's spider-closed hook: move the validated free IPs into the
        # final proxy pool (not implemented)
        pass
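
To try the Scrapy variant end to end, something like the runner below could work. HttpBinValidate and the httpbin test URL are illustrative choices, not part of the original code: the spider's urls list is empty by default, so it has to be populated before a run does anything.

from scrapy.crawler import CrawlerProcess


class HttpBinValidate(Validate):
    # hypothetical subclass that gives the validator a test target
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.urls = ['http://httpbin.org/ip']


process = CrawlerProcess(settings={
    'LOG_LEVEL': 'INFO',
    'RETRY_ENABLED': False,  # one attempt per proxy is enough for validation
})
process.crawl(HttpBinValidate)
process.start()  # blocks until the spider closes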