Example no. 1
import logging

import requests

# Mongo, FILTERED_PROXY and FREE_IP_PROXY come from the project's own modules
# (not shown in these snippets).


class Validate:
    def __init__(self):
        self.header = {}
        self.urls = []
        self.timeout = 4  # request timeout in seconds
        self.tables = [FILTERED_PROXY, FREE_IP_PROXY]

    def start_requests(self):
        for table in self.tables:
            self.proxy = Mongo(table)
            proxys = self.proxy.select_proxy()
            for proxy in proxys:
                for url in self.urls:
                    logging.info("Validate:" + "URL:" + url + "Proxy:" + proxy)
                    proxies = {'http': '%s:%s' % (proxy.get("ip"), proxy.get("port"))}
                    try:
                        requests.get(url, proxies=proxies, timeout=self.timeout)
                        self.success_parse(proxy)
                    except:
                        self.error_parse(proxy)

    def success_parse(self, proxy):
        # response succeeded
        logging.info("Qualified: %s", proxy)

    def error_parse(self, proxy):
        # response failed: the proxy is too slow or unreachable, drop it
        self.proxy.delete_with_ip(proxy.get("ip"))
        logging.info("Unqualified: %s is too slow, will be deleted", proxy)

    def close(self):
        # move the crawled IPs into the final (filtered) IP collection
        pass_proxys = Mongo(FREE_IP_PROXY).select_proxy()
        for pass_proxy in pass_proxys:
            logging.info("Join:" + pass_proxy + "will join filtered_collection")
            Mongo(FILTERED_PROXY).insert_proxy(pass_proxy)
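The Mongo helper used throughout these examples is never shown. Below is a minimal sketch of what it might look like, assuming a pymongo backend, a local MongoDB instance and a "proxy_pool" database; the connection URI, the database name and the "speed" field used in obtain_fasted_proxy are all assumptions, not part of the original project.

# Hypothetical sketch of the Mongo wrapper used in these snippets.
from pymongo import MongoClient


class Mongo:
    def __init__(self, table):
        # assumed connection URI and database name
        self.client = MongoClient("mongodb://localhost:27017")
        self.collection = self.client["proxy_pool"][table]

    def select_proxy(self):
        # all stored proxy documents, without the internal _id field
        return list(self.collection.find({}, {"_id": 0}))

    def insert_proxy(self, proxy):
        # copy before inserting so pymongo does not add _id to the caller's dict
        self.collection.insert_one(dict(proxy))

    def delete_with_ip(self, ip):
        self.collection.delete_many({"ip": ip})

    def obtain_proxy(self, num=1):
        return list(self.collection.find({}, {"_id": 0}).limit(num))

    def obtain_fasted_proxy(self):
        # assumes each document records a "speed" field; lowest value wins
        return self.collection.find_one({}, {"_id": 0}, sort=[("speed", 1)])

    def drop_collections(self):
        self.collection.drop()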
Example no. 2
import logging
import time

from scrapy import Request, Spider

# Mongo, FILTERED_PROXY and FREE_IP_PROXY come from the project's own modules
# (not shown in these snippets).


class Validate(Spider):
    name = 'base'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.header = {}
        self.urls = []
        self.timeout = 7  # request timeout in seconds
        self.proxy = None
        self.tables = [FILTERED_PROXY, FREE_IP_PROXY]

    def start_requests(self):
        for table in self.tables:
            self.proxy = Mongo(table)
            proxys = self.proxy.select_proxy()
            for proxy in proxys:
                for url in self.urls:
                    logging.info("Validate:" + "URL:" + url + "Proxy:" + proxy)
                    yield Request(url=url,
                                  meta={
                                      'proxy':
                                      'http://%s:%s' %
                                      (proxy.get("ip"), proxy.get("port")),
                                      'download_timeout':
                                      self.timeout,
                                  },
                                  priority=0,
                                  headers=self.header,
                                  dont_filter=True,
                                  callback=self.success_parse,
                                  errback=self.error_parse)

    def success_parse(self, response):
        # response succeeded
        proxy = response.meta.get('proxy_info')
        speed = time.time() - response.meta.get('cur_time')
        if speed >= self.timeout:
            # too slow: delete this IP
            logging.info("Unqualified: %s is too slow, will be deleted", proxy)
            self.proxy.delete_with_ip(proxy.get("ip", ""))
        else:
            logging.info("Qualified: %s", proxy)

    def error_parse(self, failure):
        # request failed or timed out: delete this IP
        request = failure.request
        proxy = request.meta.get('proxy_info')
        proxy['https'] = request.meta.get('https', False)  # whether the check ran over https (not set in these snippets)
        self.proxy.delete_with_ip(proxy.get("ip", ""))
        logging.info("Unqualified: %s errored, will be deleted", proxy)

    def close(spider, reason):
        # merge the validated free IPs into the final proxy pool
        pass
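Unlike Example no. 1, this version never issues requests itself: start_requests only yields scrapy Request objects, so the class has to be driven by a Scrapy crawler. A minimal sketch of running it in-process with CrawlerProcess follows; the settings values are assumptions, and Example no. 9 below shows the project driving its spiders with scrapydo instead.

# Minimal sketch: run the Scrapy-based validator above in-process.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "LOG_LEVEL": "INFO",
    "RETRY_ENABLED": False,   # let a bad proxy fail straight into error_parse
    "DOWNLOAD_TIMEOUT": 7,    # matches self.timeout in the spider
})
process.crawl(Validate)
process.start()               # blocks until the crawl finishes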
Example no. 3
def start_requests(self):
    for table in self.tables:
        self.proxy = Mongo(table)
        proxys = self.proxy.select_proxy()
        for proxy in proxys:
            for url in self.urls:
                logging.info("Validate: URL: %s Proxy: %s", url, proxy)
                proxies = {'http': 'http://%s:%s' % (proxy.get("ip"), proxy.get("port"))}
                try:
                    requests.get(url, proxies=proxies, timeout=self.timeout)
                    self.success_parse(proxy)
                except requests.RequestException:
                    self.error_parse(proxy)
Example no. 4
def start_requests(self):
    for table in self.tables:
        self.proxy = Mongo(table)
        proxys = self.proxy.select_proxy()
        for proxy in proxys:
            for url in self.urls:
                logging.info("Validate: URL: %s Proxy: %s", url, proxy)
                yield Request(url=url,
                              meta={
                                  'proxy': 'http://%s:%s' % (proxy.get("ip"), proxy.get("port")),
                                  'download_timeout': self.timeout,
                                  # extra keys read back in the callbacks
                                  'proxy_info': proxy,
                                  'cur_time': time.time(),
                              },
                              priority=0,
                              headers=self.header,
                              dont_filter=True,
                              callback=self.success_parse,
                              errback=self.error_parse)
Example no. 5
def insert_data(self):
    # store this record (self behaves like a mapping, e.g. a scrapy.Item)
    # in the collection of freshly crawled free proxies
    proxy = Mongo(FREE_IP_PROXY)
    proxy.insert_proxy(dict(self))
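The dict(self) call above suggests the method lives on a mapping-like class such as a scrapy.Item. A hypothetical sketch of such an item follows, using only the ip and port keys that the other snippets read; the class name and field set are assumptions.

# Hypothetical proxy item; only ip and port are implied by the other snippets.
import scrapy


class ProxyItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()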
Example no. 6
def get_random_ip(self):
    # pick a proxy from the final, validated collection
    proxy = Mongo(FILTERED_PROXY).obtain_fasted_proxy()
    return proxy
Example no. 7
def get_ips(num=1):
    proxy = Mongo(FILTERED_PROXY)
    proxys = proxy.obtain_proxy(num)
    return json.dumps(proxys)
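get_ips returns a JSON string rather than Python objects, so callers decode it again. A small usage sketch follows, assuming the filtered collection already holds validated documents with ip and port keys.

# Example call; the proxy documents are assumed to carry ip and port keys.
import json

raw = get_ips(num=3)                 # JSON-encoded list of proxy documents
for proxy in json.loads(raw):
    print("http://%s:%s" % (proxy.get("ip"), proxy.get("port")))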
Example no. 8
def close(self):
    # move the crawled IPs into the final (filtered) IP collection
    pass_proxys = Mongo(FREE_IP_PROXY).select_proxy()
    for pass_proxy in pass_proxys:
        logging.info("Join: %s will join filtered_collection", pass_proxy)
        Mongo(FILTERED_PROXY).insert_proxy(pass_proxy)
Example no. 9
import logging
import os
import time

import scrapydo

# LOG_PATH, LOG_FILENAME, LOG_FORMAT, LOG_LEVEL, the Mongo wrapper, the
# collection names and the spider/validator classes come from the project's
# own modules (not shown in these snippets).


def init_log():
    # create the log directory if it does not exist yet
    if not os.path.exists(LOG_PATH):
        os.mkdir(LOG_PATH)

    logging.basicConfig(filename=LOG_FILENAME,
                        format=LOG_FORMAT,
                        level=LOG_LEVEL)


if __name__ == "__main__":
    # initialize logging
    init_log()
    logging.info("Begin proxy crawl")
    scrapydo.setup()  # one-time scrapydo (crochet) setup required before run_spider
    spiders = [IP181, SixIp, KuaiDaiLi, XiCiDaiLi]
    validates = [Baidu, HttpBin]

    while True:
        ipproxy = Mongo(FREE_IP_PROXY)
        ipproxy.drop_collections()
        # crawl proxies from each source spider
        for spider in spiders:
            scrapydo.run_spider(spider)
        # validate the crawled proxies
        for validate in validates:
            va = validate()
            va.start_requests()
            va.close()

        time.sleep(7200)
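The LOG_* constants used by init_log, and the collection names used throughout, come from a settings module that is not part of these snippets. A plausible sketch follows, with every value an assumption rather than the project's actual configuration.

# settings.py (hypothetical) - constants referenced by the snippets above.
import logging
import os

LOG_PATH = "logs"
LOG_FILENAME = os.path.join(LOG_PATH, "proxy.log")
LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
LOG_LEVEL = logging.INFO

FREE_IP_PROXY = "free_ip_proxy"      # freshly crawled, unverified proxies
FILTERED_PROXY = "filtered_proxy"    # proxies that passed validation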