Beispiel #1
0
class PoolAdder(object):
    """Add proxies to the pool until it holds at least ``threshold`` entries.

    ``__init__`` creates three collaborators: a ``RedisClient`` whose
    ``queue_len`` is read as the pool size, a ``ValidityTester`` that receives
    every crawled batch, and a ``FreeProxyGetter`` that exposes the crawl
    functions.
    """

    def __init__(self, threshold):
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        """Return True once the pool length has reached the threshold."""
        return self._conn.queue_len >= self._threshold

    def _crawl_and_test(self, index):
        """Run crawl function number ``index`` and hand its batch to the tester.

        Returns the number of raw (untested) proxies in the batch.
        """
        crawl_func = self._crawler.__CrawlFunc__[index]
        batch = self._crawler.get_raw_proxies(crawl_func)
        # test crawled proxies
        self._tester.set_raw_proxies(batch)
        self._tester.test()
        return len(batch)

    def add_to_queue(self):
        """Crawl and test proxies until the pool reaches the threshold.

        Each pass walks the crawl functions in index order and stops early as
        soon as the pool is full.

        Raises:
            ResourceDepletionError: if, at the end of a pass, the running
                total of crawled proxies is still zero.
        """
        print('PoolAdder is working')
        total_crawled = 0
        while not self.is_over_threshold():
            for index in range(self._crawler.__CrawlFuncCount__):
                total_crawled += self._crawl_and_test(index)
                if self.is_over_threshold():
                    print('IP is enough, waiting to be used')
                    break
            # The total is cumulative across passes, not reset per pass.
            if not total_crawled:
                raise ResourceDepletionError
Beispiel #2
0
class GetNewProxy(object):
    """Fetch new proxy IPs and feed them to the tester until the pool is full."""

    def __init__(self, max):
        # NOTE: ``max`` shadows the builtin of the same name; the parameter
        # name is kept so existing keyword callers keep working.
        self._max_count = max
        self._redis = RedisClient()
        self._tester = ValidityTester()
        self._getter = FreeProxyGetter()

    def is_over(self):
        """Return True when the pool size has reached the maximum count.

        The size is read from the store exactly once, so the value printed is
        the same value that is compared.
        """
        current_size = self._redis.len()
        print(current_size)
        return current_size >= self._max_count

    def add_new_proxy(self):
        """Add new proxy IPs to the proxy pool.

        Walks the getter's crawl functions in index order, passes each crawled
        batch to the tester and calls ``check_some_proxies()``; stops as soon
        as ``is_over()`` reports the pool is full.
        """
        print('Add and get new proxy')
        # NOTE(review): there is no depletion guard here -- if no pass ever
        # grows the pool, this loop does not terminate.
        while not self.is_over():
            for callback in range(self._getter.__CrawlFuncCount__):
                crawl_func = self._getter.__CrawlFunc__[callback]
                self._tester.set_raw_proxies(
                    self._getter.get_raw_proxies(crawl_func))
                self._tester.check_some_proxies()
                if self.is_over():
                    print('IP is enough, waiting to be used')
                    break
Beispiel #3
0
class PoolAdder(object):
    """Fill the proxy pool until it reaches ``upper_threshold`` entries."""

    def __init__(self, upper_threshold):
        # Upper bound on pool size (the original note beside it reads "150").
        self._upper_threshold = upper_threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def over_upper_threshold(self):
        """Return True when the pool has reached its upper threshold."""
        return self._conn.list_len >= self._upper_threshold

    def add_to_pool(self):
        """Crawl and test proxies until the pool reaches the upper threshold.

        Raises:
            ResourceDepletionError: if, at the end of a pass over the crawl
                functions, the running total of crawled proxies is zero.
        """
        print('PoolAdder is working...')
        crawler = self._crawler
        tester = self._tester
        crawled_total = 0
        while True:
            if self.over_upper_threshold():
                return
            for index in range(crawler.__CrawlFuncCount__):
                crawl_func = crawler.__CrawlFunc__[index]
                # Run the crawl callback and collect what it scraped.
                batch = crawler.get_raw_proxies(callback=crawl_func)
                # Replace the tester's pending batch, then run the test.
                tester.set_raw_proxies(batch)
                tester.test()
                crawled_total = crawled_total + len(batch)
                if not self.over_upper_threshold():
                    continue
                print('IPs are enough, waiting to be used')
                break  # enough data: stop this pass
            if not crawled_total:
                raise ResourceDepletionError
Beispiel #4
0
class PoolAdder(object):
    """
    Adder: responsible for replenishing the proxies in the pool.
    """
    def __init__(self, threshold):
        self._threshold = threshold
        self._conn = RedisClient()
        # NOTE(review): "VaildityTester" looks like a misspelling of
        # "ValidityTester". The name is defined outside this block, so it is
        # left untouched -- confirm against the import.
        self._tester = VaildityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        """
        Return True when the number of proxies in the pool has reached the
        threshold.
        """
        return self._conn.queue_len >= self._threshold

    def add_to_queue(self):
        """
        Have the crawler fetch a batch of untested proxies, test them, and
        add the proxies that pass the test to the pool.

        Each pass walks the crawl functions in index order and stops early
        once the pool reaches the threshold.

        Raises:
            ResourceDepletionError: if a complete pass over the crawl
                functions ends without filling the pool and the running
                total of crawled proxies is still zero.
        """

        Logger.log_normal('PoolAdder is working')
        proxy_count = 0
        while not self.is_over_threshold():
            for callback_label in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_label]
                raw_proxies = self._crawler.get_raw_proxies(callback)

                # test crawled proxies
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()

                proxy_count += len(raw_proxies)
                if self.is_over_threshold():
                    Logger.log_high('IP is enough, waiting to be used')
                    break
            else:
                # Judge depletion only after every crawl function has had its
                # turn; checking inside the loop aborted as soon as the first
                # source came back empty. The ``else`` skips the check when
                # the loop stopped because the pool is already full.
                if proxy_count == 0:
                    raise ResourceDepletionError
0
class PoolAdder(object):
    """
    Add proxies to the pool until it holds at least ``threshold`` entries.
    """

    def __init__(self, threshold):
        # ``threshold`` is the pool size at which adding stops.
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        # Source of the crawl functions (``__CrawlFunc__``) and their count.
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        """
        Return True when the stored proxy count has reached the threshold.
        """
        return self._conn.queue_len >= self._threshold

    def _crawl_batches(self):
        """
        Lazily yield the raw proxy batch of each crawl function, in index
        order; a function is only run when its batch is requested.
        """
        for position in range(self._crawler.__CrawlFuncCount__):
            # Each index selects one crawl function.
            crawl_func = self._crawler.__CrawlFunc__[position]
            yield self._crawler.get_raw_proxies(crawl_func)

    def add_to_queue(self):
        """
        Crawl and test proxies until the pool reaches the threshold.

        Raises:
            ResourceDepletionError: if, at the end of a pass over the crawl
                functions, the running total of crawled proxies is zero.
        """
        print('PoolAdder is working')
        crawled = 0
        # Keep making passes while the pool is below the threshold.
        while not self.is_over_threshold():
            for batch in self._crawl_batches():
                # test crawled proxies: hand the crawled free proxies over
                self._tester.set_raw_proxies(batch)
                self._tester.test()
                crawled += len(batch)
                if self.is_over_threshold():
                    # Pool is full: report and stop this pass.
                    print('IP is enough, waiting to be used')
                    break
            if crawled == 0:
                raise ResourceDepletionError
Beispiel #6
0
class PoolAdder(object):
    """
    Keep adding proxies to the pool until its length reaches ``threshold``.
    """

    def __init__(self, threshold):
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        """
        Tell whether the pool length has reached the threshold.
        """
        return self._conn.queue_len >= self._threshold

    def add_to_queue(self):
        """
        Run the crawl functions in index order, passing every crawled batch
        to the tester, until the pool reaches the threshold.

        Raises:
            ResourceDepletionError: if, at the end of a pass over the crawl
                functions, the running total of crawled proxies is zero.
        """
        print('PoolAdder is working')
        crawler, tester = self._crawler, self._tester
        crawled = 0
        while not self.is_over_threshold():
            for position in range(crawler.__CrawlFuncCount__):
                crawl_func = crawler.__CrawlFunc__[position]
                batch = crawler.get_raw_proxies(crawl_func)
                # test crawled proxies
                tester.set_raw_proxies(batch)
                tester.test()
                crawled += len(batch)
                if not self.is_over_threshold():
                    continue
                print('IP is enough, waiting to be used')
                break
            # The counter spans all passes; it is never reset.
            if not crawled:
                raise ResourceDepletionError