class Getter():
    """Fill the proxy pool by running every registered crawl function."""

    def __init__(self):
        # RedisClient / Crawler are project classes defined elsewhere in the package.
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the proxy pool has reached its capacity limit.

        :return: bool -- True if the stored proxy count is at or above
            POOL_UPPER_THRESHOLD.
        """
        # Idiomatic boolean return instead of if/else returning True/False.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Run every registered crawl function and store the fetched proxies."""
        print('获取器开始执行')
        if not self.is_over_threshold():
            # __CrawlFuncCount__ / __CrawlFunc__ are presumably populated by
            # the Crawler's metaclass elsewhere in the project — not visible here.
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    print(proxy, type(proxy))
                    self.redis.add(proxy)
class Getter():
    """Fill the proxy pool by running every registered crawl function."""

    def __init__(self):
        # RedisClient / Crawler are project classes defined elsewhere in the package.
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool already holds POOL_UPPER_THRESHOLD proxies."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Invoke every registered crawl function and store the proxies fetched."""
        print('获取器开始执行')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # BUG FIX: attribute is __CrawlFunc__ (capital C), matching
                # __CrawlFuncCount__ and the sibling Getter implementations in
                # this file; __crawlFunc__ would raise AttributeError at runtime.
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
class ValidityTester(object):
    """Asynchronously probe raw proxies against TEST_API and keep the live ones."""

    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None        # batch of proxies awaiting validation
        self._available_proxies = []
        self._conn = RedisClient()      # project Redis storage wrapper
        self.logger = logging.getLogger(__name__)

    def set_raw_proxies(self, proxies):
        """Set the batch of proxies that the next test() call will validate."""
        self._raw_proxies = proxies

    async def test_one_proxy(self, proxy):
        """Fetch test_api through *proxy*; store the proxy on an HTTP 200 reply."""
        async with aiohttp.ClientSession() as session:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = "http://" + proxy
            try:
                async with session.get(ValidityTester.test_api,
                                       proxy=real_proxy,
                                       timeout=10) as rep:
                    if rep.status == 200:
                        self._conn.add(proxy)
                        self.logger.info("valid proxy:" + proxy)
            except Exception:
                # Any network/timeout/protocol failure just marks the proxy
                # invalid; the exception details are not needed here.
                self.logger.error("invalid proxy:" + proxy)

    def test(self):
        """
        aio test all proxies.
        """
        print('ValidityTester is working')
        try:
            # Guard the empty/unset batch: asyncio.wait([]) raises ValueError
            # and there is nothing to do anyway.
            if not self._raw_proxies:
                return
            loop = asyncio.get_event_loop()
            tasks = [self.test_one_proxy(proxy) for proxy in self._raw_proxies]
            # BUG FIX: asyncio.wait() no longer accepts bare coroutines
            # (deprecated in 3.8, TypeError since 3.11); gather() schedules
            # them itself and waits for all of them.
            loop.run_until_complete(asyncio.gather(*tasks))
        except ValueError:
            self.logger.error('Async Error')
class Getter():
    """Fetch proxies via every registered crawl function and push them to Redis."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """ Compare to the capacity of proxy pool. """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Run every crawl callback unless the pool is already at capacity."""
        logger.debug('Getter is running.')
        if self.is_over_threshold():
            return
        for index in range(self.crawler.__CrawlFuncCount__):
            crawl_func = self.crawler.__CrawlFunc__[index]
            # fetch proxies from this crawl source
            fetched = self.crawler.get_proxies(crawl_func)
            sys.stdout.flush()
            for proxy in fetched:
                self.redis.add(proxy)