def extend_proxy_pool():
    """Watch the pool size and kick off a crawl cycle whenever it runs low."""
    loop = asyncio.get_event_loop()
    proxies = asyncio.Queue()
    crawler = ProxyCrawler(proxies)
    validator = ProxyValidator()
    while True:
        # Pool is healthy -- just sleep and re-check.
        if conn.count > LOWER_LIMIT:
            time.sleep(CHECK_CYCLE_TIME)
            continue
        logger.debug('extend proxy pool started')
        flag = asyncio.Event()  # create new flag for this cycle
        try:
            cycle = asyncio.gather(
                ProxyPool.crawler_start(crawler, validator, proxies, flag),
                ProxyPool.crawler_stop(crawler, flag),
            )
            loop.run_until_complete(cycle)
        except Exception:
            logger.error(traceback.format_exc())
        logger.debug('extend proxy pool finished')
        time.sleep(CHECK_INTERVAL_TIME)
        crawler.reset()
def extend_proxy_pool():
    """Watch the pool size and kick off a crawl cycle whenever it runs low."""
    conn = rc()
    loop = asyncio.get_event_loop()
    flag = asyncio.Event()
    proxies = asyncio.Queue()
    crawler = ProxyCrawler(proxies)
    validator = ProxyValidator(conn)
    while True:
        # Pool is healthy -- just sleep and re-check.
        if conn.count > lower_limit:
            time.sleep(check_cycle_time)
            continue
        logger.debug('extend proxy pool started')
        try:
            cycle = asyncio.gather(
                ProxyPool.crawler_start(crawler, validator, proxies, flag),
                ProxyPool.crawler_stop(crawler, conn, flag),
            )
            loop.run_until_complete(cycle)
        except Exception as e:
            logger.error(e, exc_info=True)
        logger.debug('extend proxy pool finished')
        time.sleep(check_interval_time)
        flag.clear()  # clear flag so the next cycle starts fresh
        crawler.reset()
def proxy_crawler_run(proxies, rules=None):
    """Run the proxy crawler to completion, then close the event loop.

    Args:
        proxies: asyncio.Queue that the crawler fills with fetched proxies.
        rules: optional crawl rules forwarded to ProxyCrawler.
    """
    pc = ProxyCrawler(proxies, rules)
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(pc.start())
    except Exception:
        # Was a bare `except:`, which also swallows KeyboardInterrupt and
        # SystemExit; catch Exception so the process can still be stopped.
        logger.error(traceback.format_exc())
    finally:
        loop.close()
def proxy_validator_run():
    """Re-validate stored proxies forever on a dedicated event loop."""
    conn = rc()
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    validator = ProxyValidator(conn)
    while True:
        try:
            loop.run_until_complete(validator.start())
        except Exception as e:
            logger.error(e, exc_info=True)
        # Wait out the configured interval before the next validation pass.
        time.sleep(validate_cycle_time)
def proxy_validator_run():
    """Re-validate stored proxies forever on a dedicated event loop."""
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    validator = ProxyValidator()
    while True:
        logger.debug('regular validator started')
        try:
            loop.run_until_complete(validator.start())
        except Exception:
            logger.error(traceback.format_exc())
        logger.debug('regular validator finished')
        # Wait out the configured interval before the next validation pass.
        time.sleep(VALIDATE_CYCLE_TIME)
async def validate(self, proxies):
    """Consume proxies from the queue, keeping only the ones that work.

    Each proxy is tried against ``self.validate_url``; a proxy whose
    response is HTTP 200 is persisted via ``self._conn.put``.

    Args:
        proxies: asyncio.Queue of proxy address strings (``host:port``).
    """
    logger.debug('validator started')
    while 1:
        proxy = await proxies.get()
        async with aiohttp.ClientSession() as session:
            try:
                real_proxy = 'http://' + proxy
                async with session.get(self.validate_url, proxy=real_proxy,
                                       timeout=validate_timeout) as resp:
                    # Only a 200 proves the proxy actually relays traffic;
                    # previously any non-raising response (e.g. 502) passed.
                    # Matches the status check used by _validator().
                    if resp.status == 200:
                        self._conn.put(proxy)
            except Exception as e:
                logger.error(e)
        proxies.task_done()
async def _validator(self, proxy):
    """Probe a single proxy and persist it via ``conn.put`` if it answers 200.

    Args:
        proxy: proxy address string (``host:port``).
    """
    # Copy HEADERS instead of mutating the shared module-level dict:
    # concurrent validator coroutines would otherwise race on 'User-Agent'.
    headers = dict(HEADERS)
    headers['User-Agent'] = choice(USER_AGENT)
    async with aiohttp.ClientSession() as session:
        try:
            real_proxy = 'http://' + proxy
            async with session.get(self.validate_url, headers=headers,
                                   proxy=real_proxy,
                                   timeout=VALIDATE_TIMEOUT) as resp:
                if resp.status == 200:
                    conn.put(proxy)
        except asyncio.TimeoutError:
            # Timeouts are the normal failure mode for dead proxies;
            # not worth logging.
            pass
        except Exception as e:
            logger.error(e)
async def regular_validate(self):
    """Re-check a batch of stored proxies and write back the live ones.

    Pulls up to ``validate_upper_limit`` proxies (a ``validate_ratio``
    slice of the pool) from storage, probes each against
    ``self.validate_url``, and puts only the responsive ones back.
    """
    # Cap the batch so a huge pool cannot stall a validation cycle.
    count = min(ceil(self._conn.count * validate_ratio), validate_upper_limit)
    old_proxies = self._conn.get_list(count)
    valid_proxies = []
    logger.debug('regular validator started, {0} to validate'.format(len(old_proxies)))
    async with aiohttp.ClientSession() as session:
        for proxy in old_proxies:
            try:
                # proxy comes back from redis as bytes; decode for the URL.
                real_proxy = 'http://' + proxy.decode('utf-8')
                async with session.get(self.validate_url, proxy=real_proxy,
                                       timeout=validate_timeout) as resp:
                    # Require 200 like _validator(); previously any
                    # non-raising response kept the proxy alive.
                    if resp.status == 200:
                        valid_proxies.append(proxy)
            except asyncio.TimeoutError:
                continue
            except Exception as e:
                logger.error(e)
    logger.debug('regular validator finished, {0} passed'.format(len(valid_proxies)))
    self._conn.put_list(valid_proxies)