def deduction(self, proxy):
    """
    deduct score of proxy
    :param proxy: proxy
    :return:
    """
    score = self.con.zscore(REDIS_KEY, proxy.string())
    if score and score > MIN_SCORE:
        logger.info(f"{proxy.string()} with score {score}, deduct")
        return self.con.zincrby(REDIS_KEY, -1, proxy.string())
    else:
        logger.info(f"{proxy.string()} with score {score}, remove")
        return self.con.zrem(REDIS_KEY, proxy.string())
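# A minimal sketch of the sorted-set bookkeeping behind deduction(), assuming a
# local Redis instance and illustrative values for REDIS_KEY, MIN_SCORE and
# INITIAL_SCORE (the names mirror the config above; the values are assumptions
# for this demo only).
import redis

con = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)
REDIS_KEY, MIN_SCORE, INITIAL_SCORE = 'proxies:demo', 10, 10

con.zadd(REDIS_KEY, {'1.2.3.4:8080': INITIAL_SCORE})  # register a proxy
print(con.zscore(REDIS_KEY, '1.2.3.4:8080'))          # 10.0
con.zincrby(REDIS_KEY, -1, '1.2.3.4:8080')            # one failed test: 9.0
# once the score is no longer above MIN_SCORE, deduction() drops the member
con.zrem(REDIS_KEY, '1.2.3.4:8080')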
async def test(self, proxy: Proxy):
    """
    test single proxy
    :param proxy: Proxy object
    :return:
    """
    # the async with block closes the session automatically on exit
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        try:
            logger.info(f'testing {proxy.string()}')
            async with session.get(TEST_URL, proxy=f'http://{proxy.string()}',
                                   timeout=TEST_TIMEOUT,
                                   allow_redirects=False) as response:
                if response.status in TEST_VALID_STATUS:
                    self.redis.max(proxy)
                    logger.info(f'proxy {proxy.string()} is valid, set max score')
                else:
                    self.redis.deduction(proxy)
                    logger.info(f'proxy {proxy.string()} is invalid, decrease score')
        except EXCEPTIONS:
            self.redis.deduction(proxy)
            logger.warning(f'proxy {proxy.string()} is invalid, decrease score')
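# A standalone sketch of the same aiohttp proxy check, runnable without the
# Tester class. The target URL, timeout, status whitelist and proxy address are
# placeholder assumptions, not values from this repo's config.
import asyncio
import aiohttp

async def check(proxy_addr: str) -> bool:
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        try:
            async with session.get('https://www.baidu.com',
                                   proxy=f'http://{proxy_addr}',
                                   timeout=aiohttp.ClientTimeout(total=10),
                                   allow_redirects=False) as response:
                return response.status in {200, 206, 302}
        except Exception:
            # any connection failure counts as an invalid proxy
            return False

if __name__ == '__main__':
    print(asyncio.run(check('1.2.3.4:8080')))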
def run(self):
    """
    crawl proxies from all crawlers and add them to the pool
    :return:
    """
    def usable(proxy) -> bool:
        # verify a proxy by fetching a known page through it
        proxies = {'https': 'https://' + proxy}
        try:
            if requests.get('https://movie.douban.com/', proxies=proxies,
                            cookies=self.cookies, timeout=2,
                            headers=self.headers).status_code == 200:
                return True
        except requests.exceptions.ProxyError:
            logger.error(f"proxy error with {proxy}")
        except requests.exceptions.ConnectTimeout:
            logger.error(f"ConnectTimeout with {proxy}")
        return False

    if self.full():
        return
    for crawler in self.crawlers:
        for proxy in crawler.crawl():
            if self.con.add(proxy):
                logger.info(f"proxy {proxy.string()} has been added")
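# A hedged sketch of the crawler interface the loop above relies on: each
# crawler exposes crawl(), a generator of Proxy objects with a string() method.
# The Proxy class shape and the sample source URL here are illustrative
# assumptions, not this repo's actual implementations.
import re
import requests

class Proxy:
    def __init__(self, host: str, port: int):
        self.host, self.port = host, port

    def string(self) -> str:
        return f'{self.host}:{self.port}'

class DemoCrawler:
    url = 'https://example.com/free-proxy-list'  # placeholder source

    def crawl(self):
        text = requests.get(self.url, timeout=10).text
        # pull host:port pairs out of the page body
        for host, port in re.findall(r'(\d+\.\d+\.\d+\.\d+):(\d+)', text):
            yield Proxy(host, int(port))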
def run(self):
    """
    test main method
    :return:
    """
    # event loop of aiohttp
    logger.info('starting tester...')
    count = self.redis.count()
    logger.info(f'{count} proxies to test')
    for i in range(0, count, TEST_BATCH):
        # start and end offsets of the current batch
        start, end = i, min(i + TEST_BATCH, count)
        logger.info(f'testing proxies at indices {start} to {end}')
        proxies = self.redis.batch(start, end)
        tasks = [self.test(proxy) for proxy in proxies]
        # run tasks using the event loop; gather accepts bare coroutines,
        # unlike asyncio.wait, which requires Task objects on Python 3.11+
        self.loop.run_until_complete(asyncio.gather(*tasks))
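# How the batch loop above partitions the index space: a quick illustration
# with assumed values count=23 and TEST_BATCH=10.
count, TEST_BATCH = 23, 10
for i in range(0, count, TEST_BATCH):
    start, end = i, min(i + TEST_BATCH, count)
    print(start, end)
# prints 0 10, then 10 20, then 20 23 -- the last batch is clipped to count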
def run(self):
    global tester_process, getter_process, server_process
    try:
        logger.info('starting ProxyPool...')
        tester_process = multiprocessing.Process(target=self.run_tester)
        tester_process.start()
        # pid is only assigned after start()
        logger.info(f'starting tester, pid {tester_process.pid}...')
        getter_process = multiprocessing.Process(target=self.run_getter)
        getter_process.start()
        logger.info(f'starting getter, pid {getter_process.pid}...')
        server_process = multiprocessing.Process(target=self.run_server)
        server_process.start()
        logger.info(f'starting server, pid {server_process.pid}...')
        tester_process.join()
        getter_process.join()
        server_process.join()
    except KeyboardInterrupt:
        logger.info('received keyboard interrupt signal')
        tester_process.terminate()
        getter_process.terminate()
        server_process.terminate()
    finally:
        # must call join before is_alive so the status reflects termination
        tester_process.join()
        getter_process.join()
        server_process.join()
        logger.info(f'tester is {"alive" if tester_process.is_alive() else "dead"}')
        logger.info(f'getter is {"alive" if getter_process.is_alive() else "dead"}')
        logger.info(f'server is {"alive" if server_process.is_alive() else "dead"}')
        logger.info('proxy pool terminated')
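# A minimal illustration of why the scheduler joins before calling is_alive():
# terminate() only sends the signal, and the child may linger until it is
# reaped; join() waits for that, so the subsequent is_alive() check is
# accurate. Standalone demo with an assumed worker function.
import time
import multiprocessing

def worker():
    while True:
        time.sleep(1)

if __name__ == '__main__':
    p = multiprocessing.Process(target=worker)
    p.start()
    p.terminate()
    p.join()             # reap the child before asking about its state
    print(p.is_alive())  # False -- reliable only after join()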