def run(self):
    """
    Test every proxy currently in the pool, one batch at a time.

    The pool size is read once via ``self.redis.count()``; proxies are then
    fetched in slices of at most ``BATCH_TEST_SIZE`` and each slice is tested
    concurrently with ``self.test_single_proxy``. Any exception raised while
    testing is logged as a warning and ends the current run; it is not
    re-raised.

    :return: None
    """
    logger.debug('Tester is running.')
    try:
        count = self.redis.count()
        logger.info(
            'There are {} proxy (proxies) in proxy pool now.'.format(count))
        for start in range(0, count, BATCH_TEST_SIZE):
            stop = min(start + BATCH_TEST_SIZE, count)
            # Test in batches to keep memory overhead bounded.
            logger.debug(
                'Testing proxies with index between {} and {}.'.format(
                    start + 1, stop))
            test_proxies = self.redis.batch(start, stop)
            # Test asynchronously to speed things up.
            loop = asyncio.get_event_loop()
            # asyncio.wait() no longer accepts bare coroutine objects
            # (deprecated in 3.8, TypeError since 3.11), so wrap each
            # coroutine in a Task first.
            tasks = [
                loop.create_task(self.test_single_proxy(proxy))
                for proxy in test_proxies
            ]
            # asyncio.wait() raises ValueError on an empty collection; the
            # batch can come back empty if the pool shrank after count()
            # was read, so skip the wait in that case.
            if tasks:
                loop.run_until_complete(asyncio.wait(tasks))
            sys.stdout.flush()
            # Pause between batches.
            time.sleep(5)
        logger.info('Testing finished')
    except Exception as e:
        logger.warning('Tester error {}'.format(e.args))
def schedule_getter(self, cycle=GETTER_CYCLE):
    """
    Run the proxy getter over and over, pausing between rounds.

    A single ``Getter`` instance is created up front and reused for every
    round. The loop has no exit condition, so this call only ends if an
    exception propagates out of it.

    :param cycle: seconds to sleep after each getter round
    """
    proxy_getter = Getter()
    while True:
        logger.info('Getter begins to run.')
        proxy_getter.run()
        # Wait one full cycle before fetching again.
        time.sleep(cycle)
def schedule_tester(self, cycle=TESTER_CYCLE):
    """
    Run the proxy tester over and over, pausing between rounds.

    A single ``Tester`` instance is created up front and reused for every
    round. The loop has no exit condition, so this call only ends if an
    exception propagates out of it.

    :param cycle: seconds to sleep after each tester round
    """
    proxy_tester = Tester()
    while True:
        logger.info('Tester begins to run.')
        proxy_tester.run()
        # Wait one full cycle before testing again.
        time.sleep(cycle)
def run(self):
    """
    Start each enabled component of the proxy pool in its own process.

    The tester, getter and API are started in that order, each only when its
    ``*_ENABLED`` flag is truthy. The started processes are not joined, so
    this method returns as soon as they have been launched.
    """
    logger.info('Proxy pool management system begins to run.')

    # Each target is looked up only when its flag is enabled.
    if TESTER_ENABLED:
        Process(target=self.schedule_tester).start()

    if GETTER_ENABLED:
        Process(target=self.schedule_getter).start()

    if API_ENABLED:
        Process(target=self.schedule_api).start()
def get_page(url, options=None):
    """
    Fetch a page with an HTTP GET request and return its text.

    :param url: address of the page to fetch
    :param options: optional mapping of extra header entries; they are merged
        over ``base_headers`` and win on duplicate keys. ``None`` (the
        default) adds nothing.
    :return: the response text when the status code is 200; ``None`` for any
        other status code or when a ``ConnectionError`` is caught
    """
    # A None default replaces the original shared mutable ``{}`` default.
    extra_headers = options if options is not None else {}
    headers = dict(base_headers, **extra_headers)
    logger.debug('Crawling: {}'.format(url))
    try:
        response = requests.get(url, headers=headers)
        logger.info('Finished crawling {}, status_code is {}'.format(
            url, response.status_code))
        if response.status_code == 200:
            return response.text
    # NOTE(review): this only catches requests' failures if the module imports
    # ConnectionError from requests.exceptions; the builtin ConnectionError is
    # a different class -- confirm against the file's imports.
    except ConnectionError as e:
        logger.warning('Failed to crawl {} because of {}'.format(url, repr(e)))
    return None
def count(self):
    """
    Report how many proxies the pool currently holds.

    :return: the value returned by ``zcard`` for ``REDIS_KEY``
    """
    total = self.db.zcard(REDIS_KEY)
    return total


def all(self):
    """
    Fetch every proxy whose score lies between MIN_SCORE and MAX_SCORE.

    :return: the members returned by ``zrangebyscore`` for that score range
    """
    proxies = self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)
    return proxies


def batch(self, start, stop):
    """
    Fetch one slice of the pool by index, using ``zrevrange``.

    :param start: index of the first proxy in the slice
    :param stop: index just past the last proxy in the slice
    :return: the members returned by ``zrevrange`` for that index range
    """
    # stop is treated as exclusive here; zrevrange is given stop - 1 as its
    # end index.
    last_index = stop - 1
    return self.db.zrevrange(REDIS_KEY, start, last_index)


if __name__ == '__main__':
    # Ad-hoc manual check: query a batch, then count proxies at MAX_SCORE.
    conn = RedisClient()
    result_1 = conn.batch(680, 688)
    result_2 = conn.db.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)
    max_score_total = len(result_2)
    logger.info('There are {} proxies with MAX_SCORE'.format(max_score_total))