Example #1
0
    def run(self):
        """
        Test every proxy currently in the pool, one batch at a time.

        The pool size is read once via ``self.redis.count()``; proxies are
        then fetched in slices of ``BATCH_TEST_SIZE`` and each slice is
        tested concurrently with ``self.test_single_proxy``. After every
        batch stdout is flushed and the method sleeps for 5 seconds.

        Any exception raised while testing is caught and logged as a
        warning, so this method does not propagate errors to its caller.

        :return: None
        """
        logger.debug('Tester is running.')
        try:
            count = self.redis.count()
            logger.info(
                'There are {} proxy (proxies) in proxy pool now.'.format(
                    count))
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                # Test in batches to keep memory usage bounded.
                logger.debug(
                    'Testing proxies with index between {} and {}.'.format(
                        start + 1, stop))
                test_proxies = self.redis.batch(start, stop)
                # Test the whole batch concurrently to speed things up.
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines on
                # Python 3.11+, so wrap each one in a task/future first.
                tasks = [
                    asyncio.ensure_future(
                        self.test_single_proxy(proxy), loop=loop)
                    for proxy in test_proxies
                ]
                # The pool can shrink between count() and batch();
                # asyncio.wait() raises ValueError on an empty collection,
                # which would otherwise abort the remaining batches.
                if tasks:
                    loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
            logger.info('Testing finished')

        except Exception as e:
            logger.warning('Tester error {}'.format(e.args))
Example #2
0
 def schedule_getter(self, cycle=GETTER_CYCLE):
     """
     Run the proxy getter in an endless loop, pausing between rounds.

     A single ``Getter`` instance is created up front and reused for
     every round. The loop has no exit condition.

     :param cycle: number of seconds to sleep after each getter run
     """
     worker = Getter()
     while True:
         logger.info('Getter begins to run.')
         worker.run()
         # Wait one full cycle before starting the next round.
         time.sleep(cycle)
Example #3
0
 def schedule_tester(self, cycle=TESTER_CYCLE):
     """
     Run the proxy tester in an endless loop, pausing between rounds.

     A single ``Tester`` instance is created up front and reused for
     every round. The loop has no exit condition.

     :param cycle: number of seconds to sleep after each tester run
     """
     worker = Tester()
     while True:
         logger.info('Tester begins to run.')
         worker.run()
         # Wait one full cycle before starting the next round.
         time.sleep(cycle)
Example #4
0
    def run(self):
        """
        Launch the enabled components, each in its own ``Process``.

        The tester, getter and API are started (in that order) only when
        their respective ``*_ENABLED`` flag is truthy. The processes are
        started but not joined here.
        """
        logger.info('Proxy pool management system begins to run.')

        # Tester: periodically re-checks the proxies.
        if TESTER_ENABLED:
            Process(target=self.schedule_tester).start()

        # Getter: periodically fetches proxies.
        if GETTER_ENABLED:
            Process(target=self.schedule_getter).start()

        # API: runs whatever self.schedule_api starts.
        if API_ENABLED:
            Process(target=self.schedule_api).start()
Example #5
0
def get_page(url, options=None):
    """
    Fetch a page with a GET request and return its body text.

    :param url: address of the page to request
    :param options: optional mapping of extra header entries; it is merged
        over ``base_headers``, so its values win on duplicate keys
    :return: the response text when the status code is 200; ``None`` for
        any other status code or when a ``ConnectionError`` is caught
    """
    # Default is None rather than {} so no mutable dict is shared between
    # calls; an omitted or empty ``options`` adds nothing to the headers.
    headers = dict(base_headers, **(options or {}))
    logger.debug('Crawling: {}'.format(url))
    try:
        # NOTE(review): no timeout is passed, so a stalled server can block
        # this call indefinitely — consider adding one at the call site.
        response = requests.get(url, headers=headers)
    except ConnectionError as e:
        # NOTE(review): whether this is the builtin ConnectionError or the
        # one from requests.exceptions depends on the file's imports, which
        # are not visible here — confirm it matches what requests raises.
        logger.warning('Failed to crawl {} because of {}'.format(url, repr(e)))
        return None

    logger.info('Finished crawling {}, status_code is {}'.format(
        url, response.status_code))
    if response.status_code == 200:
        return response.text
    # Any non-200 answer is treated as "no page".
    return None
Example #6
0
    def count(self):
        """
        Report how many proxies are stored under ``REDIS_KEY``.

        :return: the value returned by ``zcard`` for ``REDIS_KEY``
        """
        total = self.db.zcard(REDIS_KEY)
        return total

    def all(self):
        """
        Fetch every proxy whose score is within ``MIN_SCORE``..``MAX_SCORE``.

        :return: the result of the ``zrangebyscore`` query on ``REDIS_KEY``
        """
        proxies = self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)
        return proxies

    def batch(self, start, stop):
        """
        Fetch one slice of proxies by index.

        The slice is requested through ``zrevrange`` with ``stop - 1`` as
        the last index, so ``stop`` itself is excluded.

        :param start: index of the first proxy in the slice
        :param stop: index just past the last proxy in the slice
        :return: the result of the ``zrevrange`` query for this slice
        """
        # presumably zrevrange treats both bounds as inclusive (as Redis
        # ZREVRANGE does), hence the shift by one — confirm against self.db
        last_index = stop - 1
        return self.db.zrevrange(REDIS_KEY, start, last_index)


if __name__ == '__main__':
    # Ad-hoc check when the module is run directly.
    client = RedisClient()
    # The slice itself is not used below; the call only exercises batch().
    sample_batch = client.batch(680, 688)
    # Using MAX_SCORE for both bounds selects entries scored exactly MAX_SCORE.
    top_scored = client.db.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)
    message = 'There are {} proxies with MAX_SCORE'.format(len(top_scored))
    logger.info(message)