예제 #1
0
class Getter:
    """Feed the Redis-backed proxy pool with proxies produced by the crawler."""

    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def is_over_threshold(self):
        """
        Tell whether the proxy pool has reached its size limit.

        :return: True when the stored proxy count is at or above
                 POOL_UPPER_THRESHOLD, otherwise False
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """
        Run every registered crawl function once and store the proxies
        that are not in the pool yet.

        Nothing is crawled when the pool is already at its limit; the limit
        is only checked once, before the first crawl function runs.

        :return: None
        """
        if self.is_over_threshold():
            return

        crawl_funcs = self.crawler.__CrawlFunc__
        for index in range(self.crawler.__CrawlFuncCount__):
            crawl_func = crawl_funcs[index]
            found = self.crawler.get_proxyies(crawl_func)
            # Flush stdout after each crawl; the original note says output
            # buffering behaves differently on Linux.
            sys.stdout.flush()
            for address in found:
                if self.redis.exists(address):
                    # Already pooled: only record the duplicate.
                    logging.info("重复抓取IP {}".format(address))
                    continue
                self.redis.add(address)
예제 #2
0
class Tester(object):
    """Check the proxies held in the pool by requesting TEST_URL through each one."""

    def __init__(self):
        self.db = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Probe a single proxy (coroutine) and record the outcome in the pool.

        A response whose status is in VALID_STATUS_CODES is reported through
        ``self.db.effective``; any other status, or any exception raised while
        requesting, is reported through ``self.db.invalid``.

        :param proxy: proxy address without scheme ('http://' is prepended),
                      given as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        # NOTE(review): verify_ssl looks deprecated in newer aiohttp releases
        # in favour of ssl=False; kept as-is because the pinned aiohttp version
        # is not visible from here -- confirm before changing.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.db.effective(proxy)
                    else:
                        self.db.invalid(proxy)
                        logging.error(
                            '请求码不合法{},IP{}'.format(response.status, proxy))
            except Exception as e:
                # Any failure (connect error, timeout, bad address, ...) counts
                # against the proxy.
                print(e)
                self.db.invalid(proxy)
                logging.error("{}代理请求失败".format(proxy))

    async def _test_batch(self, proxies):
        """Run test_single_proxy concurrently for every proxy in *proxies*."""
        # asyncio.wait() no longer accepts bare coroutines (TypeError on
        # Python 3.11+) and rejects an empty collection; gather() handles both.
        # return_exceptions=True keeps one failing check from aborting the rest
        # of the batch, as asyncio.wait() did.
        results = await asyncio.gather(
            *(self.test_single_proxy(proxy) for proxy in proxies),
            return_exceptions=True)
        for result in results:
            if isinstance(result, BaseException):
                logging.error('proxy check raised %r', result)

    @staticmethod
    def _event_loop():
        """Return a usable event loop, creating and installing one if needed."""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # No current loop (e.g. non-main thread, or newer Python versions).
            loop = None
        if loop is None or loop.is_closed():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop

    def run(self, batch_interval=5):
        """
        Test the pooled proxies in batches of BATCH_TEST_SIZE.

        The number of batches is derived from the pool size read once at the
        start. Any exception is logged at CRITICAL level and swallowed, so this
        method never raises ``Exception`` subclasses.

        :param batch_interval: seconds to sleep after each batch (default 5,
                               the previously hard-coded value)
        :return: None
        """
        try:
            count = self.db.count()
            logging.info('当前剩余{}个代理未检测'.format(count))
            for _ in range(ceil(count / BATCH_TEST_SIZE)):
                test_proxies = self.db.batch(BATCH_TEST_SIZE)
                if test_proxies:
                    loop = self._event_loop()
                    loop.run_until_complete(self._test_batch(test_proxies))
                sys.stdout.flush()
                time.sleep(batch_interval)
        except Exception as e:
            logging.critical('测试器发生错误{}'.format(e.args))