Beispiel #1
0
class Validation(object):
    """Re-score the proxies held by ``RedisClient`` by probing a URL through each.

    Every proxy returned by ``RedisClient.all()`` is used for one GET request
    to ``validation_addr``; a 200 response calls ``RedisClient.max(proxy)``,
    anything else (including any exception) calls ``RedisClient.decrease(proxy)``.
    """

    def __init__(self,
                 validation_addr="http://www.baidu.com",
                 host="www.baidu.com",
                 batch_size=100,
                 db_host='localhost',
                 port='6379',
                 key='Proxy'):
        """
        :param validation_addr: URL requested through every proxy
        :param host: forwarded to ``UserAgent(host=...)`` to build the headers
        :param batch_size: number of proxies probed concurrently per batch
        :param db_host: forwarded to ``RedisClient`` as ``host``
        :param port: forwarded to ``RedisClient`` as ``port``
        :param key: forwarded to ``RedisClient`` as ``key``
        """
        self.__redis = RedisClient(host=db_host,
                                   port=port,
                                   password=None,
                                   key=key)
        self.__host = host
        self.__addr = validation_addr
        self.__batch_size = batch_size

    async def validation_single(self, proxy):
        """Probe one proxy and update its score in redis.

        :param proxy: proxy address as ``str`` or UTF-8 ``bytes``; it is
            prefixed with ``http://`` before being handed to aiohttp
        :return: None
        """
        headers = UserAgent(host=self.__host).headers()
        # verify_ssl=False turns off certificate verification for the probe.
        # NOTE(review): newer aiohttp releases spell this ``ssl=False``.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                async with session.get(self.__addr,
                                       proxy=real_proxy,
                                       headers=headers,
                                       timeout=15) as response:
                    if response.status == 200:
                        self.__redis.max(proxy)
                    else:
                        self.__redis.decrease(proxy)
            except Exception:
                # Any failure while probing counts against the proxy.
                self.__redis.decrease(proxy)

    @staticmethod
    def _event_loop():
        """Return a usable event loop, creating and installing one if needed."""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # Raised when the current thread has no event loop set.
            loop = None
        if loop is None or loop.is_closed():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop

    async def _validate_batch(self, batch):
        """Probe every proxy in *batch* concurrently; return one outcome per proxy.

        An outcome is ``None`` on normal completion or the exception instance
        that escaped ``validation_single`` for that proxy.
        """
        # gather() is awaited from inside a coroutine so it always binds to the
        # running loop; asyncio.wait() no longer accepts bare coroutines on
        # recent Python versions.
        return await asyncio.gather(
            *[self.validation_single(proxy) for proxy in batch],
            return_exceptions=True)

    def run(self):
        """Validate all stored proxies, ``batch_size`` at a time.

        Never raises: a failure of a single probe is logged and the remaining
        proxies are still processed; a failure of the run itself (for example
        redis being unreachable) is logged and ends the run.

        :return: None
        """
        # Imported locally because this file's import block is not visible here.
        import logging

        log = logging.getLogger(__name__)
        try:
            proxies = self.__redis.all()
            loop = self._event_loop()
            for start in range(0, len(proxies), self.__batch_size):
                batch = proxies[start:start + self.__batch_size]
                outcomes = loop.run_until_complete(self._validate_batch(batch))
                for proxy, outcome in zip(batch, outcomes):
                    if isinstance(outcome, BaseException):
                        log.error("validation of proxy %r raised %r",
                                  proxy, outcome)
        except Exception:
            log.exception("proxy validation run aborted")
Beispiel #2
0
class Tester(object):
    """
        从redis中取出代理,测试代理是否可用,并调整代理IP的优先级
    """
    def __init__(self, test_url):
        self.redisdb = RedisClient()
        # 用来测试代理是否可用的地址
        self.test_url = test_url

    def test_proxy(self, proxy):
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy
            }
            print('正在检测:{}'.format(proxy))
            res = requests.get(self.test_url, proxies=proxies, timeout=10)
            if res.status_code == 200:
                return True, proxy
            else:
                return False, proxy
                # 代理不可用,就降低其优先级
        except Exception as e:
            return False, proxy
            # print('代理检测异常:{}  {}'.format(proxy, e))
            self.redisdb.decrease(proxy)
            print('代理不可用:{}'.format(proxy))


    def run(self):
        print('启动检测模块......')
        try:
            # 获取redis中所有爬取到的代理
            proxies = self.redisdb.get_all_proxy()
            for i in range(0, len(proxies), 50):
                test_proxies = proxies[i:i+50]
                workers = len(test_proxies)
                with futures.ThreadPoolExecutor(workers) as executor:
                    tasks_res = executor.map(self.test_proxy, test_proxies)
                    for res, proxy in tasks_res:
                        if not res:
                            # 代理不可用,就降低其优先级
                            self.redisdb.decrease(proxy)
                            print('代理不可用:{}'.format(proxy))
                        else:
                            # 代理可用,将其优先级置为最大
                            self.redisdb.max(proxy)
                            print('代理可用:{}'.format(proxy))

        except Exception as e:
            print(traceback.format_exc())
            print('检测模块出错!!!')
Beispiel #3
0
class Tester(object):
    """Score the proxies held by ``RedisClient`` by requesting ``TEST_URL`` through each."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Request ``TEST_URL`` through one proxy and update its score.

        :param proxy: proxy address as ``str`` or UTF-8 ``bytes``; it is
            prefixed with ``http://`` before being handed to aiohttp
        :return: None
        """
        # verify_ssl=False turns off certificate verification for the probe.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            # Bound before the ``try`` so the handler below can always print it.
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = 'http://' + proxy
            try:
                print('测试代理: {}'.format(real_proxy))
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       allow_redirects=False,
                                       timeout=10) as resp:
                    if resp.status in VALID_STATUS_CODE:
                        self.redis.max(proxy)
                        print('代理{} 可用'.format(real_proxy))
                    else:
                        # NOTE(review): `decrase` looks like a typo for
                        # `decrease`; RedisClient is not visible from here, so
                        # the name is kept -- confirm against its definition.
                        self.redis.decrase(proxy)
                        print('代理{}的返回状态错误'.format(real_proxy))
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrase(proxy)
                print("代理{}请求异常".format(real_proxy))

    @staticmethod
    def _event_loop():
        """Return a usable event loop, creating and installing one if needed."""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # Raised when the current thread has no event loop set.
            loop = None
        if loop is None or loop.is_closed():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop

    async def _test_batch(self, test_proxies):
        """Test all *test_proxies* concurrently; return one outcome per proxy.

        An outcome is ``None`` on normal completion or the exception instance
        that escaped ``test_single_proxy`` for that proxy.
        """
        # gather() is awaited from inside a coroutine so it always binds to the
        # running loop; asyncio.wait() no longer accepts bare coroutines on
        # recent Python versions.
        return await asyncio.gather(
            *[self.test_single_proxy(proxy) for proxy in test_proxies],
            return_exceptions=True)

    def run(self):
        '''
        Main entry point of the tester.

        Reads ``redis.count()`` once, then fetches and tests the proxies in
        windows of ``BATCH_TEST_SIZE`` via ``redis.batch(start, stop)``,
        sleeping two seconds after each tested window. Never raises: errors
        are printed.
        '''
        print('开始测试')
        try:
            count = self.redis.count()
            print('当前剩余{}个代理'.format(count))
            loop = self._event_loop()
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('测试测范围{}-{}'.format(start, stop))
                test_proxies = self.redis.batch(start, stop)
                if not test_proxies:
                    # The store may have shrunk since count() was read;
                    # nothing to test in this window.
                    continue
                outcomes = loop.run_until_complete(
                    self._test_batch(test_proxies))
                for outcome in outcomes:
                    if isinstance(outcome, BaseException):
                        # One failing probe must not stop the other batches.
                        print('测试异常{}'.format(outcome.args))
                sys.stdout.flush()
                time.sleep(2)
        except Exception as e:
            print('测试异常{}'.format(e.args))
Beispiel #4
0
class Tester(object):
    """Score the proxies held by ``RedisClient`` by requesting ``TEST_URL`` through each."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy and update its score.
        :param proxy: proxy address as ``str`` or UTF-8 ``bytes``; it is
            prefixed with ``http://`` before being handed to aiohttp
        :return: None
        """
        # verify_ssl=False turns off certificate verification for the probe.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试:', proxy)
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=15) as response:
                    # NOTE(review): `VALID_STSTUS_CODES` looks misspelled; it
                    # is defined outside this block, so the name is kept --
                    # confirm against its definition.
                    if response.status in VALID_STSTUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用:', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法', proxy)
            except (ClientError, ClientConnectorError, TimeoutError,
                    asyncio.TimeoutError, AttributeError):
                # asyncio.TimeoutError is listed explicitly: before Python
                # 3.11 it is not the builtin TimeoutError, so request timeouts
                # would otherwise escape without lowering the score.
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    @staticmethod
    def _event_loop():
        """Return a usable event loop, creating and installing one if needed."""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # Raised when the current thread has no event loop set.
            loop = None
        if loop is None or loop.is_closed():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop

    async def _test_batch(self, test_proxies):
        """Test all *test_proxies* concurrently; return one outcome per proxy.

        An outcome is ``None`` on normal completion or the exception instance
        that escaped ``test_single_proxy`` for that proxy.
        """
        # gather() is awaited from inside a coroutine so it always binds to the
        # running loop; asyncio.wait() no longer accepts bare coroutines on
        # recent Python versions.
        return await asyncio.gather(
            *[self.test_single_proxy(proxy) for proxy in test_proxies],
            return_exceptions=True)

    def run(self):
        """
        Main entry point of the tester.

        Tests every proxy from ``redis.all()`` in batches of
        ``BATCH_TEST_SIZE``, sleeping five seconds after each batch.
        Never raises: errors are printed.
        :return: None
        """
        print('测试器开始运行')
        try:
            proxies = self.redis.all()
            loop = self._event_loop()
            # Test in batches
            for start in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[start:start + BATCH_TEST_SIZE]
                outcomes = loop.run_until_complete(
                    self._test_batch(test_proxies))
                for outcome in outcomes:
                    if isinstance(outcome, BaseException):
                        # One failing probe must not stop the other batches.
                        print('测试器发生错误', outcome.args)
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Beispiel #5
0
class Tester(object):
    """Score the proxies held by ``RedisClient`` by requesting ``TEST_URL`` through each."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy and update its score.
        :param proxy: proxy address as ``str`` or UTF-8 ``bytes``; it is
            prefixed with ``http://`` before being handed to aiohttp
        :return: None
        """
        # verify_ssl=False turns off certificate verification for the probe.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = "http://" + proxy
                print("正在测试", proxy)
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print("代理可用", proxy)
                    else:
                        self.redis.decrease(proxy)
                        print("请求接口相应不合法", proxy)
            except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError,
                    AttributeError):
                # A proxy that cannot complete the request is penalised like
                # one that answers with a bad status; silently ignoring the
                # failure would leave dead proxies at their old score.
                # Anything else propagates and is reported by run().
                self.redis.decrease(proxy)
                print("代理请求失败", proxy)

    @staticmethod
    def _event_loop():
        """Return a usable event loop, creating and installing one if needed."""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # Raised when the current thread has no event loop set.
            loop = None
        if loop is None or loop.is_closed():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop

    async def _test_batch(self, test_proxies):
        """Test all *test_proxies* concurrently; return one outcome per proxy.

        An outcome is ``None`` on normal completion or the exception instance
        that escaped ``test_single_proxy`` for that proxy.
        """
        # gather() is awaited from inside a coroutine so it always binds to the
        # running loop; asyncio.wait() no longer accepts bare coroutines on
        # recent Python versions.
        return await asyncio.gather(
            *[self.test_single_proxy(proxy) for proxy in test_proxies],
            return_exceptions=True)

    def run(self):
        """
        Main entry point of the tester.

        Tests every proxy from ``redis.all()`` in batches of
        ``BATCH_TEST_SIZE``, sleeping five seconds after each batch.
        Never raises: errors are printed.
        :return: None
        """
        print("测试器开始执行")
        try:
            proxies = self.redis.all()
            loop = self._event_loop()
            # Test in batches
            for start in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[start:start + BATCH_TEST_SIZE]
                outcomes = loop.run_until_complete(
                    self._test_batch(test_proxies))
                for outcome in outcomes:
                    if isinstance(outcome, BaseException):
                        # One failing probe must not stop the other batches.
                        print("测试器发生错误", outcome.args)
                time.sleep(5)
        except Exception as e:
            print("测试器发生错误", e.args)
Beispiel #6
0
class Tester(object):
    """Score the proxies held by ``RedisClient`` by requesting ``TEST_URL`` through each."""

    def __init__(self):
        self.redis = RedisClient()

    # Asynchronous method
    async def test_single_proxy(self, proxy):
        """
        Check whether one proxy is usable and update its score.
        :param proxy: the proxy to check, as ``str`` or UTF-8 ``bytes``; it is
            prefixed with ``http://`` before being handed to aiohttp
        :return: None
        """

        # verify_ssl=False turns off certificate verification, which keeps
        # SSL errors from failing the probe.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        # Open a client session on that connector
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Proxies may arrive as bytes; decode them to str
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = "http://" + proxy
                print("testing...", proxy)
                # Issue the GET request through the proxy
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=15) as response:
                    # Accept any status listed in VALID_STATUS_CODE
                    if response.status in VALID_STATUS_CODE:
                        # Usable: promote the proxy via redis.max()
                        self.redis.max(proxy)
                        print("proxy ok", proxy)
                    else:
                        # Unacceptable status: lower the proxy's score
                        self.redis.decrease(proxy)
                        print("return code is illegal", proxy)
            except (aiohttp.ClientError, aiohttp.ClientConnectorError,
                    TimeoutError, asyncio.TimeoutError, AttributeError):
                # asyncio.TimeoutError is listed explicitly: before Python
                # 3.11 it is not the builtin TimeoutError, so request timeouts
                # would otherwise escape without lowering the score.
                self.redis.decrease(proxy)
                print("proxy request fail", proxy)

    @staticmethod
    def _event_loop():
        """Return a usable event loop, creating and installing one if needed."""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # Raised when the current thread has no event loop set.
            loop = None
        if loop is None or loop.is_closed():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop

    async def _test_batch(self, test_proxies):
        """Test all *test_proxies* concurrently; return one outcome per proxy.

        An outcome is ``None`` on normal completion or the exception instance
        that escaped ``test_single_proxy`` for that proxy.
        """
        # gather() is awaited from inside a coroutine so it always binds to the
        # running loop; asyncio.wait() no longer accepts bare coroutines on
        # recent Python versions.
        return await asyncio.gather(
            *[self.test_single_proxy(proxy) for proxy in test_proxies],
            return_exceptions=True)

    def run(self):
        """
        Main entry point of the tester.

        Tests every proxy from ``redis.all()`` in batches of
        ``BATCH_TEST_SIZE``, sleeping five seconds after each batch.
        Never raises: errors are printed.
        :return: None
        """
        print("测试器开始运行")
        try:
            proxies = self.redis.all()
            # Obtain the event loop that drives the probes
            loop = self._event_loop()
            # Test in batches of BATCH_TEST_SIZE proxies
            for start in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[start:start + BATCH_TEST_SIZE]
                outcomes = loop.run_until_complete(
                    self._test_batch(test_proxies))
                for outcome in outcomes:
                    if isinstance(outcome, BaseException):
                        # One failing probe must not stop the other batches.
                        print("error", outcome.args)
                time.sleep(5)
        except Exception as e:
            print("error", e.args)