class Validation(object):
    """Validate stored proxies by requesting a URL through each of them.

    Proxies are read from the ``RedisClient`` store. A proxy that returns
    HTTP 200 for the validation address is promoted with ``max``; any other
    status or any exception demotes it with ``decrease``.
    """

    def __init__(self, validation_addr="http://www.baidu.com", host="www.baidu.com",
                 batch_size=100, db_host='localhost', port='6379', key='Proxy'):
        """Store the validation settings and open the proxy store.

        :param validation_addr: URL requested through every proxy.
        :param host: value handed to ``UserAgent(host=...)`` when building
            the request headers.
        :param batch_size: number of proxies validated concurrently.
        :param db_host: host passed to ``RedisClient``.
        :param port: port passed to ``RedisClient``.
        :param key: key passed to ``RedisClient``.
        """
        self.__redis = RedisClient(host=db_host, port=port, password=None, key=key)
        self.__host = host
        self.__addr = validation_addr
        self.__batch_size = batch_size

    async def validation_single(self, proxy):
        """Validate a single proxy and update its score in the store.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``,
            without a scheme (``http://`` is prepended here).
        """
        headers = UserAgent(host=self.__host).headers()
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                async with session.get(self.__addr, proxy=real_proxy,
                                       headers=headers, timeout=15) as response:
                    if response.status == 200:
                        self.__redis.max(proxy)
                    else:
                        self.__redis.decrease(proxy)
            except Exception:
                # Any failure while talking through the proxy counts against it.
                self.__redis.decrease(proxy)

    def run(self):
        """Validate every stored proxy, ``batch_size`` proxies at a time.

        Errors are logged rather than raised so a failed validation pass
        does not take down the caller.
        """
        try:
            proxies = self.__redis.all()
            if not proxies:
                return
            loop = asyncio.get_event_loop()
            for start in range(0, len(proxies), self.__batch_size):
                batch = proxies[start:start + self.__batch_size]
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so each coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.validation_single(proxy))
                    for proxy in batch
                ]
                loop.run_until_complete(asyncio.wait(tasks))
        except Exception:
            # Local import: the file's import block is not visible from here.
            import logging
            logging.getLogger(__name__).exception("proxy validation run failed")
class Tester(object):
    """Take proxies out of Redis, test whether each one is usable, and
    adjust the priority of each proxy accordingly.
    """

    def __init__(self, test_url, batch_size=50):
        """Open the proxy store and remember the test settings.

        :param test_url: URL requested through each proxy to decide whether
            the proxy is usable.
        :param batch_size: number of proxies tested concurrently per batch
            (one worker thread per proxy in the batch).
        """
        self.redisdb = RedisClient()
        self.test_url = test_url
        self.batch_size = batch_size

    def test_proxy(self, proxy):
        """Request ``test_url`` through ``proxy``.

        The score is not touched here; ``run`` updates the store based on
        the returned flag.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``.
        :return: ``(ok, proxy)`` where ``ok`` is True only for an HTTP 200
            response; any exception yields ``(False, proxy)``.
        """
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            # NOTE(review): the 'https' entry uses an https:// proxy URL,
            # i.e. a TLS connection to the proxy itself. Plain HTTP proxies
            # are usually addressed as http:// for both keys - confirm.
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy
            }
            print('正在检测:{}'.format(proxy))
            res = requests.get(self.test_url, proxies=proxies, timeout=10)
            return res.status_code == 200, proxy
        except Exception:
            return False, proxy

    def run(self):
        """Test every stored proxy in batches and update its priority."""
        print('启动检测模块......')
        try:
            # Fetch every crawled proxy currently held in Redis.
            proxies = self.redisdb.get_all_proxy()
            for start in range(0, len(proxies), self.batch_size):
                batch = proxies[start:start + self.batch_size]
                with futures.ThreadPoolExecutor(len(batch)) as executor:
                    for ok, proxy in executor.map(self.test_proxy, batch):
                        if ok:
                            # Usable proxy: raise its priority to the maximum.
                            self.redisdb.max(proxy)
                            print('代理可用:{}'.format(proxy))
                        else:
                            # Unusable proxy: lower its priority.
                            self.redisdb.decrease(proxy)
                            print('代理不可用:{}'.format(proxy))
        except Exception:
            print(traceback.format_exc())
            print('检测模块出错!!!')
class Tester(object):
    """Test stored proxies concurrently and update their scores."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Request ``TEST_URL`` through one proxy and update its score.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``,
            without a scheme (``http://`` is prepended here).
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('测试代理: {}'.format(real_proxy))
                async with session.get(TEST_URL, proxy=real_proxy,
                                       allow_redirects=False, timeout=10) as resp:
                    if resp.status in VALID_STATUS_CODE:
                        self.redis.max(proxy)
                        print('代理{} 可用'.format(real_proxy))
                    else:
                        # NOTE(review): "decrase" looks like a misspelling of
                        # "decrease"; kept because RedisClient is not visible
                        # here - confirm the method name on RedisClient.
                        self.redis.decrase(proxy)
                        print('代理{}的返回状态错误'.format(real_proxy))
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrase(proxy)
                print("代理{}请求异常".format(real_proxy))

    def run(self):
        """Main test routine: test stored proxies in index-range batches."""
        print('开始测试')
        try:
            count = self.redis.count()
            print('当前剩余{}个代理'.format(count))
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('测试测范围{}-{}'.format(start, stop))
                test_proxies = self.redis.batch(start, stop)
                if not test_proxies:
                    # asyncio.wait() raises ValueError on an empty set, which
                    # would abort the remaining batches.
                    continue
                loop = asyncio.get_event_loop()
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so each coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(2)
        except Exception as e:
            print('测试异常{}'.format(e.args))
class Tester(object):
    """Test stored proxies concurrently and update their scores."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Test a single proxy.

        Requests ``TEST_URL`` through the proxy; an accepted status code
        promotes the proxy with ``max``, anything else (including request
        errors and timeouts) demotes it with ``decrease``.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``.
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试:', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15) as response:
                    # NOTE(review): "VALID_STSTUS_CODES" looks misspelled;
                    # kept because the constant is defined outside this block.
                    if response.status in VALID_STSTUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用:', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法', proxy)
            # asyncio.TimeoutError is listed explicitly: before Python 3.11
            # it is a different class from the builtin TimeoutError.
            except (ClientError, ClientConnectorError, asyncio.TimeoutError,
                    TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """Main test routine: test all stored proxies in batches.

        :return: None
        """
        print('测试器开始运行')
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            # Test in batches of BATCH_TEST_SIZE.
            for start in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[start:start + BATCH_TEST_SIZE]
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so each coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
class Tester(object):
    """Test stored proxies concurrently and update their scores."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Test a single proxy.

        Requests ``TEST_URL`` through the proxy; an accepted status code
        promotes the proxy with ``max``, while a rejected status code or a
        failed request demotes it with ``decrease``.

        :param proxy: a single proxy, as ``str`` or UTF-8 encoded ``bytes``.
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = "http://" + proxy
                print("正在测试", proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print("代理可用", proxy)
                    else:
                        self.redis.decrease(proxy)
                        print("请求接口相应不合法", proxy)
            except Exception:
                # A proxy that cannot complete the request must lose score;
                # silently ignoring the failure would keep dead proxies at
                # their current score. `Exception` (not a bare except) lets
                # KeyboardInterrupt and task cancellation propagate.
                self.redis.decrease(proxy)
                print("代理请求失败", proxy)

    def run(self):
        """Main test routine: test all stored proxies in batches.

        :return: None
        """
        print("测试器开始执行")
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            # Test in batches of BATCH_TEST_SIZE.
            for start in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[start:start + BATCH_TEST_SIZE]
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so each coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print("测试器发生错误", e.args)
class Tester(object):
    """Test stored proxies concurrently and update their scores."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Check whether one proxy is usable and update its score.

        :param proxy: the proxy to check, as ``str`` or UTF-8 encoded
            ``bytes``.
        :return: None
        """
        # verify_ssl=False turns off certificate verification to avoid SSL errors.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        # One client session per tested proxy.
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Proxies may arrive as bytes; decode them to str.
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = "http://" + proxy
                print("testing...", proxy)
                # Issue the GET request through the proxy.
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15) as response:
                    if response.status in VALID_STATUS_CODE:
                        # Accepted status code: promote the proxy.
                        self.redis.max(proxy)
                        print("proxy ok", proxy)
                    else:
                        # Rejected status code: demote the proxy.
                        self.redis.decrease(proxy)
                        print("return code is illegal", proxy)
            # asyncio.TimeoutError is listed explicitly: before Python 3.11
            # it is a different class from the builtin TimeoutError.
            except (aiohttp.ClientError, aiohttp.ClientConnectorError,
                    asyncio.TimeoutError, TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print("proxy request fail", proxy)

    def run(self):
        """Main test routine: test all stored proxies in batches.

        :return: None
        """
        print("测试器开始运行")
        try:
            proxies = self.redis.all()
            # Event loop that drives each batch to completion.
            loop = asyncio.get_event_loop()
            # Test BATCH_TEST_SIZE proxies at a time.
            for start in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[start:start + BATCH_TEST_SIZE]
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so each coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print("error", e.args)