class Getter:
    """Fetch proxies from every registered crawl function and store them in Redis."""

    def __init__(self):
        # RedisClient: scored proxy store; Crawler: aggregates site-specific crawlers.
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool already holds POOL_UPPER_THRESHOLD proxies."""
        # Simplified from `if ...: return True else: return False` — the
        # comparison already yields a bool.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Run every crawl function once (unless the pool is full) and add results."""
        print('获取器开始执行')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()  # make progress visible when stdout is piped
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter:
    """Fetch proxies from every registered crawl function and store them in Redis."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the proxy pool has reached its size limit."""
        # Simplified from the redundant `return True / return False` branches.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Run every crawl function once (unless the pool is full) and add results."""
        print('获取器开始执行')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fixed typo: `get__proxies` -> `get_proxies` (the sibling
                # Getter/Fetcher variants all call `get_proxies`).
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
class Tester(object):
    """Validate proxies stored in Redis by issuing test requests through each one."""

    def __init__(self):
        # RedisClient: the project's scored proxy store.
        self.db = RedisClient()

    async def test_single_proxy(self, proxy):
        """Probe one proxy against TEST_URL and adjust its score.

        :param proxy: proxy address as bytes or str ("host:port")
        """
        # Skip certificate verification so HTTPS test targets don't reject us.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = "http://" + proxy
                async with session.get(url=TEST_URL, proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    # NOTE(review): VALUE_CODE is presumably the set of
                    # acceptable HTTP status codes — confirm in settings.
                    if response.status in VALUE_CODE:
                        print("代理", proxy, "有效,分数置为100")
                        self.db.max(proxy)
                    else:
                        print("代理", proxy, "响应码,分数减1")
                        self.db.decrease(proxy)
            except Exception:
                # Any network/protocol failure counts against the proxy.
                print("代理", proxy, "请求出错,分数减1")
                self.db.decrease(proxy)

    def run(self):
        """Test the whole pool, BATCH proxies at a time, concurrently per batch."""
        count = self.db.count()
        print('当前剩余', count, '个代理')
        for i in range(0, count, BATCH):
            start = i
            end = min(i + BATCH, count)
            proxies = self.db.batch(start, end)
            print('正在测试第', start + 1, '-', end, '个代理')
            loop = asyncio.get_event_loop()
            tasks = [self.test_single_proxy(proxy) for proxy in proxies]
            loop.run_until_complete(asyncio.wait(tasks))
            # Pause between batches to avoid hammering the test target.
            time.sleep(5)
class Fetcher:
    """Crawl proxy-listing sites and push the harvested proxies into Redis."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the proxy pool has reached its upper limit."""
        # Simplified from the redundant `return True / return False` branches.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Run every crawl function once (unless the pool is full) and store results."""
        print('获取器开始执行')
        if not self.is_over_threshold():
            for func in self.crawler.get_funclist():
                # Harvest proxy addresses from each configured proxy site.
                proxies = self.crawler.get_proxies(func)
                sys.stdout.flush()  # make progress visible when stdout is piped
                for proxy in proxies:
                    # Queue each harvested proxy in Redis.
                    self.redis.add(proxy)
class Getter:
    """Async proxy fetcher: runs each crawl coroutine and stores its proxies."""

    def __init__(self):
        """Initialize the database client and the spider."""
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool is full."""
        # Fixed typo: POOL_UPPER_THRESHLD -> POOL_UPPER_THRESHOLD (the
        # constant is spelled with the "O" everywhere else in this project).
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    async def run(self):
        """Await each crawl function in turn and add every proxy it yields."""
        print('开始获取代理...')
        if not self.is_over_threshold():
            for i in range(self.crawler.CrawlFuncCount):
                crawl_func = self.crawler.CrawlFunc[i]
                proxies = await self.crawler.get_proxy(crawl_func)
                for proxy in proxies:
                    print(proxy)
                    self.redis.add(proxy)
class Saver:
    """Crawl proxies and persist them into the Redis database."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool already holds enough proxies."""
        return self.redis.count() >= settings.proxy_enough_count

    def run(self):
        """Run each crawl function and store its proxies, unless the pool is full."""
        print('获取器开始执行')
        if self.is_over_threshold():
            return
        for crawl_func in self.crawler.crawl_funcs:
            for proxy in self.crawler.get_proxies(crawl_func):
                self.redis.add(proxy)
class Tester(object):
    """Async proxy tester: extracts "ip:port" from each record and probes TEST_URL."""

    # Raw string fixes the invalid escape sequences (`\d`, `\:`) that raise
    # DeprecationWarning in a plain string literal; compiled once at class level
    # so it is not rebuilt per request.
    _PROXY_RE = re.compile(r'(\d+\.\d+\.\d+\.\d+:\d+)')

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Test one proxy record and adjust its score in Redis.

        :param proxy: stored record (bytes or str) containing an "ip:port" substring
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                # Pull the ip:port part out of the stored record; an empty
                # match raises IndexError, which the except below treats as a
                # failed proxy.
                result = self._PROXY_RE.findall(proxy)
                real_proxy = 'http://' + result[0]
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)
            except Exception:
                self.redis.decrease(proxy)
                print('代理请求失败, 原因None', proxy)

    def run(self):
        """Test the whole pool, BATCH_TEST_SIZE proxies at a time."""
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(1)
        except Exception as e:
            print('测试器发生错误', e.args)
from db import RedisClient
from crawler import Crawler

if __name__ == "__main__":
    # Smoke check: report the current pool size, then dump one crawl source.
    client = RedisClient()
    print(client.count())
    spider = Crawler()
    for item in spider.crawl_dail66():
        print(item)
class Tester(object):
    """Validate pooled proxies by requesting a xueqiu.com page through each one."""

    def __init__(self):
        self.redis = RedisClient()

    # Test a single proxy.
    async def test_single_proxy(self, proxy):
        """Probe one proxy; max its score on success, decrease it on failure.

        :param proxy: "host:port" proxy as bytes or str
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                # NOTE(review): the hard-coded Cookie / token values below will
                # expire; once they do, failures measure the site's rejection
                # rather than the proxy itself — consider refreshing them.
                async with session.get(
                        TEST_URL,
                        proxy=real_proxy,
                        timeout=15,
                        allow_redirects=False,
                        headers={
                            'Host': 'xueqiu.com',
                            'Referer': 'https://xueqiu.com/u/8205178197',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                            'X-Requested-With': 'XMLHttpRequest',
                            'Cookie': 'device_id=5d27463e2df6a534e7ecba029eb95e29; xq_a_token=f89219d7e7ee863a5773244ad9d2db6e3dc5ea38; xq_r_token=8bdf53186f54b2c5c885621e64fd4d728f3111e0;',
                        }) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    # Tester entry point.
    def run(self):
        """Test the whole pool, BATCH_TEST_SIZE proxies at a time, concurrently per batch."""
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            # Batch testing.
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
class Tester(object):
    """Synchronous proxy tester: probes each proxy host directly with http.client."""

    def __init__(self):
        self.redis = RedisClient()
        self.headers = {
            'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        }

    def test_single_proxy(self, proxy):
        """Test one "ip:port" proxy by connecting to it and requesting TEST_URL.

        Success maxes the proxy's score; any failure decreases it.
        :param proxy: proxy address string "ip:port"
        """
        ip = proxy.split(':')[0]
        port = proxy.split(':')[1]
        print('正在测试', proxy)
        conn = None
        try:
            conn = http.client.HTTPConnection(ip, port, timeout=5.0)
            # Fixed typo: TETS_URL -> TEST_URL (the sibling testers and the
            # original async version of this method all use TEST_URL; the old
            # bare `except:` silently swallowed the resulting NameError).
            conn.request(method='GET', url=TEST_URL, headers=self.headers)
            conn.getresponse()
            print("+++Success:" + proxy)
            self.redis.max(proxy)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            print("---Failure:" + proxy)
            self.redis.decrease(proxy)
        finally:
            # Fix: the connection was previously never closed (socket leak).
            if conn is not None:
                conn.close()

    def run(self):
        """Test the whole pool sequentially, BATCH_TEST_SIZE proxies per batch."""
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                for proxy in test_proxies:
                    self.test_single_proxy(proxy)
                    # Short pause so targets aren't hammered back-to-back.
                    time.sleep(0.5)
        except Exception as e:
            print('测试器发生错误', e.args)
class IpValidation(Utility):
    """Validate stored proxies' availability and anonymity, adjusting Redis scores."""

    def __init__(self):
        self.redis = RedisClient()
        self.real_ip = ''
        # Score subtracted per failed validation, sized so a proxy is discarded
        # after VALIDATE_TIME consecutive failures.
        self.minus_every_time = (INITIAL_SCORE - DISCARD_SCORE) // VALIDATE_TIME
        self.key = PROXY_ORIGINAL
        self.anon_check_url = 'http://httpbin.org/ip'

    @staticmethod
    async def is_proxy_valid(proxy, url=TEST_URL):
        """Return True iff `proxy` can fetch `url` with a 2xx response."""
        headers = {'User-Agent': get_random_ua()}
        try:
            conn = aiohttp.TCPConnector(verify_ssl=False)
            async with aiohttp.ClientSession(headers=headers,
                                             connector=conn) as session:
                async with session.get(url, proxy=proxy, ssl=False) as resp:
                    code = resp.status
                    if 200 <= code < 300:
                        logger.info('%s is valid' % proxy)
                        return True
                    logger.info('%s is invalid, code: %s' % (proxy, code))
                    return False
        except (ClientConnectionError, ClientHttpProxyError, TimeoutError,
                CancelledError, ClientProxyConnectionError, Exception) as e:
            logger.warning(e)
            return False

    async def is_high_anon(self, proxy):
        """Return True iff `proxy` hides this host's real IP (high anonymity)."""
        url = ANON_CHECK_URL
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, proxy=proxy, ssl=False,
                                       timeout=15) as resp:
                    code = resp.status
                    if 200 <= code < 300:
                        x_forwarded_for_json = await resp.json()
                        if self.anon_check_url == ANON_CHECK_URL:
                            x_forwarded_for = x_forwarded_for_json['origin']
                        else:
                            # Adjust the key to whatever your own check API returns.
                            x_forwarded_for = x_forwarded_for_json[
                                'X-Forwarded-For']
                        if self.real_ip in x_forwarded_for:
                            return False
                        return True
                    return False
        except (ClientConnectionError, ClientHttpProxyError, TimeoutError,
                CancelledError, ClientProxyConnectionError, Exception) as e:
            logger.warning('proxy: %s, %s' % (proxy, e))
            return False

    async def test_proxy(self, proxy):
        """Score one stored "name-url" proxy record by its anonymity check."""
        try:
            if len(proxy.split('-')[1]) > 1:
                if not await self.is_high_anon(
                        proxy.split('-')[1].replace('https://', 'http://')):
                    self.redis.adjust_score(proxy, -self.minus_every_time,
                                            key=self.key)
                else:
                    self.redis.adjust_score(proxy, +1, key=self.key)
        except CancelledError as e:
            logger.warning('proxy: %s, %s' % (proxy, e))

    def get_real_ip(self):
        """Cache this machine's public IP for the anonymity comparison."""
        resp = requests.get(ANON_CHECK_URL)
        if self.anon_check_url == ANON_CHECK_URL:
            self.real_ip = resp.json()['origin'].split(',')[0]
        else:
            self.real_ip = resp.json()['X-Real-Ip']

    def _validate_batch(self, nums, start, end):
        """Fetch up to `nums` proxies scored within [start, end] and validate them concurrently."""
        if not nums:
            # Fix: asyncio.wait() raises ValueError on an empty task set.
            return
        self.get_real_ip()
        proxy_list = self.redis.get_proxy_by_score(start, end, nums,
                                                   key=self.key)
        if not proxy_list:
            return
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        tasks = [self.test_proxy(proxy) for proxy in proxy_list]
        loop.run_until_complete(asyncio.wait(tasks))

    def run_validation(self, key=None):
        """Validate unchecked proxies until none remain and the spider is idle."""
        if key:
            self.key = key
        logger.info('start checking...')
        start, end = DISCARD_SCORE + 1, INITIAL_SCORE
        while True:
            proxy_unvalidated = self.redis.count(start, end, name=self.key)
            if proxy_unvalidated:
                logger.info('checking...')
                if proxy_unvalidated <= CONCURRENCY_TASK_LIMIT:
                    self._validate_batch(proxy_unvalidated, start, end)
                else:
                    fetch_times = proxy_unvalidated // CONCURRENCY_TASK_LIMIT
                    # BUG FIX: the remainder needs %, not // — the original set
                    # left_nums to the quotient again, so the tail batch was
                    # sized wrongly (and could be 0, crashing asyncio.wait).
                    left_nums = proxy_unvalidated % CONCURRENCY_TASK_LIMIT
                    for _ in range(fetch_times):
                        self._validate_batch(CONCURRENCY_TASK_LIMIT, start, end)
                    self._validate_batch(left_nums, start, end)
            import settings
            if not proxy_unvalidated and not settings.SPIDER_RUNNING:
                settings.SPIDER_RUNNING = True
                self.key = PROXY_ORIGINAL
                logger.info('scrawl finished,all proxies check finished')
                break
class DownLoader(Utility):
    """Crawl the proxy-listing sites described by `rules` and feed pages to the parser."""

    def __init__(self):
        # `rules` is a module-level list of per-site crawl descriptors.
        self.rules = rules
        self.spider = ProxySpider()
        self.redis = RedisClient()
        self.crack_anti_crawl = CrackAntiCrawl()

    def start_crawl(self):
        """Download every rule's URL list; stop once enough validated proxies exist."""
        for start_urls in self.rules:
            urls = start_urls['resources']
            gfw = start_urls['GFW']
            name = start_urls['name']
            page_type = start_urls['type']
            referer = start_urls['referer']
            host = start_urls['host']
            anti_crawl = start_urls['AntiCrawl']
            cookies = None
            if anti_crawl:
                # HACK(review): dispatches to crack_anti_crawl.crack_<name>()
                # via eval on a rule-supplied name — unsafe if `rules` is ever
                # untrusted; a getattr() lookup would be safer.
                cookies = eval('crack()', {
                    'crack': eval('self.crack_anti_crawl.crack_{}'.format(name))
                })
            ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0'
            headers = {'User-Agent': ua, 'Referer': referer, 'Host': host}
            # A fresh event loop per rule (rather than get_event_loop) avoids
            # reusing a closed loop across iterations.
            # loop = asyncio.get_event_loop()
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            tasks = [
                self.proxy_downlaod(url, gfw, page_type, name, headers, cookies)
                for url in urls
            ]
            loop.run_until_complete(asyncio.wait(tasks))
            # Check the number of validated proxies; once the target count is
            # reached, stop crawling.
            validated_proxy_num = self.redis.count(VALIDATED_SCORE,
                                                   VALIDATED_SCORE)
            if validated_proxy_num >= VALIDATED_PROXY_NUM:
                break
        settings.SPIDER_RUNNING = False
        logger.info('scrawl finished')

    async def proxy_downlaod(self, url, gfw, page_type, name, headers, cookies):
        """Fetch one page (through GFW_PROXY when `gfw` is set) and parse it.

        NOTE(review): the method name has a typo ("downlaod") but is called
        consistently; renaming it would break external callers.
        """
        logger.info('downloading %s' % url)
        try:
            if not gfw:
                async with aiohttp.ClientSession(headers=headers,
                                                 cookies=cookies) as session:
                    # Random crawl interval. NOTE(review): time.sleep blocks
                    # the whole event loop — asyncio.sleep would be the
                    # non-blocking equivalent.
                    time.sleep(random.randint(1, 3) + random.random())
                    async with session.get(url, ssl=False) as r:
                        code = r.status
                        if 200 <= code < 300:
                            if page_type == 'normal':
                                text = await r.text()
                                try:
                                    # Dynamic dispatch to spider.parse_<name>(text).
                                    await eval(
                                        'parse(text)', {
                                            'parse':
                                            eval('self.spider.parse_{}'.format(
                                                name)),
                                            'text': text
                                        })
                                except Exception:
                                    logger.error('parse_%s error' % name,
                                                 exc_info=True)
                            else:
                                text = await r.text()
                                try:
                                    # API-style pages pass an extra flag.
                                    await eval(
                                        'parse(text,api)', {
                                            'parse':
                                            eval('self.spider.parse_{}'.format(
                                                name)),
                                            'text': text,
                                            'api': 1
                                        })
                                except Exception:
                                    logger.error('parse_%s error' % name,
                                                 exc_info=True)
                        else:
                            logger.error('page %s failed, status code: %s' %
                                         (url, code),
                                         exc_info=True)
            else:
                async with aiohttp.ClientSession(headers=headers,
                                                 cookies=cookies) as session:
                    # Random crawl interval (see blocking note above).
                    time.sleep(random.randint(1, 3) + random.random())
                    # GFW-blocked sites are fetched through GFW_PROXY.
                    async with session.get(url, proxy=GFW_PROXY,
                                           ssl=False) as r:
                        code = r.status
                        if 200 <= code < 300:
                            if page_type == 'normal':
                                text = await r.text()
                                try:
                                    await eval(
                                        'parse(text)', {
                                            'parse':
                                            eval('self.spider.parse_{}'.format(
                                                name)),
                                            'text': text
                                        })
                                except Exception:
                                    logger.error('parse_%s error' % name,
                                                 exc_info=True)
                            else:
                                text = await r.text()
                                try:
                                    # Here the flag is the page_type itself
                                    # (differs from the non-GFW branch above).
                                    await eval(
                                        'parse(text,api)', {
                                            'parse':
                                            eval('self.spider.parse_{}'.format(
                                                name)),
                                            'text': text,
                                            'api': page_type
                                        })
                                except Exception:
                                    logger.error('parse_%s error' % name,
                                                 exc_info=True)
                        else:
                            logger.error('page %s failed, status code: %s' %
                                         (url, code),
                                         exc_info=True)
        except (ClientConnectionError, ClientHttpProxyError,
                ClientProxyConnectionError, CancelledError, Exception):
            logger.error('page %s failed' % url, exc_info=True)
class Tester:
    """Async proxy tester: probes each pooled proxy against TEST_URL."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Test one proxy; max its score on a valid status, decrease otherwise.

        :param proxy: "host:port" proxy as bytes or str
        """
        # Decode bytes records coming from Redis.
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        real_proxy = 'http://' + proxy
        # Skip SSL verification so HTTPS test targets don't reject us.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                async with session.get(TEST_URL, proxy=real_proxy, timeout=7,
                                       allow_redirects=False) as req:
                    if req.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                    else:
                        self.redis.decrease(proxy)
            except Exception:
                # Any failure (timeout, refusal, protocol error) costs a point.
                self.redis.decrease(proxy)

    def run(self):
        """Test the whole pool, BATCH_SIZE proxies at a time."""
        try:
            count = self.redis.count()
            print('当前共有', count, '个代理!')
            for i in range(0, count, BATCH_SIZE):
                start = i
                # BUG FIX: `min(i + BATCH_SIZE, count - 1)` skipped the last
                # proxy and produced an empty batch when count == 1, which made
                # asyncio.wait([]) raise ValueError.
                stop = min(i + BATCH_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理!')
                proxies_list = self.redis.batch(start, stop)
                if not proxies_list:
                    continue
                loop = asyncio.get_event_loop()
                task = [
                    self.test_single_proxy(proxy) for proxy in proxies_list
                ]
                loop.run_until_complete(asyncio.wait(task))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
class Tester(object):
    """Async proxy tester using nest_asyncio so it can run inside an existing loop."""

    def __init__(self):
        self.redis = RedisClient()

    # `async` marks this method as a coroutine.
    async def test_single_proxy(self, proxy):
        """Test one proxy against TEST_URL and adjust its score.

        :param proxy: proxy address as bytes or str
        """
        conn = aiohttp.TCPConnector(ssl=False)
        # A session object supports multiple operations (get, post, put, head...).
        async with aiohttp.ClientSession(connector=conn) as session:
            # Decode bytes records coming from Redis.
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                # Issue the test request through the proxy.
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    # Acceptable status code -> max out the score.
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """Test the whole pool, BATCH_TEST_SIZE proxies at a time.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            # Take BATCH_TEST_SIZE proxies per iteration.
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                # min() ensures the final partial batch is still covered.
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                # nest_asyncio allows run_until_complete inside an already
                # running loop (avoids "event loop is already running").
                nest_asyncio.apply()
                loop = asyncio.get_event_loop()
                # Build one coroutine task per proxy.
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                # Run the batch concurrently to completion.
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()  # emit progress immediately, not at exit
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
class Verify:
    """Verify proxies from separate HTTP and HTTPS Redis pools, rescoring each."""

    def __init__(self):
        self.db = RedisClient()

    async def verify_proxy(self, redis_key, proxy):
        '''
        Verify one proxy IP against setting.TEST_URL.
        :param redis_key: which pool (HTTP or HTTPS) the proxy belongs to
        :param proxy: "host:port" proxy as bytes or str
        :return: None
        '''
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        re_proxy = 'http://' + proxy
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                async with session.get(setting.TEST_URL, proxy=re_proxy,
                                       timeout=6,
                                       allow_redirects=False) as resp:
                    # 200 or 302 counts as a working proxy.
                    if resp.status in [200, 302]:
                        print("{}||{}池:{}: ok 100点".format(
                            time.ctime(), redis_key, proxy))
                        self.db.max(redis_key, proxy)
                    else:
                        print("{}||{}池:{}: fail -1点".format(
                            time.ctime(), redis_key, proxy))
                        self.db.decrease(redis_key, proxy)
            except (aiohttp.ClientError, aiohttp.ClientConnectorError,
                    asyncio.TimeoutError) as e:
                print("{}||{}池:{}: error -1点".format(time.ctime(),
                                                     redis_key, proxy))
                self.db.decrease(redis_key, proxy)

    # async def run_by_redis(self, redis_key):
    #     count = self.db.count(redis_key)
    #     print(redis_key, '当前剩余', count, '个代理')
    #     for i in range(0, count, setting.TEST_SIZE):
    #         start = i
    #         end = min(i + setting.TEST_SIZE, count) - 1
    #         print('正在测试{}第'.format(redis_key), start + 1, '-', end + 1, '个代理')
    #         proxies = self.db.batch(redis_key, start, end)
    #         for proxy in proxies:
    #             await self.verify_proxy(redis_key, proxy)
    #
    # def run(self):
    #     print("开始验证代理")
    #     try:
    #         tasks = [
    #             self.run_by_redis(setting.REDIS_KEY_HTTP),
    #             self.run_by_redis(setting.REDIS_KEY_HTTPS)
    #         ]
    #         loop = asyncio.get_event_loop()
    #         loop.run_until_complete(asyncio.wait(tasks))
    #         time.sleep(5)
    #     except Exception as e:
    #         print('验证程序运行错误: ', e)

    def run_verify_http(self, part):
        """Verify one quarter of the HTTP pool.

        :param part: which quarter (0-3); part 3 also absorbs the remainder
        """
        stime = time.time()
        count = self.db.count(setting.REDIS_KEY_HTTP)
        # Split the pool into four equal slices; the caller presumably runs
        # the four parts in parallel workers — TODO confirm.
        start = part * (count // 4)
        stop = start + (count // 4)
        if part == 3:
            stop = count
        try:
            logger.info("{}开始验证{}-{}".format(setting.REDIS_KEY_HTTP, start,
                                             stop))
            for i in range(start, stop, setting.HTTP_VERIFY_SIZE):
                proxies = self.db.batch(setting.REDIS_KEY_HTTP, i,
                                        i + setting.HTTP_VERIFY_SIZE)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.verify_proxy(setting.REDIS_KEY_HTTP, proxy)
                    for proxy in proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
            logger.info("{}验证完成{}-{}耗时:{}".format(setting.REDIS_KEY_HTTP,
                                                  start, stop,
                                                  time.time() - stime))
        except Exception as e:
            logger.info('{}验证报错{}-{}:{}'.format(setting.REDIS_KEY_HTTP, start,
                                               stop, e))

    def run_verify_https(self):
        """Verify the entire HTTPS pool in HTTP_VERIFY_SIZE batches."""
        stime = time.time()
        try:
            logger.info("{}开始验证".format(setting.REDIS_KEY_HTTPS))
            count = self.db.count(setting.REDIS_KEY_HTTPS)
            # NOTE(review): reuses HTTP_VERIFY_SIZE for the HTTPS pool —
            # presumably intentional; confirm no separate HTTPS size exists.
            for i in range(0, count, setting.HTTP_VERIFY_SIZE):
                proxies = self.db.batch(setting.REDIS_KEY_HTTPS, i,
                                        i + setting.HTTP_VERIFY_SIZE)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.verify_proxy(setting.REDIS_KEY_HTTPS, proxy)
                    for proxy in proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
            logger.info("{}验证完成,耗时:{}".format(setting.REDIS_KEY_HTTPS,
                                              time.time() - stime))
        except Exception as e:
            logger.warning('{}验证报错:{}'.format(setting.REDIS_KEY_HTTPS, e))
def count():
    """Return the current proxy-pool size as a string."""
    return str(RedisClient().count())
from db import RedisClient
import time

db = RedisClient()
# Monitor script: print the pool size every 10 seconds, forever (stop with Ctrl-C).
while True:
    print(db.count())
    time.sleep(10)
class Tester(object):
    """Async proxy tester: probes each pooled proxy and rescores it in Redis."""

    def __init__(self):
        self.redis = RedisClient()

    # `async` marks this method as a coroutine.
    async def test_single_proxy(self, proxy):
        '''
        Test a single proxy against TEST_URL and adjust its score.
        :param proxy: proxy address as bytes or str
        :return: None
        '''
        conn = aiohttp.TCPConnector(
            verify_ssl=False)  # verify_ssl=False avoids SSL certificate errors
        async with aiohttp.ClientSession(
                connector=conn
        ) as session:  # session object holding connection state for the test
            try:
                if isinstance(proxy, bytes):  # decode bytes records from Redis
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(
                        TEST_URL, proxy=real_proxy, timeout=15,
                        allow_redirects=False
                ) as response:  # allow_redirects=False disables redirects
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)  # db.max() sets the score to 100
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)  # db.decrease() subtracts one point
                        print('请求响应码不合法', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        '''
        Main test loop: verify the whole pool, one batch at a time.
        :return: None
        '''
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):  # step of one batch
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(
                    start, stop)  # fetch up to one batch of proxies, ordered by score
                loop = asyncio.get_event_loop()  # obtain the event loop
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(
                    tasks))  # run the batch concurrently; tasks overlap while awaiting responses
                sys.stdout.flush()  # emit progress immediately, not only at exit
                time.sleep(5)
        except Exception as e:
            print('测试器发送错误', e.args)