class GetProxy:
    """Serve proxies out of the ``PROXY_FOR_USE`` sorted set in Redis."""

    def __init__(self):
        self.redis = RedisClient()

    def clear_old_key(self):
        """Remove every member of ``PROXY_FOR_USE`` when it is non-empty."""
        existing = self.redis.count('-inf', '+inf', name=PROXY_FOR_USE)
        if existing <= 0:
            return
        # Rank range 0..-1 spans the whole sorted set.
        self.redis.db.zremrangebyrank(PROXY_FOR_USE, 0, -1)

    def init_redis_key(self):
        """Reset ``PROXY_FOR_USE`` and refill it from ``PROXY_VALIDATED``.

        The old contents are cleared first; the union store is only issued
        when the count below reports at least one entry.
        """
        self.clear_old_key()
        # NOTE(review): no ``name`` is passed, so this counts whatever key
        # RedisClient.count defaults to -- confirm that is the intended set.
        available = self.redis.count('-inf', '+inf')
        if available > 0:
            self.redis.db.zunionstore(PROXY_FOR_USE, [PROXY_VALIDATED])

    def get_proxy(self):
        """Return one proxy address taken from ``PROXY_FOR_USE``.

        The first entry returned for the full score range is split on ``-``
        and its second field is returned.

        Raises:
            Exception: if ``PROXY_FOR_USE`` holds no entries.
        """
        lowest, highest = '-inf', '+inf'
        if self.redis.count(lowest, highest, name=PROXY_FOR_USE) <= 0:
            raise Exception('no proxy to use')
        candidates = self.redis.get_proxy_by_score(
            lowest, highest, 1, key=PROXY_FOR_USE)
        first_entry = candidates[0]
        return first_entry.split('-')[1]
class ProxyCheck(Utility):
    """Re-validate the ``PROXY_VALIDATED`` pool and top it up when it runs low."""

    # Upper bound on how many members are rewritten per ZADD command.
    _RESET_BATCH_SIZE = 1000

    def __init__(self):
        self.redis = RedisClient()
        self.valid = IpValidation()

    def check_num(self):
        """Call ``run()`` when too few proxies sit at ``VALIDATED_SCORE``.

        Only members of ``PROXY_VALIDATED`` whose score equals
        ``VALIDATED_SCORE`` are counted; ``run()`` is invoked when that count
        is below ``VALIDATED_PROXY_NUM``.
        """
        valid_num = self.redis.count(
            VALIDATED_SCORE, VALIDATED_SCORE, name=PROXY_VALIDATED)
        if valid_num < VALIDATED_PROXY_NUM:
            run()

    def init_score(self):
        """Reset every ``PROXY_VALIDATED`` member scored >= ``DISCARD_SCORE``.

        Each such member has its score set back to ``INITIAL_SCORE``.

        The previous implementation advanced the *score* lower bound by 1000
        per iteration as if it were a paging offset, and read members without
        naming the key, so only the first 1000 entries of the client's
        default key were ever reset. Members are now read from
        ``PROXY_VALIDATED`` in a single query sized by the current count
        (score-based paging cannot work here, because reset members stay
        inside the queried score range) and rewritten in bounded chunks.
        """
        start, end = DISCARD_SCORE, '+inf'
        length = self.redis.count(start, end, name=PROXY_VALIDATED)
        if length > 0:
            members = list(self.redis.get_proxy_by_score(
                start, end, length, key=PROXY_VALIDATED) or ())
            step = self._RESET_BATCH_SIZE
            for offset in range(0, len(members), step):
                chunk = members[offset:offset + step]
                self.redis.db.zadd(
                    PROXY_VALIDATED, {ip: INITIAL_SCORE for ip in chunk})
        logger.info('initiation finished')

    def check_valid(self):
        """Re-check the validated pool end to end.

        Clears ``settings.SPIDER_RUNNING``, resets scores, runs the validator
        against ``PROXY_VALIDATED`` and finally checks the pool size.
        """
        settings.SPIDER_RUNNING = False
        self.init_score()
        self.valid.run_validation(key=PROXY_VALIDATED)
        self.check_num()
class IpValidation(Utility):
    """Score proxies held in a Redis sorted set by testing them over HTTP."""

    def __init__(self):
        self.redis = RedisClient()
        self.real_ip = ''
        # Score deducted each time a validation attempt fails.
        self.minus_every_time = (INITIAL_SCORE - DISCARD_SCORE) // VALIDATE_TIME
        self.key = PROXY_ORIGINAL
        self.anon_check_url = 'http://httpbin.org/ip'

    @staticmethod
    async def is_proxy_valid(proxy, url=TEST_URL):
        """Return True when ``url`` answers with a 2xx status through ``proxy``.

        Args:
            proxy: proxy URL handed to aiohttp's ``proxy`` argument.
            url: address to request; defaults to ``TEST_URL``.

        Returns:
            True for a 2xx response, False for any other status or when the
            request raises (the error is logged, never propagated).
        """
        headers = {'User-Agent': get_random_ua()}
        try:
            # ``ssl=False`` replaces the deprecated ``verify_ssl=False``.
            conn = aiohttp.TCPConnector(ssl=False)
            async with aiohttp.ClientSession(headers=headers,
                                             connector=conn) as session:
                async with session.get(url, proxy=proxy, ssl=False) as resp:
                    code = resp.status
                    if 200 <= code < 300:
                        logger.info('%s is valid' % proxy)
                        return True
                    logger.info('%s is invalid, code: %s' % (proxy, code))
                    return False
        # CancelledError is listed explicitly because it is not an Exception
        # subclass on Python 3.8+; every other error the original named is
        # already covered by Exception.
        except (CancelledError, Exception) as e:
            logger.warning(e)
            return False

    async def is_high_anon(self, proxy):
        """Return True when the echo endpoint does not reveal ``self.real_ip``.

        ``ANON_CHECK_URL`` is requested through ``proxy``. The reported
        address is read from the ``origin`` field when ``self.anon_check_url``
        equals ``ANON_CHECK_URL``, otherwise from ``X-Forwarded-For``.

        Returns:
            True if the response is 2xx and ``self.real_ip`` is absent from
            the reported address; False on a non-2xx status, when the real IP
            is present, or when the request raises (the error is logged).
        """
        url = ANON_CHECK_URL
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, proxy=proxy, ssl=False,
                                       timeout=15) as resp:
                    code = resp.status
                    if not 200 <= code < 300:
                        return False
                    payload = await resp.json()
                    if self.anon_check_url == ANON_CHECK_URL:
                        x_forwarded_for = payload['origin']
                    else:
                        # Adjust this key to match your own endpoint's
                        # response format.
                        x_forwarded_for = payload['X-Forwarded-For']
                    return self.real_ip not in x_forwarded_for
        # See is_proxy_valid for why CancelledError is named separately.
        except (CancelledError, Exception) as e:
            logger.warning('proxy: %s, %s' % (proxy, e))
            return False

    async def test_proxy(self, proxy):
        """Test one stored entry and adjust its score in ``self.key``.

        The second ``-``-separated field of ``proxy`` is used as the address
        (with ``https://`` rewritten to ``http://``). A passing anonymity
        check adds 1 to the score; a failing one subtracts
        ``self.minus_every_time``.
        """
        try:
            address = proxy.split('-')[1]
            # NOTE(review): entries whose address is one character or shorter
            # are never scored, so they stay inside the range polled by
            # run_validation -- confirm such entries cannot occur.
            if len(address) > 1:
                target = address.replace('https://', 'http://')
                if not await self.is_high_anon(target):
                    self.redis.adjust_score(
                        proxy, -self.minus_every_time, key=self.key)
                else:
                    self.redis.adjust_score(proxy, +1, key=self.key)
        except CancelledError as e:
            logger.warning('proxy: %s, %s' % (proxy, e))

    def get_real_ip(self):
        """Fetch this machine's public IP and store it in ``self.real_ip``."""
        # NOTE(review): no timeout is set, so this call can block
        # indefinitely if the endpoint never answers.
        resp = requests.get(ANON_CHECK_URL)
        payload = resp.json()
        if self.anon_check_url == ANON_CHECK_URL:
            # Keep only the first comma-separated address.
            self.real_ip = payload['origin'].split(',')[0]
        else:
            self.real_ip = payload['X-Real-Ip']

    async def _test_batch(self, proxy_list):
        """Await test_proxy for every entry concurrently, logging failures."""
        outcomes = await asyncio.gather(
            *(self.test_proxy(proxy) for proxy in proxy_list),
            return_exceptions=True)
        for proxy, outcome in zip(proxy_list, outcomes):
            if isinstance(outcome, BaseException):
                logger.warning('proxy: %s, %s' % (proxy, outcome))

    def _run_batch(self, proxy_list):
        """Run one batch of proxy tests on a dedicated, properly closed loop."""
        proxy_list = list(proxy_list or ())
        if not proxy_list:
            # Nothing fetched; the old asyncio.wait([]) call raised here.
            return
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(self._test_batch(proxy_list))
        finally:
            # Close the loop so its selector and sockets are released, and
            # never leave a closed loop registered as the current one.
            loop.close()
            asyncio.set_event_loop(None)

    def run_validation(self, key=None):
        """Validate every pending proxy in ``self.key`` until none remain.

        Pending means a score between ``DISCARD_SCORE + 1`` and
        ``INITIAL_SCORE``. Entries are fetched and tested in batches of at
        most ``CONCURRENCY_TASK_LIMIT``. The loop ends once nothing is
        pending and ``settings.SPIDER_RUNNING`` is false; it then sets that
        flag to True and restores ``self.key`` to ``PROXY_ORIGINAL``.

        Args:
            key: optional sorted-set name to validate instead of the
                current ``self.key``.
        """
        import settings

        if key:
            self.key = key
        logger.info('start checking...')
        start, end = DISCARD_SCORE + 1, INITIAL_SCORE
        while True:
            proxy_unvalidated = self.redis.count(start, end, name=self.key)
            if proxy_unvalidated:
                logger.info('checking...')
                # The trailing batch is the remainder; the earlier code
                # reused the quotient there, so its size was wrong.
                full_batches, remainder = divmod(
                    proxy_unvalidated, CONCURRENCY_TASK_LIMIT)
                batch_sizes = [CONCURRENCY_TASK_LIMIT] * full_batches
                if remainder:
                    batch_sizes.append(remainder)
                for size in batch_sizes:
                    self.get_real_ip()
                    proxy_list = self.redis.get_proxy_by_score(
                        start, end, size, key=self.key)
                    self._run_batch(proxy_list)
            # NOTE(review): while the spider is still running and nothing is
            # pending, this loop polls Redis again with no pause.
            if not proxy_unvalidated and not settings.SPIDER_RUNNING:
                settings.SPIDER_RUNNING = True
                self.key = PROXY_ORIGINAL
                logger.info('scrawl finished,all proxies check finished')
                break