def __init__(self, crawler):
    print('[' + time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time())) + ']', '[init]')
    self.proxy = None  # proxy defaults to None, i.e. use the real IP
    self.db = RedisClient()
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its capacity limit.
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter is starting')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    print(proxy, type(proxy))
                    self.redis.add(proxy)
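The Getter above relies on the Crawler exposing a __CrawlFuncCount__ counter and a __CrawlFunc__ list of method names. A common way to provide these is a metaclass that collects every method whose name starts with 'crawl_'; the sketch below shows that pattern under that assumption (the crawl_example_site source is hypothetical, not from the original).

# Sketch of a Crawler whose metaclass collects crawl_* methods into
# __CrawlFunc__ / __CrawlFuncCount__, as the Getter above assumes.
class ProxyMetaclass(type):
    def __new__(mcs, name, bases, attrs):
        # Record the names of all crawl_* methods so Getter can iterate over them
        attrs['__CrawlFunc__'] = [k for k in attrs if k.startswith('crawl_')]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return type.__new__(mcs, name, bases, attrs)


class Crawler(metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        # Call one crawl_* generator by name and collect its proxies
        return list(getattr(self, callback)())

    def crawl_example_site(self):
        # Hypothetical source: yield proxies in "ip:port" form
        yield '127.0.0.1:8888'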
def control_add_proxy():
    proxy_min = PROXY_MIN
    conn = RedisClient()
    addr = PoolAdder()
    cycle = CYCLE_TIME
    while True:
        if conn.count() < PROXY_MAX:
            addr.add_to_redis()
        time.sleep(cycle)
def valid_proxies(cycle=VALID_CHECK_CYCLE):
    conn = RedisClient()
    tester = ValidityTester()
    while True:
        print("check valid proxies")
        count = int(conn.zset_len / 2)
        if count == 0:
            print("add new valid proxies")
            time.sleep(cycle)
            continue
        proxies = conn.get(count)
        tester.set_raw_proxies(proxies)
        tester.test()
        time.sleep(cycle)
def control_test_proxy():
    conn = RedisClient()
    cycle = CYCLE_TIME
    tester = ValidityTester()
    while True:
        print('Refreshing ip')
        count = int(0.5 * conn.count())
        if count == 0:
            print('Waiting for adding')
            time.sleep(cycle)
            continue
        raw_proxies = conn.get(count)
        tester.set_test_procies(raw_proxies)
        tester.test()
        time.sleep(cycle)
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is usable', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status code', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        """
        Main test routine.
        :return:
        """
        print('Tester is starting')
        try:
            count = self.redis.count()
            print('Proxies left in pool:', count)
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
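The Tester assumes a RedisClient that stores proxies in a sorted set and uses the score to model availability (add, max, decrease, count, batch). A minimal sketch of that interface follows; the key name, score values, and connection parameters are assumptions, not the original db module.

# Minimal sketch of the score-based RedisClient interface Tester relies on.
import redis

MAX_SCORE, MIN_SCORE, INITIAL_SCORE = 100, 0, 10
REDIS_KEY = 'proxies'


class RedisClient(object):
    def __init__(self, host='localhost', port=6379, password=None):
        self.db = redis.StrictRedis(host=host, port=port, password=password,
                                    decode_responses=True)

    def add(self, proxy, score=INITIAL_SCORE):
        # Only add proxies that are not already in the pool
        if not self.db.zscore(REDIS_KEY, proxy):
            return self.db.zadd(REDIS_KEY, {proxy: score})

    def max(self, proxy):
        # A successful test promotes the proxy to the highest score
        return self.db.zadd(REDIS_KEY, {proxy: MAX_SCORE})

    def decrease(self, proxy):
        # A failed test lowers the score; drop the proxy once it reaches the minimum
        score = self.db.zincrby(REDIS_KEY, -1, proxy)
        if score <= MIN_SCORE:
            self.db.zrem(REDIS_KEY, proxy)

    def count(self):
        return self.db.zcard(REDIS_KEY)

    def batch(self, start, stop):
        # Slice of proxies ordered by score, consumed by Tester.run in batches
        return self.db.zrevrange(REDIS_KEY, start, stop - 1)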
def run(self):
    print('Ip processing running')
    RedisClient().flush()
    valid_process = Process(target=Schedule.valid_proxies)
    check_process = Process(target=Schedule.check_pool)
    valid_process.start()
    check_process.start()
class PoolAdder(object):
    def __init__(self):
        self.proxy_max = PROXY_MAX
        self.tester = ValidityTester()
        self.getter = FreeProxyGetter()
        self.conn = RedisClient()

    def is_picture(self):
        if self.conn.count() < self.proxy_max:
            return True
        return False

    def add_to_redis(self):
        proxy_count = 0
        while self.is_picture():
            for callback_label in range(self.getter.__CrawlFuncCount__):
                callback = self.getter.__CrawlFunc__[callback_label]
                raw_proxies = self.getter.get_raw_proxies(callback)
                self.tester.set_test_procies(raw_proxies)
                self.tester.test()
                proxy_count += len(raw_proxies)
                if not self.is_picture():
                    print('Proxy pool is full')
                    break
            if proxy_count == 0:
                print('No proxies could be fetched')
                break
def check_pool(cycle=POOL_LEN_CHECK_CYCLE,
               min=POOL_LOWER_THRESHOLD,
               max=POOL_UPPER_THRESHOLD):
    conn = RedisClient()
    adder = ProxyPoolAdder(max)
    while True:
        if conn.zset_len <= min:
            adder.add_to_zset()
        time.sleep(cycle)
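These control loops are driven by a handful of constants imported from a settings module. The sketch below shows plausible values; every number here is illustrative, not taken from the original configuration.

# Illustrative settings for the scheduler loops above (values are assumptions).
POOL_LOWER_THRESHOLD = 20        # refill the pool when it drops below this size
POOL_UPPER_THRESHOLD = 10000     # stop crawling new proxies above this size
POOL_LEN_CHECK_CYCLE = 20        # seconds between pool-size checks
VALID_CHECK_CYCLE = 600          # seconds between validity-check rounds
CYCLE_TIME = 30                  # generic sleep interval for the control loops
PROXY_MIN, PROXY_MAX = 30, 500   # bounds used by control_add_proxy / PoolAdder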
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        return False

    def run(self):
        print('Getter is starting')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
class ValidityTester():
    test_api = TEST_API
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5702.400 QQBrowser/10.2.1893.400'
    }

    def __init__(self):
        self.test_proxies = []

    def set_test_procies(self, proxies):
        self.test_proxies = proxies
        self.conn = RedisClient()

    def test_single_proxy(self, proxy):
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = 'http://' + proxy
            r = requests.get(url=self.test_api,
                             proxies={'http': real_proxy},
                             headers=self.header)
            if r.status_code == 200:
                self.conn.put(proxy)
                print('Validation succeeded:', proxy)
            else:
                print('Validation failed:', proxy)
        except Exception:
            print('Request failed:', proxy)

    def test(self):
        try:
            test_proxy_pool = threadpool.ThreadPool(5)
            test_proxy_requests = threadpool.makeRequests(
                self.test_single_proxy, self.test_proxies)
            for tpr in test_proxy_requests:
                test_proxy_pool.putRequest(tpr)
            test_proxy_pool.wait()
            self.test_proxies = []
        except Exception:
            self.test()
class QichachaProxyMiddleware(object):
    def __init__(self):
        self.proxyData = RedisClient()
        self.logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        if request.meta.get('retry_times'):
            proxy = self.proxyData.random_proxy()
            if proxy:
                uri = 'https://{}'.format(proxy)
                self.logger.debug('Using proxy ' + proxy)
                request.meta['proxy'] = uri
                request.meta['download_timeout'] = 20
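This middleware only switches to a proxy once retry_times appears in request.meta, so it has to run alongside Scrapy's built-in RetryMiddleware. A typical activation in settings.py could look like the sketch below; the module path and priority are assumptions.

# Hypothetical settings.py fragment enabling the middleware above.
DOWNLOADER_MIDDLEWARES = {
    'qichacha.middlewares.QichachaProxyMiddleware': 543,
}
RETRY_ENABLED = True
RETRY_TIMES = 5  # retried requests carry retry_times in meta, which triggers the proxy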
class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._available_proxies = []
        self._conn = RedisClient()
        self.logger = logging.getLogger(__name__)

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies

    async def test_one_proxy(self, proxy):
        async with aiohttp.ClientSession() as session:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = "http://" + proxy
            try:
                async with session.get(ValidityTester.test_api, proxy=real_proxy,
                                       timeout=10) as rep:
                    if rep.status == 200:
                        self._conn.add(proxy)
                        self.logger.info("valid proxy:" + proxy)
            except Exception as e:
                self.logger.error("invalid proxy:" + proxy)

    def test(self):
        """
        aio test all proxies.
        """
        print('ValidityTester is working')
        try:
            loop = asyncio.get_event_loop()
            tasks = [self.test_one_proxy(proxy) for proxy in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError:
            self.logger.error('Async Error')
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy: a single proxy
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = "http://" + proxy
                print("Testing", proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=10) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                    else:
                        self.redis.decrease(proxy)
                        print("Invalid response status code", proxy)
            except (
                ClientError,
                ProxyConnectionError,
                ServerConnectionError,
                TimeoutError,
                AttributeError,
            ):
                self.redis.decrease(proxy)
                print("Proxy request failed", proxy)
            except Exception as e:
                print(e)

    def run(self):
        """
        Main test routine.
        :return:
        """
        print("Tester is starting")
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            # Test in batches
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[i:i + BATCH_TEST_SIZE]
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print("Tester error", e.args)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Compare to the capacity of proxy pool.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        logger.debug('Getter is running.')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is usable', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status code', proxy)
            except (ClientError, ClientConnectionError, TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        print('Tester is starting')
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[i:i + BATCH_TEST_SIZE]
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
def __init__(self):
    self.redis = RedisClient()
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
def set_test_procies(self, proxies):
    self.test_proxies = proxies
    self.conn = RedisClient()
def __init__(self):
    self.proxy_max = PROXY_MAX
    self.tester = ValidityTester()
    self.getter = FreeProxyGetter()
    self.conn = RedisClient()
class SuperspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self, crawler):
        print('[' + time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time())) + ']',
              '[init]')
        self.proxy = None  # proxy defaults to None, i.e. use the real IP
        self.db = RedisClient()

    def infoprint(self, message):
        print('[' + time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time())) +
              '][Info]' + message)

    def get_proxy(self):
        """Fetch a random proxy."""
        try:
            proxy = self.db.get_random().decode('utf-8')
            return "http://{}".format(proxy)
        except Exception as e:
            print('[' + time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time())) + ']',
                  '[Failed to fetch a proxy!]')
            return None

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls(crawler)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        """Handle the request."""
        self.infoprint('[Retry?]: ' + ('yes' if request.meta.get('retry') else 'no'))
        old_proxy = request.meta.get('proxy')
        if self.proxy is None or old_proxy is None or self.proxy == request.meta.get('proxy'):
            # The request is being retried, switch to another proxy
            proxy = self.get_proxy()
            self.infoprint('Switching proxy to: {}'.format(proxy))
            if proxy:
                self.proxy = proxy
        request.meta['proxy'] = self.proxy
        spider.logger.info('[request] ' + self.proxy + ' URL: ' + request.url)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        """Handle the response."""
        if response.status != 200:
            if response.status == 302:
                self.infoprint('[Blocked]: ' + self.proxy)
                spider.logger.warning('[Blocked]: ' + self.proxy)
            elif response.status == 404:
                self.infoprint('[Not found]: ' + self.proxy + ' URL: ' + request.url)
                spider.logger.warning('[Not found]: ' + self.proxy + ' URL: ' + request.url)
            else:
                self.infoprint('[Unknown] ' + self.proxy + ' ' + str(response.status) +
                               ' URL: ' + request.url)
                spider.logger.warning('[Unknown] ' + self.proxy + ' ' + str(response.status) +
                                      ' URL: ' + request.url)
            return self.get_retry_request(request)
        elif '用户访问安全认证' in response.text:  # the site's security-check page (Chinese marker text)
            self.infoprint('[Security check encountered] ' + response.url)
            spider.logger.warning('[Security check encountered] ' + response.url)
            return self.get_retry_request(request)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        try:
            oserror = str(exception.osError)
            if oserror == "10060" or oserror == "10061":
                self.infoprint('[exception] ' + request.url + ' ' + str(exception.args))
                spider.logger.error('[exception] ' + request.url + ' ' + str(exception.args))
            else:
                self.infoprint('[exception] ' + request.url + ' ' + str(exception.osError))
                spider.logger.error('[exception] ' + request.url + ' ' + str(exception.osError))
        except Exception:
            try:
                self.infoprint('[exception] ' + request.url + ' ' + str(exception))
                spider.logger.error('[exception] ' + request.url + ' ' + str(exception))
            except Exception:
                pass
        self.infoprint('[Request error] retrying')
        spider.logger.info('[Request error] retrying')
        # Retry
        return self.get_retry_request(request)

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def get_retry_request(self, request):
        """Build a request to retry."""
        try:
            self.proxy = None  # reset the proxy
            retry_request = request.copy()
            retry_request.dont_filter = True  # skip the duplicate filter
            retry_request.meta['retry'] = time.time()
            return retry_request
        except Exception as e:
            self.infoprint('[get_retry_request][Failed to build retry request] ' + str(e))
            return None
def get_conn():
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    return g.redis
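get_conn caches a RedisClient on Flask's application-context object g. A minimal API endpoint built on top of it could look like the sketch below; the route name and host/port are assumptions.

# Minimal sketch of a Flask API using get_conn() above.
from flask import Flask, g  # g is what get_conn() stores the client on

app = Flask(__name__)


@app.route('/random')
def get_random_proxy():
    """Return one usable proxy as plain text."""
    conn = get_conn()
    return conn.random()


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5555)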
from ProxyPool.db import RedisClient

conn = RedisClient()


def set(proxy):
    result = conn.add(proxy)
    print(proxy)
    print('Added successfully' if result else 'Failed to add')


def scan():
    print('Enter proxies; type exit to stop reading input')
    while True:
        proxy = input()
        if proxy == 'exit':
            break
        set(proxy)


if __name__ == '__main__':
    scan()
def __init__(self, threshold):
    self._conn = RedisClient()
    self._crawl = FreeProxyGetter()
    self._tester = ValidityTester()
    self._threshold = threshold
    self.logger = logging.getLogger(__name__)
class Tester:
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test the availability of a proxy, maximizing its score if it is
        available and decreasing its score otherwise.
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                # The URL must include the scheme; browsers default to http when it is omitted
                real_proxy = 'http://' + proxy
                logger.debug('Testing {}'.format(proxy))
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        logger.debug('Proxy {} is OK'.format(proxy))
                        self.redis.maximize(proxy)
                    else:
                        logger.warning(
                            'Failed to use proxy {} because the response code was {}'
                            .format(proxy, response.status))
                        self.redis.decrease(proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError) as e:
                self.redis.decrease(proxy)
                logger.warning('Failed to use proxy {} because of {}'.format(proxy, repr(e)))

    def run(self):
        """
        Run tester.
        :return:
        """
        logger.debug('Tester is running.')
        try:
            count = self.redis.count()
            logger.info('There are {} proxy (proxies) in proxy pool now.'.format(count))
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                # Test in batches to avoid excessive memory usage
                logger.debug('Testing proxies with index between {} and {}.'.format(start + 1, stop))
                test_proxies = self.redis.batch(start, stop)
                # Test asynchronously to speed things up
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
            logger.info('Testing finished')
        except Exception as e:
            logger.warning('Tester error {}'.format(e.args))
def __init__(self):
    self.proxyData = RedisClient()
    self.logger = logging.getLogger(__name__)
def __init__(self):
    self._raw_proxies = None
    self._available_proxies = []
    self._conn = RedisClient()
    self.logger = logging.getLogger(__name__)
def get_proxy():
    cli = RedisClient()
    return cli.random()
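A caller would typically plug the returned "ip:port" string into the proxies argument of requests; a small usage sketch follows, with the target URL purely illustrative.

# Usage sketch: route a request through a proxy taken from the pool.
import requests

proxy = get_proxy()  # e.g. '127.0.0.1:8888'
proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
response = requests.get('http://httpbin.org/get', proxies=proxies, timeout=10)
print(response.status_code)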