async def test_single_proxy(self, proxy):
    """
    Check a single proxy by requesting TEST_URL through it.

    A response whose status is in VALID_STATUS_CODES promotes the proxy via
    ``self.redis.max``; any other status, or one of the caught request
    errors, demotes it via ``self.redis.decrease``.

    :param proxy: proxy address; ``bytes`` values are decoded as UTF-8 first
    :return: None
    """
    connector = aiohttp.TCPConnector(verify_ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        try:
            # The proxy string is passed to aiohttp unchanged (no scheme is prepended).
            proxy = proxy.decode('utf-8') if isinstance(proxy, bytes) else proxy
            print_messege('正在测试', proxy)
            request = session.get(TEST_URL, proxy=proxy, timeout=15, allow_redirects=False)
            async with request as response:
                status = response.status
                if status not in VALID_STATUS_CODES:
                    self.redis.decrease(proxy)
                    error_messege('请求响应码不合法 ', status, 'IP', proxy)
                else:
                    self.redis.max(proxy)
                    print_messege('代理可用', proxy)
        except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
            # Any of these failures counts against the proxy.
            self.redis.decrease(proxy)
            error_messege('代理请求失败', proxy)
def schedule_getter(self, cycle=GETTER_CYCLE):
    """
    Run the proxy getter forever, pausing between rounds.

    :param cycle: seconds to sleep after each ``Getter.run()`` call
    :return: never returns
    """
    proxy_getter = Getter()
    while True:
        print_messege('开始抓取代理')
        proxy_getter.run()
        # Blocking sleep until the next round.
        time.sleep(cycle)
def schedule_tester(self, cycle=TESTER_CYCLE):
    """
    Run the proxy tester forever, pausing between rounds.

    :param cycle: seconds to sleep after each ``Tester.run()`` call
    :return: never returns
    """
    proxy_tester = Tester()
    while True:
        print_messege('测试器开始运行')
        proxy_tester.run()
        # Blocking sleep until the next round.
        time.sleep(cycle)
def max(self, proxy):
    """
    Set the proxy's score to MAX_SCORE.

    :param proxy: proxy to promote
    :return: the value returned by ``self.db.zadd``
    """
    print_messege('代理', proxy, '可用,设置为', MAX_SCORE)
    # redis-py 3.0+ call form: scores are passed as a {member: score} mapping.
    # Older clients used zadd(REDIS_KEY, MAX_SCORE, proxy) instead.
    score_by_member = {proxy: MAX_SCORE}
    return self.db.zadd(REDIS_KEY, score_by_member)
def run(self):
    """
    Collect proxies from every registered crawl function and store them.

    Does nothing beyond the start-up message when ``self.is_over_threshold()``
    is true.

    :return: None
    """
    print_messege('获取器开始执行')
    if self.is_over_threshold():
        return

    crawler = self.crawler
    store = self.redis
    for index in range(crawler.__CrawlFuncCount__):
        crawl_func = crawler.__CrawlFunc__[index]
        # Fetch the proxies produced by this crawl function.
        found = crawler.get_proxies(crawl_func)
        sys.stdout.flush()
        for proxy in found:
            store.add(proxy)
def run(self):
    """
    Start one child process for each enabled component.

    The tester, getter and API are checked in that order; each one whose
    ``*_ENABLED`` flag is truthy gets its own ``Process`` running the
    matching ``schedule_*`` method.

    :return: None
    """
    print_messege('代理池开始运行')

    if TESTER_ENABLED:
        Process(target=self.schedule_tester).start()

    if GETTER_ENABLED:
        Process(target=self.schedule_getter).start()

    if API_ENABLED:
        Process(target=self.schedule_api).start()
def decrease(self, proxy):
    """
    Lower the proxy's score by one, or remove the proxy.

    The proxy is removed when it has no score (or a falsy one) or when its
    score is not greater than MIN_SCORE.

    :param proxy: proxy to demote
    :return: the value returned by ``self.db.zincrby`` when the score is
        decremented, otherwise the value returned by ``self.db.zrem``
    """
    score = self.db.zscore(REDIS_KEY, proxy)
    keep = bool(score) and score > MIN_SCORE

    if not keep:
        print_messege('代理', proxy, '当前分数', score, '移除')
        return self.db.zrem(REDIS_KEY, proxy)

    print_messege('代理', proxy, '当前分数', score, '减1')
    # redis-py 3.0+ argument order (amount before member).
    # Older clients used zincrby(REDIS_KEY, proxy, -1) instead.
    return self.db.zincrby(REDIS_KEY, -1, proxy)
def get_page_noverify(url, options=None):
    """
    Fetch a page with TLS certificate verification disabled.

    :param url: address of the page to download
    :param options: optional mapping of extra request headers; its entries
        override same-named keys of the module-level ``base_headers``.
        ``None`` (the default) means no extra headers.
    :return: the response body text when the status code is 200; ``None``
        for any other status code or when ``ConnectionError`` is raised
    """
    # A ``None`` default avoids sharing one mutable dict object between calls.
    headers = dict(base_headers, **(options or {}))
    print_messege('正在抓取', url)
    try:
        # verify=False triggers urllib3 warnings on every request; silence them.
        requests.packages.urllib3.disable_warnings()
        # NOTE(review): no timeout is passed, so a stalled server can block this
        # call indefinitely. Adding one would also require handling the timeout
        # exceptions raised by requests -- confirm with callers before changing.
        response = requests.get(url, headers=headers, verify=False)
        print_messege('抓取成功', url, response.status_code)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        error_messege('抓取失败', url)
        return None
def run(self):
    """
    Main entry point of the tester: check all stored proxies in batches.

    Reads the proxy count from ``self.redis``, then walks it in slices of
    BATCH_TEST_SIZE. The proxies of each slice are tested concurrently on
    the asyncio event loop with ``self.test_single_proxy``, followed by a
    5 second pause. Any exception is reported through ``error_messege``
    rather than propagated.

    :return: None
    """
    print_messege('测试器开始运行')
    try:
        count = self.redis.count()
        print_messege('当前剩余', count, '个代理')
        for start in range(0, count, BATCH_TEST_SIZE):
            stop = min(start + BATCH_TEST_SIZE, count)
            print_messege('正在测试第', start + 1, '-', stop, '个代理')
            test_proxies = self.redis.batch(start, stop)
            loop = asyncio.get_event_loop()
            # Wrap each coroutine in a Task: asyncio.wait() deprecated bare
            # coroutines in Python 3.8 and rejects them from 3.11 onwards.
            tasks = [
                loop.create_task(self.test_single_proxy(proxy))
                for proxy in test_proxies
            ]
            # asyncio.wait() raises ValueError on an empty collection, which
            # would abort all remaining batches; skip the wait instead.
            if tasks:
                loop.run_until_complete(asyncio.wait(tasks))
            sys.stdout.flush()
            time.sleep(5)
    except Exception as e:
        error_messege('测试器发生错误', e.args)
def get_proxies(self, callback):
    """
    Call the crawl method named by ``callback`` and collect its proxies.

    :param callback: name of a zero-argument method on this object that
        returns an iterable of proxies
    :return: list of the proxies it produced, in iteration order
    :raises AttributeError: if this object has no attribute named ``callback``
    """
    # Look the method up by name instead of eval()-ing a formatted string,
    # so the name is never executed as arbitrary code.
    crawl = getattr(self, callback)
    proxies = []
    for proxy in crawl():
        print_messege('成功获取到代理', proxy)
        proxies.append(proxy)
    return proxies