class ValidateTester(object):
    """Validate raw proxies asynchronously and push working ones back to Redis."""

    def __init__(self):
        # Proxies that passed validation (kept for interface compatibility;
        # Redis is the authoritative store).
        self.usable_proxies = []

    def set_raw_proxies(self, proxies):
        """Set the batch of raw proxies to test and (re)open a Redis client."""
        self._raw_proxies = proxies
        self._conn = RedisClient()

    async def test_proxy(self, proxy):
        """Asynchronously check whether ``proxy`` (``IP:PORT``) is usable.

        A proxy is considered valid when a GET to TEST_IP routed through it
        returns HTTP 200 within TEST_PROXY_TIMEOUT; valid proxies are pushed
        back into Redis via ``self._conn.put``.
        """
        real_proxy = 'http://' + proxy
        print(real_proxy)
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(TEST_IP, proxy=real_proxy,
                                       timeout=TEST_PROXY_TIMEOUT) as response:
                    if response.status == 200:
                        self._conn.put(proxy)
            # BUG FIX: was a bare ``except:`` which also swallows
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            except Exception:
                # Timeout or connection error — treat the proxy as dead.
                print('%s代理失效' % proxy)

    def test(self):
        """Validate the whole batch previously set via set_raw_proxies()."""
        print('测试代理是否有效开始')
        loop = asyncio.get_event_loop()
        # Each proxy is IP:PORT; all checks run concurrently on the loop.
        tasks = [self.test_proxy(proxy) for proxy in self._raw_proxies]
        loop.run_until_complete(asyncio.wait(tasks))
class Tester(object):
    """Batch-test proxies stored in Redis and adjust their scores."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy against TEST_URL.

        Usable proxies (status in VALID_STATUS_CODES) get their score maxed;
        failing ones get their score decreased.
        :param proxy: proxy as ``IP:PORT`` (str or bytes).
        :return: None
        """
        # Certificate verification is disabled: many free proxies present
        # invalid certificates.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Redis may hand back bytes; normalize to str before use.
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            # FIX: ClientConnectorError is a subclass of ClientError, so the
            # original's separate entry for it was redundant.
            except (ClientError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Main test loop: walk all stored proxies in batches of
        BATCH_TEST_SIZE, testing each batch concurrently.
        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            # The event loop is invariant across batches; fetch it once
            # instead of on every iteration.
            loop = asyncio.get_event_loop()
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                tasks = [self.test_single_proxy(p) for p in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
def valid_proxies(cycle=VALID_CHECK_CYCLE):
    """Periodically validate the front half of the proxy queue.

    Every ``cycle`` seconds, pull the first half of the proxies stored in
    Redis and run them through a ValidateTester.
    """
    conn = RedisClient()
    tester = ValidateTester()
    while True:
        # Re-test the front half of the queue on each pass.
        half = int(conn.queue_len * 0.5)
        batch = conn.get(half)
        tester.set_raw_proxies(batch)
        tester.test()
        time.sleep(cycle)
def cookie_test(id):
    """
    Fetch a stored Taobao cookie by task id, crawl the account's data with
    it, and write the aggregated result back into Redis keyed by user.
    :return: the crawled user-info section
    """
    cookie_conn = RedisClient('taobao', 'cookie')
    raw_cookies = cookie_conn.result(id)
    # SECURITY NOTE(review): eval() on data read from Redis executes
    # arbitrary code if the store is ever tainted — consider
    # ast.literal_eval instead.
    parsed = eval(raw_cookies)
    cookie = cookies2cookie(parsed)
    user = get_user(cookie)
    comment = get_comment(cookie)
    info = get_info(cookie)
    collect = get_collect(cookie)
    order = get_order(cookie)
    like = guess_you_like(cookie)
    data = {
        'id': user,
        '用户信息': info,
        '信誉明细': comment,
        '购物数据': order,
        '收藏信息': collect,
        '猜你喜欢': like,
    }
    result_conn = RedisClient('taobao', 'result')
    result_conn.set(user, data)
    return info
class Getter(object):
    """Fetch new proxies from all registered crawl sites into the pool."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool already holds at least
        POOL_UPPER_THRESHOLD proxies."""
        # FIX: direct comparison instead of the if/else returning
        # True/False literals.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Crawl every site listed by the crawler and add each harvested
        proxy to the pool, unless the pool is already full."""
        print('获取器开始执行')
        if not self.is_over_threshold():
            for site_name in self.crawler.__CrawlName__:
                proxies = self.crawler.get_raw_proxies(site_name)
                sys.stdout.flush()
                for item in proxies:
                    self.redis.add(item)
def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
               upper_threshold=POOL_UPPER_THRESHOLD,
               cycle=POOL_CHECK_CYCLE):
    """Keep the proxy pool topped up.

    Every ``cycle`` seconds, check the Redis queue length; when it drops
    below ``lower_threshold``, run a crawl via PoolAdder (which enforces
    ``upper_threshold`` itself).
    """
    conn = RedisClient()
    print('1')
    adder = PoolAdder(upper_threshold)
    while True:
        if conn.queue_len < lower_threshold:
            adder.add_to_queue()
        time.sleep(cycle)
def qrcode(task_id):
    """
    Look up the login QR code stored under a task id.
    :return: JSON response carrying the QR code, or a 404-style payload
             when no record exists.
    """
    conn = RedisClient('taobao', 'qrcode')
    result = conn.result(task_id)
    # BUG FIX: the original ran eval(result) BEFORE the None check, so a
    # missing id raised TypeError and the 404 branch was unreachable.
    if result is not None:
        # SECURITY NOTE(review): eval() on stored data is dangerous if the
        # store is ever tainted — consider ast.literal_eval.
        result = eval(result)
        response = {
            "resultcode": "200",
            "reason": "二维码获取成功",
            "result": result
        }
        return jsonify(response)
    else:
        response = {
            "resultcode": "404",
            "reason": "二维码获取失败",
            "result": None
        }
        return jsonify(response)
def browser():
    """Open a browser session, publish a login QR code, wait for the user
    to log in, then persist the cookie and enqueue the cookie_test task."""
    driver = Browser()
    qr = driver.create_qrcode()
    qrcode_conn = RedisClient('taobao', 'qrcode')
    task_uuid = uuid.uuid1().hex
    payload = {'uuid': task_uuid, 'qrcode': qr}
    # NOTE(review): the QR payload is keyed by browser.request.id
    # (presumably this celery task's own request id) while the cookie
    # below is keyed by the fresh uuid — confirm both keys are intended.
    qrcode_conn.set(browser.request.id, payload)
    driver.verify_login()
    if driver.login_signal == 1:
        cookie = driver.get_cookie()
        cookie_conn = RedisClient('taobao', 'cookie')
        cookie_conn.set(task_uuid, cookie)
        celery.send_task('app.tasks.cookie_test', args=(task_uuid, ))
        return True
    if driver.login_signal == 0:
        print('任务失败')
        return '任务失败'
def find_user_info(user):
    """Look up a previously crawled Taobao result by user id and return it
    wrapped in a JSON response; on any failure, return a 404-style payload."""
    user = str(user)
    print(user)
    print(type(user))
    try:
        conn = RedisClient('taobao', 'result')
        raw = conn.result(user)
        # SECURITY NOTE(review): eval() on stored data — consider
        # ast.literal_eval instead.
        result = eval(raw)
        print(type(result))
        response = {
            "resultcode": "200",
            "reason": "查询成功",
            "result": result
        }
        return jsonify(response)
    except Exception as e:
        print('查询失败', e)
        response = {
            "resultcode": "404",
            "reason": "查询失败",
            "result": None
        }
        return jsonify(response)
def get_redis_conn():
    """Return the RedisClient cached on Flask's context object ``g``,
    creating and caching it on first use."""
    # EAFP equivalent of the hasattr() guard: a missing attribute raises
    # AttributeError exactly when hasattr() would return False.
    try:
        return g.redis_client
    except AttributeError:
        g.redis_client = RedisClient()
        return g.redis_client
def __init__(self, threshold):
    # Upper bound on how many proxies the pool may hold.
    self._threshold = threshold
    # Redis connection used to store proxies.
    self._conn = RedisClient()
    # Source of raw proxies (presumably scrapes free-proxy sites — confirm).
    self._crawl = FreeProxyGetter()
    # Validator used to check proxies before they enter the pool.
    self._tester = ValidateTester()
def set_raw_proxies(self, proxies):
    """Store the batch of proxies to validate and open a fresh Redis client."""
    self._raw_proxies = proxies
    self._conn = RedisClient()
def __init__(self):
    """Set up the Redis connection and the proxy crawler."""
    self.redis = RedisClient()
    self.crawler = Crawler()
def __init__(self):
    # Redis connection used by all methods of this class.
    self.redis = RedisClient()