class PageCookiePool(CookiePoolInterface):
    """
    Pool of cookies produced by loading a page; no user login is required.
    """

    def __init__(
        self,
        redis_key,
        page_url=None,
        min_cookies=10000,
        must_contained_keys=(),
        keep_alive=False,
        **kwargs,
    ):
        """
        @param redis_key: project name, used as the prefix of the redis keys
        @param page_url: url that is visited to produce a cookie
        @param min_cookies: minimum number of cookies to keep in the pool
        @param must_contained_keys: keys that a produced cookie must contain
        @param keep_alive: once the pool holds enough cookies, whether to stay
            on standby and keep producing. False means exit when satisfied
        ---
        @param kwargs: options forwarded to WebDriver
            load_images: whether to load images
            user_agent_pool: user-agent pool; not used when None
            proxies_pool: proxy pool; not used when None
            headless: whether to enable headless mode
            driver_type: web driver type
            timeout: request timeout, 16s by default
            window_size: screen resolution (width, height)
        """
        # Defaults applied only when the caller did not specify them.
        kwargs.setdefault("load_images", False)
        kwargs.setdefault("headless", True)

        self._page_url = page_url
        self._min_cookies = min_cookies
        self._must_contained_keys = must_contained_keys
        self._keep_alive = keep_alive
        self._kwargs = kwargs

        self._redisdb = RedisDB()
        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
        # Stores when the cookie count was last sampled, formatted as
        # "timestamp:count".
        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
            redis_key
        )

    def create_cookie(self):
        """
        Visit the page and collect its cookies. Subclasses may override this.
        @return: dict of cookie name -> value, or None when a required key
            is missing
        """
        with WebDriver(**self._kwargs) as driver:
            driver.get(self._page_url)
            cookies_json = {
                item["name"]: item["value"] for item in driver.get_cookies()
            }

            has_required_keys = all(
                key in cookies_json for key in self._must_contained_keys
            )
            if has_required_keys:
                return cookies_json

            log.error("获取cookie失败 cookies = {}".format(cookies_json))
            return None

    def add_cookies(self, cookies):
        """
        Push one cookie dict onto the pool list.
        @param cookies: cookie dict to store
        """
        log.info("添加cookie {}".format(cookies))
        self._redisdb.lpush(self._tab_cookie_pool, cookies)

    def run(self):
        """
        Produce cookies until the pool holds at least min_cookies.

        Once the pool is full the loop exits, unless keep_alive is set, in
        which case it re-checks every 10 seconds and exits only when the pool
        size has not changed for more than a minute.
        """
        while True:
            try:
                current_count = self._redisdb.lget_count(self._tab_cookie_pool)
                shortfall = self._min_cookies - current_count

                if shortfall > 0:
                    log.info(
                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
                            current_count, self._min_cookies
                        )
                    )
                    try:
                        produced = self.create_cookie()
                        if produced:
                            self.add_cookies(produced)
                    except Exception as e:
                        log.exception(e)
                    # Re-check the pool size straight away.
                    continue

                log.info("当前cookie数为 {} 数量足够 暂不生产".format(current_count))

                # If the pool size has not changed within the last minute the
                # crawler is considered to be no longer consuming; exit.
                snapshot = self._redisdb.strget(self._tab_cookie_pool_last_count)
                if snapshot:
                    stamp_text, count_text = snapshot.split(":")
                    sampled_at = float(stamp_text)
                    sampled_count = int(count_text)

                    if time.time() - sampled_at > 60:
                        if current_count == sampled_count:
                            log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
                            break
                        self._redisdb.strset(
                            self._tab_cookie_pool_last_count,
                            "{}:{}".format(time.time(), current_count),
                        )
                else:
                    # No sample recorded yet: record the first one.
                    self._redisdb.strset(
                        self._tab_cookie_pool_last_count,
                        "{}:{}".format(time.time(), current_count),
                    )

                if not self._keep_alive:
                    break

                log.info("sleep 10")
                tools.delay_time(10)

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def get_cookie(self, wait_when_null=True):
        """
        Take a cookie from the pool.
        @param wait_when_null: when the pool is empty, produce a cookie and
            retry instead of returning an empty dict
        @return: cookie dict, or {} when the pool is empty and
            wait_when_null is False
        """
        while True:
            try:
                raw_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
                if raw_cookie:
                    # NOTE(review): eval() executes whatever text is stored in
                    # redis; only safe while the pool is written exclusively
                    # by trusted code.
                    return eval(raw_cookie)

                if not wait_when_null:
                    return {}

                log.info("暂无cookie 生产中...")
                # Switch this instance to "produce a single cookie and stop".
                self._keep_alive = False
                self._min_cookies = 1
                with RedisLock(
                    key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
                ) as _lock:
                    if _lock.locked:
                        self.run()
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def del_cookie(self, cookies):
        """
        Remove a cookie from the pool.
        @param cookies: the cookie dict to remove
        """
        self._redisdb.lrem(self._tab_cookie_pool, cookies)
# -*- coding: utf-8 -*-
"""
Created on 2021/3/4 11:01 PM
---------
@summary: manual check of the RedisDB list helpers (lpush / lrange / lrem)
---------
@author: Boris
@email: [email protected]
"""

from feapder.db.redisdb import RedisDB

redis = RedisDB(ip_ports="localhost:6379", db=0)

list_key = "l_test"

# Push two sample values onto the list.
for sample in (2, 3):
    redis.lpush(list_key, sample)

# Show the list, remove the value 2 (printing what lrem returns), then show
# the list again.
print(redis.lrange(list_key))
print(redis.lrem(list_key, 2))
print(redis.lrange(list_key))
class LoginCookiePool(CookiePoolInterface):
    """
    Cookie pool for sites that require login. Account information
    (username, password, state flags) is kept in a mysql table.
    """

    def __init__(
        self,
        redis_key,
        *,
        table_userbase,
        login_state_key="login_state",
        lock_state_key="lock_state",
        username_key="username",
        password_key="password",
        login_retry_times=10,
    ):
        """
        @param redis_key: project name, used as the prefix of the redis key
        @param table_userbase: name of the user table
        @param login_state_key: column name of the login state
        @param lock_state_key: column name of the locked (banned) state
        @param username_key: column name of the login name
        @param password_key: column name of the password
        @param login_retry_times: number of retries when a login attempt raises
        """
        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
        self._login_retry_times = login_retry_times
        self._table_userbase = table_userbase
        self._login_state_key = login_state_key
        self._lock_state_key = lock_state_key
        self._username_key = username_key
        self._password_key = password_key

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        self.create_userbase()

    def create_userbase(self):
        """
        Create the user table if it does not exist yet.
        """
        # The unique index must reference the configured username column;
        # a hard-coded `username` column breaks the statement whenever
        # username_key is customised.
        sql = f"""
            CREATE TABLE IF NOT EXISTS `{self._table_userbase}` (
              `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
              `{self._username_key}` varchar(50) DEFAULT NULL COMMENT '用户名',
              `{self._password_key}` varchar(255) DEFAULT NULL COMMENT '密码',
              `{self._login_state_key}` int(11) DEFAULT '0' COMMENT '登录状态(0未登录 1已登录)',
              `{self._lock_state_key}` int(11) DEFAULT '0' COMMENT '账号是否被封(0 未封 1 被封)',
              PRIMARY KEY (`id`),
              UNIQUE KEY `username` (`{self._username_key}`) USING BTREE
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
            """
        self._mysqldb.execute(sql)

    def create_cookie(self, username, password):
        """
        Log in and create a cookie. Must be implemented by subclasses.
        @param username: user name
        @param password: password
        @return: cookie / None
        """
        raise NotImplementedError

    def get_user_info(self):
        """
        Return the users that are neither locked nor already logged in.
        @return: iterable of (username, password)
        """
        sql = "select {username_key}, {password_key} from {table_userbase} where {lock_state_key} != 1 and {login_state_key} != 1".format(
            username_key=self._username_key,
            password_key=self._password_key,
            table_userbase=self._table_userbase,
            lock_state_key=self._lock_state_key,
            login_state_key=self._login_state_key,
        )

        return self._mysqldb.find(sql)

    def handle_login_failed_user(self, username, password):
        """
        Hook for users whose login failed. Does nothing by default.
        @param username:
        @param password:
        @return:
        """
        pass

    def handel_exception(self, e):
        """
        Hook for exceptions raised while logging in. Logs them by default.
        @param e: the exception
        @return:
        """
        log.exception(e)

    def _set_user_state(self, state_key, value, username):
        """Set one integer state column of the row matching username."""
        # NOTE(review): the statement is built by string formatting, so a
        # username containing a quote breaks it (and is an injection vector
        # if usernames are untrusted). Switch to a parameterised query if
        # MysqlDB supports one.
        sql = "update {table_userbase} set {state_key} = {value} where {username_key} = '{username}'".format(
            table_userbase=self._table_userbase,
            state_key=state_key,
            value=value,
            username_key=self._username_key,
            username=username,
        )
        self._mysqldb.update(sql)

    def save_cookie(self, username, cookie):
        """
        Store a fresh cookie in the pool and mark the user as logged in.
        @param username: user name the cookie belongs to
        @param cookie: cookie obtained from the login
        """
        user_cookie = {"username": username, "cookie": cookie}

        self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
        self._set_user_state(self._login_state_key, 1, username)

    def get_cookie(self, wait_when_null=True) -> User:
        """
        Take a user cookie from the pool.
        @param wait_when_null: when the pool is empty, run the login routine
            and retry instead of returning None
        @return: User, or None when the pool is empty and wait_when_null
            is False
        """
        while True:
            try:
                user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
                if not user_cookie and wait_when_null:
                    log.info("暂无cookie 生产中...")
                    self.login()
                    continue

                if user_cookie:
                    # NOTE(review): eval() executes whatever text is stored
                    # in redis; only safe while the pool is written
                    # exclusively by trusted code.
                    user_cookie = eval(user_cookie)
                    return User(**user_cookie)

                return None
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def del_cookie(self, user: User):
        """
        Remove an invalid cookie and mark the user as not logged in.
        @param user: the user whose cookie is no longer valid
        @return:
        """
        user_info = {"username": user.username, "cookie": user.cookie}
        self._redisdb.lrem(self._tab_cookie_pool, user_info)
        self._set_user_state(self._login_state_key, 0, user.username)

    def user_is_locked(self, user: User):
        """
        Mark the user's account as locked (banned).
        @param user: the locked user
        """
        self._set_user_state(self._lock_state_key, 1, user.username)

    def run(self):
        """
        Log in every available user and store the resulting cookies.

        Runs only when the redis lock is acquired. A login that raises is
        retried up to login_retry_times; a login that returns no cookie, or
        that exhausts its retries, is passed to handle_login_failed_user.
        """
        with RedisLock(
            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
        ) as _lock:
            if _lock.locked:
                user_infos = self.get_user_info()
                if not isinstance(user_infos, Iterable):
                    raise ValueError("get_user_info 返回值必须可迭代")

                if not user_infos:
                    log.info("无可用用户")

                for username, password in user_infos:
                    for _ in range(self._login_retry_times):
                        try:
                            cookie = self.create_cookie(username, password)
                            if cookie:
                                self.save_cookie(username, cookie)
                            else:
                                self.handle_login_failed_user(username, password)

                            # The attempt completed without raising: no retry.
                            break
                        except Exception as e:
                            self.handel_exception(e)

                    else:
                        # Every attempt raised.
                        self.handle_login_failed_user(username, password)

    login = run