def acquire(self):
    start = time.time()
    while True:
        # Try to acquire the lock
        if self.redis_conn.setnx(self.lock_key, time.time()):
            self.redis_conn.expire(self.lock_key, self.timeout)
            self.locked = True
            break
        else:
            # Bug fix: if the process is killed between setnx and expire,
            # the expire is never set and the lock lives forever
            if self.redis_conn.ttl(self.lock_key) < 0:
                self.redis_conn.delete(self.lock_key)

        if self.wait_timeout > 0:
            if time.time() - start > self.wait_timeout:
                log.info("failed to acquire lock")
                break
        else:  # do not wait
            break

        if self.break_wait():
            log.info("break_wait triggered; no longer waiting for the lock")
            break

        log.debug("waiting for lock: {} wait:{}".format(self, time.time() - start))
        if self.wait_timeout > 10:
            time.sleep(5)
        else:
            time.sleep(1)
    return
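# The setnx-then-expire pair above is not atomic: a crash between the two
# calls leaves a key with no TTL, which is exactly the bug the ttl() check
# works around. A minimal sketch of the atomic variant (the same fix the
# later acquire() in this section adopts), assuming a redis-py StrictRedis
# connection; names here are illustrative:
import time
import redis

redis_conn = redis.StrictRedis(host="localhost", port=6379)  # assumed connection

def acquire_atomic(lock_key, timeout):
    # SET key value NX EX writes the value and the TTL in one command,
    # so no crash window can produce an expire-less, immortal lock
    return bool(redis_conn.set(lock_key, time.time(), nx=True, ex=timeout))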
def run(self):
    while True:
        try:
            now_user_count = self._redisdb.hget_count(self._tab_user_pool)
            need_user_count = self._min_users - now_user_count

            if need_user_count > 0:
                log.info(
                    "current online user count {} is below {}; producing users".format(
                        now_user_count, self._min_users
                    )
                )
                try:
                    user = self.login()
                    if user:
                        self.add_user(user)
                except Exception as e:
                    log.exception(e)
            else:
                log.debug(
                    "current user count {} is sufficient; not producing for now".format(
                        now_user_count
                    )
                )

            if self._keep_alive:
                tools.delay_time(10)
            else:
                break
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def __add_item_to_db(self, items, update_items, requests, callbacks, items_fingerprints):
    export_success = False
    self._is_adding_to_db = True

    # Deduplicate
    if setting.ITEM_FILTER_ENABLE:
        items, items_fingerprints = self.__dedup_items(items, items_fingerprints)

    # Sort items into per-table buckets
    items_dict = self.__pick_items(items)
    update_items_dict = self.__pick_items(update_items, is_update_item=True)

    # Batch-insert items
    while items_dict:
        tab_item, datas = items_dict.popitem()
        log.debug(
            """
            -------------- batch item insert --------------
            table: %s
            datas: %s
            """
            % (tab_item, tools.dumps_json(datas, indent=16))
        )
        export_success = self.__export_to_db(tab_item, datas)

    # Batch-update items
    while update_items_dict:
        tab_item, datas = update_items_dict.popitem()
        log.debug(
            """
            -------------- batch item update --------------
            table: %s
            datas: %s
            """
            % (tab_item, tools.dumps_json(datas, indent=16))
        )
        update_keys = self._item_update_keys.get(tab_item)
        export_success = self.__export_to_db(
            tab_item, datas, is_update=True, update_keys=update_keys
        )

    # Run the callbacks
    while callbacks:
        try:
            callback = callbacks.pop(0)
            callback()
        except Exception as e:
            log.exception(e)

    # Remove the finished requests
    if requests:
        self._db.zrem(self._table_request, requests)

    # Persist fingerprints for deduplication
    if export_success and setting.ITEM_FILTER_ENABLE:
        if items_fingerprints:
            self.__class__.dedup.add(items_fingerprints, skip_check=True)

    self._is_adding_to_db = False
def start_monitor_task(self):
    """
    @summary: monitor task state
    ---------
    ---------
    @result:
    """
    if not self._parsers:  # not multi-template mode; register self as the template
        self._is_more_parsers = False
        self._parsers.append(self)
    elif len(self._parsers) <= 1:
        self._is_more_parsers = False

    if self._task:
        self.distribute_task([self._task])
    else:
        tasks = self.get_todo_task_from_mysql()
        if not tasks:
            raise Exception(
                "no task fetched; please check whether task_id: {} exists".format(
                    self._task_id
                )
            )
        self.distribute_task(tasks)

    os.environ.setdefault("batch_date", "1970-00-00")
    log.debug("task distribution finished")
def __add_request_to_db(self):
    request_list = []
    prioritys = []
    callbacks = []

    while self._requests_deque:
        request = self._requests_deque.popleft()
        self._is_adding_to_db = True

        if callable(request):  # a function
            # Note: beware of closures. Capture loop variables via default
            # arguments, e.g.
            #     def test(xxx=xxx):
            #         ...  # business logic using xxx
            # Written this way, xxx is not bound to the loop's final value.
            callbacks.append(request)
            continue

        priority = request.priority

        # Skip if deduplication is enabled and the request already exists
        if (
            request.filter_repeat
            and setting.REQUEST_FILTER_ENABLE
            and not self.__class__.dedup.add(request.fingerprint)
        ):
            log.debug("request already exists url = %s" % request.url)
            continue
        else:
            request_list.append(str(request.to_dict))
            prioritys.append(priority)

        if len(request_list) > MAX_URL_COUNT:
            self._db.zadd(self._table_request, request_list, prioritys)
            request_list = []
            prioritys = []

    # Flush the remainder
    if request_list:
        self._db.zadd(self._table_request, request_list, prioritys)

    # Run the callbacks
    for callback in callbacks:
        try:
            callback()
        except Exception as e:
            log.exception(e)

    # Remove the finished requests
    if self._del_requests_deque:
        request_done_list = []
        while self._del_requests_deque:
            request_done_list.append(self._del_requests_deque.popleft())

        # Exclude requests still in request_list, otherwise freshly added
        # requests could be deleted
        request_done_list = list(set(request_done_list) - set(request_list))
        if request_done_list:
            self._db.zrem(self._table_request, request_done_list)

    self._is_adding_to_db = False
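# The closure note above refers to Python's late binding: a callback that
# reads a loop variable sees its value at call time, not at append time. A
# minimal standalone demonstration of the pitfall and of the
# default-argument fix the comment recommends (names are illustrative):
callbacks = []
for xxx in range(3):
    # Late binding: every callback reads the same xxx, which is 2 after the loop
    callbacks.append(lambda: print(xxx))

fixed_callbacks = []
for xxx in range(3):
    # The default argument is evaluated now, freezing the current value of xxx
    fixed_callbacks.append(lambda xxx=xxx: print(xxx))

for cb in callbacks:
    cb()  # prints 2 2 2
for cb in fixed_callbacks:
    cb()  # prints 0 1 2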
def add_user(self, user: NormalUser):
    log.debug("add {}".format(user))
    self._redisdb.hset(self._tab_user_pool, user.user_id, user.to_dict())

    sql = "update {table_userbase} set {login_state_key} = 1 where id = {user_id}".format(
        table_userbase=self._table_userbase,
        login_state_key=self._login_state_key,
        user_id=user.user_id,
    )
    self._mysqldb.update(sql)
def delete_tables(self, delete_tables_list):
    if isinstance(delete_tables_list, bool):
        delete_tables_list = [self._redis_key + "*"]
    elif not isinstance(delete_tables_list, (list, tuple)):
        delete_tables_list = [delete_tables_list]

    for delete_tab in delete_tables_list:
        if not delete_tab.startswith(self._redis_key):
            delete_tab = self._redis_key + delete_tab

        tables = self._redisdb.getkeys(delete_tab)
        for table in tables:
            log.debug("deleting key %s" % table)
            self._redisdb.clear(table)
def send(
    self,
    receivers: list,
    title: str,
    content: str,
    content_type: str = "plain",
    filepath: str = None,
):
    """
    Args:
        receivers:
        title:
        content:
        content_type: html / plain
        filepath:

    Returns:

    """
    # Build a multipart message so attachments can be carried
    message = MIMEMultipart()
    message["From"] = formataddr((self.sender, self.username))  # sender nickname, sender account
    message["To"] = formataddr((receivers[0], receivers[0]))  # ",".join(receivers)
    message["Subject"] = Header(title, "utf-8")

    content = MIMEText(content, content_type, "utf-8")
    message.attach(content)

    # Build the attachment
    if filepath:
        attach = MIMEText(open(filepath, "rb").read(), "base64", "utf-8")
        attach.add_header(
            "content-disposition",
            "attachment",
            filename=("utf-8", "", os.path.basename(filepath)),
        )
        message.attach(attach)

    msg = message.as_string()
    # Sending to several recipients in one call proved unreliable here,
    # so send to them one by one
    for receiver in receivers:
        log.debug("sending mail to {}".format(receiver))
        self.smtp_client.sendmail(self.username, receiver, msg)
        log.debug("mail sent!")

    return True
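# A minimal usage sketch of the send() above. The EmailSender class name and
# its constructor arguments are assumptions for illustration, not a
# confirmed API of the surrounding library:
sender = EmailSender(username="[email protected]", password="app-password")  # hypothetical
sender.send(
    receivers=["[email protected]"],
    title="crawl report",
    content="<b>done</b>",
    content_type="html",
    filepath="/tmp/report.csv",  # optional attachment
)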
def __init__(self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs):
    # setting values may be modified at runtime, so load the defaults lazily
    # here instead of binding them in the signature
    if not ip:
        ip = setting.MYSQL_IP
    if not port:
        port = setting.MYSQL_PORT
    if not db:
        db = setting.MYSQL_DB
    if not user_name:
        user_name = setting.MYSQL_USER_NAME
    if not user_pass:
        user_pass = setting.MYSQL_USER_PASS

    try:
        self.connect_pool = PooledDB(
            creator=pymysql,
            mincached=1,
            maxcached=100,
            maxconnections=100,
            blocking=True,
            ping=7,
            host=ip,
            port=port,
            user=user_name,
            passwd=user_pass,
            db=db,
            charset="utf8mb4",
            cursorclass=cursors.SSCursor,
        )  # cursorclass uses a server-side cursor; the default cursor keeps
        # growing memory during large multithreaded batch inserts
    except Exception as e:
        log.error(
            """
            failed to connect to the database:
            ip: {}
            port: {}
            db: {}
            user_name: {}
            user_pass: {}
            exception: {}
            """.format(ip, port, db, user_name, user_pass, e)
        )
    else:
        log.debug("connected to mysql database %s : %s" % (ip, db))
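# A minimal sketch of how a query method might borrow from this DBUtils pool.
# The `find` name is illustrative; PooledDB.connection() is the real API and
# close() returns the connection to the pool rather than closing it:
def find(self, sql):
    conn = self.connect_pool.connection(shareable=False)
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        cursor.close()
        conn.close()  # hands the connection back to the pool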
def delete_tables(self, delete_tables_list):
    if isinstance(delete_tables_list, bool):
        delete_tables_list = [self._redis_key + "*"]
    elif not isinstance(delete_tables_list, (list, tuple)):
        delete_tables_list = [delete_tables_list]

    redis = RedisDB()
    for delete_tab in delete_tables_list:
        if delete_tab == "*":
            delete_tab = self._redis_key + "*"

        tables = redis.getkeys(delete_tab)
        for table in tables:
            log.debug("clearing table %s" % table)
            redis.clear(table)
def run(self):
    while True:
        try:
            try:
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    if _lock.locked:
                        for user in self._load_user():
                            retry_times = 0
                            while retry_times <= self._login_retry_times:
                                try:
                                    user = self.login(user)
                                    if user:
                                        self.add_user(user)
                                    else:
                                        self.handle_login_failed_user(user)
                                    break
                                except NotImplementedError:
                                    log.error(
                                        f"{self.__class__.__name__} must implement the login method!"
                                    )
                                    os._exit(0)
                                except Exception as e:
                                    self.handel_exception(e)
                                log.debug(
                                    f"login failed, user: {user} retry_times: {retry_times}"
                                )
                                retry_times += 1
                            else:
                                self.handle_login_failed_user(user)

                        now_user_count = self._redisdb.hget_count(self._tab_user_pool)
                        log.info("current online user count: {}".format(now_user_count))
            except Exception as e:
                log.exception(e)

            if self._keep_alive:
                tools.delay_time(10)
            else:
                break
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def run(self):
    self._thread_stop = False
    while not self._thread_stop:
        try:
            request = self._collector.get_request()
            if not request:
                if not self.is_show_tip:
                    log.debug("waiting for tasks...")
                    self.is_show_tip = True
                continue

            self.is_show_tip = False
            self.deal_request(request)
        except Exception as e:
            log.exception(e)
def set_unique_key(self, table, key):
    conn = cursor = None  # bug fix: otherwise finally raises NameError if get_connection fails
    try:
        sql = "alter table %s add unique (%s)" % (table, key)
        conn, cursor = self.get_connection()
        cursor.execute(sql)
        conn.commit()
    except Exception as e:
        log.error(table + " " + str(e) + " key = " + key)
        return False
    else:
        log.debug("created unique index %s on table %s" % (key, table))
        return True
    finally:
        if conn:
            self.close_connection(conn, cursor)
def run(self):
    self.distribute_task()

    for i in range(self._thread_count):
        parser_control = AirSpiderParserControl(self._memory_db)
        parser_control.add_parser(self)
        parser_control.start()
        self._parser_controls.append(parser_control)

    while True:
        if self.all_thread_is_done():
            # Stop the parser_controls
            for parser_control in self._parser_controls:
                parser_control.stop()

            log.debug("no tasks left; spider finished")
            break
def parse_play_time(self, request, response):
    """
    Parse the ticket purchase URLs
    """
    movie_id = request.movie_id
    cinema_id = request.cinema_id

    pay_urls = response.xpath(
        f'//a[@data-val="{{movie_id: {movie_id}, cinema_id:{cinema_id}}}"]/@href'
    ).extract()
    for pay_url in pay_urls:
        log.debug("parsed purchase url {}".format(pay_url))
        next_request = request.copy()
        next_request.url = pay_url
        next_request.callback = self.parse_seats
        next_request.priority = 1
        yield next_request
def run(self):
    while not self._thread_stop:
        try:
            request = self._memory_db.get()
            if not request:
                if not self.is_show_tip:
                    log.debug("waiting for tasks...")
                    self.is_show_tip = True
                time.sleep(1)
                continue

            self.is_show_tip = False
            self.deal_request(request)
        except Exception as e:
            log.exception(e)
            time.sleep(3)
def reput_failed_requests_to_requests(self):
    log.debug("resetting failed requests...")
    total_count = 0
    while True:
        failed_requests = self.get_failed_requests()
        if not failed_requests:
            break

        for request in failed_requests:
            request["retry_times"] = 0
            request_obj = Request.from_dict(request)
            self._request_buffer.put_request(request_obj)
            total_count += 1

    self._request_buffer.flush()

    log.debug("reset %s failed requests back to pending requests" % total_count)
def run(self):
    while not self._thread_stop:
        try:
            requests = self._memory_db.get()
            if not requests:
                if not self.is_show_tip:
                    log.debug("parser waiting for tasks ...")
                    self.is_show_tip = True
                time.sleep(1)
                self._wait_task_time += 1
                continue

            self.is_show_tip = False
            self.deal_requests([requests])
        except Exception as e:
            log.exception(e)
def acquire(self):
    start = time.time()
    while True:
        # Try to acquire the lock; SET NX EX is atomic, so the key and its
        # TTL are always written together
        if self.redis_conn.set(self.lock_key, time.time(), nx=True, ex=5):
            self.locked = True
            break

        if self.wait_timeout > 0:
            if time.time() - start > self.wait_timeout:
                log.info("failed to acquire lock")
                break
        else:
            break

        log.debug("waiting for lock: {} wait:{}".format(self, time.time() - start))
        if self.wait_timeout > 10:
            time.sleep(5)
        else:
            time.sleep(1)
    return
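# Elsewhere in this section the lock is used as a context manager. A minimal
# usage sketch consistent with those call sites (the key and timeouts are
# example values, refresh_users is a hypothetical workload):
with RedisLock(key="user_pool", lock_timeout=3600, wait_timeout=0) as _lock:
    if _lock.locked:
        # only the holder of the lock runs the critical section
        refresh_users()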
def run(self):
    while not self._thread_stop:
        try:
            requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
            if not requests:
                if not self.is_show_tip:
                    log.debug("parser waiting for tasks ...")
                    self.is_show_tip = True
                    # log.debug('parser waiting for tasks {}...'.format(tools.format_seconds(self._wait_task_time)))
                time.sleep(1)
                self._wait_task_time += 1
                continue

            self.is_show_tip = False
            self.deal_requests(requests)
        except Exception as e:
            log.exception(e)
def update_task_state(self, task_id, state=1, **kwargs):
    """
    @summary: update the task's state in the task table; the business code
              must call this after finishing each task. May be overridden.
              Invoke it as: yield lambda: self.update_task_state(task_id, state)
    ---------
    @param task_id:
    @param state:
    ---------
    @result:
    """
    kwargs["id"] = task_id
    kwargs[self._task_state] = state

    sql = tools.make_update_sql(
        self._task_table, kwargs, condition="id = {task_id}".format(task_id=task_id)
    )

    if self._mysqldb.update(sql):
        log.debug("set state of task %s successfully" % task_id)
    else:
        log.error("failed to set state of task %s, sql=%s" % (task_id, sql))
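# A minimal sketch of the call convention the docstring above describes,
# inside a hypothetical parse method; `request.task_id` is an assumed
# attribute carried on the request, not a confirmed framework field:
def parse(self, request, response):
    # ... extract and yield items here ...
    # Yielding a lambda defers the state update until the framework has
    # flushed the buffered items/requests, then it invokes the callable
    yield lambda: self.update_task_state(request.task_id, state=1)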
def __init__(
    self,
    ip_ports=None,
    db=None,
    user_pass=None,
    url=None,
    decode_responses=True,
    service_name=None,
    max_connections=32,
    **kwargs,
):
    """
    redis wrapper
    Args:
        ip_ports: ip:port; multiple nodes may be given as a list or
            comma-separated, e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
        db:
        user_pass:
        url:
        decode_responses:
        service_name: for redis sentinel mode
    """
    # setting values may be modified at runtime, so load the defaults lazily
    # here instead of binding them in the signature
    if ip_ports is None:
        ip_ports = setting.REDISDB_IP_PORTS
    if db is None:
        db = setting.REDISDB_DB
    if user_pass is None:
        user_pass = setting.REDISDB_USER_PASS
    if service_name is None:
        service_name = setting.REDISDB_SERVICE_NAME

    self._is_redis_cluster = False

    try:
        if not url:
            ip_ports = ip_ports if isinstance(ip_ports, list) else ip_ports.split(",")
            if len(ip_ports) > 1:
                startup_nodes = []
                for ip_port in ip_ports:
                    ip, port = ip_port.split(":")
                    startup_nodes.append({"host": ip, "port": port})

                if service_name:
                    log.debug("using redis sentinel mode")
                    hosts = [(node["host"], node["port"]) for node in startup_nodes]
                    sentinel = Sentinel(hosts, socket_timeout=3, **kwargs)
                    self._redis = sentinel.master_for(
                        service_name,
                        password=user_pass,
                        db=db,
                        redis_class=redis.StrictRedis,
                        decode_responses=decode_responses,
                        max_connections=max_connections,
                        **kwargs,
                    )
                else:
                    log.debug("using redis cluster mode")
                    self._redis = StrictRedisCluster(
                        startup_nodes=startup_nodes,
                        decode_responses=decode_responses,
                        password=user_pass,
                        max_connections=max_connections,
                        **kwargs,
                    )
                    self._is_redis_cluster = True
            else:
                ip, port = ip_ports[0].split(":")
                self._redis = redis.StrictRedis(
                    host=ip,
                    port=port,
                    db=db,
                    password=user_pass,
                    decode_responses=decode_responses,
                    max_connections=max_connections,
                    **kwargs,
                )
        else:
            self._redis = redis.StrictRedis.from_url(
                url, decode_responses=decode_responses
            )
    except Exception:
        raise  # re-raise: connection failures are fatal here
    else:
        if not url:
            log.debug("connected to redis database %s db%s" % (ip_ports, db))
        else:
            log.debug("connected to redis database %s" % url)

    self._ip_ports = ip_ports
    self._db = db
    self._user_pass = user_pass
    self._url = url
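# Minimal construction sketches matching the branches above; addresses,
# passwords, and the sentinel service name are example values:
db_single = RedisDB(ip_ports="localhost:6379", db=0)                # single node
db_cluster = RedisDB(ip_ports=["10.0.0.1:6379", "10.0.0.2:6379"])   # cluster mode
db_sentinel = RedisDB(
    ip_ports="10.0.0.1:26379,10.0.0.2:26379", service_name="mymaster"
)                                                                   # sentinel mode
db_from_url = RedisDB(url="redis://:password@localhost:6379/0")     # from a URL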
def get_response(self, save_cached=False):
    """
    Fetch a response with selector support
    @param save_cached: cache the response so debugging does not re-download every time
    @return:
    """
    # Default timeout
    self.requests_kwargs.setdefault("timeout", 22)  # connect=22 read=22

    # Enable stream. By default the response body is downloaded as soon as
    # the request is made. With stream=True the body download is deferred
    # until Response.content is accessed; only the headers are fetched up
    # front. Drawback: the connection cannot be returned to the pool until
    # all data is consumed or Response.close is called, which hurts
    # connection efficiency.
    self.requests_kwargs.setdefault("stream", True)

    # Disable certificate verification
    self.requests_kwargs.setdefault("verify", False)

    # Determine the request method
    method = self.__dict__.get("method")
    if not method:
        if "data" in self.requests_kwargs:
            method = "POST"
        else:
            method = "GET"

    # Random user-agent
    headers = self.requests_kwargs.get("headers", {})
    if "user-agent" not in headers and "User-Agent" not in headers:
        if self.random_user_agent and setting.RANDOM_HEADERS:
            headers.update({"User-Agent": self.__class__.user_agent_pool.get()})
            self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers",
                {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
                },
            )

    # Proxies
    proxies = self.requests_kwargs.get("proxies", -1)
    if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
        while True:
            proxies = self.__class__.proxies_pool.get()
            if proxies:
                self.requests_kwargs.update(proxies=proxies)
                break
            else:
                log.debug("no proxy available ...")

    log.debug(
        """
        -------------- %srequest for ----------------
        url = %s
        method = %s
        body = %s
        """
        % (
            ""
            if not self.parser_name
            else "%s.%s "
            % (
                self.parser_name,
                (
                    self.callback
                    and callable(self.callback)
                    and getattr(self.callback, "__name__")
                    or self.callback
                )
                or "parser",
            ),
            self.url,
            method,
            self.requests_kwargs,
        )
    )

    # def hooks(response, *args, **kwargs):
    #     print(response.url)
    #
    # self.requests_kwargs.update(hooks={'response': hooks})

    use_session = (
        setting.USE_SESSION if self.use_session is None else self.use_session
    )  # self.use_session takes precedence

    if use_session:
        response = self._session.request(method, self.url, **self.requests_kwargs)
    else:
        response = requests.request(method, self.url, **self.requests_kwargs)

    response = Response(response)
    if save_cached:
        self.save_cached(response, expire_time=self.__class__.cached_expire_time)

    return response
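# The stream comment above describes requests' deferred body download. A
# minimal standalone demonstration using the real requests API (the URL is
# an example):
import requests

resp = requests.get("https://example.com", stream=True, timeout=22)
print(resp.status_code)  # headers are already available here
body = resp.content      # the body is only downloaded at this point
resp.close()             # otherwise the connection cannot return to the pool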
def add_user(self, user: GuestUser):
    log.debug("add {}".format(user))
    self._redisdb.hset(self._tab_user_pool, user.user_id, user.to_dict())
def get_user(
    self,
    block=True,
    username=None,
    used_for_spider_name=None,
    not_limit_use_interval=False,
) -> Optional[GoldUser]:
    """
    @params username: fetch a specific user
    @params used_for_spider_name: exclusive use; the name of the exclusive
            spider. Other spiders cannot preempt the user
    @params block: whether to wait when no user is available
    @params not_limit_use_interval: do not limit the use frequency
    @return: GoldUser
    """
    while True:
        try:
            user_id = username or self._get_user_id()
            user_str = None
            if user_id:
                user_str = self._redisdb.hget(self._tab_user_pool, user_id)

            if (not user_id or not user_str) and block:
                self._keep_alive = False
                self.run(username)
                continue

            # Got a user; deserialize the stored dict
            user = GoldUser(**eval(user_str))

            # Exclusive use: if held by another spider, check whether the
            # wait time has exceeded the exclusive window; if so, the user
            # may be taken over
            if (
                user.get_used_for_spider_name()
                and user.get_used_for_spider_name() != used_for_spider_name
            ):
                wait_time = time.time() - user.get_last_use_time()
                if wait_time < user.exclusive_time:
                    log.info(
                        "user {} is held exclusively by spider {}; usable after {} seconds".format(
                            user.username,
                            user.get_used_for_spider_name(),
                            user.exclusive_time - wait_time,
                        )
                    )
                    time.sleep(1)
                    continue

            if not user.is_overwork() and user.is_at_work_time():
                if not user.cookies:
                    log.debug(f"user {user.username} is not logged in; trying to log in")
                    self._keep_alive = False
                    self.run(username)
                    continue

                if not_limit_use_interval or user.is_time_to_use():
                    user.set_used_for_spider_name(used_for_spider_name)
                    log.debug("using user {}".format(user.username))
                    self.record_user_status(user.user_id, GoldUserStatus.USED)
                    return user
                else:
                    log.debug(
                        "user {} was used too recently; checking the next user".format(
                            user.username
                        )
                    )
                    time.sleep(1)
                    continue
            else:
                if not user.is_at_work_time():
                    log.info("user {} is outside working hours; sleep 60s".format(user.username))
                    if block:
                        time.sleep(60)
                        continue
                    else:
                        return None
        except Exception as e:
            log.exception(e)
            time.sleep(1)
# -*- coding: utf-8 -*-
"""
Created on 2021/6/18 10:36 AM
---------
@summary:
---------
@author: Boris
@email: [email protected]
"""

from feapder.utils.log import log

log.debug(1)
def get_response(self, save_cached=False):
    """
    Fetch a response with selector support
    @param save_cached: cache the response so debugging does not re-download every time
    @return:
    """
    # Default timeout
    self.requests_kwargs.setdefault("timeout", setting.REQUEST_TIMEOUT)  # connect / read

    # Enable stream. By default the response body is downloaded as soon as
    # the request is made. With stream=True the body download is deferred
    # until Response.content is accessed; only the headers are fetched up
    # front. Drawback: the connection cannot be returned to the pool until
    # all data is consumed or Response.close is called, which hurts
    # connection efficiency.
    self.requests_kwargs.setdefault("stream", True)

    # Disable certificate verification
    self.requests_kwargs.setdefault("verify", False)

    # Determine the request method
    method = self.__dict__.get("method")
    if not method:
        if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
            method = "POST"
        else:
            method = "GET"

    # Random user-agent
    headers = self.requests_kwargs.get("headers", {})
    if "user-agent" not in headers and "User-Agent" not in headers:
        if self.render:
            # When rendering, prefer the ua configured in WEBDRIVER
            ua = setting.WEBDRIVER.get(
                "user_agent"
            ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
        else:
            ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)

        if self.random_user_agent and setting.RANDOM_HEADERS:
            headers.update({"User-Agent": ua})
            self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
            )

    # Proxies
    proxies = self.requests_kwargs.get("proxies", -1)
    if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
        while True:
            proxies = self._proxies_pool.get()
            if proxies:
                self.requests_kwargs.update(proxies=proxies)
                break
            else:
                log.debug("no proxy available ...")

    log.debug(
        """
        -------------- %srequest for ----------------
        url = %s
        method = %s
        body = %s
        """
        % (
            ""
            if not self.parser_name
            else "%s.%s "
            % (
                self.parser_name,
                (
                    self.callback
                    and callable(self.callback)
                    and getattr(self.callback, "__name__")
                    or self.callback
                )
                or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        )
    )

    # def hooks(response, *args, **kwargs):
    #     print(response.url)
    #
    # self.requests_kwargs.update(hooks={'response': hooks})

    use_session = (
        setting.USE_SESSION if self.use_session is None else self.use_session
    )  # self.use_session takes precedence

    if self.render:
        # Reuse the request's user_agent, cookies, and proxy
        user_agent = headers.get("User-Agent") or headers.get("user-agent")

        cookies = self.requests_kwargs.get("cookies")
        if cookies and isinstance(cookies, RequestsCookieJar):
            cookies = cookies.get_dict()

        if not cookies:
            cookie_str = headers.get("Cookie") or headers.get("cookie")
            if cookie_str:
                cookies = tools.get_cookies_from_str(cookie_str)

        proxy = None
        if proxies and proxies != -1:
            # take the proxy address without its scheme prefix
            # (str.strip removes characters, not a prefix)
            proxy = (proxies.get("http") or proxies.get("https") or "").split("://")[-1]

        browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)

        url = self.url
        if self.requests_kwargs.get("params"):
            url = tools.joint_url(self.url, self.requests_kwargs.get("params"))

        try:
            browser.get(url)
            if cookies:
                browser.cookies = cookies
            if self.render_time:
                tools.delay_time(self.render_time)

            html = browser.page_source
            response = Response.from_dict(
                {
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent": browser.execute_script(
                            "return navigator.userAgent"
                        ),
                        "Cookie": tools.cookies2str(browser.cookies),
                    },
                }
            )
            response.browser = browser
        except Exception as e:
            self._webdriver_pool.remove(browser)
            raise e
    elif use_session:
        response = self._session.request(method, self.url, **self.requests_kwargs)
        response = Response(response)
    else:
        response = requests.request(method, self.url, **self.requests_kwargs)
        response = Response(response)

    if save_cached:
        self.save_cached(response, expire_time=self.__class__.cached_expire_time)

    return response
def run():
    while True:
        redisdb = RedisDB()
        try:
            block_ip = redisdb.sget(setting.CAPTCHA_BLOCK_IP_REDIS_KEY)
            if not block_ip:
                log.debug("no blocked ip for now")

            for ip in block_ip:
                task = redisdb.hget(setting.CAPTCHA_REDIS_KEY, ip, is_pop=True)
                task = eval(task)
                ua = task.get("ua")
                url = task.get("url")

                with WebDriver(proxy=ip, user_agent=ua) as browser:
                    log.info("unblocking ip {}, url {}".format(ip, url))
                    browser.get(url)
                    browser.implicitly_wait(5)
                    frame = browser.find_element_by_id("tcaptcha_iframe")
                    browser.switch_to.frame(frame)

                    for i in range(20):
                        # poll until both captcha images are rendered
                        for _ in range(1000):
                            bg_url = browser.find_element_by_id("slideBg").get_attribute("src")
                            slide_url = browser.find_element_by_id("slideBlock").get_attribute("src")
                            if bg_url and slide_url:
                                break
                        else:
                            log.error("slider captcha failed to load")
                            return

                        bg_image = os.path.join(
                            CAPTCHA_PATH, "bg_" + tools.get_md5(bg_url) + ".png"
                        )
                        slide_image = os.path.join(
                            CAPTCHA_PATH, "slider_" + tools.get_md5(slide_url) + ".png"
                        )

                        if tools.download_file(bg_url, bg_image) and tools.download_file(
                            slide_url, slide_image
                        ):
                            # Locate the gap
                            x, y = get_gap_center_point(bg_image, slide_image, show=False)
                            # Scale to the rendered size
                            x = x * 340 / 680
                            x = x - 27.5 - 30
                            # Drag the slider
                            slide_btn = browser.find_element_by_id("tcaptcha_drag_thumb")
                            tracks = track.get_tracks(x)
                            drag_and_drop(browser, slide_btn, tracks)
                            # Remove the downloaded images
                            os.remove(bg_image)
                            os.remove(slide_image)

                        tools.delay_time(2)
                        if "verify.maoyan.com" not in browser.current_url:
                            log.info("unblocked successfully")
                            break
                        else:
                            try:
                                browser.find_element_by_css_selector(".tc-action-icon").click()
                            except:
                                pass
                            tools.delay_time(3)
        except Exception as e:
            log.error(e)
def run(self, username=None):
    while True:
        try:
            with RedisLock(
                key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
            ) as _lock:
                if _lock.locked:
                    self.__sycn_users_info()
                    online_user = 0
                    for user in self.users:
                        if username and username != user.username:
                            continue

                        try:
                            if user.cookies:
                                online_user += 1
                                continue

                            # Pre-check
                            if not user.is_time_to_login():
                                log.info(
                                    "account {} logged in too recently; will log in at {}".format(
                                        user.username, user.next_login_time()
                                    )
                                )
                                continue

                            user = self.login(user)
                            if user.cookies:
                                # Save the cookie
                                user.set_login_time()
                                self.add_user(user)
                                self.record_user_status(
                                    user.user_id, GoldUserStatus.LOGIN_SUCCESS
                                )
                                log.debug("login succeeded {}".format(user.username))
                                online_user += 1
                            else:
                                log.info("login failed {}".format(user.username))
                                self.record_user_status(
                                    user.user_id, GoldUserStatus.LOGIN_FALIED
                                )
                        except NotImplementedError:
                            log.error(
                                f"{self.__class__.__name__} must implement the login method!"
                            )
                            os._exit(0)
                        except Exception as e:
                            log.exception(e)
                            msg = f"{user.username} account login failed exception: {str(e)}"
                            log.info(msg)
                            self.record_user_status(
                                user.user_id, GoldUserStatus.LOGIN_FALIED
                            )
                            send_msg(
                                msg=msg,
                                level="error",
                                message_prefix=f"{user.username} account login failed",
                            )

                    log.info("current online user count: {}".format(online_user))

            if self._keep_alive:
                time.sleep(10)
            else:
                break
        except Exception as e:
            log.exception(e)
            time.sleep(1)
def get_response(self, save_cached=False):
    """
    Fetch a response with selector support
    @param save_cached: cache the response so debugging does not re-download every time
    @return:
    """
    # Default timeout
    self.requests_kwargs.setdefault("timeout", 22)  # connect=22 read=22

    # Enable stream. By default the response body is downloaded as soon as
    # the request is made. With stream=True the body download is deferred
    # until Response.content is accessed; only the headers are fetched up
    # front. Drawback: the connection cannot be returned to the pool until
    # all data is consumed or Response.close is called, which hurts
    # connection efficiency.
    self.requests_kwargs.setdefault("stream", True)

    # Disable certificate verification
    self.requests_kwargs.setdefault("verify", False)

    # Determine the request method
    method = self.__dict__.get("method")
    if not method:
        if "data" in self.requests_kwargs:
            method = "POST"
        else:
            method = "GET"

    # Random user-agent
    headers = self.requests_kwargs.get("headers", {})
    if "user-agent" not in headers and "User-Agent" not in headers:
        if self.random_user_agent and setting.RANDOM_HEADERS:
            headers.update({"User-Agent": self.__class__.user_agent_pool.get()})
            self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
            )

    # Proxies
    proxies = self.requests_kwargs.get("proxies", -1)
    if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
        while True:
            proxies = self.__class__.proxies_pool.get()
            if proxies:
                self.requests_kwargs.update(proxies=proxies)
                break
            else:
                log.debug("no proxy available ...")

    log.debug(
        """
        -------------- %srequest for ----------------
        url = %s
        method = %s
        body = %s
        """
        % (
            ""
            if not self.parser_name
            else "%s.%s "
            % (
                self.parser_name,
                (
                    self.callback
                    and callable(self.callback)
                    and getattr(self.callback, "__name__")
                    or self.callback
                )
                or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        )
    )

    # def hooks(response, *args, **kwargs):
    #     print(response.url)
    #
    # self.requests_kwargs.update(hooks={'response': hooks})

    use_session = (
        setting.USE_SESSION if self.use_session is None else self.use_session
    )  # self.use_session takes precedence

    if self.render:
        browser = self._webdriver_pool.get()
        try:
            browser.get(self.url)
            html = browser.page_source
            response = Response.from_dict(
                {
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "text": html,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent": browser.execute_script(
                            "return navigator.userAgent"
                        )
                    },
                }
            )
            response._cached_text = html
            # The browser is released right after rendering, so it cannot be
            # bound to the response
            # response.browser = browser
            self._webdriver_pool.put(browser)
        except Exception as e:
            self._webdriver_pool.remove(browser)
            raise e
    elif use_session:
        response = self._session.request(method, self.url, **self.requests_kwargs)
        response = Response(response)
    else:
        response = requests.request(method, self.url, **self.requests_kwargs)
        response = Response(response)

    if save_cached:
        self.save_cached(response, expire_time=self.__class__.cached_expire_time)

    return response