def get_response_from_cached(self, save_cached=True):
    """
    Fetch the response from the cache, downloading it when no cache entry exists.

    Notes on the restored Response object:
        Attributes that come back empty:
            - raw: urllib3.response.HTTPResponse
            - connection: requests.adapters.HTTPAdapter
            - history
        Attributes whose meaning changes:
            - request: this framework's Request instead of requests' request

    @param save_cached: when there is no cache entry and the response is
                        downloaded directly, whether to save it to the cache
    @return: Response
    """
    import ast  # local import: only needed on the cache-hit path

    response_dict = self._cache_db.strget(self._cached_redis_key)
    if not response_dict:
        log.info("无response缓存 重新下载")
        response_obj = self.get_response(save_cached=save_cached)
    else:
        # SECURITY: the cached payload is a serialized literal dict.
        # ast.literal_eval parses Python literals only; unlike eval() it
        # cannot execute arbitrary code injected into the cache backend.
        response_dict = ast.literal_eval(response_dict)
        response_obj = Response.from_dict(response_dict)
    return response_obj
def get_response(self, save_cached=False):
    """
    Download the request and return a Response with selector support.

    @param save_cached: save the response to the cache so that repeated
                        debug runs do not re-download it
    @return: Response
    """
    # Default timeout; applies to both the connect and the read phase.
    self.requests_kwargs.setdefault(
        "timeout", setting.REQUEST_TIMEOUT
    )  # connect=22 read=22

    # stream=True defers downloading the body until Response.content is
    # accessed; only the headers are fetched up front. Drawback: the
    # connection is not released back to the pool until all data is consumed
    # or Response.close() is called.
    self.requests_kwargs.setdefault("stream", True)

    # Disable TLS certificate verification.
    self.requests_kwargs.setdefault("verify", False)

    # Resolve the HTTP method: an explicit value wins; otherwise POST when a
    # request body ("data" or "json") is present, else GET.
    method = self.__dict__.get("method")
    if not method:
        if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
            method = "POST"
        else:
            method = "GET"

    # Random user-agent: only fill one in when the caller supplied none.
    headers = self.requests_kwargs.get("headers", {})
    if "user-agent" not in headers and "User-Agent" not in headers:
        if self.render:
            # Rendered requests prefer the UA configured in WEBDRIVER.
            ua = setting.WEBDRIVER.get(
                "user_agent"
            ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
        else:
            ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)

        if self.random_user_agent and setting.RANDOM_HEADERS:
            headers.update({"User-Agent": ua})
            self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
            )

    # Proxy: the -1 sentinel means the caller supplied no "proxies" key at
    # all; in that case block until the pool yields a usable proxy.
    proxies = self.requests_kwargs.get("proxies", -1)
    if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
        while True:
            proxies = self._proxies_pool.get()
            if proxies:
                self.requests_kwargs.update(proxies=proxies)
                break
            else:
                log.debug("暂无可用代理 ...")

    log.debug(
        """
            -------------- %srequest for ----------------
            url  = %s
            method = %s
            body = %s
            """
        % (
            ""
            if not self.parser_name
            else "%s.%s "
            % (
                self.parser_name,
                (
                    self.callback
                    and callable(self.callback)
                    and getattr(self.callback, "__name__")
                    or self.callback
                )
                or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        )
    )

    # def hooks(response, *args, **kwargs):
    #     print(response.url)
    #
    # self.requests_kwargs.update(hooks={'response': hooks})

    # self.use_session takes precedence over the global setting.
    use_session = (
        setting.USE_SESSION if self.use_session is None else self.use_session
    )

    if self.render:
        # Reuse the request's user-agent, cookies and proxy in the browser.
        user_agent = headers.get("User-Agent") or headers.get("user-agent")

        cookies = self.requests_kwargs.get("cookies")
        if cookies and isinstance(cookies, RequestsCookieJar):
            cookies = cookies.get_dict()
        if not cookies:
            cookie_str = headers.get("Cookie") or headers.get("cookie")
            if cookie_str:
                cookies = tools.get_cookies_from_str(cookie_str)

        proxy = None
        if proxies and proxies != -1:
            # BUGFIX: str.strip() removes a *character set* from both ends,
            # not a prefix, so .strip("http://") could also eat legitimate
            # leading/trailing host characters (h/t/p/:/ /). Remove the
            # scheme prefix explicitly instead.
            raw_proxy = proxies.get("http", "") or proxies.get("https", "")
            for scheme in ("http://", "https://"):
                if raw_proxy.startswith(scheme):
                    raw_proxy = raw_proxy[len(scheme):]
                    break
            proxy = raw_proxy or None

        browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)

        url = self.url
        if self.requests_kwargs.get("params"):
            url = tools.joint_url(self.url, self.requests_kwargs.get("params"))

        try:
            browser.get(url)
            if cookies:
                browser.cookies = cookies
            if self.render_time:
                tools.delay_time(self.render_time)
            html = browser.page_source
            response = Response.from_dict(
                {
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent": browser.execute_script(
                            "return navigator.userAgent"
                        ),
                        "Cookie": tools.cookies2str(browser.cookies),
                    },
                }
            )
            response.browser = browser
        except Exception:
            # A failed render may leave the browser in a broken state; drop
            # it from the pool instead of returning it for reuse.
            self._webdriver_pool.remove(browser)
            raise
    elif use_session:
        response = self._session.request(method, self.url, **self.requests_kwargs)
        response = Response(response)
    else:
        response = requests.request(method, self.url, **self.requests_kwargs)
        response = Response(response)

    if save_cached:
        self.save_cached(response, expire_time=self.__class__.cached_expire_time)

    return response
def get_response(self, save_cached=False):
    """
    Download the request and return a Response with selector support.

    @param save_cached: save the response to the cache so that repeated
                        debug runs do not re-download it
    @return: Response
    """
    # Default timeout; applies to both the connect and the read phase.
    self.requests_kwargs.setdefault("timeout", 22)  # connect=22 read=22

    # stream=True defers downloading the body until Response.content is
    # accessed; only the headers are fetched up front. Drawback: the
    # connection is not released back to the pool until all data is consumed
    # or Response.close() is called.
    self.requests_kwargs.setdefault("stream", True)

    # Disable TLS certificate verification.
    self.requests_kwargs.setdefault("verify", False)

    # Resolve the HTTP method: an explicit value wins; otherwise POST when a
    # request body is present, else GET.
    # BUGFIX: a "json" body previously did not trigger POST, so JSON
    # requests were sent as GET; check both body kinds, consistent with the
    # sibling get_response implementation.
    method = self.__dict__.get("method")
    if not method:
        if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
            method = "POST"
        else:
            method = "GET"

    # Random user-agent: only fill one in when the caller supplied none.
    headers = self.requests_kwargs.get("headers", {})
    if "user-agent" not in headers and "User-Agent" not in headers:
        if self.random_user_agent and setting.RANDOM_HEADERS:
            headers.update(
                {"User-Agent": self.__class__.user_agent_pool.get()}
            )
            self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
            )

    # Proxy: the -1 sentinel means the caller supplied no "proxies" key at
    # all; in that case block until the pool yields a usable proxy.
    proxies = self.requests_kwargs.get("proxies", -1)
    if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
        while True:
            proxies = self.__class__.proxies_pool.get()
            if proxies:
                self.requests_kwargs.update(proxies=proxies)
                break
            else:
                log.debug("暂无可用代理 ...")

    log.debug(
        """
            -------------- %srequest for ----------------
            url  = %s
            method = %s
            body = %s
            """
        % (
            ""
            if not self.parser_name
            else "%s.%s "
            % (
                self.parser_name,
                (
                    self.callback
                    and callable(self.callback)
                    and getattr(self.callback, "__name__")
                    or self.callback
                )
                or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        )
    )

    # def hooks(response, *args, **kwargs):
    #     print(response.url)
    #
    # self.requests_kwargs.update(hooks={'response': hooks})

    # self.use_session takes precedence over the global setting.
    use_session = (
        setting.USE_SESSION if self.use_session is None else self.use_session
    )

    if self.render:
        browser = self._webdriver_pool.get()
        try:
            browser.get(self.url)
            html = browser.page_source
            response = Response.from_dict(
                {
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "text": html,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent": browser.execute_script(
                            "return navigator.userAgent"
                        )
                    },
                }
            )
            response._cached_text = html
            # The browser is released as soon as rendering finishes, so it
            # must not be bound to the response.
            self._webdriver_pool.put(browser)
        except Exception:
            # A failed render may leave the browser in a broken state; drop
            # it from the pool instead of returning it for reuse.
            self._webdriver_pool.remove(browser)
            raise
    elif use_session:
        response = self._session.request(method, self.url, **self.requests_kwargs)
        response = Response(response)
    else:
        response = requests.request(method, self.url, **self.requests_kwargs)
        response = Response(response)

    if save_cached:
        self.save_cached(response, expire_time=self.__class__.cached_expire_time)

    return response