Beispiel #1
0
    def get_response_from_cached(self, save_cached=True):
        """
        从缓存中获取response
        注意:
            属性值为空:
                -raw : urllib3.response.HTTPResponse
                -connection:requests.adapters.HTTPAdapter
                -history

            属性含义改变:
                - request 由requests 改为Request
        @param: save_cached 当无缓存 直接下载 下载完是否保存缓存
        @return:
        """
        response_dict = self._cache_db.strget(self._cached_redis_key)
        if not response_dict:
            log.info("无response缓存  重新下载")
            response_obj = self.get_response(save_cached=save_cached)
        else:
            response_dict = eval(response_dict)
            response_obj = Response.from_dict(response_dict)
        return response_obj
Beispiel #2
0
    def get_response(self, save_cached=False):
        """
        获取带有selector功能的response
        @param save_cached: 保存缓存 方便调试时不用每次都重新下载
        @return:
        """
        # 设置超时默认时间
        self.requests_kwargs.setdefault(
            "timeout", setting.REQUEST_TIMEOUT)  # connect=22 read=22

        # 设置stream
        # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
        self.requests_kwargs.setdefault("stream", True)

        # 关闭证书验证
        self.requests_kwargs.setdefault("verify", False)

        # 设置请求方法
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # 随机user—agent
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.render:  # 如果是渲染默认,优先使用WEBDRIVER中配置的ua
                ua = setting.WEBDRIVER.get(
                    "user_agent") or self.__class__.user_agent_pool.get(
                        setting.USER_AGENT_TYPE)
            else:
                ua = self.__class__.user_agent_pool.get(
                    setting.USER_AGENT_TYPE)

            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update({"User-Agent": ua})
                self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT})

        # 代理
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
            while True:
                proxies = self._proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        log.debug("""
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """ % (
            "" if not self.parser_name else "%s.%s " % (
                self.parser_name,
                (self.callback and callable(self.callback) and getattr(
                    self.callback, "__name__") or self.callback) or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        ))

        # def hooks(response, *args, **kwargs):
        #     print(response.url)
        #
        # self.requests_kwargs.update(hooks={'response': hooks})

        use_session = (setting.USE_SESSION if self.use_session is None else
                       self.use_session)  # self.use_session 优先级高

        if self.render:
            # 使用request的user_agent、cookies、proxy
            user_agent = headers.get("User-Agent") or headers.get("user-agent")
            cookies = self.requests_kwargs.get("cookies")
            if cookies and isinstance(cookies, RequestsCookieJar):
                cookies = cookies.get_dict()

            if not cookies:
                cookie_str = headers.get("Cookie") or headers.get("cookie")
                if cookie_str:
                    cookies = tools.get_cookies_from_str(cookie_str)

            proxy = None
            if proxies and proxies != -1:
                proxy = proxies.get("http",
                                    "").strip("http://") or proxies.get(
                                        "https", "").strip("https://")

            browser = self._webdriver_pool.get(user_agent=user_agent,
                                               proxy=proxy)

            url = self.url
            if self.requests_kwargs.get("params"):
                url = tools.joint_url(self.url,
                                      self.requests_kwargs.get("params"))

            try:
                browser.get(url)
                if cookies:
                    browser.cookies = cookies
                if self.render_time:
                    tools.delay_time(self.render_time)

                html = browser.page_source
                response = Response.from_dict({
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent":
                        browser.execute_script("return navigator.userAgent"),
                        "Cookie":
                        tools.cookies2str(browser.cookies),
                    },
                })

                response.browser = browser
            except Exception as e:
                self._webdriver_pool.remove(browser)
                raise e

        elif use_session:
            response = self._session.request(method, self.url,
                                             **self.requests_kwargs)
            response = Response(response)
        else:
            response = requests.request(method, self.url,
                                        **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response,
                             expire_time=self.__class__.cached_expire_time)

        return response
Beispiel #3
0
    def get_response(self, save_cached=False):
        """
        获取带有selector功能的response
        @param save_cached: 保存缓存 方便调试时不用每次都重新下载
        @return:
        """
        # 设置超时默认时间
        self.requests_kwargs.setdefault("timeout", 22)  # connect=22 read=22

        # 设置stream
        # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
        self.requests_kwargs.setdefault("stream", True)

        # 关闭证书验证
        self.requests_kwargs.setdefault("verify", False)

        # 设置请求方法
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # 随机user—agent
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update(
                    {"User-Agent": self.__class__.user_agent_pool.get()})
                self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT})

        # 代理
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
            while True:
                proxies = self.__class__.proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        log.debug("""
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """ % (
            "" if not self.parser_name else "%s.%s " % (
                self.parser_name,
                (self.callback and callable(self.callback) and getattr(
                    self.callback, "__name__") or self.callback) or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        ))

        # def hooks(response, *args, **kwargs):
        #     print(response.url)
        #
        # self.requests_kwargs.update(hooks={'response': hooks})

        use_session = (setting.USE_SESSION if self.use_session is None else
                       self.use_session)  # self.use_session 优先级高

        if self.render:
            browser = self._webdriver_pool.get()

            try:
                browser.get(self.url)
                html = browser.page_source
                response = Response.from_dict({
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "text": html,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent":
                        browser.execute_script("return navigator.userAgent")
                    },
                })

                response._cached_text = html
                # response.browser = browser # 因为浏览器渲染完就释放了,所以不能绑定到response上
                self._webdriver_pool.put(browser)
            except Exception as e:
                self._webdriver_pool.remove(browser)
                raise e

        elif use_session:
            response = self._session.request(method, self.url,
                                             **self.requests_kwargs)
            response = Response(response)
        else:
            response = requests.request(method, self.url,
                                        **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response,
                             expire_time=self.__class__.cached_expire_time)

        return response