コード例 #1
0
ファイル: session_page.py プロジェクト: nntx/DrissionPage
    def _make_response(self,
                       url: str,
                       mode: str = 'get',
                       data: dict = None,
                       show_errmsg: bool = False,
                       **kwargs) -> tuple:
        """Send a GET or POST request and return the response with its encoding set.

        ``Host`` and ``Referer`` headers are filled in when the caller did not
        supply them, and ``self.timeout`` is used when no timeout is given.

        :param url: target url
        :param mode: either 'get' or 'post'
        :param data: data to submit when mode is 'post'
        :param show_errmsg: if True, re-raise request exceptions instead of returning them
        :param kwargs: extra keyword arguments forwarded to the session's get/post call
        :return: ``(response, 'Success')`` on success, ``(None, exception)`` on failure
        :raises ValueError: if mode is neither 'get' nor 'post'
        """
        if mode not in ('get', 'post'):
            raise ValueError("Argument mode can only be 'get' or 'post'.")
        url = quote(url, safe='/:&?=%;#@')

        # NOTE(review): .hostname omits any explicit port in the url; confirm
        # that a port-less Host header is what is wanted here.
        hostname = urlparse(url).hostname

        # Build the request headers on a copy so that neither the caller's
        # mapping nor the session's shared default headers are modified.
        supplied_headers = kwargs.get('headers')
        if supplied_headers is not None:
            request_headers = dict(supplied_headers)
            header_names = {name.lower() for name in request_headers}
            if self.url and 'referer' not in header_names:
                request_headers['Referer'] = self.url
            if 'host' not in header_names:
                request_headers['Host'] = hostname
        else:
            # copy() keeps the session's own mapping type, so overwriting
            # 'Host' behaves exactly as it did on the original object.
            request_headers = self.session.headers.copy()
            request_headers['Host'] = hostname
            if self.url:
                request_headers['Referer'] = self.url
        kwargs['headers'] = request_headers

        kwargs.setdefault('timeout', self.timeout)

        try:
            if mode == 'get':
                r = self.session.get(url, **kwargs)
            else:
                r = self.session.post(url, data=data, **kwargs)
        except Exception as e:
            if show_errmsg:
                raise  # bare raise keeps the original traceback
            return None, e

        # Header names are case-insensitive, so look Content-Type up that way.
        content_type = next((value for name, value in r.headers.items()
                             if name.lower() == 'content-type'), '')

        # Take only the charset parameter's value: stop at ';', quotes or
        # whitespace so trailing parameters and quoting do not leak in.
        header_match = re_SEARCH(r'(?i)charset\s*=\s*["\']?([^;"\'\s]+)',
                                 content_type)
        if header_match:
            charset = header_match.group(1)
        else:
            # No usable charset in the headers: look for a <meta> declaration
            # in the body and fall back to utf-8 when there is none.
            meta_match = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>',
                                   r.text)
            charset = meta_match.group(1) if meta_match else 'utf-8'

        # Escape backspace characters, which otherwise cause garbled text or
        # parse errors; the body is left untouched when streaming was requested.
        if kwargs.get('stream'):
            r._content = r.content
        else:
            r._content = r.content.replace(b'\x08', b'\\b')
        r.encoding = charset
        return r, 'Success'
コード例 #2
0
    def _make_response(self,
                       url: str,
                       mode: str = 'get',
                       data: dict = None,
                       show_errmsg: bool = False,
                       **kwargs) -> tuple:
        """Send a GET or POST request and return the response with its encoding set.

        ``Host`` and ``Referer`` headers are filled in when the caller did not
        supply them, and ``self.timeout`` is used when no timeout is given.
        The body is only inspected or modified when the request is not streamed.

        :param url: target url
        :param mode: either 'get' or 'post'
        :param data: data to submit when mode is 'post'
        :param show_errmsg: if True, re-raise request exceptions instead of returning them
        :param kwargs: extra keyword arguments forwarded to the session's get/post call
        :return: tuple; first item is the Response or None, second item is the
                 exception or 'Success'
        :raises ValueError: if mode is neither 'get' nor 'post'
        """
        if mode not in ('get', 'post'):
            raise ValueError("Argument mode can only be 'get' or 'post'.")
        url = quote(url, safe='/:&?=%;#@+')

        # NOTE(review): .hostname omits any explicit port in the url; confirm
        # that a port-less Host header is what is wanted here.
        hostname = urlparse(url).hostname

        # Build the request headers on a copy so that neither the caller's
        # mapping nor the session's shared default headers are modified.
        supplied_headers = kwargs.get('headers')
        if supplied_headers is not None:
            request_headers = dict(supplied_headers)
            header_names = {name.lower() for name in request_headers}
            if self.url and 'referer' not in header_names:
                request_headers['Referer'] = self.url
            if 'host' not in header_names:
                request_headers['Host'] = hostname
        else:
            # copy() keeps the session's own mapping type, so overwriting
            # 'Host' behaves exactly as it did on the original object.
            request_headers = self.session.headers.copy()
            request_headers['Host'] = hostname
            if self.url:
                request_headers['Referer'] = self.url
        kwargs['headers'] = request_headers

        kwargs.setdefault('timeout', self.timeout)

        try:
            if mode == 'get':
                r = self.session.get(url, **kwargs)
            else:
                r = self.session.post(url, data=data, **kwargs)
        except Exception as e:
            if show_errmsg:
                raise  # bare raise keeps the original traceback
            return None, e

        # Header names are case-insensitive, so look Content-Type up that way.
        content_type = next((value for name, value in r.headers.items()
                             if name.lower() == 'content-type'), '')

        # The body may only be touched when neither this call nor the session
        # asked for streaming.
        stream_keys = tuple(key for key in kwargs if key.lower() == 'stream')
        not_stream = ((not stream_keys or not kwargs[stream_keys[0]])
                      and not self.session.stream)

        charset = None
        # Take only the charset parameter's value: stop at ';', quotes or
        # whitespace so trailing parameters and quoting do not leak in.
        header_match = re_SEARCH(r'(?i)charset\s*=\s*["\']?([^;"\'\s]+)',
                                 content_type)
        if header_match:
            charset = header_match.group(1)
        elif not_stream:
            # Search the raw leading bytes for a <meta> charset declaration.
            # Working on bytes avoids decoding a body whose encoding is not
            # known yet, and slicing copes with an empty body.
            meta_match = re_SEARCH(
                rb'<meta.*?charset=[ \'"]*([^"\' />]+).*?>',
                r.content[:512])
            if meta_match:
                charset = meta_match.group(1).decode('ascii', 'ignore')
            if not charset:
                charset = r.apparent_encoding

        if not_stream:  # repair the body when loading a web page
            # Escape backspace characters, which otherwise cause garbled text
            # or parse errors.
            r._content = r.content.replace(b'\x08', b'\\b')
            # Work around requests_html losing the encoding.
            # NOTE(review): this runs before r.encoding is updated below, so
            # r.html receives the pre-detection encoding; confirm the order.
            r.html.encoding = r.encoding
        if charset:
            r.encoding = charset
        return r, 'Success'
コード例 #3
0
    def _make_response(self,
                       url: str,
                       mode: str = 'get',
                       data: dict = None,
                       show_errmsg: bool = False,
                       **kwargs) -> tuple:
        """Send a GET or POST request and return the response with its encoding set.

        ``Host`` and ``Referer`` headers are filled in when the caller did not
        supply them, and ``self.timeout`` is used when no timeout is given.

        :param url: target url
        :param mode: either 'get' or 'post'
        :param data: data to submit when mode is 'post'
        :param show_errmsg: if True, raise errors instead of returning them
        :param kwargs: extra keyword arguments forwarded to the session's get/post call
        :return: tuple; first item is the Response or None, second item is the
                 error (message or exception) or 'Success'
        :raises ValueError: if mode is neither 'get' nor 'post', or if url is
                            empty while show_errmsg is True
        """
        if not url:
            if show_errmsg:
                raise ValueError('url is empty.')
            return None, 'url is empty.'

        if mode not in ('get', 'post'):
            raise ValueError("Argument mode can only be 'get' or 'post'.")

        url = quote(url, safe='/:&?=%;#@+')

        # NOTE(review): .hostname omits any explicit port in the url; confirm
        # that a port-less Host header is what is wanted here.
        hostname = urlparse(url).hostname

        # Build the request headers on a copy so that neither the caller's
        # mapping nor the session's shared default headers are modified.
        supplied_headers = kwargs.get('headers')
        if supplied_headers is not None:
            request_headers = dict(supplied_headers)
            header_names = {name.lower() for name in request_headers}

            if self.url and 'referer' not in header_names:
                request_headers['Referer'] = self.url

            if 'host' not in header_names:
                request_headers['Host'] = hostname

        else:
            # copy() keeps the session's own mapping type, so overwriting
            # 'Host' behaves exactly as it did on the original object.
            request_headers = self.session.headers.copy()
            request_headers['Host'] = hostname

            if self.url:
                request_headers['Referer'] = self.url

        kwargs['headers'] = request_headers

        kwargs.setdefault('timeout', self.timeout)

        try:
            if mode == 'get':
                r = self.session.get(url, **kwargs)
            else:
                r = self.session.post(url, data=data, **kwargs)

        except Exception as e:
            if show_errmsg:
                raise  # bare raise keeps the original traceback

            return None, e

        # ---------------- detect and set the encoding: begin ----------------
        # First try the charset parameter of the Content-Type header. The
        # value ends at ';', a quote or whitespace; no trailing ';' is
        # required, so 'text/html; charset=utf-8' is recognised.
        content_type = r.headers.get('content-type', '').lower()
        header_match = re.search(r'charset[=: ]*["\']?([^;"\'\s]+)',
                                 content_type)

        if header_match:
            r.encoding = header_match.group(1)

        # No charset in the headers and the response is a web page: look for
        # a <meta> declaration, otherwise let the response guess.
        elif content_type.replace(' ', '').startswith('text/html'):
            meta_match = re_SEARCH(
                b'<meta.*?charset=[ \\\'"]*([^"\\\' />]+).*?>', r.content)

            # Ignore non-ASCII bytes so a malformed declaration cannot raise.
            meta_charset = (meta_match.group(1).decode('ascii', 'ignore')
                            if meta_match else '')

            r.encoding = meta_charset or r.apparent_encoding
        # ---------------- detect and set the encoding: end ------------------

        return r, 'Success'