def _make_response(self, url: str, mode: str = 'get', data: dict = None, show_errmsg: bool = False, **kwargs) -> tuple:
    """Send a GET or POST request through ``self.session`` and set the response encoding.

    :param url: target url; it is percent-quoted before the request is sent
    :param mode: either 'get' or 'post'
    :param data: data submitted when mode is 'post'
    :param show_errmsg: if True, re-raise an exception raised by the request instead of returning it
    :param kwargs: extra keyword arguments forwarded to ``self.session.get`` / ``self.session.post``
    :return: tuple ``(response, 'Success')`` on success, ``(None, exception)`` when the request
             raised and ``show_errmsg`` is False
    :raises ValueError: if mode is neither 'get' nor 'post'
    """
    if mode not in ('get', 'post'):
        raise ValueError("Argument mode can only be 'get' or 'post'.")

    url = quote(url, safe='/:&?=%;#@')
    hostname = urlparse(url).hostname

    # Build the request headers on a copy: writing Host/Referer straight into
    # self.session.headers (or into the caller's dict) would leak them into
    # every later request that reuses the same object.
    if kwargs.get('headers') is not None:
        headers = dict(kwargs['headers'])
        present = {name.lower() for name in headers}
        if self.url and 'referer' not in present:
            headers['Referer'] = self.url
        if 'host' not in present:
            headers['Host'] = hostname
    else:
        headers = self.session.headers.copy()
        headers['Host'] = hostname
        if self.url:
            headers['Referer'] = self.url
    kwargs['headers'] = headers
    kwargs.setdefault('timeout', self.timeout)

    try:
        if mode == 'get':
            r = self.session.get(url, **kwargs)
        else:
            r = self.session.post(url, data=data, **kwargs)
    except Exception as e:
        if show_errmsg:
            raise
        return None, e

    # Header names are matched case-insensitively; the charset is taken up to
    # the next ';' so that trailing parameters are not swallowed.
    content_type = next((value for name, value in r.headers.items() if name.lower() == 'content-type'), '')
    declared = re_SEARCH(r'(?i)charset\s*=\s*["\']?([^;"\'\s]+)', content_type)
    if declared:
        charset = declared.group(1)
    else:
        # No charset in the headers: look for a <meta charset=...> tag, else assume utf-8.
        meta = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', r.text)
        charset = meta.group(1) if meta else 'utf-8'

    # Replace backspace bytes with a literal "\b" so they cannot garble the text
    # or break parsing; the body is left untouched when stream was requested.
    body = r.content
    r._content = body if kwargs.get('stream') else body.replace(b'\x08', b'\\b')
    r.encoding = charset
    return r, 'Success'
def _make_response(self, url: str, mode: str = 'get', data: dict = None, show_errmsg: bool = False, **kwargs) -> tuple:
    """Send a GET or POST request through ``self.session`` and set the response encoding.

    :param url: target url; it is percent-quoted before the request is sent
    :param mode: either 'get' or 'post'
    :param data: data submitted when mode is 'post'
    :param show_errmsg: if True, re-raise an exception raised by the request instead of returning it
    :param kwargs: extra keyword arguments forwarded to ``self.session.get`` / ``self.session.post``
    :return: tuple; the first item is the response or None, the second is the exception
             raised by the request or 'Success'
    :raises ValueError: if mode is neither 'get' nor 'post'
    """
    if mode not in ('get', 'post'):
        raise ValueError("Argument mode can only be 'get' or 'post'.")

    url = quote(url, safe='/:&?=%;#@+')
    hostname = urlparse(url).hostname

    # Build the request headers on a copy: writing Host/Referer straight into
    # self.session.headers (or into the caller's dict) would leak them into
    # every later request that reuses the same object.
    if kwargs.get('headers') is not None:
        headers = dict(kwargs['headers'])
        present = {name.lower() for name in headers}
        if self.url and 'referer' not in present:
            headers['Referer'] = self.url
        if 'host' not in present:
            headers['Host'] = hostname
    else:
        headers = self.session.headers.copy()
        headers['Host'] = hostname
        if self.url:
            headers['Referer'] = self.url
    kwargs['headers'] = headers
    kwargs.setdefault('timeout', self.timeout)

    try:
        if mode == 'get':
            r = self.session.get(url, **kwargs)
        else:
            r = self.session.post(url, data=data, **kwargs)
    except Exception as e:
        if show_errmsg:
            raise
        return None, e

    # The body is only inspected or rewritten when neither the call nor the
    # session asked for streaming.
    stream_arg = next((value for name, value in kwargs.items() if name.lower() == 'stream'), None)
    not_stream = not stream_arg and not self.session.stream

    # Header names are matched case-insensitively; the charset is taken up to
    # the next ';' so that trailing parameters are not swallowed.
    content_type = next((value for name, value in r.headers.items() if name.lower() == 'content-type'), '')
    declared = re_SEARCH(r'(?i)charset\s*=\s*["\']?([^;"\'\s]+)', content_type)

    charset = None
    if declared:
        charset = declared.group(1)
    elif not_stream:
        # Sniff a <meta charset=...> tag in the first 512 bytes. Undecodable
        # bytes are dropped so that an empty body, a non-UTF-8 page or a
        # multibyte character cut at the boundary cannot raise here.
        head = r.content[:512].decode('utf-8', errors='ignore')
        meta = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', head)
        charset = meta.group(1) if meta else r.apparent_encoding

    if not_stream:
        # Replace backspace bytes with a literal "\b" so they cannot garble the
        # text or break parsing.
        r._content = r.content.replace(b'\x08', b'\\b')
        # Workaround for requests_html losing the encoding.
        # NOTE(review): this copies r.encoding *before* it is updated with the
        # detected charset below - confirm that ordering is intended.
        r.html.encoding = r.encoding

    if charset:
        r.encoding = charset
    return r, 'Success'
def _make_response(self, url: str, mode: str = 'get', data: dict = None, show_errmsg: bool = False, **kwargs) -> tuple:
    """Send a GET or POST request through ``self.session`` and set the response encoding.

    :param url: target url; it is percent-quoted before the request is sent
    :param mode: either 'get' or 'post'
    :param data: data submitted when mode is 'post'
    :param show_errmsg: if True, raise on an empty url and re-raise an exception raised by
                        the request instead of returning it
    :param kwargs: extra keyword arguments forwarded to ``self.session.get`` / ``self.session.post``
    :return: tuple; the first item is the response or None, the second is the error
             (message string or exception) or 'Success'
    :raises ValueError: if mode is neither 'get' nor 'post', or if url is empty and
                        show_errmsg is True
    """
    if not url:
        if show_errmsg:
            raise ValueError('url is empty.')
        return None, 'url is empty.'

    if mode not in ('get', 'post'):
        raise ValueError("Argument mode can only be 'get' or 'post'.")

    url = quote(url, safe='/:&?=%;#@+')
    hostname = urlparse(url).hostname

    # Build the request headers on a copy: writing Host/Referer straight into
    # self.session.headers (or into the caller's dict) would leak them into
    # every later request that reuses the same object.
    if kwargs.get('headers') is not None:
        headers = dict(kwargs['headers'])
        present = {name.lower() for name in headers}
        if self.url and 'referer' not in present:
            headers['Referer'] = self.url
        if 'host' not in present:
            headers['Host'] = hostname
    else:
        headers = self.session.headers.copy()
        headers['Host'] = hostname
        if self.url:
            headers['Referer'] = self.url
    kwargs['headers'] = headers
    kwargs.setdefault('timeout', self.timeout)

    try:
        if mode == 'get':
            r = self.session.get(url, **kwargs)
        else:
            r = self.session.post(url, data=data, **kwargs)
    except Exception as e:
        if show_errmsg:
            raise
        return None, e

    # ---------------- detect and set the encoding ----------------
    content_type = r.headers.get('content-type', '').lower()
    # Take the charset value up to the next ';', quote or whitespace. A trailing
    # ';' is not required, so the common "text/html; charset=utf-8" is honoured.
    declared = re.search(r'charset[=: ]*["\']?([^;\s"\']+)', content_type)
    if declared:
        r.encoding = declared.group(1)
    # No charset in the headers and the response is a web page: look for a
    # <meta charset=...> tag, falling back to the response's apparent encoding.
    elif content_type.replace(' ', '').startswith('text/html'):
        meta = re_SEARCH(b'<meta.*?charset=[ \\\'"]*([^"\\\' />]+).*?>', r.content)
        # Charset names are ASCII; dropping other bytes keeps a malformed tag
        # from raising UnicodeDecodeError.
        sniffed = meta.group(1).decode('ascii', errors='ignore') if meta else ''
        r.encoding = sniffed or r.apparent_encoding

    return r, 'Success'