def transformed_response_body(
        response: Response,
        html_transform: Callable[[BeautifulSoup, str, ProxyUrl], None],
        proxy_url: ProxyUrl) -> Tuple[bool, bytes]:
    """Rewrite a response body according to its declared content type.

    * ``text/html``: the body is parsed with BeautifulSoup, handed to
      ``html_transform`` (together with the document base URL and
      ``proxy_url``), a ``<meta charset="utf8">`` tag is appended to
      ``<head>`` when one exists, and the document is re-encoded as utf8.
    * ``text/css``: the body is decoded as utf8 (undecodable bytes are
      dropped) and passed through ``process_css``.
    * anything else: the body is returned untouched.

    Returns a ``(flag, body_bytes)`` pair; the flag is ``True`` only for the
    HTML branch.
    """
    raw_body = response.body or b''
    mime = (response.headers or {}).get('content-type', '')

    if mime.startswith('text/html'):
        charset = http_content_type_encoding(mime)
        try:
            document_base = get_base_url(raw_body, response.url, charset)
        except UnicodeDecodeError:
            # Fall back to the response URL when the body cannot be decoded.
            document_base = response.url
        document = BeautifulSoup(raw_body, 'lxml', from_encoding=charset)
        html_transform(document, base_url=document_base, proxy_url=proxy_url)
        head_tag = document.find('head')
        if head_tag:
            # The output below is always utf8, so declare that in the page.
            charset_meta = document.new_tag('meta', charset='utf8')
            head_tag.append(charset_meta)
        return True, document.encode('utf8')

    if mime.startswith('text/css'):
        stylesheet = raw_body.decode('utf8', 'ignore')
        rewritten = process_css(
            stylesheet, base_uri=response.url, proxy_url=proxy_url)
        return False, rewritten.encode('utf8')

    return False, raw_body
def _headers_encoding(self):
    """
    Get the charset encoding declared in the response headers.

    Looks up ``Content-Type`` (falling back to the lowercase
    ``content-type`` key) in ``self.headers``.

    Returns:
        ``None`` when the header is missing or empty, or when it does not
        contain ``application/json``. For JSON responses, the value returned
        by ``http_content_type_encoding`` for the header, or ``"utf-8"`` when
        that value is falsy.
    """
    content_type = self.headers.get("Content-Type") or self.headers.get(
        "content-type")
    if not content_type:
        # Without this guard the membership test below raises TypeError
        # on None when the response carries no Content-Type header.
        return None
    if "application/json" not in content_type:
        # NOTE(review): a charset declared on a non-JSON Content-Type is
        # ignored here, exactly as in the original expression
        # ``(A or "utf-8") if json else None`` -- confirm this is intended.
        return None
    return http_content_type_encoding(content_type) or "utf-8"
def process_request_unsafe(self, request, spider):
    """Load ``request.url`` in the spider's session and build an HtmlResponse.

    The encoding is taken from the ``Content-Type`` header; if that yields
    nothing, the same helper is applied to the body, and ``'utf-8'`` is the
    last resort. Returns ``None`` when the session produced no body.
    """
    session = spider.session
    session.visit(request.url)
    session.wait()

    body = session.body()
    raw_headers = session.headers()
    # Normalise header names to ``str`` keys.
    headers = {str(name): raw_headers[name] for name in raw_headers}

    encoding = http_content_type_encoding(headers.get("Content-Type"))
    if encoding is None:
        encoding = http_content_type_encoding(body)
        if encoding is None:
            encoding = 'utf-8'

    if body is None:
        return None

    return HtmlResponse(
        session.url(),
        body=body,
        encoding=encoding,
        headers={str(name): headers[name] for name in headers},
    )
def __call__(self, session, url, *args, **kwargs):
    """Visit ``url`` with ``session`` and wrap the result in an HtmlPage.

    The session is stored on ``self.session``. The page encoding is whatever
    ``http_content_type_encoding`` extracts from the ``Content-Type`` header.
    Extra positional and keyword arguments are accepted but unused.
    """
    self.session = session
    session.visit(url)
    session.wait()

    page_body = session.body()
    raw_headers = session.headers()
    # Copy the session's header mapping into a plain dict.
    header_map = {name: raw_headers[name] for name in raw_headers}
    page_encoding = http_content_type_encoding(header_map.get("Content-Type"))

    return HtmlPage(
        session.url(),
        headers=header_map,
        body=page_body,
        encoding=page_encoding,
    )
def download_url(url: str, timeout: int=None) -> str:
    """Fetch a remote url and return its body as text.

    ``timeout`` falls back to the configured ``TIMEOUT`` when falsy. If an
    encoding is declared in the ``Content-Type`` header, or failing that in
    the body itself, it overrides the encoding chosen by ``requests`` before
    the text is returned.
    """
    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT

    request_options = {
        'headers': {'User-Agent': WGET_USER_AGENT},
        'verify': CHECK_SSL_VALIDITY,
        'timeout': timeout or TIMEOUT,
    }
    response = requests.get(url, **request_options)

    declared = http_content_type_encoding(
        response.headers.get('Content-Type', ''))
    if not declared:
        # Header gave nothing usable; look for a declaration in the body.
        declared = html_body_declared_encoding(response.text)

    if declared is not None:
        response.encoding = declared
    return response.text
def _headers_encoding(self):
    """Return the encoding extracted from the ``Content-Type`` header.

    The header is looked up by its bytes key (defaulting to ``b''``) and
    converted with ``to_native_str`` before being parsed.
    """
    raw_value = self.headers.get(b'Content-Type', b'')
    header_text = to_native_str(raw_value)
    return http_content_type_encoding(header_text)
def _headers_encoding(cls, response):
    """
    Determine the encoding from the response's Content-Type header.
    """
    header_value = response.headers.get('Content-Type')
    encoding = http_content_type_encoding(header_value)
    return encoding
def _headers_encoding(self):
    """Parse the encoding out of this object's ``Content-Type`` header.

    A missing header is treated as an empty bytes value.
    """
    header_bytes = self.headers.get(b'Content-Type', b'')
    return http_content_type_encoding(
        to_native_str(header_bytes)
    )
def _infer_encoding_from_content_type(self):
    """Return the encoding parsed from the ``Content-Type`` header.

    Returns ``None`` when the header is absent or empty.
    """
    header_value = self.headers.get("Content-Type")
    if not header_value:
        return None
    return http_content_type_encoding(header_value)
def _headers_encoding(self):
    """Return the encoding extracted from the ``Content-Type`` header."""
    header_value = self.headers.get('Content-Type')
    encoding = http_content_type_encoding(header_value)
    return encoding
def _headers_encoding(self):
    """Return the encoding extracted from the ``Content-Type`` header.

    The header is read by its bytes key (defaulting to ``b""``) and
    converted with ``to_unicode`` before being parsed.
    """
    raw_value = self.headers.get(b"Content-Type", b"")
    header_text = to_unicode(raw_value)
    return http_content_type_encoding(header_text)
def _headers_encoding(cls, response):
    """
    Look up the encoding type based on the response's Content-Type header.
    """
    return http_content_type_encoding(
        response.headers.get('Content-Type')
    )
def _headers_encoding(self):
    """Parse the encoding out of this object's ``Content-Type`` header."""
    return http_content_type_encoding(
        self.headers.get('Content-Type')
    )
def test_http_encoding_header(self):
    """A charset in the header is extracted; other text yields ``None``."""
    parsed = http_content_type_encoding(
        "Content-Type: text/html; charset=ISO-8859-4")
    self.assertEqual(parsed, "iso8859-4")

    unparsed = http_content_type_encoding("something else")
    self.assertEqual(None, unparsed)
def test_http_encoding_header(self):
    """Check charset extraction from a header line and the no-match case."""
    with_charset = "Content-Type: text/html; charset=ISO-8859-4"
    without_charset = "something else"

    self.assertEqual(http_content_type_encoding(with_charset), "iso8859-4")
    self.assertEqual(None, http_content_type_encoding(without_charset))
def guess_coding(body):
    """Guess the encoding of ``body`` using chardet.

    The detected name is wrapped in a ``charset=...`` string and passed
    through ``http_content_type_encoding``, whose result is returned.
    """
    detected = chardet.detect(body)["encoding"]
    pseudo_header = f'charset={detected}'
    return http_content_type_encoding(pseudo_header)