def make_UnicodeDammit(raw_html, http_headers=None, **kwargs):
    """Build a UnicodeDammit instance for a piece of encoded HTML.

    When HTTP response headers are supplied and yield a Content-Type value,
    that value is parsed to decide whether the payload is HTML and to pick
    up a charset, which is then offered to UnicodeDammit as one of the
    encodings to try first.

    The encodings handed to UnicodeDammit are collected in this order,
    skipping any that are unavailable:

      1. the encoding implied by a byte-order mark (the BOM itself is
         stripped from the data before it is passed on),
      2. the encoding declared inside the document,
      3. the charset taken from the Content-Type header.

    @param raw_html: the binary (i.e. still encoded) HTML data
    @param http_headers: the HTTP response headers (dict; optional)
    @param kwargs: forwarded unchanged to the UnicodeDammit constructor
    @return: a UnicodeDammit instance
    """
    # Without a usable Content-Type the payload is not treated as HTML and
    # there is no transport-level charset to suggest.
    is_html, charset = False, None
    content_type = get_content_type(http_headers)
    if content_type:
        parsed_header = ContentTypeHeader(content_type)
        is_html = parsed_header.is_html
        charset = parsed_header.charset

    raw_html, bom_encoding = EncodingDetector.strip_byte_order_mark(raw_html)
    declared_encoding = EncodingDetector.find_declared_encoding(
        raw_html,
        is_html=is_html,
        search_entire_document=True,
    )

    # BOM first, then the in-document declaration; the header charset is
    # appended last and only when it is non-empty.
    candidates = [
        encoding
        for encoding in (bom_encoding, declared_encoding)
        if encoding is not None
    ]
    if charset:
        candidates.append(charset)

    return UnicodeDammit(
        raw_html,
        is_html=is_html,
        override_encodings=candidates,
        **kwargs
    )
def iterhtmldecode(
    iterable: Iterable[AnyStr],
    http_charset: Optional[str] = None,
    default_encoding: str = 'cp1252',
    errors: str = 'replace',
    scan_window: int = 1024,
) -> Iterator[str]:
    """
    .. versionadded:: 0.7.0

    Decode an HTML document that arrives as an iterable of `bytes` chunks,
    working out the document's encoding while consuming as little of the
    iterable as necessary and then decoding the chunks one at a time.

    As a convenience, an iterable of `str` objects is passed through with
    its elements unmodified.

    The encoding is chosen by a vastly simplified form of the WHATWG's
    `"Determining the Character Encoding" specification <encspec_>`_: the
    following sources are consulted in order, and the first one that yields
    an encoding wins:

    - byte-order mark
    - HTTP charset
    - encoding declared in document
    - default encoding

    .. _encspec: https://html.spec.whatwg.org/multipage/parsing.html
                 #determining-the-character-encoding

    :param Iterable[AnyStr] iterable:
        an iterable of either `bytes` or `str` that, when joined together,
        form an HTML document
    :param Optional[str] http_charset:
        the document's encoding as declared by the transport layer, if any;
        e.g., as declared in the ``charset`` parameter of the
        :mailheader:`Content-Type` header of the HTTP response that returned
        the document
    :param str default_encoding:
        the encoding to fall back to when none of the other sources
        determines one; defaults to CP1252
    :param str errors:
        the error handler to use when decoding the document; defaults to
        ``"replace"``
    :param int scan_window:
        how many bytes to consume from the iterable when checking for an
        encoding declaration
    :rtype: Iterator[str]
    """
    # UnicodeDammit is not used directly here because (a) it ranks the
    # encoding from the `Content-Type` header above the one indicated by the
    # BOM, which is the wrong way around, and (b) it expects the complete
    # document and fails when the blob it is given stops in the middle of a
    # multibyte character.
    chunks = iter(iterable)
    try:
        head = next(chunks)
    except StopIteration:
        # Nothing to decode at all.
        nothing: List[str] = []
        return iter(nothing)

    if isinstance(head, str):
        # Already text: hand the elements back untouched.
        return chain([head], chunks)

    # Pull further chunks until the scan window is filled or the input is
    # exhausted; whatever remains in `chunks` is decoded lazily afterwards.
    if len(head) < scan_window:
        for piece in chunks:
            head += piece
            if len(head) >= scan_window:
                break

    enc: Optional[str]
    head, bom_encoding = EncodingDetector.strip_byte_order_mark(head)
    if bom_encoding is not None:
        enc = bom_encoding
    elif http_charset is not None:
        enc = http_charset
    else:
        declared = EncodingDetector.find_declared_encoding(
            head,
            is_html=True,
            search_entire_document=True,
        )
        enc = default_encoding if declared is None else declared

    assert isinstance(enc, str)
    return iterdecode(chain([head], chunks), enc, errors=errors)