Exemple #1
0
def make_UnicodeDammit(raw_html, http_headers=None, **kwargs):
    """build a UnicodeDammit instance for a piece of encoded HTML

    Candidate encodings are collected from three places and handed to
    UnicodeDammit as ``override_encodings``, in this order: the byte-order
    mark (which is stripped from the data), the encoding declared inside the
    document, and the charset of the Content-Type HTTP header.

    @param raw_html: the binary (i.e. encoded) HTML data (str)
    @param http_headers: the HTTP response headers (dict; optional)
    @param kwargs: extra keyword arguments forwarded to UnicodeDammit
    @return: a UnicodeDammit instance
    """
    # Without a usable Content-Type the document is not flagged as HTML and
    # there is no transport-level charset to offer.
    is_html, charset = False, None
    content_type = get_content_type(http_headers)
    if content_type:
        header = ContentTypeHeader(content_type)
        is_html = header.is_html
        charset = header.charset

    # The BOM has to be removed before looking for an in-document declaration.
    raw_html, bom_encoding = EncodingDetector.strip_byte_order_mark(raw_html)
    declared_encoding = EncodingDetector.find_declared_encoding(
        raw_html, is_html=is_html, search_entire_document=True)

    candidates = [
        encoding
        for encoding in (bom_encoding, declared_encoding)
        if encoding is not None
    ]
    if charset:
        candidates.append(charset)

    return UnicodeDammit(
        raw_html,
        is_html=is_html,
        override_encodings=candidates,
        **kwargs
    )
def iterhtmldecode(
    iterable: Iterable[AnyStr],
    http_charset: Optional[str] = None,
    default_encoding: str = 'cp1252',
    errors: str = 'replace',
    scan_window: int = 1024,
) -> Iterator[str]:
    """
    .. versionadded:: 0.7.0

    Decode an HTML document that is supplied piecewise as an iterable of
    `bytes`.  Only as much of the iterable as is needed to determine the
    document's encoding is consumed up front; after that, the elements are
    decoded one at a time.

    As a convenience, an iterable of `str` objects is passed through with its
    elements unmodified.

    The encoding is chosen using a vastly simplified form of the WHATWG's
    `"Determining the Character Encoding" specification <encspec_>`_: the
    following sources are consulted in order, and the first one that yields
    an encoding wins:

    - byte-order mark
    - HTTP charset
    - encoding declared in document
    - default encoding

    .. _encspec:
       https://html.spec.whatwg.org/multipage/parsing.html
       #determining-the-character-encoding

    :param Iterable[AnyStr] iterable: an iterable of either `bytes` or `str`
        that, when joined together, form an HTML document
    :param Optional[str] http_charset: the document's encoding as declared by
        the transport layer, if any; e.g., as declared in the ``charset``
        parameter of the :mailheader:`Content-Type` header of the HTTP response
        that returned the document
    :param str default_encoding: the encoding to fall back to when no other
        source determines one; defaults to CP1252
    :param str errors: the error handler to use when decoding the document;
        defaults to ``"replace"``
    :param int scan_window: how many bytes to consume from the iterable when
        checking for an encoding declaration
    :rtype: Iterator[str]
    """

    # UnicodeDammit is not used directly here for two reasons: (a) it lets the
    # encoding from the `Content-Type` header win over the one indicated by
    # the BOM, when it should be the other way around, and (b) it expects the
    # whole document and will fail if the blob it is given stops in the middle
    # of a multibyte character.

    chunks = iter(iterable)
    try:
        head = next(chunks)
    except StopIteration:
        nothing: List[str] = []
        return iter(nothing)

    # Text input needs no decoding; hand it back as-is.
    if isinstance(head, str):
        return chain([head], chunks)

    # Keep pulling chunks until the scan window is filled or the input ends.
    if len(head) < scan_window:
        for piece in chunks:
            head += piece
            if len(head) >= scan_window:
                break

    # Walk the sources in priority order, stopping at the first hit.
    head, bom_encoding = EncodingDetector.strip_byte_order_mark(head)
    encoding: Optional[str] = bom_encoding
    if encoding is None:
        encoding = http_charset
    if encoding is None:
        encoding = EncodingDetector.find_declared_encoding(
            head,
            is_html=True,
            search_entire_document=True,
        )
    if encoding is None:
        encoding = default_encoding
    assert isinstance(encoding, str)

    # Re-attach the (BOM-stripped) prefix ahead of the unread remainder.
    return iterdecode(chain([head], chunks), encoding, errors=errors)