Ejemplo n.º 1
0
def fix_java_encoding(bytestring):
    """
    Convert a bytestring that might contain "Java UTF8" into valid UTF-8.

    There are two things that Java is known to do with its "UTF8" encoder
    that are incompatible with UTF-8. (If you happen to be writing Java
    code, apparently the standards-compliant encoder is named "AS32UTF8".)

    - Every UTF-16 character is separately encoded as UTF-8. This is wrong
      when the UTF-16 string contains surrogates; the character they actually
      represent should have been encoded as UTF-8 instead. Unicode calls this
      "CESU-8", the Compatibility Encoding Scheme for Unicode. Python 2 will
      decode it as if it's UTF-8, but Python 3 refuses to.

    - The null codepoint, U+0000, is encoded as 0xc0 0x80, which avoids
      outputting a null byte by breaking the UTF shortest-form rule.
      Unicode does not even deign to give this scheme a name, and no version
      of Python will decode it.

    Every CESU-8 surrogate pair in the input is rewritten, no matter where
    it occurs; pairs separated by other bytes are all converted.

    :param bytestring: the bytes to repair. Must be a `bytes` object; this
        is checked with an `assert`.
    :returns: a new bytestring in which each 0xc0 0x80 pair has become a
        single null byte and each six-byte CESU-8 sequence has become the
        UTF-8 encoding of the codepoint it represents. All other bytes are
        copied through unchanged.
    """
    assert isinstance(bytestring, bytes)
    # Replace the sloppy encoding of U+0000 with the correct one.
    bytestring = bytestring.replace(b'\xc0\x80', b'\x00')

    # When we have improperly encoded surrogates, we can still see the
    # bits that they were meant to represent.
    #
    # The surrogates were meant to encode a 20-bit number, to which we
    # add 0x10000 to get a codepoint. That 20-bit number now appears in
    # this form:
    #
    #   11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
    #
    # CESU8_RE matches byte sequences of this form. Then we need to
    # extract the bits and assemble a codepoint number from them.
    fixed_pieces = []
    # Offset of the first byte that has not been copied to the output yet.
    copied_upto = 0
    match = CESU8_RE.search(bytestring)
    while match:
        pos = match.start()
        cesu8_sequence = bytes_to_ints(bytestring[pos:pos + 6])
        assert cesu8_sequence[0] == cesu8_sequence[3] == 0xed
        codepoint = (
            ((cesu8_sequence[1] & 0x0f) << 16) +
            ((cesu8_sequence[2] & 0x3f) << 10) +
            ((cesu8_sequence[4] & 0x0f) << 6) +
            (cesu8_sequence[5] & 0x3f) +
            0x10000
        )
        # For the reason why this will work on all Python builds, see
        # compatibility.py.
        new_bytes = unichr(codepoint).encode('utf-8')
        # Copy the untouched bytes before the sequence, then its replacement.
        fixed_pieces.append(bytestring[copied_upto:pos])
        fixed_pieces.append(new_bytes)
        copied_upto = pos + 6
        # Keep *searching* from just past the six bytes we consumed. Using
        # .match() here would only find a sequence that starts exactly at
        # that offset, so any later, non-adjacent sequence would be missed.
        match = CESU8_RE.search(bytestring, copied_upto)

    # Whatever follows the last sequence (or the whole input, if there were
    # no sequences at all) is passed through as-is.
    fixed_pieces.append(bytestring[copied_upto:])
    return b''.join(fixed_pieces)
Ejemplo n.º 2
0
def fix_java_encoding(bytestring):
    """
    Convert a bytestring that might contain "Java UTF8" into valid UTF-8.

    There are two things that Java is known to do with its "UTF8" encoder
    that are incompatible with UTF-8. (If you happen to be writing Java
    code, apparently the standards-compliant encoder is named "AS32UTF8".)

    - Every UTF-16 character is separately encoded as UTF-8. This is wrong
      when the UTF-16 string contains surrogates; the character they actually
      represent should have been encoded as UTF-8 instead. Unicode calls this
      "CESU-8", the Compatibility Encoding Scheme for Unicode. Python 2 will
      decode it as if it's UTF-8, but Python 3 refuses to.

    - The null codepoint, U+0000, is encoded as 0xc0 0x80, which avoids
      outputting a null byte by breaking the UTF shortest-form rule.
      Unicode does not even deign to give this scheme a name, and no version
      of Python will decode it.

    All CESU-8 surrogate pairs are converted, including ones that are
    separated from each other by ordinary bytes.

    :param bytestring: the bytes to repair. Must be a `bytes` object; this
        is checked with an `assert`.
    :returns: a new bytestring with every 0xc0 0x80 pair replaced by a null
        byte and every six-byte CESU-8 sequence replaced by the UTF-8
        encoding of the codepoint it stands for.
    """
    assert isinstance(bytestring, bytes)
    # Replace the sloppy encoding of U+0000 with the correct one.
    bytestring = bytestring.replace(b'\xc0\x80', b'\x00')

    # When we have improperly encoded surrogates, we can still see the
    # bits that they were meant to represent.
    #
    # The surrogates were meant to encode a 20-bit number, to which we
    # add 0x10000 to get a codepoint. That 20-bit number now appears in
    # this form:
    #
    #   11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
    #
    # CESU8_RE matches byte sequences of this form. Then we need to
    # extract the bits and assemble a codepoint number from them.
    fixed_pieces = []
    # Index of the first input byte not yet emitted into fixed_pieces.
    resume_at = 0
    match = CESU8_RE.search(bytestring)
    while match:
        pos = match.start()
        cesu8_sequence = bytes_to_ints(bytestring[pos:pos + 6])
        assert cesu8_sequence[0] == cesu8_sequence[3] == 0xed
        codepoint = (((cesu8_sequence[1] & 0x0f) << 16) +
                     ((cesu8_sequence[2] & 0x3f) << 10) +
                     ((cesu8_sequence[4] & 0x0f) << 6) +
                     (cesu8_sequence[5] & 0x3f) + 0x10000)
        # For the reason why this will work on all Python builds, see
        # compatibility.py.
        new_bytes = unichr(codepoint).encode('utf-8')
        # Emit the unchanged bytes before this sequence, then the fix.
        fixed_pieces.append(bytestring[resume_at:pos] + new_bytes)
        resume_at = pos + 6
        # Continue with .search(), not .match(): the next sequence may be
        # anywhere in the remaining bytes, not only right at resume_at.
        match = CESU8_RE.search(bytestring, resume_at)

    # The tail after the last sequence needs no fixing.
    return b''.join(fixed_pieces) + bytestring[resume_at:]
Ejemplo n.º 3
0
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method to
                # handle it as normal UTF-8. It might be a Hangul character
                # or an error.
                if PYTHON2 and len(input) >= 3:
                    # We can't trust Python 2 to raise an error when it's
                    # asked to decode a surrogate, so let's force the issue.
                    input = mangle_surrogates(input)
                return sup(input, errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (
                    ((bytenums[1] & 0x0f) << 16) +
                    ((bytenums[2] & 0x3f) << 10) +
                    ((bytenums[4] & 0x0f) << 6) +
                    (bytenums[5] & 0x3f) +
                    0x10000
                )
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass to decode as usual -- except
                # for working around the Python 2 discrepancy as before.
                if PYTHON2:
                    input = mangle_surrogates(input)
                return sup(input[:3], errors, False)
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method
                # to handle this error.
                return sup(input, errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (
                    ((bytenums[1] & 0x0f) << 16) +
                    ((bytenums[2] & 0x3f) << 10) +
                    ((bytenums[4] & 0x0f) << 6) +
                    (bytenums[5] & 0x3f) +
                    0x10000
                )
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass, so it can either decode them
                # as a surrogate codepoint (on Python 2) or handle the error
                # (on Python 3).
                return sup(input[:3], errors, False)
Ejemplo n.º 5
0
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method
                # to handle this error.
                return sup(input, errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (((bytenums[1] & 0x0f) << 16) +
                             ((bytenums[2] & 0x3f) << 10) +
                             ((bytenums[4] & 0x0f) << 6) +
                             (bytenums[5] & 0x3f) + 0x10000)
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass, so it can either decode them
                # as a surrogate codepoint (on Python 2) or handle the error
                # (on Python 3).
                return sup(input[:3], errors, False)