def fix_java_encoding(bytestring):
    """
    Convert a bytestring that might contain "Java UTF8" into valid UTF-8.

    There are two things that Java is known to do with its "UTF8" encoder
    that are incompatible with UTF-8. (If you happen to be writing Java
    code, apparently the standards-compliant encoder is named "AS32UTF8".)

    - Every UTF-16 character is separately encoded as UTF-8. This is wrong
      when the UTF-16 string contains surrogates; the character they actually
      represent should have been encoded as UTF-8 instead. Unicode calls this
      "CESU-8", the Compatibility Encoding Scheme for Unicode. Python 2 will
      decode it as if it's UTF-8, but Python 3 refuses to.

    - The null codepoint, U+0000, is encoded as 0xc0 0x80, which avoids
      outputting a null byte by breaking the UTF shortest-form rule.
      Unicode does not even deign to give this scheme a name, and no version
      of Python will decode it.

    Every occurrence of either pattern is rewritten, wherever it appears in
    the input; all other bytes are passed through untouched.

    `bytestring` must be a `bytes` object (this is asserted). The return
    value is a new `bytes` object.
    """
    assert isinstance(bytestring, bytes)

    # Replace the sloppy encoding of U+0000 with the correct one.
    bytestring = bytestring.replace(b'\xc0\x80', b'\x00')

    # When we have improperly encoded surrogates, we can still see the
    # bits that they were meant to represent.
    #
    # The surrogates were meant to encode a 20-bit number, to which we
    # add 0x10000 to get a codepoint. That 20-bit number now appears in
    # this form:
    #
    #   11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
    #
    # CESU8_RE matches byte sequences of this form. Then we need to
    # extract the bits and assemble a codepoint number from them.
    fixed_pieces = []
    # Offset of the first input byte not yet copied into fixed_pieces.
    copied_upto = 0
    match = CESU8_RE.search(bytestring)
    while match:
        pos = match.start()
        cesu8_sequence = bytes_to_ints(bytestring[pos:pos + 6])
        assert cesu8_sequence[0] == cesu8_sequence[3] == 0xed
        codepoint = (
            ((cesu8_sequence[1] & 0x0f) << 16) +
            ((cesu8_sequence[2] & 0x3f) << 10) +
            ((cesu8_sequence[4] & 0x0f) << 6) +
            (cesu8_sequence[5] & 0x3f) +
            0x10000
        )
        # For the reason why this will work on all Python builds, see
        # compatibility.py.
        new_bytes = unichr(codepoint).encode('utf-8')
        fixed_pieces.append(bytestring[copied_upto:pos])
        fixed_pieces.append(new_bytes)
        copied_upto = pos + 6

        # This has to be search(), not match(): match() only succeeds at
        # the starting offset, so it would stop at the first pair that is
        # not immediately followed by another pair and leave every later
        # one unconverted. Searching from an offset also avoids copying
        # the remaining tail of the input on every iteration.
        match = CESU8_RE.search(bytestring, copied_upto)

    fixed_pieces.append(bytestring[copied_upto:])
    return b''.join(fixed_pieces)
def fix_java_encoding(bytestring):
    """
    Convert a bytestring that might contain "Java UTF8" into valid UTF-8.

    There are two things that Java is known to do with its "UTF8" encoder
    that are incompatible with UTF-8. (If you happen to be writing Java
    code, apparently the standards-compliant encoder is named "AS32UTF8".)

    - Every UTF-16 character is separately encoded as UTF-8. This is wrong
      when the UTF-16 string contains surrogates; the character they actually
      represent should have been encoded as UTF-8 instead. Unicode calls this
      "CESU-8", the Compatibility Encoding Scheme for Unicode. Python 2 will
      decode it as if it's UTF-8, but Python 3 refuses to.

    - The null codepoint, U+0000, is encoded as 0xc0 0x80, which avoids
      outputting a null byte by breaking the UTF shortest-form rule.
      Unicode does not even deign to give this scheme a name, and no version
      of Python will decode it.

    Every occurrence of either pattern is rewritten, wherever it appears in
    the input; all other bytes are passed through untouched.

    `bytestring` must be a `bytes` object (this is asserted). The return
    value is a new `bytes` object.
    """
    assert isinstance(bytestring, bytes)

    # Replace the sloppy encoding of U+0000 with the correct one.
    bytestring = bytestring.replace(b'\xc0\x80', b'\x00')

    # When we have improperly encoded surrogates, we can still see the
    # bits that they were meant to represent.
    #
    # The surrogates were meant to encode a 20-bit number, to which we
    # add 0x10000 to get a codepoint. That 20-bit number now appears in
    # this form:
    #
    #   11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
    #
    # CESU8_RE matches byte sequences of this form. Then we need to
    # extract the bits and assemble a codepoint number from them.
    fixed_pieces = []
    # Offset of the first input byte not yet copied into fixed_pieces.
    copied_upto = 0
    match = CESU8_RE.search(bytestring)
    while match:
        pos = match.start()
        cesu8_sequence = bytes_to_ints(bytestring[pos:pos + 6])
        assert cesu8_sequence[0] == cesu8_sequence[3] == 0xed
        codepoint = (((cesu8_sequence[1] & 0x0f) << 16) +
                     ((cesu8_sequence[2] & 0x3f) << 10) +
                     ((cesu8_sequence[4] & 0x0f) << 6) +
                     (cesu8_sequence[5] & 0x3f) + 0x10000)
        # For the reason why this will work on all Python builds, see
        # compatibility.py.
        new_bytes = unichr(codepoint).encode('utf-8')
        fixed_pieces.append(bytestring[copied_upto:pos])
        fixed_pieces.append(new_bytes)
        copied_upto = pos + 6

        # This has to be search(), not match(): match() only succeeds at
        # the starting offset, so it would stop at the first pair that is
        # not immediately followed by another pair and leave every later
        # one unconverted. Searching from an offset also avoids copying
        # the remaining tail of the input on every iteration.
        match = CESU8_RE.search(bytestring, copied_upto)

    fixed_pieces.append(bytestring[copied_upto:])
    return b''.join(fixed_pieces)
def _buffer_decode_surrogates(sup, input, errors, final): """ When we have improperly encoded surrogates, we can still see the bits that they were meant to represent. The surrogates were meant to encode a 20-bit number, to which we add 0x10000 to get a codepoint. That 20-bit number now appears in this form: 11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst The CESU8_RE above matches byte sequences of this form. Then we need to extract the bits and assemble a codepoint number from them. """ if len(input) < 6: if final: # We found 0xed near the end of the stream, and there aren't # six bytes to decode. Delegate to the superclass method to # handle it as normal UTF-8. It might be a Hangul character # or an error. if PYTHON2 and len(input) >= 3: # We can't trust Python 2 to raise an error when it's # asked to decode a surrogate, so let's force the issue. input = mangle_surrogates(input) return sup(input, errors, final) else: # We found 0xed, the stream isn't over yet, and we don't know # enough of the following bytes to decode anything, so consume # zero bytes and wait. return '', 0 else: if CESU8_RE.match(input): # If this is a CESU-8 sequence, do some math to pull out # the intended 20-bit value, and consume six bytes. bytenums = bytes_to_ints(input[:6]) codepoint = ( ((bytenums[1] & 0x0f) << 16) + ((bytenums[2] & 0x3f) << 10) + ((bytenums[4] & 0x0f) << 6) + (bytenums[5] & 0x3f) + 0x10000 ) return unichr(codepoint), 6 else: # This looked like a CESU-8 sequence, but it wasn't one. # 0xed indicates the start of a three-byte sequence, so give # three bytes to the superclass to decode as usual -- except # for working around the Python 2 discrepancy as before. if PYTHON2: input = mangle_surrogates(input) return sup(input[:3], errors, False)
def _buffer_decode_surrogates(sup, input, errors, final): """ When we have improperly encoded surrogates, we can still see the bits that they were meant to represent. The surrogates were meant to encode a 20-bit number, to which we add 0x10000 to get a codepoint. That 20-bit number now appears in this form: 11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst The CESU8_RE above matches byte sequences of this form. Then we need to extract the bits and assemble a codepoint number from them. """ if len(input) < 6: if final: # We found 0xed near the end of the stream, and there aren't # six bytes to decode. Delegate to the superclass method # to handle this error. return sup(input, errors, final) else: # We found 0xed, the stream isn't over yet, and we don't know # enough of the following bytes to decode anything, so consume # zero bytes and wait. return '', 0 else: if CESU8_RE.match(input): # If this is a CESU-8 sequence, do some math to pull out # the intended 20-bit value, and consume six bytes. bytenums = bytes_to_ints(input[:6]) codepoint = ( ((bytenums[1] & 0x0f) << 16) + ((bytenums[2] & 0x3f) << 10) + ((bytenums[4] & 0x0f) << 6) + (bytenums[5] & 0x3f) + 0x10000 ) return unichr(codepoint), 6 else: # This looked like a CESU-8 sequence, but it wasn't one. # 0xed indicates the start of a three-byte sequence, so give # three bytes to the superclass, so it can either decode them # as a surrogate codepoint (on Python 2) or handle the error # (on Python 3). return sup(input[:3], errors, False)
def _buffer_decode_surrogates(sup, input, errors, final): """ When we have improperly encoded surrogates, we can still see the bits that they were meant to represent. The surrogates were meant to encode a 20-bit number, to which we add 0x10000 to get a codepoint. That 20-bit number now appears in this form: 11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst The CESU8_RE above matches byte sequences of this form. Then we need to extract the bits and assemble a codepoint number from them. """ if len(input) < 6: if final: # We found 0xed near the end of the stream, and there aren't # six bytes to decode. Delegate to the superclass method # to handle this error. return sup(input, errors, final) else: # We found 0xed, the stream isn't over yet, and we don't know # enough of the following bytes to decode anything, so consume # zero bytes and wait. return '', 0 else: if CESU8_RE.match(input): # If this is a CESU-8 sequence, do some math to pull out # the intended 20-bit value, and consume six bytes. bytenums = bytes_to_ints(input[:6]) codepoint = (((bytenums[1] & 0x0f) << 16) + ((bytenums[2] & 0x3f) << 10) + ((bytenums[4] & 0x0f) << 6) + (bytenums[5] & 0x3f) + 0x10000) return unichr(codepoint), 6 else: # This looked like a CESU-8 sequence, but it wasn't one. # 0xed indicates the start of a three-byte sequence, so give # three bytes to the superclass, so it can either decode them # as a surrogate codepoint (on Python 2) or handle the error # (on Python 3). return sup(input[:3], errors, False)