Ejemplo n.º 1
0
def restore_byte_a0(byts):
    """
    Find sequences that would convincingly decode as UTF-8 if the byte 0x20
    were changed to 0xa0, and fix them. This is used as a step within
    `fix_encoding`.
    """
    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b'\x20', b'\xa0')

    fixed = ALTERED_UTF8_RE.sub(replacement, byts)
    return fixed, fixed.count(b'\xa0') * 2
Ejemplo n.º 2
0
def restore_byte_a0(byts):
    """
    Find sequences that would convincingly decode as UTF-8 if the byte 0x20
    were changed to 0xa0, and fix them. This is used as a step within
    `fix_encoding`.
    """
    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b'\x20', b'\xa0')

    fixed = ALTERED_UTF8_RE.sub(replacement, byts)
    return fixed, fixed.count(b'\xa0') * 2
Ejemplo n.º 3
0
def restore_byte_a0(byts):
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b'\x20', b'\xa0')

    return ALTERED_UTF8_RE.sub(replacement, byts)
Ejemplo n.º 4
0
def restore_byte_a0(byts):
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b'\x20', b'\xa0')

    return ALTERED_UTF8_RE.sub(replacement, byts)
Ejemplo n.º 5
0
def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded incorrectly.

    Returns the decoded text, plus a "plan" for how to reproduce what it did.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)
            encode_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0))
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if ALTERED_UTF8_RE.search(encoded_bytes):
                    encoded_bytes = restore_byte_a0(encoded_bytes)
                    cost = encoded_bytes.count(0xa0) * 2
                    transcode_steps.append(('transcode', 'restore_byte_a0', cost))

                # Check for the byte 0x1a, which indicates where one of our
                # 'sloppy' codecs found a replacement character.
                if encoding.startswith('sloppy') and 0x1a in encoded_bytes:
                    encoded_bytes = replace_lossy_sequences(encoded_bytes)
                    transcode_steps.append(('transcode', 'replace_lossy_sequences', 0))

                if 0xed in encoded_bytes or 0xc0 in encoded_bytes:
                    decoding = 'utf-8-variants'

                decode_step = ('decode', decoding, 0)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return fixed, steps

            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        steps = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]
        fixed = fix_partial_utf8_punct_in_1252(text)
        return fixed, steps

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                if fixed != text:
                    steps = [('encode', 'latin-1', 0),
                             ('decode', 'windows-1252', 1)]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # These cases may be unsolvable without adding false positives, though
    # I have vague ideas about how to optionally address them in the future.

    # Return the text unchanged; the plan is empty.
    return text, []
Ejemplo n.º 6
0
def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded incorrectly.

    Returns the decoded text, plus a "plan" for how to reproduce what it did.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)
            encode_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0))
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if ALTERED_UTF8_RE.search(encoded_bytes):
                    encoded_bytes = restore_byte_a0(encoded_bytes)
                    cost = encoded_bytes.count(0xa0)
                    transcode_steps.append(
                        ('transcode', 'restore_byte_a0', cost))

                # Check for the byte 0x1a, which indicates where one of our
                # 'sloppy' codecs found a replacement character.
                if encoding.startswith('sloppy') and 0x1a in encoded_bytes:
                    encoded_bytes = replace_lossy_sequences(encoded_bytes)
                    transcode_steps.append(
                        ('transcode', 'replace_lossy_sequences', 0))

                if 0xed in encoded_bytes or 0xc0 in encoded_bytes:
                    decoding = 'utf-8-variants'

                decode_step = ('decode', decoding, 0)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return fixed, steps

            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        steps = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]
        fixed = fix_partial_utf8_punct_in_1252(text)
        return fixed, steps

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                if fixed != text:
                    steps = [('encode', 'latin-1', 0),
                             ('decode', 'windows-1252', 1)]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # These cases may be unsolvable without adding false positives, though
    # I have vague ideas about how to optionally address them in the future.

    # Return the text unchanged; the plan is empty.
    return text, []