Exemple #1
0
def fix_partial_utf8_punct_in_1252(text):
    """
    Repair isolated punctuation that was written as UTF-8 but read back as
    Latin-1 or Windows-1252, in text where the same repair cannot be applied
    consistently to the whole string. `fix_encoding` uses this as one of its
    steps.

    The input is assumed to have been decoded as Windows-1252. Text that was
    decoded as Latin-1 is expected to go through the
    Latin-1-to-Windows-1252 fixer immediately before this function is called.
    """
    def redecode(mojibake):
        "Re-read one matched run of characters as UTF-8."
        # Recover the bytes behind the matched characters, then decode them
        # the way they were meant to be decoded.
        raw = mojibake.group(0).encode('sloppy-windows-1252')
        return raw.decode('utf-8')

    return PARTIAL_UTF8_PUNCT_RE.sub(redecode, text)
Exemple #2
0
def fix_partial_utf8_punct_in_1252(text):
    """
    Undo a specific kind of mojibake: characters seen in the wild that were
    encoded in UTF-8 and then decoded in Latin-1 or Windows-1252. The fix is
    applied match by match, even when it can't be applied consistently to the
    rest of the text. It runs as a step within `fix_encoding`.

    This function works on the assumption that the text was decoded in
    Windows-1252. For text decoded in Latin-1, it is meant to be called right
    after the Latin-1-to-Windows-1252 fixer has run.
    """
    def undo_mojibake(hit):
        "Turn the matched characters back into bytes and read them as UTF-8."
        garbled = hit.group(0)
        return garbled.encode('sloppy-windows-1252').decode('utf-8')

    repaired = PARTIAL_UTF8_PUNCT_RE.sub(undo_mojibake, text)
    return repaired
Exemple #3
0
def fix_partial_utf8_punct_in_1252(text):
    """
    Repair particular characters, seen in the wild, that were encoded in
    UTF-8 and then decoded in Latin-1 or Windows-1252, even when the repair
    can't be applied consistently across the whole text.

    The text may be inconsistent in one specific way: some characters can
    come from the Latin-1 C1 control set while others come from the
    characters that occupy those positions in Windows-1252. The repair
    therefore runs in two passes: the C1 characters are replaced first, and
    then a fix that only understands Windows-1252 characters is applied.

    `fix_encoding` uses this function as a transcoder.
    """
    def c1_as_w1252(found):
        "Reinterpret the matched characters' Latin-1 bytes as Windows-1252."
        latin1_bytes = found.group(0).encode('latin-1')
        return latin1_bytes.decode('sloppy-windows-1252')

    def punct_as_utf8(found):
        "Reinterpret the matched characters' Windows-1252 bytes as UTF-8."
        w1252_bytes = found.group(0).encode('sloppy-windows-1252')
        return w1252_bytes.decode('utf-8')

    # Pass 1 normalizes the C1 characters so that pass 2 sees only
    # Windows-1252 characters.
    normalized = C1_CONTROL_RE.sub(c1_as_w1252, text)
    return PARTIAL_UTF8_PUNCT_RE.sub(punct_as_utf8, normalized)
Exemple #4
0
def fix_partial_utf8_punct_in_1252(text):
    """
    Fix specific characters that turn up in the wild after being encoded in
    UTF-8 and decoded in Latin-1 or Windows-1252, including in text where
    this fix cannot be applied consistently.

    One inconsistency has to be handled here: a text can mix characters from
    the Latin-1 C1 control character set with characters from the set that
    replaces them in Windows-1252. To cope with that, the C1 characters are
    converted first, and only then is the Windows-1252-only fix applied.

    This serves as a transcoder within `fix_encoding`.
    """
    def reread_as_w1252(m):
        "Encode the match as Latin-1 and decode it as sloppy Windows-1252."
        return m.group(0).encode('latin-1').decode('sloppy-windows-1252')

    def reread_as_utf8(m):
        "Encode the match as sloppy Windows-1252 and decode it as UTF-8."
        return m.group(0).encode('sloppy-windows-1252').decode('utf-8')

    # The inner substitution runs first, so the outer one only ever sees
    # text whose C1 characters have already been converted.
    return PARTIAL_UTF8_PUNCT_RE.sub(
        reread_as_utf8,
        C1_CONTROL_RE.sub(reread_as_w1252, text),
    )
Exemple #5
0
def fix_one_step_and_explain(text):
    """
    Apply one round of re-decoding to text that was decoded incorrectly.

    Returns a pair `(fixed_text, plan)`. The plan is a list of
    `(operation, name, cost)` tuples describing how to reproduce the fix;
    it is empty whenever the text comes back unchanged.

    Raises UnicodeError if `text` is a bytes object.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    # Empty text and pure-ASCII text are returned as they are.
    if len(text) == 0 or possible_encoding(text, 'ascii'):
        return text, []

    # Single-byte encodings that can represent the text, but whose bytes did
    # not turn out to be UTF-8. They are consulted again further down.
    undecodable = []

    # First hypothesis: the text was meant to be UTF-8 but was decoded with
    # a single-byte encoding. When this works it is usually the right fix.
    for encoding in CHARMAP_ENCODINGS:
        if not possible_encoding(text, encoding):
            continue

        raw = text.encode(encoding)
        plan = [('encode', encoding, ENCODING_COSTS.get(encoding, 0))]

        # Everything from here to the decode may raise UnicodeDecodeError,
        # in which case this encoding is just remembered for later.
        try:
            # Look for sequences that would be UTF-8 except that they have
            # b' ' where b'\xa0' belongs.
            if ALTERED_UTF8_RE.search(raw):
                raw = restore_byte_a0(raw)
                plan.append(
                    ('transcode', 'restore_byte_a0', raw.count(0xa0) * 2)
                )

            # Byte 0x1a marks the places where one of the 'sloppy' codecs
            # found a replacement character.
            if encoding.startswith('sloppy') and 0x1a in raw:
                raw = replace_lossy_sequences(raw)
                plan.append(('transcode', 'replace_lossy_sequences', 0))

            if 0xed in raw or 0xc0 in raw:
                codec = 'utf-8-variants'
            else:
                codec = 'utf-8'

            plan.append(('decode', codec, 0))
            return raw.decode(codec), plan
        except UnicodeDecodeError:
            undecodable.append(encoding)

    # Second hypothesis: a-hat-euro sequences remain; fix them in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        plan = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]
        return fix_partial_utf8_punct_in_1252(text), plan

    # Third hypothesis: this is Latin-1 that was intended to be read as
    # Windows-1252, two encodings that are easily confused. Text that is not
    # Latin-1 at all, or that sits in the intersection of Latin-1 and
    # Windows-1252 (probably legitimate), is left alone.
    if 'latin-1' not in undecodable or 'windows-1252' in undecodable:
        return text, []

    # What remains has characters that exist in Latin-1 but not in
    # Windows-1252: C1 control characters. Assume Windows-1252 was meant.
    # The strict codec is used on purpose, because bad Windows-1252
    # characters are a bad sign.
    latin1_bytes = text.encode('latin-1')
    try:
        fixed = latin1_bytes.decode('windows-1252')
    except UnicodeDecodeError:
        # The characters make no sense as Windows-1252 either, so assume
        # nothing. Any remaining case is a mix-up between two other
        # single-byte encodings, which may be unsolvable without adding
        # false positives; the text is returned unchanged with an empty plan.
        return text, []

    if fixed == text:
        return fixed, []
    return fixed, [('encode', 'latin-1', 0), ('decode', 'windows-1252', 1)]
Exemple #6
0
def fix_one_step_and_explain(text):
    """
    Run a single re-decoding step on text that has been decoded incorrectly.

    The result is a 2-tuple: the re-decoded text, and a "plan" saying how to
    reproduce what was done. The plan is a list of `(kind, name, cost)`
    tuples, and is empty when no change was made.

    A bytes argument is rejected with UnicodeError.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # Plan number one: ASCII text goes back untouched.
    if possible_encoding(text, 'ascii'):
        return text, []

    # Encodings that fit the text but did not lead to a successful fix in
    # the loop below. They may be needed afterwards.
    failed_encodings = []

    # Assume the text should have been UTF-8 and was decoded with some
    # single-byte encoding instead. A fix found this way is usually correct,
    # so these candidates are tried before anything else.
    candidates = (
        name for name in CHARMAP_ENCODINGS if possible_encoding(text, name)
    )
    for encoding in candidates:
        data = text.encode(encoding)
        first_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0))
        middle_steps = []

        # Decide whether the bytes are UTF-8 (or close enough); if they are
        # not, keep the encoding around for the later checks.
        try:
            # Sequences that would be UTF-8 if b' ' were b'\xa0'.
            if ALTERED_UTF8_RE.search(data):
                data = restore_byte_a0(data)
                a0_cost = data.count(0xa0)
                middle_steps.append(
                    ('transcode', 'restore_byte_a0', a0_cost))

            # A 0x1a byte shows where a 'sloppy' codec found a replacement
            # character.
            is_sloppy = encoding.startswith('sloppy')
            if is_sloppy and 0x1a in data:
                data = replace_lossy_sequences(data)
                middle_steps.append(
                    ('transcode', 'replace_lossy_sequences', 0))

            target = 'utf-8'
            if 0xed in data or 0xc0 in data:
                target = 'utf-8-variants'

            last_step = ('decode', target, 0)
            plan = [first_step] + middle_steps + [last_step]
            return data.decode(target), plan

        except UnicodeDecodeError:
            failed_encodings.append(encoding)

    # Fix any a-hat-euro sequences that are still present, in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        plan = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]
        return fix_partial_utf8_punct_in_1252(text), plan

    # Next most likely: Latin-1 that was intended to be read as
    # Windows-1252, since those two are easily confused.
    fits_latin1 = 'latin-1' in failed_encodings
    fits_w1252 = 'windows-1252' in failed_encodings
    if fits_latin1 and fits_w1252:
        # The text lies in the intersection of Latin-1 and Windows-1252, so
        # it's probably legit.
        return text, []

    if fits_latin1:
        # The text has characters that are in Latin-1 but not in
        # Windows-1252, i.e. C1 control characters, which nobody wants.
        # Assume Windows-1252 was meant. The sloppy codec is avoided here
        # because bad Windows-1252 characters are a bad sign.
        as_latin1 = text.encode('latin-1')
        try:
            fixed = as_latin1.decode('windows-1252')
        except UnicodeDecodeError:
            # Not sensible as Windows-1252 either; assume nothing and fall
            # through to the final return.
            pass
        else:
            if fixed != text:
                return fixed, [('encode', 'latin-1', 0),
                               ('decode', 'windows-1252', 1)]
            return fixed, []

    # Only mix-ups between two other single-byte encodings remain. Those may
    # be unsolvable without adding false positives, so the text is returned
    # unchanged with an empty plan.
    return text, []