Ejemplo n.º 1
0
def restore_byte_a0(byts):
    """
    Put back 0xA0 bytes that appear to have been turned into ASCII spaces.

    Every span of `byts` matched by `ALTERED_UTF8_RE` (a pattern defined
    elsewhere, meant to find sequences that would convincingly decode as
    UTF-8 if byte 0x20 were 0xA0) has each 0x20 byte inside it rewritten
    as 0xA0. Bytes outside the matched spans are left untouched.

    Returns a `(restored, cost)` pair: `restored` is the rewritten byte
    string and `cost` is twice the number of 0xA0 bytes in `restored`.
    Note that the count covers every 0xA0 in the result, including any
    that were already present in the input.

    This is used as a step within `fix_encoding`.
    """
    byte_20, byte_a0 = b'\x20', b'\xa0'

    # Only the matched span is edited; `sub` splices it back into place.
    restored = ALTERED_UTF8_RE.sub(
        lambda found: found.group(0).replace(byte_20, byte_a0),
        byts
    )
    cost = 2 * restored.count(byte_a0)
    return restored, cost
Ejemplo n.º 2
0
def restore_byte_a0(byts):
    """
    Repair byte sequences in which 0xA0 seems to have been replaced by 0x20.

    `ALTERED_UTF8_RE` (defined elsewhere) is used to locate sequences that
    would convincingly decode as UTF-8 if their 0x20 bytes were 0xA0. In
    each such sequence, every 0x20 is changed to 0xA0; the rest of `byts`
    is copied through unchanged.

    The result is a 2-tuple. Its first item is the repaired byte string.
    Its second item is a cost: two times the total number of 0xA0 bytes in
    the repaired string (pre-existing 0xA0 bytes are counted as well).

    This is used as a step within `fix_encoding`.
    """
    def swap_space_for_a0(hit):
        "Return the matched bytes with each 0x20 turned into 0xA0."
        altered = hit.group(0)
        return altered.replace(b'\x20', b'\xa0')

    repaired = ALTERED_UTF8_RE.sub(swap_space_for_a0, byts)
    a0_total = repaired.count(b'\xa0')
    return repaired, a0_total * 2
Ejemplo n.º 3
0
def restore_byte_a0(byts):
    """
    Undo the damage done when byte A0 was "helpfully" turned into a space.

    Some mojibake has passed through a process that treated byte A0 as
    just another space and replaced it with ASCII 20. When that A0 was
    part of a sequence we intend to decode as UTF-8, the replacement makes
    the sequence fail to decode.

    This function looks for sequences matched by `ALTERED_UTF8_RE`
    (defined elsewhere; meant to find bytes that would convincingly decode
    as UTF-8 if byte 20 were changed back to A0) and rewrites every byte
    20 inside each match as A0. Everything outside the matches is left
    alone, and the resulting byte string is returned.

    For the purpose of deciding whether this is a good idea, the step is
    described as costing twice the number of bytes changed. That cost is
    not computed here -- only the repaired bytes are returned.
    NOTE(review): presumably the caller accounts for the cost; confirm
    against `fix_encoding`.

    This is used as a step within `fix_encoding`.
    """
    byte_20, byte_a0 = b'\x20', b'\xa0'
    restored = ALTERED_UTF8_RE.sub(
        lambda found: found.group(0).replace(byte_20, byte_a0),
        byts
    )
    return restored
Ejemplo n.º 4
0
def restore_byte_a0(byts):
    """
    Put byte A0 back where it was mistakenly replaced by an ASCII space.

    Background: some mojibake was further altered by a process that
    decided byte A0 was "basically a space" and swapped it for byte 20.
    If the A0 belonged to a sequence that should decode as UTF-8, that
    swap stops the sequence from decoding.

    What happens here: each region of `byts` matched by `ALTERED_UTF8_RE`
    (defined elsewhere; meant to find sequences that would convincingly
    decode as UTF-8 with A0 in place of 20) gets all of its 20 bytes
    turned into A0. The function returns the resulting byte string.

    On cost: this step is described as costing twice the number of bytes
    that are changed, for the purpose of deciding whether the fix is a
    good idea. This function does not return or compute that cost.
    NOTE(review): presumably it is handled by the caller -- verify in
    `fix_encoding`.

    This is used as a step within `fix_encoding`.
    """
    def swap_space_for_a0(hit):
        "Return the matched bytes with each byte 20 turned into A0."
        altered = hit.group(0)
        return altered.replace(b'\x20', b'\xa0')

    repaired = ALTERED_UTF8_RE.sub(swap_space_for_a0, byts)
    return repaired