Exemple #1
0
def fix_partial_utf8_punct_in_1252(text):
    """
    Repair specific characters that, in real-world text, turn up encoded as
    UTF-8 but decoded as Latin-1 or Windows-1252. The repair is applied
    wherever the pattern matches, even if the rest of the text cannot be
    fixed consistently. This is one step within `fix_encoding`.

    The text is assumed to have been decoded as Windows-1252. Text that was
    decoded as Latin-1 should first go through the Latin-1-to-Windows-1252
    fixer, with this function called right afterwards.

    Returns `text` with every match of `PARTIAL_UTF8_PUNCT_RE` replaced by
    its re-decoded form; everything outside the matches is left as it was.
    """
    def _redecode_as_utf8(found):
        "Turn one matched span back into bytes, then read them as UTF-8."
        garbled = found.group(0)
        # NOTE(review): 'sloppy-windows-1252' is not a standard-library
        # codec; it is presumably registered elsewhere in this package.
        raw_bytes = garbled.encode('sloppy-windows-1252')
        return raw_bytes.decode('utf-8')

    repaired = PARTIAL_UTF8_PUNCT_RE.sub(_redecode_as_utf8, text)
    return repaired
Exemple #2
0
def fix_partial_utf8_punct_in_1252(text):
    """
    Undo a narrow class of mojibake: particular characters seen in the wild
    that were encoded in UTF-8 and then decoded in Latin-1 or Windows-1252.
    The fix is applied to each matching span on its own, even when it can't
    be applied consistently to the whole text. Used as a step within
    `fix_encoding`.

    This function assumes the text was decoded in Windows-1252. For text
    decoded in Latin-1, it is meant to be called right after the
    Latin-1-to-Windows-1252 fixer has run.

    Returns the result of substituting every `PARTIAL_UTF8_PUNCT_RE` match
    in `text` with its UTF-8 reinterpretation.
    """
    def _reinterpret(hit):
        "Substitution callback: Windows-1252 characters -> bytes -> UTF-8."
        # NOTE(review): the 'sloppy-windows-1252' codec does not ship with
        # Python; presumably it is registered by this package -- confirm.
        as_bytes = hit.group(0).encode('sloppy-windows-1252')
        return as_bytes.decode('utf-8')

    return PARTIAL_UTF8_PUNCT_RE.sub(_reinterpret, text)
Exemple #3
0
def fix_partial_utf8_punct_in_1252(text):
    """
    Repair specific characters that, in real-world text, turn up encoded as
    UTF-8 but decoded as Latin-1 or Windows-1252, even when the repair
    can't be applied consistently across the whole text.

    The input may be inconsistent in a second way: some of its characters
    may come from the Latin-1 C1 control set while others come from the
    characters that occupy those positions in Windows-1252. To cope with
    that, the work is done in two passes:

    1. every `C1_CONTROL_RE` match is re-read as Windows-1252;
    2. every `PARTIAL_UTF8_PUNCT_RE` match in the result is re-read as
       UTF-8, a fix that only works on Windows-1252 characters.

    This is used as a transcoder within `fix_encoding`.
    """
    # NOTE(review): 'sloppy-windows-1252' is not a standard-library codec;
    # it is presumably registered elsewhere in this package.
    def _c1_as_w1252(found):
        "Pass 1 callback: Latin-1 bytes of the match, read as Windows-1252."
        latin1_bytes = found.group(0).encode('latin-1')
        return latin1_bytes.decode('sloppy-windows-1252')

    def _w1252_as_utf8(found):
        "Pass 2 callback: Windows-1252 bytes of the match, read as UTF-8."
        w1252_bytes = found.group(0).encode('sloppy-windows-1252')
        return w1252_bytes.decode('utf-8')

    # Pass 2 must see the output of pass 1, so the order here matters.
    normalized = C1_CONTROL_RE.sub(_c1_as_w1252, text)
    return PARTIAL_UTF8_PUNCT_RE.sub(_w1252_as_utf8, normalized)
Exemple #4
0
def fix_partial_utf8_punct_in_1252(text):
    """
    Undo a narrow class of mojibake: particular characters seen in the wild
    that were encoded in UTF-8 and then decoded in Latin-1 or Windows-1252.
    Matching spans are fixed individually, even when the fix can't be
    applied consistently to the rest of the text.

    One inconsistency handled here is a mix of the two wrong decodings:
    some characters may belong to the Latin-1 C1 control character set,
    while others belong to the set that takes their place in Windows-1252.
    The C1 characters are therefore replaced first, and only then is the
    Windows-1252-only fix applied.

    Serves as a transcoder within `fix_encoding`.
    """
    def _promote_c1(hit):
        "Replace matched C1 characters with their Windows-1252 readings."
        c1_chars = hit.group(0)
        return c1_chars.encode('latin-1').decode('sloppy-windows-1252')

    def _restore_utf8(hit):
        "Re-read a matched run of Windows-1252 characters as UTF-8."
        # NOTE(review): 'sloppy-windows-1252' is not a built-in Python
        # codec; presumably this package registers it -- confirm.
        mojibake = hit.group(0)
        return mojibake.encode('sloppy-windows-1252').decode('utf-8')

    # Step 1: make the text uniformly "Windows-1252-decoded".
    w1252_text = C1_CONTROL_RE.sub(_promote_c1, text)
    # Step 2: apply the UTF-8 reinterpretation to the normalized text.
    fixed = PARTIAL_UTF8_PUNCT_RE.sub(_restore_utf8, w1252_text)
    return fixed