def fix_partial_utf8_punct_in_1252(text): """ Fix particular characters that seem to be found in the wild encoded in UTF-8 and decoded in Latin-1 or Windows-1252, even when this fix can't be consistently applied. This is used as a step within `fix_encoding`. For this function, we assume the text has been decoded in Windows-1252. If it was decoded in Latin-1, we'll call this right after it goes through the Latin-1-to-Windows-1252 fixer. """ def replacement(match): "The function to apply when this regex matches." return match.group(0).encode('sloppy-windows-1252').decode('utf-8') return PARTIAL_UTF8_PUNCT_RE.sub(replacement, text)
def fix_partial_utf8_punct_in_1252(text): """ Fix particular characters that seem to be found in the wild encoded in UTF-8 and decoded in Latin-1 or Windows-1252, even when this fix can't be consistently applied. One form of inconsistency we need to deal with is that some character might be from the Latin-1 C1 control character set, while others are from the set of characters that take their place in Windows-1252. So we first replace those characters, then apply a fix that only works on Windows-1252 characters. This is used as a transcoder within `fix_encoding`. """ def latin1_to_w1252(match): "The function to apply when this regex matches." return match.group(0).encode('latin-1').decode('sloppy-windows-1252') def w1252_to_utf8(match): "The function to apply when this regex matches." return match.group(0).encode('sloppy-windows-1252').decode('utf-8') text = C1_CONTROL_RE.sub(latin1_to_w1252, text) return PARTIAL_UTF8_PUNCT_RE.sub(w1252_to_utf8, text)