def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether ftfy would change it.
    If so, display the change.

    If `encoding_only` is True, only `fix_encoding` is applied.
    Otherwise the text goes through `fix_text` with quote-uncurling and
    character-width fixing disabled, so that only encoding-level repairs
    are shown.

    (Fixed: the docstring previously referenced `ftfy.fix_text_encoding`,
    a function this version no longer calls.)
    """
    self.count += 1
    text = unescape_html(text)
    # Pure-ASCII text cannot contain mojibake, so skip the fix entirely.
    if not possible_encoding(text, 'ascii'):
        if encoding_only:
            fixed = fix_encoding(text)
        else:
            fixed = fix_text(text, uncurl_quotes=False,
                             fix_character_width=False)
        if text != fixed:
            # possibly filter common bots before printing
            print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=text, fixed=fixed
            ))
            self.num_fixed += 1
        elif 'â€' in text or '\x80' in text:
            # These substrings are strong signs of mojibake that we
            # failed to repair; show them so they can be diagnosed.
            print('\nNot fixed:\t{text!r}'.format(text=text))

    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether ftfy would change it.
    If so, display the change.

    `encoding_only=True` applies just `fix_encoding`; otherwise the text
    goes through `fix_text` with quote-uncurling and width-fixing turned
    off, so only encoding-level changes are reported.

    (Fixed: the docstring previously referenced `ftfy.fix_text_encoding`,
    which this version does not call.)
    """
    self.count += 1
    text = unescape_html(text)
    # ASCII-only text can't contain mojibake; skip it.
    if not possible_encoding(text, 'ascii'):
        if encoding_only:
            fixed = fix_encoding(text)
        else:
            fixed = fix_text(text, uncurl_quotes=False,
                             fix_character_width=False)
        if text != fixed:
            # possibly filter common bots before printing
            print(u'\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=text, fixed=fixed))
            self.num_fixed += 1

    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def check_ftfy(self, text):
    """
    Show what `fix_text_encoding` would do to `text`, skipping inputs
    that are pure ASCII or that contain 'unfollow' (a common bot phrase).
    """
    check_text = remove_unsafe_private_use(text).lower()
    # Guard clauses: nothing to do for ASCII or bot-like text.
    if possible_encoding(text, 'ascii') or 'unfollow' in check_text:
        return
    fixed = fix_text_encoding(text)
    if fixed == text:
        return
    # t.co short links are noise; count the fix but don't display it.
    if not check_text.startswith('http://t.co/'):
        print(u'Text:\t{text}\nFixed:\t{fixed}\n'.format(text=text, fixed=fixed))
    self.num_fixed += 1
def check_ftfy(self, text):
    """
    Check one input against `ftfy.fix_text_encoding` and display any
    change it would make, along with periodic progress output.
    """
    self.count += 1
    if not possible_encoding(text, 'ascii'):
        fixed = fix_text_encoding(text)
        if fixed != text:
            # possibly filter common bots before printing
            print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(text=text,
                                                               fixed=fixed))
            self.num_fixed += 1

    # Progress: a dot every 100 items, a running total every 10000.
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def check_ftfy(self, text):
    """
    Display the effect `ftfy.fix_text_encoding` would have on a single
    input, and emit occasional status markers as inputs accumulate.
    """
    self.count += 1
    is_nonascii = not possible_encoding(text, 'ascii')
    if is_nonascii:
        repaired = fix_text_encoding(text)
        if repaired != text:
            # possibly filter common bots before printing
            message = u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
                text=text, fixed=repaired
            )
            print(message)
            self.num_fixed += 1

    # Status updates once in a while.
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded
    incorrectly.

    Returns a 2-tuple of (fixed_text, steps): the possibly-repaired
    string, plus a "plan" — a list of ('encode'/'decode', codec) pairs —
    for how to reproduce what it did. An empty plan means the text was
    returned unchanged.

    Raises UnicodeError if given bytes instead of a str.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                # Bytes 0xED and 0xC0 suggest CESU-8-style surrogate
                # sequences or overlong encodings, which the stricter
                # 'utf-8' codec would reject — presumably the
                # 'utf-8-variants' codec accepts them (confirm against
                # its implementation).
                if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
                    decoding = 'utf-8-variants'
                fixed = encoded_bytes.decode(decoding)
                steps = [('encode', encoding), ('decode', decoding)]
                return fixed, steps
            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                # Only record steps when the round-trip actually changed
                # something; otherwise the plan stays empty.
                if fixed != text:
                    steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # Those cases are somewhat rare, and impossible to solve without false
    # positives. If you're in one of these situations, you should try using
    # the `ftfy.guess_bytes` function.

    # Return the text unchanged; the plan is empty.
    return text, []
def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded
    incorrectly.

    Returns a 2-tuple of (fixed_text, steps). Each step is a 3-tuple of
    (operation, argument, cost); the cost is a numeric penalty —
    presumably used by the caller to rank competing explanations (TODO:
    confirm against the caller).

    Raises UnicodeError if given bytes instead of a str.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)
            # Unknown encodings carry zero extra cost by default.
            encode_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0))
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if ALTERED_UTF8_RE.search(encoded_bytes):
                    encoded_bytes = restore_byte_a0(encoded_bytes)
                    # Cost is twice the number of 0xA0 bytes present
                    # after restoration.
                    cost = encoded_bytes.count(0xa0) * 2
                    transcode_steps.append(('transcode', 'restore_byte_a0', cost))

                # Check for the byte 0x1a, which indicates where one of our
                # 'sloppy' codecs found a replacement character.
                if encoding.startswith('sloppy') and 0x1a in encoded_bytes:
                    encoded_bytes = replace_lossy_sequences(encoded_bytes)
                    transcode_steps.append(('transcode', 'replace_lossy_sequences', 0))

                # Bytes 0xED/0xC0 suggest CESU-8-style or overlong UTF-8
                # sequences that strict 'utf-8' would reject.
                if 0xed in encoded_bytes or 0xc0 in encoded_bytes:
                    decoding = 'utf-8-variants'

                decode_step = ('decode', decoding, 0)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return fixed, steps
            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        steps = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]
        fixed = fix_partial_utf8_punct_in_1252(text)
        return fixed, steps

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                # Only claim a plan if the round-trip changed anything.
                if fixed != text:
                    steps = [('encode', 'latin-1', 0), ('decode', 'windows-1252', 1)]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # These cases may be unsolvable without adding false positives, though
    # I have vague ideas about how to optionally address them in the future.

    # Return the text unchanged; the plan is empty.
    return text, []
def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded
    incorrectly.

    Returns a 2-tuple of (fixed_text, steps), where each step is a
    3-tuple of (operation, argument, cost). The cost values appear to be
    penalties for ranking candidate explanations — confirm against the
    caller before relying on them.

    Raises UnicodeError if given bytes instead of a str.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)
            # Encodings missing from ENCODING_COSTS default to cost 0.
            encode_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0))
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if ALTERED_UTF8_RE.search(encoded_bytes):
                    encoded_bytes = restore_byte_a0(encoded_bytes)
                    # Cost equals the number of 0xA0 bytes present after
                    # restoration.
                    cost = encoded_bytes.count(0xa0)
                    transcode_steps.append(
                        ('transcode', 'restore_byte_a0', cost))

                # Check for the byte 0x1a, which indicates where one of our
                # 'sloppy' codecs found a replacement character.
                if encoding.startswith('sloppy') and 0x1a in encoded_bytes:
                    encoded_bytes = replace_lossy_sequences(encoded_bytes)
                    transcode_steps.append(
                        ('transcode', 'replace_lossy_sequences', 0))

                # Bytes 0xED/0xC0 suggest CESU-8-style or overlong UTF-8
                # sequences that strict 'utf-8' would reject.
                if 0xed in encoded_bytes or 0xc0 in encoded_bytes:
                    decoding = 'utf-8-variants'

                decode_step = ('decode', decoding, 0)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return fixed, steps
            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        steps = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]
        fixed = fix_partial_utf8_punct_in_1252(text)
        return fixed, steps

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                # Only claim a plan if the round-trip changed anything.
                if fixed != text:
                    steps = [('encode', 'latin-1', 0), ('decode', 'windows-1252', 1)]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # These cases may be unsolvable without adding false positives, though
    # I have vague ideas about how to optionally address them in the future.

    # Return the text unchanged; the plan is empty.
    return text, []
def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.

    Returns an ExplainedText pairing the (possibly repaired) text with
    the list of steps taken; an empty step list means the text was left
    unchanged. Individual fixes can be switched off via flags on
    `config` (restore_byte_a0, replace_lossy_sequences,
    decode_inconsistent_utf8, fix_c1_controls).
    """
    # Tolerate a missing config by falling back to the defaults.
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    # Only record the step if it actually changed the bytes.
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "restore_byte_a0"))
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "replace_lossy_sequences"))
                        encoded_bytes = replaced_bytes

                # Bytes 0xED/0xC0 suggest CESU-8-style or overlong UTF-8
                # sequences that the strict "utf-8" codec would reject.
                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                # Fall through (no return) if the round-trip was a no-op.
                if fixed != text:
                    steps = [("encode", "latin-1"), ("decode", "windows-1252")]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])
def fix_text_and_explain(text):
    """
    Performs a single step of re-encoding text that's been decoded
    incorrectly.

    It returns the decoded text, plus a structure explaining what it did.
    This structure could be used for more than it currently is, but we at
    least use it to track whether we had to interpret text as an old
    encoding such as MacRoman or cp437.

    Returns a 2-tuple of (fixed_text, steps); a step list of
    [('give up', None)] marks the unsolvable fallthrough case.

    Raises UnicodeError if given bytes instead of a str.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            # This is an ugly-looking way to get the bytes that represent
            # the text in this encoding. The reason we can't necessarily
            # use .encode(encoding) is that the decoder is very likely
            # to have been sloppier than Python.
            #
            # The decoder might have left bytes unchanged when they're not
            # part of the encoding. It might represent b'\x81' as u'\x81'
            # in Windows-1252, while Python would claim that using byte
            # 0x81 in Windows-1252 is an error.
            #
            # So what we do here is we use the .translate method of Unicode
            # strings. Using it with the character maps we have computed will
            # give us back a Unicode string using only code
            # points up to 0xff. This can then be converted into the intended
            # bytes by encoding it as Latin-1.
            sorta_encoded_text = text.translate(CHARMAPS[encoding])

            # When we get the bytes, run them through fix_java_encoding,
            # because we can only reliably do that at the byte level. (See
            # its documentation for details.)
            encoded_bytes = fix_java_encoding(
                sorta_encoded_text.encode('latin-1')
            )

            # Now, find out if it's UTF-8. Otherwise, remember the encoding
            # for later.
            try:
                fixed = encoded_bytes.decode('utf-8')
                steps = [('sloppy_encode', encoding), ('decode', 'utf-8')]
                return fixed, steps
            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    #
    # We don't need to check for possibilities such as Latin-1 that was
    # intended to be read as MacRoman, because it is unlikely that any
    # software has that confusion.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
                return fixed, steps
            except UnicodeDecodeError:
                # Well, never mind.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, neither of which is Latin-1.
    #
    # Those cases are somewhat rare, and impossible to solve without false
    # positives. If you're in one of these situations, you don't need an
    # encoding fixer. You need something that heuristically guesses what
    # the encoding is in the first place.
    #
    # It's a different problem, the one that the 'chardet' module is
    # theoretically designed to solve. It probably *won't* solve it in
    # such an ambiguous case, but perhaps a version of it with better
    # heuristics would. Anyway, ftfy should not claim to solve it.
    return text, [('give up', None)]
def test_possible_encoding():
    """Every code point below 256 should be representable in Latin-1."""
    assert all(possible_encoding(chr(cp), 'latin-1') for cp in range(256))