def check_phrase(text):
    """
    Assert that `fix_text_encoding` leaves clean `text` unchanged and
    repairs its UTF-8-as-Latin-1 mojibake back to `text`.
    """
    mojibake = text.encode('utf-8').decode('latin-1')
    eq_(fix_text_encoding(text), text)
    eq_(fix_text_encoding(mojibake), text)
    # Re-check with the first character dropped, so the opening
    # punctuation is not the only thing that makes the fix work.
    tail = text[1:]
    eq_(fix_text_encoding(tail), tail)
    eq_(fix_text_encoding(tail.encode('utf-8').decode('latin-1')), tail)
def test_all_bmp_characters():
    """
    Sweep the BMP and check that each character survives being garbled
    (UTF-8 bytes read as Latin-1) once and twice, then fixed.
    """
    for codepoint in range(0xa0, 0xfffd):
        char = unichr(codepoint)
        # Skip categories that can't round-trip meaningfully:
        # private use (Co), unassigned (Cn), surrogates (Cs),
        # and combining marks (Mc, Mn).
        if unicodedata.category(char) in ('Co', 'Cn', 'Cs', 'Mc', 'Mn'):
            continue
        garble = char.encode('utf-8').decode('latin-1')
        garble2 = garble.encode('utf-8').decode('latin-1')
        eq_(char_names(fix_text_encoding(garble)), char_names(char))
        eq_(char_names(fix_text_encoding(garble2)), char_names(char))
def fix_bad_encoding(text):
    """
    Deprecated alias for `fix_text_encoding`.

    Kept for compatibility with previous versions of ftfy.
    """
    warnings.warn('fix_bad_encoding is now known as fix_text_encoding',
                  DeprecationWarning)
    return fix_text_encoding(text)
def check_ftfy(self, text):
    """
    Show the change `fix_text_encoding` would make to `text`, skipping
    ASCII-only input and text containing 'unfollow' (bot noise).
    """
    lowered = remove_unsafe_private_use(text).lower()
    if possible_encoding(text, 'ascii') or 'unfollow' in lowered:
        return
    fixed = fix_text_encoding(text)
    if fixed != text:
        # t.co links produce uninteresting churn; don't report them.
        if not lowered.startswith('http://t.co/'):
            print(u'Text:\t{text}\nFixed:\t{fixed}\n'.format(text=text, fixed=fixed))
            self.num_fixed += 1
def fix_bad_encoding(text):
    """
    Kept for compatibility with previous versions of ftfy.

    Emits a DeprecationWarning and delegates to `fix_text_encoding`.
    """
    message = 'fix_bad_encoding is now known as fix_text_encoding'
    warnings.warn(message, DeprecationWarning)
    return fix_text_encoding(text)
def fix_text_segment(text, fix_entities='auto', remove_terminal_escapes=True,
                     fix_encoding=True, fix_latin_ligatures=True,
                     fix_character_width=True, uncurl_quotes=True,
                     fix_line_breaks=True, fix_surrogates=True,
                     remove_control_chars=True, remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.

    Raises UnicodeError if given bytes instead of text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # Unescaping entities is unsafe when the text appears to contain
    # real markup, so auto mode turns it off in that case.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # The fixers, in pipeline order, each paired with its enabling flag.
    pipeline = [
        (fix_entities, fixes.unescape_html),
        (remove_terminal_escapes, fixes.remove_terminal_escapes),
        (fix_encoding, fixes.fix_text_encoding),
        (fix_latin_ligatures, fixes.fix_latin_ligatures),
        (fix_character_width, fixes.fix_character_width),
        (uncurl_quotes, fixes.uncurl_quotes),
        (fix_line_breaks, fixes.fix_line_breaks),
        (fix_surrogates, fixes.fix_surrogates),
        (remove_control_chars, fixes.remove_control_chars),
        (remove_bom, fixes.remove_bom),
    ]

    # Keep running the pipeline until the text stops changing.
    while True:
        previous = text
        for enabled, fixer in pipeline:
            if enabled:
                text = fixer(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == previous:
            return text
def fix_text_segment(text, remove_unsafe_private_use=True, fix_entities=True,
                     remove_terminal_escapes=True, fix_encoding=True,
                     normalization='NFKC', uncurl_quotes=True,
                     fix_line_breaks=True, remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.

    Raises UnicodeError if given bytes instead of text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)
    # Unescaping entities is unsafe when the text looks like it holds
    # real markup; decide this once, up front.
    looks_like_markup = '<' in text and '>' in text
    result = text
    # Repeat the fixes until the text reaches a fixed point.
    while True:
        previous = result
        if remove_unsafe_private_use:
            result = fixes.remove_unsafe_private_use(result)
        if fix_entities and not looks_like_markup:
            result = fixes.unescape_html(result)
        if remove_terminal_escapes:
            result = fixes.remove_terminal_escapes(result)
        if fix_encoding:
            result = fixes.fix_text_encoding(result)
        if normalization is not None:
            result = unicodedata.normalize(normalization, result)
        if uncurl_quotes:
            result = fixes.uncurl_quotes(result)
        if fix_line_breaks:
            result = fixes.fix_line_breaks(result)
        if remove_control_chars:
            result = fixes.remove_control_chars(result)
        if remove_bom:
            result = fixes.remove_bom(result)
        if result == previous:
            return result
def check_ftfy(self, text):
    """
    Given a single text input, check whether `ftfy.fix_text_encoding`
    would change it. If so, display and count the change.
    """
    self.count += 1
    # Pure-ASCII text cannot be mojibake, so skip it entirely.
    if not possible_encoding(text, 'ascii'):
        fixed = fix_text_encoding(text)
        if fixed != text:
            # possibly filter common bots before printing
            print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(text=text,
                                                               fixed=fixed))
            self.num_fixed += 1
    # Emit periodic progress markers.
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def check_ftfy(self, text):
    """
    Given one text input, report (and count) any change that
    `ftfy.fix_text_encoding` would make to it.
    """
    self.count += 1
    if not possible_encoding(text, 'ascii'):
        repaired = fix_text_encoding(text)
        if repaired != text:
            # possibly filter common bots before printing
            report = u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
                text=text, fixed=repaired)
            print(report)
            self.num_fixed += 1
    # Status updates once in a while.
    if not self.count % 100:
        print('.', end='', flush=True)
    if not self.count % 10000:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def check_phrase(text):
    """
    Assert that `fix_text_encoding` leaves clean `text` untouched and
    restores it from its UTF-8-as-Latin-1 garbled form.
    """
    garbled = text.encode('utf-8').decode('latin-1')
    eq_(fix_text_encoding(text), text)
    eq_(fix_text_encoding(garbled), text)
def test_fix_with_backslash():
    """Literal backslashes in the text must survive the encoding fix."""
    garbled = u"<40\\% vs \xe2\x89\xa540\\%"
    expected = u"<40\\% vs ≥40\\%"
    eq_(fix_text_encoding(garbled), expected)