def check_ftfy(self, text):
    """
    Check whether ftfy's encoding fixer would change this text; if it
    would, print the before/after pair and bump ``self.num_fixed``.

    Skips text that could already be pure ASCII, text mentioning
    "unfollow" (a common false-positive source), and text whose
    lowercased form starts with a t.co link.
    """
    lowered = remove_unsafe_private_use(text).lower()
    if possible_encoding(text, 'ascii') or 'unfollow' in lowered:
        return
    repaired = fix_text_encoding(text)
    if repaired == text:
        return
    if not lowered.startswith('http://t.co/'):
        print(u'Text:\t{text}\nFixed:\t{fixed}\n'.format(text=text, fixed=repaired))
        self.num_fixed += 1
def fix_text_segment(text, remove_unsafe_private_use=False, fix_entities='auto',
                     remove_terminal_escapes=True, fix_encoding=True,
                     normalization='NFKC', uncurl_quotes=True,
                     fix_line_breaks=True, fix_surrogates=True,
                     remove_control_chars=True, remove_bom=True):
    """
    Apply fixes to a single chunk of text.

    The chunk could be one line within a larger run of `fix_text`, or a
    larger amount of text that is known to all be in the same encoding.
    See `fix_text` for what each keyword argument does.

    Raises UnicodeError if given bytes instead of text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)
    # If the text contains both '<' and '>', it probably contains real HTML
    # markup, so don't unescape entities unless the caller insisted.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    # Re-run the pipeline until the text reaches a fixed point, since one
    # fix can expose problems for another to clean up.
    while True:
        previous = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == previous:
            return text
def standardize_word(word):
    u"""
    Normalize a word for counting: strip unsafe private-use characters,
    apply NFKC normalization (pre-composed diacritics), and lowercase it.

    Language-specific gotchas:

    - Greek words ending in a capital "Σ" lowercase to a final "ς" on
      Python 3 but "σ" on Python 2. (Python 3 is orthographically
      correct.) Such words therefore get different frequencies, and the
      total word counts differ, between the two.

    - A Turkish capital "I" is lowercased to "i" rather than the correct
      "ı", so the capitalized and lowercase forms of such words do not
      share a word count.
    """
    cleaned = remove_unsafe_private_use(word)
    return normalize('NFKC', cleaned).lower()
def render_safe(text):
    '''
    Make sure the given text is safe to pass to an external process,
    by stripping unsafe private-use characters and control characters.
    '''
    without_private_use = remove_unsafe_private_use(text)
    return remove_control_chars(without_private_use)