def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Repeatedly run the enabled fixes over one chunk of text.

    The chunk may be a single line handed over by `fix_text`, or a longer
    piece of text that is known to be in one consistent encoding. The
    enabled steps are applied in a fixed order, and the whole sequence is
    repeated until a full pass leaves the text unchanged; that stable
    text is returned.

    See `fix_text` for a description of the parameters.

    Raises UnicodeError when `text` is a bytes object.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # In 'auto' mode, entity unescaping is switched off for text that
    # contains both '<' and '>'.
    if fix_entities == 'auto':
        if '<' in text and '>' in text:
            fix_entities = False

    # (flag, name of the function in `fixes`), in application order.
    # `remove_bom` only runs on its own when `remove_control_chars` is
    # off, because it would be redundant after that step.
    enabled_steps = [
        fixer_name
        for wanted, fixer_name in (
            (remove_terminal_escapes, 'remove_terminal_escapes'),
            (fix_encoding, 'fix_encoding'),
            (fix_entities, 'unescape_html'),
            (fix_latin_ligatures, 'fix_latin_ligatures'),
            (fix_character_width, 'fix_character_width'),
            (uncurl_quotes, 'uncurl_quotes'),
            (fix_line_breaks, 'fix_line_breaks'),
            (fix_surrogates, 'fix_surrogates'),
            (remove_control_chars, 'remove_control_chars'),
            (remove_bom and not remove_control_chars, 'remove_bom'),
        )
        if wanted
    ]

    while True:
        before_pass = text
        for fixer_name in enabled_steps:
            # Looked up lazily so disabled steps never touch `fixes`.
            text = getattr(fixes, fixer_name)(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == before_pass:
            return text
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Fix a single chunk of text, iterating until the result is stable.

    Use this for one line out of a larger `fix_text` run, or for a bigger
    piece of text whose encoding is known to be consistent throughout.
    Each pass applies every enabled fix in order; passes repeat until one
    of them changes nothing.

    See `fix_text` for a description of the parameters.

    Raises UnicodeError when `text` is a bytes object.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # 'auto' turns entity unescaping off when the text contains both
    # '<' and '>'.
    if fix_entities == 'auto':
        if '<' in text and '>' in text:
            fix_entities = False

    def run_single_pass(chunk):
        # One ordered sweep of every enabled fix over `chunk`.
        if remove_terminal_escapes:
            chunk = fixes.remove_terminal_escapes(chunk)
        if fix_encoding:
            chunk = fixes.fix_encoding(chunk)
        if fix_entities:
            chunk = fixes.unescape_html(chunk)
        if fix_latin_ligatures:
            chunk = fixes.fix_latin_ligatures(chunk)
        if fix_character_width:
            chunk = fixes.fix_character_width(chunk)
        if uncurl_quotes:
            chunk = fixes.uncurl_quotes(chunk)
        if fix_line_breaks:
            chunk = fixes.fix_line_breaks(chunk)
        if fix_surrogates:
            chunk = fixes.fix_surrogates(chunk)
        if remove_control_chars:
            chunk = fixes.remove_control_chars(chunk)
        elif remove_bom:
            # Only needed when `remove_control_chars` did not run;
            # after that step it would be redundant.
            chunk = fixes.remove_bom(chunk)
        if normalization is not None:
            chunk = unicodedata.normalize(normalization, chunk)
        return chunk

    while True:
        cleaned = run_single_pass(text)
        if cleaned == text:
            return cleaned
        text = cleaned
def TestInput(data):
    """
    Fuzz entry point: drive the ftfy width/justification helpers and
    `fixes.remove_bom` with values drawn from the fuzzer input.

    Empty input is ignored. Otherwise an `atheris.FuzzedDataProvider`
    built from `data` supplies one code point, strings of up to 1000
    units, and target widths between 1 and 2000.
    """
    if len(data) == 0:
        return

    provider = atheris.FuzzedDataProvider(data)

    code_point = provider.ConsumeIntInRange(1, 1114110)
    ftfy.character_width(chr(code_point))

    ftfy.monospaced_width(provider.ConsumeString(1000))
    ftfy.monospaced_width(provider.ConsumeUnicode(1000))

    # Exactly one of the three justification helpers is exercised per
    # input, selected by the next fuzzer-provided integer.
    justifier_names = {
        1: 'display_ljust',
        2: 'display_rjust',
        3: 'display_center',
    }
    selected = justifier_names.get(provider.ConsumeIntInRange(1, 3))
    if selected is not None:
        justify = getattr(ftfy, selected)
        # The string is consumed before the width, once per string
        # flavour, matching the argument evaluation order of a direct
        # call.
        sample = provider.ConsumeString(1000)
        width = provider.ConsumeIntInRange(1, 2000)
        justify(sample, width)

        sample = provider.ConsumeUnicode(1000)
        width = provider.ConsumeIntInRange(1, 2000)
        justify(sample, width)

    fixes.remove_bom(provider.ConsumeString(1000))
    fixes.remove_bom(provider.ConsumeUnicode(1000))
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Run the enabled fixes over one chunk of text until it stops changing.

    The chunk can be a single line from a larger `fix_text` run, or a
    larger amount of text that is certain to be all in the same encoding.
    Every pass applies the enabled steps in a fixed order; the function
    returns once a whole pass leaves the text untouched.

    See `fix_text` for a description of the parameters.

    Raises UnicodeError when `text` is a bytes object.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # 'auto' disables entity unescaping for text containing both '<'
    # and '>'.
    if fix_entities == 'auto':
        if '<' in text and '>' in text:
            fix_entities = False

    # (flag, step) pairs in application order. Each step is wrapped in a
    # lambda so that nothing is looked up on `fixes` unless the step is
    # enabled and actually runs.
    candidate_steps = (
        (remove_unsafe_private_use,
         lambda s: fixes.remove_unsafe_private_use(s)),
        (fix_entities,
         lambda s: fixes.unescape_html(s)),
        (remove_terminal_escapes,
         lambda s: fixes.remove_terminal_escapes(s)),
        (fix_encoding,
         lambda s: fixes.fix_text_encoding(s)),
        (normalization is not None,
         lambda s: unicodedata.normalize(normalization, s)),
        (uncurl_quotes,
         lambda s: fixes.uncurl_quotes(s)),
        (fix_line_breaks,
         lambda s: fixes.fix_line_breaks(s)),
        (fix_surrogates,
         lambda s: fixes.fix_surrogates(s)),
        (remove_control_chars,
         lambda s: fixes.remove_control_chars(s)),
        (remove_bom,
         lambda s: fixes.remove_bom(s)),
    )
    active_steps = [step for wanted, step in candidate_steps if wanted]

    while True:
        before_pass = text
        for step in active_steps:
            text = step(text)
        if text == before_pass:
            return text
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Fix one chunk of text, repeating the fixes until the text is stable.

    Suitable for a line of text inside a larger `fix_text` run, or for a
    larger amount of text that is certain to be all in the same encoding.
    The enabled steps run in a fixed order, with Unicode normalization
    sitting between the encoding fixes and the remaining clean-ups.

    See `fix_text` for a description of the parameters.

    Raises UnicodeError when `text` is a bytes object.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # 'auto' disables entity unescaping for text containing both '<'
    # and '>'.
    if fix_entities == 'auto':
        if '<' in text and '>' in text:
            fix_entities = False

    # (flag, name of the function in `fixes`), split around the
    # normalization step so the overall order is kept.
    before_normalizing = (
        (remove_unsafe_private_use, 'remove_unsafe_private_use'),
        (fix_entities, 'unescape_html'),
        (remove_terminal_escapes, 'remove_terminal_escapes'),
        (fix_encoding, 'fix_text_encoding'),
    )
    after_normalizing = (
        (uncurl_quotes, 'uncurl_quotes'),
        (fix_line_breaks, 'fix_line_breaks'),
        (fix_surrogates, 'fix_surrogates'),
        (remove_control_chars, 'remove_control_chars'),
        (remove_bom, 'remove_bom'),
    )

    def run_stage(chunk, stage):
        # Apply every enabled fix of `stage` to `chunk`, in order.
        for wanted, fixer_name in stage:
            if wanted:
                chunk = getattr(fixes, fixer_name)(chunk)
        return chunk

    while True:
        unfixed = text
        text = run_stage(text, before_normalizing)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        text = run_stage(text, after_normalizing)
        if text == unfixed:
            return text
def clean_string(s):
    """
    Turn a raw value into a cleaned-up string, or None when unusable.

    Returns None when `s` is a missing value or when its string form
    contains no ASCII letter. Otherwise the string form is passed through
    the text-repair helpers, a handful of hard-coded replacements are
    applied (backslashes and slashes become spaces), a single CamelCase
    token is split into space-separated words, everything between
    round or square brackets is dropped, and the stripped result is
    returned.

    Bracket removal does not track nesting: any closing bracket ends the
    skipped region.
    """
    # Missing values must be detected on the raw input: once converted
    # with str(), None and NaN become "None" / "nan" and can no longer
    # be told apart from real text.
    try:
        missing = bool(isnull(s))
    except (TypeError, ValueError):
        # Presumably `isnull` is pandas' and may return an array for
        # non-scalar input, which has no single truth value. Such input
        # is treated as present and stringified. TODO confirm.
        missing = False
    if missing:
        return None

    s = str(s)
    if re.search('[a-zA-Z]', s) is None:
        return None

    # Text-repair helpers, applied in this exact order.
    for repair in (
        remove_bom,
        remove_control_chars,
        fix_encoding,
        fix_text,
        fix_partial_utf8_punct_in_1252,
        decode_escapes,
        fix_latin_ligatures,
        uncurl_quotes,
    ):
        s = repair(s)

    # NOTE(review): the first four patterns contain a literal "u00NN"
    # with no backslash. They are kept exactly as found; confirm whether
    # a "\u00NN" escape was intended.
    # NOTE(review): the last pair was written as two visually identical
    # literals. It is spelled out here as decomposed -> precomposed
    # "o with diaeresis", which is presumably what was meant.
    for broken, repaired in (
        ("Äu0087", "ć"),
        ("Äu0090", "Đ"),
        ("Ãu0096", "Ö"),
        ("Åu008D", "ō"),
        ("\\", " "),
        ("/", " "),
        ("o\u0308", "\u00f6"),
    ):
        s = s.replace(broken, repaired)

    # A single word token with a capital after its first character is
    # treated as CamelCase and split into words.
    if re.search(r"^\w+[A-Z]{1}\w*$", s):
        # From: https://stackoverflow.com/a/37697078
        # The replacement needs the leading space; substituting the bare
        # group back in would leave the string unchanged.
        s = re.sub('(?!^)([A-Z][a-z]+)', r' \1', s)

    kept = []
    inside_brackets = False
    for ch in s:
        if ch in "([":
            inside_brackets = True
        elif ch in ")]":
            inside_brackets = False
        elif not inside_brackets:
            kept.append(ch)
    return "".join(kept).strip()