def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Run every enabled fixer over one chunk of text until nothing changes.

    The chunk can be a single line handed over by `fix_text`, or a bigger
    piece of text that is known to use one consistent encoding.

    The parameters are described in `fix_text`.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # In 'auto' mode, text containing both '<' and '>' keeps its entities
    # untouched (presumably because it looks like markup).
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # Gather the enabled steps once, in the order they have to be applied.
    steps = []
    if remove_terminal_escapes:
        steps.append(fixes.remove_terminal_escapes)
    if fix_encoding:
        steps.append(fixes.fix_encoding)
    if fix_entities:
        steps.append(fixes.unescape_html)
    if fix_latin_ligatures:
        steps.append(fixes.fix_latin_ligatures)
    if fix_character_width:
        steps.append(fixes.fix_character_width)
    if uncurl_quotes:
        steps.append(fixes.uncurl_quotes)
    if fix_line_breaks:
        steps.append(fixes.fix_line_breaks)
    if fix_surrogates:
        steps.append(fixes.fix_surrogates)
    if remove_control_chars:
        steps.append(fixes.remove_control_chars)
    elif remove_bom:
        # Only needed when `remove_control_chars` did not run; after that
        # step, removing the BOM would be redundant.
        steps.append(fixes.remove_bom)
    if normalization is not None:
        steps.append(lambda chunk: unicodedata.normalize(normalization, chunk))

    # One fix can expose work for another, so repeat the whole sequence
    # until a full pass leaves the text unchanged.
    while True:
        before = text
        for step in steps:
            text = step(text)
        if text == before:
            return text
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Fix one chunk of text, repeating the enabled fixes until it is stable.

    The chunk may be one line out of a larger `fix_text` run, or a longer
    text whose encoding is known to be consistent throughout.

    `fix_text` documents what each parameter does.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # 'auto' turns entity unescaping off when the text holds both '<' and '>'.
    unescape = fix_entities
    if unescape == 'auto' and '<' in text and '>' in text:
        unescape = False

    # The BOM only needs separate handling when `remove_control_chars` is
    # off; otherwise the extra step would be redundant.
    strip_bom_separately = remove_bom and not remove_control_chars

    def one_pass(chunk):
        # Apply each enabled fix exactly once, in the fixed order.
        if remove_terminal_escapes:
            chunk = fixes.remove_terminal_escapes(chunk)
        if fix_encoding:
            chunk = fixes.fix_encoding(chunk)
        if unescape:
            chunk = fixes.unescape_html(chunk)
        if fix_latin_ligatures:
            chunk = fixes.fix_latin_ligatures(chunk)
        if fix_character_width:
            chunk = fixes.fix_character_width(chunk)
        if uncurl_quotes:
            chunk = fixes.uncurl_quotes(chunk)
        if fix_line_breaks:
            chunk = fixes.fix_line_breaks(chunk)
        if fix_surrogates:
            chunk = fixes.fix_surrogates(chunk)
        if remove_control_chars:
            chunk = fixes.remove_control_chars(chunk)
        if strip_bom_separately:
            chunk = fixes.remove_bom(chunk)
        if normalization is not None:
            chunk = unicodedata.normalize(normalization, chunk)
        return chunk

    # Keep going until a complete pass no longer changes anything.
    while True:
        result = one_pass(text)
        if result == text:
            return result
        text = result
def add_latin_ligatures(line):
    """Return the line with latin ligatures fixed, or False if unchanged.

    Param:
        line (unicode)

    Returns:
        The output of `fix_latin_ligatures(line)` when it differs from
        `line`; False when the two are equal.
    """
    without_ligatures = fix_latin_ligatures(line)
    return without_ligatures if line != without_ligatures else False
def clean_string(s):
    """Clean up a raw value and return it as a stripped string, or None.

    Steps, in order:

    1. Missing values (``None`` or a float NaN) return ``None``.
    2. The value is converted with ``str()``; it returns ``None`` if
       ``isnull`` reports it as null or if it contains no ASCII letter.
    3. The text is passed through the clean-up helpers ``remove_bom``,
       ``remove_control_chars``, ``fix_encoding``, ``fix_text``,
       ``fix_partial_utf8_punct_in_1252``, ``decode_escapes``,
       ``fix_latin_ligatures`` and ``uncurl_quotes`` (all defined outside
       this function).
    4. A few hard-coded character sequences are replaced, and backslashes
       and forward slashes become spaces.
    5. If the whole string is one camel-case word, a space is inserted
       before each inner capitalised part.
    6. Text between ``(``/``[`` and the next ``)``/``]`` is dropped together
       with the brackets, and the result is stripped of surrounding
       whitespace.

    Returns:
        The cleaned string, or ``None`` when there is nothing usable.
    """
    # str(None) and str(nan) give "None" and "nan", which contain letters and
    # would otherwise be returned as if they were real text. `s != s` is true
    # only for NaN.
    if s is None or (isinstance(s, float) and s != s):
        return None

    s = str(s)
    if isnull(s):
        return None
    if re.search('[a-zA-Z]', s) is None:
        return None

    s = remove_bom(s)
    s = remove_control_chars(s)
    s = fix_encoding(s)
    s = fix_text(s)
    s = fix_partial_utf8_punct_in_1252(s)
    s = decode_escapes(s)
    s = fix_latin_ligatures(s)
    s = uncurl_quotes(s)

    # NOTE(review): these look like mis-decoded UTF-8 sequences whose "\u"
    # escape lost its backslash (e.g. "Ä\u0087" is how "ć" appears when its
    # UTF-8 bytes are read as Latin-1). Left as found; confirm against the
    # data these were written for.
    s = s.replace("Äu0087", "ć")
    s = s.replace("Äu0090", "Đ")
    s = s.replace("Ãu0096", "Ö")
    s = s.replace("Åu008D", "ō")
    s = s.replace("\\", " ")
    s = s.replace("/", " ")
    # NOTE(review): both literals render as "ö"; presumably the first is a
    # decomposed form. Confirm that they differ, otherwise this is a no-op.
    s = s.replace("ö", "ö")

    # A single word with a capital letter after its first character.
    camel_case_word = re.compile(r"^\w+[A-Z]{1}\w*$")
    if camel_case_word.search(s):
        # From: https://stackoverflow.com/a/37697078
        # The replacement needs the leading space; without it the
        # substitution puts back exactly what it matched.
        s = re.sub('(?!^)([A-Z][a-z]+)', r' \1', s)

    # Drop bracketed passages. An opening bracket starts skipping; a closing
    # bracket stops skipping and is itself discarded.
    kept = []
    inside_brackets = False
    for letter in s:
        if letter in "([":
            inside_brackets = True
        elif letter in ")]":
            inside_brackets = False
            continue
        if not inside_brackets:
            kept.append(letter)
    return "".join(kept).strip()