def decompose_nfkd(text: Any) -> Optional[str]:
    """Apply Unicode compatibility decomposition (NFKD) to ``text``.

    Compatibility characters are replaced by their standard equivalents,
    and composed characters are split into a base character followed by
    its combining marks, each as a separate codepoint.

    Returns the normalised string, or ``None`` when ``is_text`` rejects
    the input.
    """
    if is_text(text):
        return unicodedata.normalize("NFKD", text)
    return None
def latinize_text(text: Optional[str], ascii: bool = False) -> Optional[str]:
    """Transliterate ``text`` into the latin script.

    The conversion picks the closest latin match for each character of
    the original script. With ``ascii`` set, the transliterator is built
    from ``ASCII_SCRIPT``; otherwise the ``"Any-Latin"`` transform is used.

    Transliterators are created lazily through ``make_trans`` and cached
    as attributes on this function (``_ascii`` / ``_tr``), so each one is
    built at most once.

    Input that is ``None``, is rejected by ``is_text``, or is empty is
    handed back unchanged.
    """
    if text is None:
        return text
    if not is_text(text) or len(text) == 0:
        return text

    # Select the cache slot and the transform it belongs to.
    if ascii:
        cache_attr, script = "_ascii", ASCII_SCRIPT
    else:
        cache_attr, script = "_tr", "Any-Latin"

    if not hasattr(latinize_text, cache_attr):
        setattr(latinize_text, cache_attr, make_trans(script))
    transliterate = getattr(latinize_text, cache_attr)
    return transliterate(text)
def category_replace(
    text: Any, replacements: Categories = UNICODE_CATEGORIES
) -> Optional[str]:
    """Rewrite ``text`` character by character based on unicode category.

    This allows whole classes of non-text characters (punctuation,
    whitespace, marks, diacritics, ...) to be replaced or removed without
    enumerating them individually.

    The text is first passed through ``decompose_nfkd``. Each resulting
    character's category is then looked up in ``replacements``:

    * category absent from the mapping: the character is kept as is;
    * category mapped to ``None``: the character is dropped;
    * any other value: that value is emitted in place of the character.

    Returns ``None`` when the decomposed value is rejected by ``is_text``.
    """
    decomposed = decompose_nfkd(text)
    if not is_text(decomposed):
        return None
    # Lazily map every character to its replacement (or itself) ...
    mapped = (
        replacements.get(unicodedata.category(char), char)
        for char in decomposed
    )
    # ... and keep only those that were not explicitly mapped to None.
    return "".join(piece for piece in mapped if piece is not None)
def ascii_text(text: Optional[str]) -> Optional[str]:
    """Transliterate ``text`` and guarantee the result is pure ASCII.

    The input goes through ``latinize_text`` in ASCII mode; any character
    that still cannot be encoded as ASCII afterwards is discarded.

    Returns ``None`` when the transliterated value is ``None`` or is
    rejected by ``is_text``.
    """
    latinized = latinize_text(text, ascii=True)
    if latinized is not None and is_text(latinized):
        # "ignore" silently drops whatever is left outside the ASCII range.
        ascii_bytes = latinized.encode("ascii", "ignore")
        return ascii_bytes.decode("ascii")
    return None
def collapse_spaces(text: Any) -> Optional[str]:
    """Collapse newlines, tabs and repeated spaces into single spaces.

    Every match of ``COLLAPSE_RE`` is replaced by ``WS``, after which
    leading and trailing ``WS`` characters are stripped.

    Returns ``None`` when ``is_text`` rejects the input.
    """
    if is_text(text):
        collapsed = COLLAPSE_RE.sub(WS, text)
        return collapsed.strip(WS)
    return None
def remove_byte_order_mark(text: Any) -> Optional[str]:
    """Remove a BOM from the beginning of the text.

    Every match of ``BOM_RE`` is deleted from ``text``.

    Args:
        text: The value to clean; anything rejected by ``is_text`` is
            treated as "no text".

    Returns:
        The text with all ``BOM_RE`` matches removed, or ``None`` when
        ``is_text`` rejects the input.
    """
    if not is_text(text):
        return None
    return BOM_RE.sub('', text)
def remove_unsafe_chars(text: Any) -> Optional[str]:
    """Remove unsafe unicode characters from a piece of text.

    Every match of ``UNSAFE_RE`` is deleted from ``text``.

    Args:
        text: The value to clean; anything rejected by ``is_text`` is
            treated as "no text".

    Returns:
        The text with all ``UNSAFE_RE`` matches removed, or ``None`` when
        ``is_text`` rejects the input.
    """
    if not is_text(text):
        return None
    return UNSAFE_RE.sub('', text)
def strip_quotes(text: Any) -> Optional[str]:
    """Strip the single or double quotes that surround a string.

    Each match of ``QUOTES_RE`` is replaced by the contents of its first
    capture group.

    Returns ``None`` when ``is_text`` rejects the input.
    """
    if is_text(text):
        # Substitute each match with capture group 1 of the pattern.
        unquoted = QUOTES_RE.sub("\\1", text)
        return unquoted
    return None
def compose_nfkc(text: Any) -> Optional[str]:
    """Apply Unicode NFKC normalisation to ``text``.

    NFKC performs the compatibility decomposition and then recombines
    characters using canonical composition.

    Returns the normalised string, or ``None`` when ``is_text`` rejects
    the input.
    """
    if is_text(text):
        return unicodedata.normalize("NFKC", text)
    return None