def clean_terms(terms):
    """
    Tidy up a stream of single- or multi-word term strings, one at a time,
    and drop any term that has no word character left once cleaned.

    Every term goes through the same steps, in order: strip leading/trailing
    junk characters, fix a leading hyphen, resolve parentheses, repair odd
    hyphenation, repair odd apostrophe spacing, and normalize whitespace.

    Args:
        terms (Iterable[str]): terms such as "presidency", "epic failure", or
            "George W. Bush" that may be _unclean_ for whatever reason

    Yields:
        str: cleaned-up form of the next term in ``terms``; terms that were
        _entirely_ cruft (no word character remains) are skipped

    .. warning:: Terms with (intentionally) unusual punctuation may get
        "cleaned" into a form that changes or obscures the original meaning
        of the term.
    """
    for raw_term in terms:
        # strip junk from both ends, then deal with a leading hyphen
        cleaned = LEAD_TAIL_CRUFT_TERM_RE.sub('', raw_term)
        cleaned = LEAD_HYPHEN_TERM_RE.sub(r'\1', cleaned)

        # parens must be balanced and must open before they close;
        # otherwise the whole term is blanked out (and filtered below)
        n_close = cleaned.count(')')
        n_open = cleaned.count('(')
        if n_close != n_open or cleaned.find(')') < cleaned.find('('):
            cleaned = ''
        elif '(' in cleaned:
            cleaned = DANGLING_PARENS_TERM_RE.sub(r'\1\2\3', cleaned)

        # oddly separated hyphenated words; only bother if a hyphen is present
        if '-' in cleaned:
            cleaned = WEIRD_HYPHEN_SPACE_TERM_RE.sub(r'\1', cleaned)
            cleaned = NEG_DIGIT_TERM_RE.sub(r'\1\2', cleaned)

        # oddly separated apostrophe'd words; only bother if one is present
        if "'" in cleaned:
            cleaned = WEIRD_APOSTR_SPACE_TERM_RE.sub(r'\1\2', cleaned)

        # normalize whitespace and trim the ends
        cleaned = NONBREAKING_SPACE_REGEX.sub(' ', cleaned).strip()

        # keep only terms that still contain at least one word character
        if re.search(r'\w', cleaned) is not None:
            yield cleaned
def normalize_whitespace(text):
    """
    Collapse whitespace in ``text``: runs of linebreaks become a single
    newline, runs of spacing become a single space, and leading/trailing
    whitespace is stripped from the result.

    Args:
        text (str): string to normalize

    Returns:
        str: ``text`` with linebreaks and spacing normalized
    """
    # linebreaks are handled first, then spacing is applied to that result
    with_single_newlines = LINEBREAK_REGEX.sub(r'\n', text)
    with_single_spaces = NONBREAKING_SPACE_REGEX.sub(' ', with_single_newlines)
    return with_single_spaces.strip()