Exemple #1
0
def clean_terms(terms):
    """
    Lazily tidy up a sequence of single- or multi-word terms, dropping any term
    that has no word characters left once the cleanup is done.

    Each term goes through the same steps, in order: leading/trailing cruft
    removal, leading-hyphen handling, unbalanced/backwards parens rejection,
    odd hyphenation and apostrophe spacing repair, and whitespace normalization.
    The exact matching rules live in the module-level ``*_TERM_RE`` /
    ``NONBREAKING_SPACE_REGEX`` patterns defined elsewhere in this module.

    Args:
        terms (Iterable[str]): terms such as "presidency", "epic failure",
            or "George W. Bush" that may carry stray punctuation or spacing

    Yields:
        str: the cleaned form of the next term in `terms`; terms whose cleaned
            form contains no ``\\w`` character are skipped entirely

    .. warning:: Terms with (intentionally) unusual punctuation may get "cleaned"
        into a form that changes or obscures the original meaning of the term.
    """
    for raw_term in terms:
        # strip junk characters from both ends, then deal with a leading hyphen
        cleaned = LEAD_TAIL_CRUFT_TERM_RE.sub('', raw_term)
        cleaned = LEAD_HYPHEN_TERM_RE.sub(r'\1', cleaned)

        # parens must come in matched pairs with '(' first; otherwise the whole
        # term is blanked out (and therefore filtered by the final \w check)
        if cleaned.count(')') != cleaned.count('(') or cleaned.find(')') < cleaned.find('('):
            cleaned = ''
        elif '(' in cleaned:
            cleaned = DANGLING_PARENS_TERM_RE.sub(r'\1\2\3', cleaned)

        # repair oddly separated hyphenated words; only attempted when a
        # hyphen is actually present
        if '-' in cleaned:
            cleaned = WEIRD_HYPHEN_SPACE_TERM_RE.sub(r'\1', cleaned)
            cleaned = NEG_DIGIT_TERM_RE.sub(r'\1\2', cleaned)

        # repair oddly separated apostrophe'd words
        if "'" in cleaned:
            cleaned = WEIRD_APOSTR_SPACE_TERM_RE.sub(r'\1\2', cleaned)

        # normalize whitespace and trim the ends
        cleaned = NONBREAKING_SPACE_REGEX.sub(' ', cleaned).strip()

        # a term that was entirely cruft has no word characters left
        if re.search(r'\w', cleaned):
            yield cleaned
Exemple #2
0
def normalize_whitespace(text):
    """
    Collapse whitespace in ``text``: each run of linebreaks becomes a single
    newline, each run of spacings becomes a single space, and leading/trailing
    whitespace is removed from the result.

    Args:
        text (str): raw text to normalize

    Returns:
        str: ``text`` with linebreaks and spacings collapsed and both ends stripped
    """
    # linebreaks are collapsed first, then the remaining spacing runs
    with_single_newlines = LINEBREAK_REGEX.sub(r'\n', text)
    with_single_spaces = NONBREAKING_SPACE_REGEX.sub(' ', with_single_newlines)
    return with_single_spaces.strip()