Exemple #1
0
 def normalize(self, name):
     """Turn ``name`` into a lower-case, underscore-separated identifier.

     The name is first passed through ``ascii_text`` and then through
     ``category_replace`` with ``UNICODE_CATEGORIES``. A name that has no
     lower-case characters (``upper()`` leaves it unchanged) has each
     ``WS`` replaced by an underscore and is lower-cased; any other name
     is converted with ``stringcase.snakecase``. Runs of underscores are
     finally squeezed into a single one.
     """
     cleaned = category_replace(ascii_text(name), UNICODE_CATEGORIES)
     if cleaned.upper() != cleaned:
         # Mixed or lower case: let stringcase split the words.
         converted = stringcase.snakecase(cleaned)
     else:
         # Already all upper case: only separators and case need changing.
         converted = cleaned.replace(WS, '_').lower()
     return re.sub('_+', '_', converted)
Exemple #2
0
def clean_strict(text, boundary=WS):
    """Super-hardcore string scrubbing.

    The text is transliterated with ``ascii_text``, every match of
    ``CHARACTERS_REMOVE_RE`` is removed, ``category_replace`` is applied,
    spaces are collapsed, and the result is wrapped in ``boundary`` on both
    sides.

    :param text: the value to scrub.
    :param boundary: string placed before and after the scrubbed text
        (defaults to ``WS``).
    :return: the padded, scrubbed string, or ``None`` when one of the
        helper steps yields ``None`` instead of text.
    """
    # transliterate to ascii
    text = ascii_text(text)
    if text is None:
        # Nothing to scrub; the substitution below cannot operate on None.
        return None
    # replace punctuation and symbols
    text = CHARACTERS_REMOVE_RE.sub('', text)
    text = collapse_spaces(category_replace(text))
    if text is None:
        # Joining None with the boundary would raise a TypeError.
        return None
    # pad out for company type replacements
    return ''.join((boundary, text, boundary))
Exemple #3
0
def clean_strict(text, boundary=WS):
    """Super-hardcore string scrubbing.

    Steps, in order: transliterate with ``ascii_text``, delete everything
    matched by ``CHARACTERS_REMOVE_RE``, run ``category_replace``, collapse
    spaces, then surround the result with ``boundary``.

    :param text: the value to scrub.
    :param boundary: string placed before and after the scrubbed text
        (defaults to ``WS``).
    :return: the padded, scrubbed string, or ``None`` when a helper step
        returns ``None`` rather than text.
    """
    # transliterate to ascii
    text = ascii_text(text)
    if text is None:
        # Bail out early: the substitution below cannot operate on None.
        return None
    # replace punctuation and symbols
    text = CHARACTERS_REMOVE_RE.sub('', text)
    text = category_replace(text)
    text = collapse_spaces(text)
    if text is None:
        # Avoid a TypeError from joining None with the boundary strings.
        return None
    # pad out for company type replacements
    return ''.join((boundary, text, boundary))
Exemple #4
0
def normalize_strong(text):
    """Heavily normalise ``text`` for comparison and machine analysis.

    The output is not meant to stay readable. The input goes through
    ``string_value`` and ``latinize_text``; if that yields ``None`` the
    function returns ``None``. Otherwise the text is lower-cased, passed
    through ``category_replace`` and has its spaces collapsed.
    """
    latinized = latinize_text(string_value(text))
    if latinized is not None:
        replaced = category_replace(latinized.lower())
        return collapse_spaces(replaced)
    return None
Exemple #5
0
def clean_strict(text: Optional[str], boundary: str = WS) -> Optional[str]:
    """Scrub ``text`` aggressively and pad it with ``boundary``.

    Returns ``None`` when ``ascii_text`` does not produce a string or when
    ``collapse_spaces`` returns ``None``; otherwise returns the scrubbed
    text with ``boundary`` on each side.
    """
    # ASCII transliteration; anything that is not a string ends here.
    transliterated = ascii_text(text)
    if not isinstance(transliterated, str):
        return None
    # Strip unwanted characters, then apply the category replacements.
    stripped = CHARACTERS_REMOVE_RE.sub("", transliterated)
    collapsed = collapse_spaces(category_replace(stripped))
    if collapsed is None:
        return None
    # The surrounding boundary is padding for company type replacements.
    padded = (boundary, collapsed, boundary)
    return "".join(padded)
def normalize(text):
    """Return a lower-cased ASCII form of ``text``, or ``None``.

    ``category_replace`` is applied with ``UNICODE_CATEGORIES`` before the
    result is handed to ``ascii_text``. When ``ascii_text`` returns
    ``None`` this function returns ``None`` as well.
    """
    replaced = category_replace(text, replacements=UNICODE_CATEGORIES)
    ascii_form = ascii_text(replaced)
    if ascii_form is None:
        return None
    return ascii_form.lower()