Exemple #1
0
 def clean_text(self, address, **kwargs):
     """Basic clean-up of an address string.

     Every match of ``self.LINE_BREAKS`` and then of ``self.COMMATA`` is
     replaced with ``', '``; the result is passed through
     ``collapse_spaces``.

     Returns the cleaned address, or ``None`` when nothing is left.
     Extra keyword arguments are accepted but not used.
     """
     address = self.LINE_BREAKS.sub(', ', address)
     address = self.COMMATA.sub(', ', address)
     address = collapse_spaces(address)
     # A None result from collapse_spaces() would make len() raise a
     # TypeError, so test truthiness instead; this also covers ''.
     if not address:
         return None
     return address
Exemple #2
0
def normalize(text,
              lowercase=True,
              collapse=True,
              latinize=False,
              ascii=False,
              decompose=False,
              replace_categories=UNICODE_CATEGORIES):
    """Normalize a piece of text through a configurable pipeline.

    Input that is not a string yields ``None``. On Python 2, byte
    strings are decoded first, using the encoding reported by
    ``guess_encoding``.

    The steps below run in the listed order:

    * ``lowercase``: convert the text to lower case.
    * ``decompose``: apply a unicode normalization (NFKD) to separate
      simple characters and their diacritics.
    * ``ascii``: a strict form of transliteration that leaves only ASCII
      characters. Takes precedence over ``latinize``.
    * ``latinize``: unicode-based transliteration, e.g. of Cyrillic or
      CJK scripts into latin. Only applied when ``ascii`` is false.
    * ``replace_categories``: replace whole classes of unicode
      characters (e.g. symbols, marks, numbers) with a given character.
      This step always runs.
    * ``collapse``: replace multiple whitespace-like characters with a
      single whitespace; useful after category replacement, which can
      produce a lot of whitespace.
    """
    if not isinstance(text, six.string_types):
        return None

    # TODO: Python 3?
    is_py2_bytes = six.PY2 and not isinstance(text, six.text_type)
    if is_py2_bytes:
        text = text.decode(guess_encoding(text, 'utf-8'))

    if lowercase:
        text = text.lower()

    if decompose:
        text = decompose_nfkd(text)

    # ``ascii`` wins over ``latinize`` when both flags are set.
    if ascii:
        text = ascii_text(text)
    elif latinize:
        text = latinize_text(text)

    # Filter out whole classes of characters, such as symbols,
    # punctuation, or whitespace-like characters.
    text = category_replace(text, replace_categories)

    return collapse_spaces(text) if collapse else text
Exemple #3
0
 def clean_text(
     self,
     text: str,
     fuzzy: bool = False,
     format: Optional[str] = None,
     proxy: Optional["EntityProxy"] = None,
 ) -> Optional[str]:
     """Strip quotes from ``text`` and collapse its whitespace.

     ``fuzzy``, ``format`` and ``proxy`` are accepted but not used here.
     """
     return collapse_spaces(strip_quotes(text))
Exemple #4
0
def normalize(
    text: Any,
    lowercase: bool = True,
    collapse: bool = True,
    latinize: bool = False,
    ascii: bool = False,
    encoding_default: Encoding = DEFAULT_ENCODING,
    encoding: Optional[str] = None,
    replace_categories: Categories = UNICODE_CATEGORIES,
) -> Optional[str]:
    """The main normalization function for text.

    This will take a value, turn it into a string and apply a set of
    transformations to it so that it can be processed more easily
    afterwards. Arguments:

    * ``lowercase``: convert the text to lower case.
    * ``collapse``: replace multiple whitespace-like characters with a
      single whitespace. This is especially useful with category replacement
      which can lead to a lot of whitespace.
    * ``latinize``: perform unicode-based transliteration, e.g. of
      Cyrillic or CJK scripts into latin. Ignored when ``ascii`` is set.
    * ``ascii``: a stricter form of transliteration that leaves only
      ASCII characters. Takes precedence over ``latinize``.
    * ``encoding_default`` / ``encoding``: forwarded unchanged to
      ``stringify`` when converting ``text`` to a string.
    * ``replace_categories``: This will perform a replacement of whole
      classes of unicode characters (e.g. symbols, marks, numbers) with a
      given character. It is used to replace any non-text elements of the
      input string.

    Returns the normalized string, or ``None`` when ``stringify`` or the
    transliteration step yields ``None``.
    """
    text = stringify(text,
                     encoding_default=encoding_default,
                     encoding=encoding)
    if text is None:
        return None

    if lowercase:
        text = text.lower()

    if ascii:
        # A stricter form of transliteration that leaves only ASCII
        # characters.
        text = ascii_text(text)
    elif latinize:
        # Perform unicode-based transliteration, e.g. of Cyrillic
        # or CJK scripts into latin.
        text = latinize_text(text)

    # Transliteration may not produce a usable string.
    if text is None:
        return None

    # Perform unicode category-based character replacement. This is
    # used to filter out whole classes of characters, such as symbols,
    # punctuation, or whitespace-like characters.
    text = category_replace(text, replace_categories)

    if collapse:
        # Remove consecutive whitespace.
        text = collapse_spaces(text)
    return text
Exemple #5
0
def _safe_name(file_name: Optional[str], sep: str) -> Optional[str]:
    """Build an ASCII-only, normalized form of ``file_name``.

    The name is stringified, transliterated with ``ascii_text``, run
    through ``category_replace`` with ``UNICODE_CATEGORIES`` and has its
    whitespace collapsed; every ``WS`` is then swapped for ``sep``.

    Returns ``None`` if the input cannot be stringified or if nothing
    is left after cleaning.
    """
    name = stringify(file_name)
    if name is None:
        return None
    name = category_replace(ascii_text(name), UNICODE_CATEGORIES)
    name = collapse_spaces(name)
    if not name:
        return None
    return name.replace(WS, sep)
Exemple #6
0
 def clean_text(
     self,
     text: str,
     fuzzy: bool = False,
     format: Optional[str] = None,
     proxy: Optional["EntityProxy"] = None,
 ) -> Optional[str]:
     """Tidy up an address string.

     Every match of ``self.LINE_BREAKS`` and then of ``self.COMMATA`` is
     replaced with ``", "``, after which whitespace is collapsed.

     Returns ``None`` when the result is ``None`` or empty. ``fuzzy``,
     ``format`` and ``proxy`` are accepted but not used here.
     """
     without_breaks = self.LINE_BREAKS.sub(", ", text)
     single_commas = self.COMMATA.sub(", ", without_breaks)
     cleaned = collapse_spaces(single_commas)
     return cleaned if cleaned else None
Exemple #7
0
def slugify(value: Any, sep: str = "-") -> Optional[str]:
    """Generate a simple slug from ``value``.

    Slugs are pure ASCII lowercase strings that can be used in URLs and
    other places where a name has to be machine-safe.

    Returns ``None`` when ``value`` cannot be stringified, when
    transliteration yields ``None``, or when no valid characters remain.
    """
    raw = stringify(value)
    if raw is None:
        return None
    # Category replacement goes before transliteration because it gives
    # better results on special characters.
    spaced = category_replace(raw.replace(sep, WS), SLUG_CATEGORIES)
    latin = latinize_text(spaced, ascii=True)
    if latin is None:
        return None
    kept = "".join(ch for ch in latin.lower() if ch in VALID_CHARS)
    slug = collapse_spaces(kept)
    if not slug:
        return None
    return slug.replace(WS, sep)
Exemple #8
0
 def clean_text(self, name, **kwargs):
     """Strip quotes from ``name`` and collapse its whitespace.

     Extra keyword arguments are accepted but not used.
     """
     return collapse_spaces(strip_quotes(name))