Ejemplo n.º 1
0
def stringify(
    value: Any, encoding_default: str = DEFAULT_ENCODING, encoding: Optional[str] = None
) -> Optional[str]:
    """Coerce an arbitrary object into a text string, or ``None``.

    Conversions are tried from the most specific type to the most generic:

    * ``None`` is passed through unchanged.
    * ``str`` values are handed to ``_clean_empty``.
    * ``date`` / ``datetime`` values are rendered with ``isoformat()``.
    * ``float`` / ``Decimal`` values are rendered via
      ``Decimal(value).to_eng_string()``.
    * ``bytes`` are decoded with ``encoding`` (or, when that is ``None``, with
      whatever ``guess_encoding`` returns for ``encoding_default``); bytes
      that cannot be decoded are substituted rather than raising. The decoded
      text is then passed through ``remove_unsafe_chars``.
    * Anything else is converted with ``str()``.

    Every textual result goes through ``_clean_empty`` before being returned
    (presumably mapping blank strings to ``None`` -- confirm against that
    helper).
    """
    if value is None:
        return None
    if isinstance(value, str):
        return _clean_empty(value)
    if isinstance(value, (date, datetime)):
        return value.isoformat()
    if isinstance(value, (float, Decimal)):
        return Decimal(value).to_eng_string()
    if not isinstance(value, bytes):
        # Last resort: rely on the object's own string representation.
        return _clean_empty(str(value))

    # Only raw bytes reach this point; an explicit encoding wins over guessing.
    codec = encoding
    if codec is None:
        codec = guess_encoding(value, default=encoding_default)
    decoded = value.decode(codec, "replace")
    sanitized = remove_unsafe_chars(decoded)
    if sanitized is None:
        return None
    return _clean_empty(sanitized)
Ejemplo n.º 2
0
def stringify(value, encoding_default='utf-8', encoding=None):
    """Coerce an arbitrary object into a unicode string, or ``None``.

    ``None`` is returned as-is. Dates and datetimes are returned in ISO
    format, and floats and Decimals via ``Decimal(value).to_eng_string()``;
    these two cases are returned directly without any stripping.

    Byte strings are decoded with ``encoding`` (or, when that is ``None``,
    with the result of ``guess_encoding`` given ``encoding_default``),
    substituting undecodable bytes, and then passed through
    ``remove_byte_order_mark`` and ``remove_unsafe_chars``. Any other
    non-text object is converted with ``six.text_type``.

    The resulting text is stripped of surrounding whitespace; if nothing
    remains, ``None`` is returned instead of an empty string.
    """
    if value is None:
        return None

    if isinstance(value, six.text_type):
        text = value
    elif isinstance(value, (date, datetime)):
        return value.isoformat()
    elif isinstance(value, (float, Decimal)):
        return Decimal(value).to_eng_string()
    elif isinstance(value, six.binary_type):
        # An explicitly supplied encoding takes priority over detection.
        codec = encoding
        if codec is None:
            codec = guess_encoding(value, default=encoding_default)
        text = value.decode(codec, 'replace')
        text = remove_byte_order_mark(text)
        text = remove_unsafe_chars(text)
    else:
        text = six.text_type(value)

    # Whitespace is always trimmed here; the original author flagged this
    # as a questionable choice, so callers should not rely on padding.
    text = text.strip()
    return text if text else None
Ejemplo n.º 3
0
def normalize(text,
              lowercase=True,
              collapse=True,
              latinize=False,
              ascii=False,
              decompose=False,
              replace_categories=UNICODE_CATEGORIES):
    """Apply a configurable chain of clean-up steps to a piece of text.

    The steps run in a fixed order: lowercasing, decomposition,
    transliteration, category replacement and finally whitespace
    collapsing. If ``text`` is not a string (``six.string_types``),
    ``None`` is returned. Arguments:

    * ``lowercase``: convert the text to lower case.
    * ``collapse``: pass the result through ``collapse_spaces`` so that runs
      of whitespace-like characters become a single whitespace. Category
      replacement can introduce a lot of whitespace, which this cleans up.
    * ``latinize``: transliterate the text with ``latinize_text``. Ignored
      when ``ascii`` is set.
    * ``ascii``: transliterate the text with ``ascii_text``, a stricter
      form that leaves only ASCII characters. Takes precedence over
      ``latinize``.
    * ``decompose``: apply a unicode normalization (NFKD) to separate
      simple characters from their diacritics.
    * ``replace_categories``: passed to ``category_replace`` to substitute
      whole classes of unicode characters (e.g. symbols, marks, numbers).
      This step always runs.
    """
    if not isinstance(text, six.string_types):
        return None

    # On Python 2 a byte string passes the check above and still needs to
    # be decoded to unicode before any further processing.
    if six.PY2 and not isinstance(text, six.text_type):
        text = text.decode(guess_encoding(text, 'utf-8'))

    if lowercase:
        text = text.lower()

    if decompose:
        text = decompose_nfkd(text)

    # Pick at most one transliterator; the ASCII variant wins when both
    # flags are set.
    if ascii:
        transliterate = ascii_text
    elif latinize:
        transliterate = latinize_text
    else:
        transliterate = None
    if transliterate is not None:
        text = transliterate(text)

    # Category-based replacement filters out whole classes of characters,
    # such as symbols, punctuation or whitespace-like characters.
    text = category_replace(text, replace_categories)

    if not collapse:
        return text
    return collapse_spaces(text)