def stringify(
    value: Any,
    encoding_default: str = DEFAULT_ENCODING,
    encoding: Optional[str] = None,
) -> Optional[str]:
    """Coerce an arbitrary object into a text string, or ``None``.

    The conversion is chosen by the type of ``value``, checked in this
    order:

    * ``None`` is returned unchanged.
    * ``str`` is handed to ``_clean_empty``.
    * ``date`` / ``datetime`` become their ``isoformat()`` text.
    * ``float`` / ``Decimal`` are converted via ``Decimal(value)`` and
      rendered with ``to_eng_string()``.
    * ``bytes`` are decoded (undecodable bytes are substituted, not
      raised), passed through ``remove_unsafe_chars`` and then
      ``_clean_empty``.
    * anything else is converted with ``str()`` and handed to
      ``_clean_empty``.

    Args:
        value: The object to convert.
        encoding_default: Fallback passed to ``guess_encoding`` as its
            ``default`` when ``encoding`` is not supplied.
        encoding: Codec used to decode ``bytes`` input. When ``None``,
            the codec is obtained from ``guess_encoding``.

    Returns:
        The resulting string, or ``None`` when ``value`` is ``None``,
        when ``remove_unsafe_chars`` yields ``None``, or whenever
        ``_clean_empty`` returns ``None`` (presumably for empty text --
        confirm against that helper).
    """
    if value is None:
        return None
    if isinstance(value, str):
        return _clean_empty(value)
    if isinstance(value, (date, datetime)):
        return value.isoformat()
    if isinstance(value, (float, Decimal)):
        return Decimal(value).to_eng_string()
    if not isinstance(value, bytes):
        # Last resort: rely on the object's own string conversion.
        return _clean_empty(str(value))

    # Only raw bytes reach this point; pick a codec and decode leniently.
    codec = encoding
    if codec is None:
        codec = guess_encoding(value, default=encoding_default)
    decoded = remove_unsafe_chars(value.decode(codec, "replace"))
    if decoded is None:
        return None
    return _clean_empty(decoded)
def stringify(value, encoding_default='utf-8', encoding=None):
    """Coerce an arbitrary object into a stripped text string, or ``None``.

    The conversion is chosen by the type of ``value``:

    * ``None`` is returned unchanged.
    * text (``six.text_type``) is used as-is.
    * ``date`` / ``datetime`` are returned as their ``isoformat()`` text
      without any further stripping.
    * ``float`` / ``Decimal`` are returned as
      ``Decimal(value).to_eng_string()`` without any further stripping.
    * binary data (``six.binary_type``) is decoded (undecodable bytes
      are substituted, not raised), then passed through
      ``remove_byte_order_mark`` and ``remove_unsafe_chars``.
    * anything else is converted with ``six.text_type()``.

    Text produced by the text, binary and fallback branches is stripped
    of surrounding whitespace; an empty result is reported as ``None``.

    Args:
        value: The object to convert.
        encoding_default: Fallback passed to ``guess_encoding`` as its
            ``default`` when ``encoding`` is not supplied.
        encoding: Codec used to decode binary input. When ``None``, the
            codec is obtained from ``guess_encoding``.

    Returns:
        The converted string, or ``None`` if ``value`` is ``None`` or
        the stripped text is empty.
    """
    if value is None:
        return None

    if isinstance(value, six.text_type):
        text = value
    elif isinstance(value, (date, datetime)):
        return value.isoformat()
    elif isinstance(value, (float, Decimal)):
        return Decimal(value).to_eng_string()
    elif isinstance(value, six.binary_type):
        codec = encoding
        if codec is None:
            codec = guess_encoding(value, default=encoding_default)
        text = value.decode(codec, 'replace')
        text = remove_byte_order_mark(text)
        text = remove_unsafe_chars(text)
    else:
        text = six.text_type(value)

    # XXX: stripping here is a long-standing open question; it is kept
    # because callers may rely on whitespace-only input becoming None.
    text = text.strip()
    if len(text) == 0:
        return None
    return text
def normalize(text, lowercase=True, collapse=True, latinize=False,
              ascii=False, decompose=False,
              replace_categories=UNICODE_CATEGORIES):
    """Apply a configurable chain of clean-up steps to a piece of text.

    The enabled steps always run in this fixed order: lowercasing,
    NFKD decomposition, transliteration, unicode category replacement,
    whitespace collapsing. Category replacement is not optional.

    Arguments:

    * ``text``: the string to normalize. Anything that is not a string
      (``six.string_types``) makes the function return ``None``.
    * ``lowercase``: lowercase the text with ``str.lower()``.
    * ``collapse``: run ``collapse_spaces`` on the result, so that runs
      of whitespace-like characters become a single whitespace. This
      matters most after category replacement, which can leave a lot
      of whitespace behind.
    * ``latinize``: transliterate other scripts (e.g. Cyrillic or CJK)
      into latin characters via ``latinize_text``. Ignored when
      ``ascii`` is set.
    * ``ascii``: a stricter transliteration via ``ascii_text`` that
      leaves only ASCII characters. Takes precedence over ``latinize``.
    * ``decompose``: apply unicode NFKD normalization via
      ``decompose_nfkd`` to separate simple characters from their
      diacritics.
    * ``replace_categories``: passed to ``category_replace`` to swap
      whole classes of unicode characters (e.g. symbols, marks,
      numbers) for a given character, removing non-text elements.

    Returns the normalized text, or ``None`` for non-string input.
    """
    if not isinstance(text, six.string_types):
        return None

    # On Python 2 a byte string passes the check above and has to be
    # decoded before any of the unicode-aware steps can run.
    if six.PY2 and not isinstance(text, six.text_type):
        text = text.decode(guess_encoding(text, 'utf-8'))

    if lowercase:
        text = text.lower()

    if decompose:
        text = decompose_nfkd(text)

    # The two transliteration modes are mutually exclusive; the
    # stricter ASCII-only mode wins when both flags are set.
    if ascii:
        text = ascii_text(text)
    elif latinize:
        text = latinize_text(text)

    # Filter out whole classes of characters such as symbols,
    # punctuation, or whitespace-like characters.
    replaced = category_replace(text, replace_categories)

    # Squash the consecutive whitespace the replacement may leave behind.
    return collapse_spaces(replaced) if collapse else replaced