def normalize_col_name(self, col_name, used_column_names, is_relation):
    """Clean up a database column name so it is usable as a Python field name.

    Returns a 3-tuple ``(new_name, field_params, field_notes)`` where
    ``field_params`` may carry a ``db_column`` override and ``field_notes``
    lists human-readable reasons for each rename that was applied.
    """
    params = {}
    notes = []

    # Strip invalid UTF-8 bytes and normalize case first.
    new_name = clean_utf8(col_name).lower()
    if new_name != col_name:
        notes.append('Field name cleaned of non-UTF-8 bytes and cast to lowercase.')

    if is_relation:
        if new_name.endswith('_id'):
            # Django adds the "_id" suffix to FK columns automatically.
            new_name = new_name[:-3]
        else:
            params['db_column'] = col_name

    # Replace every non-word character with an underscore.
    new_name, replaced = re.subn(r'\W', '_', new_name)
    if replaced > 0:
        notes.append('Field renamed to remove unsuitable characters.')

    if '__' in new_name:
        # Remember whether the double underscore existed before our own
        # substitutions, so the note only mentions original-name problems.
        originally_doubled = '__' in col_name.lower()
        # Collapse every run of underscores down to a single one.
        new_name = re.sub(r'_+', '_', new_name)
        if originally_doubled:
            notes.append("Field renamed because it contained more than one '_' in a row.")

    if new_name.startswith('_'):
        new_name = 'field%s' % new_name
        notes.append("Field renamed because it started with '_'.")

    if new_name.endswith('_'):
        new_name = '%sfield' % new_name
        notes.append("Field renamed because it ended with '_'.")

    if keyword.iskeyword(new_name):
        new_name += '_field'
        notes.append('Field renamed because it was a Python reserved word.')

    if new_name[0].isdigit():
        new_name = 'number_%s' % new_name
        notes.append("Field renamed because it wasn't a valid Python identifier.")

    # Resolve collisions by appending the lowest unused numeric suffix.
    if new_name in used_column_names:
        suffix = 0
        while '%s_%d' % (new_name, suffix) in used_column_names:
            suffix += 1
        new_name = '%s_%d' % (new_name, suffix)
        notes.append('Field renamed because of name conflict.')

    # Any rename means the ORM needs the original column name recorded.
    if col_name != new_name and notes:
        params['db_column'] = col_name

    return new_name, params, notes
def make_serializable(data, mutable=True):
    r"""Make sure the data structure is json serializable (json.dumps-able), all they way down to scalars in nested structures.

    If mutable=False then return tuples for all iterables, except basestrings (strs),
    so that they can be used as keys in a Mapping (dict).

    >>> from collections import OrderedDict
    >>> from decimal import Decimal
    >>> data = {'x': Decimal('01.234567891113151719'), 'X': [{('y', 'z'): {'q': 'A\xFFB'}}, 'ender'] }
    >>> make_serializable(OrderedDict(data)) == {'X': [{('y', 'z'): {'q': 'A\xc3\xbfB'}}, 'ender'], 'x': 1.2345678911131517}
    True
    """
    # Strings: just strip any invalid UTF-8 bytes and return them as-is.
    if isinstance(data, basestring):
        return db.clean_utf8(data)
    # Dates and times serialize as their str() representation.
    if isinstance(data, (datetime.datetime, datetime.date, datetime.time)):
        return str(data)
    if isinstance(data, Mapping):
        # Keys must be hashable, so they are always made immutable
        # (mutable=False); values follow the caller's choice.
        # NOTE: this mutability business is probably unnecessary for keys because
        # the keys of the mapping will already be immutable... at least until
        # python 3 MutableMappings
        mapping = tuple((make_serializable(k, mutable=False),
                         make_serializable(v, mutable=mutable))
                        for (k, v) in data.iteritems())
        if mutable:
            return dict(mapping)
        return mapping
    # Other iterables (lists, tuples, sets, generators) are walked recursively.
    if hasattr(data, '__iter__'):
        if mutable:
            return list(make_serializable(v, mutable=mutable) for v in data)
        return tuple(make_serializable(v, mutable=mutable) for v in data)
    if isinstance(data, (float, Decimal)):
        return float(data)
    # Scalar fallbacks: int, then float, then date/datetime parsing, then a
    # last-resort string coercion. `except Exception` (rather than a bare
    # `except:`) keeps the original best-effort behavior without swallowing
    # KeyboardInterrupt/SystemExit.
    try:
        return int(data)
    except Exception:
        pass
    try:
        return float(data)
    except Exception:
        pass
    try:
        # try to parse a date or datetime string
        return parser.parse(str(data))
    except Exception:
        return str(try_convert(data))
def augment_field_meta(field, queryset, field_properties, verbosity=0, count=0):
    """Return a dict of statistical properties (metadata) for a database column (model field)

    NOTE(review): despite the summary line, this function returns None and mutates
    `field_properties` in place -- confirm that callers rely on the mutation.

    Strings are UTF-8 encoded (UTF-16 or invalid UTF-8 characters are ignored)
    Resulting dictionary is json-serializable using the pug.nlp.db.RobustEncoder class.

    {
        'num_distinct':  # count of distinct (different) discrete values within the column
        'min':           # minimum value
        'max':           # maximum value
        'num_null':      # count of the Null or None values in the column
        'type':          # database column type
    }

    TODO:
      1. count the number of values that are strings that could be converted to
         a. integers
         b. floats
         c. dates / datetimes
         d. booleans / nullbooleans
         e. other ordinal, categorical, or quantitative types
      2. count the number of null values
         a. null/None
         b. blank
         c. whitespace or other strings signifying null
            ('NULL', 'None', 'N/A', 'NaN', 'Not provided')
    """
    # Calculate the fraction of values in a column that are distinct (unique).
    # For columns that aren't populated with 100% distinct values, the fraction may
    # help identify columns that are part of a "unique-together" compound key.
    # Necessary constraint for col1 and col2 to be compound key:
    #     col1_uniqueness + col2_uniqueness >= 1.0 (100%)
    # TODO: check for other clues about primary_keyness besides just uniqueness

    # Defaults in case the queries below are skipped or raise.
    # (presumably `count` is the total row count of `queryset` -- TODO confirm at the caller)
    field_properties['num_distinct'] = count
    field_properties['num_null'] = count
    field_properties['fraction_distinct'] = count
    typ = field_properties.get('type')
    # Only attempt the counting queries for countable column types on non-empty tables.
    if typ and typ not in types_not_countable and count:
        try:
            field_properties['num_distinct'] = queryset.values(field.name).distinct().count()
            field_properties['num_null'] = queryset.filter(**{'%s__isnull' % field.name: True}).count()
            field_properties['fraction_distinct'] = float(field_properties['num_distinct']) / (count or 1)
        except DatabaseError as e:
            if verbosity:
                print_exc()
                print "DatabaseError: Skipped count of values in field named '%s' (%s) because of %s." \
                    % (field.name, repr(field.db_column), e)
            # Reset the DB connection so later queries aren't poisoned by the failed one.
            connection.close()
        try:
            # Only build a Top-10 histogram for columns that are neither constant
            # nor (nearly) 100% unique.
            if field_properties['num_distinct'] > 1 and (0 < field_properties['fraction_distinct'] < 0.999):
                # this will not work until pyodbc is updated
                # May be related to django-pyodbc incompatability with django 1.6
                # FIXME: use the working query for values.distinct.count and sort that dict
                #        and then query the top 10 of those individually
                field_properties['most_frequent'] = [(v, c) for (v, c) in
                                                     queryset.distinct().values(field.name)
                                                     .annotate(field_value_count=models.Count(field.name))
                                                     .extra(order_by=['-field_value_count'])
                                                     .values_list(field.name, 'field_value_count')
                                                     [:min(field_properties['num_distinct'], 10)]]
        except (StandardError, FieldError, DatabaseError) as e:
            if verbosity:
                print "Warning: Failed to calculate the Top-10 histogram for field named '%s' (%s) because of %s." \
                    % (field.name, repr(field.db_column), e)
                if verbosity > 2:
                    print_exc()
    field_properties['max'] = None
    field_properties['min'] = None
    # check field_properties['num_null'] for all Null first?
    # Only aggregate max/min for aggregatable column types on non-empty tables.
    if count and typ and typ not in types_not_aggregatable:
        connection.close()
        try:
            field_properties['max'] = db.clean_utf8(queryset.aggregate(max_value=models.Max(field.name))['max_value'])
            field_properties['min'] = db.clean_utf8(queryset.aggregate(min_value=models.Min(field.name))['min_value'])
        except ValueError as e:
            if verbosity:
                print_exc()
                print "ValueError (perhaps UnicodeDecodeError?): Skipped max/min calculations for field named '%s' (%s) because of %s." \
                    % (field.name, repr(field.db_column), e)
            connection.close()
        except DatabaseError, e:
            if verbosity:
                print_exc()
                print "DatabaseError: Skipped max/min calculations for field named '%s' (%s) because of %s." \
                    % (field.name, repr(field.db_column), e)
            connection.close()
    # validate values that might be invalid strings due to db encoding/decoding errors (make sure they are UTF-8)
    # NOTE(review): the cleaned value is discarded here, so this loop has no effect;
    # 'min'/'max' were already cleaned above -- confirm whether an assignment was intended.
    for k in ('min', 'max'):
        db.clean_utf8(field_properties.get(k))
def augment_field_meta(field, queryset, field_properties, verbosity=0, count=0):
    """Return a dict of statistical properties (metadata) for a database column (model field)

    NOTE(review): this is a byte-for-byte duplicate of an identically-named function
    defined earlier in this module; this later definition shadows the earlier one.
    One of the two copies should be removed.

    NOTE(review): despite the summary line, this function returns None and mutates
    `field_properties` in place -- confirm that callers rely on the mutation.

    Strings are UTF-8 encoded (UTF-16 or invalid UTF-8 characters are ignored)
    Resulting dictionary is json-serializable using the pug.nlp.db.RobustEncoder class.

    {
        'num_distinct':  # count of distinct (different) discrete values within the column
        'min':           # minimum value
        'max':           # maximum value
        'num_null':      # count of the Null or None values in the column
        'type':          # database column type
    }

    TODO:
      1. count the number of values that are strings that could be converted to
         a. integers
         b. floats
         c. dates / datetimes
         d. booleans / nullbooleans
         e. other ordinal, categorical, or quantitative types
      2. count the number of null values
         a. null/None
         b. blank
         c. whitespace or other strings signifying null
            ('NULL', 'None', 'N/A', 'NaN', 'Not provided')
    """
    # Calculate the fraction of values in a column that are distinct (unique).
    # For columns that aren't populated with 100% distinct values, the fraction may
    # help identify columns that are part of a "unique-together" compound key.
    # Necessary constraint for col1 and col2 to be compound key:
    #     col1_uniqueness + col2_uniqueness >= 1.0 (100%)
    # TODO: check for other clues about primary_keyness besides just uniqueness

    # Defaults in case the queries below are skipped or raise.
    # (presumably `count` is the total row count of `queryset` -- TODO confirm at the caller)
    field_properties['num_distinct'] = count
    field_properties['num_null'] = count
    field_properties['fraction_distinct'] = count
    typ = field_properties.get('type')
    # Only attempt the counting queries for countable column types on non-empty tables.
    if typ and typ not in types_not_countable and count:
        try:
            field_properties['num_distinct'] = queryset.values(
                field.name).distinct().count()
            field_properties['num_null'] = queryset.filter(
                **{
                    '%s__isnull' % field.name: True
                }).count()
            field_properties['fraction_distinct'] = float(
                field_properties['num_distinct']) / (count or 1)
        except DatabaseError as e:
            if verbosity:
                print_exc()
                print "DatabaseError: Skipped count of values in field named '%s' (%s) because of %s." \
                    % (
                        field.name, repr(field.db_column), e)
            # Reset the DB connection so later queries aren't poisoned by the failed one.
            connection.close()
        try:
            # Only build a Top-10 histogram for columns that are neither constant
            # nor (nearly) 100% unique.
            if field_properties['num_distinct'] > 1 and (
                    0 < field_properties['fraction_distinct'] < 0.999):
                # this will not work until pyodbc is updated
                # May be related to django-pyodbc incompatability with django 1.6
                # FIXME: use the working query for values.distinct.count and sort that dict
                #        and then query the top 10 of those individually
                field_properties['most_frequent'] = [
                    (v, c)
                    for (v, c) in queryset.distinct().values(field.name).annotate(
                        field_value_count=models.Count(field.name)).extra(
                            order_by=['-field_value_count']).values_list(
                                field.name, 'field_value_count')
                    [:min(field_properties['num_distinct'], 10)]
                ]
        except (StandardError, FieldError, DatabaseError) as e:
            if verbosity:
                print "Warning: Failed to calculate the Top-10 histogram for field named '%s' (%s) because of %s." % (
                    field.name, repr(field.db_column), e)
                if verbosity > 2:
                    print_exc()
    field_properties['max'] = None
    field_properties['min'] = None
    # check field_properties['num_null'] for all Null first?
    # Only aggregate max/min for aggregatable column types on non-empty tables.
    if count and typ and typ not in types_not_aggregatable:
        connection.close()
        try:
            field_properties['max'] = db.clean_utf8(
                queryset.aggregate(
                    max_value=models.Max(field.name))['max_value'])
            field_properties['min'] = db.clean_utf8(
                queryset.aggregate(
                    min_value=models.Min(field.name))['min_value'])
        except ValueError as e:
            if verbosity:
                print_exc()
                print "ValueError (perhaps UnicodeDecodeError?): Skipped max/min calculations for field named '%s' (%s) because of %s." % (
                    field.name, repr(field.db_column), e)
            connection.close()
        except DatabaseError, e:
            if verbosity:
                print_exc()
                print "DatabaseError: Skipped max/min calculations for field named '%s' (%s) because of %s." % (
                    field.name, repr(field.db_column), e)
            connection.close()
    # validate values that might be invalid strings due to db encoding/decoding errors (make sure they are UTF-8)
    # NOTE(review): the cleaned value is discarded here, so this loop has no effect;
    # 'min'/'max' were already cleaned above -- confirm whether an assignment was intended.
    for k in ('min', 'max'):
        db.clean_utf8(field_properties.get(k))