def normalize_col_name(self, col_name, used_column_names, is_relation):
    """Clean up a database column name so it is usable as a Python field name.

    Returns a 3-tuple ``(new_name, field_params, field_notes)`` where
    ``field_params`` may carry a ``db_column`` override and ``field_notes``
    lists human-readable reasons for each rename that was applied.
    """
    params = {}
    notes = []

    # Strip invalid UTF-8 bytes and normalize case first.
    new_name = clean_utf8(col_name).lower()
    if new_name != col_name:
        notes.append('Field name cleaned of non-UTF-8 bytes and cast to lowercase.')

    if is_relation:
        if new_name.endswith('_id'):
            # Django adds the "_id" suffix to FK columns automatically.
            new_name = new_name[:-3]
        else:
            params['db_column'] = col_name

    # Replace every non-word character with an underscore.
    new_name, replaced = re.subn(r'\W', '_', new_name)
    if replaced > 0:
        notes.append('Field renamed to remove unsuitable characters.')

    if '__' in new_name:
        # Remember whether the double underscore existed before our own
        # substitutions, so the note only mentions original-name problems.
        originally_doubled = '__' in col_name.lower()
        # Collapse every run of underscores down to a single one.
        new_name = re.sub(r'_+', '_', new_name)
        if originally_doubled:
            notes.append("Field renamed because it contained more than one '_' in a row.")

    if new_name.startswith('_'):
        new_name = 'field%s' % new_name
        notes.append("Field renamed because it started with '_'.")

    if new_name.endswith('_'):
        new_name = '%sfield' % new_name
        notes.append("Field renamed because it ended with '_'.")

    if keyword.iskeyword(new_name):
        new_name += '_field'
        notes.append('Field renamed because it was a Python reserved word.')

    if new_name[0].isdigit():
        new_name = 'number_%s' % new_name
        notes.append("Field renamed because it wasn't a valid Python identifier.")

    # Resolve collisions by appending the lowest unused numeric suffix.
    if new_name in used_column_names:
        suffix = 0
        while '%s_%d' % (new_name, suffix) in used_column_names:
            suffix += 1
        new_name = '%s_%d' % (new_name, suffix)
        notes.append('Field renamed because of name conflict.')

    # Any rename means the ORM needs the original column name recorded.
    if col_name != new_name and notes:
        params['db_column'] = col_name

    return new_name, params, notes
def make_serializable(data, mutable=True):
    r"""Make sure the data structure is json serializable (json.dumps-able), all they way down to scalars in nested structures.

    If mutable=False then return tuples for all iterables, except basestrings (strs),
    so that they can be used as keys in a Mapping (dict).

    >>> from collections import OrderedDict
    >>> from decimal import Decimal
    >>> data = {'x': Decimal('01.234567891113151719'), 'X': [{('y', 'z'): {'q': 'A\xFFB'}}, 'ender'] }
    >>> make_serializable(OrderedDict(data)) == {'X': [{('y', 'z'): {'q': 'A\xc3\xbfB'}}, 'ender'], 'x': 1.2345678911131517}
    True
    """
    # Strings: just strip any invalid UTF-8 bytes and return them as-is.
    if isinstance(data, basestring):
        return db.clean_utf8(data)
    # Dates and times serialize as their str() representation.
    if isinstance(data, (datetime.datetime, datetime.date, datetime.time)):
        return str(data)
    if isinstance(data, Mapping):
        # Keys must be hashable, so they are always made immutable
        # (mutable=False); values follow the caller's choice.
        # NOTE: this mutability business is probably unnecessary for keys because
        # the keys of the mapping will already be immutable... at least until
        # python 3 MutableMappings
        mapping = tuple((make_serializable(k, mutable=False),
                         make_serializable(v, mutable=mutable))
                        for (k, v) in data.iteritems())
        if mutable:
            return dict(mapping)
        return mapping
    # Other iterables (lists, tuples, sets, generators) are walked recursively.
    if hasattr(data, '__iter__'):
        if mutable:
            return list(make_serializable(v, mutable=mutable) for v in data)
        return tuple(make_serializable(v, mutable=mutable) for v in data)
    if isinstance(data, (float, Decimal)):
        return float(data)
    # Scalar fallbacks: int, then float, then date/datetime parsing, then a
    # last-resort string coercion. `except Exception` (rather than a bare
    # `except:`) keeps the original best-effort behavior without swallowing
    # KeyboardInterrupt/SystemExit.
    try:
        return int(data)
    except Exception:
        pass
    try:
        return float(data)
    except Exception:
        pass
    try:
        # try to parse a date or datetime string
        return parser.parse(str(data))
    except Exception:
        return str(try_convert(data))
def augment_field_meta(field, queryset, field_properties, verbosity=0, count=0):
    """Return a dict of statistical properties (metadata) for a database column (model field)

    NOTE(review): despite the summary line, this function returns None and mutates
    `field_properties` in place -- confirm that callers rely on the mutation.

    Strings are UTF-8 encoded (UTF-16 or invalid UTF-8 characters are ignored)
    Resulting dictionary is json-serializable using the pug.nlp.db.RobustEncoder class.

    {
        'num_distinct':  # count of distinct (different) discrete values within the column
        'min':           # minimum value
        'max':           # maximum value
        'num_null':      # count of the Null or None values in the column
        'type':          # database column type
    }

    TODO:
      1. count the number of values that are strings that could be converted to
         a. integers
         b. floats
         c. dates / datetimes
         d. booleans / nullbooleans
         e. other ordinal, categorical, or quantitative types
      2. count the number of null values
         a. null/None
         b. blank
         c. whitespace or other strings signifying null
            ('NULL', 'None', 'N/A', 'NaN', 'Not provided')
    """
    # Calculate the fraction of values in a column that are distinct (unique).
    # For columns that aren't populated with 100% distinct values, the fraction may
    # help identify columns that are part of a "unique-together" compound key.
    # Necessary constraint for col1 and col2 to be compound key:
    #     col1_uniqueness + col2_uniqueness >= 1.0 (100%)
    # TODO: check for other clues about primary_keyness besides just uniqueness

    # Defaults in case the queries below are skipped or raise.
    # (presumably `count` is the total row count of `queryset` -- TODO confirm at the caller)
    field_properties['num_distinct'] = count
    field_properties['num_null'] = count
    field_properties['fraction_distinct'] = count
    typ = field_properties.get('type')
    # Only attempt the counting queries for countable column types on non-empty tables.
    if typ and typ not in types_not_countable and count:
        try:
            field_properties['num_distinct'] = queryset.values(field.name).distinct().count()
            field_properties['num_null'] = queryset.filter(**{'%s__isnull' % field.name: True}).count()
            field_properties['fraction_distinct'] = float(field_properties['num_distinct']) / (count or 1)
        except DatabaseError as e:
            if verbosity:
                print_exc()
                print "DatabaseError: Skipped count of values in field named '%s' (%s) because of %s." \
                    % (field.name, repr(field.db_column), e)
            # Reset the DB connection so later queries aren't poisoned by the failed one.
            connection.close()
        try:
            # Only build a Top-10 histogram for columns that are neither constant
            # nor (nearly) 100% unique.
            if field_properties['num_distinct'] > 1 and (0 < field_properties['fraction_distinct'] < 0.999):
                # this will not work until pyodbc is updated
                # May be related to django-pyodbc incompatability with django 1.6
                # FIXME: use the working query for values.distinct.count and sort that dict
                #        and then query the top 10 of those individually
                field_properties['most_frequent'] = [(v, c) for (v, c) in
                                                     queryset.distinct().values(field.name)
                                                     .annotate(field_value_count=models.Count(field.name))
                                                     .extra(order_by=['-field_value_count'])
                                                     .values_list(field.name, 'field_value_count')
                                                     [:min(field_properties['num_distinct'], 10)]]
        except (StandardError, FieldError, DatabaseError) as e:
            if verbosity:
                print "Warning: Failed to calculate the Top-10 histogram for field named '%s' (%s) because of %s." \
                    % (field.name, repr(field.db_column), e)
                if verbosity > 2:
                    print_exc()
    field_properties['max'] = None
    field_properties['min'] = None
    # check field_properties['num_null'] for all Null first?
    # Only aggregate max/min for aggregatable column types on non-empty tables.
    if count and typ and typ not in types_not_aggregatable:
        connection.close()
        try:
            field_properties['max'] = db.clean_utf8(queryset.aggregate(max_value=models.Max(field.name))['max_value'])
            field_properties['min'] = db.clean_utf8(queryset.aggregate(min_value=models.Min(field.name))['min_value'])
        except ValueError as e:
            if verbosity:
                print_exc()
                print "ValueError (perhaps UnicodeDecodeError?): Skipped max/min calculations for field named '%s' (%s) because of %s." \
                    % (field.name, repr(field.db_column), e)
            connection.close()
        except DatabaseError, e:
            if verbosity:
                print_exc()
                print "DatabaseError: Skipped max/min calculations for field named '%s' (%s) because of %s." \
                    % (field.name, repr(field.db_column), e)
            connection.close()
    # validate values that might be invalid strings due to db encoding/decoding errors (make sure they are UTF-8)
    # NOTE(review): the cleaned value is discarded here, so this loop has no effect;
    # 'min'/'max' were already cleaned above -- confirm whether an assignment was intended.
    for k in ('min', 'max'):
        db.clean_utf8(field_properties.get(k))
def augment_field_meta(field, queryset, field_properties, verbosity=0, count=0):
    """Return a dict of statistical properties (metadata) for a database column (model field)

    NOTE(review): this is a byte-for-byte duplicate of an identically-named function
    defined earlier in this module; this later definition shadows the earlier one.
    One of the two copies should be removed.

    NOTE(review): despite the summary line, this function returns None and mutates
    `field_properties` in place -- confirm that callers rely on the mutation.

    Strings are UTF-8 encoded (UTF-16 or invalid UTF-8 characters are ignored)
    Resulting dictionary is json-serializable using the pug.nlp.db.RobustEncoder class.

    {
        'num_distinct':  # count of distinct (different) discrete values within the column
        'min':           # minimum value
        'max':           # maximum value
        'num_null':      # count of the Null or None values in the column
        'type':          # database column type
    }

    TODO:
      1. count the number of values that are strings that could be converted to
         a. integers
         b. floats
         c. dates / datetimes
         d. booleans / nullbooleans
         e. other ordinal, categorical, or quantitative types
      2. count the number of null values
         a. null/None
         b. blank
         c. whitespace or other strings signifying null
            ('NULL', 'None', 'N/A', 'NaN', 'Not provided')
    """
    # Calculate the fraction of values in a column that are distinct (unique).
    # For columns that aren't populated with 100% distinct values, the fraction may
    # help identify columns that are part of a "unique-together" compound key.
    # Necessary constraint for col1 and col2 to be compound key:
    #     col1_uniqueness + col2_uniqueness >= 1.0 (100%)
    # TODO: check for other clues about primary_keyness besides just uniqueness

    # Defaults in case the queries below are skipped or raise.
    # (presumably `count` is the total row count of `queryset` -- TODO confirm at the caller)
    field_properties['num_distinct'] = count
    field_properties['num_null'] = count
    field_properties['fraction_distinct'] = count
    typ = field_properties.get('type')
    # Only attempt the counting queries for countable column types on non-empty tables.
    if typ and typ not in types_not_countable and count:
        try:
            field_properties['num_distinct'] = queryset.values(
                field.name).distinct().count()
            field_properties['num_null'] = queryset.filter(
                **{
                    '%s__isnull' % field.name: True
                }).count()
            field_properties['fraction_distinct'] = float(
                field_properties['num_distinct']) / (count or 1)
        except DatabaseError as e:
            if verbosity:
                print_exc()
                print "DatabaseError: Skipped count of values in field named '%s' (%s) because of %s." \
                    % (
                        field.name, repr(field.db_column), e)
            # Reset the DB connection so later queries aren't poisoned by the failed one.
            connection.close()
        try:
            # Only build a Top-10 histogram for columns that are neither constant
            # nor (nearly) 100% unique.
            if field_properties['num_distinct'] > 1 and (
                    0 < field_properties['fraction_distinct'] < 0.999):
                # this will not work until pyodbc is updated
                # May be related to django-pyodbc incompatability with django 1.6
                # FIXME: use the working query for values.distinct.count and sort that dict
                #        and then query the top 10 of those individually
                field_properties['most_frequent'] = [
                    (v, c)
                    for (v, c) in queryset.distinct().values(field.name).annotate(
                        field_value_count=models.Count(field.name)).extra(
                            order_by=['-field_value_count']).values_list(
                                field.name, 'field_value_count')
                    [:min(field_properties['num_distinct'], 10)]
                ]
        except (StandardError, FieldError, DatabaseError) as e:
            if verbosity:
                print "Warning: Failed to calculate the Top-10 histogram for field named '%s' (%s) because of %s." % (
                    field.name, repr(field.db_column), e)
                if verbosity > 2:
                    print_exc()
    field_properties['max'] = None
    field_properties['min'] = None
    # check field_properties['num_null'] for all Null first?
    # Only aggregate max/min for aggregatable column types on non-empty tables.
    if count and typ and typ not in types_not_aggregatable:
        connection.close()
        try:
            field_properties['max'] = db.clean_utf8(
                queryset.aggregate(
                    max_value=models.Max(field.name))['max_value'])
            field_properties['min'] = db.clean_utf8(
                queryset.aggregate(
                    min_value=models.Min(field.name))['min_value'])
        except ValueError as e:
            if verbosity:
                print_exc()
                print "ValueError (perhaps UnicodeDecodeError?): Skipped max/min calculations for field named '%s' (%s) because of %s." % (
                    field.name, repr(field.db_column), e)
            connection.close()
        except DatabaseError, e:
            if verbosity:
                print_exc()
                print "DatabaseError: Skipped max/min calculations for field named '%s' (%s) because of %s." % (
                    field.name, repr(field.db_column), e)
            connection.close()
    # validate values that might be invalid strings due to db encoding/decoding errors (make sure they are UTF-8)
    # NOTE(review): the cleaned value is discarded here, so this loop has no effect;
    # 'min'/'max' were already cleaned above -- confirm whether an assignment was intended.
    for k in ('min', 'max'):
        db.clean_utf8(field_properties.get(k))