def test_datafield(self):
    """A month DataField exposes ordered values, labels, and codes."""
    field = DataField(app_name='tests', model_name='month', field_name='id')

    # Month ids run 1..12.
    self.assertEqual(field.values(), tuple(range(1, 13)))

    month_names = (u'January', u'February', u'March', u'April', u'May',
                   u'June', u'July', u'August', u'September', u'October',
                   u'November', u'December')
    self.assertEqual(field.labels(), month_names)

    # Codes are zero-based.
    self.assertEqual(field.codes(), tuple(range(12)))
def _migrate_fields(self, **options):
    """Migrate legacy ``Field`` rows into ``DataField`` instances.

    Options:
        force -- overwrite DataFields that already exist
        no_input -- skip the interactive enumerable-override prompt
    """
    force = options.get('force')
    no_input = options.get('no_input')
    total_migrated = 0
    for lf in legacy.Field.objects.iterator():
        # Reuse an existing DataField if one matches the natural key,
        # otherwise start a fresh (unsaved) instance.
        try:
            f = DataField.objects.get_by_natural_key(lf.app_name,
                lf.model_name, lf.field_name)
        except DataField.DoesNotExist:
            f = DataField(app_name=lf.app_name, model_name=lf.model_name,
                field_name=lf.field_name)

        qualified_name = \
            u'({0}) {1}.{2}'.format(f.app_name, f.model_name, f.field_name)

        # An existing pk means it was migrated before; only --force redoes it.
        if f.pk and not force:
            print u'{0} already exists. Skipping...'.format(qualified_name)
            continue

        # Check if this is an orphan (the underlying model field no
        # longer resolves).
        if not f.field:
            print u'{0} is orphaned. Skipping...'.format(qualified_name)
            continue

        # Map various fields
        f.name = lf.name
        f.description = lf.description
        f.keywords = lf.keywords
        f.translator = lf.translator
        f.group_id = lf.group_id

        print u'Migrated "{0}"'.format(qualified_name)

        # Derive flags (e.g. enumerable) from the concrete model field.
        f.__dict__.update(utils.get_heuristic_flags(f.field))

        # Disagreement with enumerable status — ask the operator unless
        # --no-input was given.
        if not no_input and f.enumerable != lf.enable_choices:
            if lf.enable_choices:
                override = raw_input(u'"{0}" is marked as enumerable, but '
                                     'does not qualify to be enumerable. '
                                     'Override? [y/N] '
                                     .format(qualified_name))
            else:
                override = raw_input(u'"{0}" is not marked as enumerable, '
                                     'but qualifies to be enumerable. '
                                     'Override? [y/N] '
                                     .format(qualified_name))
            if override.lower() == 'y':
                f.enumerable = lf.enable_choices

        f.save()
        # M2M sites assignment must happen after save (needs a pk).
        f.sites = lf.sites.all()
        total_migrated += 1
    print u'Fields migrated:\t{0}'.format(total_migrated)
def handle(self, field_label, model_label, **options):
    """Load the distinct values of a field as instances of a model.

    ``field_label`` and ``model_label`` are "."-delimited identifiers
    (``app.model.field`` and ``app.model`` respectively). Raises
    CommandError when either cannot be resolved.
    """
    model = get_model(*model_label.split('.'))

    if model is None:
        # BUG FIX: message read "Not model named ..."
        raise CommandError(u'No model named {0} was found'.format(
            model_label))

    toks = field_label.split('.')

    if len(toks) != 3:
        raise CommandError(u'{0} is not a valid field identifier. Use a '
                           '"." delimited notation, e.g. "app.model.field"'
                           .format(field_label))

    # Not required to be persisted to the database..
    f = DataField(app_name=toks[0], model_name=toks[1], field_name=toks[2])

    if not f.field:
        raise CommandError(u'The field {0} could not be found.'.format(
            field_label))

    count = 0
    values = list(f.values())
    values.sort(key=coerce_float)

    for value in values:
        if value is None or value == '':
            continue
        # BUG FIX: re.sub takes (repl, string); the original swapped the
        # arguments and substituted `value` into the one-space string.
        # Intended behavior: replace non-alphanumeric runs in the value
        # with spaces, then title-case the result.
        label = non_alnum.sub(' ', value).title()
        obj = model(label=label, value=value, order=count, code=count)
        obj.save()
        count += 1

    print(u'{0} distinct values loaded'.format(count))
def handle(self, field_label, model_label, **options):
    """Load the distinct values of a field as instances of a model.

    ``field_label`` and ``model_label`` are "."-delimited identifiers
    (``app.model.field`` and ``app.model`` respectively). Raises
    CommandError when either cannot be resolved.
    """
    model = get_model(*model_label.split('.'))

    if model is None:
        # BUG FIX: message read "Not model named ..."
        raise CommandError(
            u'No model named {0} was found'.format(model_label))

    toks = field_label.split('.')

    if len(toks) != 3:
        raise CommandError(
            u'{0} is not a valid field identifier. Use a '
            '"." delimited notation, e.g. "app.model.field"'.format(
                field_label))

    # Not required to be persisted to the database..
    f = DataField(app_name=toks[0], model_name=toks[1], field_name=toks[2])

    if not f.field:
        raise CommandError(
            u'The field {0} could not be found.'.format(field_label))

    count = 0
    values = list(f.values())
    values.sort(key=coerce_float)

    for value in values:
        if value is None or value == '':
            continue
        # BUG FIX: re.sub takes (repl, string); the original swapped the
        # arguments. Replace non-alphanumeric runs with spaces, then
        # title-case.
        label = non_alnum.sub(' ', value).title()
        obj = model(label=label, value=value, order=count, code=count)
        obj.save()
        count += 1

    print(u'{0} distinct values loaded'.format(count))
def test_instance(self): f = DataField(app_name='avocado', model_name='datafield', field_name='name') f.save() usage.log('test', instance=f, async=False) self.assertEqual(Log.objects.get(pk=1).content_object, f)
class TranslateTestCase(TestCase): fixtures = ['initial_data.json'] # Define translator with defined through model and register it class T(VocabularyTranslator): through_model = TicketThrough registry.register(T, 'test') def setUp(self): management.call_command('avocado','init','tests', quiet=True) # Build item index TicketIndex.objects.index() # Create the text index DataField self.f = DataField(name='Ticket Index Item', app_name='tests', model_name='ticketindex', field_name='item') self.f.translator = "test" self.f.save() def test_only(self): c = DataContext(json={ 'field': self.f.pk, 'operator': 'only', 'value': [1, 6] }) self.assertEqual([3], [x.pk for x in c.apply(tree=TicketHolder)]) def test_excludes_any(self): c = DataContext(json={ 'field': self.f.pk, 'operator': '-in', 'value': [1, 2] }) self.assertEqual([2], [x.pk for x in c.apply(tree=TicketHolder)]) def test_excludes_all(self): c = DataContext(json={ 'field': self.f.pk, 'operator': '-all', 'value': [1, 5] }) self.assertEqual([1, 2], sorted([x.pk for x in c.apply(tree=TicketHolder)])) def test_requires_all(self): c = DataContext(json={ 'field': self.f.pk, 'operator': 'all', 'value': [1, 3] }) self.assertEqual([1, 3], sorted([x.pk for x in c.apply(tree=TicketHolder)])) def test_requires_any(self): c = DataContext(json={ 'field': self.f.pk, 'operator': 'in', 'value': [3] }) self.assertEqual([1, 2, 3], sorted([x.pk for x in c.apply(tree=TicketHolder)]))
def forwards(self, orm):
    """Ensure the HGMD DataField exists, creating it when missing."""
    try:
        datafield = DataField.objects.get_by_natural_key(
            'variants.variantphenotype.hgmd_id')
    except DataField.DoesNotExist:
        datafield = DataField(app_name='variants',
                              model_name='variantphenotype',
                              field_name='hgmd_id',
                              name='HGMD',
                              published=True,
                              searchable=True)
    datafield.save()
def test_datafield_properties(self):
    """values() and labels() reflect saved record sets in order."""
    for n in xrange(10):
        RecordSet(name=u'Set {0}'.format(n)).save()

    field = DataField(app_name='tests', model_name='recordset',
                      field_name='id')

    self.assertEqual(list(field.values()), range(1, 11))
    self.assertEqual(list(field.labels()),
                     ['Set {0}'.format(n) for n in range(10)])
def test_instance(self): f = DataField(app_name='avocado', model_name='datafield', field_name='name') f.save() usage.log('test', instance=f, async=False) event = Log.objects.all()[0] self.assertEqual(event.content_object, f)
def test_translator(self):
    """Translating a set pk yields an exact-match condition on the set id."""
    record_set = RecordSet()
    record_set.save()
    record_set.bulk([Record(pk=n) for n in xrange(1, 11)])

    field = DataField(app_name='sets', model_name='recordset',
                      field_name='id')
    translated = field.translate(value=record_set.pk, tree=Record)
    condition = translated['query_modifiers']['condition']
    self.assertEqual(unicode(condition),
                     "(AND: ('recordset__id__exact', 1))")
def test_datafield_properties(self):
    """values() and labels() reflect saved record sets in order."""
    for n in xrange(10):
        RecordSet(name=u'Set {0}'.format(n)).save()

    field = DataField(app_name='tests', model_name='recordset',
                      field_name='id')

    self.assertEqual(list(field.values()), range(1, 11))
    self.assertEqual(list(field.labels()),
                     ['Set {0}'.format(n) for n in range(10)])
def test_translator(self):
    """Translating a set pk yields an exact-match condition on the set id."""
    record_set = RecordSet()
    record_set.save()
    record_set.bulk([Record(pk=n) for n in xrange(1, 11)])

    field = DataField(app_name='tests', model_name='recordset',
                      field_name='id')
    translated = field.translate(value=record_set.pk, tree=Record)
    condition = translated['query_modifiers']['condition']
    self.assertEqual(unicode(condition),
                     "(AND: ('recordset__id__exact', 1))")
def forwards(self, orm):
    """Ensure the HGMD DataField exists, creating it when missing."""
    try:
        datafield = DataField.objects.get_by_natural_key(
            'variants.variantphenotype.hgmd_id')
    except DataField.DoesNotExist:
        datafield = DataField(app_name='variants',
                              model_name='variantphenotype',
                              field_name='hgmd_id',
                              name='HGMD',
                              published=True,
                              searchable=True)
    datafield.save()
def test_dataview_order_by(self):
    """Ordering by a concept orders the SQL by the field's `order` column."""
    field = DataField(app_name='lexicon', model_name='month',
                      field_name='id')
    field.save()

    concept = DataConcept()
    concept.save()
    DataConceptField(field=field, concept=concept).save()

    view = DataView({'ordering': [concept.pk]})
    queryset = Month.objects.filter(label__startswith='J').values('id')

    expected = ('SELECT "lexicon_month"."id" FROM "lexicon_month" '
                'WHERE "lexicon_month"."label" LIKE J% ESCAPE \'\\\' '
                'ORDER BY "lexicon_month"."order" ASC')
    self.assertEqual(str(view.apply(queryset).query), expected)
def test_dataview_order_by(self):
    """Ascending ordering by a concept maps to the field's `order` column."""
    field = DataField(app_name='tests', model_name='month', field_name='id')
    field.save()

    concept = DataConcept()
    concept.save()
    DataConceptField(field=field, concept=concept).save()

    view = DataView({'ordering': [[concept.pk, 'asc']]})
    queryset = Month.objects.filter(label__startswith='J').values('id')

    expected = ('SELECT "tests_month"."id" FROM "tests_month" '
                'WHERE "tests_month"."label" LIKE J% ESCAPE \'\\\' '
                'ORDER BY "tests_month"."order" ASC')
    self.assertEqual(unicode(view.apply(queryset).query), expected)
def setUp(self):
    """Initialize avocado metadata and the translated ticket-index field."""
    management.call_command('avocado', 'init', 'tests', quiet=True)
    # Build item index
    TicketIndex.objects.index()
    # Create the text index DataField
    field = DataField(name='Ticket Index Item', app_name='tests',
                      model_name='ticketindex', field_name='item')
    field.translator = "test"
    field.save()
    self.f = field
def setUp(self):
    """Resolve the DataField for the ``tests.title.name`` identifier."""
    identifier = 'tests.title.name'
    self.f = DataField.init(identifier)
def _migrate_fields(self, **options):
    """Migrate legacy ``Field`` rows into ``DataField`` instances.

    Options:
        force -- overwrite DataFields that already exist
        no_input -- skip the interactive enumerable-override prompt
    """
    force = options.get('force')
    no_input = options.get('no_input')
    total_migrated = 0
    for lf in legacy.Field.objects.iterator():
        # Reuse an existing DataField for the natural key, otherwise
        # start a fresh (unsaved) instance.
        try:
            f = DataField.objects.get_by_natural_key(
                lf.app_name, lf.model_name, lf.field_name)
        except DataField.DoesNotExist:
            f = DataField(app_name=lf.app_name,
                          model_name=lf.model_name,
                          field_name=lf.field_name)

        qualified_name = \
            u'({0}) {1}.{2}'.format(f.app_name, f.model_name, f.field_name)

        # An existing pk means it was migrated before; only --force redoes it.
        if f.pk and not force:
            print u'{0} already exists. Skipping...'.format(qualified_name)
            continue

        # Check if this is an orphan (the underlying model field no
        # longer resolves).
        if not f.field:
            print u'{0} is orphaned. Skipping...'.format(qualified_name)
            continue

        # Map various fields
        f.name = lf.name
        f.description = lf.description
        f.keywords = lf.keywords
        f.translator = lf.translator
        f.group_id = lf.group_id

        print u'Migrated "{0}"'.format(qualified_name)

        # Derive flags (e.g. enumerable) from the concrete model field.
        f.__dict__.update(utils.get_heuristic_flags(f.field))

        # Disagreement with enumerable status — prompt the operator
        # unless --no-input was given.
        if not no_input and f.enumerable != lf.enable_choices:
            if lf.enable_choices:
                override = raw_input(
                    u'"{0}" is marked as enumerable, but '
                    'does not qualify to be enumerable. '
                    'Override? [y/N] '.format(qualified_name))
            else:
                override = raw_input(
                    u'"{0}" is not marked as enumerable, '
                    'but qualifies to be enumerable. '
                    'Override? [y/N] '.format(qualified_name))
            if override.lower() == 'y':
                f.enumerable = lf.enable_choices

        f.save()
        # M2M sites assignment must happen after save (needs a pk).
        f.sites = lf.sites.all()
        total_migrated += 1
    print u'Fields migrated:\t{0}'.format(total_migrated)
def get(self, request, pk):
    """Return the (optionally clustered) distribution of a field,
    grouped by the requested dimensions, relative to the tree."""
    instance = self.get_object(request, pk=pk)

    params = self.get_params(request)

    tree = trees[params.get('tree')]
    opts = tree.root_model._meta

    # Synthetic DataField over the tree root's primary key; used for the
    # count aggregation below.
    tree_field = DataField(app_name=opts.app_label,
                           model_name=opts.module_name,
                           field_name=opts.pk.name)

    # This will eventually make it's way in the parametizer, but lists
    # are not supported
    dimensions = request.GET.getlist('dimensions')

    # The `aware` flag toggles the behavior of the distribution by making
    # it relative to the applied context or not
    if params['aware']:
        attrs = None
    else:
        attrs = {}

    # Get and apply context relative to the tree
    context = self.get_context(request, attrs=attrs)
    queryset = context.apply(tree=tree)

    # Explicit fields to group by, ignore ones that dont exist or the
    # user does not have permission to view. Default is to group by the
    # reference field for distinct counts.
    if any(dimensions):
        fields = []
        groupby = []

        for pk in dimensions:
            f = self.get_object(request, pk=pk)

            if f:
                fields.append(f)
                groupby.append(tree.query_string_for_field(f.field))
    else:
        fields = [instance]
        groupby = [tree.query_string_for_field(instance.field)]

    # Perform a count aggregation of the tree model grouped by the
    # specified dimensions
    stats = tree_field.count(*groupby)

    # Apply it relative to the queryset
    stats = stats.apply(queryset)

    # Exclude null values. Dependending on the downstream use of the data,
    # nulls may or may not be desirable.
    if not params['nulls']:
        q = Q()

        for field in groupby:
            q = q | Q(**{field: None})

        stats = stats.exclude(q)

    # Begin constructing the response
    resp = {
        'data': [],
        'outliers': [],
        'clustered': False,
        'size': 0,
    }

    # Evaluate list of points
    length = len(stats)

    # Nothing to do
    if not length:
        usage.log('dist', instance=instance, request=request, data={
            'size': 0,
            'clustered': False,
            'aware': params['aware'],
        })
        return resp

    if length > MAXIMUM_OBSERVATIONS:
        data = {
            'message': 'Data too large',
        }
        return self.render(request, data,
                           status=codes.unprocessable_entity)

    # Apply ordering. If any of the fields are enumerable, ordering should
    # be relative to those fields. For continuous data, the ordering is
    # relative to the count of each group
    if (any([d.enumerable for d in fields]) and
            not params['sort'] == 'count'):
        stats = stats.order_by(*groupby)
    else:
        stats = stats.order_by('-count')

    clustered = False
    points = list(stats)
    outliers = []

    # For N-dimensional continuous data, check if clustering should occur
    # to down-sample the data.
    if all([d.simple_type == 'number' for d in fields]):
        # Extract observations for clustering
        obs = []

        for point in points:
            # Decimals are converted to floats so the kmeans math works
            # on native numerics.
            for i, dim in enumerate(point['values']):
                if isinstance(dim, Decimal):
                    point['values'][i] = float(str(dim))

            obs.append(point['values'])

        # Perform k-means clustering. Determine centroids and calculate
        # the weighted count relatives to the centroid and observations
        # within the kmeans module.
        if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
            clustered = True
            counts = [p['count'] for p in points]
            points, outliers = kmeans.weighted_counts(
                obs, counts, params['n'])
        else:
            # No clustering: just flag and remove outliers.
            indexes = kmeans.find_outliers(obs, normalized=False)

            outliers = []

            for idx in indexes:
                outliers.append(points[idx])
                points[idx] = None

            points = [p for p in points if p is not None]

    usage.log('dist', instance=instance, request=request, data={
        'size': length,
        'clustered': clustered,
        'aware': params['aware'],
    })

    return {
        'data': points,
        'clustered': clustered,
        'outliers': outliers,
        'size': length,
    }
def get(self, request):
    """Return per-model counts for all published app/model pairs,
    executed in parallel via a thread pool."""
    params = self.get_params(request)

    if params['aware']:
        context = self.get_context(request)
    else:
        context = DataContext()

    # Get all published app/model pairs to produce counts for.
    model_names = DataField.objects.published()\
        .values_list('app_name', 'model_name')\
        .order_by('model_name').distinct()

    results = []
    data = []
    models = set()

    QueryProcessor = pipeline.query_processors[params['processor']]

    # Workaround for a Python bug for versions 2.7.5 and below
    # http://bugs.python.org/issue10015
    if not hasattr(threading.current_thread(), '_children'):
        threading.current_thread()._children = weakref.WeakKeyDictionary()

    # Pool of threads to execute the counts in parallel
    pool = ThreadPool()

    for app_name, model_name in model_names:
        # DataField used here to resolve foreign key-based fields.
        model = DataField(app_name=app_name, model_name=model_name).model

        # No redundant counts
        if model in models:
            continue

        models.add(model)

        opts = model._meta

        # Format is called to resolve Django's internal proxy wrapper.
        verbose_name = opts.verbose_name.format()
        verbose_name_plural = opts.verbose_name_plural.format()

        # Assume no custom verbose_name as been set in Meta class, so
        # apply a minimal title-case.
        if verbose_name.islower():
            verbose_name = verbose_name.title()

        if verbose_name_plural.islower():
            verbose_name_plural = verbose_name_plural.title()

        # Placeholder with the model name. The count will be replaced if
        # successful.
        data.append({
            'count': None,
            'app_name': app_name,
            'model_name': model_name,
            'verbose_name': verbose_name,
            'verbose_name_plural': verbose_name_plural,
        })

        # Asynchronously execute the count
        result = pool.apply_async(get_count,
                                  args=(request, model, params['refresh'],
                                        QueryProcessor, context))
        results.append(result)

    pool.close()

    # Collect results; a timeout or error leaves the placeholder
    # count of None in place (best-effort by design).
    for i, r in enumerate(results):
        try:
            count = r.get(timeout=serrano_settings.STATS_COUNT_TIMEOUT)
            data[i]['count'] = count
        except Exception:
            pass

    return data
def get(self, request, pk):
    """Return the (optionally clustered) distribution of a field,
    grouped by the requested dimensions, using a query processor."""
    instance = self.get_object(request, pk=pk)

    params = self.get_params(request)

    tree = trees[params.get('tree')]
    opts = tree.root_model._meta

    # Synthetic DataField over the tree root's primary key; used for the
    # count aggregation below.
    tree_field = DataField(
        app_name=opts.app_label,
        model_name=opts.module_name,
        field_name=opts.pk.name)

    # This will eventually make it's way in the parametizer, but lists
    # are not supported
    dimensions = request.GET.getlist('dimensions')

    # The `aware` flag makes the distribution relative to the applied
    # context.
    if params['aware']:
        context = self.get_context(request)
    else:
        context = None

    QueryProcessor = pipeline.query_processors[params['processor']]
    processor = QueryProcessor(context=context, tree=tree)
    queryset = processor.get_queryset(request=request)

    # Explicit fields to group by, ignore ones that dont exist or the
    # user does not have permission to view. Default is to group by the
    # reference field for distinct counts.
    if any(dimensions):
        fields = []
        groupby = []

        for pk in dimensions:
            f = self.get_object(request, pk=pk)

            if f:
                fields.append(f)
                groupby.append(tree.query_string_for_field(f.field,
                                                           model=f.model))
    else:
        fields = [instance]
        groupby = [tree.query_string_for_field(instance.field,
                                               model=instance.model)]

    # Perform a count aggregation of the tree model grouped by the
    # specified dimensions
    stats = tree_field.count(*groupby)

    # Apply it relative to the queryset
    stats = stats.apply(queryset)

    # Exclude null values. Dependending on the downstream use of the data,
    # nulls may or may not be desirable.
    if not params['nulls']:
        q = Q()

        for field in groupby:
            q = q | Q(**{field: None})

        stats = stats.exclude(q)

    # Begin constructing the response
    resp = {
        'data': [],
        'outliers': [],
        'clustered': False,
        'size': 0,
    }

    # Evaluate list of points
    length = len(stats)

    # Nothing to do
    if not length:
        usage.log('dist', instance=instance, request=request, data={
            'size': 0,
            'clustered': False,
            'aware': params['aware'],
        })
        return resp

    if length > MAXIMUM_OBSERVATIONS:
        data = {
            'message': 'Data too large',
        }
        return self.render(request, data,
                           status=codes.unprocessable_entity)

    # Apply ordering. If any of the fields are enumerable, ordering should
    # be relative to those fields. For continuous data, the ordering is
    # relative to the count of each group
    if (any([d.enumerable for d in fields]) and
            not params['sort'] == 'count'):
        stats = stats.order_by(*groupby)
    else:
        stats = stats.order_by('-count')

    clustered = False
    points = list(stats)
    outliers = []

    # For N-dimensional continuous data, check if clustering should occur
    # to down-sample the data.
    if all([d.simple_type == 'number' for d in fields]):
        # Extract observations for clustering
        obs = []

        for point in points:
            # Decimals are converted to floats so the kmeans math works
            # on native numerics.
            for i, dim in enumerate(point['values']):
                if isinstance(dim, Decimal):
                    point['values'][i] = float(str(dim))

            obs.append(point['values'])

        # Perform k-means clustering. Determine centroids and calculate
        # the weighted count relatives to the centroid and observations
        # within the kmeans module.
        if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
            clustered = True
            counts = [p['count'] for p in points]
            points, outliers = kmeans.weighted_counts(
                obs, counts, params['n'])
        else:
            # No clustering: just flag and remove outliers.
            indexes = kmeans.find_outliers(obs, normalized=False)

            outliers = []

            for idx in indexes:
                outliers.append(points[idx])
                points[idx] = None

            points = [p for p in points if p is not None]

    usage.log('dist', instance=instance, request=request, data={
        'size': length,
        'clustered': clustered,
        'aware': params['aware'],
    })

    return {
        'data': points,
        'clustered': clustered,
        'outliers': outliers,
        'size': length,
    }
def get(self, request, pk):
    """Return labeled, optionally clustered dimension counts for a field.

    Groups the tree queryset by the requested dimensions, counts per
    group, optionally removes/clusters outliers for numeric data, and
    attaches value labels to each point."""
    instance = self.get_object(request, pk=pk)

    params = self.get_params(request)

    tree = trees[params.get('tree')]
    opts = tree.root_model._meta

    # Synthetic DataField over the tree root's primary key; pk embeds
    # the tree name to keep it distinct per tree.
    tree_field = DataField(pk='{0}:{1}'.format(params.get('tree'), pk),
                           app_name=opts.app_label,
                           model_name=opts.module_name,
                           field_name=opts.pk.name)

    # This will eventually make its way in the parametizer, but lists
    # are not supported.
    dimensions = request.GET.getlist('dimensions')

    # The `aware` flag makes the counts relative to the applied context.
    if params['aware']:
        context = self.get_context(request)
    else:
        context = None

    QueryProcessor = pipeline.query_processors[params['processor']]
    processor = QueryProcessor(context=context, tree=tree)
    queryset = processor.get_queryset(request=request)

    # Explicit fields to group by, ignore ones that dont exist or the
    # user does not have permission to view. Default is to group by the
    # reference field for disinct counts.
    if any(dimensions):
        fields = []
        groupby = []

        for pk in dimensions:
            f = self.get_object(request, pk=pk)

            if f:
                fields.append(f)
                groupby.append(
                    tree.query_string_for_field(f.field, model=f.model))
    else:
        fields = [instance]
        groupby = [
            tree.query_string_for_field(instance.field,
                                        model=instance.model)
        ]

    # Exclude null values. Depending on the downstream use of the data,
    # nulls may or may not be desirable.
    if not params['nulls']:
        q = Q()

        for field in groupby:
            q = q & Q(**{'{0}__isnull'.format(field): False})

        queryset = queryset.filter(q)

    queryset = queryset.values(*groupby)

    # Begin constructing the response
    resp = {
        'data': [],
        'outliers': [],
        'clustered': False,
        'size': 0,
    }

    queryset = queryset.annotate(count=Count(tree_field.field.name))\
        .values_list('count', *groupby)

    # Evaluate list of points
    length = len(queryset)

    # Nothing to do
    if not length:
        usage.log('dims', instance=instance, request=request, data={
            'size': 0,
            'clustered': False,
            'aware': params['aware'],
        })
        return resp

    if length > MAXIMUM_OBSERVATIONS:
        data = {
            'message': 'Data too large',
        }
        return self.render(request, data,
                           status=codes.unprocessable_entity)

    # Apply ordering. If any of the fields are enumerable, ordering should
    # be relative to those fields. For continuous data, the ordering is
    # relative to the count of each group
    if (any([d.enumerable for d in fields]) and
            not params['sort'] == 'count'):
        queryset = queryset.order_by(*groupby)
    else:
        queryset = queryset.order_by('-count')

    clustered = False

    # Each result row is (count, dim1, dim2, ...).
    points = [{
        'count': point[0],
        'values': point[1:],
    } for point in list(queryset)]

    outliers = []

    # For N-dimensional continuous data, check if clustering should occur
    # to down-sample the data.
    if all([d.simple_type == 'number' for d in fields]):
        # Extract observations for clustering.
        obs = []

        null_points = []
        numeric_points = []

        for i, point in enumerate(points):
            # We need to handle points that have null dimensions
            # differently than those that are all numeric as the kmeans
            # module currently cannot handle mixed type dimensions so we
            # only allow fully numeric points to be passed to the kmeans
            # module.
            if None in point['values']:
                null_points.append(point)
                continue

            # Decimals are converted to floats for the kmeans math.
            for i, dim in enumerate(point['values']):
                if isinstance(dim, Decimal):
                    point['values'][i] = float(str(dim))

            numeric_points.append(point)
            obs.append(point['values'])

        # Perform k-means clustering. Determine centroids and calculate
        # the weighted count relatives to the centroid and observations
        # within the kmeans module.
        if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
            clustered = True

            counts = [p['count'] for p in numeric_points]
            points, outliers = kmeans.weighted_counts(
                obs, counts, params['n'])
        else:
            # No clustering: just flag and remove outliers.
            indexes = kmeans.find_outliers(obs, normalized=False)

            outliers = []

            for idx in indexes:
                outliers.append(numeric_points[idx])
                numeric_points[idx] = None

            points = [p for p in numeric_points if p is not None]

        # Now that we have done the analysis using the purely numeric
        # points, we can add the mixed/null dimensionality points back in
        # to the list before returning results.
        points += null_points

    usage.log('dims', instance=instance, request=request, data={
        'size': length,
        'clustered': clustered,
        'aware': params['aware'],
    })

    # Attach a human-readable label to each dimension value.
    labeled_points = []
    value_labels = tree_field.value_labels(queryset=queryset)

    for point in points:
        labeled_points.append({
            'count': point['count'],
            'values': [{
                'label': value_labels.get(value, smart_unicode(value)),
                'value': value
            } for value in point['values']]
        })

    return {
        'data': labeled_points,
        'clustered': clustered,
        'outliers': outliers,
        'size': length,
    }
def handle_field(self, field, model_name, app_name, **options):
    """Create or update a DataField for a single model field.

    Returns True when a DataField was created, False when an existing
    one was updated (with --force), or None when the field is skipped.
    """
    include_keys = options.get('include_keys')
    force = options.get('force')
    include_non_editable = options.get('include_non_editable')
    prepend_model_name = options.get('prepend_model_name')
    create_concepts = options.get('concepts')
    auto_publish = options.get('publish')
    create_categories = options.get('categories')

    # M2Ms do not make any sense here..
    if isinstance(field, ManyToManyField):
        return

    # objectset is an optional dependency; only check the subclass when
    # it is installed.
    if dep_supported('objectset'):
        from objectset.models import ObjectSet
        objectset = issubclass(field.model, ObjectSet)
    else:
        objectset = False

    lexicon = issubclass(field.model, Lexicon)

    # Lexicons and ObjectSets are represented via their primary key, so
    # these may pass
    if not objectset and not lexicon:
        # Check for primary key, and foreign key fields
        if isinstance(field, self.key_field_types) and not include_keys:
            print(u'({0}) {1}.{2} is a primary or foreign key. Skipping...'
                  .format(app_name, model_name, field.name))
            return

        # Ignore non-editable fields since in most cases they are for
        # managment purposes
        if not field.editable and not include_non_editable:
            print(u'({0}) {1}.{2} is not editable. Skipping...'
                  .format(app_name, model_name, field.name))
            return

    # All but the field name is case-insensitive, do initial lookup
    # to see if it already exists, skip if it does
    lookup = {
        'app_name__iexact': app_name,
        'model_name__iexact': model_name,
        'field_name': field.name,
    }

    # Note, `name` is set below
    kwargs = {
        'description': field.help_text or None,
        'app_name': app_name,
        'model_name': model_name.lower(),
        'field_name': field.name,
    }

    # Lexicons use their conventional label/order/code columns.
    if lexicon:
        kwargs.update({
            'label_field_name': 'label',
            'order_field_name': 'order',
            'code_field_name': 'code',
        })
    elif objectset and hasattr(field.model, 'label_field'):
        kwargs.update({
            'label_field_name': field.model.label_field
        })

    try:
        f = DataField.objects.get(**lookup)
    except DataField.DoesNotExist:
        f = DataField(published=options.get('publish'), **kwargs)

    if f.pk:
        created = False
        if not force:
            print(u'({0}) {1}.{2} already exists. Skipping...'
                  .format(app_name, model_name, field.name))
            return
        # Only overwrite if the source value is not falsy
        f.__dict__.update([(k, v) for k, v in kwargs.items()])
    else:
        created = True

    if not f.name:
        # Use the default unicode representation of the datafield
        if prepend_model_name:
            f.name = unicode(f)
        else:
            f.name = field.verbose_name.title()

    # Update fields with flags
    f.__dict__.update(utils.get_heuristic_flags(field))

    # Create category based on the model name and associate
    # it to the field.
    if create_categories:
        category, _ = DataCategory.objects\
            .get_or_create(name=f.model._meta.verbose_name.title(),
                           published=auto_publish)
        f.category = category
    else:
        category = None

    f.save()

    # Create a concept if one does not already exist for this field
    if create_concepts and not DataConcept.objects\
            .filter(fields=f).exists():
        kwargs = {
            'published': auto_publish,
            'category': category,
        }
        DataConcept.objects.create_from_field(f, **kwargs)

    return created
def handle_field(self, field, model_name, app_name, **options): include_keys = options.get('include_keys') force = options.get('force') include_non_editable = options.get('include_non_editable') # M2Ms do not make any sense here.. if isinstance(field, ManyToManyField): return # Lexicons and ObjectSets are represented via their primary key, so # these may pass if not issubclass(field.model, (Lexicon, ObjectSet)): # Check for primary key, and foreign key fields if isinstance(field, self.key_field_types) and not include_keys: return # Ignore non-editable fields since in most cases they are for # managment purposes if not field.editable and not include_non_editable: return # All but the field name is case-insensitive, do initial lookup # to see if it already exists, skip if it does lookup = { 'app_name__iexact': app_name, 'model_name__iexact': model_name, 'field_name': field.name, } # Note, `name` is set below kwargs = { 'description': field.help_text or None, 'app_name': app_name, 'model_name': model_name.lower(), 'field_name': field.name, } try: datafield = DataField.objects.get(**lookup) except DataField.DoesNotExist: datafield = DataField(published=False, **kwargs) if datafield.pk: created = False if not force: print '({0}) {1}.{2} already exists. Skipping...'.format(app_name, model_name, field.name) return # Only overwrite if the source value is not falsy datafield.__dict__.update([(k, v) for k, v in kwargs.items()]) else: created = True if not datafield.name: # Use the default unicode representation of the datafield datafield.name = unicode(datafield) # Update fields with flags datafield.__dict__.update(utils.get_heuristic_flags(datafield)) datafield.save() return created
def test_dist(self):
    """Each month value appears exactly once in the distribution."""
    field = DataField(app_name='tests', model_name='date',
                      field_name='month')
    # Months of the year, each with a count of one.
    expected = tuple((month, 1) for month in range(1, 13))
    self.assertEqual(field.dist(), expected)
def handle_field(self, field, model_name, app_name, **options):
    """Create or update a DataField for a single model field.

    Returns True when a DataField was created, False when an existing
    one was updated (with --force), or None when the field is skipped.
    """
    include_keys = options.get('include_keys')
    force = options.get('force')
    include_non_editable = options.get('include_non_editable')
    prepend_model_name = options.get('prepend_model_name')
    create_concepts = options.get('concepts')
    auto_publish = options.get('publish')
    create_categories = options.get('categories')

    # M2Ms do not make any sense here..
    if isinstance(field, ManyToManyField):
        return

    # Check for primary key, and foreign key fields
    if isinstance(field, self.key_field_types) and not include_keys:
        print(u'({0}) {1}.{2} is a primary or foreign key. Skipping...'.
              format(app_name, model_name, field.name))
        return

    # Ignore non-editable fields since in most cases they are for
    # managment purposes
    if not field.editable and not include_non_editable:
        print(u'({0}) {1}.{2} is not editable. Skipping...'.format(
            app_name, model_name, field.name))
        return

    # All but the field name is case-insensitive, do initial lookup
    # to see if it already exists, skip if it does
    lookup = {
        'app_name__iexact': app_name,
        'model_name__iexact': model_name,
        'field_name': field.name,
    }

    # Note, `name` is set below
    kwargs = {
        'description': field.help_text or None,
        'app_name': app_name,
        'model_name': model_name.lower(),
        'field_name': field.name,
    }

    try:
        f = DataField.objects.get(**lookup)
    except DataField.DoesNotExist:
        f = DataField(published=options.get('publish'), **kwargs)

    if f.pk:
        created = False
        if not force:
            print(u'({0}) {1}.{2} already exists. Skipping...'.format(
                app_name, model_name, field.name))
            return
        # Only overwrite if the source value is not falsy
        f.__dict__.update([(k, v) for k, v in kwargs.items()])
    else:
        created = True

    if not f.name:
        # Use the default unicode representation of the datafield
        if prepend_model_name:
            f.name = unicode(f)
        else:
            f.name = field.verbose_name.title()

    # Update fields with flags
    f.__dict__.update(utils.get_heuristic_flags(field))

    # Create category based on the model name and associate
    # it to the field.
    if create_categories:
        category, _ = DataCategory.objects\
            .get_or_create(name=f.model._meta.verbose_name.title(),
                           published=auto_publish)
        f.category = category
    else:
        category = None

    f.save()

    # Create a concept if one does not already exist for this field
    if create_concepts and not DataConcept.objects\
            .filter(fields=f).exists():
        kwargs = {
            'published': auto_publish,
            'category': category,
        }
        DataConcept.objects.create_from_field(f, **kwargs)

    return created
def get(self, request, pk):
    """Return labeled, optionally clustered dimension counts for a field.

    Groups the tree queryset by the requested dimensions, counts per
    group, optionally removes/clusters outliers for numeric data, and
    attaches value labels to each point."""
    instance = self.get_object(request, pk=pk)

    params = self.get_params(request)

    tree = trees[params.get('tree')]
    opts = tree.root_model._meta

    # Synthetic DataField over the tree root's primary key; pk embeds
    # the tree name to keep it distinct per tree.
    tree_field = DataField(pk='{0}:{1}'.format(params.get('tree'), pk),
                           app_name=opts.app_label,
                           model_name=opts.module_name,
                           field_name=opts.pk.name)

    # This will eventually make its way in the parametizer, but lists
    # are not supported.
    dimensions = request.GET.getlist('dimensions')

    # The `aware` flag makes the counts relative to the applied context.
    if params['aware']:
        context = self.get_context(request)
    else:
        context = None

    QueryProcessor = pipeline.query_processors[params['processor']]
    processor = QueryProcessor(context=context, tree=tree)
    queryset = processor.get_queryset(request=request)

    # Explicit fields to group by, ignore ones that dont exist or the
    # user does not have permission to view. Default is to group by the
    # reference field for disinct counts.
    if any(dimensions):
        fields = []
        groupby = []

        for pk in dimensions:
            f = self.get_object(request, pk=pk)

            if f:
                fields.append(f)
                groupby.append(tree.query_string_for_field(f.field,
                                                           model=f.model))
    else:
        fields = [instance]
        groupby = [tree.query_string_for_field(instance.field,
                                               model=instance.model)]

    # Exclude null values. Depending on the downstream use of the data,
    # nulls may or may not be desirable.
    if not params['nulls']:
        q = Q()

        for field in groupby:
            q = q & Q(**{'{0}__isnull'.format(field): False})

        queryset = queryset.filter(q)

    queryset = queryset.values(*groupby)

    # Begin constructing the response
    resp = {
        'data': [],
        'outliers': [],
        'clustered': False,
        'size': 0,
    }

    queryset = queryset.annotate(count=Count(tree_field.field.name))\
        .values_list('count', *groupby)

    # Evaluate list of points
    length = len(queryset)

    # Nothing to do
    if not length:
        usage.log('dims', instance=instance, request=request, data={
            'size': 0,
            'clustered': False,
            'aware': params['aware'],
        })
        return resp

    if length > MAXIMUM_OBSERVATIONS:
        data = {
            'message': 'Data too large',
        }
        return self.render(request, data,
                           status=codes.unprocessable_entity)

    # Apply ordering. If any of the fields are enumerable, ordering should
    # be relative to those fields. For continuous data, the ordering is
    # relative to the count of each group
    if (any([d.enumerable for d in fields]) and
            not params['sort'] == 'count'):
        queryset = queryset.order_by(*groupby)
    else:
        queryset = queryset.order_by('-count')

    clustered = False

    # Each result row is (count, dim1, dim2, ...).
    points = [{
        'count': point[0],
        'values': point[1:],
    } for point in list(queryset)]

    outliers = []

    # For N-dimensional continuous data, check if clustering should occur
    # to down-sample the data.
    if all([d.simple_type == 'number' for d in fields]):
        # Extract observations for clustering.
        obs = []

        null_points = []
        numeric_points = []

        for i, point in enumerate(points):
            # We need to handle points that have null dimensions
            # differently than those that are all numeric as the kmeans
            # module currently cannot handle mixed type dimensions so we
            # only allow fully numeric points to be passed to the kmeans
            # module.
            if None in point['values']:
                null_points.append(point)
                continue

            # Decimals are converted to floats for the kmeans math.
            for i, dim in enumerate(point['values']):
                if isinstance(dim, Decimal):
                    point['values'][i] = float(str(dim))

            numeric_points.append(point)
            obs.append(point['values'])

        # Perform k-means clustering. Determine centroids and calculate
        # the weighted count relatives to the centroid and observations
        # within the kmeans module.
        if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
            clustered = True

            counts = [p['count'] for p in numeric_points]
            points, outliers = kmeans.weighted_counts(
                obs, counts, params['n'])
        else:
            # No clustering: just flag and remove outliers.
            indexes = kmeans.find_outliers(obs, normalized=False)

            outliers = []

            for idx in indexes:
                outliers.append(numeric_points[idx])
                numeric_points[idx] = None

            points = [p for p in numeric_points if p is not None]

        # Now that we have done the analysis using the purely numeric
        # points, we can add the mixed/null dimensionality points back in
        # to the list before returning results.
        points += null_points

    usage.log('dims', instance=instance, request=request, data={
        'size': length,
        'clustered': clustered,
        'aware': params['aware'],
    })

    # Attach a human-readable label to each dimension value.
    labeled_points = []
    value_labels = tree_field.value_labels(queryset=queryset)

    for point in points:
        labeled_points.append({
            'count': point['count'],
            'values': [{
                'label': value_labels.get(value, smart_unicode(value)),
                'value': value
            } for value in point['values']]
        })

    return {
        'data': labeled_points,
        'clustered': clustered,
        'outliers': outliers,
        'size': length,
    }