Example #1
 def test_datafield(self):
     f = DataField(app_name='tests', model_name='month', field_name='id')
     self.assertEqual(f.values(), (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
     self.assertEqual(f.labels(), (u'January', u'February', u'March',
         u'April', u'May', u'June', u'July', u'August', u'September',
         u'October', u'November', u'December'))
     self.assertEqual(f.codes(), (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))
Example #2
    def _migrate_fields(self, **options):
        force = options.get('force')
        no_input = options.get('no_input')

        total_migrated = 0

        for lf in legacy.Field.objects.iterator():
            try:
                f = DataField.objects.get_by_natural_key(lf.app_name,
                                                         lf.model_name,
                                                         lf.field_name)
            except DataField.DoesNotExist:
                f = DataField(app_name=lf.app_name, model_name=lf.model_name,
                              field_name=lf.field_name)

            qualified_name = \
                u'({0}) {1}.{2}'.format(f.app_name, f.model_name, f.field_name)

            if f.pk and not force:
                print u'{0} already exists. Skipping...'.format(qualified_name)
                continue

            # Check if this is an orphan
            if not f.field:
                print u'{0} is orphaned. Skipping...'.format(qualified_name)
                continue

            # Map various fields
            f.name = lf.name
            f.description = lf.description
            f.keywords = lf.keywords
            f.translator = lf.translator
            f.group_id = lf.group_id

            print u'Migrated "{0}"'.format(qualified_name)

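            # Derive heuristic flags (e.g. enumerable) from the model field.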
            f.__dict__.update(utils.get_heuristic_flags(f.field))

            # Disagreement with enumerable status
            if not no_input and f.enumerable != lf.enable_choices:
                if lf.enable_choices:
                    override = raw_input(u'"{0}" is marked as enumerable, but '
                                         'does not qualify to be enumerable. '
                                         'Override? [y/N] '
                                         .format(qualified_name))
                else:
                    override = raw_input(u'"{0}" is not marked as enumerable, '
                                         'but qualifies to be enumerable. '
                                         'Override? [y/N] '
                                         .format(qualified_name))

                if override.lower() == 'y':
                    f.enumerable = lf.enable_choices

            f.save()
            f.sites = lf.sites.all()

            total_migrated += 1

        print u'Fields migrated:\t{0}'.format(total_migrated)
Example #3
    def handle(self, field_label, model_label, **options):
        model = get_model(*model_label.split('.'))

        if model is None:
            raise CommandError(u'No model named {0} was found'.format(
                model_label))

        toks = field_label.split('.')
        if len(toks) != 3:
            raise CommandError(u'{0} is not a valid field identifier. Use a '
                               '"." delimited notation, e.g. "app.model.field"'
                               .format(field_label))

        # Not required to be persisted to the database..
        f = DataField(app_name=toks[0], model_name=toks[1], field_name=toks[2])

        if not f.field:
            raise CommandError(u'The field {0} could not be found.'.format(
                field_label))

        count = 0
        values = list(f.values())
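        # Sort using float coercion so numeric values order numerically.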
        values.sort(key=coerce_float)

        for value in values:
            if value is None or value == '':
                continue
            label = non_alnum.sub(' ', value).title()
            obj = model(label=label, value=value, order=count, code=count)
            obj.save()
            count += 1

        print(u'{0} distinct values loaded'.format(count))
Example #4
    def handle(self, field_label, model_label, **options):
        model = get_model(*model_label.split('.'))

        if model is None:
            raise CommandError(
                u'No model named {0} was found'.format(model_label))

        toks = field_label.split('.')
        if len(toks) != 3:
            raise CommandError(
                u'{0} is not a valid field identifier. Use a '
                '"." delimited notation, e.g. "app.model.field"'.format(
                    field_label))

        # Not required to be persisted to the database..
        f = DataField(app_name=toks[0], model_name=toks[1], field_name=toks[2])

        if not f.field:
            raise CommandError(
                u'The field {0} could not be found.'.format(field_label))

        count = 0
        values = list(f.values())
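        # Sort using float coercion so numeric values order numerically.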
        values.sort(key=coerce_float)

        for value in values:
            if value is None or value == '':
                continue
            label = non_alnum.sub(' ', value).title()
            obj = model(label=label, value=value, order=count, code=count)
            obj.save()
            count += 1

        print(u'{0} distinct values loaded'.format(count))
Example #5
 def test_instance(self):
     f = DataField(app_name='avocado',
                   model_name='datafield',
                   field_name='name')
     f.save()
     usage.log('test', instance=f, async=False)
     self.assertEqual(Log.objects.get(pk=1).content_object, f)
Example #6
class TranslateTestCase(TestCase):
    fixtures = ['initial_data.json']

    # Define a translator with a through model and register it
    class T(VocabularyTranslator):
        through_model = TicketThrough
    registry.register(T, 'test')

    def setUp(self):
        management.call_command('avocado', 'init', 'tests', quiet=True)

        # Build item index
        TicketIndex.objects.index()

        # Create the text index DataField
        self.f = DataField(name='Ticket Index Item', app_name='tests',
            model_name='ticketindex', field_name='item')
        self.f.translator = "test"
        self.f.save()

    def test_only(self):
        c = DataContext(json={
            'field': self.f.pk,
            'operator': 'only',
            'value': [1, 6]
        })
        self.assertEqual([3], [x.pk for x in c.apply(tree=TicketHolder)])

    def test_excludes_any(self):
        c = DataContext(json={
            'field': self.f.pk,
            'operator': '-in',
            'value': [1, 2]
        })
        self.assertEqual([2], [x.pk for x in c.apply(tree=TicketHolder)])

    def test_excludes_all(self):
        c = DataContext(json={
            'field': self.f.pk,
            'operator': '-all',
            'value': [1, 5]
        })
        self.assertEqual([1, 2], sorted([x.pk for x in c.apply(tree=TicketHolder)]))

    def test_requires_all(self):
        c = DataContext(json={
            'field': self.f.pk,
            'operator': 'all',
            'value': [1, 3]
        })
        self.assertEqual([1, 3], sorted([x.pk for x in c.apply(tree=TicketHolder)]))

    def test_requires_any(self):
        c = DataContext(json={
            'field': self.f.pk,
            'operator': 'in',
            'value': [3]
        })
        self.assertEqual([1, 2, 3], sorted([x.pk for x in c.apply(tree=TicketHolder)]))
Example #7
 def forwards(self, orm):
     "Write your forwards methods here."
     try:
         f = DataField.objects.get_by_natural_key('variants.variantphenotype.hgmd_id')
     except DataField.DoesNotExist:
         f = DataField(app_name='variants', model_name='variantphenotype',
             field_name='hgmd_id', name='HGMD', published=True, searchable=True)
         f.save()
Example #8
 def test_datafield(self):
     f = DataField(app_name='tests', model_name='month', field_name='id')
     self.assertEqual(f.values(), (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
     self.assertEqual(f.labels(),
                      (u'January', u'February', u'March', u'April', u'May',
                       u'June', u'July', u'August', u'September',
                       u'October', u'November', u'December'))
     self.assertEqual(f.codes(), (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))
Example #9
 def test_datafield_properties(self):
     [RecordSet(name=u'Set {0}'.format(i)).save() for i in xrange(10)]
     f = DataField(app_name='tests', model_name='recordset',
                   field_name='id')
     self.assertEqual(list(f.values()), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
     self.assertEqual(list(f.labels()), ['Set 0', 'Set 1', 'Set 2', 'Set 3',
                                         'Set 4', 'Set 5', 'Set 6', 'Set 7',
                                         'Set 8', 'Set 9'])
Example #10
    def test_instance(self):
        f = DataField(app_name='avocado',
                      model_name='datafield',
                      field_name='name')
        f.save()

        usage.log('test', instance=f, async=False)
        event = Log.objects.all()[0]
        self.assertEqual(event.content_object, f)
Example #11
    def test_translator(self):
        s = RecordSet()
        s.save()
        objs = [Record(pk=i) for i in xrange(1, 11)]
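        # Add all of the records to the set in one bulk operation.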
        s.bulk(objs)

        f = DataField(app_name='sets', model_name='recordset', field_name='id')
        trans = f.translate(value=s.pk, tree=Record)
        self.assertEqual(unicode(trans['query_modifiers']['condition']),
            "(AND: ('recordset__id__exact', 1))")
Example #12
 def test_datafield_properties(self):
     [RecordSet(name=u'Set {0}'.format(i)).save() for i in xrange(10)]
     f = DataField(app_name='tests',
                   model_name='recordset',
                   field_name='id')
     self.assertEqual(list(f.values()), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
     self.assertEqual(list(f.labels()), [
         'Set 0', 'Set 1', 'Set 2', 'Set 3', 'Set 4', 'Set 5', 'Set 6',
         'Set 7', 'Set 8', 'Set 9'
     ])
Example #13
    def test_translator(self):
        s = RecordSet()
        s.save()
        objs = [Record(pk=i) for i in xrange(1, 11)]
        s.bulk(objs)

        f = DataField(app_name='tests',
                      model_name='recordset',
                      field_name='id')
        trans = f.translate(value=s.pk, tree=Record)
        self.assertEqual(unicode(trans['query_modifiers']['condition']),
                         "(AND: ('recordset__id__exact', 1))")
Example #14
 def forwards(self, orm):
     "Write your forwards methods here."
     try:
         f = DataField.objects.get_by_natural_key(
             'variants.variantphenotype.hgmd_id')
     except DataField.DoesNotExist:
         f = DataField(app_name='variants',
                       model_name='variantphenotype',
                       field_name='hgmd_id',
                       name='HGMD',
                       published=True,
                       searchable=True)
         f.save()
Example #15
    def test_dataview_order_by(self):
        f = DataField(app_name='lexicon', model_name='month', field_name='id')
        f.save()

        c = DataConcept()
        c.save()

        cf = DataConceptField(field=f, concept=c)
        cf.save()

        v = DataView({'ordering': [c.pk]})

        qs = Month.objects.filter(label__startswith='J').values('id')
        self.assertEqual(
            str(v.apply(qs).query),
            'SELECT "lexicon_month"."id" FROM "lexicon_month" WHERE "lexicon_month"."label" LIKE J% ESCAPE \'\\\'  ORDER BY "lexicon_month"."order" ASC'
        )
Example #16
    def test_dataview_order_by(self):
        f = DataField(app_name='tests', model_name='month', field_name='id')
        f.save()

        c = DataConcept()
        c.save()

        cf = DataConceptField(field=f, concept=c)
        cf.save()

        v = DataView({'ordering': [[c.pk, 'asc']]})

        qs = Month.objects.filter(label__startswith='J').values('id')
        self.assertEqual(
            unicode(v.apply(qs).query),
            'SELECT "tests_month"."id" FROM "tests_month" WHERE "tests_month"."label" LIKE J% ESCAPE \'\\\'  ORDER BY "tests_month"."order" ASC'
        )
Example #17
    def setUp(self):
        management.call_command('avocado', 'init', 'tests', quiet=True)

        # Build item index
        TicketIndex.objects.index()

        # Create the text index DataField
        self.f = DataField(name='Ticket Index Item', app_name='tests',
            model_name='ticketindex', field_name='item')
        self.f.translator = "test"
        self.f.save()
Example #18
 def setUp(self):
     self.f = DataField.init('tests.title.name')
Example #19
    def _migrate_fields(self, **options):
        force = options.get('force')
        no_input = options.get('no_input')

        total_migrated = 0

        for lf in legacy.Field.objects.iterator():
            try:
                f = DataField.objects.get_by_natural_key(
                    lf.app_name, lf.model_name, lf.field_name)
            except DataField.DoesNotExist:
                f = DataField(app_name=lf.app_name,
                              model_name=lf.model_name,
                              field_name=lf.field_name)

            qualified_name = \
                u'({0}) {1}.{2}'.format(f.app_name, f.model_name, f.field_name)

            if f.pk and not force:
                print u'{0} already exists. Skipping...'.format(qualified_name)
                continue

            # Check if this is an orphan
            if not f.field:
                print u'{0} is orphaned. Skipping...'.format(qualified_name)
                continue

            # Map various fields
            f.name = lf.name
            f.description = lf.description
            f.keywords = lf.keywords
            f.translator = lf.translator
            f.group_id = lf.group_id

            print u'Migrated "{0}"'.format(qualified_name)

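            # Derive heuristic flags (e.g. enumerable) from the model field.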
            f.__dict__.update(utils.get_heuristic_flags(f.field))

            # Disagreement with enumerable status
            if not no_input and f.enumerable != lf.enable_choices:
                if lf.enable_choices:
                    override = raw_input(
                        u'"{0}" is marked as enumerable, but '
                        'does not qualify to be enumerable. '
                        'Override? [y/N] '.format(qualified_name))
                else:
                    override = raw_input(
                        u'"{0}" is not marked as enumerable, '
                        'but qualifies to be enumerable. '
                        'Override? [y/N] '.format(qualified_name))

                if override.lower() == 'y':
                    f.enumerable = lf.enable_choices

            f.save()
            f.sites = lf.sites.all()

            total_migrated += 1

        print u'Fields migrated:\t{0}'.format(total_migrated)
Example #20
    def get(self, request, pk):
        instance = self.get_object(request, pk=pk)
        params = self.get_params(request)

        tree = trees[params.get('tree')]
        opts = tree.root_model._meta
        tree_field = DataField(app_name=opts.app_label,
                               model_name=opts.module_name,
                               field_name=opts.pk.name)

        # This will eventually make its way into the parametizer, but lists
        # are not supported
        dimensions = request.GET.getlist('dimensions')

        # The `aware` flag toggles whether the distribution is computed
        # relative to the applied context.
        if params['aware']:
            attrs = None
        else:
            attrs = {}

        # Get and apply context relative to the tree
        context = self.get_context(request, attrs=attrs)
        queryset = context.apply(tree=tree)

        # Explicit fields to group by; ignore ones that don't exist or the
        # user does not have permission to view. Default is to group by the
        # reference field for distinct counts.
        if any(dimensions):
            fields = []
            groupby = []

            for pk in dimensions:
                f = self.get_object(request, pk=pk)
                if f:
                    fields.append(f)
                    groupby.append(tree.query_string_for_field(f.field))
        else:
            fields = [instance]
            groupby = [tree.query_string_for_field(instance.field)]

        # Perform a count aggregation of the tree model grouped by the
        # specified dimensions
        stats = tree_field.count(*groupby)

        # Apply it relative to the queryset
        stats = stats.apply(queryset)

        # Exclude null values. Depending on the downstream use of the data,
        # nulls may or may not be desirable.
        if not params['nulls']:
            q = Q()
            for field in groupby:
                q = q | Q(**{field: None})
            stats = stats.exclude(q)

        # Begin constructing the response
        resp = {
            'data': [],
            'outliers': [],
            'clustered': False,
            'size': 0,
        }

        # Evaluate list of points
        length = len(stats)

        # Nothing to do
        if not length:
            usage.log('dist',
                      instance=instance,
                      request=request,
                      data={
                          'size': 0,
                          'clustered': False,
                          'aware': params['aware'],
                      })
            return resp

        if length > MAXIMUM_OBSERVATIONS:
            data = {
                'message': 'Data too large',
            }
            return self.render(request,
                               data,
                               status=codes.unprocessable_entity)

        # Apply ordering. If any of the fields are enumerable, ordering should
        # be relative to those fields. For continuous data, the ordering is
        # relative to the count of each group
        if (any([d.enumerable for d in fields])
                and params['sort'] != 'count'):
            stats = stats.order_by(*groupby)
        else:
            stats = stats.order_by('-count')

        clustered = False
        points = list(stats)
        outliers = []

        # For N-dimensional continuous data, check if clustering should occur
        # to down-sample the data.
        if all([d.simple_type == 'number' for d in fields]):
            # Extract observations for clustering
            obs = []
            for point in points:
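                # Cast any Decimal dimensions to floats for the kmeans module.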
                for i, dim in enumerate(point['values']):
                    if isinstance(dim, Decimal):
                        point['values'][i] = float(str(dim))
                obs.append(point['values'])

            # Perform k-means clustering. Determine centroids and calculate
            # the weighted counts relative to the centroid and observations
            # within the kmeans module.
            if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
                clustered = True

                counts = [p['count'] for p in points]
                points, outliers = kmeans.weighted_counts(
                    obs, counts, params['n'])
            else:
                indexes = kmeans.find_outliers(obs, normalized=False)

                outliers = []
                for idx in indexes:
                    outliers.append(points[idx])
                    points[idx] = None
                points = [p for p in points if p is not None]

        usage.log('dist',
                  instance=instance,
                  request=request,
                  data={
                      'size': length,
                      'clustered': clustered,
                      'aware': params['aware'],
                  })

        return {
            'data': points,
            'clustered': clustered,
            'outliers': outliers,
            'size': length,
        }
Example #21
    def get(self, request):
        params = self.get_params(request)

        if params['aware']:
            context = self.get_context(request)
        else:
            context = DataContext()

        # Get all published app/model pairs to produce counts for.
        model_names = DataField.objects.published()\
            .values_list('app_name', 'model_name')\
            .order_by('model_name').distinct()

        results = []
        data = []
        models = set()

        QueryProcessor = pipeline.query_processors[params['processor']]

        # Workaround for a Python bug for versions 2.7.5 and below
        # http://bugs.python.org/issue10015
        if not hasattr(threading.current_thread(), '_children'):
            threading.current_thread()._children = weakref.WeakKeyDictionary()

        # Pool of threads to execute the counts in parallel
        pool = ThreadPool()

        for app_name, model_name in model_names:
            # DataField used here to resolve foreign key-based fields.
            model = DataField(app_name=app_name, model_name=model_name).model

            # No redundant counts
            if model in models:
                continue

            models.add(model)

            opts = model._meta

            # Format is called to resolve Django's internal proxy wrapper.
            verbose_name = opts.verbose_name.format()
            verbose_name_plural = opts.verbose_name_plural.format()

            # Assume no custom verbose_name has been set in the Meta class, so
            # apply a minimal title-case.
            if verbose_name.islower():
                verbose_name = verbose_name.title()

            if verbose_name_plural.islower():
                verbose_name_plural = verbose_name_plural.title()

            # Placeholder with the model name. The count will be replaced if
            # successful.
            data.append({
                'count': None,
                'app_name': app_name,
                'model_name': model_name,
                'verbose_name': verbose_name,
                'verbose_name_plural': verbose_name_plural,
            })

            # Asynchronously execute the count
            result = pool.apply_async(get_count,
                                      args=(request, model, params['refresh'],
                                            QueryProcessor, context))

            results.append(result)

        pool.close()

        for i, r in enumerate(results):
            try:
                count = r.get(timeout=serrano_settings.STATS_COUNT_TIMEOUT)
                data[i]['count'] = count
            except Exception:
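                # The count timed out or failed; leave the placeholder None.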
                pass

        return data
Example #22
    def get(self, request, pk):
        instance = self.get_object(request, pk=pk)
        params = self.get_params(request)

        tree = trees[params.get('tree')]
        opts = tree.root_model._meta
        tree_field = DataField(
            app_name=opts.app_label, model_name=opts.module_name,
            field_name=opts.pk.name)

        # This will eventually make its way into the parametizer, but lists
        # are not supported
        dimensions = request.GET.getlist('dimensions')

        if params['aware']:
            context = self.get_context(request)
        else:
            context = None

        QueryProcessor = pipeline.query_processors[params['processor']]
        processor = QueryProcessor(context=context, tree=tree)

        queryset = processor.get_queryset(request=request)

        # Explicit fields to group by; ignore ones that don't exist or the
        # user does not have permission to view. Default is to group by the
        # reference field for distinct counts.
        if any(dimensions):
            fields = []
            groupby = []

            for pk in dimensions:
                f = self.get_object(request, pk=pk)
                if f:
                    fields.append(f)
                    groupby.append(tree.query_string_for_field(f.field,
                                                               model=f.model))
        else:
            fields = [instance]
            groupby = [tree.query_string_for_field(instance.field,
                                                   model=instance.model)]

        # Perform a count aggregation of the tree model grouped by the
        # specified dimensions
        stats = tree_field.count(*groupby)

        # Apply it relative to the queryset
        stats = stats.apply(queryset)

        # Exclude null values. Depending on the downstream use of the data,
        # nulls may or may not be desirable.
        if not params['nulls']:
            q = Q()
            for field in groupby:
                q = q | Q(**{field: None})
            stats = stats.exclude(q)

        # Begin constructing the response
        resp = {
            'data': [],
            'outliers': [],
            'clustered': False,
            'size': 0,
        }

        # Evaluate list of points
        length = len(stats)

        # Nothing to do
        if not length:
            usage.log('dist', instance=instance, request=request, data={
                'size': 0,
                'clustered': False,
                'aware': params['aware'],
            })
            return resp

        if length > MAXIMUM_OBSERVATIONS:
            data = {
                'message': 'Data too large',
            }
            return self.render(request, data,
                               status=codes.unprocessable_entity)

        # Apply ordering. If any of the fields are enumerable, ordering should
        # be relative to those fields. For continuous data, the ordering is
        # relative to the count of each group
        if (any([d.enumerable for d in fields]) and
                params['sort'] != 'count'):
            stats = stats.order_by(*groupby)
        else:
            stats = stats.order_by('-count')

        clustered = False
        points = list(stats)
        outliers = []

        # For N-dimensional continuous data, check if clustering should occur
        # to down-sample the data.
        if all([d.simple_type == 'number' for d in fields]):
            # Extract observations for clustering
            obs = []
            for point in points:
                for i, dim in enumerate(point['values']):
                    if isinstance(dim, Decimal):
                        point['values'][i] = float(str(dim))
                obs.append(point['values'])

            # Perform k-means clustering. Determine centroids and calculate
            # the weighted counts relative to the centroid and observations
            # within the kmeans module.
            if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
                clustered = True

                counts = [p['count'] for p in points]
                points, outliers = kmeans.weighted_counts(
                    obs, counts, params['n'])
            else:
                indexes = kmeans.find_outliers(obs, normalized=False)

                outliers = []
                for idx in indexes:
                    outliers.append(points[idx])
                    points[idx] = None
                points = [p for p in points if p is not None]

        usage.log('dist', instance=instance, request=request, data={
            'size': length,
            'clustered': clustered,
            'aware': params['aware'],
        })

        return {
            'data': points,
            'clustered': clustered,
            'outliers': outliers,
            'size': length,
        }
Example #23
    def get(self, request, pk):
        instance = self.get_object(request, pk=pk)
        params = self.get_params(request)

        tree = trees[params.get('tree')]
        opts = tree.root_model._meta

        tree_field = DataField(pk='{0}:{1}'.format(params.get('tree'), pk),
                               app_name=opts.app_label,
                               model_name=opts.module_name,
                               field_name=opts.pk.name)

        # This will eventually make its way into the parametizer, but lists
        # are not supported.
        dimensions = request.GET.getlist('dimensions')

        if params['aware']:
            context = self.get_context(request)
        else:
            context = None

        QueryProcessor = pipeline.query_processors[params['processor']]
        processor = QueryProcessor(context=context, tree=tree)
        queryset = processor.get_queryset(request=request)

        # Explicit fields to group by; ignore ones that don't exist or the
        # user does not have permission to view. Default is to group by the
        # reference field for distinct counts.
        if any(dimensions):
            fields = []
            groupby = []

            for pk in dimensions:
                f = self.get_object(request, pk=pk)

                if f:
                    fields.append(f)
                    groupby.append(
                        tree.query_string_for_field(f.field, model=f.model))
        else:
            fields = [instance]
            groupby = [
                tree.query_string_for_field(instance.field,
                                            model=instance.model)
            ]

        # Exclude null values. Depending on the downstream use of the data,
        # nulls may or may not be desirable.
        if not params['nulls']:
            q = Q()

            for field in groupby:
                q = q & Q(**{'{0}__isnull'.format(field): False})

            queryset = queryset.filter(q)

        queryset = queryset.values(*groupby)

        # Begin constructing the response
        resp = {
            'data': [],
            'outliers': [],
            'clustered': False,
            'size': 0,
        }

        queryset = queryset.annotate(count=Count(tree_field.field.name))\
            .values_list('count', *groupby)

        # Evaluate list of points
        length = len(queryset)

        # Nothing to do
        if not length:
            usage.log('dims',
                      instance=instance,
                      request=request,
                      data={
                          'size': 0,
                          'clustered': False,
                          'aware': params['aware'],
                      })

            return resp

        if length > MAXIMUM_OBSERVATIONS:
            data = {
                'message': 'Data too large',
            }

            return self.render(request,
                               data,
                               status=codes.unprocessable_entity)

        # Apply ordering. If any of the fields are enumerable, ordering should
        # be relative to those fields. For continuous data, the ordering is
        # relative to the count of each group
        if (any([d.enumerable for d in fields])
                and params['sort'] != 'count'):
            queryset = queryset.order_by(*groupby)
        else:
            queryset = queryset.order_by('-count')

        clustered = False

        points = [{
            'count': point[0],
            # A list (not a tuple slice) so values can be cast in-place below.
            'values': list(point[1:]),
        } for point in list(queryset)]

        outliers = []

        # For N-dimensional continuous data, check if clustering should occur
        # to down-sample the data.
        if all([d.simple_type == 'number' for d in fields]):
            # Extract observations for clustering.
            obs = []

            null_points = []
            numeric_points = []

            for point in points:
                # Points with null dimensions must be handled separately from
                # fully numeric ones, since the kmeans module currently cannot
                # handle mixed-type dimensions; only fully numeric points are
                # passed to it.
                if None in point['values']:
                    null_points.append(point)
                    continue

                for i, dim in enumerate(point['values']):
                    if isinstance(dim, Decimal):
                        point['values'][i] = float(str(dim))

                numeric_points.append(point)
                obs.append(point['values'])

            # Perform k-means clustering. Determine centroids and calculate
            # the weighted counts relative to the centroid and observations
            # within the kmeans module.
            if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
                clustered = True

                counts = [p['count'] for p in numeric_points]
                points, outliers = kmeans.weighted_counts(
                    obs, counts, params['n'])
            else:
                indexes = kmeans.find_outliers(obs, normalized=False)

                outliers = []

                for idx in indexes:
                    outliers.append(numeric_points[idx])
                    numeric_points[idx] = None

                points = [p for p in numeric_points if p is not None]

            # Now that we have done the analysis using the purely numeric
            # points, we can add the mixed/null dimensionality points back in
            # to the list before returning results.
            points += null_points

        usage.log('dims',
                  instance=instance,
                  request=request,
                  data={
                      'size': length,
                      'clustered': clustered,
                      'aware': params['aware'],
                  })

        labeled_points = []
        value_labels = tree_field.value_labels(queryset=queryset)

        for point in points:
            labeled_points.append({
                'count': point['count'],
                'values': [{
                    'label': value_labels.get(value, smart_unicode(value)),
                    'value': value
                } for value in point['values']]
            })

        return {
            'data': labeled_points,
            'clustered': clustered,
            'outliers': outliers,
            'size': length,
        }
Example #24
    def handle_field(self, field, model_name, app_name, **options):
        include_keys = options.get('include_keys')
        force = options.get('force')
        include_non_editable = options.get('include_non_editable')
        prepend_model_name = options.get('prepend_model_name')
        create_concepts = options.get('concepts')
        auto_publish = options.get('publish')
        create_categories = options.get('categories')

        # M2Ms do not make any sense here..
        if isinstance(field, ManyToManyField):
            return

        if dep_supported('objectset'):
            from objectset.models import ObjectSet
            objectset = issubclass(field.model, ObjectSet)
        else:
            objectset = False

        lexicon = issubclass(field.model, Lexicon)

        # Lexicons and ObjectSets are represented via their primary key, so
        # these may pass
        if not objectset and not lexicon:
            # Check for primary key, and foreign key fields
            if isinstance(field, self.key_field_types) and not include_keys:
                print(u'({0}) {1}.{2} is a primary or foreign key. Skipping...'
                      .format(app_name, model_name, field.name))
                return

            # Ignore non-editable fields since in most cases they are for
            # management purposes
            if not field.editable and not include_non_editable:
                print(u'({0}) {1}.{2} is not editable. Skipping...'
                      .format(app_name, model_name, field.name))
                return

        # All but the field name are case-insensitive; do an initial lookup
        # to see if it already exists and skip if it does
        lookup = {
            'app_name__iexact': app_name,
            'model_name__iexact': model_name,
            'field_name': field.name,
        }

        # Note, `name` is set below
        kwargs = {
            'description': field.help_text or None,
            'app_name': app_name,
            'model_name': model_name.lower(),
            'field_name': field.name,
        }

        if lexicon:
            kwargs.update({
                'label_field_name': 'label',
                'order_field_name': 'order',
                'code_field_name': 'code',
            })
        elif objectset and hasattr(field.model, 'label_field'):
            kwargs.update({
                'label_field_name': field.model.label_field
            })

        try:
            f = DataField.objects.get(**lookup)
        except DataField.DoesNotExist:
            f = DataField(published=auto_publish, **kwargs)

        if f.pk:
            created = False
            if not force:
                print(u'({0}) {1}.{2} already exists. Skipping...'
                      .format(app_name, model_name, field.name))
                return
            # Only overwrite if the source value is not falsy
            f.__dict__.update([(k, v) for k, v in kwargs.items() if v])
        else:
            created = True

        if not f.name:
            # Use the default unicode representation of the datafield
            if prepend_model_name:
                f.name = unicode(f)
            else:
                f.name = field.verbose_name.title()

        # Update fields with flags
        f.__dict__.update(utils.get_heuristic_flags(field))

        # Create category based on the model name and associate
        # it to the field.
        if create_categories:
            category, _ = DataCategory.objects\
                .get_or_create(name=f.model._meta.verbose_name.title(),
                               published=auto_publish)
            f.category = category
        else:
            category = None

        f.save()

        # Create a concept if one does not already exist for this field
        if create_concepts and not DataConcept.objects\
                .filter(fields=f).exists():

            kwargs = {
                'published': auto_publish,
                'category': category,
            }

            DataConcept.objects.create_from_field(f, **kwargs)

        return created
Example #25
    def handle_field(self, field, model_name, app_name, **options):
        include_keys = options.get('include_keys')
        force = options.get('force')
        include_non_editable = options.get('include_non_editable')

        # M2Ms do not make any sense here..
        if isinstance(field, ManyToManyField):
            return

        # Lexicons and ObjectSets are represented via their primary key, so
        # these may pass
        if not issubclass(field.model, (Lexicon, ObjectSet)):
            # Check for primary key, and foreign key fields
            if isinstance(field, self.key_field_types) and not include_keys:
                return

            # Ignore non-editable fields since in most cases they are for
            # management purposes
            if not field.editable and not include_non_editable:
                return

        # All but the field name are case-insensitive; do an initial lookup
        # to see if it already exists and skip if it does
        lookup = {
            'app_name__iexact': app_name,
            'model_name__iexact': model_name,
            'field_name': field.name,
        }

        # Note, `name` is set below
        kwargs = {
            'description': field.help_text or None,
            'app_name': app_name,
            'model_name': model_name.lower(),
            'field_name': field.name,
        }

        try:
            datafield = DataField.objects.get(**lookup)
        except DataField.DoesNotExist:
            datafield = DataField(published=False, **kwargs)

        if datafield.pk:
            created = False
            if not force:
                print '({0}) {1}.{2} already exists. Skipping...'.format(
                    app_name, model_name, field.name)
                return
            # Only overwrite if the source value is not falsy
            datafield.__dict__.update(
                [(k, v) for k, v in kwargs.items() if v])
        else:
            created = True

        if not datafield.name:
            # Use the default unicode representation of the datafield
            datafield.name = unicode(datafield)

        # Update fields with flags
        datafield.__dict__.update(utils.get_heuristic_flags(datafield))
        datafield.save()
        return created
Example #26
 def test_dist(self):
     f = DataField(app_name='tests', model_name='date', field_name='month')
     # Months of the year
     result = tuple([(i, 1) for i in range(1, 13)])
     self.assertEqual(f.dist(), result)
Example #27
    def handle_field(self, field, model_name, app_name, **options):
        include_keys = options.get('include_keys')
        force = options.get('force')
        include_non_editable = options.get('include_non_editable')
        prepend_model_name = options.get('prepend_model_name')
        create_concepts = options.get('concepts')
        auto_publish = options.get('publish')
        create_categories = options.get('categories')

        # M2Ms do not make any sense here..
        if isinstance(field, ManyToManyField):
            return

        # Check for primary key, and foreign key fields
        if isinstance(field, self.key_field_types) and not include_keys:
            print(u'({0}) {1}.{2} is a primary or foreign key. Skipping...'.
                  format(app_name, model_name, field.name))
            return

        # Ignore non-editable fields since in most cases they are for
        # management purposes
        if not field.editable and not include_non_editable:
            print(u'({0}) {1}.{2} is not editable. Skipping...'.format(
                app_name, model_name, field.name))
            return

        # All but the field name are case-insensitive; do an initial lookup
        # to see if it already exists and skip if it does
        lookup = {
            'app_name__iexact': app_name,
            'model_name__iexact': model_name,
            'field_name': field.name,
        }

        # Note, `name` is set below
        kwargs = {
            'description': field.help_text or None,
            'app_name': app_name,
            'model_name': model_name.lower(),
            'field_name': field.name,
        }

        try:
            f = DataField.objects.get(**lookup)
        except DataField.DoesNotExist:
            f = DataField(published=auto_publish, **kwargs)

        if f.pk:
            created = False
            if not force:
                print(u'({0}) {1}.{2} already exists. Skipping...'.format(
                    app_name, model_name, field.name))
                return
            # Only overwrite if the source value is not falsy
            f.__dict__.update([(k, v) for k, v in kwargs.items() if v])
        else:
            created = True

        if not f.name:
            # Use the default unicode representation of the datafield
            if prepend_model_name:
                f.name = unicode(f)
            else:
                f.name = field.verbose_name.title()

        # Update fields with flags
        f.__dict__.update(utils.get_heuristic_flags(field))

        # Create category based on the model name and associate
        # it to the field.
        if create_categories:
            category, _ = DataCategory.objects\
                .get_or_create(name=f.model._meta.verbose_name.title(),
                               published=auto_publish)
            f.category = category
        else:
            category = None

        f.save()

        # Create a concept if one does not already exist for this field
        if create_concepts and not DataConcept.objects\
                .filter(fields=f).exists():

            kwargs = {
                'published': auto_publish,
                'category': category,
            }

            DataConcept.objects.create_from_field(f, **kwargs)

        return created
Example #28
    def get(self, request, pk):
        instance = self.get_object(request, pk=pk)
        params = self.get_params(request)

        tree = trees[params.get('tree')]
        opts = tree.root_model._meta

        tree_field = DataField(pk='{0}:{1}'.format(params.get('tree'), pk),
                               app_name=opts.app_label,
                               model_name=opts.module_name,
                               field_name=opts.pk.name)

        # This will eventually make its way into the parametizer, but lists
        # are not supported.
        dimensions = request.GET.getlist('dimensions')

        if params['aware']:
            context = self.get_context(request)
        else:
            context = None

        QueryProcessor = pipeline.query_processors[params['processor']]
        processor = QueryProcessor(context=context, tree=tree)
        queryset = processor.get_queryset(request=request)

        # Explicit fields to group by; ignore ones that don't exist or the
        # user does not have permission to view. Default is to group by the
        # reference field for distinct counts.
        if any(dimensions):
            fields = []
            groupby = []

            for pk in dimensions:
                f = self.get_object(request, pk=pk)

                if f:
                    fields.append(f)
                    groupby.append(tree.query_string_for_field(f.field,
                                                               model=f.model))
        else:
            fields = [instance]
            groupby = [tree.query_string_for_field(instance.field,
                                                   model=instance.model)]

        # Exclude null values. Depending on the downstream use of the data,
        # nulls may or may not be desirable.
        if not params['nulls']:
            q = Q()

            for field in groupby:
                q = q & Q(**{'{0}__isnull'.format(field): False})

            queryset = queryset.filter(q)

        queryset = queryset.values(*groupby)

        # Begin constructing the response
        resp = {
            'data': [],
            'outliers': [],
            'clustered': False,
            'size': 0,
        }

        queryset = queryset.annotate(count=Count(tree_field.field.name))\
            .values_list('count', *groupby)

        # Evaluate list of points
        length = len(queryset)

        # Nothing to do
        if not length:
            usage.log('dims', instance=instance, request=request, data={
                'size': 0,
                'clustered': False,
                'aware': params['aware'],
            })

            return resp

        if length > MAXIMUM_OBSERVATIONS:
            data = {
                'message': 'Data too large',
            }

            return self.render(request, data,
                               status=codes.unprocessable_entity)

        # Apply ordering. If any of the fields are enumerable, ordering should
        # be relative to those fields. For continuous data, the ordering is
        # relative to the count of each group
        if (any([d.enumerable for d in fields]) and
                params['sort'] != 'count'):
            queryset = queryset.order_by(*groupby)
        else:
            queryset = queryset.order_by('-count')

        clustered = False

        points = [{
            'count': point[0],
            # A list (not a tuple slice) so values can be cast in-place below.
            'values': list(point[1:]),
        } for point in list(queryset)]

        outliers = []

        # For N-dimensional continuous data, check if clustering should occur
        # to down-sample the data.
        if all([d.simple_type == 'number' for d in fields]):
            # Extract observations for clustering.
            obs = []

            null_points = []
            numeric_points = []

            for point in points:
                # Points with null dimensions must be handled separately from
                # fully numeric ones, since the kmeans module currently cannot
                # handle mixed-type dimensions; only fully numeric points are
                # passed to it.
                if None in point['values']:
                    null_points.append(point)
                    continue

                for i, dim in enumerate(point['values']):
                    if isinstance(dim, Decimal):
                        point['values'][i] = float(str(dim))

                numeric_points.append(point)
                obs.append(point['values'])

            # Perform k-means clustering. Determine centroids and calculate
            # the weighted counts relative to the centroid and observations
            # within the kmeans module.
            if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
                clustered = True

                counts = [p['count'] for p in numeric_points]
                points, outliers = kmeans.weighted_counts(
                    obs, counts, params['n'])
            else:
                indexes = kmeans.find_outliers(obs, normalized=False)

                outliers = []

                for idx in indexes:
                    outliers.append(numeric_points[idx])
                    numeric_points[idx] = None

                points = [p for p in numeric_points if p is not None]

            # Now that we have done the analysis using the purely numeric
            # points, we can add the mixed/null dimensionality points back in
            # to the list before returning results.
            points += null_points

        usage.log('dims', instance=instance, request=request, data={
            'size': length,
            'clustered': clustered,
            'aware': params['aware'],
        })

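        # Map raw values to display labels for the response payload.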
        labeled_points = []
        value_labels = tree_field.value_labels(queryset=queryset)

        for point in points:
            labeled_points.append({
                'count': point['count'],
                'values': [{
                    'label': value_labels.get(value, smart_unicode(value)),
                    'value': value
                } for value in point['values']]
            })

        return {
            'data': labeled_points,
            'clustered': clustered,
            'outliers': outliers,
            'size': length,
        }
Example #29
 def setUp(self):
     self.f = DataField.init('tests.title.name')