Example #1
    def test_weighted_counts(self):
        """
        We might expect these to be 300, 200, and 100 but, running through
        this manually, the returned centroids from kmeans using a k of 3 will be:
            [[0.9999999999999954, 0.9999999999999954, 0.9999999999999954], 
             [2.0000000000000018, 2.0000000000000018, 2.0000000000000018], 
             [2.9999999999999964, 2.9999999999999964, 2.9999999999999964]]
        
        which are close to but not exactly the perfect solution of:
        
            [[1.0, 1.0, 1.0],
             [2.0, 2.0, 2.0],
             [3.0, 3.0, 3.0]]

        This is a result of the floating point math in k-means. Because of
        these seemingly trivial differences, we get distances that aren't
        quite zero, meaning there will be nonzero dist_sum values in the
        weighted_counts method. Because of that, we lose a fraction of a
        count on each of the weighted count calculations, which, when summed
        over each centroid, accounts for a difference of between 1 and 2 in
        the returned counts versus the expected counts. The weighted_counts
        method is deemed correct even though we expect counts of
        [300, 200, 100] because it can't be expected to overcome the issues
        introduced by the floating point arithmetic during k-means execution.
        """

        expected_counts = [299, 199, 98]
        counts = [1] * 605

        centroid_counts, _ = kmeans.weighted_counts(int_points_3d, counts, 3)
        m_counts = [c['count'] for c in centroid_counts]
        self.assertSequenceEqual(expected_counts, m_counts)

        centroid_counts, _ = kmeans.weighted_counts(int_points, counts, 3)
        m_counts = [c['count'] for c in centroid_counts]
        self.assertSequenceEqual(expected_counts, m_counts)
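The fractional count loss described in the docstring can be reproduced with plain floating point arithmetic. A minimal sketch, assuming a hypothetical per-point weight that is simply discounted by the distance to the centroid (the real weighted_counts formula may differ):

import math

# The centroid k-means actually converges to versus the exact solution
# of [1.0, 1.0, 1.0].
actual = [0.9999999999999954] * 3
point = (1, 1, 1)

# A point sitting exactly at the "perfect" centroid is a tiny but nonzero
# distance away from the centroid k-means returned.
dist = math.sqrt(sum((p - c) ** 2 for p, c in zip(point, actual)))
print(dist)  # ~8e-15, not quite zero

# Hypothetical weight: each observation contributes slightly less than a
# full count, so 300 such points truncate to 299.
w = 1.0 - dist
print(int(300 * w))  # 299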
Example #2
    def get(self, request, pk):
        instance = self.get_object(request, pk=pk)
        params = self.get_params(request)

        tree = trees[params.get('tree')]
        opts = tree.root_model._meta
        tree_field = DataField(app_name=opts.app_label,
                               model_name=opts.module_name,
                               field_name=opts.pk.name)

        # This will eventually make its way into the parametizer, but lists
        # are not supported.
        dimensions = request.GET.getlist('dimensions')

        # The `aware` flag toggles the behavior of the distribution by making
        # it relative to the applied context or not
        if params['aware']:
            attrs = None
        else:
            attrs = {}

        # Get and apply context relative to the tree
        context = self.get_context(request, attrs=attrs)
        queryset = context.apply(tree=tree)

        # Explicit fields to group by; ignore ones that don't exist or that
        # the user does not have permission to view. The default is to group
        # by the reference field for distinct counts.
        if any(dimensions):
            fields = []
            groupby = []

            for pk in dimensions:
                f = self.get_object(request, pk=pk)
                if f:
                    fields.append(f)
                    groupby.append(tree.query_string_for_field(f.field))
        else:
            fields = [instance]
            groupby = [tree.query_string_for_field(instance.field)]

        # Perform a count aggregation of the tree model grouped by the
        # specified dimensions
        stats = tree_field.count(*groupby)

        # Apply it relative to the queryset
        stats = stats.apply(queryset)

        # Exclude null values. Depending on the downstream use of the data,
        # nulls may or may not be desirable.
        if not params['nulls']:
            q = Q()
            for field in groupby:
                q = q | Q(**{field: None})
            stats = stats.exclude(q)

        # Begin constructing the response
        resp = {
            'data': [],
            'outliers': [],
            'clustered': False,
            'size': 0,
        }

        # Evaluate list of points
        length = len(stats)

        # Nothing to do
        if not length:
            usage.log('dist',
                      instance=instance,
                      request=request,
                      data={
                          'size': 0,
                          'clustered': False,
                          'aware': params['aware'],
                      })
            return resp

        if length > MAXIMUM_OBSERVATIONS:
            data = {
                'message': 'Data too large',
            }
            return self.render(request,
                               data,
                               status=codes.unprocessable_entity)

        # Apply ordering. If any of the fields are enumerable, ordering should
        # be relative to those fields. For continuous data, the ordering is
        # relative to the count of each group
        if (any([d.enumerable for d in fields])
                and not params['sort'] == 'count'):
            stats = stats.order_by(*groupby)
        else:
            stats = stats.order_by('-count')

        clustered = False
        points = list(stats)
        outliers = []

        # For N-dimensional continuous data, check if clustering should occur
        # to down-sample the data.
        if all([d.simple_type == 'number' for d in fields]):
            # Extract observations for clustering
            obs = []
            for point in points:
                for i, dim in enumerate(point['values']):
                    if isinstance(dim, Decimal):
                        point['values'][i] = float(str(dim))
                obs.append(point['values'])

            # Perform k-means clustering. Determine centroids and calculate
            # the weighted count relative to the centroid and observations
            # within the kmeans module.
            if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
                clustered = True

                counts = [p['count'] for p in points]
                points, outliers = kmeans.weighted_counts(
                    obs, counts, params['n'])
            else:
                indexes = kmeans.find_outliers(obs, normalized=False)

                outliers = []
                for idx in indexes:
                    outliers.append(points[idx])
                    points[idx] = None
                points = [p for p in points if p is not None]

        usage.log('dist',
                  instance=instance,
                  request=request,
                  data={
                      'size': length,
                      'clustered': clustered,
                      'aware': params['aware'],
                  })

        return {
            'data': points,
            'clustered': clustered,
            'outliers': outliers,
            'size': length,
        }
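The view above only depends on the shape of what the kmeans helpers return: weighted_counts takes the observations, their counts, and the number of clusters and returns a (points, outliers) pair in which each point carries a 'count' key, while find_outliers returns indexes into the observations. A minimal stand-in consistent with those call sites (the bodies are placeholders for illustration, not the real module):

def weighted_counts(obs, counts, k):
    # Placeholder: the real module runs k-means over `obs` and weights
    # each centroid's count by the distances of its member observations.
    points = [{'values': list(c), 'count': 0} for c in obs[:k]]
    outliers = []
    return points, outliers


def find_outliers(obs, normalized=False):
    # Placeholder: the real module returns the indexes of observations
    # that fall outside the main clusters.
    return []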
Example #3
    def get(self, request, pk):
        instance = self.get_object(request, pk=pk)
        params = self.get_params(request)

        tree = trees[params.get('tree')]
        opts = tree.root_model._meta
        tree_field = DataField(
            app_name=opts.app_label, model_name=opts.module_name,
            field_name=opts.pk.name)

        # This will eventually make its way into the parametizer, but lists
        # are not supported.
        dimensions = request.GET.getlist('dimensions')

        if params['aware']:
            context = self.get_context(request)
        else:
            context = None

        QueryProcessor = pipeline.query_processors[params['processor']]
        processor = QueryProcessor(context=context, tree=tree)

        queryset = processor.get_queryset(request=request)

        # Explicit fields to group by; ignore ones that don't exist or that
        # the user does not have permission to view. The default is to group
        # by the reference field for distinct counts.
        if any(dimensions):
            fields = []
            groupby = []

            for pk in dimensions:
                f = self.get_object(request, pk=pk)
                if f:
                    fields.append(f)
                    groupby.append(tree.query_string_for_field(f.field,
                                                               model=f.model))
        else:
            fields = [instance]
            groupby = [tree.query_string_for_field(instance.field,
                                                   model=instance.model)]

        # Perform a count aggregation of the tree model grouped by the
        # specified dimensions
        stats = tree_field.count(*groupby)

        # Apply it relative to the queryset
        stats = stats.apply(queryset)

        # Exclude null values. Depending on the downstream use of the data,
        # nulls may or may not be desirable.
        if not params['nulls']:
            q = Q()
            for field in groupby:
                q = q | Q(**{field: None})
            stats = stats.exclude(q)

        # Begin constructing the response
        resp = {
            'data': [],
            'outliers': [],
            'clustered': False,
            'size': 0,
        }

        # Evaluate list of points
        length = len(stats)

        # Nothing to do
        if not length:
            usage.log('dist', instance=instance, request=request, data={
                'size': 0,
                'clustered': False,
                'aware': params['aware'],
            })
            return resp

        if length > MAXIMUM_OBSERVATIONS:
            data = {
                'message': 'Data too large',
            }
            return self.render(request, data,
                               status=codes.unprocessable_entity)

        # Apply ordering. If any of the fields are enumerable, ordering should
        # be relative to those fields. For continuous data, the ordering is
        # relative to the count of each group
        if (any([d.enumerable for d in fields]) and
                not params['sort'] == 'count'):
            stats = stats.order_by(*groupby)
        else:
            stats = stats.order_by('-count')

        clustered = False
        points = list(stats)
        outliers = []

        # For N-dimensional continuous data, check if clustering should occur
        # to down-sample the data.
        if all([d.simple_type == 'number' for d in fields]):
            # Extract observations for clustering
            obs = []
            for point in points:
                for i, dim in enumerate(point['values']):
                    if isinstance(dim, Decimal):
                        point['values'][i] = float(str(dim))
                obs.append(point['values'])

            # Perform k-means clustering. Determine centroids and calculate
            # the weighted count relative to the centroid and observations
            # within the kmeans module.
            if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
                clustered = True

                counts = [p['count'] for p in points]
                points, outliers = kmeans.weighted_counts(
                    obs, counts, params['n'])
            else:
                indexes = kmeans.find_outliers(obs, normalized=False)

                outliers = []
                for idx in indexes:
                    outliers.append(points[idx])
                    points[idx] = None
                points = [p for p in points if p is not None]

        usage.log('dist', instance=instance, request=request, data={
            'size': length,
            'clustered': clustered,
            'aware': params['aware'],
        })

        return {
            'data': points,
            'clustered': clustered,
            'outliers': outliers,
            'size': length,
        }
Example #4
    def get(self, request, pk):
        instance = self.get_object(request, pk=pk)
        params = self.get_params(request)

        tree = trees[params.get('tree')]
        opts = tree.root_model._meta

        tree_field = DataField(pk='{0}:{1}'.format(params.get('tree'), pk),
                               app_name=opts.app_label,
                               model_name=opts.module_name,
                               field_name=opts.pk.name)

        # This will eventually make its way into the parametizer, but lists
        # are not supported.
        dimensions = request.GET.getlist('dimensions')

        if params['aware']:
            context = self.get_context(request)
        else:
            context = None

        QueryProcessor = pipeline.query_processors[params['processor']]
        processor = QueryProcessor(context=context, tree=tree)
        queryset = processor.get_queryset(request=request)

        # Explicit fields to group by; ignore ones that don't exist or that
        # the user does not have permission to view. The default is to group
        # by the reference field for distinct counts.
        if any(dimensions):
            fields = []
            groupby = []

            for pk in dimensions:
                f = self.get_object(request, pk=pk)

                if f:
                    fields.append(f)
                    groupby.append(
                        tree.query_string_for_field(f.field, model=f.model))
        else:
            fields = [instance]
            groupby = [
                tree.query_string_for_field(instance.field,
                                            model=instance.model)
            ]

        # Exclude null values. Depending on the downstream use of the data,
        # nulls may or may not be desirable.
        if not params['nulls']:
            q = Q()

            for field in groupby:
                q = q & Q(**{'{0}__isnull'.format(field): False})

            queryset = queryset.filter(q)

        queryset = queryset.values(*groupby)

        # Begin constructing the response
        resp = {
            'data': [],
            'outliers': [],
            'clustered': False,
            'size': 0,
        }

        queryset = queryset.annotate(count=Count(tree_field.field.name))\
            .values_list('count', *groupby)

        # Evaluate list of points
        length = len(queryset)

        # Nothing to do
        if not length:
            usage.log('dims',
                      instance=instance,
                      request=request,
                      data={
                          'size': 0,
                          'clustered': False,
                          'aware': params['aware'],
                      })

            return resp

        if length > MAXIMUM_OBSERVATIONS:
            data = {
                'message': 'Data too large',
            }

            return self.render(request,
                               data,
                               status=codes.unprocessable_entity)

        # Apply ordering. If any of the fields are enumerable, ordering should
        # be relative to those fields. For continuous data, the ordering is
        # relative to the count of each group
        if (any([d.enumerable for d in fields])
                and not params['sort'] == 'count'):
            queryset = queryset.order_by(*groupby)
        else:
            queryset = queryset.order_by('-count')

        clustered = False

        # Use a list for the values so Decimal dimensions can be converted
        # in place below (values_list yields tuples).
        points = [{
            'count': point[0],
            'values': list(point[1:]),
        } for point in list(queryset)]

        outliers = []

        # For N-dimensional continuous data, check if clustering should occur
        # to down-sample the data.
        if all([d.simple_type == 'number' for d in fields]):
            # Extract observations for clustering.
            obs = []

            null_points = []
            numeric_points = []

            for point in points:
                # We need to handle points that have null dimensions
                # differently than those that are all numeric as the kmeans
                # module currently cannot handle mixed type dimensions so we
                # only allow fully numeric points to be passed to the kmeans
                # module.
                if None in point['values']:
                    null_points.append(point)
                    continue

                for i, dim in enumerate(point['values']):
                    if isinstance(dim, Decimal):
                        point['values'][i] = float(str(dim))

                numeric_points.append(point)
                obs.append(point['values'])

            # Perform k-means clustering. Determine centroids and calculate
            # the weighted count relative to the centroid and observations
            # within the kmeans module.
            if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
                clustered = True

                counts = [p['count'] for p in numeric_points]
                points, outliers = kmeans.weighted_counts(
                    obs, counts, params['n'])
            else:
                indexes = kmeans.find_outliers(obs, normalized=False)

                outliers = []

                for idx in indexes:
                    outliers.append(numeric_points[idx])
                    numeric_points[idx] = None

                points = [p for p in numeric_points if p is not None]

            # Now that we have done the analysis using the purely numeric
            # points, we can add the mixed/null dimensionality points back in
            # to the list before returning results.
            points += null_points

        usage.log('dims',
                  instance=instance,
                  request=request,
                  data={
                      'size': length,
                      'clustered': clustered,
                      'aware': params['aware'],
                  })

        labeled_points = []
        value_labels = tree_field.value_labels(queryset=queryset)

        for point in points:
            labeled_points.append({
                'count': point['count'],
                'values': [{
                    'label': value_labels.get(value, smart_unicode(value)),
                    'value': value,
                } for value in point['values']]
            })

        return {
            'data': labeled_points,
            'clustered': clustered,
            'outliers': outliers,
            'size': length,
        }
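Two different null-handling strategies appear across these examples: the earlier view excludes rows matching an OR of field=None lookups, while this one filters on an AND of field__isnull=False lookups. Both end up dropping rows in which any grouped field is null. A small standalone sketch of the two patterns with made-up field names:

from django.db.models import Q

groupby = ['age', 'weight']  # hypothetical grouping fields

# Exclude style: OR together "field is null" and exclude the union.
exclude_q = Q()
for field in groupby:
    exclude_q = exclude_q | Q(**{field: None})
# queryset.exclude(exclude_q)

# Filter style: AND together "field is not null" and keep the intersection.
filter_q = Q()
for field in groupby:
    filter_q = filter_q & Q(**{'{0}__isnull'.format(field): False})
# queryset.filter(filter_q)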
Example #5
    def get(self, request, pk):
        instance = self.get_object(request, pk=pk)
        params = self.get_params(request)

        tree = trees[params.get('tree')]
        opts = tree.root_model._meta

        tree_field = DataField(pk='{0}:{1}'.format(params.get('tree'), pk),
                               app_name=opts.app_label,
                               model_name=opts.module_name,
                               field_name=opts.pk.name)

        # This will eventually make its way into the parametizer, but lists
        # are not supported.
        dimensions = request.GET.getlist('dimensions')

        if params['aware']:
            context = self.get_context(request)
        else:
            context = None

        QueryProcessor = pipeline.query_processors[params['processor']]
        processor = QueryProcessor(context=context, tree=tree)
        queryset = processor.get_queryset(request=request)

        # Explicit fields to group by; ignore ones that don't exist or that
        # the user does not have permission to view. The default is to group
        # by the reference field for distinct counts.
        if any(dimensions):
            fields = []
            groupby = []

            for pk in dimensions:
                f = self.get_object(request, pk=pk)

                if f:
                    fields.append(f)
                    groupby.append(tree.query_string_for_field(f.field,
                                                               model=f.model))
        else:
            fields = [instance]
            groupby = [tree.query_string_for_field(instance.field,
                                                   model=instance.model)]

        # Exclude null values. Depending on the downstream use of the data,
        # nulls may or may not be desirable.
        if not params['nulls']:
            q = Q()

            for field in groupby:
                q = q & Q(**{'{0}__isnull'.format(field): False})

            queryset = queryset.filter(q)

        queryset = queryset.values(*groupby)

        # Begin constructing the response
        resp = {
            'data': [],
            'outliers': [],
            'clustered': False,
            'size': 0,
        }

        queryset = queryset.annotate(count=Count(tree_field.field.name))\
            .values_list('count', *groupby)

        # Evaluate list of points
        length = len(queryset)

        # Nothing to do
        if not length:
            usage.log('dims', instance=instance, request=request, data={
                'size': 0,
                'clustered': False,
                'aware': params['aware'],
            })

            return resp

        if length > MAXIMUM_OBSERVATIONS:
            data = {
                'message': 'Data too large',
            }

            return self.render(request, data,
                               status=codes.unprocessable_entity)

        # Apply ordering. If any of the fields are enumerable, ordering should
        # be relative to those fields. For continuous data, the ordering is
        # relative to the count of each group
        if (any([d.enumerable for d in fields]) and
                not params['sort'] == 'count'):
            queryset = queryset.order_by(*groupby)
        else:
            queryset = queryset.order_by('-count')

        clustered = False

        # Use a list for the values so Decimal dimensions can be converted
        # in place below (values_list yields tuples).
        points = [{
            'count': point[0],
            'values': list(point[1:]),
        } for point in list(queryset)]

        outliers = []

        # For N-dimensional continuous data, check if clustering should occur
        # to down-sample the data.
        if all([d.simple_type == 'number' for d in fields]):
            # Extract observations for clustering.
            obs = []

            null_points = []
            numeric_points = []

            for point in points:
                # We need to handle points that have null dimensions
                # differently than those that are all numeric as the kmeans
                # module currently cannot handle mixed type dimensions so we
                # only allow fully numeric points to be passed to the kmeans
                # module.
                if None in point['values']:
                    null_points.append(point)
                    continue

                for i, dim in enumerate(point['values']):
                    if isinstance(dim, Decimal):
                        point['values'][i] = float(str(dim))

                numeric_points.append(point)
                obs.append(point['values'])

            # Perform k-means clustering. Determine centroids and calculate
            # the weighted count relative to the centroid and observations
            # within the kmeans module.
            if params['cluster'] and length >= MINIMUM_OBSERVATIONS:
                clustered = True

                counts = [p['count'] for p in numeric_points]
                points, outliers = kmeans.weighted_counts(
                    obs, counts, params['n'])
            else:
                indexes = kmeans.find_outliers(obs, normalized=False)

                outliers = []

                for idx in indexes:
                    outliers.append(numeric_points[idx])
                    numeric_points[idx] = None

                points = [p for p in numeric_points if p is not None]

            # Now that we have done the analysis using the purely numeric
            # points, we can add the mixed/null dimensionality points back in
            # to the list before returning results.
            points += null_points

        usage.log('dims', instance=instance, request=request, data={
            'size': length,
            'clustered': clustered,
            'aware': params['aware'],
        })

        labeled_points = []
        value_labels = tree_field.value_labels(queryset=queryset)

        for point in points:
            labeled_points.append({
                'count': point['count'],
                'values': [{
                    'label': value_labels.get(value, smart_unicode(value)),
                    'value': value
                } for value in point['values']]
            })

        return {
            'data': labeled_points,
            'clustered': clustered,
            'outliers': outliers,
            'size': length,
        }
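For reference, each labeled point these views return pairs a raw value with a display label, falling back to the string form of the value when no label exists for it. A hand-built illustration of the final response shape with made-up values (str stands in for smart_unicode here):

value_labels = {1: 'One', 2: 'Two'}  # hypothetical label mapping

point = {'count': 42, 'values': (1, 3)}

labeled = {
    'count': point['count'],
    'values': [{
        'label': value_labels.get(value, str(value)),
        'value': value,
    } for value in point['values']],
}

response = {
    'data': [labeled],
    'clustered': False,
    'outliers': [],
    'size': 1,
}

# labeled['values'] is:
#   [{'label': 'One', 'value': 1}, {'label': '3', 'value': 3}]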