Exemple #1
0
    def __init__(self, name, count_filter=None):
        """Register the aggregations used to produce a unique count.

        Args:
            name: Key under which the outer groupby aggregation is stored in
                `self.outer_aggregations`.
            count_filter: Optional filter limiting which rows are counted. A
                falsy value or an `EmptyFilter` means every row is counted.
        """
        super(_HelperCalculation, self).__init__()
        self.outer_aggregations = {}

        has_filter = bool(count_filter) and not isinstance(
            count_filter, EmptyFilter)
        if not has_filter:
            # Counting *all* of the unique values only needs a simple "count"
            # on the outer groupby.
            self.outer_aggregations[name] = count('count')
            return

        # Counting only the unique values that meet a filter takes more work.
        # Conceptually each inner groupby row should contribute a 1 when it
        # matches, and the outer groupby sums that column. Druid does not
        # provide an aggregator that returns a constant, so a post aggregator
        # on the inner groupby converts the value into that constant.

        # Inner aggregation: a filtered row count, chosen because it is
        # guaranteed to not be 0 for rows that match the filter.
        filtered_count_key = '%s%s_agg' % (name, self.SUFFIX)
        self.add_aggregation(
            filtered_count_key,
            filtered_aggregator(filter=count_filter, agg=count('count')))

        # Inner post aggregation: dividing the count by itself makes the
        # inner groupby return a 1 or 0 for this row.
        constant_key = '%s%s_post_agg' % (name, self.SUFFIX)
        self.add_post_aggregation_from_formula(
            constant_key,
            '%s / %s' % (filtered_count_key, filtered_count_key))

        # Outer aggregation: summing the constant column yields the exact
        # unique count for the filtered set.
        self.outer_aggregations[name] = longsum(constant_key)
    def _add_filter_to_aggregation(self, aggregation, query_filter):
        if not query_filter:
            return aggregation

        # Time interval aggregations are special and store an internal filter.
        if isinstance(aggregation, TimeIntervalAggregation):
            aggregation.initial_filter &= query_filter
            return aggregation

        # Construct a new filtered aggregation with our extra filter added.
        new_aggregation = aggregation

        # HACK(stephen): Copy the new query_filter each time since pydruid is a
        # garbage library. When filters are built by the `filtered_aggregator`
        # below, the *original* filter is mutated. This will cause weird errors
        # when the same filter is used across multiple filtered aggregators (
        # which is what we are doing here).
        new_filter = deepcopy(query_filter)
        current_agg_filter = build_filter_from_aggregation(aggregation)

        # This aggregation is a filtered aggregation already. Merge in its
        # filter.
        if current_agg_filter:
            new_filter &= current_agg_filter
            new_aggregation = aggregation['aggregator']
        return filtered_aggregator(filter=new_filter, agg=new_aggregation)
 def __init__(self, result_id, agg_type, query_filter=None):
     """Build a single aggregation over the "__time" column.

     Args:
         result_id: Key the aggregation is registered under.
         agg_type: Callable that receives the column name '__time' and
             returns the aggregation to use.
         query_filter: Optional filter; when truthy the aggregation is
             wrapped in a filtered aggregator.
     """
     time_agg = agg_type('__time')
     result_agg = (
         filtered_aggregator(filter=query_filter, agg=time_agg)
         if query_filter else time_agg
     )
     super(TimeCalculation, self).__init__({result_id: result_agg})
    def __init__(self, dimension, field):
        """Set up an average as (sum of field) / (event count of field).

        Relies on the parent initializer having populated
        `self.dimension_filter` and `self.sum_key`, which are read below.

        Args:
            dimension: Passed through to the parent initializer.
            field: Field being averaged; also the key of the resulting post
                aggregation.
        """
        super(AverageCalculation, self).__init__(dimension, field, self.SUFFIX)

        # Count of events for this field, limited by the same dimension
        # filter that the parent calculation stored.
        event_count_key = '%s_event_count%s' % (field, self.SUFFIX)
        self.add_aggregation(
            event_count_key,
            filtered_aggregator(filter=self.dimension_filter,
                                agg=longsum('count')))

        # The average itself is computed during post aggregation.
        self.add_post_aggregation_from_formula(
            field, '%s / %s' % (self.sum_key, event_count_key))
    def build_sequence_aggregation(self, agg_type):
        """Build the aggregation for `self.sequence_field`.

        Args:
            agg_type: Callable that receives a column name and returns an
                aggregation.

        Returns:
            `agg_type('__time')` when the sequence field is "__time";
            otherwise `agg_type('sum')` wrapped in a filtered aggregator that
            matches `self.dimension` against the sequence field.
        """
        if self.sequence_field != '__time':
            field_filter = Filter(dimension=self.dimension,
                                  value=self.sequence_field)
            return filtered_aggregator(filter=field_filter,
                                       agg=agg_type('sum'))

        # Special case: the druid builtin type "__time" should not be used as
        # part of a filtered aggregation, so it is aggregated directly.
        return agg_type('__time')
    def __init__(self, dimension, field, aggregation_suffix=''):
        """Set up a filtered double sum for one field.

        Args:
            dimension: Dimension the filter is built on.
            field: Dimension value to match; also the base of the aggregation
                key.
            aggregation_suffix: Text appended to `field` to form
                `self.sum_key`.
        """
        self.sum_key = '%s%s' % (field, aggregation_suffix)
        self.dimension = dimension
        self.dimension_filter = Filter(dimension=dimension, value=field)

        # Sum the 'sum' column for rows that pass the dimension filter.
        field_sum = filtered_aggregator(filter=self.dimension_filter,
                                        agg=doublesum('sum'))
        super(SumCalculation, self).__init__(
            aggregations={self.sum_key: field_sum})
    def __init__(self, dimension, field, weight_field):
        """Set up a weighted average as (sum of field) / (sum of weight).

        Relies on the parent initializer having populated `self.sum_key`,
        which is read when the formula is built.

        Args:
            dimension: Dimension used for both the field and weight filters.
            field: Field being averaged; also the key of the resulting post
                aggregation.
            weight_field: Dimension value whose summed 'sum' column is the
                divisor.
        """
        super(WeightedAverageCalculation,
              self).__init__(dimension, field, self.SUFFIX)

        self.weight_key = '%s%s' % (weight_field, self.SUFFIX)
        self.weight_filter = Filter(dimension=dimension, value=weight_field)

        # Register the filtered sum of the weight field next to the field sum
        # created by the parent initializer.
        weight_sum = filtered_aggregator(filter=self.weight_filter,
                                         agg=doublesum('sum'))
        self.add_aggregations({self.weight_key: weight_sum})

        # The division happens during post aggregation.
        self.add_post_aggregation_from_formula(
            field, '%s / %s' % (self.sum_key, self.weight_key))
Exemple #8
0
    def __init__(
        self,
        name,
        theta_sketch_field,
        size=16384,
        count_filter=None,
        is_input_theta_sketch=True,
    ):
        """Set up a theta sketch aggregation, optionally filtered.

        Args:
            name: Key the aggregation is registered under.
            theta_sketch_field: First argument passed to `thetasketch`.
            size: Third argument passed to `thetasketch`.
            count_filter: Optional filter; when truthy the sketch aggregation
                is wrapped in a filtered aggregator.
            is_input_theta_sketch: Second argument passed to `thetasketch`.
        """
        sketch_agg = thetasketch(theta_sketch_field, is_input_theta_sketch,
                                 size)
        result_agg = (
            filtered_aggregator(filter=count_filter, agg=sketch_agg)
            if count_filter else sketch_agg
        )

        super(ThetaSketchUniqueCountCalculation,
              self).__init__(aggregations={name: result_agg})
    def build_full_aggregation(self, dimensions, granularity, intervals):
        """Return the base aggregation restricted to the requested intervals.

        Args:
            dimensions: Not used by this method.
            granularity: Forwarded to the filter creator.
            intervals: Forwarded to the filter creator.

        Returns:
            `self.base_aggregation` unchanged when no interval filter is
            produced; otherwise a filtered aggregator using the interval
            filter (ANDed with `self.initial_filter` when that is set).
        """
        combined_filter = self._filter_creator.get_interval_filter(
            granularity, intervals
        )

        # An interval filter might not be built for every query type. Without
        # one there is nothing to add to the original aggregation.
        if not combined_filter:
            return self.base_aggregation

        if not self.initial_filter:
            return filtered_aggregator(filter=combined_filter,
                                       agg=self.base_aggregation)

        # The base aggregation is already a filtered aggregation. AND its
        # filter with the interval filter and unwrap the inner aggregator so
        # that only the merged filter is attached.
        combined_filter &= self.initial_filter
        return filtered_aggregator(
            filter=combined_filter,
            agg=self.base_aggregation['aggregator'])
Exemple #10
0
    def __init__(self, dimension, field):
        """Set up the JS aggregation and post aggregation for a last value.

        Args:
            dimension: Dimension the row filter is built on.
            field: Dimension value to match; also the key of the resulting
                post aggregation.
        """
        # The order of these fields is very important. The JS aggregator can
        # only receive function arguments in the order defined.
        # TODO(stephen): Add a way to validate and enforce rules that the
        # js aggregator requires
        last_value_agg = filtered_aggregator(
            agg=LAST_VALUE_FORMULA.build_aggregator(['__time', 'sum']),
            filter=Filter(dimension=dimension, value=field),
        )

        # The filtered JS aggregation is stored under a suffixed key, and the
        # post aggregation reads it back and is stored under the field name.
        intermediate_key = '%s%s' % (field, self.SUFFIX)
        final_value = LAST_VALUE_FORMULA.build_post_aggregator(
            name=field, fields=[intermediate_key])

        super(LastValueCalculation, self).__init__(
            {intermediate_key: last_value_agg},
            {field: final_value})
Exemple #11
0
    def add_count_for_field(self, field):
        """Add an aggregation counting the rows that feed `field`.

        The count is a `longsum` over the 'count' column. When a filter can
        be derived from the aggregation(s) that produce `field`, the count is
        limited by that filter. The new aggregation is stored under
        `self.count_field_name(field)`.

        Args:
            field: Key of an existing aggregation or post aggregation.

        Raises:
            AssertionError: If `field` is neither an aggregation nor a post
                aggregation of this calculation.
        """
        # Raised explicitly rather than with an `assert` statement so the
        # validation still runs when Python is started with -O. The exception
        # type and message match what the assert produced.
        if (field not in self.aggregations
                and field not in self.post_aggregations):
            raise AssertionError(
                'Cannot add count for field that does not exist: %s' % field)

        if field in self.aggregations:
            agg_filter = build_filter_from_aggregation(
                self.aggregations[field])
        else:
            # Collect the aggregations that produce the post-aggregations value.
            aggregations = extract_aggregations_for_post_aggregation(
                field, self.aggregations, self.post_aggregations)
            agg_filter = build_query_filter_from_aggregations(aggregations)

        # Count the number of rows that stream through the aggregations computed
        # for this field.
        count_agg = longsum('count')

        # If an aggregation filter exists, use it to limit the count.
        if agg_filter is not None:
            count_agg = filtered_aggregator(filter=agg_filter, agg=count_agg)

        key = self.count_field_name(field)
        self.add_aggregation(key, count_agg)
Exemple #12
0
 def sum_calculation(cls, dimension, field, interval_creator):
     """Create an instance whose base aggregation is a filtered double sum.

     Args:
         dimension: Dimension the row filter is built on.
         field: Dimension value to match; also passed to `cls` as its first
             argument.
         interval_creator: Passed to `cls` as its third argument.

     Returns:
         The result of `cls(field, <filtered sum aggregation>,
         interval_creator)`.
     """
     field_sum = filtered_aggregator(
         filter=Filter(dimension=dimension, value=field),
         agg=doublesum('sum'),
     )
     return cls(field, field_sum, interval_creator)