def __init__(self, name, count_filter=None):
    """Set up the inner and outer aggregations for a unique-count calculation.

    Args:
        name: Key under which the outer count aggregation is stored.
        count_filter: Optional filter restricting which unique values count.
    """
    super(_HelperCalculation, self).__init__()
    self.outer_aggregations = {}

    # When every unique value should be counted, a plain "count" on the
    # outer groupby is all that is needed.
    if not count_filter or isinstance(count_filter, EmptyFilter):
        self.outer_aggregations[name] = count('count')
        return

    # Counting only the unique values that match a filter takes more work.
    # Conceptually we want to store a 1 for each matching row and sum that
    # column in the outer groupby. Druid has no constant-valued aggregator,
    # so a post aggregator on the inner groupby converts the value into a
    # constant 1 instead.

    # Pick an aggregation that is guaranteed to be nonzero for matching rows.
    matching_row_agg = filtered_aggregator(filter=count_filter, agg=count('count'))
    matching_row_key = '%s%s_agg' % (name, self.SUFFIX)
    self.add_aggregation(matching_row_key, matching_row_agg)

    # Dividing the value by itself in post aggregation collapses it to a
    # 1 (or 0) per row of the inner groupby.
    constant_one_formula = '%s / %s' % (matching_row_key, matching_row_key)
    constant_one_key = '%s%s_post_agg' % (name, self.SUFFIX)
    self.add_post_aggregation_from_formula(constant_one_key, constant_one_formula)

    # Summing the constant column in the outer groupby yields the exact
    # unique count for the filtered set.
    self.outer_aggregations[name] = longsum(constant_one_key)
def _add_filter_to_aggregation(self, aggregation, query_filter): if not query_filter: return aggregation # Time interval aggregations are special and store an internal filter. if isinstance(aggregation, TimeIntervalAggregation): aggregation.initial_filter &= query_filter return aggregation # Construct a new filtered aggregation with our extra filter added. new_aggregation = aggregation # HACK(stephen): Copy the new query_filter each time since pydruid is a # garbage library. When filters are built by the `filtered_aggregator` # below, the *original* filter is mutated. This will cause weird errors # when the same filter is used across multiple filtered aggregators ( # which is what we are doing here). new_filter = deepcopy(query_filter) current_agg_filter = build_filter_from_aggregation(aggregation) # This aggregation is a filtered aggregation already. Merge in its # filter. if current_agg_filter: new_filter &= current_agg_filter new_aggregation = aggregation['aggregator'] return filtered_aggregator(filter=new_filter, agg=new_aggregation)
def __init__(self, result_id, agg_type, query_filter=None):
    """Build a calculation over druid's builtin `__time` column.

    Args:
        result_id: Key under which the aggregation result is returned.
        agg_type: Aggregator factory applied to the `__time` column.
        query_filter: Optional filter restricting the aggregated rows.
    """
    aggregation = agg_type('__time')
    if query_filter:
        aggregation = filtered_aggregator(filter=query_filter, agg=aggregation)
    super(TimeCalculation, self).__init__({result_id: aggregation})
def __init__(self, dimension, field):
    """Compute the average of `field` as (filtered sum) / (filtered row count)."""
    super(AverageCalculation, self).__init__(dimension, field, self.SUFFIX)

    # Count the rows that contribute to this field so the post aggregation
    # can divide the sum by it.
    event_count_key = '%s_event_count%s' % (field, self.SUFFIX)
    event_count_agg = filtered_aggregator(
        filter=self.dimension_filter, agg=longsum('count'))
    self.add_aggregation(event_count_key, event_count_agg)

    self.add_post_aggregation_from_formula(
        field, '%s / %s' % (self.sum_key, event_count_key))
def build_sequence_aggregation(self, agg_type):
    """Build the aggregation for this calculation's sequence field.

    The druid builtin '__time' is special-cased: it must not be wrapped
    in a filtered aggregation.
    """
    if self.sequence_field == '__time':
        return agg_type('__time')
    sequence_filter = Filter(dimension=self.dimension, value=self.sequence_field)
    return filtered_aggregator(filter=sequence_filter, agg=agg_type('sum'))
def __init__(self, dimension, field, aggregation_suffix=''):
    """Sum the 'sum' column for rows where `dimension` equals `field`.

    Args:
        dimension: Dimension column to filter on.
        field: Dimension value selecting the rows to sum.
        aggregation_suffix: Optional suffix appended to the aggregation key,
            letting subclasses namespace their intermediate aggregations.
    """
    self.sum_key = '%s%s' % (field, aggregation_suffix)
    self.dimension = dimension
    self.dimension_filter = Filter(dimension=dimension, value=field)
    sum_aggregation = filtered_aggregator(
        filter=self.dimension_filter, agg=doublesum('sum'))
    super(SumCalculation, self).__init__(aggregations={self.sum_key: sum_aggregation})
def __init__(self, dimension, field, weight_field):
    """Compute a weighted average: (sum of field) / (sum of weight_field)."""
    super(WeightedAverageCalculation, self).__init__(dimension, field, self.SUFFIX)

    # Aggregate the weight column with its own dimension filter.
    self.weight_key = '%s%s' % (weight_field, self.SUFFIX)
    self.weight_filter = Filter(dimension=dimension, value=weight_field)
    self.add_aggregations({
        self.weight_key: filtered_aggregator(
            filter=self.weight_filter, agg=doublesum('sum')),
    })

    # Divide the field's sum by the weight's sum in post aggregation.
    self.add_post_aggregation_from_formula(
        field, '%s / %s' % (self.sum_key, self.weight_key))
def __init__(
    self,
    name,
    theta_sketch_field,
    size=16384,
    count_filter=None,
    is_input_theta_sketch=True,
):
    """Approximate unique count over `theta_sketch_field` via a theta sketch.

    Args:
        name: Key under which the sketch aggregation is stored.
        theta_sketch_field: Column the sketch is built over.
        size: Sketch size parameter controlling accuracy.
        count_filter: Optional filter restricting which rows are counted.
        is_input_theta_sketch: Whether the column already stores sketches.
    """
    sketch_agg = thetasketch(theta_sketch_field, is_input_theta_sketch, size)
    if count_filter:
        sketch_agg = filtered_aggregator(filter=count_filter, agg=sketch_agg)
    super(ThetaSketchUniqueCountCalculation, self).__init__(
        aggregations={name: sketch_agg})
def build_full_aggregation(self, dimensions, granularity, intervals):
    """Attach the query's interval filter to the base aggregation.

    Returns the base aggregation unchanged when no interval filter applies.
    """
    intervals_filter = self._filter_creator.get_interval_filter(
        granularity, intervals
    )
    # Not every query type produces an interval filter. Without one, the
    # original aggregation can be used directly.
    if not intervals_filter:
        return self.base_aggregation

    aggregation = self.base_aggregation
    if self.initial_filter:
        # The base aggregation is already filtered: AND its filter into the
        # intervals filter and unwrap the inner aggregator so only the
        # combined filter is attached.
        intervals_filter &= self.initial_filter
        aggregation = self.base_aggregation['aggregator']
    return filtered_aggregator(filter=intervals_filter, agg=aggregation)
def __init__(self, dimension, field):
    """Track the most recent value of `field` with a javascript aggregator."""
    # The order of these fields is very important: the JS aggregator can
    # only receive function arguments in the order defined.
    # TODO(stephen): Add a way to validate and enforce rules that the
    # js aggregator requires
    js_aggregator = LAST_VALUE_FORMULA.build_aggregator(['__time', 'sum'])
    field_filter = Filter(dimension=dimension, value=field)

    js_aggregation_key = '%s%s' % (field, self.SUFFIX)
    aggregations = {
        js_aggregation_key: filtered_aggregator(
            filter=field_filter, agg=js_aggregator),
    }
    post_aggregations = {
        field: LAST_VALUE_FORMULA.build_post_aggregator(
            name=field, fields=[js_aggregation_key]),
    }
    super(LastValueCalculation, self).__init__(aggregations, post_aggregations)
def add_count_for_field(self, field):
    """Add an aggregation counting the rows that feed `field`'s value.

    `field` must already exist as an aggregation or post aggregation.
    """
    assert field in self.aggregations or field in self.post_aggregations, (
        'Cannot add count for field that does not exist: %s' % field)

    if field in self.aggregations:
        agg_filter = build_filter_from_aggregation(self.aggregations[field])
    else:
        # The field is a post aggregation: gather the aggregations that
        # produce its value and merge their filters.
        source_aggregations = extract_aggregations_for_post_aggregation(
            field, self.aggregations, self.post_aggregations)
        agg_filter = build_query_filter_from_aggregations(source_aggregations)

    # Count the number of rows that stream through the aggregations computed
    # for this field, restricted by the aggregation filter when one exists.
    count_agg = longsum('count')
    if agg_filter is not None:
        count_agg = filtered_aggregator(filter=agg_filter, agg=count_agg)
    self.add_aggregation(self.count_field_name(field), count_agg)
def sum_calculation(cls, dimension, field, interval_creator):
    """Alternate constructor: a filtered doublesum over the 'sum' column."""
    base_aggregation = filtered_aggregator(
        filter=Filter(dimension=dimension, value=field),
        agg=doublesum('sum'),
    )
    return cls(field, base_aggregation, interval_creator)