def build_dimension_filter(dimensions):
    """Build a druid filter matching any of the given dimension value sets.

    Each entry of `dimensions` is a dict mapping dimension name to required
    value. Values within a single entry are ANDed together; the entries
    themselves are ORed, so a row matches if it satisfies any one entry.
    Returns an EmptyFilter when `dimensions` is falsy.
    """
    combined = EmptyFilter()
    if not dimensions:
        return combined

    for dimension_values in dimensions:
        # AND together every (dimension, value) requirement of this entry.
        clause = EmptyFilter()
        for dimension_name, required_value in dimension_values.items():
            clause &= Filter(dimension=dimension_name, value=required_value)
        combined |= clause
    return combined
def __init__(self, dimension, name, count_filter=None, exclude_missing=True):
    """Set up an exact unique-count aggregation over `dimension`.

    count_filter: optional base filter restricting which rows are counted;
        defaults to an EmptyFilter (count everything).
    exclude_missing: when True, rows whose dimension value is the empty
        string are excluded from the count.
    """
    self.dimension = dimension
    self.exclude_missing = exclude_missing
    self.name = name

    counting_filter = count_filter or EmptyFilter()
    if exclude_missing:
        # Prevent empty dimension values from being counted.
        counting_filter &= ~Filter(dimension=self.dimension, value='')
    self.count_filter = counting_filter

    self.calculation = _HelperCalculation(self.name, self.count_filter)
    super(ExactUniqueCountAggregation, self).__init__()
def __init__(self, query_client, geo_field_ordering):
    """Initialize query state and the empty result-aggregation structure.

    query_client: client used to issue the druid queries.
    geo_field_ordering: iterable of geo dimension names; stored as a set
        for membership checks.
    """
    # Query configuration, populated later from the request.
    self.geo_field = None
    self.denom = None
    self.location_filters = []
    self.non_hierarchical_filter = EmptyFilter()
    self.data_fields = set()
    self.calculated_fields = {}
    self.ordered_fields = []
    self.latitude_field = None
    self.longitude_field = None
    self.start_date = None
    self.end_date = None
    self.selected_granularities = DEFAULT_GRANULARITIES
    self.time_bucket = None
    self.request_data = None
    self.request_is_demo = False
    self.use_randomized_data = False
    self.calculation = None
    self.druid_slice_dimensions = []
    self.druid_geo_dimensions = []
    self.batches = None
    self.response = None
    self.query_client = query_client
    self.all_geo_dimensions = set(geo_field_ordering)

    # Initialize basic structure of result: one counter per summary
    # statistic, plus a per-geo breakdown filled in later.
    overall_stats = (
        'totals',
        'median',
        'first_quartile',
        'third_quartile',
        'mean',
        'std',
        'variance',
        'min',
        'max',
        'num_nonzero',
    )
    self.results = {
        # Aggregate stats.
        'overall': {stat: defaultdict(int) for stat in overall_stats},
        # Geo-level stats.
        'byGeo': {},
    }
def _construct_authorization_filter(user_identity):
    '''Converts the query permissions that the user holds into druid filters
    and returns the logical OR of the computed constituent druid filters.
    '''
    combined_filter = EmptyFilter()
    any_need_matched = False

    for query_need in enumerate_query_needs(user_identity):
        need_filter = _construct_druid_filter_from_query_need(query_need)
        if isinstance(need_filter, EmptyFilter):
            continue
        combined_filter |= need_filter
        any_need_matched = True

    if any_need_matched:
        return combined_filter

    # If the user has no query policies, ensure that any dimensions for which
    # authorization is enabled are unqueryable.
    for dimension_name in current_app.zen_config.filters.AUTHORIZABLE_DIMENSIONS:
        combined_filter &= Dimension(dimension_name) == ''
    return combined_filter
def _construct_druid_filter_from_query_need(query_need):
    '''Constructs a druid filter from an individual query need.
    '''
    output_filter = EmptyFilter()
    seen_dimensions = set()

    # Go through the individual dimension filters in the `QueryNeed` and
    # construct the appropriate filters.
    for need_filter in query_need.dimension_filters:
        name = need_filter.dimension_name
        seen_dimensions.add(name)

        exclude_values = need_filter.exclude_values
        if need_filter.all_values and not exclude_values:
            # Unrestricted access to this dimension: nothing to constrain.
            continue

        if exclude_values:
            clause = ~Filter(
                type=IN_FILTER_SYMBOL, dimension=name, values=list(exclude_values)
            )
        else:
            clause = Filter(
                type=IN_FILTER_SYMBOL,
                dimension=name,
                values=list(need_filter.include_values),
            )
        output_filter &= clause

    # If there are any authorizable dimensions for which permissions were not
    # explicitly defined, ensure that they are completely filtered out.
    for dimension_name in current_app.zen_config.filters.AUTHORIZABLE_DIMENSIONS:
        if dimension_name not in seen_dimensions:
            output_filter &= Dimension(dimension_name) == ''
    return output_filter
def build_query_filter(self):
    """Combine the user-selected filter with every grouping dimension's
    druid filter and return the result."""
    if self.filter:
        combined = self.filter.to_druid()
    else:
        combined = EmptyFilter()
    for grouping in self.grouping_dimensions():
        combined &= grouping.to_druid_filter()
    return combined
def __init__(
    self,
    datasource,
    granularity,
    grouping_fields,
    intervals,
    calculation,
    dimension_filter=None,
    optimize=True,
    subtotal_dimensions=None,
    subtotal_result_label='TOTAL',
):
    """Build a druid GroupBy query over `datasource` for the given intervals.

    grouping_fields: dimensions to group by.
    calculation: source of aggregations/post-aggregations; its aggregations
        are copied (and possibly rebuilt) into a fresh BaseCalculation.
    dimension_filter: optional filter over dimensions, kept separate from
        the aggregation-derived filter (see NOTE below).
    optimize: stored on the builder; presumably toggles pydruid query
        optimization downstream — TODO confirm against the executor.
    subtotal_dimensions / subtotal_result_label: when subtotal dimensions
        are given, a SubtotalConfig is built and a workaround filter is
        applied (see HACK below).
    """
    super(GroupByQueryBuilder, self).__init__(datasource, granularity, intervals)
    self.dimensions = grouping_fields
    self.subtotals = (
        SubtotalConfig(self.dimensions, subtotal_dimensions, subtotal_result_label)
        if subtotal_dimensions
        else None
    )
    # Build a copy of the input calculation with the fully built
    # aggregations and post aggregations.
    self.calculation = BaseCalculation()

    # Copy the calculations aggregations into the query. Call the
    # handlers of any aggregations that require information about
    # the current query to be built.
    self.query_modifier = None
    for key, aggregation in calculation.aggregations.items():
        # NOTE(stephen): Very special case where an aggregation can
        # modify the query before it is issued.
        if isinstance(aggregation, QueryModifyingAggregation):
            if not self.query_modifier:
                self.query_modifier = aggregation
            else:
                # If a query modifier has already been set, we should merge
                # this query modifier into that one so that both are called.
                self.query_modifier = self.query_modifier.merge_compatible_aggregation(
                    aggregation
                )
            continue

        new_aggregation = aggregation
        # QueryDependentAggregations rely on certain query-time information
        # to be able to build their full filter and value sets. For example,
        # some aggregations should only be computed during the final time
        # interval of a query and not for the entire query duration.
        if isinstance(aggregation, QueryDependentAggregation):
            new_aggregation = aggregation.build_full_aggregation(
                dimensions=self.dimensions,
                granularity=self.granularity,
                intervals=self.intervals,
            )

        self.calculation.add_aggregation(key, new_aggregation)

    self.calculation.add_post_aggregations(calculation.post_aggregations)

    # Build query filter from the selected data fields and dimensions.
    # Store dimension filters separate from aggregation filters so that
    # QueryModifyingAggregation can easily distinguish the filter types.
    # NOTE(stephen): Doing this *before* count fields are added so that
    # we don't duplicate the aggregation filters. Duplicating the filters,
    # while seemingly not a big deal, caused certain simple queries to take
    # 8x longer to run.
    self.aggregation_filter = build_query_filter_from_aggregations(
        self.calculation.aggregations
    )
    self.dimension_filter = dimension_filter or EmptyFilter()

    # To workaround druid's default value of 0 for filtered aggregations,
    # we track the count of all fields that should have a null check
    # applied. If those fields have a count == 0, then in the parse step
    # after the query is run, their value will be replaced with None.
    strict_null_fields = calculation.strict_null_fields
    self.calculation.set_strict_null_fields(strict_null_fields)
    self.calculation.add_count_for_fields(strict_null_fields)

    # Store the aggregations/post aggregations at the top level of the query
    # dict since pydruid needs them in a specific place.
    # NOTE(stephen): This is kinda weird since we can become out of sync
    # with the calculation.
    self.aggregations = self.calculation.aggregations
    self.post_aggregations = self.calculation.post_aggregations

    # Combine the aggregation filters and the dimension filters in to the
    # full query filter to use.
    self.query_filter = self.aggregation_filter & self.dimension_filter

    # Remove RegionName = 'Nation' from national level query in the ET database.
    # When nation is selected and no dimension filters are set.
    # TODO(attila): We shouldn't have a region named 'Nation' in the first place ... ?
    # The national value could be computed as a post aggregation or in a dataframe.
    if (
        not self.dimensions
        and isinstance(self.dimension_filter, EmptyFilter)
        and datasource.startswith('et')
    ):
        self.query_filter &= Dimension('RegionName') != 'Nation'

    # HACK(stephen): There appears to be a bug in how Druid produces
    # subtotals. Events produced by the first GroupBy pass inside Druid
    # are *reevaluated* against the original query filter. If the events
    # do not pass the original filter (and most of the time they do not for
    # us because we use filtered aggregations), then the event is *dropped*
    # from the final result. This happens even if the subtotals being
    # computed match the input dimensions exactly. To overcome this, we add
    # an extra filter that will only be valid on the computed events and
    # won't include any extra rows in the intermediate result (inside
    # Druid). This provides a filter that all events will pass while
    # subtotals are computed and will also ensure the non-subtotal events
    # accurate.
    # NOTE(stephen): This is fixed (Druid issue #7820) and can be removed
    # when the release containing the fix is live.
    if self.subtotals:
        # Use the first aggregation as the dimension to filter on.
        extra_filter = BoundFilter(list(self.aggregations.keys())[0], 0, None)
        self.query_filter |= extra_filter

    self.optimize = optimize