Example #1
0
    def build_dimension_filter(dimensions):
        '''Build a druid filter matching any of the given dimension dicts.

        Each dict's key/value pairs are ANDed together; the resulting
        per-dict filters are ORed. An empty/falsy input yields EmptyFilter.
        '''
        combined = EmptyFilter()
        if not dimensions:
            return combined

        for dimension_values in dimensions:
            conjunction = EmptyFilter()
            for dim_name, dim_value in dimension_values.items():
                conjunction &= Filter(dimension=dim_name, value=dim_value)
            combined |= conjunction
        return combined
Example #2
0
    def __init__(self,
                 dimension,
                 name,
                 count_filter=None,
                 exclude_missing=True):
        '''Set up an exact unique-count aggregation over `dimension`.

        count_filter defaults to EmptyFilter(); when exclude_missing is
        True, rows with an empty dimension value are filtered out of the
        count.
        '''
        self.dimension = dimension
        self.name = name
        self.exclude_missing = exclude_missing
        self.count_filter = count_filter if count_filter else EmptyFilter()

        # Optionally drop rows whose dimension value is the empty string so
        # missing values are not counted as a distinct value.
        if exclude_missing:
            self.count_filter &= ~Filter(dimension=self.dimension, value='')

        self.calculation = _HelperCalculation(self.name, self.count_filter)
        super(ExactUniqueCountAggregation, self).__init__()
Example #3
0
    def __init__(self, query_client, geo_field_ordering):
        '''Initialize all request/query state and the empty result skeleton.'''
        # Geo and field selections parsed from the request.
        self.geo_field = None
        self.denom = None
        self.location_filters = []
        self.non_hierarchical_filter = EmptyFilter()
        self.data_fields = set()
        self.calculated_fields = {}
        self.ordered_fields = []
        self.latitude_field = None
        self.longitude_field = None

        # Time window and bucketing.
        self.start_date = None
        self.end_date = None
        self.selected_granularities = DEFAULT_GRANULARITIES
        self.time_bucket = None

        # Raw request state.
        self.request_data = None
        self.request_is_demo = False
        self.use_randomized_data = False

        # Druid query construction state.
        self.calculation = None
        self.druid_slice_dimensions = []
        self.druid_geo_dimensions = []
        self.batches = None
        self.response = None
        self.query_client = query_client
        self.all_geo_dimensions = set(geo_field_ordering)

        # Initialize basic structure of result: aggregate stats keyed by
        # statistic name, plus a per-geo section filled in later.
        overall_stats = (
            'totals',
            'median',
            'first_quartile',
            'third_quartile',
            'mean',
            'std',
            'variance',
            'min',
            'max',
            'num_nonzero',
        )
        self.results = {
            'overall': {stat: defaultdict(int) for stat in overall_stats},
            'byGeo': {},
        }
Example #4
0
def _construct_authorization_filter(user_identity):
    '''Build the druid filter representing the user's query permissions.

    Each query need the user holds is converted to a druid filter; the
    non-empty ones are ORed together. A user with no effective query
    policies gets a filter that excludes every authorizable dimension
    (forcing them to the empty-string value, i.e. unqueryable).
    '''
    output = EmptyFilter()
    added_any = False

    for need in enumerate_query_needs(user_identity):
        need_filter = _construct_druid_filter_from_query_need(need)
        if isinstance(need_filter, EmptyFilter):
            continue
        output |= need_filter
        added_any = True

    if not added_any:
        # No query policies at all: lock down every dimension for which
        # authorization is enabled.
        for dimension in current_app.zen_config.filters.AUTHORIZABLE_DIMENSIONS:
            output &= Dimension(dimension) == ''

    return output
Example #5
0
def _construct_druid_filter_from_query_need(query_need):
    '''Build the druid filter for a single query need.

    ANDs together one clause per dimension filter on the need (an IN filter
    for include_values, a negated IN filter for exclude_values, nothing for
    unrestricted all-values access). Any authorizable dimension the need
    does not mention is locked to the empty string.
    '''
    result = EmptyFilter()
    seen_dimensions = set()

    # Convert each dimension filter on the need into a druid clause.
    for dimension_filter in query_need.dimension_filters:
        dimension = dimension_filter.dimension_name
        seen_dimensions.add(dimension)

        if dimension_filter.all_values and not dimension_filter.exclude_values:
            # Unrestricted access to this dimension: no clause needed.
            continue

        if dimension_filter.exclude_values:
            clause = ~Filter(
                type=IN_FILTER_SYMBOL,
                dimension=dimension,
                values=list(dimension_filter.exclude_values),
            )
        else:
            clause = Filter(
                type=IN_FILTER_SYMBOL,
                dimension=dimension,
                values=list(dimension_filter.include_values),
            )

        result &= clause

    # Authorizable dimensions without explicit permissions are filtered out
    # entirely.
    for dimension_name in current_app.zen_config.filters.AUTHORIZABLE_DIMENSIONS:
        if dimension_name not in seen_dimensions:
            result &= Dimension(dimension_name) == ''

    return result
Example #6
0
 def build_query_filter(self):
     '''AND the user-selected filter with every grouping dimension filter.'''
     if self.filter:
         combined = self.filter.to_druid()
     else:
         combined = EmptyFilter()
     for grouping in self.grouping_dimensions():
         combined &= grouping.to_druid_filter()
     return combined
Example #7
0
    def __init__(
        self,
        datasource,
        granularity,
        grouping_fields,
        intervals,
        calculation,
        dimension_filter=None,
        optimize=True,
        subtotal_dimensions=None,
        subtotal_result_label='TOTAL',
    ):
        '''Assemble a druid groupby query from a calculation and grouping spec.

        Args:
            datasource: Druid datasource name (a string; the special-case
                below checks for an 'et' prefix).
            granularity: Time granularity, forwarded to the base builder.
            grouping_fields: Dimensions to group by.
            intervals: Query time intervals, forwarded to the base builder.
            calculation: Source calculation whose aggregations and post
                aggregations are copied into this query.
            dimension_filter: Optional druid filter over dimensions; defaults
                to EmptyFilter().
            optimize: Stored on the instance as-is; presumably consumed by
                the query execution layer -- TODO confirm at call sites.
            subtotal_dimensions: If truthy, enables subtotal computation via
                SubtotalConfig.
            subtotal_result_label: Row label used for subtotal results.
        '''
        super(GroupByQueryBuilder, self).__init__(datasource, granularity, intervals)
        self.dimensions = grouping_fields
        self.subtotals = (
            SubtotalConfig(self.dimensions, subtotal_dimensions, subtotal_result_label)
            if subtotal_dimensions
            else None
        )

        # Build a copy of the input calculation with the fully built
        # aggregations and post aggregations.
        self.calculation = BaseCalculation()

        # Copy the calculations aggregations into the query. Call the
        # handlers of any aggregations that require information about
        # the current query to be built.
        self.query_modifier = None
        for key, aggregation in calculation.aggregations.items():
            # NOTE(stephen): Very special case where an aggregation can
            # modify the query before it is issued.
            if isinstance(aggregation, QueryModifyingAggregation):
                if not self.query_modifier:
                    self.query_modifier = aggregation
                else:
                    # If a query modifier has already been set, we should merge
                    # this query modifier into that one so that both are called.
                    self.query_modifier = self.query_modifier.merge_compatible_aggregation(
                        aggregation
                    )
                continue

            new_aggregation = aggregation
            # QueryDependentAggregations rely on certain query-time information
            # to be able to build their full filter and value sets. For example,
            # some aggregations should only be computed during the final time
            # interval of a query and not for the entire query duration.
            if isinstance(aggregation, QueryDependentAggregation):
                new_aggregation = aggregation.build_full_aggregation(
                    dimensions=self.dimensions,
                    granularity=self.granularity,
                    intervals=self.intervals,
                )

            self.calculation.add_aggregation(key, new_aggregation)

        self.calculation.add_post_aggregations(calculation.post_aggregations)

        # Build query filter from the selected data fields and dimensions.
        # Store dimension filters separate from aggregation filters so that
        # QueryModifyingAggregation can easily distinguish the filter types.
        # NOTE(stephen): Doing this *before* count fields are added so that
        # we don't duplicate the aggregation filters. Duplicating the filters,
        # while seemingly not a big deal, caused certain simple queries to take
        # 8x longer to run.
        self.aggregation_filter = build_query_filter_from_aggregations(
            self.calculation.aggregations
        )
        self.dimension_filter = dimension_filter or EmptyFilter()

        # To workaround druid's default value of 0 for filtered aggregations,
        # we track the count of all fields that should have a null check
        # applied. If those fields have a count == 0, then in the parse step
        # after the query is run, their value will be replaced with None.
        strict_null_fields = calculation.strict_null_fields
        self.calculation.set_strict_null_fields(strict_null_fields)
        self.calculation.add_count_for_fields(strict_null_fields)

        # Store the aggregations/post aggregations at the top level of the query
        # dict since pydruid needs them in a specific place.
        # NOTE(stephen): This is kinda weird since we can become out of sync
        # with the calculation.
        self.aggregations = self.calculation.aggregations
        self.post_aggregations = self.calculation.post_aggregations

        # Combine the aggregation filters and the dimension filters in to the
        # full query filter to use.
        self.query_filter = self.aggregation_filter & self.dimension_filter

        # Remove RegionName = 'Nation' from national level query in the ET database.
        # When nation is selected and no dimension filters are set.
        # TODO(attila): We shouldn't have a region named 'Nation' in the first place ... ?
        # The national value could be computed as a post aggregation or in a dataframe.
        if (
            not self.dimensions
            and isinstance(self.dimension_filter, EmptyFilter)
            and datasource.startswith('et')
        ):
            self.query_filter &= Dimension('RegionName') != 'Nation'

        # HACK(stephen): There appears to be a bug in how Druid produces
        # subtotals. Events produced by the first GroupBy pass inside Druid
        # are *reevaluated* against the original query filter. If the events
        # do not pass the original filter (and most of the time they do not for
        # us because we use filtered aggregations), then the event is *dropped*
        # from the final result. This happens even if the subtotals being
        # computed match the input dimensions exactly. To overcome this, we add
        # an extra filter that will only be valid on the computed events and
        # won't include any extra rows in the intermediate result (inside
        # Druid). This provides a filter that all events will pass while
        # subtotals are computed and will also ensure the non-subtotal events
        # accurate.
        # NOTE(stephen): This is fixed (Druid issue #7820) and can be removed
        # when the release containing the fix is live.
        if self.subtotals:
            # Use the first aggregation as the dimension to filter on.
            extra_filter = BoundFilter(list(self.aggregations.keys())[0], 0, None)
            self.query_filter |= extra_filter
        self.optimize = optimize