Example #1
    def load_dimensions_from_druid(self):
        base_query = GroupByQueryBuilder(
            datasource=self.datasource.name,
            granularity='all',
            grouping_fields=[],
            intervals=INTERVAL,
            calculation=COUNT_CALCULATION,
        )
        for ordered_dimensions in list(self.filter_dimensions.values()):
            # Special case: meta-dimensions (like Nation) are prefixed with '_'
            # and are handled elsewhere - don't query them in druid.
            queryable_dimensions = [
                d for d in ordered_dimensions if d[0] != '_'
            ]
            for dimension in queryable_dimensions:
                dimensions = self.dimension_slices.get(dimension, [dimension])
                base_query.dimensions = dimensions
                base_query.query_filter = Dimension(dimension) != None

                LOG.info('Querying distinct %s from Druid...', dimensions)
                query_result = self.query_client.run_query(base_query)

                output_rows = []
                for row in query_result.result:
                    event = row['event']
                    output_row = dict(event)
                    del output_row[COUNT_AGGREGATION_NAME]

                    # Create a display version of this dimension that includes
                    # the parent dimensions to help disambiguate dimension
                    # values that are the same with a different hierarchy
                    dimension_display = event[dimension]
                    num_dimensions = len(dimensions)
                    if num_dimensions > 1:
                        # NOTE(ian): This logic matches logic used on the
                        # frontend in SelectFilter.jsx
                        start = num_dimensions - 1
                        disambiguation = [
                            event[d] for d in dimensions[start::-1] if event[d]
                        ]
                        dimension_display = '%s (%s)' % (
                            dimension_display,
                            ', '.join(disambiguation),
                        )

                    output_row[DISPLAY_FIELD] = dimension_display
                    output_rows.append(output_row)

                self.dimension_map[dimension] = sorted(
                    output_rows, key=lambda a: a[DISPLAY_FIELD])
                LOG.info('%s values loaded for dimension: %s',
                         len(output_rows), dimension)

        LOG.info('Done preloading dimension values.')
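Example #1 reuses a single base query, swapping in the dimensions and a not-null filter on each pass. The constants it references (INTERVAL, COUNT_CALCULATION) are defined elsewhere in the codebase; the sketch below shows placeholder stand-ins and the pydruid filter idiom purely as assumptions for illustration.

    # A minimal sketch of the pieces Example #1 takes for granted. The
    # interval value and dimension name are placeholders, not the real
    # definitions used by the codebase.
    from pydruid.utils.filters import Dimension

    # Druid intervals are ISO-8601 '<start>/<end>' strings passed as a list;
    # the real INTERVAL constant presumably spans the datasource's history.
    INTERVAL = ['2000-01-01/2030-01-01']

    # pydruid overloads the comparison operators on Dimension, so this builds
    # a "RegionName is not null" query filter rather than evaluating to a bool.
    not_null_filter = Dimension('RegionName') != None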
Example #2
    def get_field_summary(self, field_id):
        druid_context = current_app.druid_context
        # Simulate building a query so we can access the query filter this field
        # would normally use.
        calculation = current_app.zen_config.aggregation_rules.get_calculation_for_fields(
            [field_id])
        interval = druid_context.data_time_boundary.get_full_time_interval()
        # HACK(ian): Setting granularity to month so that type=STOCK
        # aggregations are counted properly. This fails for
        # stock_granularity!=month, but there are very few of those.
        query = GroupByQueryBuilder('', 'month', [], [interval], calculation)

        # TODO(stephen): These values will be underreported for time interval
        # aggregations. Fix this.
        time_boundary = druid_context.data_time_boundary.get_field_time_boundary(
            field_id, query.query_filter)
        total_count = druid_context.row_count_lookup.get_row_count(
            query.query_filter, field_id)
        if not time_boundary:
            return FieldSummary(field_id, 0)

        human_readable_formula = self.get_human_readable_formula_html(field_id)
        if not total_count:
            return FieldSummary(field_id,
                                0,
                                human_readable_formula=human_readable_formula)

        return FieldSummary(
            field_id,
            total_count,
            time_boundary['min'],
            time_boundary['max'],
            human_readable_formula=human_readable_formula,
        )
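FieldSummary itself is not shown in this example. Inferring only from the three call sites above, a compatible constructor might look like the following sketch; the attribute names are guesses.

    # Hypothetical shape for FieldSummary, inferred from the calls above.
    class FieldSummary:
        def __init__(self, field_id, count, start_date=None, end_date=None,
                     human_readable_formula=None):
            self.field_id = field_id
            self.count = count
            self.start_date = start_date  # time_boundary['min'] when available
            self.end_date = end_date      # time_boundary['max'] when available
            self.human_readable_formula = human_readable_formula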
Example #3
    def __init__(
        self,
        fields,
        start_date,
        end_date,
        dimensions,
        dimension_values,
        granularity='all',
        query_client=None,
        compute_field_calculation=None,
    ):
        self.fields = fields
        self.start_date = start_date
        self.end_date = end_date
        self.dimension_values = dimension_values

        if not compute_field_calculation:
            from config.aggregation_rules import get_calculation_for_fields

            compute_field_calculation = get_calculation_for_fields

        # Build the params needed for ValidationRow
        intervals = [build_time_interval(self.start_date, self.end_date)]
        calculation = compute_field_calculation(fields)
        dimension_filter = GroupByQueryBuilder.build_dimension_filter(
            [dimension_values])
        super(FieldValidationRow, self).__init__(
            intervals,
            calculation,
            dimensions,
            dimension_filter,
            granularity,
            query_client,
            compute_field_calculation,
        )
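A hypothetical instantiation of FieldValidationRow, assuming ISO date strings and a single dimension-to-value mapping for dimension_values; the field and dimension values are invented for illustration.

    # Hypothetical usage; field/dimension names and date formats are guesses.
    row = FieldValidationRow(
        fields=['field_abc'],
        start_date='2019-01-01',
        end_date='2020-01-01',
        dimensions=['RegionName'],
        dimension_values={'RegionName': 'Region A'},
    )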
Example #4
 def _build_query(self):
     return GroupByQueryBuilder(
         datasource=self.datasource.name,
         granularity=self.granularity,
         grouping_fields=self.dimensions,
         intervals=[self.interval],
         calculation=self.calculation,
         dimension_filter=self.query_filter,
     )
Example #5
 def build_query(self, datasource_name):
     return GroupByQueryBuilder(
         datasource=datasource_name,
         granularity=self.granularity,
         grouping_fields=self.dimensions,
         intervals=self.intervals,
         calculation=self.calculation,
         dimension_filter=self.dimension_filter,
         optimize=True,
     )
Example #6
 def build_query(self):
     calculations = [
         build_calculation(field) for field in self.request.fields
     ]
     return GroupByQueryBuilder(
         datasource=self.datasource.name,
         granularity=self.request.build_granularity(),
         grouping_fields=self.request.build_dimensions(),
         intervals=self.request.build_intervals(),
         calculation=CalculationMerger(calculations),
         dimension_filter=self.request.build_query_filter(),
     )
Example #7
    def to_druid_query(self, datasource):
        # Always exclude nation values for AQT.
        query_filter = self.build_query_filter()
        use_nation_hack = datasource.startswith('et')
        if use_nation_hack:
            query_filter &= DimensionFilter('RegionName') != 'Nation'

        return GroupByQueryBuilder(
            datasource=datasource,
            granularity=self.build_granularity(),
            grouping_fields=self.build_dimensions(),
            intervals=self.build_intervals(),
            calculation=self.build_calculation(),
            dimension_filter=query_filter,
            subtotal_dimensions=self.build_subtotal_dimensions(),
            subtotal_result_label=SUBTOTAL_RESULT_LABEL,
        )
Example #8
    def load_ranges_from_druid(self):
        """Return a dictionary mapping data source name to a
        dictionary (minTime, maxTime) of datetime objects.
        """
        date_ranges = {}
        LOG.info('Querying time ranges of data from Druid...')
        aggregations = {
            MIN_TIME_FIELD: {
                'type': 'longMin',
                'fieldName': '__time'
            },
            MAX_TIME_FIELD: {
                'type': 'longMax',
                'fieldName': '__time'
            },
        }
        calculation = BaseCalculation(aggregations=aggregations)
        query = GroupByQueryBuilder(
            datasource=self.datasource.name,
            granularity='all',
            grouping_fields=[SOURCE_FIELD],
            intervals=INTERVAL,
            calculation=calculation,
        )
        query.query_filter &= Dimension(SOURCE_FIELD) != None

        query_result = self.query_client.run_query(query)
        for row in query_result.result:
            event = row['event']
            # Build {data_source: {MIN_TIME_FIELD: datetime, MAX_TIME_FIELD: datetime}}.
            date_ranges[event[SOURCE_FIELD]] = {
                MIN_TIME_FIELD:
                self.date_from_timestamp(event[MIN_TIME_FIELD]),
                MAX_TIME_FIELD:
                self.date_from_timestamp(event[MAX_TIME_FIELD]),
            }

        LOG.info('Done querying date ranges of data')
        return date_ranges
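date_from_timestamp is not shown in this example. Druid stores __time as milliseconds since the epoch, so the longMin/longMax aggregators return millisecond values; a minimal sketch of the conversion, assuming that contract, follows.

    from datetime import datetime

    def date_from_timestamp(timestamp_ms):
        # Druid's __time is in milliseconds since the epoch; convert to a
        # naive UTC datetime. This is an assumed implementation of the helper.
        return datetime.utcfromtimestamp(timestamp_ms / 1000.0)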
Example #9
    def get_no_date_filter_df(self):
        interval = (
            current_app.druid_context.data_time_boundary.get_full_time_interval()
        )

        # TODO(david): Update this when we work out a way of getting the first report
        # date for each geography without retrieving all report dates.
        # TODO(david): Work out a way of separating the existing time and geographical
        # filters so that the geo filters can be included here. This will do for now, as
        # the only effect is when some dimension values are split across several higher
        # dimension values, e.g. if a county is split across two regions and the
        # different parts of that county have different first report dates.
        earliest_report_query = GroupByQueryBuilder(
            self.datasource.name,
            'day',
            self.request.build_dimensions(),
            [interval],
            self.request.build_calculation(),
        )

        raw_df = self.query_client.run_query(
            earliest_report_query).export_pandas()

        return self.build_df(raw_df)
Example #10
    def run_query(self):
        '''
        Constructs and runs the Druid request for this query. The query is
        blocking.
        '''

        LOG.info('Running query...')

        # Filter the dimensions using the location filters passed in
        dimension_filter = GroupByQueryBuilder.build_dimension_filter(
            self.location_filters
        )

        # AND the selected locations with the non-location filters requested
        dimension_filter &= self.non_hierarchical_filter

        # Slice by the selected granularity plus all fields less specific than
        # it. For example, if the user makes a Woreda query, we also want to
        # slice by Zone and Region.
        if self.geo_field:
            # Restrict query to non-null for the given geo
            dimension_filter &= Dimension(self.geo_field) != ''

            # Set the appropriate dimensions for this query
            self.druid_slice_dimensions = self.get_slice_dimensions()
            if self.latitude_field and self.longitude_field:
                self.druid_geo_dimensions = [self.latitude_field, self.longitude_field]

        grouping_fields = self.druid_slice_dimensions + self.druid_geo_dimensions

        batches = []
        overall_interval = build_time_interval(self.start_date, self.end_date)
        for selected_granularity in self.selected_granularities:
            # Druid expects time intervals as a list.
            intervals = [overall_interval]
            granularity = current_app.zen_config.aggregation_rules.get_granularity_for_interval(
                selected_granularity, self.start_date, self.end_date
            )

            query = GroupByQueryBuilder(
                datasource=current_app.druid_context.current_datasource.name,
                granularity=granularity,
                grouping_fields=grouping_fields,
                intervals=intervals,
                calculation=self.calculation,
                dimension_filter=dimension_filter,
            )

            batch = QueryBatch(
                query,
                selected_granularity,
                self.geo_field,
                self.latitude_field,
                self.longitude_field,
                self.ordered_fields,
                self.denom,
                self.druid_slice_dimensions,
                self.query_client,
            )
            batches.append(batch)

        num_granularities = len(self.selected_granularities)
        if USE_THREAD_POOL and num_granularities > 1:
            pool = ThreadPool(num_granularities)
            pool.map(QueryBatch.run, batches)
            pool.close()
            pool.join()
        else:
            for batch in batches:
                batch.run()

        self.batches = batches
        return True
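Example #10 leans on a few names defined elsewhere (ThreadPool, USE_THREAD_POOL, build_time_interval). A sketch of plausible definitions is below, offered only as assumptions; the real module uses whatever the codebase actually imports.

    # Assumed imports and helpers for Example #10; values are placeholders.
    from multiprocessing.pool import ThreadPool

    # Feature flag controlling whether per-granularity batches run in threads.
    USE_THREAD_POOL = True

    def build_time_interval(start_date, end_date):
        # Hypothetical helper: Druid expects '<start>/<end>' ISO-8601 intervals.
        return '%s/%s' % (start_date.isoformat(), end_date.isoformat())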