Exemple #1
0
    def _project_column(
        cls,
        query: Select,
        taxon: Taxon,
        source_df_column: Optional[DataframeColumn],
    ) -> ColumnAndDataframeColumn:
        """
        Returns projection SQL for given taxon, and also description of that projected column in a form of
        DataframeColumn.

        :param query: Query whose ``columns`` collection is indexed by the taxon's SQL-safe identifier
        :param taxon: Taxon to project
        :param source_df_column: Description of the source column; must not be None (asserted below)

        :return: Tuple of the projected column (labelled with the taxon's SQL-safe identifier) and its
            DataframeColumn, which is always created with scalar quantity type
        :raises HuskyInvalidTelException: When a TelExpressionException is raised inside the block; the
            original error is attached as the explicit cause
        """
        try:
            col = query.columns[taxon.slug_safe_sql_identifier]
            # NOTE(review): assert is stripped under ``python -O``; kept as-is so that callers keep seeing
            # the same exception type.
            assert (
                source_df_column is not None
            ), f'DataframeColumn is required for dimension types. taxon_slug: {taxon.slug}'
            if source_df_column.quantity_type == ValueQuantityType.array:
                # We want to cast array into a string when selecting it.
                # The final quantity is thus always scalar.
                col = cast(col, String)
            df_col = DataframeColumn(TaxonExpressionStr(taxon.slug), taxon,
                                     ValueQuantityType.scalar)

            return col.label(taxon.slug_safe_sql_identifier), df_col
        except TelExpressionException as error:
            # Chain explicitly so the traceback marks the TEL error as the direct cause.
            raise HuskyInvalidTelException(error, taxon.slug) from error
Exemple #2
0
def get_mocked_dataframe_columns_map(
        taxon_slugs: List[str]) -> Dict[TaxonExpressionStr, DataframeColumn]:
    """
    Helper fn that builds a DataframeColumn map from mocked taxons; every column is created as scalar.

    :param taxon_slugs: Taxon slugs passed to ``mock_get_taxons_map``
    :return: Map keyed by the slug expressions returned by ``mock_get_taxons_map``, each pointing to a
        scalar DataframeColumn for the corresponding taxon
    """
    columns_by_slug = {}
    for expr, mocked_taxon in mock_get_taxons_map(None, taxon_slugs).items():
        columns_by_slug[expr] = DataframeColumn(expr, mocked_taxon,
                                                ValueQuantityType.scalar)
    return columns_by_slug
    def calculate_dataframe(
        self,
        ctx: HuskyQueryContext,
        df: Dataframe,
        physical_data_sources: Set[str],
        grouping_sets: Optional[GroupingSets] = None,
        filter_clause: Optional[FilterClause] = None,
    ) -> Dataframe:
        """
        Applies in this order:
        - pre aggregation logic
        - aggregation by group by or grouping sets
        - after aggregation logic
        - filters. Filters are applied here to simplify the final query and apply filtering before filling date gaps.

        NOTE(review): an earlier version of this docstring also listed an optional window function step;
        this method's body does not build one.

        :param ctx: Query context; its ``dialect`` is used when rendering formulas and quoting identifiers
        :param df: Input dataframe; its ``query`` is aggregated and its ``used_model_names`` are passed through
        :param physical_data_sources: Passed through to the returned Dataframe
        :param grouping_sets: Optional grouping sets handed to ``_add_aggregation``
        :param filter_clause: Optional filter applied on top of the post-aggregation query

        :return: Dataframe wrapping the post-aggregation (and optionally filtered) query
        """
        pre_agg_columns = [
        ]  # Columns with applied aggregation function in aggregation step
        df_columns: List[DataframeColumn] = [
        ]  # Final df columns after all steps.
        group_columns = []
        final_columns: List[ColumnClause] = []
        for pre_formula in self.taxon_manager.plan.metric_pre:
            col = pre_formula.formula.label(pre_formula.label)
            aggregation_fn = self.AGGREGATION_FUNCTIONS_MAP.get(
                pre_formula.aggregation.type)

            if aggregation_fn:
                # we know the aggregation function so let's use it
                pre_agg_columns.append(
                    aggregation_fn(col).label(pre_formula.label))
            else:
                # if no aggregation function is defined, then we simply group by this formula
                group_columns.append(col)

        # taxon slugs used in group by clause
        dimension_taxon_slugs = {
            group_column.name
            for group_column in group_columns
        }

        for post_formula, taxon in self.taxon_manager.plan.metric_post:
            post_formula_sql = post_formula.render_formula(
                ctx.dialect, dimension_taxon_slugs)
            col = post_formula_sql.label(taxon.slug_safe_sql_identifier)
            final_columns.append(col)
            df_columns.append(DataframeColumn(taxon.slug_expr, taxon))

        # Aggregation query with column logic. This is the first aggregation step, regular group by
        # or a common table expression with multiple group by statements in case of grouping sets.
        pre_query = self._add_aggregation(df.query, pre_agg_columns,
                                          group_columns, grouping_sets)

        # Post aggregation logic
        post_query = Select(
            columns=sort_columns(final_columns)).select_from(pre_query)

        slug_to_column = Dataframe.dataframe_columns_to_map(df_columns)
        if filter_clause:
            # Filters reference the final columns by their quoted slug.
            taxon_model_info = {
                str(slug):
                TaxonModelInfo(safe_quote_identifier(slug, ctx.dialect))
                for slug in slug_to_column.keys()
            }
            post_query = FilterBuilder.augment_query(ctx, post_query,
                                                     taxon_model_info,
                                                     filter_clause)

        return Dataframe(post_query, slug_to_column, df.used_model_names,
                         physical_data_sources)
    def _build_comparison_blend_query(
        cls,
        ctx: HuskyQueryContext,
        config_arg: BlendingDataRequest,
        taxon_manager: BlendingTaxonManager,
        query_info: BlendingQueryInfo,
        allowed_physical_data_sources: Optional[Set[str]] = None,
    ) -> Optional[Dataframe]:
        """
        Builds a comparison query per data subrequest and blends the results into one comparison dataframe.

        Subrequests that end up with no comparison taxons are skipped. Returns None when every subrequest
        was skipped.
        """
        # Work on a clone of the request, since the subrequests get rebuilt for comparison purposes.
        config = BlendingDataRequest(config_arg.to_native())
        assert config.comparison, 'Comparison must be defined when trying to build comparison query..'
        comparison: ComparisonConfig = config.comparison

        comparison_dataframes = []
        for original_subrequest in config.data_subrequests:
            subrequest = cls._build_comparison_subrequest(
                original_subrequest, comparison, taxon_manager)
            data_source = subrequest.properties.data_source

            # Nothing to compare for this subrequest, so no comparison query is built for it.
            if len(subrequest.taxons) == 0:
                continue

            sub_query_info = QueryInfo.create(subrequest)
            query_info.comparison_subrequests_info.append(sub_query_info)

            # TODO pass down TelPlan for comparisons
            # ComparisonRequestBuilder might have added filters (typically for company id project id),
            # so we create new filter templates for this comparison subrequest.
            preaggregation_filters = [
                subrequest.preaggregation_filters,
                subrequest.scope.preaggregation_filters,
            ]
            filter_templates = TelPlanner.get_preaggregation_filter_templates(
                ctx,
                preaggregation_filters,
                taxon_manager.taxon_map,
                data_source,
            )

            subrequest_dataframe = QueryBuilder.build_query(
                ctx,
                subrequest,
                sub_query_info,
                taxon_manager.used_taxons,
                dimension_templates=taxon_manager.plan.
                comparison_data_source_formula_templates[data_source],
                filter_templates=filter_templates,
                allowed_physical_data_sources=allowed_physical_data_sources,
            )
            comparison_dataframes.append(subrequest_dataframe)

        # Without any comparison dataframe there is nothing to blend.
        if not comparison_dataframes:
            return None

        # Blend all comparison dataframes into one
        # TODO pass down TelPlan for comparisons
        formula_templates = taxon_manager.plan.comparison_data_source_formula_templates
        blended = blend_dataframes(ctx, comparison_dataframes,
                                   formula_templates)

        # Metrics get a comparison taxon (and slug) from BlendingTaxonManager; dimensions keep their slug.
        blended_query = blended.query
        projected_columns = []
        comparison_columns_by_slug: Dict[TaxonExpressionStr,
                                         DataframeColumn] = {}
        for slug, df_column in blended.slug_to_column.items():
            source_taxon = df_column.taxon
            if not source_taxon.is_dimension:
                target_slug, target_taxon = BlendingTaxonManager.create_comparison_taxon(
                    source_taxon)
            else:
                target_taxon = source_taxon.copy(deep=True)
                target_slug = TaxonExpressionStr(f'{slug}')

            source_column = blended_query.c[safe_identifier(slug)]
            projected_columns.append(
                source_column.label(target_taxon.slug_safe_sql_identifier))
            comparison_columns_by_slug[target_slug] = DataframeColumn(
                target_slug, target_taxon, df_column.quantity_type)

        # Also select the dim columns coming from the dimension templates.
        projected_columns.extend(
            literal_column(quote_identifier(template.label, ctx.dialect))
            for templates in formula_templates.values()
            for template in templates)

        renamed_cols_query = select(
            sort_columns(projected_columns)).select_from(blended.query)
        return Dataframe(renamed_cols_query, comparison_columns_by_slug,
                         blended.used_model_names,
                         blended.used_physical_data_sources)
Exemple #5
0
    def query(
        cls,
        select_query: Select,
        taxon_model_info_map: Dict[str, TaxonModelInfo],
        projection_taxons: SlugExprTaxonMap,
        data_source: str,
        order_by: Optional[List[TaxonDataOrder]],
        limit: Optional[int],
        offset: Optional[int],
        used_physical_data_sources: Set[str],
        dimension_templates: Optional[List[SqlFormulaTemplate]] = None,
    ) -> Dataframe:
        """
        Builds the final projection query on top of ``select_query`` and wraps it in a Dataframe.

        :param select_query: Original query fetching all necessary fields
        :param taxon_model_info_map: Map of taxon slug expression to taxon model info
        :param projection_taxons: Taxons meant to be projected by the final query
        :param data_source: Virtual data source for this subrequest
        :param order_by: List of clauses for order by
        :param limit: Limit for the query
        :param offset: Offset for the query
        :param used_physical_data_sources: Passed through to the returned Dataframe
        :param dimension_templates: List of dimension templates

        :return: Final dataframe including all requested taxons
        """
        select_columns = []
        group_by_columns = []
        df_columns_by_slug: Dict[TaxonExpressionStr, DataframeColumn] = {}

        for taxon in projection_taxons.values():
            agg_type = taxon.tel_metadata_aggregation_type
            sql_identifier = taxon.slug_safe_sql_identifier

            # Wrap the plain column in an aggregation function when one is registered for this type.
            projected = column(sql_identifier)
            if agg_type and agg_type in cls._AGGREGATION_FUNCTIONS_MAP:
                projected = cls._AGGREGATION_FUNCTIONS_MAP[agg_type](projected)
            projected = projected.label(sql_identifier)

            # Calculated taxons and taxons without model info are scalar; others take the model's type.
            if taxon.calculation or taxon.slug_expr not in taxon_model_info_map:
                quantity_type = ValueQuantityType.scalar
            else:
                quantity_type = taxon_model_info_map[taxon.slug_expr].quantity_type

            slug_key = TaxonExpressionStr(taxon.slug)
            df_columns_by_slug[slug_key] = DataframeColumn(slug_key, taxon, quantity_type)

            select_columns.append(projected)
            if agg_type in cls._GROUP_BY_AGGREGATION_TYPES:
                group_by_columns.append(projected)

        # Dimension templates are both selected and grouped by.
        for template in dimension_templates or []:
            template_column = column(template.label)
            select_columns.append(template_column)
            group_by_columns.append(template_column)

        # On purpose adding this value to emulate USING ON FALSE => PROD-8136
        select_columns.append(literal(data_source).label(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME))
        # literal_column is used because some database engines do not like grouping by constant
        group_by_columns.append(literal_column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME))

        sorted_selectors = sort_columns(select_columns)
        ordering = [nullslast(ORDER_BY_FUNCTIONS[item.type](item.taxon)) for item in (order_by or [])]
        sorted_group_by = sort_columns(group_by_columns)
        projected_query = Select(
            columns=sorted_selectors,
            order_by=ordering,
            group_by=sorted_group_by,
        ).select_from(select_query)

        if limit is not None:
            projected_query = projected_query.limit(limit)
        if offset is not None:
            projected_query = projected_query.offset(offset)

        # Names of all models referenced by the model infos (entries without a model name are ignored).
        used_model_names = set()
        for model_info in taxon_model_info_map.values():
            if model_info.model_name is not None:
                used_model_names.add(model_info.model_name)

        return Dataframe(projected_query, df_columns_by_slug, used_model_names, used_physical_data_sources)