Example #1
 def __init__(
     self,
     formula: ClauseElement,
     label: str,
     aggregation: Optional[AggregationDefinition] = None,
     data_source: Optional[str] = None,
 ):
     self.formula = formula
     self.label = safe_identifier(label)
     self.aggregation = aggregation or AggregationDefinition(
         type=AggregationType.sum)
     self.data_source = data_source

    @classmethod
    def project_dataframe(
        cls,
        calc_df: Dataframe,
        return_taxons: Dict[TaxonExpressionStr, Taxon],
        physical_data_sources: Set[str],
        order_by: Optional[List[TaxonDataOrder]] = None,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
    ) -> Dataframe:
        """
        Applies in this order:
        - filtering
        - ordering
        - limiting and offsetting
        """
        for order_by_rule in order_by or []:
            if order_by_rule.taxon not in return_taxons:
                raise InvalidRequest(
                    'request.order_by',
                    f'Taxon "{order_by_rule.taxon}" used in order_by clause must also be selected.'
                )

        projected_sql_and_df_columns, final_query = cls._project_columns(
            calc_df.query, calc_df, return_taxons)
        final_query = final_query.select_from(calc_df.query)

        projected_df_columns = Dataframe.dataframe_columns_to_map(
            [df_col for _, df_col in projected_sql_and_df_columns])

        if order_by:
            final_query = final_query.order_by(*[
                nullslast(ORDER_BY_FUNCTIONS[item.type](column(
                    safe_identifier(item.taxon)))) for item in order_by
            ])

        if limit is not None:
            final_query = final_query.limit(limit)
        if offset is not None:
            final_query = final_query.offset(offset)

        return Dataframe(
            final_query,
            projected_df_columns,
            calc_df.used_model_names,
            used_physical_data_sources=physical_data_sources,
        )
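
A hypothetical call site for project_dataframe (the owning class name and all objects below are assumptions, not shown in the snippet):

# Hypothetical usage sketch: order the projected frame by a selected taxon, then page it.
# TaxonDataOrder fields are inferred from the item.taxon / item.type accesses above.
projected_df = SomeProjectionBuilder.project_dataframe(   # class name is an assumption
    calc_df=blended_df,                                   # a previously built Dataframe
    return_taxons={TaxonExpressionStr('spend'): spend_taxon},
    physical_data_sources={'db_prod'},
    order_by=[TaxonDataOrder(taxon='spend', type=TaxonOrderType.desc)],
    limit=100,
    offset=0,
)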
Example #3
    def _rebuild_taxon_info_map_inner_query(self):
        """
        Updates the internal taxon model info map, because we use inner query to select the raw data
        """
        taxon_model_info_map = dict()
        for taxon_slug_expression, info in self.taxon_model_info_map.items():
            new_info = TaxonModelInfo(safe_identifier(taxon_slug_expression),
                                      info.model_name, info.quantity_type)
            taxon_model_info_map[taxon_slug_expression] = new_info

        for filter_slug, formula in self.filter_templates.items():
            if filter_slug in taxon_model_info_map:
                info = taxon_model_info_map[filter_slug]
                render_params = dict()
                for used_slug in formula.used_taxons:
                    render_params[used_slug] = taxon_model_info_map[
                        used_slug].taxon_sql_accessor
                sql_accessor = formula.render_formula(**render_params)

                taxon_model_info_map[filter_slug] = TaxonModelInfo(
                    sql_accessor, info.model_name, info.quantity_type)

        self.taxon_model_info_map = taxon_model_info_map
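
For illustration, the filter rewrite above substitutes each used taxon's SQL accessor into the template; a hypothetical walkthrough for a template over a single taxon 'spend':

#   formula.used_taxons      -> {'spend'}
#   render_params            -> {'spend': taxon_model_info_map['spend'].taxon_sql_accessor}
#   formula.render_formula(**render_params)
#       -> the template text with the accessor substituted in, which becomes the
#          new TaxonModelInfo sql accessor for the filter slug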
Example #4
    def _build_query_window_aggregations(
        self,
        taxon_to_model: Dict[TaxonSlugExpression, HuskyModel],
        ordered_query_joins: Sequence[QueryJoins],
    ) -> Select:
        """
        Generates query for taxons which need window functions for aggregation

        :param taxon_to_model: Map of taxon slugs (key) and models they are coming from (value)
        :param ordered_query_joins: List of joins
        """
        selectors = []
        # generate inner query with window aggregation functions
        for taxon_slug_expression, taxon in sorted(
                self.projection_taxons.items(), key=lambda x: str(x[0])):
            model = taxon_to_model[taxon_slug_expression]
            if (taxon.tel_metadata
                    and taxon.tel_metadata.aggregation_definition
                    and taxon.tel_metadata.aggregation_definition.params
                    and taxon.tel_metadata_aggregation_type
                    in self._AGGREGATION_WINDOW_FUNCTIONS):
                # find the order_by columns
                order_by = []
                window_params = cast(
                    AggregationParamsSortDimension,
                    taxon.tel_metadata.aggregation_definition.params)
                for field in window_params.sort_dimensions:
                    col = taxon_to_model[TaxonSlugExpression(
                        field.taxon)].taxon_sql_accessor(
                            self.ctx, field.taxon)

                    order_by_dir = field.order_by or TaxonOrderType.asc
                    order_by.append(
                        nullslast(ORDER_BY_FUNCTIONS[order_by_dir](
                            literal_column(col))))

                # apply window aggregation functions
                column = self._AGGREGATION_WINDOW_FUNCTIONS[
                    taxon.tel_metadata_aggregation_type](literal_column(
                        model.taxon_sql_accessor(self.ctx, taxon.slug))).over(
                            partition_by=self.get_partition_by_columns(model),
                            order_by=order_by)
            else:
                # otherwise, render the columns "as-is"
                column = literal_column(
                    model.taxon_sql_accessor(self.ctx, taxon.slug))

            selectors.append(column.label(taxon.slug_safe_sql_identifier))

        # add joins to the inner query
        inner_query = select(selectors).select_from(
            self._build_from_joins(ordered_query_joins))

        # apply scope filters to the inner query
        inner_query = ScopeGuard.add_scope_row_filters(
            self.ctx, self.scope, inner_query, self.taxon_model_info_map)

        # update the taxon model info map, because we're selecting from the outer query, not the inner one
        self._rebuild_taxon_info_map_inner_query()

        # then, prepare the outer query on which we can safely apply GROUP BY
        return self._build_selectors(lambda _, taxon_slug: safe_identifier(
            taxon_slug)).select_from(inner_query)
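
A minimal standalone sketch of the window-aggregation pattern applied above, in plain SQLAlchemy (table and column names are hypothetical):

from sqlalchemy import func, literal_column, nullslast

# e.g. a "first value ordered by date" aggregation over a per-company partition
windowed = func.first_value(literal_column('t.spend')).over(
    partition_by=[literal_column('t.company_id')],
    order_by=[nullslast(literal_column('t.date').asc())],
).label('spend')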
Example #5
 def generate_identifier(column_name: str, slug: str,
                         include_unknown_values: bool) -> str:
     """Generate SQL identifier for the mapping"""
     return safe_identifier(
         f'__om_{column_name}_{slug}_{str(include_unknown_values)}')
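
For example, the generated identifier embeds all three inputs before sanitization:

# generate_identifier('country', 'region_map', True)
#   -> safe_identifier('__om_country_region_map_True')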
Example #6
 def generate_cte_name(slug: str) -> str:
     """Generated name for CTE representing override mapping with the slug"""
     return safe_identifier(f'__om_{slug}')
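
And similarly for the CTE name:

# generate_cte_name('region_map') -> safe_identifier('__om_region_map')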

    @classmethod
    def _add_aggregation(
        cls,
        inner_query: Select,
        aggregation_columns: List[ColumnClause],
        group_by_columns: List[ColumnClause],
        grouping_sets: Optional[GroupingSets] = None,
    ) -> Select:
        """
        Aggregates raw metric taxons. Groups by given dimension taxons or grouping sets.

        :param inner_query: Query to aggregate
        :param aggregation_columns: List of columns with applied aggregation function
        :param group_by_columns: List of columns to group by
        :param grouping_sets: Optional list of grouping sets to group by instead
        :return: Aggregated query
        """
        if grouping_sets:
            # Because we union _PANORAMIC_GROUPINGSETS_NULL with columns that can be date(time) or number,
            # we must cast all group columns to text. Some DB engines fail when casting and grouping happen
            # in one query, so we stringify the group columns in the CTE, not in the group-by query below.
            group_by_column_names = {col.name for col in group_by_columns}
            stringified_group_columns = []
            for col in inner_query.columns:
                if col.name in group_by_column_names:
                    stringified_group_columns.append(
                        cast(col, sqlalchemy.VARCHAR).label(col.name))
                else:
                    stringified_group_columns.append(col)

            # common table expression reused by multiple grouping sets queries
            cte_query = (Select(
                columns=sort_columns(stringified_group_columns)).select_from(
                    inner_query).cte('__cte_grouping_sets'))
            grouping_sets_queries = []

            for grouping_set in grouping_sets:
                safe_grouping_set = [
                    safe_identifier(col) for col in grouping_set
                ]
                # dimensions in the grouping set, used to aggregate values with group by
                gs_group_columns = [
                    col for col in group_by_columns
                    if col.name in safe_grouping_set
                ]
                # extra dimensions not in the grouping set, returned as custom null values
                gs_null_columns = [
                    literal_column(f"'{_PANORAMIC_GROUPINGSETS_NULL}'").label(
                        col.name) for col in group_by_columns
                    if col.name not in safe_grouping_set
                ]
                grouping_sets_queries.append(
                    Select(columns=sort_columns(
                        gs_group_columns + gs_null_columns +
                        aggregation_columns)).select_from(cte_query).group_by(
                            *sort_columns(gs_group_columns)))
            return union_all(*grouping_sets_queries)

        # If grouping sets are not defined, use all dimensions for grouping.
        return (Select(columns=sort_columns(
            group_by_columns +
            aggregation_columns)).select_from(inner_query).group_by(
                *sort_columns(group_by_columns)))
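
Conceptually, the grouping-sets branch above expands into one grouped SELECT per grouping set, unioned together; with hypothetical columns, and <GS_NULL> standing for the _PANORAMIC_GROUPINGSETS_NULL marker:

# For grouping sets ((country,), ()):
#   SELECT country, SUM(spend) AS spend FROM __cte_grouping_sets GROUP BY country
#   UNION ALL
#   SELECT '<GS_NULL>' AS country, SUM(spend) AS spend FROM __cte_grouping_sets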

    @classmethod
    def _build_comparison_blend_query(
        cls,
        ctx: HuskyQueryContext,
        config_arg: BlendingDataRequest,
        taxon_manager: BlendingTaxonManager,
        query_info: BlendingQueryInfo,
        allowed_physical_data_sources: Optional[Set[str]] = None,
    ) -> Optional[Dataframe]:
        """
        Builds comparison query for each subrequest and then blends them all into one comparison dataframe.
        """
        dataframes = []
        config = BlendingDataRequest(config_arg.to_native())  # clone, because we will be modifying subqueries
        assert config.comparison, 'Comparison must be defined when trying to build comparison query.'
        comparison: ComparisonConfig = config.comparison
        for _subrequest in config.data_subrequests:
            subrequest = cls._build_comparison_subrequest(
                _subrequest, comparison, taxon_manager)
            data_source = subrequest.properties.data_source

            # if no comparison taxons were found for this subrequest, skip creating comparison query for it as well
            if len(subrequest.taxons) == 0:
                continue

            bm_sub_query_info = QueryInfo.create(subrequest)
            query_info.comparison_subrequests_info.append(bm_sub_query_info)
            # Build the comparison dataframe and add it to the list.
            # TODO pass down TelPlan for comparisons
            # ComparisonRequestBuilder might have added filters (typically for company id / project id),
            # so we create new filter templates for this comparison subrequest.
            filter_templates = TelPlanner.get_preaggregation_filter_templates(
                ctx,
                [
                    subrequest.preaggregation_filters,
                    subrequest.scope.preaggregation_filters
                ],
                taxon_manager.taxon_map,
                data_source,
            )

            dataframes.append(
                QueryBuilder.build_query(
                    ctx,
                    subrequest,
                    bm_sub_query_info,
                    taxon_manager.used_taxons,
                    dimension_templates=taxon_manager.plan.comparison_data_source_formula_templates[data_source],
                    filter_templates=filter_templates,
                    allowed_physical_data_sources=allowed_physical_data_sources,
                ))

        # if no comparison subrequests were created, there is no need to blend data frames
        if len(dataframes) == 0:
            return None

        # Blend all comparison dataframes into one
        # TODO pass down TelPlan for comparisons
        data_source_formula_templates = taxon_manager.plan.comparison_data_source_formula_templates
        dataframe = blend_dataframes(ctx, dataframes,
                                     data_source_formula_templates)

        # Prefix all comparison metric columns with 'comparison@' and create comparison taxon for it.
        query = dataframe.query
        final_columns = []
        aliased_taxon_by_slug: Dict[TaxonExpressionStr,
                                    DataframeColumn] = dict()
        for slug, df_column in dataframe.slug_to_column.items():
            # Alias metrics with the comparison@ prefix, and select dimensions as-is.
            if df_column.taxon.is_dimension:
                new_taxon = df_column.taxon.copy(deep=True)
                new_slug = TaxonExpressionStr(f'{slug}')
            else:
                new_slug, new_taxon = BlendingTaxonManager.create_comparison_taxon(
                    df_column.taxon)

            final_columns.append(query.c[safe_identifier(slug)].label(
                new_taxon.slug_safe_sql_identifier))
            aliased_taxon_by_slug[new_slug] = DataframeColumn(
                new_slug, new_taxon, df_column.quantity_type)
        for pre_formulas in data_source_formula_templates.values():
            # and also select the dim columns from dim templates.
            for pre_formula in pre_formulas:
                final_columns.append(
                    literal_column(
                        quote_identifier(pre_formula.label, ctx.dialect)))
        renamed_cols_query = select(sort_columns(final_columns)).select_from(
            dataframe.query)
        return Dataframe(renamed_cols_query, aliased_taxon_by_slug,
                         dataframe.used_model_names,
                         dataframe.used_physical_data_sources)
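
Per the comments above, the relabeling step leaves dimensions as-is and prefixes metrics; assuming create_comparison_taxon produces a 'comparison@'-prefixed slug, the effect is roughly:

# slug 'spend' (metric)      -> query.c[safe_identifier('spend')].label(safe_identifier('comparison@spend'))
# slug 'country' (dimension) -> query.c[safe_identifier('country')].label(safe_identifier('country'))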
Example #9
def left_join_dataframes(
    ctx: HuskyQueryContext, data_dataframe: Dataframe, comparison_dataframe: Dataframe, tel_plan: TelPlan
) -> Dataframe:
    """
    Produces new DF, that is DATA_DF LEFT JOIN COMPARISON_DF on given list of taxons.
    :param ctx: Husky query context
    :param data_dataframe: df to left join to
    :param comparison_dataframe: other df
    :param tel_plan: Current TEL plan
    :return: Left joined dataframe
    """
    # Alias their queries to be able to easily reference them.
    data_table = data_dataframe.query.alias('data_dataframe')
    comparison_table = comparison_dataframe.query.alias('comparison_dataframe')

    # Union taxon slugs from both DFs.
    columns_by_slug = {**data_dataframe.slug_to_column, **comparison_dataframe.slug_to_column}
    select_columns = set()
    # Select the columns from the specific data frame (data or comparison), but label them to remove that
    # prefix, since the names are already unique (from the TEL planner).
    for slug in data_dataframe.slug_to_column.keys():
        select_columns.add(
            literal_column(f'data_dataframe.{safe_quote_identifier(slug, ctx.dialect)}').label(safe_identifier(slug))
        )

    for slug, df_column in comparison_dataframe.slug_to_column.items():
        taxon: Taxon = df_column.taxon
        if taxon.is_comparison_taxon:
            select_columns.add(
                literal_column(f'comparison_dataframe.{safe_quote_identifier(slug, ctx.dialect)}').label(
                    safe_identifier(slug)
                )
            )
    join_on_conditions = []

    for template in tel_plan.dimension_formulas:
        # Select the data source formula labels explicitly from data table
        select_columns.add(data_table.c[template.label])

    for join_column in tel_plan.comparison_join_columns:
        join_on_conditions.append(
            # Account for dimensions that can have NULL values: because NULL = NULL evaluates to FALSE in SQL,
            # a second condition comparing both columns with IS NULL needs to be added.
            or_(
                data_table.c[join_column] == comparison_table.c[join_column],
                and_(data_table.c[join_column].is_(None), comparison_table.c[join_column].is_(None)).self_group(),
            )
        )

    if len(join_on_conditions) == 0:
        # In case there were no comparison dimensions defined, the comparison dataframe also has no dimensions
        # (thus it is one row), and we can safely join it to the data dataframe without an ON clause.
        # Using 1=1 as the easiest way to express a join without an ON clause in SQLAlchemy.
        join_on_conditions.append(text('1=1'))

    q = select(sort_columns(list(select_columns))).select_from(
        data_table.outerjoin(comparison_table, and_(*join_on_conditions))
    )

    return Dataframe(
        q,
        columns_by_slug,
        data_dataframe.used_model_names | comparison_dataframe.used_model_names,
        data_dataframe.used_physical_data_sources | comparison_dataframe.used_physical_data_sources,
    )
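
A self-contained sketch of the NULL-safe join condition built above (column names hypothetical):

from sqlalchemy import and_, column, or_

left, right = column('data_c'), column('cmp_c')
cond = or_(left == right, and_(left.is_(None), right.is_(None)).self_group())
# renders roughly as: data_c = cmp_c OR (data_c IS NULL AND cmp_c IS NULL)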
Example #10
 @property
 def slug_safe_sql_identifier(self):
     """
     Returns a slug that is safe to use on any database, especially on BigQuery.
     """
     return safe_identifier(self.slug)
Example #11
 def __init__(
     self, name: TaxonExpressionStr, taxon: Taxon, quantity_type: ValueQuantityType = ValueQuantityType.scalar
 ):
     self.name = safe_identifier(name)
     self.taxon = taxon
     self.quantity_type = quantity_type
Example #12
 def __init__(self, template: SqlTemplate, label: str, data_source: str, used_taxons: Set[str]):
     self.template = template
     self.label = safe_identifier(label)
     self.data_source = data_source
     self.used_taxons = used_taxons
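
Every example above funnels labels and slugs through safe_identifier. Its implementation is not shown in these snippets; a minimal sketch of what such a helper plausibly does, stated purely as an assumption:

import re

def safe_identifier(name: str) -> str:
    # Assumed behavior: replace anything outside [A-Za-z0-9_] with '_' and lowercase
    # the result, so the identifier is portable across engines (e.g. BigQuery).
    return re.sub(r'\W', '_', str(name)).lower()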