Ejemplo n.º 1
0
 def test_blending_2_2(self):
     """Blend two dataframes and verify the compiled SQL and the resulting slugs.

     The second dataframe carries one extra column (``campaign_id``); the
     blended dataframe must expose the union of the slugs of both inputs.
     """
     first_query = Select(
         columns=[
             column('ad_id'),
             column('impressions'),
             column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME),
         ],
         from_obj=table('table1'),
     )
     first_columns = get_mocked_dataframe_columns_map(
         ['ad_id', 'impressions'])
     first_df = Dataframe(first_query, first_columns, set(), {'SF'})

     second_query = Select(
         columns=[
             column('ad_id'),
             column('campaign_id'),
             column('impressions'),
             column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME),
         ],
         from_obj=table('table2'),
     )
     second_columns = get_mocked_dataframe_columns_map(
         ['ad_id', 'impressions', 'campaign_id'])
     second_df = Dataframe(second_query, second_columns, set(), {'SF'})

     blended_df = blend_dataframes(SNOWFLAKE_HUSKY_CONTEXT,
                                   [first_df, second_df])

     # The expectation file is written first and then read back for comparison.
     self.write_test_expectations('query.sql',
                                  compile_query(blended_df.query))
     expected_query = self.read_test_expectations('query.sql')
     actual_query = compile_query(blended_df.query)
     self.assertEqual(expected_query, actual_query)

     expected_slugs = {'ad_id', 'impressions', 'campaign_id'}
     actual_slugs = set(blended_df.slug_to_column.keys())
     self.assertEqual(expected_slugs, actual_slugs)
Ejemplo n.º 2
0
    def project_dataframe(
        cls,
        calc_df: Dataframe,
        return_taxons: Dict[TaxonExpressionStr, Taxon],
        physical_data_sources: Set[str],
        order_by: Optional[List[TaxonDataOrder]] = None,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
    ) -> Dataframe:
        """
        Project the requested taxons out of ``calc_df`` and shape the result.

        Applies in this order:
        - validation that every order_by taxon is also selected
        - column projection (via ``_project_columns``)
        - ordering (NULLs last)
        - limiting and offsetting

        :param calc_df: Dataframe to project from
        :param return_taxons: Taxons to keep in the projection
        :param physical_data_sources: Stored on the returned dataframe as its used physical data sources
        :param order_by: Optional ordering rules
        :param limit: Optional row limit
        :param offset: Optional row offset
        :raises InvalidRequest: When an order_by taxon is not part of ``return_taxons``
        :return: Projected dataframe
        """
        ordering_rules = order_by or []

        for order_by_rule in ordering_rules:
            if order_by_rule.taxon in return_taxons:
                continue
            raise InvalidRequest(
                'request.order_by',
                f'Taxon "{order_by_rule.taxon}" used in order_by clause must be also selected.'
            )

        projected_pairs, projection = cls._project_columns(
            calc_df.query, calc_df, return_taxons)
        projection = projection.select_from(calc_df.query)

        # Each pair is (sql column, dataframe column); only the latter goes into the map.
        kept_df_columns = [df_column for _sql_column, df_column in projected_pairs]
        projected_df_columns = Dataframe.dataframe_columns_to_map(
            kept_df_columns)

        if ordering_rules:
            order_clauses = []
            for rule in ordering_rules:
                direction = ORDER_BY_FUNCTIONS[rule.type]
                ordered_column = column(safe_identifier(rule.taxon))
                order_clauses.append(nullslast(direction(ordered_column)))
            projection = projection.order_by(*order_clauses)

        if limit is not None:
            projection = projection.limit(limit)
        if offset is not None:
            projection = projection.offset(offset)

        return Dataframe(
            projection,
            projected_df_columns,
            calc_df.used_model_names,
            used_physical_data_sources=physical_data_sources,
        )
    def build_comparison_query(
        cls,
        ctx: HuskyQueryContext,
        config_arg: BlendingDataRequest,
        taxon_manager: BlendingTaxonManager,
        override_mapping_manager: OverrideMappingManager,
        query_info: BlendingQueryInfo,
        allowed_physical_data_sources: Optional[Set[str]] = None,
    ) -> Optional[Dataframe]:
        """
        Build the comparison dataframe, re-grouped by the comparison dimension formulas.

        The blended comparison dataframe is returned untouched when it is ``None`` or when the plan
        has no comparison dimension formulas. Otherwise the dimension formulas are calculated and
        the result is grouped again so that metrics are aggregated per dimension combination.

        :raises UnsupportedAggregationType: When a non-dimension taxon has no known aggregation function
        :return: Comparison dataframe, or None when no comparison blend query was produced
        """
        comp_df = cls._build_comparison_blend_query(
            ctx,
            config_arg,
            taxon_manager,
            query_info,
            allowed_physical_data_sources=allowed_physical_data_sources,
        )
        if comp_df is None:
            return comp_df

        dim_formulas = taxon_manager.plan.comparison_dimension_formulas
        if len(dim_formulas) == 0:
            # There are no comparison dim formulas, means the rows are already grouped correctly
            return comp_df

        comp_df = DimensionPhaseBuilder.calculate_dataframe(
            dim_formulas,
            override_mapping_manager.comparison_override_mapping_tel_data,
            override_mapping_manager.cte_map,
            comp_df,
        )

        # The dimension join may have merged (coalesced) values. Group by the merged columns once more
        # to keep a single row per dimension; otherwise the later left join with the data dataframe
        # would fan out rows.
        group_by_cols = [column(formula.label) for formula in dim_formulas]
        aggregated_cols = []
        for df_column in comp_df.slug_to_column.values():
            taxon = df_column.taxon
            plain_col = column(df_column.name)
            if taxon.is_dimension:
                group_by_cols.append(plain_col)
                continue

            agg_type = taxon.tel_metadata_aggregation_type
            agg_fn = (MetricPhaseBuilder.AGGREGATION_FUNCTIONS_MAP.get(agg_type)
                      if agg_type else None)
            if agg_fn is None:
                raise UnsupportedAggregationType(taxon)
            aggregated_cols.append(agg_fn(plain_col).label(df_column.name))

        # Aggregated metrics first, then every grouping column.
        selectors = aggregated_cols + group_by_cols
        regrouped = select(sort_columns(selectors))
        regrouped = regrouped.select_from(comp_df.query)
        regrouped = regrouped.group_by(*group_by_cols)

        return Dataframe(
            regrouped,
            comp_df.slug_to_column,
            comp_df.used_model_names,
            comp_df.used_physical_data_sources,
        )
    def calculate_dataframe(
        cls,
        dimension_formulas: List[PreFormula],
        override_mappings_tel_data: OverrideMappingTelData,
        override_mapping_cte_map: Dict[OverrideMappingSlug, Select],
        df: Dataframe,
    ) -> Dataframe:
        """
        Wrap ``df`` in a new select that adds one labelled column per dimension formula.

        :param dimension_formulas: Formulas to select, each under its own label
        :param override_mappings_tel_data: Passed to ``OverrideMappingSql.insert_cte_joins``
        :param override_mapping_cte_map: Passed to ``OverrideMappingSql.insert_cte_joins``
        :param df: Source dataframe; its slug map, model names and physical sources are carried over unchanged
        :return: Dataframe selecting all original columns plus the formula columns
        """
        # Start with every column of the source query, then append the formula columns.
        projected = list(df.query.columns)
        projected += [
            formula.formula.label(formula.label)
            for formula in dimension_formulas
        ]

        # add joins to relevant override mapping CTEs
        joined_source = OverrideMappingSql.insert_cte_joins(
            df.query, override_mappings_tel_data, override_mapping_cte_map)

        ordered_columns = sort_columns(projected)
        query = Select(columns=ordered_columns).select_from(joined_source)
        return Dataframe(
            query,
            df.slug_to_column,
            df.used_model_names,
            df.used_physical_data_sources,
        )
Ejemplo n.º 5
0
    def calculate_dataframe(
        self,
        ctx: HuskyQueryContext,
        df: Dataframe,
        physical_data_sources: Set[str],
        grouping_sets: Optional[GroupingSets] = None,
        filter_clause: Optional[FilterClause] = None,
    ) -> Dataframe:
        """
        Build the metric phase query on top of ``df``.

        Applies in this order:
        - pre aggregation logic
        - aggregation by group by or grouping sets
        - after aggregation logic
        - filters. Filters are applied here to simplify the final query and apply filtering before filling date gaps.

        :param ctx: Husky query context; its dialect is used to render post formulas and quote filter identifiers
        :param df: Input dataframe whose query is aggregated
        :param physical_data_sources: Stored on the returned dataframe as its used physical data sources
        :param grouping_sets: Optional grouping sets, forwarded to ``_add_aggregation``
        :param filter_clause: Optional filter applied to the post aggregation query
        :return: Dataframe with one column per taxon in the plan's ``metric_post``
        """
        # Columns with applied aggregation function in aggregation step
        pre_agg_columns = []
        # Columns without an aggregation function; the aggregation step groups by them
        group_columns = []
        for pre_formula in self.taxon_manager.plan.metric_pre:
            col = pre_formula.formula.label(pre_formula.label)
            aggregation_fn = self.AGGREGATION_FUNCTIONS_MAP.get(
                pre_formula.aggregation.type)

            if aggregation_fn:
                # we know the aggregation function so let's use it
                pre_agg_columns.append(
                    aggregation_fn(col).label(pre_formula.label))
            else:
                # if no aggregation function is defined, then we simply group by this formula
                group_columns.append(col)

        # taxon slugs used in group by clause
        dimension_taxon_slugs = {
            group_column.name
            for group_column in group_columns
        }

        final_columns: List[ColumnClause] = []
        # Final df columns after all steps.
        df_columns: List[DataframeColumn] = []
        for post_formula, taxon in self.taxon_manager.plan.metric_post:
            post_formula_sql = post_formula.render_formula(
                ctx.dialect, dimension_taxon_slugs)
            final_columns.append(
                post_formula_sql.label(taxon.slug_safe_sql_identifier))
            df_columns.append(DataframeColumn(taxon.slug_expr, taxon))

        # Aggregation query with column logic. This is the first aggregation step, regular group by
        # or a common table expression with multiple group by statements in case of grouping sets.
        pre_query = self._add_aggregation(df.query, pre_agg_columns,
                                          group_columns, grouping_sets)

        # Post aggregation logic
        post_query = Select(
            columns=sort_columns(final_columns)).select_from(pre_query)

        slug_to_column = Dataframe.dataframe_columns_to_map(df_columns)
        if filter_clause:
            # Filters reference the projected columns by their quoted slug.
            taxon_model_info = {
                str(slug):
                TaxonModelInfo(safe_quote_identifier(slug, ctx.dialect))
                for slug in slug_to_column.keys()
            }
            post_query = FilterBuilder.augment_query(ctx, post_query,
                                                     taxon_model_info,
                                                     filter_clause)

        return Dataframe(post_query, slug_to_column, df.used_model_names,
                         physical_data_sources)
    def _build_comparison_blend_query(
        cls,
        ctx: HuskyQueryContext,
        config_arg: BlendingDataRequest,
        taxon_manager: BlendingTaxonManager,
        query_info: BlendingQueryInfo,
        allowed_physical_data_sources: Optional[Set[str]] = None,
    ) -> Optional[Dataframe]:
        """
        Build a comparison query per data subrequest and blend them into a single comparison dataframe.

        Subrequests that end up with no comparison taxons are skipped. In the blended result, dimension
        columns keep their slug while metric columns are re-labelled through
        ``BlendingTaxonManager.create_comparison_taxon``.

        :return: Blended comparison dataframe, or None when no subrequest produced a comparison query
        """
        # Work on a clone, because the subrequests get modified below.
        config = BlendingDataRequest(config_arg.to_native())
        assert config.comparison, 'Comparison must be defined when trying to build comparison query..'
        comparison: ComparisonConfig = config.comparison

        subrequest_dataframes = []
        for original_subrequest in config.data_subrequests:
            subrequest = cls._build_comparison_subrequest(
                original_subrequest, comparison, taxon_manager)
            data_source = subrequest.properties.data_source

            # No comparison taxons for this subrequest means there is nothing to query for it.
            if len(subrequest.taxons) == 0:
                continue

            subrequest_query_info = QueryInfo.create(subrequest)
            query_info.comparison_subrequests_info.append(
                subrequest_query_info)

            # TODO pass down TelPlan for comparisons
            # ComparisonRequestBuilder might have added filters (typically for company id project id),
            # so we create new filter templates for this comparison subrequest.
            filter_sources = [
                subrequest.preaggregation_filters,
                subrequest.scope.preaggregation_filters,
            ]
            filter_templates = TelPlanner.get_preaggregation_filter_templates(
                ctx,
                filter_sources,
                taxon_manager.taxon_map,
                data_source,
            )

            subrequest_df = QueryBuilder.build_query(
                ctx,
                subrequest,
                subrequest_query_info,
                taxon_manager.used_taxons,
                dimension_templates=taxon_manager.plan.
                comparison_data_source_formula_templates[data_source],
                filter_templates=filter_templates,
                allowed_physical_data_sources=allowed_physical_data_sources,
            )
            subrequest_dataframes.append(subrequest_df)

        # Without any comparison subrequest there is nothing to blend.
        if len(subrequest_dataframes) == 0:
            return None

        # Blend all comparison dataframes into one
        # TODO pass down TelPlan for comparisons
        data_source_formula_templates = taxon_manager.plan.comparison_data_source_formula_templates
        dataframe = blend_dataframes(ctx, subrequest_dataframes,
                                     data_source_formula_templates)

        # Re-label every column: dimensions keep their slug, metrics get their comparison taxon.
        query = dataframe.query
        final_columns = []
        aliased_taxon_by_slug: Dict[TaxonExpressionStr, DataframeColumn] = {}
        for slug, df_column in dataframe.slug_to_column.items():
            original_taxon = df_column.taxon
            if original_taxon.is_dimension:
                new_taxon = original_taxon.copy(deep=True)
                new_slug = TaxonExpressionStr(f'{slug}')
            else:
                new_slug, new_taxon = BlendingTaxonManager.create_comparison_taxon(
                    original_taxon)

            source_column = query.c[safe_identifier(slug)]
            final_columns.append(
                source_column.label(new_taxon.slug_safe_sql_identifier))
            aliased_taxon_by_slug[new_slug] = DataframeColumn(
                new_slug, new_taxon, df_column.quantity_type)

        # Also select the dimension columns coming from the data source formula templates.
        final_columns.extend(
            literal_column(quote_identifier(pre_formula.label, ctx.dialect))
            for pre_formulas in data_source_formula_templates.values()
            for pre_formula in pre_formulas)

        renamed_cols_query = select(sort_columns(final_columns)).select_from(
            dataframe.query)
        return Dataframe(
            renamed_cols_query,
            aliased_taxon_by_slug,
            dataframe.used_model_names,
            dataframe.used_physical_data_sources,
        )
Ejemplo n.º 7
0
def blend_dataframes(
    ctx: HuskyQueryContext,
    dataframes: List[Dataframe],
    data_source_formula_templates: Optional[Dict[str, List[SqlFormulaTemplate]]] = None,
) -> Dataframe:
    """
    Blend the given dataframes into one dataframe.

    Every input query is aliased as ``q<index>`` and the queries are chained with full outer joins.
    On top of the join, dimensions are coalesced across the queries that contain them and grouped by,
    while metrics are summed.

    :param ctx: Husky query context; its dialect is used for quoting identifiers
    :param dataframes: Dataframes to blend; the first one is the base of the join chain
    :param data_source_formula_templates: Optional formula templates whose labels are selected and grouped by
    :return: Blended dataframe containing the taxons of all the inputs
    """
    slug_to_dataframes: Dict[TaxonExpressionStr, List[Dataframe]] = _prepare_slug_to_dataframes(dataframes)
    aliased_queries: Dict[Dataframe, Selectable] = {}
    used_model_names: Set[str] = set()
    used_physical_sources: Set[str] = set()
    for position, dataframe in enumerate(dataframes):
        # Each dataframe query gets the alias 'q<number>'
        aliased_queries[dataframe] = dataframe.query.alias(f'q{position}')
        used_model_names.update(dataframe.used_model_names)
        used_physical_sources.update(dataframe.used_physical_data_sources)

    selectors: List[TextClause] = []
    dimension_columns: List[ColumnClause] = []
    # Build the SQL selectors, one per taxon slug, in sorted slug order.
    for taxon_slug in sorted(slug_to_dataframes.keys()):
        owners = slug_to_dataframes[taxon_slug]
        taxon = owners[0].slug_to_column[taxon_slug].taxon
        taxon_column = quote_identifier(taxon.slug_safe_sql_identifier, ctx.dialect)
        # Fully qualified references ('<alias>.<column>') in every query that has this taxon.
        qualified_refs = [f'{aliased_queries[owner].name}.{taxon_column}' for owner in owners]

        if not taxon.is_dimension:
            if taxon.data_source:
                # Namespaced taxons are summed without coalesce (zeroifnull);
                # they use TEL expressions, where null is handled by TEL compilation.
                operands = qualified_refs
            else:
                operands = [f'coalesce({ref},0)' for ref in qualified_refs]
            summed = '+'.join(operands)
            selectors.append(text(f'sum({summed}) as {taxon_column}'))
            continue

        if len(qualified_refs) > 1:
            # Coalesce must have two or more args
            dimension_expr = functions.coalesce(*[literal_column(ref) for ref in qualified_refs])
        else:
            # A single source needs no coalesce
            dimension_expr = literal_column(qualified_refs[0])
        labelled_dimension = dimension_expr.label(taxon.slug_safe_sql_identifier)
        dimension_columns.append(labelled_dimension)
        selectors.append(labelled_dimension)

    final_columns: List[ColumnClause] = []
    if data_source_formula_templates:
        for pre_formulas in data_source_formula_templates.values():
            for pre_formula in pre_formulas:
                template_column = column(pre_formula.label)
                dimension_columns.append(template_column)
                selectors.append(template_column)
                final_columns.append(column(quote_identifier(pre_formula.label, ctx.dialect)))

    # All taxons in final DF
    final_slug_to_taxon: Dict[TaxonExpressionStr, DataframeColumn] = dataframes[0].slug_to_column.copy()

    # The sql alchemy compiler puts extra () around every select_from, so all queries are joined first
    # and the aggregation selectors are applied on top of the finished join afterwards.
    base_query = aliased_queries[dataframes[0]]
    join_query = base_query
    for dataframe_to_join in dataframes[1:]:
        # NOTE: these sources were already collected in the first loop; the call is kept as in the original.
        used_physical_sources.update(dataframe_to_join.used_physical_data_sources)
        final_slug_to_taxon = {**final_slug_to_taxon, **dataframe_to_join.slug_to_column}

        join_to = aliased_queries[dataframe_to_join]
        # Full outer join on the data source marker columns. Per PROD-8136 the condition is meant to
        # always evaluate to FALSE, effectively producing a UNION ALL without aligning columns.
        never_matching = (
            base_query.columns[HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME]
            == join_to.columns[HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME]
        )
        join_query = join_query.join(join_to, never_matching, full=True)

    aggregate_join_query = select(selectors).select_from(join_query)
    for dimension_column in dimension_columns:
        aggregate_join_query = aggregate_join_query.group_by(dimension_column)

    # One more wrapping select, so the alchemy query object has columns referencable via 'c' attribute.
    final_columns.extend(column(id_) for id_ in safe_identifiers_iterable(final_slug_to_taxon.keys()))
    query = select(sort_columns(final_columns)).select_from(aggregate_join_query)

    return Dataframe(query, final_slug_to_taxon, used_model_names, used_physical_sources)
Ejemplo n.º 8
0
def left_join_dataframes(
    ctx: HuskyQueryContext, data_dataframe: Dataframe, comparison_dataframe: Dataframe, tel_plan: TelPlan
) -> Dataframe:
    """
    Build a dataframe that is DATA_DF LEFT JOIN COMPARISON_DF.

    The join uses the TEL plan's comparison join columns. When the plan has none, the join condition
    is a constant ``1=1``.

    :param ctx: Husky query context
    :param data_dataframe: Left side of the join
    :param comparison_dataframe: Right side of the join; only its comparison taxons are selected
    :param tel_plan: Current TEL plan
    :return: Left joined dataframe
    """
    data_alias = 'data_dataframe'
    comparison_alias = 'comparison_dataframe'
    # Alias both queries so their columns can be referenced by prefix.
    data_table = data_dataframe.query.alias(data_alias)
    comparison_table = comparison_dataframe.query.alias(comparison_alias)

    # Union taxon slugs from both DFs.
    columns_by_slug = {**data_dataframe.slug_to_column, **comparison_dataframe.slug_to_column}

    # Columns are picked from a specific side (data or comparison) and labelled without the prefix,
    # since the names are already unique (from TEL planner).
    select_columns = set()
    select_columns.update(
        literal_column(f'data_dataframe.{safe_quote_identifier(slug, ctx.dialect)}').label(safe_identifier(slug))
        for slug in data_dataframe.slug_to_column.keys()
    )
    for slug, df_column in comparison_dataframe.slug_to_column.items():
        taxon: Taxon = df_column.taxon
        if not taxon.is_comparison_taxon:
            continue
        comparison_ref = literal_column(f'comparison_dataframe.{safe_quote_identifier(slug, ctx.dialect)}')
        select_columns.add(comparison_ref.label(safe_identifier(slug)))

    # The data source formula labels are selected explicitly from the data table.
    select_columns.update(data_table.c[template.label] for template in tel_plan.dimension_formulas)

    # Dimensions can be NULL and a plain equality does not match NULL with NULL in SQL,
    # so each join column also matches when both sides are NULL.
    join_on_conditions = [
        or_(
            data_table.c[join_column] == comparison_table.c[join_column],
            and_(data_table.c[join_column].is_(None), comparison_table.c[join_column].is_(None)).self_group(),
        )
        for join_column in tel_plan.comparison_join_columns
    ]

    if len(join_on_conditions) == 0:
        # Without comparison dimensions the comparison dataframe has no dimensions either (a single row),
        # so it can be joined unconditionally. 1=1 is the easiest way to express that in alchemy.
        join_on_conditions.append(text('1=1'))

    joined_tables = data_table.outerjoin(comparison_table, and_(*join_on_conditions))
    q = select(sort_columns(list(select_columns))).select_from(joined_tables)

    return Dataframe(
        q,
        columns_by_slug,
        data_dataframe.used_model_names | comparison_dataframe.used_model_names,
        data_dataframe.used_physical_data_sources | comparison_dataframe.used_physical_data_sources,
    )
Ejemplo n.º 9
0
    def query(
        cls,
        select_query: Select,
        taxon_model_info_map: Dict[str, TaxonModelInfo],
        projection_taxons: SlugExprTaxonMap,
        data_source: str,
        order_by: Optional[List[TaxonDataOrder]],
        limit: Optional[int],
        offset: Optional[int],
        used_physical_data_sources: Set[str],
        dimension_templates: Optional[List[SqlFormulaTemplate]] = None,
    ) -> Dataframe:
        """
        Generates the final projected dataframe

        :param select_query: Original query fetching all necessary fields
        :param taxon_model_info_map: Map of taxon slug expression to taxon model info
        :param projection_taxons: Taxons meant to be projected by the final query
        :param data_source: Virtual data source for this subrequest
        :param order_by: List of clauses for order by
        :param limit: Limit for the query
        :param offset: Offset for the query
        :param used_physical_data_sources: Physical data sources, stored on the returned dataframe
        :param dimension_templates: List of dimension templates

        :return: Final dataframe including all requested taxons
        """
        selectors = []
        group_by = []
        projected_df_columns: Dict[TaxonExpressionStr, DataframeColumn] = {}

        for taxon in projection_taxons.values():
            identifier = taxon.slug_safe_sql_identifier
            agg_type = taxon.tel_metadata_aggregation_type

            # Wrap the column in its aggregation function when one is known for this taxon.
            projected = column(identifier)
            if agg_type and agg_type in cls._AGGREGATION_FUNCTIONS_MAP:
                projected = cls._AGGREGATION_FUNCTIONS_MAP[agg_type](projected)
            projected = projected.label(identifier)

            # Non-calculated taxons with model info take their quantity type from it; others are scalar.
            has_model_info = not taxon.calculation and taxon.slug_expr in taxon_model_info_map
            quantity_type = (
                taxon_model_info_map[taxon.slug_expr].quantity_type if has_model_info else ValueQuantityType.scalar
            )
            df_column_name = TaxonExpressionStr(taxon.slug)
            projected_df_columns[df_column_name] = DataframeColumn(df_column_name, taxon, quantity_type)

            selectors.append(projected)
            # Taxons with a group-by aggregation type also go to the GROUP BY clause.
            if agg_type in cls._GROUP_BY_AGGREGATION_TYPES:
                group_by.append(projected)

        # Every dimension template is both selected and grouped by.
        for dim_template in dimension_templates or []:
            template_column = column(dim_template.label)
            selectors.append(template_column)
            group_by.append(template_column)

        # On purpose adding this value to emulate USING ON FALSE => PROD-8136
        selectors.append(literal(data_source).label(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME))
        # using literal_column here because some database engines do not like grouping by constant
        group_by.append(literal_column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME))

        ordered_selectors = sort_columns(selectors)
        order_clauses = [nullslast(ORDER_BY_FUNCTIONS[item.type](item.taxon)) for item in (order_by or [])]
        ordered_group_by = sort_columns(group_by)
        new_query = Select(
            columns=ordered_selectors,
            order_by=order_clauses,
            group_by=ordered_group_by,
        ).select_from(select_query)

        if limit is not None:
            new_query = new_query.limit(limit)
        if offset is not None:
            new_query = new_query.offset(offset)

        # collect names of all used models
        used_model_names = set()
        for model_info in taxon_model_info_map.values():
            if model_info.model_name is not None:
                used_model_names.add(model_info.model_name)

        return Dataframe(new_query, projected_df_columns, used_model_names, used_physical_data_sources)