def test_blending_2_2(self):
    q1 = Select(
        columns=[
            column('ad_id'),
            column('impressions'),
            column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME),
        ],
        from_obj=table('table1'),
    )
    df1 = Dataframe(q1, get_mocked_dataframe_columns_map(['ad_id', 'impressions']), set(), {'SF'})

    q2 = Select(
        columns=[
            column('ad_id'),
            column('campaign_id'),
            column('impressions'),
            column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME),
        ],
        from_obj=table('table2'),
    )
    df2 = Dataframe(q2, get_mocked_dataframe_columns_map(['ad_id', 'impressions', 'campaign_id']), set(), {'SF'})

    blended_df = blend_dataframes(SNOWFLAKE_HUSKY_CONTEXT, [df1, df2])

    self.write_test_expectations('query.sql', compile_query(blended_df.query))
    expected_query = self.read_test_expectations('query.sql')
    self.assertEqual(expected_query, compile_query(blended_df.query))
    self.assertEqual({'ad_id', 'impressions', 'campaign_id'}, set(blended_df.slug_to_column.keys()))
@classmethod
def project_dataframe(
    cls,
    calc_df: Dataframe,
    return_taxons: Dict[TaxonExpressionStr, Taxon],
    physical_data_sources: Set[str],
    order_by: Optional[List[TaxonDataOrder]] = None,
    limit: Optional[int] = None,
    offset: Optional[int] = None,
) -> Dataframe:
    """
    Applies, in this order:
    - column projection
    - ordering
    - limiting and offsetting
    """
    for order_by_rule in order_by or []:
        if order_by_rule.taxon not in return_taxons:
            raise InvalidRequest(
                'request.order_by',
                f'Taxon "{order_by_rule.taxon}" used in the order_by clause must also be selected.',
            )

    projected_sql_and_df_columns, final_query = cls._project_columns(calc_df.query, calc_df, return_taxons)
    final_query = final_query.select_from(calc_df.query)
    projected_df_columns = Dataframe.dataframe_columns_to_map([df_col for _, df_col in projected_sql_and_df_columns])

    if order_by:
        final_query = final_query.order_by(
            *[nullslast(ORDER_BY_FUNCTIONS[item.type](column(safe_identifier(item.taxon)))) for item in order_by]
        )
    if limit is not None:
        final_query = final_query.limit(limit)
    if offset is not None:
        final_query = final_query.offset(offset)

    return Dataframe(
        final_query,
        projected_df_columns,
        calc_df.used_model_names,
        used_physical_data_sources=physical_data_sources,
    )
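# A minimal, standalone sketch of the nulls-last ordering applied in project_dataframe, using plain
# SQLAlchemy only. ORDER_BY_FUNCTIONS_SKETCH is a hypothetical stand-in for the real ORDER_BY_FUNCTIONS
# map, and the 'impressions' column is an assumed example; nullslast/asc/desc are real SQLAlchemy calls.
from sqlalchemy import asc, desc, nullslast
from sqlalchemy.sql import column, select, table

ORDER_BY_FUNCTIONS_SKETCH = {'asc': asc, 'desc': desc}

def _ordering_sketch():
    query = select([column('impressions')]).select_from(table('table1'))
    # NULLS LAST keeps rows with missing values at the end regardless of sort direction,
    # mirroring the nullslast(ORDER_BY_FUNCTIONS[...](...)) expression above.
    return query.order_by(nullslast(ORDER_BY_FUNCTIONS_SKETCH['desc'](column('impressions'))))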
@classmethod
def build_comparison_query(
    cls,
    ctx: HuskyQueryContext,
    config_arg: BlendingDataRequest,
    taxon_manager: BlendingTaxonManager,
    override_mapping_manager: OverrideMappingManager,
    query_info: BlendingQueryInfo,
    allowed_physical_data_sources: Optional[Set[str]] = None,
) -> Optional[Dataframe]:
    comp_df = cls._build_comparison_blend_query(
        ctx, config_arg, taxon_manager, query_info, allowed_physical_data_sources=allowed_physical_data_sources
    )
    if comp_df is None or len(taxon_manager.plan.comparison_dimension_formulas) == 0:
        # There are no comparison dimension formulas, which means the rows are already grouped correctly.
        return comp_df

    comp_df = DimensionPhaseBuilder.calculate_dataframe(
        taxon_manager.plan.comparison_dimension_formulas,
        override_mapping_manager.comparison_override_mapping_tel_data,
        override_mapping_manager.cte_map,
        comp_df,
    )

    # After the dimension join, there could have been a merge (coalesce). We need to group by the merged column
    # once more to keep a single row per dimension; otherwise we would get row fanout when left joining with the
    # data dataframe.
    group_by_cols = []
    selectors = []
    for dim_formula in taxon_manager.plan.comparison_dimension_formulas:
        group_by_cols.append(column(dim_formula.label))

    for df_column in comp_df.slug_to_column.values():
        taxon = df_column.taxon
        col = column(df_column.name)
        if taxon.is_dimension:
            group_by_cols.append(col)
        else:
            agg_type = taxon.tel_metadata_aggregation_type
            agg_fn = None
            if agg_type:
                agg_fn = MetricPhaseBuilder.AGGREGATION_FUNCTIONS_MAP.get(agg_type)
            if agg_fn is None:
                raise UnsupportedAggregationType(taxon)
            col = agg_fn(col).label(df_column.name)
            selectors.append(col)

    selectors.extend(group_by_cols)
    query = select(sort_columns(selectors)).select_from(comp_df.query).group_by(*group_by_cols)
    return Dataframe(query, comp_df.slug_to_column, comp_df.used_model_names, comp_df.used_physical_data_sources)
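# A hedged sketch of the re-grouping performed above: after a dimension merge (coalesce), the same
# dimension value can appear on multiple rows, so metrics are re-aggregated to get back to one row
# per dimension value. All names here ('country', 'spend', 'comparison_blend') are illustrative
# assumptions, not part of the real TEL plan.
from sqlalchemy import func
from sqlalchemy.sql import column, select, table

def _regroup_sketch():
    sub = select([column('country'), column('spend')]).select_from(table('comparison_blend')).alias('sub')
    dim = sub.c['country']
    metric = func.sum(sub.c['spend']).label('spend')
    # One row per dimension value again, so a later LEFT JOIN cannot fan out.
    return select([dim, metric]).select_from(sub).group_by(dim)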
@classmethod
def calculate_dataframe(
    cls,
    dimension_formulas: List[PreFormula],
    override_mappings_tel_data: OverrideMappingTelData,
    override_mapping_cte_map: Dict[OverrideMappingSlug, Select],
    df: Dataframe,
) -> Dataframe:
    select_columns = []
    select_columns.extend(df.query.columns)

    for dim_formula in dimension_formulas:
        col = dim_formula.formula.label(dim_formula.label)
        select_columns.append(col)

    # Add joins to the relevant override mapping CTEs.
    select_from_query = OverrideMappingSql.insert_cte_joins(
        df.query, override_mappings_tel_data, override_mapping_cte_map
    )

    query = Select(columns=sort_columns(select_columns)).select_from(select_from_query)
    return Dataframe(query, df.slug_to_column, df.used_model_names, df.used_physical_data_sources)
def calculate_dataframe(
    self,
    ctx: HuskyQueryContext,
    df: Dataframe,
    physical_data_sources: Set[str],
    grouping_sets: Optional[GroupingSets] = None,
    filter_clause: Optional[FilterClause] = None,
) -> Dataframe:
    """
    Applies, in this order:
    - pre-aggregation logic
    - aggregation by group by or grouping sets
    - optional window function aggregation step
    - post-aggregation logic
    - filters. Filters are applied here to simplify the final query and to apply filtering before filling date gaps.
    """
    pre_agg_columns = []  # Columns with the aggregation function applied in the aggregation step.
    # Columns to select from the window step - columns that are not removed and don't need the window step.
    select_from_window_step: List[ColumnClause] = []
    df_columns: List[DataframeColumn] = []  # Final df columns after all steps.
    group_columns = []
    final_columns: List[ColumnClause] = []

    for pre_formula in self.taxon_manager.plan.metric_pre:
        col = pre_formula.formula.label(pre_formula.label)
        aggregation_fn = self.AGGREGATION_FUNCTIONS_MAP.get(pre_formula.aggregation.type)
        if aggregation_fn:
            # We know the aggregation function, so use it.
            pre_agg_columns.append(aggregation_fn(col).label(pre_formula.label))
        else:
            # If no aggregation function is defined, simply group by this formula.
            group_columns.append(col)
            select_from_window_step.append(col)

    # Taxon slugs used in the group by clause.
    dimension_taxon_slugs = {group_column.name for group_column in group_columns}

    for post_formula, taxon in self.taxon_manager.plan.metric_post:
        post_formula_sql = post_formula.render_formula(ctx.dialect, dimension_taxon_slugs)
        col = post_formula_sql.label(taxon.slug_safe_sql_identifier)
        final_columns.append(col)
        df_columns.append(DataframeColumn(taxon.slug_expr, taxon))

    # Aggregation query with column logic. This is the first aggregation step: a regular group by,
    # or a common table expression with multiple group by statements in the case of grouping sets.
    pre_query = self._add_aggregation(df.query, pre_agg_columns, group_columns, grouping_sets)

    # Post-aggregation logic.
    post_query = Select(columns=sort_columns(final_columns)).select_from(pre_query)

    slug_to_column = Dataframe.dataframe_columns_to_map(df_columns)
    if filter_clause:
        taxon_model_info = {
            str(slug): TaxonModelInfo(safe_quote_identifier(slug, ctx.dialect)) for slug in slug_to_column.keys()
        }
        post_query = FilterBuilder.augment_query(ctx, post_query, taxon_model_info, filter_clause)

    return Dataframe(post_query, slug_to_column, df.used_model_names, physical_data_sources)
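# A sketch of the two aggregation shapes _add_aggregation chooses between, assuming SQLAlchemy's
# func.grouping_sets (available since SQLAlchemy 1.2). The source table and column names are
# illustrative assumptions, not the real plan's labels.
from sqlalchemy import func
from sqlalchemy.sql import column, select, table

def _aggregation_sketch(use_grouping_sets: bool):
    src = table('pre_agg_source')
    dims = [column('ad_id'), column('campaign_id')]
    metric = func.sum(column('spend')).label('spend')
    query = select(dims + [metric]).select_from(src)
    if use_grouping_sets:
        # Several group-by combinations computed in a single pass.
        return query.group_by(func.grouping_sets(*dims))
    # Plain GROUP BY over all dimension columns.
    return query.group_by(*dims)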
@classmethod
def _build_comparison_blend_query(
    cls,
    ctx: HuskyQueryContext,
    config_arg: BlendingDataRequest,
    taxon_manager: BlendingTaxonManager,
    query_info: BlendingQueryInfo,
    allowed_physical_data_sources: Optional[Set[str]] = None,
) -> Optional[Dataframe]:
    """
    Builds a comparison query for each subrequest and then blends them all into one comparison dataframe.
    """
    dataframes = []
    config = BlendingDataRequest(config_arg.to_native())  # Clone, because we will be modifying the subrequests.
    assert config.comparison, 'Comparison must be defined when trying to build a comparison query.'
    comparison: ComparisonConfig = config.comparison

    for _subrequest in config.data_subrequests:
        subrequest = cls._build_comparison_subrequest(_subrequest, comparison, taxon_manager)
        data_source = subrequest.properties.data_source

        # If no comparison taxons were found for this subrequest, skip creating a comparison query for it as well.
        if len(subrequest.taxons) == 0:
            continue

        bm_sub_query_info = QueryInfo.create(subrequest)
        query_info.comparison_subrequests_info.append(bm_sub_query_info)

        # Build the comparison dataframe and add it to the list.
        # TODO pass down TelPlan for comparisons
        # ComparisonRequestBuilder might have added filters (typically for company id and project id),
        # so we create new filter templates for this comparison subrequest.
        filter_templates = TelPlanner.get_preaggregation_filter_templates(
            ctx,
            [subrequest.preaggregation_filters, subrequest.scope.preaggregation_filters],
            taxon_manager.taxon_map,
            data_source,
        )

        dataframes.append(
            QueryBuilder.build_query(
                ctx,
                subrequest,
                bm_sub_query_info,
                taxon_manager.used_taxons,
                dimension_templates=taxon_manager.plan.comparison_data_source_formula_templates[data_source],
                filter_templates=filter_templates,
                allowed_physical_data_sources=allowed_physical_data_sources,
            )
        )

    # If no comparison subrequests were created, there is no need to blend dataframes.
    if len(dataframes) == 0:
        return None

    # Blend all comparison dataframes into one.
    # TODO pass down TelPlan for comparisons
    data_source_formula_templates = taxon_manager.plan.comparison_data_source_formula_templates
    dataframe = blend_dataframes(ctx, dataframes, data_source_formula_templates)

    # Prefix all comparison metric columns with 'comparison@' and create a comparison taxon for each.
    query = dataframe.query
    final_columns = []
    aliased_taxon_by_slug: Dict[TaxonExpressionStr, DataframeColumn] = dict()
    for slug, df_column in dataframe.slug_to_column.items():
        # Alias metrics with the comparison@ prefix; select dimensions as-is.
        if df_column.taxon.is_dimension:
            new_taxon = df_column.taxon.copy(deep=True)
            new_slug = TaxonExpressionStr(f'{slug}')
        else:
            new_slug, new_taxon = BlendingTaxonManager.create_comparison_taxon(df_column.taxon)

        final_columns.append(query.c[safe_identifier(slug)].label(new_taxon.slug_safe_sql_identifier))
        aliased_taxon_by_slug[new_slug] = DataframeColumn(new_slug, new_taxon, df_column.quantity_type)

    for pre_formulas in data_source_formula_templates.values():
        # Also select the dimension columns from the dimension templates.
        for pre_formula in pre_formulas:
            final_columns.append(literal_column(quote_identifier(pre_formula.label, ctx.dialect)))

    renamed_cols_query = select(sort_columns(final_columns)).select_from(dataframe.query)
    return Dataframe(
        renamed_cols_query, aliased_taxon_by_slug, dataframe.used_model_names, dataframe.used_physical_data_sources
    )
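# A hedged sketch of the comparison relabeling above: metric columns get a comparison prefix while
# dimension columns keep their slug. The real prefix comes from BlendingTaxonManager.create_comparison_taxon
# plus the safe-identifier transform; both are approximated here with plain strings, and all names
# are illustrative assumptions.
from sqlalchemy.sql import column, select, table

def _comparison_prefix_sketch():
    blended = select([column('ad_id'), column('impressions')]).select_from(table('blended')).alias('b')
    final_columns = [
        blended.c['ad_id'].label('ad_id'),  # dimension: name unchanged
        blended.c['impressions'].label('comparison_impressions'),  # metric: prefixed
    ]
    return select(final_columns).select_from(blended)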
def blend_dataframes(
    ctx: HuskyQueryContext,
    dataframes: List[Dataframe],
    data_source_formula_templates: Optional[Dict[str, List[SqlFormulaTemplate]]] = None,
) -> Dataframe:
    """
    Produces a new blended dataframe from all the given dataframes, joined on all dimensions that appear
    at least twice in different dataframes.
    """
    slug_to_dataframes: Dict[TaxonExpressionStr, List[Dataframe]] = _prepare_slug_to_dataframes(dataframes)
    dataframe_to_query: Dict[Dataframe, Selectable] = dict()
    used_model_names: Set[str] = set()
    used_physical_sources: Set[str] = set()

    for idx, df in enumerate(dataframes):
        # Create a query for each dataframe, aliased as 'q<number>'.
        dataframe_to_query[df] = df.query.alias(f'q{idx}')
        used_model_names.update(df.used_model_names)
        used_physical_sources.update(df.used_physical_data_sources)

    selectors: List[TextClause] = []
    dimension_columns: List[ColumnClause] = []
    # Prepare the list of SQL selectors. For a metric, emit sum(q0.metric + q1.metric + ...);
    # for a dimension, coalesce the column across all queries that select it.
    for taxon_slug in sorted(slug_to_dataframes.keys()):
        dataframes_with_slug = slug_to_dataframes[taxon_slug]
        taxon = dataframes_with_slug[0].slug_to_column[taxon_slug].taxon
        taxon_column = quote_identifier(taxon.slug_safe_sql_identifier, ctx.dialect)
        query_aliases = [dataframe_to_query[df].name for df in dataframes_with_slug]
        if taxon.is_dimension:
            if len(query_aliases) > 1:
                # Coalesce requires two or more arguments.
                dimension_coalesce = functions.coalesce(
                    *[literal_column(f'{query_alias}.{taxon_column}') for query_alias in query_aliases]
                )
            else:
                # No need to coalesce a single column.
                dimension_coalesce = literal_column(f'{query_aliases[0]}.{taxon_column}')
            col = dimension_coalesce.label(taxon.slug_safe_sql_identifier)
            dimension_columns.append(col)
            selectors.append(col)
        else:
            if taxon.data_source:
                # Do not use coalesce (aka zeroifnull) when summing namespaced taxons.
                # They are using a TEL expression, where null is handled by TEL compilation.
                summed = '+'.join([f'{query_alias}.{taxon_column}' for query_alias in query_aliases])
            else:
                summed = '+'.join([f'coalesce({query_alias}.{taxon_column},0)' for query_alias in query_aliases])
            selectors.append(text(f'sum({summed}) as {taxon_column}'))

    final_columns: List[ColumnClause] = []
    if data_source_formula_templates:
        for pre_formulas in data_source_formula_templates.values():
            for pre_formula in pre_formulas:
                col = column(pre_formula.label)
                dimension_columns.append(col)
                selectors.append(col)
                final_columns.append(column(quote_identifier(pre_formula.label, ctx.dialect)))

    # All taxons in the final dataframe.
    final_slug_to_taxon: Dict[TaxonExpressionStr, DataframeColumn] = dataframes[0].slug_to_column.copy()

    # Because the SQLAlchemy compiler puts extra () around every select used as a FROM clause, we first
    # join all queries and then define the aggregation selectors (right after this for loop).
    join_query = dataframe_to_query[dataframes[0]]
    for i in range(1, len(dataframes)):
        # Iterate over the dataframes and do a full outer join on FALSE, effectively a UNION ALL without
        # the need to align all columns.
        dataframe_to_join = dataframes[i]
        used_physical_sources.update(dataframe_to_join.used_physical_data_sources)
        final_slug_to_taxon = {**final_slug_to_taxon, **dataframe_to_join.slug_to_column}
        join_from = join_query
        join_to = dataframe_to_query[dataframe_to_join]
        # On purpose joining on a condition that always evaluates to FALSE => PROD-8136
        join_query = join_from.join(
            join_to,
            dataframe_to_query[dataframes[0]].columns[HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME]
            == join_to.columns[HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME],
            full=True,
        )

    aggregate_join_query = select(selectors).select_from(join_query)
    for dimension_column in dimension_columns:
        aggregate_join_query = aggregate_join_query.group_by(dimension_column)

    # Wrap it in one more select, so the alchemy query object has columns referencable via the 'c' attribute.
    final_columns.extend(column(id_) for id_ in safe_identifiers_iterable(final_slug_to_taxon.keys()))
    query = select(sort_columns(final_columns)).select_from(aggregate_join_query)
    return Dataframe(query, final_slug_to_taxon, used_model_names, used_physical_sources)
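# A reduced sketch of the blend above for two dataframes, in plain SQLAlchemy. It shows the full
# outer join on a never-true condition (the per-query data source literals differ), which behaves
# like UNION ALL without having to align columns; table and column names are assumptions.
from sqlalchemy.sql import column, functions, literal_column, select, table, text

def _blend_sketch():
    q0 = select([column('ad_id'), column('impressions'), column('ds')]).select_from(table('t0')).alias('q0')
    q1 = select([column('ad_id'), column('impressions'), column('ds')]).select_from(table('t1')).alias('q1')
    # The 'ds' literals differ per query, so this join condition is never satisfied.
    joined = q0.join(q1, q0.c['ds'] == q1.c['ds'], full=True)
    dim = functions.coalesce(literal_column('q0.ad_id'), literal_column('q1.ad_id')).label('ad_id')
    metric = text('sum(coalesce(q0.impressions,0)+coalesce(q1.impressions,0)) as impressions')
    return select([dim, metric]).select_from(joined).group_by(dim)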
def left_join_dataframes(
    ctx: HuskyQueryContext, data_dataframe: Dataframe, comparison_dataframe: Dataframe, tel_plan: TelPlan
) -> Dataframe:
    """
    Produces a new DF that is DATA_DF LEFT JOIN COMPARISON_DF on the given list of taxons.

    :param ctx: Husky query context
    :param data_dataframe: df to left join to
    :param comparison_dataframe: other df
    :param tel_plan: current TEL plan
    :return: left-joined dataframe
    """
    # Alias their queries to be able to easily reference them.
    data_table = data_dataframe.query.alias('data_dataframe')
    comparison_table = comparison_dataframe.query.alias('comparison_dataframe')
    # Union taxon slugs from both DFs.
    columns_by_slug = {**data_dataframe.slug_to_column, **comparison_dataframe.slug_to_column}
    select_columns = set()
    # Select the columns from the specific dataframe (data or comparison), but label them to remove that
    # prefix, since the names are already unique (from the TEL planner).
    for slug in data_dataframe.slug_to_column.keys():
        select_columns.add(
            literal_column(f'data_dataframe.{safe_quote_identifier(slug, ctx.dialect)}').label(safe_identifier(slug))
        )
    for slug, df_column in comparison_dataframe.slug_to_column.items():
        taxon: Taxon = df_column.taxon
        if taxon.is_comparison_taxon:
            select_columns.add(
                literal_column(f'comparison_dataframe.{safe_quote_identifier(slug, ctx.dialect)}').label(
                    safe_identifier(slug)
                )
            )
    join_on_conditions = []

    for template in tel_plan.dimension_formulas:
        # Select the data source formula labels explicitly from the data table.
        select_columns.add(data_table.c[template.label])

    for join_column in tel_plan.comparison_join_columns:
        join_on_conditions.append(
            # Account for dimensions that can have NULL values. Because NULL = NULL evaluates to FALSE in SQL,
            # a second condition comparing both columns to IS NULL needs to be added.
            or_(
                data_table.c[join_column] == comparison_table.c[join_column],
                and_(data_table.c[join_column].is_(None), comparison_table.c[join_column].is_(None)).self_group(),
            )
        )

    if len(join_on_conditions) == 0:
        # If no comparison dimensions were defined, the comparison dataframe also has no dimensions
        # (thus it is one row) and we can safely join it to the data dataframe without an ON clause.
        # Using 1=1 as the easiest way to do a join without an ON clause in alchemy.
        join_on_conditions.append(text('1=1'))

    q = select(sort_columns(list(select_columns))).select_from(
        data_table.outerjoin(comparison_table, and_(*join_on_conditions))
    )
    return Dataframe(
        q,
        columns_by_slug,
        data_dataframe.used_model_names | comparison_dataframe.used_model_names,
        data_dataframe.used_physical_data_sources | comparison_dataframe.used_physical_data_sources,
    )
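# A standalone sketch of the NULL-safe join condition built above, assuming two aliased tables and
# an assumed join column 'country'; only plain SQLAlchemy constructs are used.
from sqlalchemy import and_, or_
from sqlalchemy.sql import column, select, table

def _null_safe_join_sketch():
    data = select([column('country')]).select_from(table('t_data')).alias('data_dataframe')
    comp = select([column('country')]).select_from(table('t_comp')).alias('comparison_dataframe')
    condition = or_(
        data.c['country'] == comp.c['country'],
        and_(data.c['country'].is_(None), comp.c['country'].is_(None)).self_group(),
    )
    # Renders roughly as: data.country = comp.country OR (data.country IS NULL AND comp.country IS NULL),
    # so rows whose dimension is NULL on both sides still match.
    return select([data.c['country']]).select_from(data.outerjoin(comp, condition))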
@classmethod
def query(
    cls,
    select_query: Select,
    taxon_model_info_map: Dict[str, TaxonModelInfo],
    projection_taxons: SlugExprTaxonMap,
    data_source: str,
    order_by: Optional[List[TaxonDataOrder]],
    limit: Optional[int],
    offset: Optional[int],
    used_physical_data_sources: Set[str],
    dimension_templates: Optional[List[SqlFormulaTemplate]] = None,
) -> Dataframe:
    """
    Generates the final projected dataframe.

    :param select_query: Original query fetching all necessary fields
    :param taxon_model_info_map: Map of taxon slug expression to taxon model info
    :param projection_taxons: List of taxons meant to be projected by the final query
    :param data_source: Virtual data source for this subrequest
    :param order_by: List of clauses for order by
    :param limit: Limit for the query
    :param offset: Offset for the query
    :param dimension_templates: List of dimension templates
    :return: Final dataframe including all requested taxons
    """
    group_by = []
    selectors = []

    projected_df_columns: Dict[TaxonExpressionStr, DataframeColumn] = {}
    for taxon in projection_taxons.values():
        # Apply the aggregation function, if one is needed.
        agg_type = taxon.tel_metadata_aggregation_type
        if agg_type and agg_type in cls._AGGREGATION_FUNCTIONS_MAP:
            col = cls._AGGREGATION_FUNCTIONS_MAP[agg_type](column(taxon.slug_safe_sql_identifier))
        else:
            col = column(taxon.slug_safe_sql_identifier)

        col = col.label(taxon.slug_safe_sql_identifier)

        # Create the appropriate dataframe column.
        value_quality_type = ValueQuantityType.scalar
        if not taxon.calculation and taxon.slug_expr in taxon_model_info_map:
            value_quality_type = taxon_model_info_map[taxon.slug_expr].quantity_type
        df_column_name = TaxonExpressionStr(taxon.slug)
        projected_df_columns[df_column_name] = DataframeColumn(df_column_name, taxon, value_quality_type)

        # Make sure we select this column in the query.
        selectors.append(col)

        # Check whether this taxon should be in the group by clause.
        if agg_type in cls._GROUP_BY_AGGREGATION_TYPES:
            group_by.append(col)

    # Make sure we select all columns for dimension templates.
    for dim_template in dimension_templates or []:
        col = column(dim_template.label)
        selectors.append(col)
        # We should group by all dimension templates.
        group_by.append(col)

    # On purpose adding this value to emulate USING ON FALSE => PROD-8136
    selectors.append(literal(data_source).label(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME))
    # Using literal_column here because some database engines do not like grouping by a constant.
    group_by.append(literal_column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME))

    # Create the query.
    new_query = Select(
        columns=sort_columns(selectors),
        order_by=[nullslast(ORDER_BY_FUNCTIONS[item.type](item.taxon)) for item in (order_by or [])],
        group_by=sort_columns(group_by),
    ).select_from(select_query)

    if limit is not None:
        new_query = new_query.limit(limit)
    if offset is not None:
        new_query = new_query.offset(offset)

    # Collect the names of all used models.
    used_model_names = {
        model_info.model_name for model_info in taxon_model_info_map.values() if model_info.model_name is not None
    }

    return Dataframe(new_query, projected_df_columns, used_model_names, used_physical_data_sources)
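# A small sketch of why the code selects literal(...) but groups by literal_column(...): grouping by
# the rendered column name avoids emitting GROUP BY <bind parameter>, which some engines reject.
# The table and column names here are assumptions for illustration.
from sqlalchemy import literal
from sqlalchemy.sql import column, literal_column, select, table

def _constant_group_by_sketch():
    query = select([column('ad_id'), literal('SF').label('ds')]).select_from(table('t'))
    # GROUP BY ds (the identifier), not GROUP BY 'SF' (the bound constant).
    return query.group_by(column('ad_id'), literal_column('ds'))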