Ejemplo n.º 1
0
 def test_blending_2_2(self):
     q1 = Select(
         columns=[
             column('ad_id'),
             column('impressions'),
             column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME)
         ],
         from_obj=table('table1'),
     )
     df1 = Dataframe(
         q1, get_mocked_dataframe_columns_map(['ad_id', 'impressions']),
         set(), {'SF'})
     q2 = Select(
         columns=[
             column('ad_id'),
             column('campaign_id'),
             column('impressions'),
             column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME),
         ],
         from_obj=table('table2'),
     )
     df2 = Dataframe(
         q2,
         get_mocked_dataframe_columns_map(
             ['ad_id', 'impressions', 'campaign_id']), set(), {'SF'})
     blended_df = blend_dataframes(SNOWFLAKE_HUSKY_CONTEXT, [df1, df2])
     self.write_test_expectations('query.sql',
                                  compile_query(blended_df.query))
     expected_query = self.read_test_expectations('query.sql')
     self.assertEqual(expected_query, compile_query(blended_df.query))
     self.assertEqual({'ad_id', 'impressions', 'campaign_id'},
                      set(blended_df.slug_to_column.keys()))
Ejemplo n.º 2
0
    def _build_data_blend_query(
        cls,
        ctx: HuskyQueryContext,
        taxon_manager: BlendingTaxonManager,
        config_arg: BlendingDataRequest,
        query_info: BlendingQueryInfo,
    ) -> Dataframe:
        """
        Builds subquery for each subrequest and then blends them all into one dataframe.
        :param ctx: Husky query context
        """
        dataframes = []
        request = BlendingDataRequest(config_arg.to_native(
        ))  # Clone, coz we will be modifying subqueries
        for subrequest in request.data_subrequests:
            # add comparison taxons to data subrequest taxons
            subrequest.taxons = taxon_manager.get_subrequest_taxons(subrequest)
            sub_query_info = QueryInfo({
                'used_raw_taxons': subrequest.taxons,
            })
            query_info.subrequests_info.append(sub_query_info)

            # Build query for subrequest and add it to the list
            data_source = subrequest.properties.data_source
            dimension_templates = taxon_manager.plan.data_source_formula_templates[
                data_source]
            filter_templates = taxon_manager.plan.data_source_filter_templates[
                data_source]
            df = MainQueryBuilder.build_query(
                ctx,
                subrequest.to_internal_model(),
                sub_query_info,
                taxon_manager.used_taxons,
                dimension_templates,
                filter_templates=filter_templates,
                allowed_physical_data_sources=set(
                    request.physical_data_sources)
                if request.physical_data_sources else None,
            )
            dataframes.append(df)

        return blend_dataframes(
            ctx, dataframes, taxon_manager.plan.data_source_formula_templates)
    def _build_comparison_blend_query(
        cls,
        ctx: HuskyQueryContext,
        config_arg: BlendingDataRequest,
        taxon_manager: BlendingTaxonManager,
        query_info: BlendingQueryInfo,
        allowed_physical_data_sources: Optional[Set[str]] = None,
    ) -> Optional[Dataframe]:
        """
        Builds comparison query for each subrequest and then blends them all into one comparison dataframe.
        """
        dataframes = []
        config = BlendingDataRequest(config_arg.to_native(
        ))  # Clone, coz we will be modifying subqueries
        assert config.comparison, 'Comparison must be defined when trying to build comparison query..'
        comparison: ComparisonConfig = config.comparison
        for _subrequest in config.data_subrequests:
            subrequest = cls._build_comparison_subrequest(
                _subrequest, comparison, taxon_manager)
            data_source = subrequest.properties.data_source

            # if no comparison taxons were found for this subrequest, skip creating comparison query for it as well
            if len(subrequest.taxons) == 0:
                continue

            bm_sub_query_info = QueryInfo.create(subrequest)
            query_info.comparison_subrequests_info.append(bm_sub_query_info)
            # Build comparison dataframe and add it to a list.
            # TODO pass down TelPlan for comparisons
            # ComparisonRequestBuilder might have added filters (typically for company id project id)
            # Me create new filter templates for this comparison subrequest.
            filter_templates = TelPlanner.get_preaggregation_filter_templates(
                ctx,
                [
                    subrequest.preaggregation_filters,
                    subrequest.scope.preaggregation_filters
                ],
                taxon_manager.taxon_map,
                data_source,
            )

            dataframes.append(
                QueryBuilder.build_query(
                    ctx,
                    subrequest,
                    bm_sub_query_info,
                    taxon_manager.used_taxons,
                    dimension_templates=taxon_manager.plan.
                    comparison_data_source_formula_templates[data_source],
                    filter_templates=filter_templates,
                    allowed_physical_data_sources=allowed_physical_data_sources,
                ))

        # if no comparison subrequests were created, there is no need to blend data frames
        if len(dataframes) == 0:
            return None

        # Blend all comparison dataframes into one
        # TODO pass down TelPlan for comparisons
        data_source_formula_templates = taxon_manager.plan.comparison_data_source_formula_templates
        dataframe = blend_dataframes(ctx, dataframes,
                                     data_source_formula_templates)

        # Prefix all comparison metric columns with 'comparison@' and create comparison taxon for it.
        query = dataframe.query
        final_columns = []
        aliased_taxon_by_slug: Dict[TaxonExpressionStr,
                                    DataframeColumn] = dict()
        for slug, df_column in dataframe.slug_to_column.items():
            # Alias metrics with comparison@ prefix, and select dimensions..
            if df_column.taxon.is_dimension:
                new_taxon = df_column.taxon.copy(deep=True)
                new_slug = TaxonExpressionStr(f'{slug}')
            else:
                new_slug, new_taxon = BlendingTaxonManager.create_comparison_taxon(
                    df_column.taxon)

            final_columns.append(query.c[safe_identifier(slug)].label(
                new_taxon.slug_safe_sql_identifier))
            aliased_taxon_by_slug[new_slug] = DataframeColumn(
                new_slug, new_taxon, df_column.quantity_type)
        for pre_formulas in data_source_formula_templates.values():
            # and also select the dim columns from dim templates.
            for pre_formula in pre_formulas:
                final_columns.append(
                    literal_column(
                        quote_identifier(pre_formula.label, ctx.dialect)))
        renamed_cols_query = select(sort_columns(final_columns)).select_from(
            dataframe.query)
        return Dataframe(renamed_cols_query, aliased_taxon_by_slug,
                         dataframe.used_model_names,
                         dataframe.used_physical_data_sources)