def test_simple_aggregate(sample_df): step = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='sum', columns=['Value1', 'Value2'], newcolumns=['Sum-Value1', 'Sum-Value2'], ), Aggregation(aggfunction='avg', columns=['Value1'], newcolumns=['Avg-Value1']), ], ) df_result = execute_aggregate(step, sample_df) assert_dataframes_equals( df_result.sort_values(by=['Group']), DataFrame({ 'Group': ['Group 1', 'Group 2'], 'Sum-Value1': [40, 16], 'Sum-Value2': [35, 31], 'Avg-Value1': [np.average([13, 7, 20]), np.average([1, 10, 5])], }).sort_values(by=['Group']), )
def test_with_group_by_with_aggregation_with_last_with_first( query, sql_query_describer): step = AggregateStep( name='aggregate', on=['category'], aggregations=[ Aggregation(aggfunction='last', columns=['Label'], newcolumns=['last_label']), Aggregation(aggfunction='first', columns=['title'], newcolumns=['first_title']), Aggregation(aggfunction='sum', columns=['title'], newcolumns=['sum_title']), ], ) sql_query = translate_aggregate(step, query, index=1, sql_query_describer=sql_query_describer) assert ( sql_query.transformed_query == 'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT A.*, F.first_title, F.last_label FROM (SELECT SUM(title) AS sum_title, category FROM SELECT_STEP_0 GROUP BY category) A INNER JOIN (SELECT first_title, last_label, category FROM (SELECT title AS first_title, Label AS last_label, category, ROW_NUMBER() OVER (PARTITION BY category ORDER BY title, Label DESC) AS R FROM SELECT_STEP_0 QUALIFY R = 1)) F ON A.category=F.category)' )
def test_with_original_granularity_multiple_aggregations(sample_df): step = AggregateStep( name='aggregate', keepOriginalGranularity=True, on=['Group'], aggregations=[ Aggregation(aggfunction='min', columns=['Value1'], newcolumns=['min_Value1']), Aggregation(aggfunction='max', columns=['Value2'], newcolumns=['max_Value2']), ], ) df_result = execute_aggregate(step, sample_df) assert_dataframes_equals( df_result, DataFrame({ 'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'], 'Group': ['Group 1'] * 3 + ['Group 2'] * 3, 'Value1': [13, 7, 20, 1, 10, 5], 'min_Value1': [7] * 3 + [1] * 3, 'Value2': [10, 21, 4, 17, 12, 2], 'max_Value2': [21] * 3 + [17] * 3, }), )
def test_totals_2(): sample_df = pd.DataFrame({ 'COUNTRY': ['France', 'USA'] * 4, 'PRODUCT': (['product A'] * 2 + ['product B'] * 2) * 2, 'YEAR': ['2019'] * 4 + ['2020'] * 4, 'VALUE_1': [5, 10, 10, 15, 20, 20, 30, 25], 'VALUE_2': [50, 100, 100, 150, 200, 200, 300, 250], }) step = TotalsStep( name='totals', totalDimensions=[ TotalDimension(total_column='COUNTRY', total_rows_label='All countries'), TotalDimension(total_column='PRODUCT', total_rows_label='All products'), ], aggregations=[ Aggregation( columns=['VALUE_1', 'VALUE_2'], aggfunction='sum', newcolumns=['VALUE_1-sum', 'VALUE_2'], ), Aggregation(columns=['VALUE_1'], aggfunction='avg', newcolumns=['VALUE_1-avg']), ], groups=['YEAR'], ) expected_result = sample_df.copy() expected_result['VALUE_1-sum'] = expected_result[ 'VALUE_1-avg'] = expected_result['VALUE_1'] del expected_result['VALUE_1'] expected_result = pd.concat([ expected_result, pd.DataFrame({ 'COUNTRY': ['USA', 'France'] * 2 + ['All countries'] * 6, 'PRODUCT': ['All products'] * 4 + ['product B', 'product A'] * 2 + ['All products'] * 2, 'YEAR': (['2020'] * 2 + ['2019'] * 2) * 2 + ['2020', '2019'], 'VALUE_2': [450, 500, 250, 150, 550, 400, 250, 150, 950, 400], 'VALUE_1-sum': [45, 50, 25, 15, 55, 40, 25, 15, 95, 40], 'VALUE_1-avg': [22.5, 25, 12.5, 7.5, 27.5, 20, 12.5, 7.5, 23.75, 10], }), ]) real_result = execute_totals(step, sample_df) real_sorted = real_result.sort_values(by=real_result.columns.tolist()) expected_sorted = expected_result.sort_values( by=expected_result.columns.tolist()) assert_dataframes_equals(real_sorted, expected_sorted)
def test_to_dict(): pipeline = Pipeline( steps=[ DomainStep(name='domain', domain='foobar'), RollupStep( name='rollup', hierarchy=['a', 'b'], aggregations=[Aggregation(newcolumns=['a'], aggfunction='sum', columns=['a'])], ), ] ) actual_dict = pipeline.dict() expected_dict = { 'steps': [ {'domain': 'foobar', 'name': 'domain'}, { 'name': 'rollup', 'hierarchy': ['a', 'b'], 'aggregations': [{'new_columns': ['a'], 'agg_function': 'sum', 'columns': ['a']}], }, ] } assert actual_dict == expected_dict assert pipeline == Pipeline(**pipeline.dict())
def test_count_with_null(): df = DataFrame({ 'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'], 'Group': ['Group 1'] * 3 + [None] * 3, # type: ignore 'Value1': [13, 7, 20, 1, 10, 5], }) step = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='count distinct including empty', columns=['Group'], newcolumns=['__VQB_COUNT'], ), ], ) df_result = execute_aggregate(step, df) assert_dataframes_equals( df_result.sort_values(by=['Group']), DataFrame({ 'Group': ['Group 1', np.nan], '__VQB_COUNT': [3, 3], }).sort_values(by=['Group']), )
def test_simple_aggregate_with_null(): df = DataFrame({ 'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'], 'Group': ['Group 1'] * 3 + [None] * 3, # type: ignore 'Value1': [13, 7, 20, 1, 10, 5], }) step = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='sum', columns=['Value1'], newcolumns=['Sum-Value1'], ), ], ) df_result = execute_aggregate(step, df) assert_dataframes_equals( df_result.sort_values(by=['Group']), DataFrame({ 'Group': ['Group 1', None], 'Sum-Value1': [40, 16], }).sort_values(by=['Group']), )
def test_single_totals_without_groups(): sample_df = pd.DataFrame({ 'COUNTRY': ['France', 'USA'] * 4, 'PRODUCT': (['PRODUCT A'] * 2 + ['PRODUCT B'] * 2) * 2, 'YEAR': ['2019'] * 4 + ['2020'] * 4, 'VALUE': [5, 10, 10, 15, 20, 20, 30, 25], }) step = TotalsStep( name='totals', totalDimensions=[ TotalDimension(total_column='COUNTRY', total_rows_label='All countries') ], aggregations=[ Aggregation(columns=['VALUE'], aggfunction='sum', newcolumns=['VALUE']) ], groups=[], ) expected_result = pd.concat([ sample_df, pd.DataFrame({ 'COUNTRY': 'All countries', 'PRODUCT': [None], 'YEAR': [None], 'VALUE': [135] }), ]) real_result = execute_totals(step, sample_df) assert_dataframes_equals(real_result, expected_result)
def test_last_no_group_by_with_aggregation_with_last(query, sql_query_describer): step = AggregateStep( name='aggregate', aggregations=[ Aggregation(aggfunction='last', columns=['Label'], newcolumns=['last_label']), Aggregation(aggfunction='sum', columns=['title'], newcolumns=['sum_title']), ], ) sql_query = translate_aggregate(step, query, index=1, sql_query_describer=sql_query_describer) assert ( sql_query.transformed_query == 'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT A.*, F.last_label FROM (SELECT SUM(title) AS sum_title FROM SELECT_STEP_0) A INNER JOIN (SELECT last_label FROM (SELECT Label AS last_label, ROW_NUMBER() OVER (ORDER BY Label DESC) AS R FROM SELECT_STEP_0 QUALIFY R = 1)) F)' )
def test_last_with_group_by_no_aggregation_with_first(query, sql_query_describer): step = AggregateStep( name='aggregate', on=['category'], aggregations=[ Aggregation(aggfunction='first', columns=['Label'], newcolumns=['first_label']), Aggregation(aggfunction='last', columns=['title'], newcolumns=['last_title']), ], ) sql_query = translate_aggregate(step, query, index=1, sql_query_describer=sql_query_describer) assert ( sql_query.transformed_query == 'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT first_label, last_title, category FROM (SELECT Label AS first_label, title AS last_title, category, ROW_NUMBER() OVER (PARTITION BY category ORDER BY Label, title DESC) AS R FROM SELECT_STEP_0 QUALIFY R = 1))' )
def test_translate_aggregate_with_group_by(query, sql_query_describer): step = AggregateStep( name='aggregate', on=['category'], aggregations=[ Aggregation( aggfunction='sum', columns=['Value1', 'Value2'], newcolumns=['Sum_Value1', 'Sum_Value2'], ), Aggregation(aggfunction='avg', columns=['Value1'], newcolumns=['Avg_Value1']), ], ) sql_query = translate_aggregate(step, query, index=1, sql_query_describer=sql_query_describer) assert ( sql_query.transformed_query == 'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT SUM(Value1) AS Sum_Value1, SUM(Value2) AS Sum_Value2, AVG(Value1) AS Avg_Value1, category FROM SELECT_STEP_0 GROUP BY category)' )
def test_keep_original_granularity_empty_on(sample_df): step = AggregateStep( name='aggregate', on=[], keepOriginalGranularity=True, aggregations=[ Aggregation(aggfunction='count', columns=['Group'], newcolumns=['__vqb_count__']), ], ) df_result = execute_aggregate(step, sample_df) assert_dataframes_equals(df_result, sample_df.assign(__vqb_count__=6))
def test_totals_nogroup_3_dimensions(): sample_df = pd.DataFrame({ 'COUNTRY': ['France', 'USA'] * 4, 'PRODUCT': (['product A'] * 2 + ['product B'] * 2) * 2, 'YEAR': ['2019'] * 4 + ['2020'] * 4, 'VALUE_1': [5, 10, 10, 15, 20, 20, 30, 25], }) step = TotalsStep( name='totals', totalDimensions=[ TotalDimension(total_column='COUNTRY', total_rows_label='All countries'), TotalDimension(total_column='PRODUCT', total_rows_label='All products'), TotalDimension(total_column='YEAR', total_rows_label='All years'), ], aggregations=[ Aggregation( columns=['VALUE_1'], aggfunction='sum', newcolumns=['VALUE_1-sum'], ), Aggregation(columns=['VALUE_1'], aggfunction='avg', newcolumns=['VALUE_1-avg']), ], ) real_result = execute_totals(step, sample_df) assert real_result[real_result['YEAR'] == 'All years'].count()['YEAR'] == 9 assert real_result[real_result['COUNTRY'] == 'All countries'].count()['COUNTRY'] == 9 assert real_result[real_result['PRODUCT'] == 'All products'].count()['PRODUCT'] == 9 # could be any column assert real_result.count()['YEAR'] == 27
def test_duplicate_aggregation_columns(query, sql_query_describer): with pytest.raises(DuplicateColumnError): step = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='count distinct including empty', columns=['Group', 'Group'], newcolumns=['coucou'], ), ], ) translate_aggregate(step, query, index=1)
def test_count_distinct(sample_df): step = AggregateStep( name='aggregate', keepOriginalGranularity=False, on=[], aggregations=[ Aggregation( aggfunction='count distinct', columns=['Group'], newcolumns=['Group_CD'], ) ], ) df_result = execute_aggregate(step, sample_df) assert_dataframes_equals(df_result, DataFrame({'Group_CD': [2]}))
def test_without_on(sample_df): step = AggregateStep( name='aggregate', keepOriginalGranularity=False, on=[], aggregations=[ Aggregation(aggfunction='sum', columns=['Value1'], newcolumns=['sum_value']), ], ) df_result = execute_aggregate(step, sample_df) assert_dataframes_equals( df_result, DataFrame({'sum_value': [56]}), )
def test_first_no_group_by_no_aggregation(query, sql_query_describer): step = AggregateStep( name='aggregate', aggregations=[ Aggregation(aggfunction='first', columns=['Label'], newcolumns=['first_Label']), ], ) sql_query = translate_aggregate(step, query, index=1, sql_query_describer=sql_query_describer) assert ( sql_query.transformed_query == 'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT first_Label FROM (SELECT Label AS first_Label, ROW_NUMBER() OVER (ORDER BY Label) AS R FROM SELECT_STEP_0 QUALIFY R = 1))' )
def test_count(query, sql_query_describer): step = AggregateStep( name='aggregate', on=['category'], aggregations=[ Aggregation(aggfunction='count', columns=['Label'], newcolumns=['count']), ], ) sql_query = translate_aggregate(step, query, index=1, sql_query_describer=sql_query_describer) assert ( sql_query.transformed_query == 'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT COUNT(Label) AS count, category FROM SELECT_STEP_0 GROUP BY category)' )
def test_count(sample_df): step = AggregateStep( name='aggregate', keepOriginalGranularity=False, on=['Group'], aggregations=[ Aggregation(aggfunction='count', columns=['Label'], newcolumns=['count']), ], ) df_result = execute_aggregate(step, sample_df) assert_dataframes_equals( df_result, DataFrame({ 'Group': ['Group 1', 'Group 2'], 'count': [3, 3] }))
def test_benchmark_aggregate(benchmark): sample_df = DataFrame({ 'Group': ['Group 1'] * 500 + ['Group 2'] * 500, 'Value1': np.random.random(1000), 'Value2': np.random.random(1000), }) step = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='avg', columns=['Value1'], newcolumns=['RESULT'], ), ], ) benchmark(execute_aggregate, step, sample_df)
def test_duplicate_aggregation_columns(): df = DataFrame({ 'Label': ['Label 1', 'Label 2', 'Label 3'], 'Group': ['Group 1'] * 3, # type: ignore 'Value1': [13, 7, 20], }) with pytest.raises(DuplicateColumnError): step = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='count distinct including empty', columns=['Group', 'Group'], newcolumns=['__VQB_COUNT'], ), ], ) execute_aggregate(step, df)
def test_legacy_syntax(sample_df): step = AggregateStep( name='aggregate', keepOriginalGranularity=False, on=[], aggregations=[ Aggregation(**{ 'aggfunction': 'sum', 'column': 'Value1', 'newcolumn': 'sum_value' }), ], ) df_result = execute_aggregate(step, sample_df) assert_dataframes_equals( df_result, DataFrame({'sum_value': [56]}), )
def test_count_distinct(query, sql_query_describer): step = AggregateStep( name='aggregate', on=[], aggregations=[ Aggregation( aggfunction='count distinct', columns=['Group'], newcolumns=['Group_CD'], ) ], ) sql_query = translate_aggregate(step, query, index=1, sql_query_describer=sql_query_describer) assert ( sql_query.transformed_query == 'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT COUNT(DISTINCT Group) AS Group_CD FROM SELECT_STEP_0)' )
def test_benchmark_totals(benchmark): groups = ['group_1', 'group_2'] df = pd.DataFrame({ 'value': np.random.random(1000), 'id': list(range(1000)), 'group': [random.choice(groups) for _ in range(1000)], }) step = TotalsStep( name='totals', totalDimensions=[ TotalDimension(total_column='group', total_rows_label='All groups') ], aggregations=[ Aggregation(columns=['value'], aggfunction='sum', newcolumns=['value']) ], groups=[], ) benchmark(execute_totals, step, df)