def test_group_by_and_summarize_should_return_a_ready_to_use_dataframe(data):
    """
    Group by `seasons`, compute several aggregations, and check that the
    grouping key and every aggregated column show up in the baked frame.
    """
    grouping = GroupByStep(['seasons'])
    summary = SummarizeStep({
        'rating_mean': AggMean('rating'),
        'rating_median': AggMedian('rating'),
        'year_max': AggMax('year'),
        'year_min': AggMin('year'),
        'watched_count': AggSum('watched'),
        'years_count': AggCount('year'),
        'years_unique_count': AggCountDistinct('year')
    })
    pipeline = Recipe([grouping, summary])

    result = pipeline.prepare(data).bake(data)

    # Grouping key first, then one column per declared aggregation.
    expected_columns = (
        'seasons',
        'rating_mean',
        'rating_median',
        'year_max',
        'year_min',
        'watched_count',
        'years_count',
        'years_unique_count',
    )
    for column in expected_columns:
        assert column in result.columns
def test_right_join_with_using_a_recipe(startrek_starships, startrek_starships_specs):
    """
    Right join where the right-hand side is another Recipe, fed with the
    specs frame through the `df` argument.
    """
    specs_recipe = Recipe([SortStep('uid')])
    join_step = RightJoinStep(specs_recipe, by='uid', df=startrek_starships_specs)
    ships_recipe = Recipe([join_step, SortStep('uid')])

    prepared = ships_recipe.prepare(startrek_starships)
    joined = prepared.bake(startrek_starships)
    assert joined.shape == (4, 3)

    # (index label, expected values) for the first and last rows.
    expected_rows = (
        (0, {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9}),
        (3, {'uid': 'NCC-74656', 'name': 'USS Voyager', 'warp': 9.975}),
    )
    for label, values in expected_rows:
        assert_series_equal(joined.loc[label], pd.Series(values, name=label))
def test_mean_inputation_on_numerical_column(data):
    """
    Mean imputation on numerical columns.

    Before imputing, season 7 and the ratings 9.3, 9.9 and 9.0 are replaced
    with NaN so that both columns contain missing values. After the
    MeanImputeStep those positions must hold the (rounded) column mean.
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    recipe = Recipe([
        MutateStep({
            'seasons': MapValues({7: np.nan}),
            'rating': MapValues({9.3: np.nan, 9.9: np.nan, 9.0: np.nan})
        }),
        MeanImputeStep(['seasons', 'rating'])
    ])

    bdf = recipe.prepare(data).bake(data)

    # Round to one decimal so the imputed means can be compared exactly.
    seasons = bdf['seasons'].round(1).tolist()
    assert seasons[0] == 1.0
    assert seasons[1] == 2.3  # mean
    assert seasons[2] == 2.3  # mean
    assert seasons[3] == 4.0
    assert seasons[4] == 2.3  # mean
    assert seasons[5] == 2.0

    ratings = bdf['rating'].round(1).tolist()
    assert ratings[0] == 7.7  # mean
    assert ratings[1] == 7.7  # mean
    assert ratings[2] == 7.4
    assert ratings[3] == 6.8
    assert ratings[4] == 8.9
    assert ratings[5] == 7.7  # mean
def test_join_on_invalid_how_step(startrek_starships, startrek_starships_specs):
    """
    The `how` parameter is invalid, so the step should fail the validation
    with a YeastValidationError.
    """
    recipe = Recipe([
        JoinStep(startrek_starships_specs, by="uid", how="not_exist"),
    ])
    # The exception object itself is not inspected, so it is not bound.
    with pytest.raises(YeastValidationError):
        recipe.prepare(startrek_starships).bake(startrek_starships)
def test_left_join_using_df_but_not_a_recipe(startrek_starships, startrek_starships_specs):
    """
    `df` is only meant to be used when the right side is a Recipe; passing it
    together with a plain DataFrame must raise a YeastValidationError.
    """
    recipe = Recipe([
        JoinStep(startrek_starships_specs, by="uid", df=startrek_starships, how="left")
    ])
    # The exception object itself is not inspected, so it is not bound.
    with pytest.raises(YeastValidationError):
        recipe.prepare(startrek_starships).bake(startrek_starships)
def test_median_inputation_on_a_non_numerical_column_must_fail(data):
    """
    The median cannot be calculated on a non-numerical column, so preparing
    the recipe must raise a YeastPreparationError.
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    recipe = Recipe([
        MutateStep({'title': MapValues({'Voyager': np.nan})}),
        MedianImputeStep(['title'])
    ])
    # Only `prepare` is called: the failure is expected before any baking.
    with pytest.raises(YeastPreparationError):
        recipe.prepare(data)
def test_left_join_column_not_found_left_thus_fail(startrek_starships, startrek_starships_specs):
    """
    Left join on `uid` after that column has been renamed away on the left
    side: validation is expected to fail.
    """
    drop_key = RenameColumnsStep({'uid': 'not_found'})
    left_join = JoinStep(startrek_starships_specs, by='uid', how="left")
    pipeline = Recipe([drop_key, left_join])

    with pytest.raises(YeastValidationError):
        prepared = pipeline.prepare(startrek_starships)
        prepared.bake(startrek_starships)
def test_left_join_without_by_workflow(startrek_starships, startrek_starships_specs):
    """
    Left join without passing the `by` column names, then sort by `uid` and
    check the first row of the result.
    """
    left_join = JoinStep(startrek_starships_specs, how="left")
    pipeline = Recipe([left_join, SortStep('uid')])

    prepared = pipeline.prepare(startrek_starships)
    joined = prepared.bake(startrek_starships)

    expected_first = pd.Series(
        {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9},
        name=0,
    )
    assert_series_equal(joined.loc[0], expected_first)
def test_recipe_workflow_validations_should_be_on_the_last_transformed_data(
        raw_data):
    """
    `title` is selected by the second step after the first step has already
    narrowed the columns to `year` and `seasons`, so the second step's
    validation must fail.
    """
    keep_year_and_seasons = steps.SelectColumnsStep(['year', 'seasons'])
    keep_title = steps.SelectColumnsStep(['title'])
    pipeline = Recipe([keep_year_and_seasons, keep_title])

    with pytest.raises(errors.YeastValidationError):
        prepared = pipeline.prepare(raw_data)
        prepared.bake(raw_data)
def test_join_on_right_step(startrek_starships, startrek_starships_specs):
    """
    Right join on `uid` using JoinStep(how="right"), sorted by `uid`.
    """
    right_join = JoinStep(startrek_starships_specs, by="uid", how="right")
    pipeline = Recipe([right_join, SortStep('uid')])

    prepared = pipeline.prepare(startrek_starships)
    joined = prepared.bake(startrek_starships)
    assert joined.shape == (4, 3)

    # (index label, expected values) for the first and last rows.
    expected_rows = (
        (0, {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9}),
        (3, {'uid': 'NCC-74656', 'name': 'USS Voyager', 'warp': 9.975}),
    )
    for label, values in expected_rows:
        assert_series_equal(joined.loc[label], pd.Series(values, name=label))
def test_recipe_workflow(raw_data):
    """
    Sequential execution of the recipe: clean the column names to snake
    case, then keep only `creation_year` and `total_seasons`.
    """
    pipeline = Recipe([
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons'])
    ])
    result = pipeline.prepare(raw_data).bake(raw_data)

    # Columns that must survive the selection.
    for kept in ('creation_year', 'total_seasons'):
        assert kept in result.columns

    # Neither the unselected column nor any original spelling may remain.
    for dropped in ('series_name', 'series_Name', 'CreationYear', 'Total Seasons'):
        assert dropped not in result.columns
def test_join_on_fullouter_step(startrek_starships, startrek_starships_specs):
    """
    Full outer join on `uid`: the row without a match is expected to be
    kept with a missing `warp` value.
    """
    full_join = JoinStep(startrek_starships_specs, by="uid", how="full")
    pipeline = Recipe([full_join, SortStep('uid')])

    prepared = pipeline.prepare(startrek_starships)
    joined = prepared.bake(startrek_starships)
    assert joined.shape == (5, 3)

    # (index label, expected values); the last row has no `warp` value.
    expected_rows = (
        (0, {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9}),
        (4, {'uid': 'NX-01', 'name': 'Enterprise', 'warp': None}),
    )
    for label, values in expected_rows:
        assert_series_equal(joined.loc[label], pd.Series(values, name=label))
def test_skip_on_other_set_workflow(raw_data):
    """
    Sequential execution of the recipe where one step (the FilterStep,
    declared with role='train') must be skipped when baking new data with
    a different role.
    """
    pipeline = Recipe([
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year in [2017, 2020]', role='train')
    ])
    prepared = pipeline.prepare(raw_data)

    # Baking without an explicit role: the filter is applied (2 rows left).
    train_frame = prepared.bake(raw_data)
    assert train_frame.shape == (2, 2)

    # Baking with role='test': the filter is skipped (all 6 rows kept).
    test_frame = prepared.bake(raw_data, role='test')
    assert test_frame.shape == (6, 2)
def test_left_join_with_using_a_recipe(startrek_starships, startrek_starships_specs):
    """
    Left join where the right-hand side is another Recipe, fed with the
    specs frame through the `df` argument.
    """
    specs_recipe = Recipe([SortStep('uid')])
    left_join = JoinStep(specs_recipe, by='uid', how='left', df=startrek_starships_specs)
    ships_recipe = Recipe([left_join, SortStep('uid')])

    prepared = ships_recipe.prepare(startrek_starships)
    joined = prepared.bake(startrek_starships)

    expected_first = pd.Series(
        {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9},
        name=0,
    )
    assert_series_equal(joined.loc[0], expected_first)
def test_recipe_workflow_only_accepts_yeast_steps(raw_data):
    """
    A Recipe only supports Step objects: building one with a plain dict in
    the step list must raise a YeastRecipeError.
    """
    with pytest.raises(errors.YeastRecipeError):
        valid_step = steps.SelectColumnsStep(['year', 'seasons'])
        not_a_step = {}
        Recipe([valid_step, not_a_step])
def test_custom_roles_workflow(raw_data):
    """
    Steps declared with different roles should be executed only when baking
    with the matching role.
    """
    pipeline = Recipe([
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year == 2020', role='train'),
        steps.SelectColumnsStep(['creation_year'], role='train'),
        steps.SelectColumnsStep(['total_seasons'], role='test'),
    ])
    prepared = pipeline.prepare(raw_data)

    # 'train' role: filter applied and only `creation_year` selected.
    train_frame = prepared.bake(raw_data, role='train')
    assert train_frame.shape == (1, 1)
    assert 'creation_year' in train_frame.columns

    # 'test' role: filter skipped (6 rows) and only `total_seasons` selected.
    test_frame = prepared.bake(raw_data, role='test')
    assert test_frame.shape == (6, 1)
    assert 'total_seasons' in test_frame.columns
def test_rownumber_mutation_using_a_lambda_after_groupby(startrek_data):
    """
    Use Mutate after a GroupBy with a lambda to create a new column holding
    the row number inside each `seasons` group. Expected result, sorted by
    `seasons` and `row_number`:

    ```
    seasons     row_number
    1           1
    2           1
    4           1
    7           1
    7           2
    7           3
    ```
    """
    pipeline = Recipe([
        GroupByStep('seasons'),
        MutateStep({
            'row_number': lambda df: df['rating'].rank(method="first")
        }),
        SortStep(['seasons', 'row_number'])
    ])
    bdf = pipeline.bake(startrek_data)

    # One (seasons, row_number) pair per index label 0..5.
    expected = [(1, 1), (2, 1), (4, 1), (7, 1), (7, 2), (7, 3)]
    for label, (seasons, row_number) in enumerate(expected):
        assert bdf[['seasons', 'row_number']].loc[label]['seasons'] == seasons
        assert bdf[['seasons', 'row_number']].loc[label]['row_number'] == row_number
def test_rownumber_mutation_using_a_transformer_on_dataframe(startrek_data):
    """
    Rank operation without a group by, using the RowNumber transformer on
    `year`. Expected result, sorted by `year`:

    ```
    title               year        row_number
    TNG                 1987        1.0
    Deep Space Nine     1993        2.0
    Voyager             1995        3.0
    Enterprise          2001        4.0
    Discovery           2017        5.0
    Picard              2020        6.0
    ```
    """
    pipeline = Recipe([
        MutateStep({
            'row_number': RowNumber('year')
        }),
        SortStep('year')
    ])
    bdf = pipeline.bake(startrek_data)

    # One (year, row_number) pair per index label 0..5.
    expected = [(1987, 1), (1993, 2), (1995, 3), (2001, 4), (2017, 5), (2020, 6)]
    for label, (year, row_number) in enumerate(expected):
        assert bdf[['year', 'row_number']].loc[label]['year'] == year
        assert bdf[['year', 'row_number']].loc[label]['row_number'] == row_number
def test_inner_join_step(startrek_starships, startrek_starships_specs):
    """
    Inner join on `uid` using InnerJoinStep, sorted by `uid`.
    """
    inner_join = InnerJoinStep(startrek_starships_specs, by="uid")
    pipeline = Recipe([inner_join, SortStep('uid')])

    prepared = pipeline.prepare(startrek_starships)
    joined = prepared.bake(startrek_starships)
    assert joined.shape == (4, 3)

    # (index label, expected values) for the first and last rows.
    expected_rows = (
        (0, {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9}),
        (3, {'uid': 'NCC-74656', 'name': 'USS Voyager', 'warp': 9.975}),
    )
    for label, values in expected_rows:
        assert_series_equal(joined.loc[label], pd.Series(values, name=label))
def test_add_recipe_by_name(data):
    """
    Add a new recipe into the cookbook and read it back by name.
    """
    # A fresh cookbook lists nothing.
    book = Cookbook()
    assert len(book.list()) == 0

    # Register an empty recipe under a name.
    empty_recipe = Recipe([])
    book.add('test.recipe', empty_recipe)

    # The name is now listed, exactly once.
    assert len(book.list()) == 1
    assert 'test.recipe' in book.list()

    # Looking the name up returns the recipe that was added.
    assert book.get('test.recipe') == empty_recipe
def test_adding_two_recipes_with_same_name_raises_an_error(data):
    """
    Two recipes cannot be registered under the same name: the second `add`
    must raise a YeastCookbookError and leave the cookbook unchanged.
    """
    book = Cookbook()
    assert len(book.list()) == 0

    # First registration succeeds.
    empty_recipe = Recipe([])
    book.add('test.recipe', empty_recipe)
    assert len(book.list()) == 1

    # Second registration under the same name is rejected.
    with pytest.raises(YeastCookbookError):
        book.add('test.recipe', empty_recipe)

    # Still only the original entry.
    assert len(book.list()) == 1