def test_forward_selection(self, mocker, max_predictors: int):
    """Verify _forward_selection yields one model per step, selecting the
    forced predictors first and the remaining candidates afterwards."""
    # full candidate pool: var1_enc ... var10_enc
    all_predictors = [f"var{i}_enc" for i in range(1, 11)]
    # every fourth predictor (var1_enc, var5_enc, var9_enc) is forced
    forced_predictors = all_predictors[::4]
    # optional candidates = pool minus the forced ones
    optional_predictors = [
        pred for pred in all_predictors if pred not in forced_predictors
    ]
    # expected selection order: forced first, then the optional ones
    ordered_output_list = forced_predictors + optional_predictors

    def fake_find_next_best_model(self, train_data, target_column_name,
                                  candidate_predictors, current_predictors):
        # deterministic stub: always pick the first remaining candidate
        return mock_model(current_predictors + candidate_predictors[0:1])

    mocker.patch("cobra.model_building.ForwardFeatureSelection."
                 "_find_next_best_model",
                 fake_find_next_best_model)

    fw_selection = ForwardFeatureSelection(max_predictors=max_predictors)
    fitted_models = fw_selection._forward_selection(
        pd.DataFrame(), "target", optional_predictors, forced_predictors)

    actual = [sorted(model.predictors) for model in fitted_models]
    expected = [
        sorted(ordered_output_list[:step + 1])
        for step in range(min(max_predictors, len(all_predictors)))
    ]
    assert actual == expected
def test_fit(self, mocker, max_predictors: int, expectation):
    """Verify fit stores the expected sequence of fitted models (or raises,
    depending on the parametrized ``expectation`` context manager)."""
    # full candidate pool: var1_enc ... var10_enc
    all_predictors = [f"var{i}_enc" for i in range(1, 11)]
    # every fourth predictor (var1_enc, var5_enc, var9_enc) is forced
    forced = all_predictors[::4]
    # expected selection order: forced first, then the remaining ones
    ordered_output_list = forced + [
        pred for pred in all_predictors if pred not in forced
    ]

    fw_selection = ForwardFeatureSelection(max_predictors=max_predictors)

    def fake_train_model(self, train_data, target_column_name, predictors):
        # stub: wrap the predictor list in a mock model
        return mock_model(predictors)

    def fake_forward_selection(self, train_data, target_column_name,
                               predictors, forced_predictors):
        # stub: emit one incrementally-grown mock model per step
        n_models = min(max_predictors,
                       len(predictors) + len(forced_predictors))
        return [mock_model(ordered_output_list[:step + 1])
                for step in range(n_models)]

    mocker.patch(
        "cobra.model_building.ForwardFeatureSelection._train_model",
        fake_train_model)
    mocker.patch("cobra.model_building.ForwardFeatureSelection"
                 "._forward_selection",
                 fake_forward_selection)

    with expectation:
        fw_selection.fit(pd.DataFrame(), "target",
                         predictors=all_predictors,
                         forced_predictors=forced,
                         excluded_predictors=[])

        # for each fitted model, check the predictor lists grow as expected
        actual = [model.predictors
                  for model in fw_selection._fitted_models]
        expected = [
            ordered_output_list[:step + 1]
            for step in range(min(max_predictors, len(all_predictors)))
        ]
        if max_predictors == len(forced):
            # edge case: only the forced predictors fit the budget
            expected = [forced]
        assert actual == expected
def test_compute_model_performances(self, mocker, model_type):
    """Verify compute_model_performances builds one row per fitted model,
    with train/selection performances taken from the (mocked) evaluate.

    Fix: the original ``if/elif`` chain left ``patch_fct`` unbound for any
    unexpected ``model_type``, so a bad parametrization surfaced as a
    confusing NameError; a dict dispatch with an explicit ValueError makes
    the failure mode obvious.
    """
    data = mock_data(add_split_col=True, model_type=model_type)

    fw_selection = ForwardFeatureSelection(model_type=model_type)
    fw_selection._fitted_models = [
        mock_model_num_pred(1, model_type=model_type),
        mock_model_num_pred(2, model_type=model_type),
        mock_model_num_pred(3, model_type=model_type),
    ]

    def mock_evaluate(self, X, y, split, metric):
        # on AUC scale, but gives the same for RMSE as it is a mock
        if split == "train":
            return 0.612
        else:
            return 0.609

    # dispatch table: model_type -> fully-qualified evaluate method to patch
    evaluate_paths = {
        "classification": ("cobra.model_building.forward_selection"
                           ".LogisticRegressionModel.evaluate"),
        "regression": ("cobra.model_building.forward_selection"
                       ".LinearRegressionModel.evaluate"),
    }
    if model_type not in evaluate_paths:
        raise ValueError(f"Unsupported model_type: {model_type}")
    mocker.patch(evaluate_paths[model_type], mock_evaluate)

    actual = fw_selection.compute_model_performances(
        data, "target",
        splits=["train", "selection"],
        metric=None)

    expected = pd.DataFrame([
        {"predictors": ["var1_enc"],
         "last_added_predictor": "var1_enc",
         "train_performance": 0.612,
         "selection_performance": 0.609,
         "model_type": model_type},
        {"predictors": ["var1_enc", "var2_enc"],
         "last_added_predictor": "var2_enc",
         "train_performance": 0.612,
         "selection_performance": 0.609,
         "model_type": model_type},
        {"predictors": ["var1_enc", "var2_enc", "var3_enc"],
         "last_added_predictor": "var3_enc",
         "train_performance": 0.612,
         "selection_performance": 0.609,
         "model_type": model_type},
    ])
    pd.testing.assert_frame_equal(actual, expected)
def test_compute_model_performances(self, mocker):
    """Verify compute_model_performances builds one row per fitted model,
    with train/selection performances taken from the (mocked) evaluate."""
    data = mock_data(add_split_col=True)

    fw_selection = ForwardFeatureSelection()
    fw_selection._fitted_models = [
        mock_model_num_pred(1),
        mock_model_num_pred(2),
        mock_model_num_pred(3),
    ]

    def mock_evaluate(self, X, y, split):
        # fixed scores so the expected frame is fully deterministic
        return 0.612 if split == "train" else 0.609

    mocker.patch("cobra.model_building.forward_selection"
                 ".MLModel.evaluate",
                 mock_evaluate)

    actual = fw_selection.compute_model_performances(
        data, "target", splits=["train", "selection"])

    expected = pd.DataFrame([
        {"predictors": ["var1_enc"],
         "last_added_predictor": "var1_enc",
         "train_performance": 0.612,
         "selection_performance": 0.609},
        {"predictors": ["var1_enc", "var2_enc"],
         "last_added_predictor": "var2_enc",
         "train_performance": 0.612,
         "selection_performance": 0.609},
        {"predictors": ["var1_enc", "var2_enc", "var3_enc"],
         "last_added_predictor": "var3_enc",
         "train_performance": 0.612,
         "selection_performance": 0.609},
    ])
    pd.testing.assert_frame_equal(actual, expected)
def test_ffs_train_data_assertions(self, model_type):
    """fit must reject data without a split column, and data that lacks
    both a train and a selection split."""
    fw_selection = ForwardFeatureSelection(model_type=model_type)

    # no split column at all -> assertion failure
    with pytest.raises(AssertionError):
        fw_selection.fit(pd.DataFrame(), "target", predictors=[""])

    df = mock_data(add_split_col=True, model_type=model_type)
    # keep only the train rows -> selection split missing -> failure
    with pytest.raises(AssertionError):
        fw_selection.fit(df[df["split"] == "train"], "target",
                         predictors=[""])
def test_get_model_from_step(self):
    """Requesting a step before any model has been fitted must raise."""
    fw_selection = ForwardFeatureSelection()
    with pytest.raises(ValueError):
        fw_selection.get_model_from_step(2)