def test_partial_dependence_all_nan_value_error(
        logistic_regression_binary_pipeline_class):
    """Requesting partial dependence on an all-NaN feature raises ValueError.

    The error message must name every offending column, and integer column
    labels must be quoted the same way string labels are.
    """
    pipeline = logistic_regression_binary_pipeline_class({})
    X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
    y = pd.Series([0, 1, 0])
    pipeline.fit(X, y)

    # Prediction frame in which columns "a" and "c" contain only nulls.
    pred_df = pd.DataFrame({
        "a": [None] * 5,
        "b": [1, 2, 3, 4, 4],
        "c": [None] * 5,
    })
    message = "The following features have all NaN values and so the partial dependence cannot be computed: {}"

    # One-way: selecting the bad column by name or by positional index.
    with pytest.raises(ValueError, match=message.format("'a'")):
        partial_dependence(pipeline, pred_df, features="a", grid_resolution=10)
    with pytest.raises(ValueError, match=message.format("'a'")):
        partial_dependence(pipeline, pred_df, features=0, grid_resolution=10)

    # Two-way: only the all-NaN member(s) of the pair are reported.
    with pytest.raises(ValueError, match=message.format("'a'")):
        partial_dependence(pipeline, pred_df, features=("a", "b"),
                           grid_resolution=10)
    with pytest.raises(ValueError, match=message.format("'a', 'c'")):
        partial_dependence(pipeline, pred_df, features=("a", "c"),
                           grid_resolution=10)

    # An integer column label is still rendered quoted in the message.
    pred_df = pred_df.rename(columns={"a": 0})
    with pytest.raises(ValueError, match=message.format("'0'")):
        partial_dependence(pipeline, pred_df, features=0, grid_resolution=10)
def test_partial_dependence_multiclass(
        logistic_regression_multiclass_pipeline_class):
    """Multiclass partial dependence returns one block of rows per class.

    One-way output is long-form with a ``class_label`` column; two-way output
    stacks one grid per class, with a ``class_label`` column appended.
    """
    X, y = load_wine()
    pipeline = logistic_regression_multiclass_pipeline_class(
        parameters={"Logistic Regression Classifier": {
            "n_jobs": 1
        }})
    pipeline.fit(X, y)

    n_classes = y.to_series().nunique()
    grid_resolution = 20

    # One-way: grid_resolution rows per class, three fixed columns.
    dep_one_way = partial_dependence(pipeline=pipeline,
                                     X=X,
                                     features="magnesium",
                                     grid_resolution=grid_resolution)
    assert "class_label" in dep_one_way.columns
    assert dep_one_way["class_label"].nunique() == n_classes
    assert len(dep_one_way.index) == n_classes * grid_resolution
    assert list(dep_one_way.columns) == [
        "feature_values", "partial_dependence", "class_label"
    ]

    # Two-way: one grid_resolution-wide grid per class plus the label column.
    dep_two_way = partial_dependence(pipeline=pipeline,
                                     X=X,
                                     features=("magnesium", "alcohol"),
                                     grid_resolution=grid_resolution)
    assert "class_label" in dep_two_way.columns
    assert dep_two_way["class_label"].nunique() == n_classes
    assert len(dep_two_way.index) == n_classes * grid_resolution
    assert len(dep_two_way.columns) == grid_resolution + 1
def test_partial_dependence_with_non_numeric_columns(
        data_type, linear_regression_pipeline_class,
        logistic_regression_binary_pipeline_class):
    """Partial dependence handles both numeric and categorical columns.

    Categorical features produce one grid point per unique category; numeric
    features use the observed values here (4 rows -> 4 grid points).

    NOTE(review): the ``logistic_regression_binary_pipeline_class`` fixture is
    requested but unused in this body — confirm whether a binary-pipeline case
    was dropped or the fixture can be removed.
    """
    X = pd.DataFrame({
        'numeric': [1, 2, 3, 0],
        'also numeric': [2, 3, 4, 1],
        'string': ['a', 'b', 'a', 'c'],
        'also string': ['c', 'b', 'a', 'd']
    })
    if data_type == "ww":
        X = ww.DataTable(X)
    y = [0, 0.2, 1.4, 1]
    pipeline = linear_regression_pipeline_class(
        parameters={"Linear Regressor": {
            "n_jobs": 1
        }})
    pipeline.fit(X, y)

    # Numeric feature: four distinct values -> four grid points.
    dep = partial_dependence(pipeline, X, features='numeric')
    assert list(dep.columns) == ["feature_values", "partial_dependence"]
    assert len(dep["partial_dependence"]) == 4
    assert len(dep["feature_values"]) == 4
    assert not dep.isnull().any(axis=None)

    # Categorical feature: three unique categories -> three grid points.
    dep = partial_dependence(pipeline, X, features='string')
    assert list(dep.columns) == ["feature_values", "partial_dependence"]
    assert len(dep["partial_dependence"]) == 3
    assert len(dep["feature_values"]) == 3
    assert not dep.isnull().any(axis=None)
def test_partial_dependence_errors(logistic_regression_binary_pipeline_class):
    """Invalid ``features`` arguments raise ValueError.

    Covers: more than two features, and a tuple mixing integer indices with
    string names.
    """
    X = pd.DataFrame({
        'a': [2, None, 2, 2],
        'b': [1, 2, 2, 1],
        'c': [0, 0, 0, 0]
    })
    y = pd.Series([0, 1, 0, 1])
    pipeline = logistic_regression_binary_pipeline_class(
        parameters={"Logistic Regression Classifier": {
            "n_jobs": 1
        }})
    pipeline.fit(X, y)

    # Three features: only one- or two-way dependence is supported.
    with pytest.raises(
            ValueError,
            match=
            "Too many features given to graph_partial_dependence. Only one or two-way partial dependence is supported."
    ):
        partial_dependence(pipeline,
                           X,
                           features=('a', 'b', 'c'),
                           grid_resolution=20)

    # Mixed int/str tuple is rejected.
    with pytest.raises(
            ValueError,
            match=
            "Features provided must be a tuple entirely of integers or strings, not a mixture of both."
    ):
        partial_dependence(pipeline, X, features=(0, 'b'))
# NOTE(review): this file defines a second function with this exact name later
# on (with different expected values in `part_dep_ans`). Python keeps only the
# last definition, so this version is shadowed and never collected by pytest.
# Confirm which expected-value dictionary is current and delete the stale copy.
def test_partial_dependence_more_categories_than_grid_resolution(logistic_regression_binary_pipeline_class):
    """Categorical features ignore grid_resolution: one grid point per category.

    Runs partial dependence on the 164-category 'currency' column with a grid
    resolution below, equal to, and above the category count, and checks the
    value-count distribution of the output is identical in all three cases.
    """
    def round_dict_keys(dictionary, places=6):
        """ Function to round all keys of a dictionary that has floats as keys. """
        dictionary_rounded = {}
        for key in dictionary:
            dictionary_rounded[round(key, places)] = dictionary[key]
        return dictionary_rounded

    X, y = load_fraud(1000)
    # Drop the other non-numeric columns so 'currency' is the lone categorical.
    X = X.drop(columns=['datetime', 'expiration_date', 'country', 'region', 'provider'])
    pipeline = logistic_regression_binary_pipeline_class({})
    pipeline.fit(X, y)
    num_cat_features = len(set(X["currency"].to_series()))
    assert num_cat_features == 164

    # Expected distribution of partial-dependence values over the 164 grid
    # points (one dominant value for 154 categories, 10 singletons).
    part_dep_ans = {0.1432616813857269: 154, 0.1502346349971562: 1, 0.14487916687594762: 1,
                    0.1573183451314127: 1, 0.11695462432136654: 1, 0.07950579532536253: 1,
                    0.006794444792966759: 1, 0.17745270478939879: 1, 0.1666874487986626: 1,
                    0.13357573073236878: 1, 0.06778096366056789: 1}
    part_dep_ans_rounded = round_dict_keys(part_dep_ans)

    # Check the case where grid_resolution < number of categorical features
    part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=round(num_cat_features / 2))
    part_dep_dict = dict(part_dep["partial_dependence"].value_counts())
    assert part_dep_ans_rounded == round_dict_keys(part_dep_dict)

    # Check the case where grid_resolution == number of categorical features
    part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=round(num_cat_features))
    part_dep_dict = dict(part_dep["partial_dependence"].value_counts())
    assert part_dep_ans_rounded == round_dict_keys(part_dep_dict)

    # Check the case where grid_resolution > number of categorical features
    part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=round(num_cat_features * 2))
    part_dep_dict = dict(part_dep["partial_dependence"].value_counts())
    assert part_dep_ans_rounded == round_dict_keys(part_dep_dict)
def test_partial_dependence_xgboost_feature_names(problem_type, has_minimal_dependencies,
                                                  X_y_binary, X_y_multi, X_y_regression):
    """XGBoost pipelines handle feature names containing '<' and '[' characters.

    XGBoost historically rejects such characters in column names; this checks
    partial dependence still works when a column is renamed to ``'<[0]'``,
    selected both by name and by positional index.

    Fix: the regression branch previously passed its parameters under the key
    ``'XGBoost Classifier'`` even though its component graph contains
    ``'XGBoost Regressor'``, so ``nthread=1`` was never applied to the
    regressor. The key now matches the component.
    """
    if has_minimal_dependencies:
        pytest.skip("Skipping because XGBoost not installed for minimal dependencies")
    if problem_type == ProblemTypes.REGRESSION:
        # Parameters key must match the component name for them to take effect.
        pipeline = RegressionPipeline(component_graph=['Simple Imputer', 'XGBoost Regressor'],
                                      parameters={'XGBoost Regressor': {'nthread': 1}})
        X, y = X_y_regression
    elif problem_type == ProblemTypes.BINARY:
        pipeline = BinaryClassificationPipeline(component_graph=['Simple Imputer', 'XGBoost Classifier'],
                                                parameters={'XGBoost Classifier': {'nthread': 1}})
        X, y = X_y_binary
    elif problem_type == ProblemTypes.MULTICLASS:
        pipeline = MulticlassClassificationPipeline(component_graph=['Simple Imputer', 'XGBoost Classifier'],
                                                    parameters={'XGBoost Classifier': {'nthread': 1}})
        X, y = X_y_multi

    X = pd.DataFrame(X)
    # Column name with characters XGBoost is known to choke on.
    X = X.rename(columns={0: '<[0]'})
    pipeline.fit(X, y)

    # Select the renamed column by name...
    part_dep = partial_dependence(pipeline, X, features="<[0]", grid_resolution=20)
    check_partial_dependence_dataframe(pipeline, part_dep)
    assert not part_dep.isnull().all().all()

    # ...and an ordinary column by positional index.
    part_dep = partial_dependence(pipeline, X, features=1, grid_resolution=20)
    check_partial_dependence_dataframe(pipeline, part_dep)
    assert not part_dep.isnull().all().all()
def test_partial_dependence_catboost(problem_type, X_y_binary, X_y_multi,
                                     has_minimal_dependencies):
    """CatBoost pipelines support partial dependence, including string features.

    Skipped implicitly (body is a no-op) under minimal dependencies, where
    CatBoost is not installed.
    """
    if not has_minimal_dependencies:
        if problem_type == ProblemTypes.BINARY:
            X, y = X_y_binary
            y_small = ['a', 'b', 'a']
            pipeline_class = BinaryClassificationPipeline
        else:
            X, y = X_y_multi
            y_small = ['a', 'b', 'c']
            pipeline_class = MulticlassClassificationPipeline

        pipeline = pipeline_class(component_graph=["CatBoost Classifier"],
                                  parameters={"CatBoost Classifier": {'thread_count': 1}})
        pipeline.fit(X, y)
        dep = partial_dependence(pipeline, X, features=0, grid_resolution=20)
        check_partial_dependence_dataframe(pipeline, dep)
        assert not dep.isnull().all().all()

        # test that CatBoost can natively handle non-numerical columns as feature passed to partial_dependence
        X = pd.DataFrame({'numeric': [1, 2, 3],
                          'also numeric': [2, 3, 4],
                          'string': ['a', 'b', 'c'],
                          'also string': ['c', 'b', 'a']})
        pipeline = pipeline_class(component_graph=["CatBoost Classifier"],
                                  parameters={"CatBoost Classifier": {'thread_count': 1}})
        pipeline.fit(X, y_small)
        dep = partial_dependence(pipeline, X, features='string')
        check_partial_dependence_dataframe(pipeline, dep, grid_size=3)
        assert not dep.isnull().all().all()
def test_partial_dependence_baseline():
    """Baseline pipelines are rejected with a clear ValueError."""
    X = pd.DataFrame([[1, 0], [0, 1]])
    y = pd.Series([0, 1])
    pipeline = BinaryClassificationPipeline(
        component_graph=["Baseline Classifier"], parameters={})
    pipeline.fit(X, y)
    expected = "Partial dependence plots are not supported for Baseline pipelines"
    with pytest.raises(ValueError, match=expected):
        partial_dependence(pipeline, X, features=0, grid_resolution=20)
# NOTE(review): this duplicates the name of an earlier function in this file,
# which has a different `part_dep_ans` dictionary. Because Python keeps only
# the last definition, this is the only copy pytest collects — the earlier one
# is silently dead. Confirm which expected values are current and remove the
# stale definition.
def test_partial_dependence_more_categories_than_grid_resolution(
        logistic_regression_binary_pipeline_class):
    """Categorical features ignore grid_resolution: one grid point per category.

    Runs partial dependence on the 164-category 'currency' column with a grid
    resolution below, equal to, and above the category count, and checks the
    value-count distribution of the output is identical in all three cases.
    """
    def round_dict_keys(dictionary, places=6):
        """ Function to round all keys of a dictionary that has floats as keys. """
        dictionary_rounded = {}
        for key in dictionary:
            dictionary_rounded[round(key, places)] = dictionary[key]
        return dictionary_rounded

    X, y = load_fraud(1000)
    # Drop the other non-numeric columns so 'currency' is the lone categorical.
    X = X.drop(columns=[
        'datetime', 'expiration_date', 'country', 'region', 'provider'
    ])
    pipeline = logistic_regression_binary_pipeline_class({})
    pipeline.fit(X, y)
    num_cat_features = len(set(X["currency"].to_series()))
    assert num_cat_features == 164

    # Expected distribution of partial-dependence values over the 164 grid
    # points (one dominant value for 154 categories, 10 singletons).
    part_dep_ans = {
        0.1424060057413758: 154,
        0.006837318701999957: 1,
        0.24445532203317386: 1,
        0.15637574440029903: 1,
        0.11676042311300606: 1,
        0.13434069071819482: 1,
        0.1502609021969637: 1,
        0.14486201259150977: 1,
        0.16687406140200164: 1,
        0.06815227785761911: 1,
        0.0791821060634158: 1
    }
    part_dep_ans_rounded = round_dict_keys(part_dep_ans)

    # Check the case where grid_resolution < number of categorical features
    part_dep = partial_dependence(pipeline,
                                  X,
                                  'currency',
                                  grid_resolution=round(num_cat_features / 2))
    part_dep_dict = dict(part_dep["partial_dependence"].value_counts())
    assert part_dep_ans_rounded == round_dict_keys(part_dep_dict)

    # Check the case where grid_resolution == number of categorical features
    part_dep = partial_dependence(pipeline,
                                  X,
                                  'currency',
                                  grid_resolution=round(num_cat_features))
    part_dep_dict = dict(part_dep["partial_dependence"].value_counts())
    assert part_dep_ans_rounded == round_dict_keys(part_dep_dict)

    # Check the case where grid_resolution > number of categorical features
    part_dep = partial_dependence(pipeline,
                                  X,
                                  'currency',
                                  grid_resolution=round(num_cat_features * 2))
    part_dep_dict = dict(part_dep["partial_dependence"].value_counts())
    assert part_dep_ans_rounded == round_dict_keys(part_dep_dict)
def test_partial_dependence_not_fitted(
        X_y_binary, logistic_regression_binary_pipeline_class):
    """An unfitted pipeline is rejected with a clear ValueError."""
    X, y = X_y_binary
    pipeline = logistic_regression_binary_pipeline_class(
        parameters={"Logistic Regression Classifier": {
            "n_jobs": 1
        }})
    # Deliberately no pipeline.fit(X, y) call here.
    with pytest.raises(
            ValueError,
            match="Pipeline to calculate partial dependence for must be fitted"
    ):
        partial_dependence(pipeline, X, features=0, grid_resolution=20)
def test_partial_dependence_respect_grid_resolution():
    """grid_resolution caps numeric grids but categoricals use category counts.

    Numeric feature 'amount' yields exactly grid_resolution rows; categorical
    'provider' yields one row per unique category. Neither should accidentally
    equal the one-hot-encoder's top_n+1 size.
    """
    X, y = load_fraud(1000)
    pipeline = BinaryClassificationPipeline(
        component_graph=["DateTime Featurization Component", "One Hot Encoder",
                         "Random Forest Classifier"])
    pipeline.fit(X, y)

    # Numeric feature: exactly grid_resolution grid points.
    dep = partial_dependence(pipeline, X, features="amount", grid_resolution=20)
    assert dep.shape[0] == 20
    assert dep.shape[0] != max(
        X.select('categorical').describe().loc["nunique"]) + 1

    # Categorical feature: one grid point per unique category.
    dep = partial_dependence(pipeline, X, features="provider", grid_resolution=20)
    assert dep.shape[0] == X['provider'].to_series().nunique()
    assert dep.shape[0] != max(
        X.select('categorical').describe().loc["nunique"]) + 1
def test_partial_dependence_problem_types(
        data_type, problem_type, X_y_binary, X_y_multi, X_y_regression,
        logistic_regression_binary_pipeline_class,
        logistic_regression_multiclass_pipeline_class,
        linear_regression_pipeline_class, make_data_type):
    """Smoke test: partial dependence works for binary, multiclass, regression."""
    if problem_type == ProblemTypes.BINARY:
        X, y = X_y_binary
        pipeline = logistic_regression_binary_pipeline_class(
            parameters={"Logistic Regression Classifier": {
                "n_jobs": 1
            }})
    elif problem_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
        pipeline = logistic_regression_multiclass_pipeline_class(
            parameters={"Logistic Regression Classifier": {
                "n_jobs": 1
            }})
    elif problem_type == ProblemTypes.REGRESSION:
        X, y = X_y_regression
        pipeline = linear_regression_pipeline_class(
            parameters={"Linear Regressor": {
                "n_jobs": 1
            }})

    # Exercise both pandas and woodwork input types via the fixture.
    X = make_data_type(data_type, X)
    pipeline.fit(X, y)

    dep = partial_dependence(pipeline, X, features=0, grid_resolution=20)
    check_partial_dependence_dataframe(pipeline, dep)
    assert not dep.isnull().any(axis=None)
def test_graph_two_way_partial_dependence(test_pipeline):
    """Two-way graph output is a single contour whose data matches partial_dependence."""
    X, y = load_breast_cancer()
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')

    pipeline = test_pipeline
    pipeline.fit(X, y)
    fig = graph_partial_dependence(pipeline,
                                   X,
                                   features=('mean radius', 'mean area'),
                                   grid_resolution=20)
    assert isinstance(fig, go.Figure)

    fig_dict = fig.to_dict()
    assert fig_dict['layout']['title'][
        'text'] == "Partial Dependence of 'mean radius' vs. 'mean area'"
    assert len(fig_dict['data']) == 1
    assert fig_dict['data'][0]['name'] == "Partial Dependence"

    # The plotted x/y/z must match the raw partial-dependence grid exactly.
    expected = partial_dependence(pipeline,
                                  X,
                                  features=('mean radius', 'mean area'),
                                  grid_resolution=20)
    assert np.array_equal(fig_dict['data'][0]['x'], expected.index)
    assert np.array_equal(fig_dict['data'][0]['y'], expected.columns)
    assert np.array_equal(fig_dict['data'][0]['z'], expected.values)
def test_partial_dependence_string_feature_name(logistic_regression_binary_pipeline_class):
    """Selecting a feature by its string name yields a full, NaN-free grid."""
    X, y = load_breast_cancer()
    pipeline = logistic_regression_binary_pipeline_class(
        parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
    pipeline.fit(X, y)

    dep = partial_dependence(pipeline, X, features="mean radius",
                             grid_resolution=20)
    assert list(dep.columns) == [
        "feature_values", "partial_dependence", "class_label"
    ]
    # Numeric feature -> exactly grid_resolution rows, no missing values.
    assert len(dep["partial_dependence"]) == 20
    assert len(dep["feature_values"]) == 20
    assert not dep.isnull().any(axis=None)
def test_partial_dependence_warning(logistic_regression_binary_pipeline_class):
    """Null values in the features trigger NullsInColumnWarning.

    The warning fires whether the feature is selected by index or by name.
    """
    X = pd.DataFrame({'a': [1, 2, None, 2, 2], 'b': [1, 1, 2, 2, 1]})
    y = pd.Series([0, 1, 0, 1, 0])
    pipeline = logistic_regression_binary_pipeline_class(
        parameters={"Logistic Regression Classifier": {
            "n_jobs": 1
        }})
    pipeline.fit(X, y)

    expected = "There are null values in the features, which will cause NaN values in the partial dependence output"
    with pytest.warns(NullsInColumnWarning, match=expected):
        partial_dependence(pipeline, X, features=0, grid_resolution=20)
    with pytest.warns(NullsInColumnWarning, match=expected):
        partial_dependence(pipeline, X, features='a', grid_resolution=20)
def test_partial_dependence_datetime(problem_type, X_y_regression, X_y_binary, X_y_multi):
    """One-way partial dependence works on a datetime column; two-way does not.

    With 100 rows, the datetime grid has 100 points (times 3 for multiclass,
    one block per class). Any two-way request involving the datetime column
    raises ValueError.
    """
    dt_graph = ['Imputer', 'One Hot Encoder', 'DateTime Featurization Component',
                'Standard Scaler']
    if problem_type == 'binary':
        X, y = X_y_binary
        pipeline = BinaryClassificationPipeline(
            component_graph=dt_graph + ['Logistic Regression Classifier'])
    elif problem_type == 'multiclass':
        X, y = X_y_multi
        pipeline = MulticlassClassificationPipeline(
            component_graph=dt_graph + ['Logistic Regression Classifier'])
    else:
        X, y = X_y_regression
        pipeline = RegressionPipeline(
            component_graph=dt_graph + ['Linear Regressor'])

    X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    # Append a daily datetime column; it lands at positional index 20.
    X['dt_column'] = pd.Series(pd.date_range('20200101', periods=X.shape[0]))
    pipeline.fit(X, y)

    # Select the datetime column by name.
    dep = partial_dependence(pipeline, X, features='dt_column')
    if problem_type == 'multiclass':
        assert len(dep["partial_dependence"]) == 300  # 100 rows * 3 classes
        assert len(dep["feature_values"]) == 300
    else:
        assert len(dep["partial_dependence"]) == 100
        assert len(dep["feature_values"]) == 100
    assert not dep.isnull().any(axis=None)

    # Select the same column by positional index.
    dep = partial_dependence(pipeline, X, features=20)
    if problem_type == 'multiclass':
        assert len(dep["partial_dependence"]) == 300  # 100 rows * 3 classes
        assert len(dep["feature_values"]) == 300
    else:
        assert len(dep["partial_dependence"]) == 100
        assert len(dep["feature_values"]) == 100
    assert not dep.isnull().any(axis=None)

    # Two-way dependence involving a datetime column is unsupported.
    with pytest.raises(ValueError, match='Two-way partial dependence is not supported for datetime columns.'):
        dep = partial_dependence(pipeline, X, features=('0', 'dt_column'))
    with pytest.raises(ValueError, match='Two-way partial dependence is not supported for datetime columns.'):
        dep = partial_dependence(pipeline, X, features=(0, 20))
def test_partial_dependence_percentile_errors(
        logistic_regression_binary_pipeline_class):
    """Near-constant features raise unless the percentile window covers both values.

    'random_col' is 5% zeros / 95% ones, so with the default percentiles (or
    any window narrower than 0.95 on the upper end) the grid would be constant
    and a ValueError is raised. Widening to (0.01, 0.96) succeeds with a
    two-point grid.
    """
    # random_col will be 5% 0, 95% 1
    X = pd.DataFrame({
        "A": [i % 3 for i in range(1000)],
        "B": [(j + 3) % 5 for j in range(1000)],
        "random_col": [0 if i < 50 else 1 for i in range(1000)]
    })
    y = pd.Series([i % 2 for i in range(1000)])
    pipeline = logistic_regression_binary_pipeline_class(
        parameters={"Logistic Regression Classifier": {
            "n_jobs": 1
        }})
    pipeline.fit(X, y)

    error_text = "Feature 'random_col' is mostly one value, 1, and cannot be"

    # Default percentiles: window too narrow.
    with pytest.raises(ValueError, match=error_text):
        partial_dependence(pipeline,
                           X,
                           features="random_col",
                           grid_resolution=20)
    # Explicit upper percentile of 0.955 is still too narrow.
    with pytest.raises(ValueError, match=error_text):
        partial_dependence(pipeline,
                           X,
                           features="random_col",
                           percentiles=(0.01, 0.955),
                           grid_resolution=20)
    # Same failure when the feature is selected by positional index.
    with pytest.raises(ValueError, match=error_text):
        partial_dependence(pipeline,
                           X,
                           features=2,
                           percentiles=(0.01, 0.955),
                           grid_resolution=20)

    # Widening the window to 0.96 captures both values: two grid points.
    dep = partial_dependence(pipeline,
                             X,
                             features="random_col",
                             percentiles=(0.01, 0.96),
                             grid_resolution=20)
    assert list(dep.columns) == [
        "feature_values", "partial_dependence", "class_label"
    ]
    assert len(dep["partial_dependence"]) == 2
    assert len(dep["feature_values"]) == 2
    assert not dep.isnull().any(axis=None)