def test_wine():
    """Check the shapes and container types returned by ``demos.load_wine``.

    The default call is asserted to return Woodwork containers
    (``ww.DataTable`` / ``ww.DataColumn``); passing ``return_pandas=True``
    is asserted to return a pandas ``DataFrame`` / ``Series``. Both
    variants must expose 178 rows and 13 feature columns.
    """
    # (loader kwargs, expected feature container, expected target container)
    expectations = (
        ({}, ww.DataTable, ww.DataColumn),
        ({"return_pandas": True}, pd.DataFrame, pd.Series),
    )
    for load_kwargs, features_type, target_type in expectations:
        X, y = demos.load_wine(**load_kwargs)
        assert X.shape == (178, 13)
        assert y.shape == (178, )
        assert isinstance(X, features_type)
        assert isinstance(y, target_type)
def test_partial_dependence_multiclass(
        logistic_regression_multiclass_pipeline_class):
    """Check the layout of partial dependence output for a multiclass pipeline.

    Fits the multiclass logistic regression pipeline on the wine data and
    verifies the frame returned by ``partial_dependence`` for a single
    feature and for a pair of features.
    """
    X, y = load_wine()
    pipeline = logistic_regression_multiclass_pipeline_class(
        parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
    pipeline.fit(X, y)

    grid_resolution = 20
    num_classes = y.to_series().nunique()
    # Both variants are asserted to have one row per (class, grid point).
    expected_rows = num_classes * grid_resolution

    # One-way partial dependence: fixed three-column layout.
    one_way = partial_dependence(pipeline=pipeline,
                                 X=X,
                                 features="magnesium",
                                 grid_resolution=grid_resolution)
    assert "class_label" in one_way.columns
    assert one_way["class_label"].nunique() == num_classes
    assert len(one_way.index) == expected_rows
    assert list(one_way.columns) == [
        "feature_values", "partial_dependence", "class_label"
    ]

    # Two-way partial dependence: grid_resolution columns plus one more
    # (presumably the class label column).
    two_way = partial_dependence(pipeline=pipeline,
                                 X=X,
                                 features=("magnesium", "alcohol"),
                                 grid_resolution=grid_resolution)
    assert "class_label" in two_way.columns
    assert two_way["class_label"].nunique() == num_classes
    assert len(two_way.index) == expected_rows
    assert len(two_way.columns) == grid_resolution + 1
def test_pipeline_has_classes_property(
        logistic_regression_binary_pipeline_class,
        logistic_regression_multiclass_pipeline_class, problem_type, use_ints):
    """Check that ``classes_`` is only available after fitting.

    Accessing ``pipeline.classes_`` before ``fit`` must raise an
    ``AttributeError``; afterwards it must equal the expected class
    labels, either the original strings or their integer encoding when
    ``use_ints`` is truthy.
    """
    estimator_parameters = {"Logistic Regression Classifier": {"n_jobs": 1}}

    if problem_type == "binary":
        X, y = load_breast_cancer(return_pandas=True)
        pipeline = logistic_regression_binary_pipeline_class(
            parameters=estimator_parameters)
        if use_ints:
            y = y.map({'malignant': 0, 'benign': 1})
        answer = [0, 1] if use_ints else ["benign", "malignant"]
    elif problem_type == "multi":
        X, y = load_wine(return_pandas=True)
        pipeline = logistic_regression_multiclass_pipeline_class(
            parameters=estimator_parameters)
        if use_ints:
            y = y.map({"class_0": 0, "class_1": 1, "class_2": 2})
        answer = [0, 1, 2] if use_ints else ["class_0", "class_1", "class_2"]

    # The attribute access itself is the statement under test.
    with pytest.raises(
            AttributeError,
            match="Cannot access class names before fitting the pipeline."):
        pipeline.classes_

    pipeline.fit(X, y)
    fitted_classes = pd.Series(pipeline.classes_)
    expected_classes = pd.Series(answer)
    pd.testing.assert_series_equal(fitted_classes, expected_classes)
def test_graph_partial_dependence_multiclass(logistic_regression_multiclass_pipeline_class):
    """Check the plotly figures produced for multiclass partial dependence.

    Covers one-way and two-way plots, each with and without an explicit
    ``class_label``, and verifies that an unknown class label is rejected
    with a ``ValueError``. Skipped when plotly is not installed.
    """
    go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')
    X, y = load_wine()
    pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
    pipeline.fit(X, y)

    def check_axes_share_range(layout, axis_pairs):
        # Every listed pair of subplot axes must have the same x and y range.
        for first_axis, second_axis in axis_pairs:
            for axis_type in ('x', 'y'):
                assert layout[axis_type + first_axis]['range'] == layout[axis_type + second_axis]['range']

    def check_single_class_figure(figure):
        # A figure requested for 'class_1' must hold exactly one 20-point trace.
        assert isinstance(figure, go.Figure)
        traces = figure.to_dict()['data']
        assert len(traces) == 1
        only_trace = traces[0]
        assert len(only_trace['x']) == 20
        assert len(only_trace['y']) == 20
        assert only_trace['name'] == 'class_1'

    def check_unknown_class_rejected():
        msg = "Class wine is not one of the classes the pipeline was fit on: class_0, class_1, class_2"
        with pytest.raises(ValueError, match=msg):
            graph_partial_dependence(pipeline, X, features='alcohol', class_label='wine')

    # One-way, no class label: one trace per fitted class.
    one_way_all_classes = graph_partial_dependence(pipeline, X, features='magnesium', grid_resolution=20)
    assert isinstance(one_way_all_classes, go.Figure)
    fig_dict = one_way_all_classes.to_dict()
    assert len(fig_dict['data']) == len(pipeline.classes_)
    for trace, label in zip(fig_dict['data'], pipeline.classes_):
        assert len(trace['x']) == 20
        assert len(trace['y']) == 20
        assert trace['name'] == label
    check_axes_share_range(fig_dict['layout'],
                           [('axis2', 'axis3'), ('axis2', 'axis4'), ('axis3', 'axis4')])

    # One-way, explicit class label.
    check_single_class_figure(
        graph_partial_dependence(pipeline, X, features='magnesium', class_label='class_1', grid_resolution=20))
    check_unknown_class_rejected()

    # Two-way, no class label: three traces, named in class order.
    two_way_all_classes = graph_partial_dependence(pipeline, X, features=('magnesium', 'alcohol'), grid_resolution=20)
    assert isinstance(two_way_all_classes, go.Figure)
    fig_dict = two_way_all_classes.to_dict()
    traces = fig_dict['data']
    assert len(traces) == 3, "Figure does not have partial dependence data for each class."
    x_lengths = [len(trace['x']) for trace in traces]
    assert x_lengths == [20, 20, 20]
    y_lengths = [len(trace['y']) for trace in traces]
    assert y_lengths == [20, 20, 20]
    assert [trace['name'] for trace in traces] == ["class_0", "class_1", "class_2"]
    check_axes_share_range(fig_dict['layout'],
                           [('axis', 'axis2'), ('axis', 'axis3'), ('axis2', 'axis3')])

    # Two-way, explicit class label.
    check_single_class_figure(
        graph_partial_dependence(pipeline, X, features=('magnesium', 'alcohol'), class_label='class_1', grid_resolution=20))
    check_unknown_class_rejected()
def test_invalid_targets_regression_pipeline(target_type, dummy_regression_pipeline_class):
    """Check that a regression pipeline refuses to fit non-numeric targets.

    The wine target is used by default. For ``"category"`` it is cast to
    a pandas categorical; for ``"bool"`` the breast cancer data is loaded
    instead and its labels are mapped to booleans. In every case ``fit``
    must raise a ``ValueError``.
    """
    X, y = load_wine(return_pandas=True)
    if target_type == "category":
        y = pd.Series(y).astype("category")
    elif target_type == "bool":
        X, y = load_breast_cancer(return_pandas=True)
        y = y.map({"malignant": False, "benign": True})

    regression_pipeline = dummy_regression_pipeline_class(parameters={})
    expected_error = "Regression pipeline can only handle numeric target data"
    with pytest.raises(ValueError, match=expected_error):
        regression_pipeline.fit(X, y)
def test_partial_dependence_multiclass_categorical(class_label, logistic_regression_multiclass_pipeline_class):
    """Check partial dependence plots that involve categorical features.

    Two synthetic categorical columns (3 and 6 distinct string values) are
    added to the wine data before fitting. The test then inspects the
    figures for a single categorical feature (bar traces), a
    numeric/categorical pair and a categorical/categorical pair (contour
    traces). Skipped when plotly is not installed.

    When ``class_label`` is None the traces are expected to be named
    ``class_<i>`` in order; otherwise every trace must carry
    ``class_label``.
    """
    pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')

    X, y = load_wine()
    X['categorical_column'] = ww.DataColumn(pd.Series([i % 3 for i in range(X.shape[0])]).astype(str),
                                            logical_type="Categorical")
    X['categorical_column_2'] = ww.DataColumn(pd.Series([i % 6 for i in range(X.shape[0])]).astype(str),
                                              logical_type="Categorical")

    pipeline = logistic_regression_multiclass_pipeline_class({"Logistic Regression Classifier": {"n_jobs": 1}})
    pipeline.fit(X, y)

    def expected_trace_name(position):
        # Trace name expected at ``position`` for the requested class_label.
        return f'class_{position}' if class_label is None else class_label

    # Each figure is converted to a dict once; the conversion does not depend
    # on the loop variable, so repeating it per trace was wasted work.
    fig = graph_partial_dependence(pipeline, X, features='categorical_column',
                                   class_label=class_label, grid_resolution=5)
    fig_dict = fig.to_dict()
    for i, plot_data in enumerate(fig_dict['data']):
        assert plot_data['type'] == 'bar'
        assert plot_data['x'].tolist() == ['0', '1', '2']
        assert plot_data['name'] == expected_trace_name(i)

    fig = graph_partial_dependence(pipeline, X, features=('alcohol', 'categorical_column'),
                                   class_label=class_label, grid_resolution=5)
    fig_dict = fig.to_dict()
    for i, plot_data in enumerate(fig_dict['data']):
        assert plot_data['type'] == 'contour'
        assert fig_dict['layout']['yaxis']['ticktext'] == ['0', '1', '2']
        assert plot_data['name'] == expected_trace_name(i)

    # NOTE(review): here the 3-value ticks are asserted on the x axis and the
    # 6-value ticks on the y axis even though 'categorical_column_2' is listed
    # first -- confirm this matches the intended axis assignment.
    fig = graph_partial_dependence(pipeline, X, features=('categorical_column_2', 'categorical_column'),
                                   class_label=class_label, grid_resolution=5)
    fig_dict = fig.to_dict()
    for i, plot_data in enumerate(fig_dict['data']):
        assert plot_data['type'] == 'contour'
        assert fig_dict['layout']['xaxis']['ticktext'] == ['0', '1', '2']
        assert fig_dict['layout']['yaxis']['ticktext'] == ['0', '1', '2', '3', '4', '5']
        assert plot_data['name'] == expected_trace_name(i)