Beispiel #1
0
def test_graph_partial_dependence_multiclass(logistic_regression_multiclass_pipeline_class):
    go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')
    X, y = load_wine()
    pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
    pipeline.fit(X, y)

    # Test one-way without class labels
    fig_one_way_no_class_labels = graph_partial_dependence(pipeline, X, features='magnesium', grid_resolution=20)
    assert isinstance(fig_one_way_no_class_labels, go.Figure)
    fig_dict = fig_one_way_no_class_labels.to_dict()
    assert len(fig_dict['data']) == len(pipeline.classes_)
    for data, label in zip(fig_dict['data'], pipeline.classes_):
        assert len(data['x']) == 20
        assert len(data['y']) == 20
        assert data['name'] == label

    # Check that all the subplots axes have the same range
    for suplot_1_axis, suplot_2_axis in [('axis2', 'axis3'), ('axis2', 'axis4'), ('axis3', 'axis4')]:
        for axis_type in ['x', 'y']:
            assert fig_dict['layout'][axis_type + suplot_1_axis]['range'] == fig_dict['layout'][axis_type + suplot_2_axis]['range']

    # Test one-way with class labels
    fig_one_way_class_labels = graph_partial_dependence(pipeline, X, features='magnesium', class_label='class_1', grid_resolution=20)
    assert isinstance(fig_one_way_class_labels, go.Figure)
    fig_dict = fig_one_way_class_labels.to_dict()
    assert len(fig_dict['data']) == 1
    assert len(fig_dict['data'][0]['x']) == 20
    assert len(fig_dict['data'][0]['y']) == 20
    assert fig_dict['data'][0]['name'] == 'class_1'

    msg = "Class wine is not one of the classes the pipeline was fit on: class_0, class_1, class_2"
    with pytest.raises(ValueError, match=msg):
        graph_partial_dependence(pipeline, X, features='alcohol', class_label='wine')

    # Test two-way without class labels
    fig_two_way_no_class_labels = graph_partial_dependence(pipeline, X, features=('magnesium', 'alcohol'), grid_resolution=20)
    assert isinstance(fig_two_way_no_class_labels, go.Figure)
    fig_dict = fig_two_way_no_class_labels.to_dict()
    assert len(fig_dict['data']) == 3, "Figure does not have partial dependence data for each class."
    assert all([len(fig_dict["data"][i]['x']) == 20 for i in range(3)])
    assert all([len(fig_dict["data"][i]['y']) == 20 for i in range(3)])
    assert [fig_dict["data"][i]['name'] for i in range(3)] == ["class_0", "class_1", "class_2"]

    # Check that all the subplots axes have the same range
    for suplot_1_axis, suplot_2_axis in [('axis', 'axis2'), ('axis', 'axis3'), ('axis2', 'axis3')]:
        for axis_type in ['x', 'y']:
            assert fig_dict['layout'][axis_type + suplot_1_axis]['range'] == fig_dict['layout'][axis_type + suplot_2_axis]['range']

    # Test two-way with class labels
    fig_two_way_class_labels = graph_partial_dependence(pipeline, X, features=('magnesium', 'alcohol'), class_label='class_1', grid_resolution=20)
    assert isinstance(fig_two_way_class_labels, go.Figure)
    fig_dict = fig_two_way_class_labels.to_dict()
    assert len(fig_dict['data']) == 1
    assert len(fig_dict['data'][0]['x']) == 20
    assert len(fig_dict['data'][0]['y']) == 20
    assert fig_dict['data'][0]['name'] == 'class_1'

    msg = "Class wine is not one of the classes the pipeline was fit on: class_0, class_1, class_2"
    with pytest.raises(ValueError, match=msg):
        graph_partial_dependence(pipeline, X, features='alcohol', class_label='wine')
Beispiel #2
0
def test_graph_two_way_partial_dependence(test_pipeline):
    X, y = load_breast_cancer()

    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    clf = test_pipeline
    clf.fit(X, y)
    fig = graph_partial_dependence(clf,
                                   X,
                                   features=('mean radius', 'mean area'),
                                   grid_resolution=20)
    assert isinstance(fig, go.Figure)
    fig_dict = fig.to_dict()
    assert fig_dict['layout']['title'][
        'text'] == "Partial Dependence of 'mean radius' vs. 'mean area'"
    assert len(fig_dict['data']) == 1
    assert fig_dict['data'][0]['name'] == "Partial Dependence"

    part_dep_data = partial_dependence(clf,
                                       X,
                                       features=('mean radius', 'mean area'),
                                       grid_resolution=20)
    assert np.array_equal(fig_dict['data'][0]['x'], part_dep_data.index)
    assert np.array_equal(fig_dict['data'][0]['y'], part_dep_data.columns)
    assert np.array_equal(fig_dict['data'][0]['z'], part_dep_data.values)
Beispiel #3
0
def test_partial_dependence_multiclass_categorical(class_label,
                                                   logistic_regression_multiclass_pipeline_class):
    pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')

    X, y = load_wine()
    X['categorical_column'] = ww.DataColumn(pd.Series([i % 3 for i in range(X.shape[0])]).astype(str),
                                            logical_type="Categorical")
    X['categorical_column_2'] = ww.DataColumn(pd.Series([i % 6 for i in range(X.shape[0])]).astype(str),
                                              logical_type="Categorical")

    pipeline = logistic_regression_multiclass_pipeline_class({"Logistic Regression Classifier": {"n_jobs": 1}})

    pipeline.fit(X, y)

    fig = graph_partial_dependence(pipeline, X, features='categorical_column', class_label=class_label,
                                   grid_resolution=5)

    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'bar'
        assert plot_data['x'].tolist() == ['0', '1', '2']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label

    fig = graph_partial_dependence(pipeline, X, features=('alcohol', 'categorical_column'), class_label=class_label,
                                   grid_resolution=5)

    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'contour'
        assert fig.to_dict()['layout']['yaxis']['ticktext'] == ['0', '1', '2']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label

    fig = graph_partial_dependence(pipeline, X, features=('categorical_column_2', 'categorical_column'),
                                   class_label=class_label, grid_resolution=5)

    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'contour'
        assert fig.to_dict()['layout']['xaxis']['ticktext'] == ['0', '1', '2']
        assert fig.to_dict()['layout']['yaxis']['ticktext'] == ['0', '1', '2', '3', '4', '5']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label
Beispiel #4
0
def test_graph_partial_dependence_regression_and_binary_categorical(problem_type, linear_regression_pipeline_class,
                                                                    X_y_regression, X_y_binary,
                                                                    logistic_regression_binary_pipeline_class):
    pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')

    if problem_type == 'binary':
        X, y = X_y_binary
        pipeline = logistic_regression_binary_pipeline_class({"Logistic Regression Classifier": {"n_jobs": 1}})
    else:
        X, y = X_y_regression
        pipeline = linear_regression_pipeline_class({"Linear Regressor": {"n_jobs": 1}})

    X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    X['categorical_column'] = pd.Series([i % 3 for i in range(X.shape[0])]).astype('str')
    X['categorical_column_2'] = pd.Series([i % 6 for i in range(X.shape[0])]).astype('str')

    pipeline.fit(X, y)

    fig = graph_partial_dependence(pipeline, X, features='categorical_column', grid_resolution=5)
    plot_data = fig.to_dict()['data'][0]
    assert plot_data['type'] == 'bar'
    assert plot_data['x'].tolist() == ['0', '1', '2']

    fig = graph_partial_dependence(pipeline, X, features=('0', 'categorical_column'),
                                   grid_resolution=5)
    fig_dict = fig.to_dict()
    plot_data = fig_dict['data'][0]
    assert plot_data['type'] == 'contour'
    assert fig_dict['layout']['yaxis']['ticktext'] == ['0', '1', '2']
    assert fig_dict['layout']['title']['text'] == "Partial Dependence of 'categorical_column' vs. '0'"

    fig = graph_partial_dependence(pipeline, X, features=('categorical_column_2', 'categorical_column'),
                                   grid_resolution=5)
    fig_dict = fig.to_dict()
    plot_data = fig_dict['data'][0]
    assert plot_data['type'] == 'contour'
    assert fig_dict['layout']['xaxis']['ticktext'] == ['0', '1', '2']
    assert fig_dict['layout']['yaxis']['ticktext'] == ['0', '1', '2', '3', '4', '5']
    assert fig_dict['layout']['title']['text'] == "Partial Dependence of 'categorical_column_2' vs. 'categorical_column'"
Beispiel #5
0
def test_graph_partial_dependence_regression_date_order(X_y_binary):
    pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')

    X, y = X_y_binary
    pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Logistic Regression Classifier'])
    X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    dt_series = pd.Series(pd.date_range('20200101', periods=X.shape[0])).sample(frac=1).reset_index(drop=True)
    X['dt_column'] = pd.to_datetime(dt_series, errors='coerce')

    pipeline.fit(X, y)

    fig = graph_partial_dependence(pipeline, X, features='dt_column', grid_resolution=5)
    plot_data = fig.to_dict()['data'][0]
    assert plot_data['type'] == 'scatter'
    assert plot_data['x'].tolist() == list(pd.date_range('20200101', periods=X.shape[0]))