Exemple #1
0
def test_na_vals_reg_plot_raise_warning():
    """Check that NaNs in a regression target trigger a removal warning.

    Every 50th target value of the diabetes data is replaced by NaN.
    ``plot``, ``plot_regression_continuous`` and
    ``plot_regression_categorical`` are then each expected to emit a
    ``UserWarning`` stating that the missing values were removed.
    """
    X, y = load_diabetes(return_X_y=True)
    X = pd.DataFrame(X)
    # np.nan is the canonical spelling; the np.NaN alias no longer exists
    # in NumPy 2.0 and later.
    y[::50] = np.nan
    X['target_col'] = y
    scatter_alpha = _get_scatter_alpha('auto', X['target_col'])
    scatter_size = _get_scatter_size('auto', X['target_col'])
    # All three entry points must report the same message.
    expected_warning = ("Missing values in target_col have "
                        "been removed for regression")
    with pytest.warns(UserWarning, match=expected_warning):
        plot(X, 'target_col')
    with pytest.warns(UserWarning, match=expected_warning):
        plot_regression_continuous(X,
                                   'target_col',
                                   scatter_alpha=scatter_alpha,
                                   scatter_size=scatter_size)
    with pytest.warns(UserWarning, match=expected_warning):
        plot_regression_categorical(X,
                                    'target_col',
                                    scatter_alpha=scatter_alpha,
                                    scatter_size=scatter_size)
Exemple #2
0
def test_plot_string_target():
    """Smoke test: ``plot`` accepts a target column holding string labels."""
    X, y = make_blobs(n_samples=30)
    data = pd.DataFrame(X)
    # Translate the integer blob labels to strings in a single step.
    # Writing strings into an integer Series in place relies on implicit
    # upcasting to object dtype, which recent pandas versions deprecate.
    # NOTE(review): assumes the labels are exactly 0, 1 and 2, as the
    # original in-place assignments did.
    data['target'] = pd.Series(y).map({0: 'a', 1: 'b', 2: 'c'})
    plot(data, target_col='target')
Exemple #3
0
def test_float_classification_target():
    """Check that classification plots work when the target has float dtype.

    First the integer class labels are merely cast to float and must still
    be detected as categorical. Then the labels are shifted by 0.2 so they
    are no longer integer-valued, in which case the target type has to be
    given explicitly through ``type_hints``.
    """
    X, y = make_blobs()
    data = pd.DataFrame(X)
    # The builtin float replaces the np.float alias, which was removed from
    # NumPy in version 1.24.
    data['target'] = y.astype(float)
    types = detect_types(data)
    assert types.categorical['target']
    plot(data, target_col='target')
    # same with "actual float" - we need to specify classification for that
    data['target'] = y.astype(float) + .2
    plot(data, target_col='target', type_hints={'target': 'categorical'})
    plt.close("all")
Exemple #4
0
def test_negative_ordinal():
    """Smoke test: a low-cardinality int column with negative values plots.

    Column 0 holds integers in [-5, 4]; column 1 is a binary column used as
    the target.
    """
    n_rows = 1000
    shifted_ints = np.random.randint(0, 10, size=n_rows) - 5
    binary_values = np.random.randint(0, 2, size=n_rows)
    data = pd.DataFrame([shifted_ints, binary_values]).T

    # the first column must be detected as low_card_int, the second as
    # categorical
    detected = detect_types(data).T.idxmax()
    expected = ['low_card_int', 'categorical']
    assert (detected == expected).all()
    assert guess_ordinal(data[0])

    # plotting itself is only checked for not raising
    plot(data, target_col=1)
Exemple #5
0
def test_type_hints(add, feature_type, target_type):
    """Check that ``type_hints`` are reflected in the figure ``plot`` draws.

    Parameters
    ----------
    add : value added to every entry of the random integer feature.
    feature_type : type hint passed for feature column ``0``.
    target_type : type hint passed for the ``'target'`` column.
    """
    frame = pd.DataFrame(np.random.randint(4, size=100)) + add
    frame['target'] = np.random.uniform(size=100)
    hints = {0: feature_type, 'target': target_type}
    plot(frame, type_hints=hints, target_col='target')

    # the figure title has to mention the hinted feature type
    title = plt.gcf()._suptitle.get_text()
    assert feature_type.capitalize() in title

    # 'target' appears in an axis label if and only if the target is
    # treated as continuous
    current_axes = plt.gca()
    axis_labels = current_axes.get_ylabel() + current_axes.get_xlabel()
    target_is_continuous = target_type == 'continuous'
    assert ('target' in axis_labels) == target_is_continuous
    plt.close("all")
Exemple #6
0
def test_plot_regression_missing_categories():
    """Check that categories with no observed target are left out of the plot.

    Only rows of categories 'a' and 'b' receive a target value; rows of 'c'
    and 'd' keep a NaN target. The y tick labels of the last returned axes
    grid must then be exactly ['a', 'b'].
    """
    df = pd.DataFrame(
        {'cat_col': np.random.choice(['a', 'b', 'c', 'd'], size=100)})
    # np.nan is the canonical spelling; the np.NaN alias no longer exists
    # in NumPy 2.0 and later.
    df['target'] = np.nan
    counts = df.cat_col.value_counts()
    df.loc[df.cat_col == "a", 'target'] = np.random.normal(size=counts['a'])
    df.loc[df.cat_col == "b", 'target'] = np.random.normal(1, size=counts['b'])
    axes = plot(df, target_col="target")
    ticklabels = axes[-1][0, 0].get_yticklabels()
    assert [label.get_text() for label in ticklabels] == ['a', 'b']
Exemple #7
0
def test_plot_regression_categorical_missing_value():
    """Check that missing values in a categorical feature get their own tick.

    The feature 'x' takes the values 'a', 'b' and NaN. The first axes of the
    third element returned by ``plot`` must show three y tick labels, the
    last one reading 'dabl_mi...'.
    """
    df = pd.DataFrame({'y': np.random.normal(size=300)})
    # shift the target for the later row ranges
    df.loc[100:200, 'y'] += 1
    df.loc[200:300, 'y'] += 2
    df['x'] = 'a'
    df.loc[100:200, 'x'] = 'b'
    # np.nan is the canonical spelling; the np.NaN alias no longer exists
    # in NumPy 2.0 and later.
    df.loc[200:300, 'x'] = np.nan
    res = plot(df, target_col='y')
    assert len(res[2][0, 0].get_yticklabels()) == 3
    # NOTE(review): 'dabl_mi...' is presumably the shortened placeholder
    # label for missing values - confirm against the plotting code.
    assert res[2][0, 0].get_yticklabels()[2].get_text() == 'dabl_mi...'
Exemple #8
0
def test_plots_smoke(continuous_features, categorical_features, task):
    """Smoke test ``plot`` on synthetic mixed-type data.

    Parameters
    ----------
    continuous_features : number of continuous feature columns to generate.
    categorical_features : number of categorical feature columns to generate.
    task : "classification" bins the target into categories; any other
        value leaves the target continuous.
    """
    # simple smoke test
    # should be parametrized
    n_samples = 100
    X_cont, y_cont = make_regression(n_samples=n_samples,
                                     n_features=continuous_features,
                                     n_informative=min(continuous_features, 2))
    X_cat, y_cat = make_regression(n_samples=n_samples,
                                   n_features=categorical_features,
                                   n_informative=min(categorical_features, 2))
    if X_cat.shape[1] > 0:
        # discretize the second feature set so it can act as categorical data
        X_cat = KBinsDiscretizer(encode='ordinal').fit_transform(X_cat)
    cont_columns = ["asdf_%d_cont" % i for i in range(continuous_features)]
    df_cont = pd.DataFrame(X_cont, columns=cont_columns)
    if categorical_features > 0:
        cat_columns = ["asdf_%d_cat" % i for i in range(categorical_features)]
        df_cat = pd.DataFrame(X_cat, columns=cat_columns).astype('int')
        df_cat = df_cat.astype("category")
        X_df = pd.concat([df_cont, df_cat], axis=1)
    else:
        X_df = df_cont
    assert (X_df.shape[1] == continuous_features + categorical_features)
    X_clean = clean(X_df.copy())
    y = y_cont + y_cat
    if X_df.shape[1] == 0:
        # no features at all: fall back to a random target
        y = np.random.uniform(size=n_samples)
    if task == "classification":
        y = np.digitize(y, np.percentile(y, [5, 10, 60, 85]))
    X_clean['target'] = y
    if task == "classification":
        X_clean['target'] = X_clean['target'].astype('category')
    types = detect_types(X_clean)
    column_types = types.T.idxmax()
    # column_types is indexed by column name, so positions are addressed
    # explicitly through .iloc; positional access through plain [] on a
    # label-indexed Series is deprecated in pandas.
    assert np.all(column_types.iloc[:continuous_features] == 'continuous')
    assert np.all(column_types.iloc[continuous_features:-1] == 'categorical')
    if task == "classification":
        assert column_types.iloc[-1] == 'categorical'
    else:
        assert column_types.iloc[-1] == 'continuous'

    plot(X_clean, target_col='target')
    plt.close("all")
Exemple #9
0
def test_plot_regression_with_target_outliers():
    """Check that a single extreme target value is dropped with a warning.

    The targets lie between 50 and 100 except for one appended record with
    target 0. Both regression plot helpers must warn that one outlier was
    dropped, and the first axis returned by ``plot`` must start its x ticks
    at 40.
    """
    df = pd.DataFrame(
        data={
            "feature": np.random.randint(low=1, high=100, size=200),
            # target values are bound between 50 and 100
            "target": np.random.randint(low=50, high=100, size=200)
        })
    # append single outlier record with target value 0
    # (pd.concat is used because DataFrame.append was removed in pandas 2.0)
    outlier = pd.DataFrame([{"feature": 50, "target": 0}])
    df = pd.concat([df, outlier], ignore_index=True)

    with pytest.warns(UserWarning,
                      match="Dropped 1 outliers in column target."):
        plot_regression_continuous(df, target_col='target')

    with pytest.warns(UserWarning,
                      match="Dropped 1 outliers in column target."):
        plot_regression_categorical(df, target_col='target')

    res = plot(df, target_col='target')
    assert len(res) == 3
    ax = res[0]
    # ensure outlier at 0 was removed
    assert ax.get_xticks()[0] == 40
Exemple #10
0
def test_plot_int_column_name():
    """Smoke test: ``plot`` accepts an integer column name as ``target_col``."""
    features, labels = make_blobs()
    frame = pd.DataFrame(features)
    # store the labels under the integer column name 3
    target_name = 3
    frame[target_name] = labels
    plot(frame, target_col=target_name)
Exemple #11
0
def test_plot_lda_binary():
    """Smoke test: ``plot`` on two-center blob data with KDE univariate plots."""
    features, labels = make_blobs(centers=2)
    frame = pd.DataFrame(features)
    plot(frame, labels, univariate_plot='kde')
Exemple #12
0
def test_plot_regression_numpy():
    """Smoke test: ``plot`` takes the raw ``make_regression`` output as is."""
    features, target = make_regression()
    # no DataFrame conversion on purpose: the raw return values are passed
    plot(features, target)
Exemple #13
0
def test_plot_X_y():
    """Smoke test: ``plot`` accepts a DataFrame of features plus a separate y."""
    features, labels = make_blobs()
    plot(pd.DataFrame(features), labels)
Exemple #14
0
def test_plot_target_low_card_int():
    """Smoke test: ``plot`` on the digits data with 'target' as target column."""
    digits_frame = data_df_from_bunch(load_digits())
    # only every tenth row is plotted
    subsample = digits_frame[::10]
    plot(subsample, target_col='target')