def test_scoring():
    """Check `paired_ttest_5x2cv` with explicit `scoring` metrics on iris.

    A logistic regression is compared against a decision tree, first with
    ``scoring='accuracy'`` and then with ``scoring='f1_macro'``; the rounded
    t statistic and p value are compared with hard-coded expectations.
    """
    X, y = iris_data()
    # Pin solver and multi-class scheme explicitly (as the sibling tests in
    # this suite do) so the exact numbers asserted below do not depend on
    # whichever defaults the installed scikit-learn ships with.
    clf1 = LogisticRegression(random_state=1,
                              solver='liblinear',
                              multi_class='ovr')
    clf2 = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.25,
                         random_state=123)

    # Sanity-check the plain hold-out accuracies before running the t-test.
    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.97
    assert round(score2, 2) == 0.95

    t, p = paired_ttest_5x2cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X, y=y,
                              scoring='accuracy',
                              random_seed=1)

    assert round(t, 3) == -1.539, t
    assert round(p, 3) == 0.184, p

    t, p = paired_ttest_5x2cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X, y=y,
                              scoring='f1_macro',
                              random_seed=1)

    assert round(t, 3) == -1.510, t
    assert round(p, 3) == 0.191, p
Exemple #2
0
def test_gridsearch():
    """Grid-search a `StackingCVClassifier` on iris and check mean CV scores.

    The grid spans the meta-classifier's ``C`` and the random forest's
    ``n_estimators``; the four rounded ``mean_test_score`` values are
    compared with hard-coded expectations.
    """
    np.random.seed(123)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingCVClassifier(classifiers=[clf1, clf2],
                                meta_classifier=meta,
                                use_probas=True,
                                shuffle=False)

    params = {
        'meta_classifier__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200]
    }

    # Compare Version against Version, like the other version checks in this
    # suite; comparing against a raw string is not supported by every
    # Version implementation.
    if Version(sklearn_version) < Version('0.24.1'):
        grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5, iid=False)
    else:
        grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5)
    X, y = iris_data()
    grid.fit(X, y)

    mean_scores = [round(s, 2) for s in grid.cv_results_['mean_test_score']]

    assert mean_scores == [0.96, 0.95, 0.96, 0.95]
def test_not_fitted():
    """An unfitted `StackingClassifier` must refuse to predict.

    `predict`, `predict_proba` and `predict_meta_features` are each expected
    to raise `NotFittedError` with the same message.
    """
    np.random.seed(123)
    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    stack = StackingClassifier(classifiers=[forest, bayes],
                               use_probas=True,
                               meta_classifier=meta_learner)

    X, y = iris_data()

    expected_msg = ("This StackingClassifier instance is not fitted yet."
                    " Call 'fit' with appropriate arguments"
                    " before using this method.")

    # Look each method up right before it is exercised.
    for method_name in ('predict', 'predict_proba', 'predict_meta_features'):
        assert_raises(NotFittedError,
                      expected_msg,
                      getattr(stack, method_name),
                      X)
Exemple #4
0
def test_use_clones():
    """Exercise the `use_clones` switch of `StackingCVClassifier`.

    After fitting a stack with ``use_clones=True`` the original random
    forest is expected to still raise `NotFittedError`; after fitting with
    ``use_clones=False`` calling `predict` on it must succeed.
    """
    np.random.seed(123)
    X, y = iris_data()

    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()

    cloning_stack = StackingCVClassifier(classifiers=[forest, bayes],
                                         use_clones=True,
                                         meta_classifier=meta_learner,
                                         shuffle=False)
    cloning_stack.fit(X, y)

    unfitted_msg = ("This RandomForestClassifier instance is not fitted yet."
                    " Call 'fit' with appropriate arguments"
                    " before using this estimator.")
    assert_raises(exceptions.NotFittedError, unfitted_msg, forest.predict, X)

    sharing_stack = StackingCVClassifier(classifiers=[forest, bayes],
                                         use_probas=True,
                                         use_clones=False,
                                         meta_classifier=meta_learner,
                                         shuffle=False)
    sharing_stack.fit(X, y)

    # Must not raise now.
    forest.predict(X)
Exemple #5
0
def test_iris_data_uci():
    """`iris_data()` must reproduce the comma-separated file at `DATA_PATH`.

    The last column of the file is used as the integer label, the remaining
    columns as the feature matrix.
    """
    raw = np.genfromtxt(fname=DATA_PATH, delimiter=',')
    expected_X = raw[:, :-1]
    expected_y = raw[:, -1].astype(int)

    actual_X, actual_y = iris_data()

    assert_array_equal(expected_X, actual_X)
    assert_array_equal(expected_y, actual_y)
def test_not_fitted():
    """An unfitted `StackingCVClassifier` must refuse to predict.

    `predict`, `predict_proba` and `predict_meta_features` are each expected
    to raise `NotFittedError` with the same message.
    """
    np.random.seed(123)
    meta_learner = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    stack = StackingCVClassifier(classifiers=[forest, bayes],
                                 use_probas=True,
                                 meta_classifier=meta_learner,
                                 shuffle=False)

    X, y = iris_data()

    expected_msg = ("This StackingCVClassifier instance is not fitted yet."
                    " Call 'fit' with appropriate arguments"
                    " before using this method.")

    # Look each method up right before it is exercised.
    for method_name in ('predict', 'predict_proba', 'predict_meta_features'):
        assert_raises(NotFittedError,
                      expected_msg,
                      getattr(stack, method_name),
                      X)
def test_scoring():
    """Check `paired_ttest_kfold_cv` with explicit `scoring` metrics on iris.

    Runs the test with ``'accuracy'`` and ``'recall_micro'`` and compares the
    rounded t statistic and p value with hard-coded expectations.
    """
    X, y = iris_data()
    logreg = LogisticRegression(random_state=1,
                                solver='liblinear',
                                multi_class='ovr')
    tree = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=123)

    # Hold-out accuracies of the two fitted models.
    logreg_acc = logreg.fit(X_train, y_train).score(X_test, y_test)
    tree_acc = tree.fit(X_train, y_train).score(X_test, y_test)

    assert round(logreg_acc, 2) == 0.96, round(logreg_acc, 2)
    assert round(tree_acc, 2) == 0.91, round(tree_acc, 2)

    expectations = (
        ('accuracy', -1.861, 0.096),
        ('recall_micro', -1.861, 0.096),
    )
    for metric, want_t, want_p in expectations:
        t, p = paired_ttest_kfold_cv(estimator1=logreg,
                                     estimator2=tree,
                                     X=X, y=y,
                                     scoring=metric,
                                     random_seed=1)

        assert round(t, 3) == want_t, t
        assert round(p, 3) == want_p, p
def main():
    """Fit a softmax regression on two standardized iris features and plot it.

    Shows the decision regions of the fitted model first, then a second
    figure with the model's recorded `cost_` values against their index.
    """
    from mlxtend.data import iris_data
    from mlxtend.plotting import plot_decision_regions
    import matplotlib.pyplot as plt

    features, labels = iris_data()
    # keep only sepal length (column 0) and petal width (column 3)
    features = features[:, [0, 3]]

    # z-score both retained columns in place
    for col in (0, 1):
        column = features[:, col]
        features[:, col] = (column - column.mean()) / column.std()

    model = SoftmaxRegression(eta=0.01,
                              epochs=10,
                              minibatches=1,
                              random_seed=0)
    model.fit(features, labels)

    plot_decision_regions(features, labels, clf=model)
    plt.title('Softmax Regression - Gradient Descent')
    plt.show()

    costs = model.cost_
    plt.plot(range(len(costs)), costs)
    plt.xlabel('Iterations')
    plt.ylabel('Cost')
    plt.show()
def test_scoring():
    """Check `paired_ttest_kfold_cv` with explicit `scoring` metrics on iris.

    A logistic regression is compared against a decision tree, first with
    ``scoring='accuracy'`` and then with ``scoring='f1_macro'``; the rounded
    t statistic and p value are compared with hard-coded expectations.
    """
    X, y = iris_data()
    # Pin solver and multi-class scheme explicitly (as the sibling tests in
    # this suite do) so the exact numbers asserted below do not depend on
    # whichever defaults the installed scikit-learn ships with.
    clf1 = LogisticRegression(random_state=1,
                              solver='liblinear',
                              multi_class='ovr')
    clf2 = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.25,
                         random_state=123)

    # Sanity-check the plain hold-out accuracies before running the t-test.
    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.97
    assert round(score2, 2) == 0.95

    t, p = paired_ttest_kfold_cv(estimator1=clf1,
                                 estimator2=clf2,
                                 X=X,
                                 y=y,
                                 scoring='accuracy',
                                 random_seed=1)

    assert round(t, 3) == -1.861, t
    assert round(p, 3) == 0.096, p

    t, p = paired_ttest_kfold_cv(estimator1=clf1,
                                 estimator2=clf2,
                                 X=X,
                                 y=y,
                                 scoring='f1_macro',
                                 random_seed=1)

    assert round(t, 3) == -1.872, t
    assert round(p, 3) == 0.094, p
Exemple #10
0
def test_iris_data_uci():
    """Compare `iris_data()` against the raw file stored at `DATA_PATH`.

    Features are every column but the last; labels are the last column cast
    to ``int``.
    """
    table = np.genfromtxt(fname=DATA_PATH, delimiter=',')
    reference_features = table[:, :-1]
    reference_labels = table[:, -1]
    reference_labels = reference_labels.astype(int)

    loaded_features, loaded_labels = iris_data()

    assert_array_equal(reference_features, loaded_features)
    assert_array_equal(reference_labels, loaded_labels)
Exemple #11
0
def test_EnsembleVoteClassifier_gridsearch():
    """Grid-search a soft-voting `EnsembleVoteClassifier` on iris.

    The grid spans the logistic regression's ``C`` and the random forest's
    ``n_estimators``; the four rounded ``mean_test_score`` values are
    compared with hard-coded expectations.
    """

    clf1 = LogisticRegression(solver='liblinear',
                              multi_class='ovr',
                              random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft')

    params = {
        'logisticregression__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200]
    }

    # Compare Version against Version, like the other version checks in this
    # suite; comparing against a raw string is not supported by every
    # Version implementation.
    if Version(sklearn_version) < Version('0.24.1'):
        grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, iid=False)
    else:
        grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)

    X, y = iris_data()
    grid.fit(X, y)

    mean_scores = [round(s, 2) for s in grid.cv_results_['mean_test_score']]

    assert mean_scores == [0.95, 0.96, 0.96, 0.95]
Exemple #12
0
def test_threshold():
    """`ecdf` with ``percentile=0.8`` on iris column 0 yields 6.5 and 120."""
    features, _ = iris_data()
    first_column = features[:, 0]

    result = ecdf(x=first_column,
                  x_label='sepal length (cm)',
                  percentile=0.8)
    _, threshold, count = result

    assert threshold == 6.5
    assert count == 120
def loadData(standardlize=True):
    """Load iris and split it into train and test parts (30% test).

    When `standardlize` is truthy, the features are first passed through
    `dataStandardlize`. Returns ``(train_X, test_X, train_y, test_y)``.
    """
    features, labels = iris_data()
    features = dataStandardlize(features) if standardlize else features

    parts = train_test_split(features, labels,
                             test_size=0.3,
                             random_state=87)
    train_X, test_X, train_y, test_y = parts
    return train_X, test_X, train_y, test_y
Exemple #14
0
def test_threshold():
    """Check the threshold and count `ecdf` reports at ``percentile=0.8``."""
    data, _labels = iris_data()

    _axis, cutoff, n_below = ecdf(
        x=data[:, 0],
        x_label='sepal length (cm)',
        percentile=0.8,
    )

    assert cutoff == 6.5
    assert n_below == 120
Exemple #15
0
def test_iris_data_r():
    """`iris_data(version='corrected')` must match the patched raw file.

    The reference is built from the file at `DATA_PATH` with rows 34 and 37
    overwritten by the corrected measurements; both the features and the
    integer labels are compared.
    """
    tmp = np.genfromtxt(fname=DATA_PATH, delimiter=',')
    original_r_data_x, original_r_data_y = tmp[:, :-1], tmp[:, -1]
    original_r_data_y = original_r_data_y.astype(int)
    # Apply the two row corrections to the reference features.
    original_r_data_x[34] = [4.9, 3.1, 1.5, 0.2]
    original_r_data_x[37] = [4.9, 3.6, 1.4, 0.1]
    iris_x, iris_y = iris_data(version='corrected')
    assert_array_equal(original_r_data_x, iris_x)
    # The labels were prepared above but never checked; verify them too,
    # mirroring the UCI variant of this test.
    assert_array_equal(original_r_data_y, iris_y)
Exemple #16
0
def test_iris_data_r():
    """Compare the 'corrected' iris variant with a patched copy of the file.

    Rows 34 and 37 of the features read from `DATA_PATH` are replaced by the
    corrected values before comparing features and integer labels with what
    `iris_data(version='corrected')` returns.
    """
    tmp = np.genfromtxt(fname=DATA_PATH, delimiter=',')
    original_r_data_x, original_r_data_y = tmp[:, :-1], tmp[:, -1]
    original_r_data_y = original_r_data_y.astype(int)
    # Patch the two rows that differ in the corrected variant.
    original_r_data_x[34] = [4.9, 3.1, 1.5, 0.2]
    original_r_data_x[37] = [4.9, 3.6, 1.4, 0.1]
    iris_x, iris_y = iris_data(version='corrected')
    assert_array_equal(original_r_data_x, iris_x)
    # Previously the labels were computed but left unchecked.
    assert_array_equal(original_r_data_y, iris_y)
Exemple #17
0
def test_pass_pca_corr_pca_out():
    """Smoke test: the plot accepts a precomputed projection and eigenvalues.

    Passes when `plot_pca_correlation_graph` runs without raising.
    """
    features, _ = iris_data()

    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(features)
    eigenvalues = reducer.explained_variance_

    plot_pca_correlation_graph(
        features,
        variables_names=['1', '2', '3', '4'],
        X_pca=projected,
        explained_variance=eigenvalues,
    )
def loadDataBinary(standardlize=True):
    """Load a two-class subset of iris with labels in {-1, 1}.

    Samples labelled 2 are dropped and label 0 is rewritten to -1. When
    `standardlize` is truthy the features go through `dataStandardlize`.
    The data is then split with 30% held out for testing and returned as
    ``(train_X, test_X, train_y, test_y)``.
    """
    all_X, all_y = iris_data()

    # Boolean mask selecting everything except class 2.
    keep = all_y != 2
    X = all_X[keep]
    y = all_y[keep]

    if standardlize:
        X = dataStandardlize(X)
    y[y == 0] = -1

    parts = train_test_split(X, y, test_size=0.3, random_state=87)
    train_X, test_X, train_y, test_y = parts
    return train_X, test_X, train_y, test_y
def test_verbose():
    """Smoke test: fitting a `StackingClassifier` with ``verbose=3`` works."""
    np.random.seed(123)

    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    base_learners = [RandomForestClassifier(n_estimators=10), GaussianNB()]
    stack = StackingClassifier(classifiers=base_learners,
                               use_probas=True,
                               meta_classifier=meta_learner,
                               verbose=3)

    features, labels = iris_data()
    stack.fit(features, labels)
def test_verbose():
    """Fit a verbose (``verbose=3``) stacking classifier; must not raise."""
    np.random.seed(123)

    meta_clf = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()

    stacker = StackingClassifier(
        classifiers=[forest, bayes],
        use_probas=True,
        meta_classifier=meta_clf,
        verbose=3,
    )

    X, y = iris_data()
    stacker.fit(X, y)
def test__clf_with_no_proba_fail():
    """`create_counterfactual` must raise when a probability is requested
    from a model whose error message says it lacks `predict_proba`.

    A `OneRClassifier` is fitted on iris and the call is expected to raise
    `AttributeError` with the message below.
    """
    X, y = iris_data()
    model = OneRClassifier()
    model.fit(X, y)

    reference_sample = X[15]

    expected_msg = ("Your `model` does not support "
                    "`predict_proba`. Set `y_desired_proba` "
                    " to `None` to use `predict`instead.")

    # Positional arguments forwarded to create_counterfactual.
    call_args = (reference_sample, 2, model, X, 1., 100, 123)
    assert_raises(AttributeError,
                  expected_msg,
                  create_counterfactual,
                  *call_args)
def test_use_features_in_secondary_predict():
    """5-fold accuracy of a stack built with ``use_features_in_secondary``.

    The mean cross-validated accuracy, rounded to two decimals, is expected
    to be 0.95.
    """
    np.random.seed(123)
    X, y = iris_data()

    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    stack = StackingClassifier(classifiers=[forest, bayes],
                               use_features_in_secondary=True,
                               meta_classifier=meta_learner)

    fold_scores = cross_val_score(stack, X, y, cv=5, scoring='accuracy')
    mean_accuracy = round(fold_scores.mean(), 2)
    assert mean_accuracy == 0.95, mean_accuracy
Exemple #23
0
def loadData(standardlize=True):
    """Load iris with labels converted by `to_categorical` and split it.

    Labels are encoded with ``num_classes=3``. When `standardlize` is truthy
    the features go through `dataStandardlize`. 30% of the data is held out
    and ``(train_X, test_X, train_y, test_y)`` is returned.
    """
    features, labels = iris_data()

    encoded_labels = to_categorical(labels, num_classes=3)

    if standardlize:
        features = dataStandardlize(features)

    parts = train_test_split(features, encoded_labels,
                             test_size=0.3, random_state=87)
    train_X, test_X, train_y, test_y = parts
    return train_X, test_X, train_y, test_y
Exemple #24
0
def test_no_X_PCA_but_explained_variance():
    """Eigenvalues without `X_pca` must be rejected with a `ValueError`."""
    # Keep the setup outside the `raises` block so that only the call under
    # test can satisfy the expected exception.
    X, y = iris_data()
    pca = PCA(n_components=2)
    pca.fit(X)
    eigen = pca.explained_variance_

    with pytest.raises(ValueError,
                       match='If `explained variance` is not None, the '
                       '`X_pca` values should not be `None`.'):
        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=None,
                                   explained_variance=eigen)
Exemple #25
0
def test_X_PCA_but_no_explained_variance():
    """`X_pca` without eigenvalues must be rejected with a `ValueError`."""
    # Keep the setup outside the `raises` block so that only the call under
    # test can satisfy the expected exception.
    X, y = iris_data()
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    with pytest.raises(
            ValueError,
            match='If `X_pca` is not None, the `explained variance` '
            'values should not be `None`.'):
        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca,
                                   explained_variance=None)
Exemple #26
0
def test_not_enough_components():
    """Mismatched component / eigenvalue counts must raise `ValueError`.

    Two principal components are passed together with only one eigenvalue.
    """
    s = (
        'Number of principal components must match the number of eigenvalues. Got 2 != 1'
    )

    # Keep the setup outside the `raises` block so that only the call under
    # test can satisfy the expected exception.
    X, y = iris_data()
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    eigen = pca.explained_variance_

    with pytest.raises(ValueError, match=s):
        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca,
                                   explained_variance=eigen[:-1])
def test_use_features_in_secondary_predict_proba():
    """Check first-column `predict_proba` output for the first three samples.

    The stack is built with ``use_features_in_secondary=True``; values are
    compared with hard-coded expectations to three decimals.
    """
    np.random.seed(123)
    X, y = iris_data()

    meta_learner = LogisticRegression(solver='liblinear',
                                      multi_class='ovr',
                                      random_state=1)
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    bayes = GaussianNB()
    stack = StackingClassifier(classifiers=[forest, bayes],
                               use_features_in_secondary=True,
                               meta_classifier=meta_learner)
    stack.fit(X, y)

    sample_rows = [0, 1, 2]
    first_column_proba = stack.predict_proba(X[sample_rows])[:, 0]
    expected = np.array([0.916, 0.828, 0.889])
    np.testing.assert_almost_equal(first_column_proba, expected, 3)
def test__clf_with_no_proba_pass():
    """`create_counterfactual` with ``y_desired_proba=None`` on a OneR model.

    The reference sample must be predicted as 0 and the returned
    counterfactual as 2.
    """
    X, y = iris_data()
    model = OneRClassifier()
    model.fit(X, y)

    reference_sample = X[15]

    counterfactual = create_counterfactual(
        x_reference=reference_sample,
        y_desired=2,
        model=model,
        X_dataset=X,
        y_desired_proba=None,
        lammbda=100,
        random_seed=123,
    )

    assert model.predict(reference_sample.reshape(1, -1)) == 0
    assert model.predict(counterfactual.reshape(1, -1)) == 2
def test_classifier_defaults():
    """Check `paired_ttest_resampled` with its default scoring on iris.

    First compares a logistic regression with an unrestricted decision tree
    (expected values depend on whether scikit-learn is older than 0.20),
    then with a depth-1 tree.
    """
    X, y = iris_data()
    logreg = LogisticRegression(multi_class='ovr',
                                solver='liblinear',
                                random_state=1)
    tree = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=123)

    logreg_acc = logreg.fit(X_train, y_train).score(X_test, y_test)
    tree_acc = tree.fit(X_train, y_train).score(X_test, y_test)

    assert round(logreg_acc, 2) == 0.97
    assert round(tree_acc, 2) == 0.95

    t, p = paired_ttest_resampled(estimator1=logreg,
                                  estimator2=tree,
                                  X=X,
                                  y=y,
                                  random_seed=1)

    legacy_sklearn = Version(sklearn_version) < Version("0.20")
    if legacy_sklearn:
        assert round(t, 3) == -1.809, t
        assert round(p, 3) == 0.081, p
    else:
        assert round(t, 3) == -1.702, t
        assert round(p, 3) == 0.10, p

    # Repeat with a decision stump (max_depth=1).
    stump = DecisionTreeClassifier(max_depth=1, random_state=1)

    stump_acc = stump.fit(X_train, y_train).score(X_test, y_test)

    assert round(stump_acc, 2) == 0.63

    t, p = paired_ttest_resampled(estimator1=logreg,
                                  estimator2=stump,
                                  X=X,
                                  y=y,
                                  random_seed=1)

    assert round(t, 3) == 39.214, t
    assert round(p, 3) == 0.000, p
Exemple #30
0
def test_gridsearch_enumerate_names():
    """Grid-search a stack that contains the same base classifier twice.

    The duplicated random forest is addressed through the enumerated
    parameter prefixes ``randomforestclassifier-1`` / ``-2``. The test only
    checks that fitting the grid search succeeds.
    """
    import inspect

    np.random.seed(123)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingCVClassifier(classifiers=[clf1, clf1, clf2],
                                meta_classifier=meta,
                                shuffle=False)

    params = {'meta_classifier__C': [1.0, 100.0],
              'randomforestclassifier-1__n_estimators': [5, 10],
              'randomforestclassifier-2__n_estimators': [5, 20],
              'use_probas': [True, False]}

    # Newer scikit-learn releases no longer accept `iid`; pass it only when
    # the installed GridSearchCV still has that parameter.
    grid_kwargs = {}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        grid_kwargs['iid'] = False

    grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5,
                        **grid_kwargs)
    X, y = iris_data()
    grid = grid.fit(X, y)
def test_use_features_in_secondary_predict_proba():
    """Verify `predict_proba` of a stack that feeds raw features to the meta model.

    Only the first probability column of samples 0-2 is compared, to three
    decimal places.
    """
    np.random.seed(123)
    features, labels = iris_data()

    meta_clf = LogisticRegression(solver='liblinear',
                                  multi_class='ovr',
                                  random_state=1)
    base_learners = [
        RandomForestClassifier(n_estimators=10, random_state=1),
        GaussianNB(),
    ]
    stacker = StackingClassifier(classifiers=base_learners,
                                 use_features_in_secondary=True,
                                 meta_classifier=meta_clf)
    stacker.fit(features, labels)

    probabilities = stacker.predict_proba(features[[0, 1, 2]])
    observed = probabilities[:, 0]
    expected = np.array([0.916, 0.828, 0.889])
    np.testing.assert_almost_equal(observed, expected, 3)
def test_01_loss_tree():
    """Check `bias_variance_decomp` with ``loss='0-1_loss'`` for a tree.

    Uses a stratified, shuffled 70/30 split of iris and compares the three
    returned averages, rounded to three decimals, with fixed values.
    """
    X, y = iris_data()
    split = train_test_split(X, y,
                             test_size=0.3,
                             random_state=123,
                             shuffle=True,
                             stratify=y)
    X_train, X_test, y_train, y_test = split

    estimator = DecisionTreeClassifier(random_state=123)
    expected_loss, bias, variance = bias_variance_decomp(
        estimator, X_train, y_train, X_test, y_test,
        loss='0-1_loss',
        random_seed=123)

    assert round(expected_loss, 3) == 0.062
    assert round(bias, 3) == 0.022
    assert round(variance, 3) == 0.040
def test_use_features_in_secondary_predict():
    """Cross-validate a stack that passes raw features to the meta model.

    The mean of the five accuracy scores must round to 0.95.
    """
    np.random.seed(123)
    features, labels = iris_data()

    meta_clf = LogisticRegression(solver='liblinear', multi_class='ovr')
    base_learners = [RandomForestClassifier(n_estimators=10), GaussianNB()]
    stacker = StackingClassifier(classifiers=base_learners,
                                 use_features_in_secondary=True,
                                 meta_classifier=meta_clf)

    accuracies = cross_val_score(stacker, features, labels,
                                 cv=5, scoring='accuracy')
    rounded_mean = round(accuracies.mean(), 2)
    assert rounded_mean == 0.95, rounded_mean
def test_01_loss_tree():
    """0-1 loss decomposition of a decision tree on a stratified iris split.

    The average expected loss, bias and variance returned by
    `bias_variance_decomp` are compared after rounding to three decimals.
    """
    features, labels = iris_data()
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels,
        test_size=0.3,
        random_state=123,
        shuffle=True,
        stratify=labels,
    )

    model = DecisionTreeClassifier(random_state=123)
    results = bias_variance_decomp(model, train_X, train_y, test_X, test_y,
                                   loss='0-1_loss',
                                   random_seed=123)
    avg_loss, avg_bias, avg_variance = results

    assert round(avg_loss, 3) == 0.062
    assert round(avg_bias, 3) == 0.022
    assert round(avg_variance, 3) == 0.040
def test_use_features_in_secondary_sparse_input_predict():
    """Cross-validate a single-learner stack on CSR-sparse iris features.

    The stack uses ``use_features_in_secondary=True``; the mean 5-fold
    accuracy must round to 0.97.
    """
    np.random.seed(123)
    X, y = iris_data()

    meta_learner = LogisticRegression(solver='liblinear',
                                      multi_class='ovr',
                                      random_state=1)
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    stack = StackingClassifier(classifiers=[forest],
                               use_features_in_secondary=True,
                               meta_classifier=meta_learner)

    X_sparse = sparse.csr_matrix(X)
    fold_scores = cross_val_score(stack, X_sparse, y,
                                  cv=5, scoring='accuracy')
    mean_accuracy = round(fold_scores.mean(), 2)
    assert mean_accuracy == 0.97, mean_accuracy
def test_use_features_in_secondary_sparse_input_predict():
    """Sparse (CSR) input must work when raw features reach the meta model.

    Expects a mean 5-fold accuracy of 0.97 after rounding to two decimals.
    """
    np.random.seed(123)
    features, labels = iris_data()

    meta_clf = LogisticRegression(solver='liblinear',
                                  multi_class='ovr',
                                  random_state=1)
    only_learner = RandomForestClassifier(n_estimators=10, random_state=1)
    stacker = StackingClassifier(
        classifiers=[only_learner],
        use_features_in_secondary=True,
        meta_classifier=meta_clf,
    )

    accuracies = cross_val_score(
        stacker,
        sparse.csr_matrix(features),
        labels,
        cv=5,
        scoring='accuracy',
    )
    rounded_mean = round(accuracies.mean(), 2)
    assert rounded_mean == 0.97, rounded_mean
def test_classifier_defaults():
    """Run `paired_ttest_resampled` without a `scoring` argument on iris.

    A logistic regression is compared with a full decision tree and then
    with a depth-1 tree. The first comparison has different expected values
    for scikit-learn versions below 0.20.
    """
    features, labels = iris_data()
    linear_model = LogisticRegression(multi_class='ovr',
                                      solver='liblinear',
                                      random_state=1)
    full_tree = DecisionTreeClassifier(random_state=1)

    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.25, random_state=123)

    linear_acc = linear_model.fit(train_X, train_y).score(test_X, test_y)
    full_tree_acc = full_tree.fit(train_X, train_y).score(test_X, test_y)

    assert round(linear_acc, 2) == 0.97
    assert round(full_tree_acc, 2) == 0.95

    t, p = paired_ttest_resampled(estimator1=linear_model,
                                  estimator2=full_tree,
                                  X=features, y=labels,
                                  random_seed=1)

    if Version(sklearn_version) >= Version("0.20"):
        assert round(t, 3) == -1.702, t
        assert round(p, 3) == 0.10, p
    else:
        assert round(t, 3) == -1.809, t
        assert round(p, 3) == 0.081, p

    # Second comparison: restrict the tree to a single split level.
    shallow_tree = DecisionTreeClassifier(max_depth=1, random_state=1)

    shallow_acc = shallow_tree.fit(train_X, train_y).score(test_X, test_y)

    assert round(shallow_acc, 2) == 0.63

    t, p = paired_ttest_resampled(estimator1=linear_model,
                                  estimator2=shallow_tree,
                                  X=features, y=labels,
                                  random_seed=1)

    assert round(t, 3) == 39.214, t
    assert round(p, 3) == 0.000, p
def test_scoring():
    """Check `paired_ttest_resampled` with explicit `scoring` metrics.

    Runs ``'accuracy'`` and ``'f1_macro'``; each has one pair of expected
    values for scikit-learn older than 0.20 and another for newer versions.
    """
    X, y = iris_data()
    logreg = LogisticRegression(multi_class='ovr',
                                solver='liblinear',
                                random_state=1)
    tree = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=123)

    logreg_acc = logreg.fit(X_train, y_train).score(X_test, y_test)
    tree_acc = tree.fit(X_train, y_train).score(X_test, y_test)

    assert round(logreg_acc, 2) == 0.97
    assert round(tree_acc, 2) == 0.95

    # metric -> (expected for sklearn < 0.20, expected otherwise)
    expectations = (
        ('accuracy', (-1.809, 0.081), (-1.702, 0.1)),
        ('f1_macro', (-1.690, 0.102), (-1.561, 0.129)),
    )
    for metric, legacy_expected, current_expected in expectations:
        t, p = paired_ttest_resampled(estimator1=logreg,
                                      estimator2=tree,
                                      X=X,
                                      y=y,
                                      scoring=metric,
                                      random_seed=1)

        if Version(sklearn_version) < Version('0.20'):
            want_t, want_p = legacy_expected
        else:
            want_t, want_p = current_expected

        assert round(t, 3) == want_t, t
        assert round(p, 3) == want_p, p
def test__large_lambda():
    """`create_counterfactual` with ``lammbda=100`` and a desired proba of 1.

    The reference sample's most probable class must be 0, the
    counterfactual's must be 2, and the counterfactual's last-column
    probability (rounded to two decimals) must be at least 0.96.
    """
    X, y = iris_data()
    model = LogisticRegression()
    model.fit(X, y)

    reference_sample = X[15]

    counterfactual = create_counterfactual(
        x_reference=reference_sample,
        y_desired=2,
        model=model,
        X_dataset=X,
        y_desired_proba=1.,
        lammbda=100,
        random_seed=123,
    )

    reference_proba = model.predict_proba(reference_sample.reshape(1, -1))
    counterfactual_proba = model.predict_proba(counterfactual.reshape(1, -1))

    assert np.argmax(reference_proba) == 0
    assert np.argmax(counterfactual_proba) == 2
    assert round(counterfactual_proba.flatten()[-1], 2) >= 0.96
Exemple #40
0
def test_gridsearch():
    """Grid-search a `StackingClassifier` on iris and check mean CV scores.

    The grid covers the meta-classifier's ``C`` and the forest's
    ``n_estimators``; the rounded ``mean_test_score`` values must equal the
    hard-coded list.
    """
    np.random.seed(123)

    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    stack = StackingClassifier(classifiers=[forest, bayes],
                               meta_classifier=meta_learner)

    search_space = {
        'meta_classifier__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200],
    }

    search = GridSearchCV(estimator=stack, param_grid=search_space, cv=5)
    X, y = iris_data()
    search.fit(X, y)

    mean_scores = []
    for score in search.cv_results_['mean_test_score']:
        mean_scores.append(round(score, 2))

    assert mean_scores == [0.95, 0.97, 0.96, 0.96], mean_scores
Exemple #41
0
def test_EnsembleVoteClassifier_gridsearch_enumerate_names():
    """Grid-search an ensemble that contains the same classifier twice.

    The duplicated logistic regression is addressed through the enumerated
    parameter prefixes ``logisticregression-1`` / ``-2``. The test only
    checks that fitting the grid search succeeds.
    """
    import inspect

    clf1 = LogisticRegression(solver='liblinear',
                              multi_class='ovr',
                              random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf1, clf2])

    params = {
        'logisticregression-1__C': [1.0, 100.0],
        'logisticregression-2__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [5, 20],
        'voting': ['hard', 'soft']
    }

    # Newer scikit-learn releases no longer accept `iid`; pass it only when
    # the installed GridSearchCV still has that parameter.
    grid_kwargs = {}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        grid_kwargs['iid'] = False

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5,
                        **grid_kwargs)

    X, y = iris_data()
    grid = grid.fit(X, y)
def test_train_size():
    """``test_size=None`` must make `paired_ttest_resampled` raise.

    A `ValueError` with the message below is expected; on Python 2 the
    ``<class`` part of the message is replaced by ``<type``.
    """
    X, y = iris_data()
    logreg = LogisticRegression()
    tree = DecisionTreeClassifier()

    message = ("train_size must be of type int or float. "
               "Got <class 'NoneType'>.")

    running_python2 = sys.version_info < (3, 0)
    if running_python2:
        message = message.replace('<class', '<type')

    assert_raises(ValueError, message,
                  paired_ttest_resampled,
                  logreg, tree, X, y,
                  test_size=None)
def test_train_size():
    """Passing ``test_size=None`` to `paired_ttest_resampled` is an error.

    Expects a `ValueError` carrying the message built below (with ``<type``
    instead of ``<class`` when running on Python 2).
    """
    features, labels = iris_data()
    linear_model = LogisticRegression(solver='liblinear', multi_class='ovr')
    tree_model = DecisionTreeClassifier()

    expected_message = ("train_size must be of type int or float. "
                        "Got <class 'NoneType'>.")
    if sys.version_info < (3, 0):
        expected_message = expected_message.replace('<class', '<type')

    positional = (linear_model, tree_model, features, labels)
    assert_raises(ValueError,
                  expected_message,
                  paired_ttest_resampled,
                  *positional,
                  test_size=None)
def test_scoring():
    """Resampled paired t-test with ``'accuracy'`` and ``'f1_macro'`` scoring.

    Expected t and p values are chosen according to whether the installed
    scikit-learn is older than 0.20.
    """
    features, labels = iris_data()
    linear_model = LogisticRegression(multi_class='ovr',
                                      solver='liblinear',
                                      random_state=1)
    tree_model = DecisionTreeClassifier(random_state=1)

    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.25, random_state=123)

    linear_acc = linear_model.fit(train_X, train_y).score(test_X, test_y)
    tree_acc = tree_model.fit(train_X, train_y).score(test_X, test_y)

    assert round(linear_acc, 2) == 0.97
    assert round(tree_acc, 2) == 0.95

    # --- accuracy ---
    t, p = paired_ttest_resampled(estimator1=linear_model,
                                  estimator2=tree_model,
                                  X=features, y=labels,
                                  scoring='accuracy',
                                  random_seed=1)

    if Version(sklearn_version) >= Version('0.20'):
        assert round(t, 3) == -1.702, t
        assert round(p, 3) == 0.1, p
    else:
        assert round(t, 3) == -1.809, t
        assert round(p, 3) == 0.081, p

    # --- macro-averaged F1 ---
    t, p = paired_ttest_resampled(estimator1=linear_model,
                                  estimator2=tree_model,
                                  X=features, y=labels,
                                  scoring='f1_macro',
                                  random_seed=1)

    if Version(sklearn_version) >= Version("0.20"):
        assert round(t, 3) == -1.561, t
        assert round(p, 3) == 0.129, p
    else:
        assert round(t, 3) == -1.690, t
        assert round(p, 3) == 0.102, p
def test_gridsearch():
    """Grid-search a StackingClassifier and pin the mean CV scores.

    The stack combines a random forest and Gaussian naive Bayes with a
    logistic-regression meta classifier. The grid varies the meta
    classifier's ``C`` and the forest's ``n_estimators`` under 5-fold CV
    on the iris data.
    """
    # The random forest has no random_state, so results rely on the
    # global NumPy seed set here.
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)

    params = {'meta_classifier__C': [1.0, 100.0],
              'randomforestclassifier__n_estimators': [20, 200]}

    # `iid` no longer exists in newer scikit-learn releases; passing it
    # there raises a TypeError, so only supply it to versions that
    # still accept it.
    if Version(sklearn_version) < Version('0.24'):
        grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5,
                            iid=False)
    else:
        grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5)
    X, y = iris_data()
    grid.fit(X, y)

    mean_scores = [round(s, 2) for s
                   in grid.cv_results_['mean_test_score']]

    assert mean_scores == [0.95, 0.97, 0.96, 0.96], mean_scores
def test_classifier_defaults():
    """Check combined_ftest_5x2cv when no scoring argument is passed.

    Two comparisons against the same logistic regression are pinned:
    first a decision tree with only random_state set, then a decision
    tree restricted to max_depth=1.
    """
    X, y = iris_data()
    log_reg = LogisticRegression(random_state=1,
                                 multi_class='ovr',
                                 solver='liblinear')
    tree = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=123)

    fitted_log_reg = log_reg.fit(X_train, y_train)
    holdout_log_reg = fitted_log_reg.score(X_test, y_test)
    fitted_tree = tree.fit(X_train, y_train)
    holdout_tree = fitted_tree.score(X_test, y_test)

    assert round(holdout_log_reg, 2) == 0.97
    assert round(holdout_tree, 2) == 0.95

    f, p = combined_ftest_5x2cv(estimator1=log_reg,
                                estimator2=tree,
                                X=X, y=y,
                                random_seed=1)

    assert round(f, 3) == 1.053, f
    assert round(p, 3) == 0.509, p

    # Second comparison: swap in a depth-1 decision tree.
    stump = DecisionTreeClassifier(max_depth=1, random_state=1)

    fitted_stump = stump.fit(X_train, y_train)
    holdout_stump = fitted_stump.score(X_test, y_test)

    assert round(holdout_stump, 2) == 0.63

    f, p = combined_ftest_5x2cv(estimator1=log_reg,
                                estimator2=stump,
                                X=X, y=y,
                                random_seed=1)

    assert round(f, 3) == 34.934, f
    assert round(p, 3) == 0.001, p
def test_classifier_defaults():
    """Check paired_ttest_kfold_cv when no scoring argument is passed.

    The logistic regression is compared first with a decision tree that
    only sets random_state, then with a max_depth=1 decision tree; the
    rounded (t, p) results of both runs are pinned.
    """
    X, y = iris_data()
    log_reg = LogisticRegression(random_state=1,
                                 multi_class='ovr',
                                 solver='liblinear')
    tree = DecisionTreeClassifier(random_state=1)

    split = train_test_split(X, y, test_size=0.25, random_state=123)
    X_train, X_test, y_train, y_test = split

    fitted_log_reg = log_reg.fit(X_train, y_train)
    holdout_log_reg = fitted_log_reg.score(X_test, y_test)
    fitted_tree = tree.fit(X_train, y_train)
    holdout_tree = fitted_tree.score(X_test, y_test)

    assert round(holdout_log_reg, 2) == 0.97
    assert round(holdout_tree, 2) == 0.95

    t, p = paired_ttest_kfold_cv(estimator1=log_reg,
                                 estimator2=tree,
                                 X=X, y=y,
                                 random_seed=1)

    assert round(t, 3) == -1.861, t
    assert round(p, 3) == 0.096, p

    # Second comparison: swap in a depth-1 decision tree.
    stump = DecisionTreeClassifier(max_depth=1, random_state=1)

    fitted_stump = stump.fit(X_train, y_train)
    holdout_stump = fitted_stump.score(X_test, y_test)

    assert round(holdout_stump, 2) == 0.63

    t, p = paired_ttest_kfold_cv(estimator1=log_reg,
                                 estimator2=stump,
                                 X=X, y=y,
                                 random_seed=1)

    assert round(t, 3) == 13.491, t
    assert round(p, 3) == 0.0, p
def test_classifier_defaults():
    """Check paired_ttest_5x2cv when no scoring argument is passed.

    The logistic regression (constructed with only random_state) is
    compared first with a decision tree that only sets random_state,
    then with a max_depth=1 decision tree; the rounded (t, p) results of
    both runs are pinned.
    """
    X, y = iris_data()
    log_reg = LogisticRegression(random_state=1)
    tree = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=123)

    fitted_log_reg = log_reg.fit(X_train, y_train)
    holdout_log_reg = fitted_log_reg.score(X_test, y_test)
    fitted_tree = tree.fit(X_train, y_train)
    holdout_tree = fitted_tree.score(X_test, y_test)

    assert round(holdout_log_reg, 2) == 0.97
    assert round(holdout_tree, 2) == 0.95

    t, p = paired_ttest_5x2cv(estimator1=log_reg,
                              estimator2=tree,
                              X=X, y=y,
                              random_seed=1)

    assert round(t, 3) == -1.539, t
    assert round(p, 3) == 0.184, p

    # Second comparison: swap in a depth-1 decision tree.
    stump = DecisionTreeClassifier(max_depth=1, random_state=1)

    fitted_stump = stump.fit(X_train, y_train)
    holdout_stump = fitted_stump.score(X_test, y_test)

    assert round(holdout_stump, 2) == 0.63

    t, p = paired_ttest_5x2cv(estimator1=log_reg,
                              estimator2=stump,
                              X=X, y=y,
                              random_seed=1)

    assert round(t, 3) == 5.386, t
    assert round(p, 3) == 0.003, p
def test_scoring():
    """Check combined_ftest_5x2cv with explicit scoring arguments.

    After pinning the hold-out scores of a logistic regression and a
    decision tree on a 75/25 iris split, the combined 5x2cv F test is
    run with scoring='accuracy' and scoring='f1_macro'. Only the
    'f1_macro' expectations are branched on the scikit-learn version.
    """
    X, y = iris_data()
    log_reg = LogisticRegression(random_state=1, solver='liblinear',
                                 multi_class='ovr')
    tree = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=123)

    fitted_log_reg = log_reg.fit(X_train, y_train)
    holdout_log_reg = fitted_log_reg.score(X_test, y_test)
    fitted_tree = tree.fit(X_train, y_train)
    holdout_tree = fitted_tree.score(X_test, y_test)

    assert round(holdout_log_reg, 2) == 0.97
    assert round(holdout_tree, 2) == 0.95

    # --- accuracy ---
    f, p = combined_ftest_5x2cv(estimator1=log_reg,
                                estimator2=tree,
                                X=X, y=y,
                                scoring='accuracy',
                                random_seed=1)

    assert round(f, 3) == 1.053, f
    assert round(p, 3) == 0.509, p

    # --- macro-averaged F1 ---
    f, p = combined_ftest_5x2cv(estimator1=log_reg,
                                estimator2=tree,
                                X=X, y=y,
                                scoring='f1_macro',
                                random_seed=1)

    # NOTE(review): the pre-0.20 expectation is a negative value for
    # what is named an F statistic; it looks copied from a t-test
    # expectation. Confirm the intended numbers before relying on it.
    legacy_sklearn = Version(sklearn_version) < Version('0.20')
    want_f, want_p = (-1.510, 0.191) if legacy_sklearn else (1.046, 0.513)
    assert round(f, 3) == want_f, f
    assert round(p, 3) == want_p, p
Exemple #50
0
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

from mlxtend.tf_classifier import TfSoftmaxRegression
from mlxtend.data import iris_data
import numpy as np
from nose.tools import raises


# Module-level data prepared once at import time.
X, y = iris_data()
X = X[:, [0, 3]]  # sepal length and petal width
X_bin = X[0:100]  # class 0 and class 1
y_bin = y[0:100]  # class 0 and class 1

# standardize
# NOTE(review): assuming iris_data() returns NumPy arrays, X_bin is a basic
# slice of X and therefore shares memory with it. The two X_bin assignments
# below also rewrite the first 100 rows of X, and the two X assignments that
# follow rewrite X_bin again. The statement order therefore affects the final
# values; confirm this aliasing is intended before reordering or copying.
X_bin[:, 0] = (X_bin[:, 0] - X_bin[:, 0].mean()) / X_bin[:, 0].std()
X_bin[:, 1] = (X_bin[:, 1] - X_bin[:, 1].mean()) / X_bin[:, 1].std()
X[:, 0] = (X[:, 0] - X[:, 0].mean()) / X[:, 0].std()
X[:, 1] = (X[:, 1] - X[:, 1].mean()) / X[:, 1].std()


def test_binary_logistic_regression_gd():
    """Build a 2x2 reference array and a TfSoftmaxRegression instance.

    The visible body stops after constructing both objects; nothing is
    fitted or asserted here.
    """
    reference = np.array([
        [-0.28, 0.95],
        [-2.23, 2.4],
    ])
    # Full-batch setting: a single minibatch per epoch.
    model = TfSoftmaxRegression(
        epochs=100,
        eta=0.5,
        minibatches=1,
        random_seed=1,
    )
Exemple #51
0
def test_iris_invalid_choice():
    """Expect the iris_data() call to raise TypeError."""
    with pytest.raises(TypeError) as excinfo:
        # NOTE(review): the test name suggests an invalid option should be
        # passed here, but the call has no arguments -- confirm.
        iris_data()
        # NOTE(review): this assert comes after the call that is expected to
        # raise and is still inside the `with` block, so it is skipped
        # whenever the exception is raised. To make it effective it would
        # have to move below the `with` block; built-in exceptions on
        # Python 3 also have no `.message` attribute (str(excinfo.value)
        # is the usual spelling). The expected message needs confirming.
        assert excinfo.value.message == ('wrong-choice')
Exemple #52
0
def test_import_iris_data():
    """Check the dimensions of the arrays returned by iris_data().

    The feature array must have 150 rows and 4 columns, and the target
    array must have 150 entries. The target shape is also printed.
    """
    features, targets = iris_data()
    assert features.shape[0] == 150
    assert features.shape[1] == 4
    print(targets.shape)
    assert targets.shape[0] == 150
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version


# Iris data: keep only the feature columns at index 1 and 2.
X_iris, y_iris = iris_data()
X_iris = X_iris[:, 1:3]

# Breast-cancer data: same column selection, targets taken unchanged.
breast_cancer = datasets.load_breast_cancer()
X_breast = breast_cancer.data[:, 1:3]
y_breast = breast_cancer.target


def test_StackingCVClassifier():
    np.random.seed(123)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingCVClassifier(classifiers=[clf1, clf2],
                                meta_classifier=meta,
                                shuffle=False)