def test_sparse_inputs_with_features_in_secondary():
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[rf, rf],
                                 meta_classifier=lr,
                                 random_state=42,
                                 use_features_in_secondary=True)
    X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)

    if Version(sklearn_version) < Version("0.21"):
        expected_value = 1.0
    else:
        expected_value = 0.99

    assert round(stclf.score(X_train, y_train), 2) == expected_value, \
        round(stclf.score(X_train, y_train), 2)

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)

    if Version(sklearn_version) < Version("0.21"):
        expected_value = 1.0
    else:
        expected_value = 0.99
    assert round(stclf.score(X_train, y_train), 2) == expected_value, \
        round(stclf.score(X_train, y_train), 2)

def test_sparse_inputs_with_features_in_secondary():
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[rf, rf],
                                 meta_classifier=lr,
                                 random_state=42,
                                 use_features_in_secondary=True)
    X_train, X_test, y_train, y_test = train_test_split(X_breast,
                                                        y_breast,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)

    expected_value = 1.0

    assert round(stclf.score(X_train, y_train), 2) == expected_value, \
        round(stclf.score(X_train, y_train), 2)

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)

    if Version(sklearn_version) < Version("0.21"):
        expected_value = 1.0
    elif Version(sklearn_version) < Version("0.22"):
        expected_value = 0.99
    else:
        expected_value = 1.00

    assert round(stclf.score(X_train, y_train), 2) == expected_value, \
        round(stclf.score(X_train, y_train), 2)
Example #3
def test_sparse_inputs():
    rf = RandomForestClassifier(random_state=1)
    lr = LogisticRegression()
    stclf = StackingCVClassifier(classifiers=[rf, rf], meta_classifier=lr)
    X_train, X_test, y_train, y_test = train_test_split(X_breast,
                                                        y_breast,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    assert round(stclf.score(X_train, y_train), 2) == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    assert round(stclf.score(X_train, y_train), 2) == 0.99
Example #4
def test_sparse_inputs():
    np.random.seed(123)
    rf = RandomForestClassifier(n_estimators=10)
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[rf, rf], meta_classifier=lr)
    X_train, X_test, y_train, y_test = train_test_split(X_breast,
                                                        y_breast,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    assert round(stclf.score(X_train, y_train), 2) == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    assert round(stclf.score(X_train, y_train), 2) == 0.99

def test_sparse_inputs():
    np.random.seed(123)
    rf = RandomForestClassifier(n_estimators=10)
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[rf, rf],
                                 meta_classifier=lr,
                                 random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    assert round(stclf.score(X_train, y_train), 2) == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    assert round(stclf.score(X_train, y_train), 2) == 0.99

def test_works_with_df_if_fold_indexes_missing():
    """This is a regression test to make sure fitting will still work even if
    training data has ids that cannot be indexed using the indexes from the cv
    (e.g. skf)

    Some possibilities:
    + Output of the folds are not neatly consecutive (i.e. [341, 345, 543, ...]
      instead of [0, 1, ... n])
    + Indexes just start from some number greater than the size of the input
      (see test case)

    Training data sometimes has ids that carry other information, and selection
    of rows based on cv should not break.

    This is fixed in the code using `safe_indexing`
    """

    np.random.seed(123)
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[rf, rf],
                                 meta_classifier=lr,
                                 random_state=42,
                                 use_features_in_secondary=True)

    X_modded = pd.DataFrame(X_breast,
                            index=np.arange(X_breast.shape[0]) + 1000)
    y_modded = pd.Series(y_breast, index=np.arange(y_breast.shape[0]) + 1000)

    X_train, X_test, y_train, y_test = train_test_split(X_modded,
                                                        y_modded,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)

    if Version(sklearn_version) < Version("0.22"):
        assert round(stclf.score(X_train, y_train), 2) == 0.99, \
            round(stclf.score(X_train, y_train), 2)
    else:
        assert round(stclf.score(X_train, y_train), 2) == 0.98, \
            round(stclf.score(X_train, y_train), 2)

def test_works_with_df_if_fold_indexes_missing():
    """This is a regression test to make sure fitting will still work even if
    training data has ids that cannot be indexed using the indexes from the cv
    (e.g. skf)

    Some possibilities:
    + Output of the folds are not neatly consecutive (i.e. [341, 345, 543, ...]
      instead of [0, 1, ... n])
    + Indexes just start from some number greater than the size of the input
      (see test case)

    Training data sometimes has ids that carry other information, and selection
    of rows based on cv should not break.

    This is fixed in the code using `safe_indexing`
    """

    np.random.seed(123)
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[rf, rf],
                                 meta_classifier=lr,
                                 random_state=42,
                                 use_features_in_secondary=True)

    X_modded = pd.DataFrame(X_breast,
                            index=np.arange(X_breast.shape[0]) + 1000)
    y_modded = pd.Series(y_breast,
                         index=np.arange(y_breast.shape[0]) + 1000)

    X_train, X_test, y_train, y_test = train_test_split(X_modded,
                                                        y_modded,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    assert round(stclf.score(X_train, y_train), 2) == 0.99, \
        round(stclf.score(X_train, y_train), 2)
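
The docstring above points at the actual mechanism: cross-validators emit
positional indices, so label-based selection on a DataFrame with an offset
index raises KeyError, while positional selection works. A minimal
standalone sketch of that distinction (`X` and `fold_idx` below are
illustrative, not taken from the tests):

import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.rand(5, 2), index=np.arange(5) + 1000)
fold_idx = np.array([0, 2, 4])  # positional indices from a CV splitter

# X.loc[fold_idx]           # would raise KeyError: no labels 0, 2, 4
rows = X.iloc[fold_idx]     # positional selection ignores the labels
print(rows.index.tolist())  # [1000, 1002, 1004]

sklearn's `safe_indexing` helper performs this kind of position-based
selection uniformly for DataFrames, arrays, and lists, which is what the
fix relies on.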
Example #8
vc = VotingClassifier(estimators, voting='hard')
vc.fit(X_train, y_train)
vc.score(X_test, y_test)

# stacking
clf1 = KNeighborsClassifier(n_neighbors=10)
clf2 = GaussianNB()
clf3 = RandomForestClassifier(random_state=46)
lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                            meta_classifier=lr,
                            random_state=46)

sclf.fit(X_train, y_train)
a = sclf.score(X_test, y_test)
print(a)
sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                            meta_classifier=lr,
                            random_state=46,
                            use_probas=True)

sclf.fit(X_train, y_train)
b = sclf.score(X_test, y_test)

print(b)

# Check multicollinearity among features via variance inflation factors
# (VIF_i = 1 / (1 - R_i^2), where R_i^2 regresses feature i on the rest).
from statsmodels.stats.outliers_influence import variance_inflation_factor

variance_inflation_factor(X_train.values, 0)

for i in range(len(X_train.columns)):
    print(variance_inflation_factor(X_train.values, i))
Example #9
    clfs = [
        xgb.XGBClassifier(max_depth=6, n_estimators=100, num_round=5),
        RandomForestClassifier(n_estimators=100, max_depth=6, oob_score=True),
        GradientBoostingClassifier(learning_rate=0.3,
                                   max_depth=6,
                                   n_estimators=100)
    ]
    clf2 = LogisticRegression(C=0.5, max_iter=100)
    #============================================================================#
    from mlxtend.classifier import StackingClassifier, StackingCVClassifier
    sclf = StackingClassifier(classifiers=clfs, meta_classifier=clf2)
    sclf.fit(X_train, Y_train)
    print(sclf.score(X_train, Y_train))
    sclf_pre = sclf.predict(X_test)
    sclf_sub = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": sclf_pre
    })
    sclf_sub.to_csv("../data/sclf_sub.csv", index=False)
    #===============================================================================#
    sclf2 = StackingCVClassifier(classifiers=clfs, meta_classifier=clf2, cv=5)
    x = np.array(X_train)
    y = np.array(Y_train).flatten()
    sclf2.fit(x, y)
    print(sclf2.score(x, y))
    sclf2_pre = sclf2.predict(np.array(X_test))
    sclf2_sub = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": sclf2_pre
    })
    sclf2_sub.to_csv("../data/sclf2_sub.csv", index=False)
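
This example fits both StackingClassifier and StackingCVClassifier on the
same base learners; the difference is where the meta-learner's training
data comes from. A minimal, self-contained sketch of that contrast on
synthetic data (none of the names below come from the example above):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier, StackingCVClassifier

Xs, ys = make_classification(n_samples=300, random_state=0)
base = [DecisionTreeClassifier(random_state=0)]
meta = LogisticRegression()

# StackingClassifier: the meta-learner sees in-sample base predictions,
# so the training score tends to be optimistically inflated.
plain = StackingClassifier(classifiers=base, meta_classifier=meta)
# StackingCVClassifier: the meta-learner sees out-of-fold predictions,
# which curbs that leakage.
cved = StackingCVClassifier(classifiers=base, meta_classifier=meta,
                            cv=5, random_state=0)
print(plain.fit(Xs, ys).score(Xs, ys))
print(cved.fit(Xs, ys).score(Xs, ys))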
Example #10
# x_train = x_train.reset_index(drop=True)
# x_vali = x_vali.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
# y_vali = y_vali.reset_index(drop=True)
"""=====================================================================================================================
2 模型融合;
学习参考:https://blog.csdn.net/LAW_130625/article/details/78573736
"""

lr_clf = clfs["lr"]  # meta_classifier
svm_clf = clfs["svm_ploy"]
rf_clf = clfs["rf"]
xgb_clf = clfs["xgb"]
lgb_clf = clfs["lgb"]

sclf = StackingCVClassifier(
    classifiers=[lr_clf, svm_clf, rf_clf, xgb_clf, lgb_clf],
    meta_classifier=lr_clf,
    use_probas=True,
    verbose=3)

sclf.fit(x_train, y_train)

print("测试模型 & 模型参数如下:\n{0}".format(sclf))
print("=" * 20)
pre_train = sclf.predict(x_train)
print("训练集正确率: {0:.4f}".format(sclf.score(x_train, y_train)))
print("训练集f1分数: {0:.4f}".format(f1_score(y_train, pre_train)))
print("训练集auc分数: {0:.4f}".format(roc_auc_score(y_train, pre_train)))
Example #11
# create voting classifier
vc = VotingClassifier(estimators)
vc.fit(X_train, y_train)
vc.score(X_test, y_test)

# Simple Stacking CV classifier
clf1 = KNeighborsClassifier(n_neighbors=10)
clf2 = RandomForestClassifier(random_state=42)
clf3 = GaussianNB()
lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                            meta_classifier=lr,
                            random_state=42)
sclf.fit(X_train, y_train)
sclf.score(X_test, y_test)

# Stacking classifier using probabilities as meta-features
clf1 = KNeighborsClassifier(n_neighbors=10)
clf2 = RandomForestClassifier(random_state=42)
clf3 = GaussianNB()
lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                            meta_classifier=lr,
                            use_probas=True,
                            random_state=42)
sclf.fit(X_train, y_train)
sclf.score(X_test, y_test)
# In[ ]:

from sklearn import svm, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlxtend.classifier import StackingCVClassifier

clf1 = svm.SVC(C=1, gamma=0.1)
clf2 = MLPClassifier(hidden_layer_sizes=(50, ), max_iter=600, alpha=1)
clf3 = DecisionTreeClassifier(max_depth=10, min_samples_split=4)
clf4 = RandomForestClassifier(n_estimators=250,
                              max_depth=10,
                              min_samples_split=4,
                              criterion='gini')
lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf4],
                            meta_classifier=lr)

X_train2, X_cv, y_train2, y_cv = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.33,
                                                  random_state=0)

# .values converts the DataFrames to NumPy arrays before fitting.
sclf.fit(X_train2.values, y_train2.values)
print("[Stacking] score on training data is %0.2f"
      % sclf.score(X_train2.values, y_train2.values))
print("[Stacking] score on the cross-validation data is %0.2f"
      % sclf.score(X_cv.values, y_cv.values))
Example #13
# y is the label column (assumed, from context, to be column 0 of train_np)
y = train_np[:, 0]
# X holds the feature values (the remaining columns)
X = train_np[:, 1:]

# train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3)
test_df = test_data
test_np = test_df.values
test_x = test_np[:, 0:]

lr = LogisticRegression(C=0.8, penalty='l2', tol=1e-6)
DTree = DecisionTreeClassifier(max_depth=20)
rfc = RandomForestClassifier(n_estimators=5000)

xgbc = XGBClassifier(learning_rate=0.001, n_estimators=5000, max_depth=30,
                     objective='binary:logitraw')
gbc = GradientBoostingClassifier(learning_rate=0.001, n_estimators=5000,
                                 max_depth=30)

sclf = StackingCVClassifier(classifiers=[lr, DTree, rfc,
                                         SVC(probability=True)],
                            meta_classifier=xgbc, use_probas=True)
sclf.fit(X, y)
print(sclf.score(X, y))

test_id = pd.read_csv('data/origin/test.csv')

predictions = sclf.predict(test_x)
result = pd.DataFrame({'PassengerId': test_id['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("data/predictions/stacking_test4.csv", index=False)

answer = pd.read_csv('data/predictions/submission.csv')
answer_np = answer['Survived'].values
print('acc = %.5f' % accuracy_score(answer_np, predictions))

# from calculate_acc import calculate_acc
# calculate_acc(predictions)