コード例 #1
0
def test_sparse_inputs_with_features_in_secondary():
    """Fit on dense, then CSR input, with ``use_features_in_secondary=True``.

    After each fit the model is scored on the dense training split, and the
    accuracy rounded to two decimals must equal a value selected from the
    installed scikit-learn version.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[forest, forest],
        meta_classifier=meta_lr,
        random_state=42,
        use_features_in_secondary=True,
    )
    X_train, _, y_train, _ = train_test_split(
        X_breast, y_breast, test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)

    # The same expectation applies to both the dense and the sparse fit.
    if Version(sklearn_version) < Version("0.21"):
        expected_value = 1.0
    else:
        expected_value = 0.99

    dense_acc = round(stclf.score(X_train, y_train), 2)
    assert dense_acc == expected_value, dense_acc

    # sparse (scoring still uses the dense training split)
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    sparse_acc = round(stclf.score(X_train, y_train), 2)
    assert sparse_acc == expected_value, sparse_acc
コード例 #2
0
def test_train_meta_features_():
    """Stored training meta-features have shape (n_train_rows, 2).

    Two base classifiers are stacked with
    ``store_train_meta_features=True``.
    """
    base_learners = [KNeighborsClassifier(), GaussianNB()]
    stclf = StackingCVClassifier(
        classifiers=base_learners,
        meta_classifier=LogisticRegression(),
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)

    expected_shape = (X_train.shape[0], 2)
    assert stclf.train_meta_features_.shape == expected_shape
コード例 #3
0
def test_no_weight_support_meta():
    """Expect ``TypeError`` from ``fit`` with ``sample_weight``.

    The meta-classifier here is a ``KNeighborsClassifier``.
    """
    weights = np.array([random.random() for _ in range(len(y_iris))])
    sclf = StackingCVClassifier(
        classifiers=[RandomForestClassifier(n_estimators=10), GaussianNB()],
        meta_classifier=KNeighborsClassifier(),
        shuffle=False,
    )

    with pytest.raises(TypeError):
        sclf.fit(X_iris, y_iris, sample_weight=weights)
コード例 #4
0
def test_verbose():
    """Smoke test: fitting with ``verbose=3`` and ``use_probas=True`` runs."""
    np.random.seed(123)
    base_learners = [RandomForestClassifier(n_estimators=10), GaussianNB()]
    meta_learner = LogisticRegression(multi_class='ovr', solver='liblinear')
    sclf = StackingCVClassifier(
        classifiers=base_learners,
        use_probas=True,
        meta_classifier=meta_learner,
        shuffle=False,
        verbose=3,
    )
    sclf.fit(X_iris, y_iris)
コード例 #5
0
def test_no_weight_support():
    """Expect ``TypeError`` from ``fit`` with ``sample_weight``.

    One of the three base classifiers is a ``KNeighborsClassifier``.
    """
    weights = np.array([random.random() for _ in range(len(y_iris))])
    base_learners = [
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    sclf = StackingCVClassifier(
        classifiers=base_learners,
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        shuffle=False,
    )
    with pytest.raises(TypeError):
        sclf.fit(X_iris, y_iris, sample_weight=weights)
コード例 #6
0
def test_no_weight_support_meta():
    """Fitting with sample weights and a KNN meta-classifier raises TypeError."""
    n_samples = len(y_iris)
    sample_weights = np.array([random.random() for _ in range(n_samples)])

    knn_meta = KNeighborsClassifier()
    forest = RandomForestClassifier(n_estimators=10)
    naive_bayes = GaussianNB()
    sclf = StackingCVClassifier(classifiers=[forest, naive_bayes],
                                meta_classifier=knn_meta,
                                shuffle=False)

    with pytest.raises(TypeError):
        sclf.fit(X_iris, y_iris, sample_weight=sample_weights)
コード例 #7
0
def test_verbose():
    """Smoke test: fit on the iris data with ``verbose=3``; no assertions."""
    np.random.seed(123)
    stack_options = dict(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=3,
    )
    sclf = StackingCVClassifier(**stack_options)
    sclf.fit(iris.data, iris.target)
コード例 #8
0
def test_train_meta_features_():
    """``train_meta_features_`` is (n_train_rows, 2) for two base classifiers."""
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=meta_lr,
        store_train_meta_features=True,
    )
    split = train_test_split(X_iris, y_iris, test_size=0.3)
    X_train, y_train = split[0], split[2]
    stclf.fit(X_train, y_train)

    n_train_rows = X_train.shape[0]
    assert stclf.train_meta_features_.shape == (n_train_rows, 2)
コード例 #9
0
def test_no_weight_support():
    """``fit`` with ``sample_weight`` raises TypeError when KNN is a base model."""
    sample_weights = np.array(
        [random.random() for _ in range(len(y_iris))])

    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    naive_bayes = GaussianNB()
    knn = KNeighborsClassifier()
    sclf = StackingCVClassifier(classifiers=[forest, naive_bayes, knn],
                                meta_classifier=meta_lr,
                                shuffle=False)

    with pytest.raises(TypeError):
        sclf.fit(X_iris, y_iris, sample_weight=sample_weights)
def test_verbose():
    """Smoke test: a verbose (level 3) fit on iris completes without error."""
    np.random.seed(123)
    forest, naive_bayes = RandomForestClassifier(), GaussianNB()
    sclf = StackingCVClassifier(
        classifiers=[forest, naive_bayes],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=3,
    )
    features, labels = iris.data, iris.target
    sclf.fit(features, labels)
コード例 #11
0
def test_train_meta_features_():
    """With two base classifiers the stored meta-features have two columns."""
    knn, gnb = KNeighborsClassifier(), GaussianNB()
    stclf = StackingCVClassifier(classifiers=[knn, gnb],
                                 meta_classifier=LogisticRegression(),
                                 store_train_meta_features=True)

    # Only the training part of the split is needed.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)

    meta_shape = stclf.train_meta_features_.shape
    assert meta_shape == (X_train.shape[0], 2)
コード例 #12
0
def test_verbose():
    """Smoke test: probability stacking with ``verbose=3`` fits on iris."""
    np.random.seed(123)
    meta_learner = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    naive_bayes = GaussianNB()
    StackingCVClassifier(
        classifiers=[forest, naive_bayes],
        use_probas=True,
        meta_classifier=meta_learner,
        shuffle=False,
        verbose=3,
    ).fit(X_iris, y_iris)
 def _build_model(self, X_train, y_train):
     """Build a StackingCVClassifier, fit it on the training data, return it.

     Base learners, in order: 1-nearest-neighbour, random forest, logistic
     regression, degree-5 polynomial SVC and XGBoost. The meta-classifier
     is Gaussian naive Bayes.

     :param X_train: training features passed to ``fit``
     :param y_train: training labels passed to ``fit``
     :return: the fitted StackingCVClassifier
     """
     base_learners = [
         KNeighborsClassifier(n_neighbors=1),
         RandomForestClassifier(max_depth=3, max_features=6,
                                n_estimators=50, random_state=0),
         LogisticRegression(C=10.0, dual=False, max_iter=100,
                            solver='lbfgs'),
         svm.SVC(C=1.0, kernel='poly', degree=5),
         XGBClassifier(alpha=15, colsample_bytree=0.1, learning_rate=1,
                       max_depth=5, reg_lambda=10.0),
     ]
     stacked = StackingCVClassifier(classifiers=base_learners,
                                    meta_classifier=GaussianNB(),
                                    random_state=42)
     stacked.fit(X_train, y_train)
     return stacked
コード例 #14
0
def test_predict_meta_features():
    """``predict`` on the test split returns a 1-D array, one entry per row."""
    base_learners = [KNeighborsClassifier(), GaussianNB()]
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    X_train, X_test, y_train, _ = train_test_split(
        X_iris, y_iris, test_size=0.3)

    #  test default (class labels)
    stclf = StackingCVClassifier(
        classifiers=base_learners,
        meta_classifier=meta_lr,
        store_train_meta_features=True,
    )
    stclf.fit(X_train, y_train)

    predictions = stclf.predict(X_test)
    assert predictions.shape == (X_test.shape[0],)
コード例 #15
0
def test_pandas():
    """Fit on a pandas DataFrame.

    The fit may succeed; if it raises ``KeyError`` instead, the message must
    contain the DataFrame hint checked below.
    """
    expected_hint = 'are NumPy arrays. If X and y are pandas DataFrames'
    X_df = pd.DataFrame(X)
    sclf = StackingCVClassifier(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=0,
    )
    try:
        sclf.fit(X_df, iris.target)
    except KeyError as err:
        assert expected_hint in str(err)
コード例 #16
0
def test_meta_feat_reordering():
    """With ``shuffle=True``, stored meta-features still line up with y_train.

    Checked through the ROC AUC of the second meta-feature column against
    the training labels, rounded to two decimals.
    """
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=LogisticRegression(),
        shuffle=True,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(
        X_breast, y_breast, test_size=0.3)
    stclf.fit(X_train, y_train)

    second_column = stclf.train_meta_features_[:, 1]
    assert round(roc_auc_score(y_train, second_column), 2) == 0.88
コード例 #17
0
def test_no_weight_support_with_no_weight():
    """Fitting without ``sample_weight`` works with KNN in either role.

    KNN is first the meta-classifier, then one of the base classifiers.
    """
    logit = LogisticRegression()
    rf = RandomForestClassifier()
    gnb = GaussianNB()
    knn = KNeighborsClassifier()

    # (base classifiers, meta-classifier) pairs, fitted one after the other.
    configurations = (
        ([logit, rf, gnb], knn),
        ([logit, knn, gnb], rf),
    )
    for base_learners, meta_learner in configurations:
        sclf = StackingCVClassifier(classifiers=base_learners,
                                    meta_classifier=meta_learner,
                                    shuffle=False)
        sclf.fit(X_iris, y_iris)
コード例 #18
0
def test_pandas():
    """Fit on a DataFrame; a ``KeyError`` is accepted only with the known hint."""
    frame = pd.DataFrame(X_iris)
    base_learners = [RandomForestClassifier(n_estimators=10), GaussianNB()]
    meta_learner = LogisticRegression(multi_class='ovr', solver='liblinear')
    sclf = StackingCVClassifier(classifiers=base_learners,
                                use_probas=True,
                                meta_classifier=meta_learner,
                                shuffle=False,
                                verbose=0)
    try:
        sclf.fit(frame, y_iris)
    except KeyError as exc:
        message = str(exc)
        assert 'are NumPy arrays. If X and y are pandas DataFrames' in message
コード例 #19
0
def test_no_weight_support_with_no_weight():
    """Two unweighted fits succeed: KNN as meta-classifier, then as base model."""
    logit = LogisticRegression(multi_class='ovr', solver='liblinear')
    rf = RandomForestClassifier(n_estimators=10)
    gnb = GaussianNB()
    knn = KNeighborsClassifier()

    def fit_stack(base_learners, meta_learner):
        # Build an unshuffled stack and fit it on iris without sample weights.
        stack = StackingCVClassifier(classifiers=base_learners,
                                     meta_classifier=meta_learner,
                                     shuffle=False)
        stack.fit(X_iris, y_iris)

    fit_stack([logit, rf, gnb], knn)
    fit_stack([logit, knn, gnb], rf)
コード例 #20
0
def test_pandas():
    """DataFrame input either fits, or fails with the documented KeyError text."""
    X_df = pd.DataFrame(X)
    stack_options = dict(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=0,
    )
    sclf = StackingCVClassifier(**stack_options)
    try:
        sclf.fit(X_df, iris.target)
    except KeyError as key_error:
        hint = 'are NumPy arrays. If X and y are pandas DataFrames'
        assert hint in str(key_error)
コード例 #21
0
def test_no_weight_support_with_no_weight():
    """Without sample weights, stacks containing KNN fit without raising."""
    logistic = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    naive_bayes = GaussianNB()
    neighbours = KNeighborsClassifier()

    # KNN as the meta-classifier.
    knn_meta_stack = StackingCVClassifier(
        classifiers=[logistic, forest, naive_bayes],
        meta_classifier=neighbours,
        shuffle=False,
    )
    knn_meta_stack.fit(X_iris, y_iris)

    # KNN as a base classifier.
    knn_base_stack = StackingCVClassifier(
        classifiers=[logistic, neighbours, naive_bayes],
        meta_classifier=forest,
        shuffle=False,
    )
    knn_base_stack.fit(X_iris, y_iris)
コード例 #22
0
def test_sparse_inputs():
    """Fit on dense, then CSR input; each time dense-train accuracy is 0.99."""
    forest = RandomForestClassifier(random_state=1)
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=LogisticRegression())
    X_train, _, y_train, _ = train_test_split(
        X_breast, y_breast, test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    dense_acc = round(stclf.score(X_train, y_train), 2)
    assert dense_acc == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    sparse_acc = round(stclf.score(X_train, y_train), 2)
    assert sparse_acc == 0.99
コード例 #23
0
def test_list_of_lists():
    """Fit on a Python list of rows.

    The fit may succeed; if it raises ``TypeError`` instead, the message must
    contain the list hint checked below.
    """
    expected_hint = 'are NumPy arrays. If X and y are lists'
    X_list = list(X)
    sclf = StackingCVClassifier(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=0,
    )

    try:
        sclf.fit(X_list, iris.target)
    except TypeError as err:
        assert expected_hint in str(err)
コード例 #24
0
def test_list_of_lists():
    """List input either fits, or fails with the documented TypeError text."""
    rows = [row for row in X]
    base_learners = [RandomForestClassifier(), GaussianNB()]
    sclf = StackingCVClassifier(classifiers=base_learners,
                                use_probas=True,
                                meta_classifier=LogisticRegression(),
                                shuffle=False,
                                verbose=0)

    try:
        sclf.fit(rows, iris.target)
    except TypeError as type_error:
        message = str(type_error)
        assert 'are NumPy arrays. If X and y are lists' in message
コード例 #25
0
def test_list_of_lists():
    """Fit on a list of iris rows; a ``TypeError`` must carry the list hint."""
    X_list = list(X_iris)
    stack_options = dict(
        classifiers=[RandomForestClassifier(n_estimators=10), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        shuffle=False,
        verbose=0,
    )
    sclf = StackingCVClassifier(**stack_options)

    try:
        sclf.fit(X_list, y_iris)
    except TypeError as exc:
        assert 'are NumPy arrays. If X and y are lists' in str(exc)
コード例 #26
0
def stacking(X_train_log, X_test_log, y_train, y_test):
    """Fit a stacked ensemble and report its metrics in the ``text`` widget.

    Base learners are an SVC (with probability estimates), an MLP and
    Gaussian naive Bayes; the meta-classifier is logistic regression trained
    on 5-fold out-of-fold class probabilities.

    Three lines are appended to the module-level ``text`` object
    (presumably a tkinter ``Text`` widget -- confirm against the caller):
    training accuracy, training ROC AUC (positive label 1), test accuracy.

    :param X_train_log: training features
    :param X_test_log: test features
    :param y_train: training labels
    :param y_test: test labels
    :return: None
    """
    svc = SVC(probability=True, C=100, gamma=0.001)
    mlp = MLPClassifier(activation='tanh', learning_rate='adaptive',
                        solver='sgd')
    naive_bayes = GaussianNB()
    meta_lr = LogisticRegression()
    sclf = StackingCVClassifier(classifiers=[svc, mlp, naive_bayes],
                                shuffle=False,
                                use_probas=True,
                                cv=5,
                                n_jobs=-1,
                                meta_classifier=meta_lr)

    model_sclf_pred = sclf.fit(X_train_log, y_train)

    # Training-set probabilities feed the ROC curve below.
    train_proba = model_sclf_pred.predict_proba(X_train_log)
    train_accuracy = model_sclf_pred.score(X_train_log, y_train)
    text.insert(END, 'Stacking Accuracy on whole training data: '
                + str(train_accuracy) + "\n")

    # Column 1 of the probability matrix is used as the score for label 1.
    fpr, tpr, _ = metrics.roc_curve(y_train, train_proba[:, 1:2], pos_label=1)
    auc = metrics.auc(fpr, tpr)
    text.insert(END, 'Stacking AUC: ' + str(auc) + "\n")

    # The original also computed test-set probabilities here but never read
    # them; that dead call is dropped.
    test_accuracy = model_sclf_pred.score(X_test_log, y_test)
    text.insert(END, 'stacking Accuracy on whole testing data: '
                + str(test_accuracy) + "\n")
コード例 #27
0
def test_sparse_inputs():
    """Dense and CSR training inputs both give 0.99 accuracy on dense data."""
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10)
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta_lr)
    X_train, _, y_train, _ = train_test_split(
        X_breast, y_breast, test_size=0.3)

    # First the dense array, then its CSR form; scoring always uses dense.
    for fit_input in (X_train, sparse.csr_matrix(X_train)):
        stclf.fit(fit_input, y_train)
        assert round(stclf.score(X_train, y_train), 2) == 0.99
コード例 #28
0
def test_sparse_inputs():
    """A seeded stack fits dense and CSR input with 0.99 training accuracy."""
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10)
    stclf = StackingCVClassifier(
        classifiers=[forest, forest],
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        random_state=42,
    )
    split = train_test_split(X_breast, y_breast, test_size=0.3)
    X_train, y_train = split[0], split[2]

    def rounded_train_accuracy():
        # Accuracy on the dense training split, rounded to two decimals.
        return round(stclf.score(X_train, y_train), 2)

    # dense
    stclf.fit(X_train, y_train)
    assert rounded_train_accuracy() == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    assert rounded_train_accuracy() == 0.99
コード例 #29
0
def test_meta_feat_reordering():
    """With ``shuffle=True``, stored meta-features still line up with y_train.

    The ROC AUC of the second meta-feature column against the training
    labels, rounded to two decimals, must be 0.87.
    """
    np.random.seed(123)
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=meta_lr,
        shuffle=True,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(
        X_breast, y_breast, test_size=0.3)
    stclf.fit(X_train, y_train)

    second_column_auc = round(
        roc_auc_score(y_train, stclf.train_meta_features_[:, 1]), 2)
    assert second_column_auc == 0.87, second_column_auc
コード例 #30
0
def test_sparse_inputs_with_features_in_secondary():
    """Dense and CSR fits with ``use_features_in_secondary=True`` score 1.0.

    Scoring always uses the dense training split.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[forest, forest],
        meta_classifier=meta_lr,
        random_state=42,
        use_features_in_secondary=True,
    )
    X_train, _, y_train, _ = train_test_split(
        X_breast, y_breast, test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    dense_acc = round(stclf.score(X_train, y_train), 2)
    assert dense_acc == 1.0, dense_acc

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    sparse_acc = round(stclf.score(X_train, y_train), 2)
    assert sparse_acc == 1.0, sparse_acc
コード例 #31
0
 def stack(self,X,y,test_X):
     """
     Model fusion via stacking.

     Fits a 4-fold StackingCVClassifier built from ``self.clfArr`` (base
     classifiers) and ``self.lr`` (meta-classifier), predicts the test data,
     then prints the mean and standard deviation of a 5-fold cross-validated
     accuracy computed on the training data.

     :param X: training data set, array or list
     :param y: ground-truth values, array or list
     :param test_X: test data set, array or list
     :return:
             result_Y: predictions for the test data
     """
     logging.info('------Stacking之后的模型效果')
     sclf = StackingCVClassifier(classifiers=self.clfArr,
                                 meta_classifier=self.lr,
                                 cv=4)
     features = np.array(X)
     labels = np.array(y).flatten()  # flattened to 1-D
     sclf.fit(features, labels)
     result_Y = sclf.predict(test_X)

     scores = model_selection.cross_val_score(
         sclf, features, labels, cv=5, scoring='accuracy')
     report = 'The  Accuracy , mean: {:.5f} , std:+/- {:.5f}'.format(
         scores.mean(), scores.std())
     print(report)
     return result_Y
コード例 #32
0
def test_works_with_df_if_fold_indexes_missing():
    """Regression test: fitting works when pandas labels differ from positions.

    The cv splitter (e.g. skf) yields positional indexes, while the training
    data may carry ids that cannot be looked up with them, for example:

    + labels that are not neatly consecutive ([341, 345, 543, ...] instead
      of [0, 1, ... n])
    + labels that start above the size of the input (the case used here:
      every label is offset by 1000)

    Ids in training data sometimes carry other information, and selecting
    rows for cv must not break on them. The fix in the code uses
    `safe_indexing`.
    """

    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[forest, forest],
        meta_classifier=meta_lr,
        random_state=42,
        use_features_in_secondary=True,
    )

    # Labels start at 1000, so they never coincide with row positions.
    shifted_index = np.arange(X_breast.shape[0]) + 1000
    X_modded = pd.DataFrame(X_breast, index=shifted_index)
    y_modded = pd.Series(y_breast, index=np.arange(y_breast.shape[0]) + 1000)

    X_train, _, y_train, _ = train_test_split(
        X_modded, y_modded, test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)

    # The expected accuracy depends on the installed scikit-learn version.
    if Version(sklearn_version) < Version("0.22"):
        expected_acc = 0.99
    else:
        expected_acc = 0.98
    train_acc = round(stclf.score(X_train, y_train), 2)
    assert train_acc == expected_acc, train_acc
コード例 #33
0
def test_StackingClassifier_drop_last_proba():
    """Check meta-feature widths with and without ``drop_last_proba``.

    Two logistic-regression base models with ``use_probas=True`` must give,
    for two input rows: 6 columns on three-class iris without dropping,
    4 columns with dropping, and 2 columns with dropping when fitted on the
    first 100 rows only.
    """
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear', multi_class='ovr')

    def meta_feature_shape(drop_last, X_fit, y_fit):
        # Fit a fresh stack; return the meta-feature shape for two iris rows.
        stack = StackingCVClassifier(classifiers=[lr1, lr1],
                                     use_probas=True,
                                     drop_last_proba=drop_last,
                                     meta_classifier=lr1)
        stack.fit(X_fit, y_fit)
        return stack.predict_meta_features(X_iris[:2]).shape

    assert meta_feature_shape(False, X_iris, y_iris) == (2, 6)

    shape_dropped = meta_feature_shape(True, X_iris, y_iris)
    assert shape_dropped == (2, 4), shape_dropped

    # only 2 classes
    shape_binary = meta_feature_shape(True, X_iris[0:100], y_iris[0:100])
    assert shape_binary == (2, 2), shape_binary
コード例 #34
0
def test_sample_weight():
    """Unit sample weights match an unweighted fit; random weights do not.

    Three identically seeded stacks are fitted on iris: without weights,
    with all-ones weights, and with random weights. The predicted
    probabilities of the first two must differ by less than 1e-3, and those
    of the last two by more than 1e-3.
    """
    def fit_predict_proba(**fit_kwargs):
        # Re-seed NumPy and rebuild every estimator for each run.
        np.random.seed(123)
        stack = StackingCVClassifier(
            classifiers=[RandomForestClassifier(n_estimators=10),
                         GaussianNB()],
            meta_classifier=LogisticRegression(multi_class='ovr',
                                               solver='liblinear'),
            shuffle=False,
        )
        return stack.fit(X_iris, y_iris, **fit_kwargs).predict_proba(X_iris)

    # with no weight given
    prob1 = fit_predict_proba()

    # with weight = 1
    prob2 = fit_predict_proba(sample_weight=np.ones(len(y_iris)))

    # with random weight
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y_iris))])
    prob3 = fit_predict_proba(sample_weight=w)

    diff12 = np.max(np.abs(prob1 - prob2))
    diff23 = np.max(np.abs(prob2 - prob3))
    assert diff12 < 1e-3, "max diff is %.4f" % diff12
    assert diff23 > 1e-3, "max diff is %.4f" % diff23
コード例 #35
0
def test_StackingClassifier_drop_last_proba():
    """Meta-feature column counts for ``drop_last_proba`` off and on.

    For two input rows the expected shapes are (2, 6) without dropping,
    (2, 4) with dropping, and (2, 2) with dropping after fitting on the
    first 100 iris rows.
    """
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    # Arguments common to all three stacks; only drop_last_proba varies.
    shared = dict(classifiers=[lr1, lr1],
                  use_probas=True,
                  meta_classifier=lr1)
    first_two_rows = X_iris[:2]

    sclf1 = StackingCVClassifier(drop_last_proba=False, **shared)
    sclf1.fit(X_iris, y_iris)
    r1 = sclf1.predict_meta_features(first_two_rows)
    assert r1.shape == (2, 6)

    sclf2 = StackingCVClassifier(drop_last_proba=True, **shared)
    sclf2.fit(X_iris, y_iris)
    r2 = sclf2.predict_meta_features(first_two_rows)
    assert r2.shape == (2, 4), r2.shape

    sclf3 = StackingCVClassifier(drop_last_proba=True, **shared)
    sclf3.fit(X_iris[0:100], y_iris[0:100])  # only 2 classes
    r3 = sclf3.predict_meta_features(first_two_rows)
    assert r3.shape == (2, 2), r3.shape
コード例 #36
0
def test_works_with_df_if_fold_indexes_missing():
    """Regression test: fit must work when DataFrame ids differ from positions.

    Fold indexes from the cv (e.g. skf) are positional, but training data can
    have ids that those indexes cannot address. Some possibilities:

    + ids are not neatly consecutive ([341, 345, 543, ...] rather than
      [0, 1, ... n])
    + ids start from a number greater than the size of the input (see the
      test case below)

    Training ids sometimes carry other information, so row selection based
    on cv should not break. This is fixed in the code using `safe_indexing`.
    """

    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    stclf = StackingCVClassifier(
        classifiers=[forest, forest],
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        random_state=42,
        use_features_in_secondary=True,
    )

    # Offset every label by 1000 so no label equals a row position.
    offset = 1000
    X_modded = pd.DataFrame(
        X_breast, index=np.arange(X_breast.shape[0]) + offset)
    y_modded = pd.Series(
        y_breast, index=np.arange(y_breast.shape[0]) + offset)

    X_train, _, y_train, _ = train_test_split(
        X_modded, y_modded, test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    train_acc = round(stclf.score(X_train, y_train), 2)
    assert train_acc == 0.99, train_acc
コード例 #37
0
def test_meta_feat_reordering():
    """Seeded, shuffled stacking keeps meta-features aligned with y_train.

    The ROC AUC of the second stored meta-feature column against the training
    labels, rounded to two decimals, must match a value selected from the
    installed scikit-learn version.
    """
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=meta_lr,
        shuffle=True,
        random_state=42,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(
        X_breast, y_breast, random_state=0, test_size=0.3)
    stclf.fit(X_train, y_train)

    older_sklearn = Version(sklearn_version) < Version("0.21")
    expected_value = 0.86 if older_sklearn else 0.87

    second_column_auc = round(
        roc_auc_score(y_train, stclf.train_meta_features_[:, 1]), 2)
    assert second_column_auc == expected_value, second_column_auc
コード例 #38
0
# Soft-voting ensemble over rf/lr/gb with weights 3/2/3; it serves as the
# meta-classifier of the stack below.
eclf = VotingClassifier(
    estimators=[('rf', rf), ('lr', lr), ('gb', gb)],
    voting='soft',
    weights=[3, 2, 3],
)

# Build the stacking classifier: base-model probabilities plus the original
# features are passed to the meta-classifier.
from mlxtend.classifier import StackingCVClassifier
sclf = StackingCVClassifier(
    classifiers=[rf, lr, gb, et, gnb, svc, knn, xgb, ada, mlp, lda, qda],
    use_features_in_secondary=True,
    use_probas=True,
    meta_classifier=eclf,
)

# Mean 5-fold cross-validation score per metric, in this order:
# accuracy, precision, recall, ROC AUC.
cmetrics = [
    cross_val_score(sclf, X.values, y.values, cv=5,
                    scoring=metric_name).mean()
    for metric_name in ('accuracy', 'precision', 'recall', 'roc_auc')
]

# Fit on the full training data and predict the test data.
sclf.fit(X.values, y.values)
pred = sclf.predict(Xt.values)

# ROC curve on the test data, using column 1 of the predicted probabilities.
pred_proba = sclf.predict_proba(Xt.values)[:, 1]
fpr, tpr, threshold = roc_curve(yt, pred_proba)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('ROC_curve_test.png', bbox_inches='tight')
            setClass=clas,
            show=False)
elif (META == False):
    HeldOutDataPredictions = pf.Classification_Model(
        data_training=vec_training,
        target_training=out_train,
        data_testing=vec_testing,
        Classifier=EnsembleCustom[0][1],
        target_testing=None,
        ModelName=EnsembleCustom[0][0],
        accur=False,
        grph=False,
        setClass=clas,
        show=False)
else:
    MetaClass.fit(vec_training, out_train)
    HeldOutDataPredictions = MetaClass.predict(vec_testing)

# Elapsed time since tStart, which is set earlier, outside this excerpt.
runingTime = timeit.default_timer(
) - tStart  # stop the clock: difference between now and tStart
print("Fitting and predictions done in %0.4fs." % runingTime)
print("=" * 100)
""" PRINTING THE PREDICTIONS MADE AND SAVING CSV FILE """
# One-column frame holding the held-out predictions.
Preds = pd.DataFrame({"Category": HeldOutDataPredictions})
# Put the "id" column of dataTest beside the predictions.
# NOTE(review): concat with axis=1 aligns rows by index label -- confirm that
# dataTest uses the same 0..n-1 index as Preds.
Results = pd.concat([dataTest["id"], Preds], axis=1, sort=False)
print(Results)
# Project helper; presumably writes Results to a comma-separated file with a
# header row and no index column -- verify against pf.Write_File_DF.
pf.Write_File_DF(Data_Set=Results,
                 File_Name="Predictions_Group_4",
                 separation=",",
                 head=True,
                 ind=False)
コード例 #40
0
# Third base classifier and the meta-classifier. RFC and RC are import
# aliases defined outside this excerpt (presumably a random-forest and a
# ridge classifier -- confirm against the imports).
clf3 = RFC()
meta_clf = RC()

# In[ ]:

# Stacking ensemble: base-model probabilities and the original features are
# both passed to the meta-classifier.
# NOTE(review): clf1 is listed twice among the base classifiers -- confirm
# that this is intentional.
stacker = SCVC(classifiers=[clf1, clf2, clf3, clf1],
               meta_classifier=meta_clf,
               use_probas=True,
               use_features_in_secondary=True)

# In[ ]:

# Fill missing values column by column with the median of the training
# column, for both the train and the test frame.
for c in train.columns:
    train[c] = train[c].fillna(train[c].median())
    test[c] = test[c].fillna(train[c].median())
# Fit on plain arrays taken from the frame and from Y.
stacker.fit(train.values, np.array(Y))

# In[ ]:

my_prediction = stacker.predict(test.values)

# In[ ]:

# Submission columns: PassengerId,Survived
submission = pd.DataFrame()
# The ids come from the index of the test frame.
submission['PassengerId'] = test.index.tolist()
submission['Survived'] = my_prediction

# In[ ]:

submission.to_csv("submission.csv", index=False)
コード例 #41
0
# Voting ensemble: soft voting over `estimators` with equal weights.
ensemble = VotingClassifier(estimators, voting='soft', weights=[1, 1, 1])
ensemble.fit(X_train, y_train)
pred = ensemble.predict(X_test)
print("predicted values----------:", pred)
# Persist the fitted ensemble. The context manager guarantees the file is
# flushed and closed; the previous bare open() call leaked the handle.
with open('ensemble-clf.sav', 'wb') as model_file:
    pickle.dump(ensemble, model_file)
# pred_op = ensemble.predict(otpt)
# print("Predicted values:" ,pred_op)
print('fscore:{0:.3f}'.format(f1_score(y_test, pred, average='micro')))

# Meta-classifier ensemble: stack mlp/xgb/rf probabilities under lr.
stack = StackingCVClassifier(classifiers=[mlp, xgb, rf],
                             meta_classifier=lr,
                             use_probas=True)
stack.fit(X_train.values, y_train.values)
pred2 = stack.predict(X_test.values)
print("predicted values: ", pred2)
print('fscore:{0:.3f}'.format(f1_score(y_test, pred2, average='micro')))
from sklearn.metrics import confusion_matrix
# NOTE(review): this confusion matrix is built from `pred` (the voting
# ensemble), not `pred2` (the stack) -- confirm which one is intended.
confusion_lr = confusion_matrix(y_test, pred)
# Persist the fitted stack, again closing the file deterministically.
with open('stack-clf.sav', 'wb') as model_file:
    pickle.dump(stack, model_file)
print(confusion_lr)

####################################################################################################################
# #REPORT AND PLOT MICRO-AVERAGE ROC AUC FOR EACH MODEL
# from sklearn.preprocessing import label_binarize
# import matplotlib.pyplot as plt
# from itertools import cycle
# from sklearn.multiclass import OneVsRestClassifier
# from scipy import interp
def main_leave_one_week(offline, mall_ids=-1, save_offline_predict=False):
    """Train one stacked classifier per mall, evaluate it, optionally predict.

    For every mall the feature matrix is the column-wise concatenation of the
    "strong" wifi columns, longitude/latitude and weekday/hour. A
    ``StackingCVClassifier`` is fitted on the training part of the split
    returned by ``get_last_one_week_index`` and scored on the validation part.
    When ``offline`` is falsy, a second stacker is fitted on all training
    rows of the mall and used to predict the mall's test rows.

    Args:
        offline: When truthy only the offline evaluation runs; when falsy the
            test predictions are produced and saved as well.
        mall_ids: Iterable of mall ids to process, or ``-1`` (default) to
            process every ``mall_id`` found in the shop info.
        save_offline_predict: When truthy, write each mall's offline
            predictions and real labels to
            ``../result/offline_predict/<mall_id>.csv``.

    Side effects:
        Prints progress and accuracies, writes results through ``save_acc`` /
        ``save_result``, and terminates the process with status 1 (after the
        per-mall report, before anything is saved through ``save_acc`` /
        ``save_result``) when fewer than 50 malls were processed.
    """
    import sys  # function-scope import: only needed for the early exit below

    model_name = "stack_balance_strong_matrix_lonlat_wh"
    train_all = load_train()
    test_all = load_testA()
    shop_info = load_shop_info()
    # -1 is the "all malls" sentinel. The isscalar guard keeps array inputs
    # away from `== -1`, which would be elementwise (ambiguous truth value).
    if np.isscalar(mall_ids) and mall_ids == -1:
        mall_ids = shop_info.mall_id.unique()
    offline_predicts = {}
    all_rowid = {}
    offline_reals = {}
    all_predicts = {}

    # stack cv
    cv = 3

    # stack base model
    def get_model1():
        model1 = RandomForestClassifier(n_estimators=500,
                                        n_jobs=-1,
                                        class_weight="balanced")
        return model1

    def get_model2():
        model2 = OneVsRestClassifier(estimator=RandomForestClassifier(
            n_estimators=188, n_jobs=-1, class_weight="balanced"))
        return model2

    # stack meta model
    def get_meta_model():
        meta_model = RandomForestClassifier(n_estimators=777,
                                            n_jobs=-1,
                                            class_weight="balanced")
        return meta_model

    def build_stack():
        # Fresh, unfitted stacker; shared by the offline and online fits so
        # both are guaranteed to use the same configuration.
        return StackingCVClassifier([get_model1(), get_model2()],
                                    get_meta_model(),
                                    use_probas=True,
                                    use_features_in_secondary=True,
                                    cv=cv)

    for _index, mall_id in enumerate(mall_ids):
        # Single-argument print calls produce the same text as the former
        # print statements and are valid on both Python 2 and Python 3.
        print("train:  " + str(mall_id) +
              "  {}/{}".format(_index + 1, len(mall_ids)))
        shops = shop_info[shop_info.mall_id == mall_id].shop_id.unique()
        train = train_all[train_all.mall_id == mall_id]
        test = test_all[test_all.mall_id == mall_id]

        # y label encoder
        y = train.shop_id.values
        label_encoder = LabelEncoder().fit(y)
        y = label_encoder.transform(y)

        num_class = len(shops)
        print("num_class " + str(num_class))

        # all wifi matrix
        # NOTE(review): get_wifi_cache2 is defined elsewhere; element [2] of
        # each cache is used as the wifi matrix and element [0] of the test
        # cache as the test row index -- confirm against its definition.
        df, train_cache, test_cache = get_wifi_cache2(mall_id)
        train_matrix_origin_all = train_cache[2]
        test_matrix_origin_all = test_cache[2]
        test_index = test_cache[0]

        # choose_strong_wifi_index
        # The column selection is derived from the training matrix only and
        # then applied to both the train and the test matrix.
        strong_wifi_index = choose_strong_wifi_index(-90, 6,
                                                     train_matrix_origin_all)
        train_strong_matrix = train_matrix_origin_all[:, strong_wifi_index]
        test_strong_matrix = test_matrix_origin_all[:, strong_wifi_index]

        # train valid split and get index
        _train_index, _valid_index = get_last_one_week_index(train)

        # weekday and hour
        preprocess_basic_time(train)
        preprocess_basic_time(test)
        preprocess_basic_wifi(train)
        preprocess_basic_wifi(test)
        train_time_features = train[["weekday", "hour", "is_weekend"]].values
        test_time_features = test[["weekday", "hour", "is_weekend"]].values
        train_wh_features = train[["weekday", "hour"]].values
        test_wh_features = test[["weekday", "hour"]].values

        # whether a wifi is connected (currently not part of the features)
        train_connect_wifi = (
            train.basic_wifi_info.map(lambda x: len(x[1])).values >
            0).astype(int).reshape(-1, 1)
        test_connect_wifi = (
            test.basic_wifi_info.map(lambda x: len(x[1])).values >
            0).astype(int).reshape(-1, 1)

        # number of wifis found (currently not part of the features)
        train_search_wifi_size = train.basic_wifi_info.map(
            lambda x: x[0]).values.reshape(-1, 1)
        test_search_wifi_size = test.basic_wifi_info.map(
            lambda x: x[0]).values.reshape(-1, 1)

        # lon lat
        train_lonlats = train[["longitude", "latitude"]].values
        test_lonlats = test[["longitude", "latitude"]].values

        # concatenate train/test features
        train_matrix = np.concatenate(
            [
                train_strong_matrix,
                train_lonlats,
                train_wh_features,
                # train_connect_wifi,
                # train_search_wifi_size
            ],
            axis=1)

        test_matrix = np.concatenate(
            [
                test_strong_matrix,
                test_lonlats,
                test_wh_features,
                # test_connect_wifi,
                # test_search_wifi_size
            ],
            axis=1)

        # train valid get
        _train_x = train_matrix[_train_index]
        _train_y = y[_train_index]
        _valid_x = train_matrix[_valid_index]
        _valid_y = y[_valid_index]

        # offline
        # expansion train
        # NOTE(review): `expansion` is defined elsewhere; presumably it pads
        # rare classes so every class can appear in each of the `cv` folds --
        # confirm against its definition.
        _x, _y = expansion(_train_x, _train_y, cv)
        stack = build_stack()
        stack.fit(_x, _y)
        best_predict = stack.predict(_valid_x)

        predict = label_encoder.inverse_transform(best_predict)
        offline_predicts[mall_id] = predict
        _real_y = label_encoder.inverse_transform(_valid_y)
        offline_reals[mall_id] = _real_y
        print(mall_id + "'s acc is " + str(acc(predict, _real_y)))

        # online
        if not offline:
            # expansion train
            _x, _y = expansion(train_matrix, y, cv)
            stack = build_stack()

            stack.fit(_x, _y)
            predict = stack.predict(test_matrix)
            predict = label_encoder.inverse_transform(predict)
            all_predicts[mall_id] = predict
            all_rowid[mall_id] = test_all[np.in1d(test_all.index,
                                                  test_index)].row_id.values

    # offline acc result
    result = {}
    for _mall_id in mall_ids:
        _acc = acc(offline_predicts[_mall_id], offline_reals[_mall_id])
        print(_mall_id + "'s acc is " + str(_acc))
        result[_mall_id] = _acc

        if save_offline_predict:
            pd.DataFrame({
                "predict": offline_predicts[_mall_id],
                "real": offline_reals[_mall_id]
            }).to_csv("../result/offline_predict/{}.csv".format(_mall_id),
                      index=None)

    # Concatenate predictions and labels through ONE shared key list so both
    # arrays are guaranteed to be aligned mall by mall (two independent
    # dict.values() calls give no such guarantee), and pass a real list so
    # np.concatenate also works where dict.values() is a view.
    evaluated_malls = list(offline_predicts)
    all_predict = np.concatenate(
        [offline_predicts[_mall_id] for _mall_id in evaluated_malls])
    all_true = np.concatenate(
        [offline_reals[_mall_id] for _mall_id in evaluated_malls])
    _acc = acc(all_predict, all_true)
    print("all acc is " + str(_acc))

    # Runs on fewer than 50 malls stop here, before any aggregate result is
    # saved. sys.exit replaces the site-injected exit() helper, which is
    # meant for interactive use and is not guaranteed to exist.
    if len(mall_ids) < 50:
        sys.exit(1)

    result["all_acc"] = _acc
    path = "../result/offline/{}".format(model_name)
    save_acc(result, path, None)

    # online save result
    if not offline:
        # Same alignment rule as above: row ids and predicted shop ids are
        # gathered in one shared mall order so each row keeps its prediction.
        predicted_malls = list(all_predicts)
        online_rowid = np.concatenate(
            [all_rowid[_mall_id] for _mall_id in predicted_malls])
        online_predict = np.concatenate(
            [all_predicts[_mall_id] for _mall_id in predicted_malls])
        result = pd.DataFrame(data={
            "row_id": online_rowid,
            "shop_id": online_predict
        })
        result.sort_values(by="row_id", inplace=True)
        path = "../result/online/{}".format(model_name)
        save_result(result, path, None)