コード例 #1
0
def test_predict_meta_features():
    """Predictions from a fitted StackingCVClassifier are 1-D class labels."""
    base_knn = KNeighborsClassifier()
    base_nb = GaussianNB()
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')

    X_train, X_test, y_train, y_test = train_test_split(
        X_iris, y_iris, test_size=0.3)

    # Default behaviour: predict() returns class labels, one per test row.
    stacker = StackingCVClassifier(
        classifiers=[base_knn, base_nb],
        meta_classifier=meta_lr,
        store_train_meta_features=True)
    stacker.fit(X_train, y_train)
    predictions = stacker.predict(X_test)
    assert predictions.shape == (X_test.shape[0],)
コード例 #2
0
 def stack(self,X,y,test_X):
     """
     Model fusion via stacking.

     Fits a StackingCVClassifier built from ``self.clfArr`` (base
     classifiers) with ``self.lr`` as the meta-classifier, predicts on the
     test set, and reports 5-fold CV accuracy on the training data.

     :param X: training feature matrix, array or list
     :param y: ground-truth labels, array or list
     :param test_X: test feature matrix, array or list
     :return:
             result_Y: predictions for the test data
     """
     logging.info('------Stacking之后的模型效果')
     # 4-fold internal CV generates the out-of-fold meta-features.
     sclf = StackingCVClassifier(classifiers=self.clfArr,meta_classifier=self.lr,cv=4)
     # sclf = StackingClassifier(classifiers=self.clfArr,meta_classifier=self.lr,verbose=1)
     # The stacker expects numpy arrays; flatten y to 1-D.
     X=np.array(X)
     y=np.array(y).flatten()
     sclf.fit(X,y)
     result_Y = sclf.predict(test_X)
     # NOTE(review): cross_val_score refits clones of the (already fitted)
     # stacker, so this reports training-data CV accuracy, not test accuracy.
     scores = model_selection.cross_val_score(sclf,X,y,cv=5,scoring='accuracy')
     print('The  Accuracy , mean: {:.5f} , std:+/- {:.5f}'.format(scores.mean(), scores.std()))
     return result_Y
コード例 #3
0
# Base learners for the stacking ensemble.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion="entropy")
ada = AdaBoostClassifier(n_estimators=75, learning_rate=1.5)
etc = ExtraTreesClassifier(n_jobs=-1, n_estimators=5, criterion="entropy")
# lr = LogisticRegression(n_jobs=-1, C=100)  # meta classifier, 2 trees, c=100 is used in stacking2.pkl
lr = LogisticRegression(n_jobs=-1, C=8)  # meta classifier

# use_probas feeds predicted probabilities (not hard labels) to the
# meta-classifier.
sclf = StackingCVClassifier(classifiers=[ada, rfc, etc], meta_classifier=lr, use_probas=True, verbose=3)
sclf.fit(X,y)
print("training finished")

# Load the held-out "corrected" test set and apply the same preprocessing
# pipeline that was used for training.
df=pd.read_csv(r'data/corrected',header=None, names=__ATTR_NAMES)
df = processing.merge_sparse_feature(df)
# one hot encoding
df = processing.one_hot(df)
# y labels mapping
df = processing.map2major5(df)
with open(r'data/selected_feat_names.pkl', 'rb') as f:
    selected_feat_names = pickle.load(f)
print("test data loaded")

# X/y are rebound to the TEST data here; the model was fitted above.
X = df[selected_feat_names].values
y = df['attack_type'].values
y_rf = sclf.predict(X)

# Cost-based scoring; third positional arg is presumably `show` — confirm
# against cost_based_scoring.score's signature.
print("stacking results:")
cost_based_scoring.score(y, y_rf, True)




コード例 #4
0
                          solver='lbfgs')
clf6 = svm.SVC(C=2, gamma=0.1)

lr = LogisticRegression()  # clf1, clf2, clf3,clf4,clf5,clf6
# NOTE(review): clf3 is excluded from the stack (it appears as 'catboost'
# in the commented label list below) — confirm this is intentional.
sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf4, clf5, clf6],
                            meta_classifier=lr)

# for clf, label in zip(
#     [clf1, clf2, clf3,clf4,clf5,clf6, sclf],
#     ['xgb', 'lgb', 'catboost','RF','LR','svc', 'StackingClassifier']):
#
#     scores = model_selection.cross_val_score(clf, data_tr, label_tr, cv=3, scoring='accuracy')
#     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
sclf.fit(data_tr, label_tr)
label_te_true = np.array(label_te)
pre = sclf.predict(data_te)
# Predict for the out-of-sample companies.
zhizaoye = sclf.predict(zhiyaoye_pre_data)
print(zhizaoye)

# Export per-ticker predictions (GBK encoding for Chinese Excel).
number = data4['TICKER_SYMBOL'].values
dataframe = pd.DataFrame({'股票编号': number, '房地产业': zhizaoye})
dataframe.to_csv("房地产业-预测结果.csv", index=False, encoding='GBK')
# NOTE(review): AUC is computed from hard labels rather than
# predict_proba, which understates the score — confirm intent.
print("the stacking model auc: %.4g" %
      metrics.roc_auc_score(label_te_true, pre))
print(classification_report(label_te_true, pre))
print("stacking auc值为:", roc_auc_score(label_te_true, pre))
# ROC curve plotting
fpr1, tpr1, threshold1 = roc_curve(label_te_true, pre)
plt.plot(fpr1, tpr1, color='red')
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.xlim([0.0, 1.0])
コード例 #5
0
ファイル: Training.py プロジェクト: snowdj/MachineLearning_V
        xgb.XGBClassifier(max_depth=6, n_estimators=100, num_round=5),
        RandomForestClassifier(n_estimators=100, max_depth=6, oob_score=True),
        GradientBoostingClassifier(learning_rate=0.3,
                                   max_depth=6,
                                   n_estimators=100)
    ]
    # Meta-learner: a lightly regularised logistic regression.
    clf2 = LogisticRegression(C=0.5, max_iter=100)
    #============================================================================#
    from mlxtend.classifier import StackingClassifier, StackingCVClassifier
    # Plain stacking: base models fitted on the full training set.
    sclf = StackingClassifier(classifiers=clfs, meta_classifier=clf2)
    sclf.fit(X_train, Y_train)
    # Training-set accuracy (optimistic estimate; no hold-out here).
    print(sclf.score(X_train, Y_train))
    sclf_pre = sclf.predict(X_test)
    sclf_sub = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": sclf_pre
    })
    sclf_sub.to_csv("../data/sclf_sub.csv", index=False)
    #===============================================================================#
    # Cross-validated stacking: the meta-learner trains on 5-fold
    # out-of-fold predictions, which reduces meta-level overfitting.
    sclf2 = StackingCVClassifier(classifiers=clfs, meta_classifier=clf2, cv=5)
    # StackingCVClassifier expects numpy arrays; flatten Y to 1-D.
    x = np.array(X_train)
    y = np.array(Y_train).flatten()
    sclf2.fit(x, y)
    print(sclf2.score(x, y))
    sclf2_pre = sclf2.predict(np.array(X_test))
    sclf2_sub = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": sclf2_pre
    })
    sclf2_sub.to_csv("../data/sclf2_sub.csv", index=False)
            show=False)
# NOTE(review): the matching `if` branch of this chain is outside this
# excerpt; META selects between a custom ensemble and the meta-classifier.
elif (META == False):
    # Custom-ensemble path: fit/predict via the project helper (no
    # ground-truth labels for the held-out set, hence target_testing=None).
    HeldOutDataPredictions = pf.Classification_Model(
        data_training=vec_training,
        target_training=out_train,
        data_testing=vec_testing,
        Classifier=EnsembleCustom[0][1],
        target_testing=None,
        ModelName=EnsembleCustom[0][0],
        accur=False,
        grph=False,
        setClass=clas,
        show=False)
else:
    # Meta-classifier path: fit the stacker directly.
    MetaClass.fit(vec_training, out_train)
    HeldOutDataPredictions = MetaClass.predict(vec_testing)

runingTime = timeit.default_timer(
) - tStart  #Stopping clock and getting time spent
print("Fitting and predictions done in %0.4fs." % runingTime)
print("=" * 100)
""" PRINTING THE PREDICTIONS MADE AND SAVING CSV FILE """
Preds = pd.DataFrame({"Category": HeldOutDataPredictions})
Results = pd.concat([dataTest["id"], Preds], axis=1, sort=False)
print(Results)
pf.Write_File_DF(Data_Set=Results,
                 File_Name="Predictions_Group_4",
                 separation=",",
                 head=True,
                 ind=False)
def main_leave_one_week(offline, mall_ids=-1, save_offline_predict=False):
    """Train a per-mall stacking model, validating on the last week.

    For every mall: build wifi-strength / lon-lat / weekday-hour features,
    fit a StackingCVClassifier on all but the last week of training data
    and report accuracy on that week; when ``offline`` is falsy, also refit
    on the full training data and collect test-set predictions.

    :param offline: if falsy, additionally produce online (test) predictions
    :param mall_ids: iterable of mall ids, or -1 for every mall
    :param save_offline_predict: dump per-mall offline predictions to CSV
    """
    model_name = "stack_balance_strong_matrix_lonlat_wh"
    train_all = load_train()
    test_all = load_testA()
    shop_info = load_shop_info()
    if mall_ids == -1:
        mall_ids = shop_info.mall_id.unique()
    # Per-mall accumulators keyed by mall_id.
    offline_predicts = {}
    all_rowid = {}
    offline_reals = {}
    all_predicts = {}

    for _index, mall_id in enumerate(mall_ids):
        print "train: ", mall_id, " {}/{}".format(_index + 1, len(mall_ids))
        shops = shop_info[shop_info.mall_id == mall_id].shop_id.unique()
        train = train_all[train_all.mall_id == mall_id]
        test = test_all[test_all.mall_id == mall_id]

        # y label encoder
        y = train.shop_id.values
        label_encoder = LabelEncoder().fit(y)
        y = label_encoder.transform(y)

        num_class = len(shops)
        print "num_class", num_class

        # all wifi matrix
        df, train_cache, test_cache = get_wifi_cache2(mall_id)
        train_matrix_origin_all = train_cache[2]
        test_matrix_origin_all = test_cache[2]
        test_index = test_cache[0]

        # choose_strong_wifi_index
        # Keep only wifi columns passing the (-90, 6) thresholds —
        # presumably min signal strength / min occurrences; confirm in helper.
        strong_wifi_index = choose_strong_wifi_index(-90, 6,
                                                     train_matrix_origin_all)
        train_strong_matrix = train_matrix_origin_all[:, strong_wifi_index]
        test_strong_matrix = test_matrix_origin_all[:, strong_wifi_index]

        # train valid split and get index
        _train_index, _valid_index = get_last_one_week_index(train)

        # weekday and hour
        preprocess_basic_time(train)
        preprocess_basic_time(test)
        preprocess_basic_wifi(train)
        preprocess_basic_wifi(test)
        train_time_features = train[["weekday", "hour", "is_weekend"]].values
        test_time_features = test[["weekday", "hour", "is_weekend"]].values
        train_wh_features = train[["weekday", "hour"]].values
        test_wh_features = test[["weekday", "hour"]].values

        # whether a wifi connection exists (1 if any connected wifi)
        train_connect_wifi = (
            train.basic_wifi_info.map(lambda x: len(x[1])).values >
            0).astype(int).reshape(-1, 1)
        test_connect_wifi = (
            test.basic_wifi_info.map(lambda x: len(x[1])).values >
            0).astype(int).reshape(-1, 1)

        # number of wifi networks detected
        train_search_wifi_size = train.basic_wifi_info.map(
            lambda x: x[0]).values.reshape(-1, 1)
        test_search_wifi_size = test.basic_wifi_info.map(
            lambda x: x[0]).values.reshape(-1, 1)

        # lon lat
        train_lonlats = train[["longitude", "latitude"]].values
        test_lonlats = test[["longitude", "latitude"]].values

        # concatenate train/test features
        train_matrix = np.concatenate(
            [
                train_strong_matrix,
                train_lonlats,
                train_wh_features,
                # train_connect_wifi,
                # train_search_wifi_size
            ],
            axis=1)

        test_matrix = np.concatenate(
            [
                test_strong_matrix,
                test_lonlats,
                test_wh_features,
                # test_connect_wifi,
                # test_search_wifi_size
            ],
            axis=1)

        # train valid get
        _train_x = train_matrix[_train_index]
        _train_y = y[_train_index]
        _valid_x = train_matrix[_valid_index]
        _valid_y = y[_valid_index]

        # stack base model
        # Fresh estimator instances per fit via factory functions.
        def get_model1():
            model1 = RandomForestClassifier(n_estimators=500,
                                            n_jobs=-1,
                                            class_weight="balanced")
            return model1

        def get_model2():
            model2 = OneVsRestClassifier(estimator=RandomForestClassifier(
                n_estimators=188, n_jobs=-1, class_weight="balanced"))
            return model2

        # stack meta model
        def get_meta_model():
            meta_model = RandomForestClassifier(n_estimators=777,
                                                n_jobs=-1,
                                                class_weight="balanced")
            return meta_model

        # stack cv
        cv = 3

        # offline
        # expansion train
        _x, _y = expansion(_train_x, _train_y, cv)
        stack = StackingCVClassifier([get_model1(), get_model2()],
                                     get_meta_model(),
                                     use_probas=True,
                                     use_features_in_secondary=True,
                                     cv=cv)
        stack.fit(_x, _y)
        best_predict = stack.predict(_valid_x)

        # Map encoded labels back to shop ids for reporting.
        predict = label_encoder.inverse_transform(best_predict)
        offline_predicts[mall_id] = predict
        _real_y = label_encoder.inverse_transform(_valid_y)
        offline_reals[mall_id] = _real_y
        print mall_id + "'s acc is", acc(predict, _real_y)

        # online
        if not offline:
            # expansion train
            _x, _y = expansion(train_matrix, y, cv)
            stack = StackingCVClassifier(
                [get_model1(), get_model2()],
                get_meta_model(),
                use_probas=True,
                use_features_in_secondary=True,
                cv=cv)

            stack.fit(_x, _y)
            predict = stack.predict(test_matrix)
            predict = label_encoder.inverse_transform(predict)
            all_predicts[mall_id] = predict
            all_rowid[mall_id] = test_all[np.in1d(test_all.index,
                                                  test_index)].row_id.values

    # offline acc result
    result = {}
    for _mall_id in mall_ids:
        _acc = acc(offline_predicts[_mall_id], offline_reals[_mall_id])
        print _mall_id + "'s acc is", _acc
        result[_mall_id] = _acc

        if save_offline_predict:
            pd.DataFrame({
                "predict": offline_predicts[_mall_id],
                "real": offline_reals[_mall_id]
            }).to_csv("../result/offline_predict/{}.csv".format(_mall_id),
                      index=None)

    # NOTE(review): the names look swapped here (reals -> all_predict,
    # predicts -> all_true); harmless if acc() is symmetric — confirm.
    all_predict = np.concatenate(offline_reals.values())
    all_true = np.concatenate(offline_predicts.values())
    _acc = acc(all_predict, all_true)
    print "all acc is", _acc

    # Partial runs (fewer than 50 malls) stop here without saving results.
    if len(mall_ids) < 50:
        exit(1)

    result["all_acc"] = _acc
    path = "../result/offline/{}".format(model_name)
    save_acc(result, path, None)

    # online save result
    if not offline:
        all_rowid = np.concatenate(all_rowid.values())
        all_predict = np.concatenate(all_predicts.values())
        result = pd.DataFrame(data={
            "row_id": all_rowid,
            "shop_id": all_predict
        })
        result.sort_values(by="row_id", inplace=True)
        path = "../result/online/{}".format(model_name)
        save_result(result, path, None)
コード例 #8
0
ファイル: kdd_classify.py プロジェクト: CBVon/graph_learning
# Stack a decision tree, random forest and extra trees under a logistic
# regression meta-classifier.
scvc = StackingCVClassifier(classifiers=[dtc, rfc, etc],
                            meta_classifier=lr,
                            use_probas=True,
                            verbose=0)
# meta_classifier : the second-level model trained on the base models' outputs
# use_probas : If True, trains meta-classifier based on predicted probabilities instead of class labels.
# verbose>2: changes the verbose param of the underlying classifiers to self.verbose - 2
start_time = time.time()
scvc = scvc.fit(
    train_x.values,
    train_y["label"].values)  # the stacker expects numpy arrays, hence .values
end_time = time.time()
print("StackingCVClassifier, training finished, using : %.2f s" %
      (end_time - start_time))
predict_y = scvc.predict(test_x)
#print classification_report(test_y["label"].values, predict_y)
print "score : " + str(
    cost_based_scoring.score(test_y["label"].values, predict_y, show=False))
print "---------- ----------"
'''
20180524 最终输出
train data loaded
test data loaded
LogisticRegression : 
LogisticRegression, training finished, using : 105.72 s
score : 0.491240366654
---------- ----------
DecisionTreeClassifier : 
DecisionTreeClassifier, training finished, using : 2.71 s
score : 0.23169543676
コード例 #9
0
                            use_probas=True,
                            meta_classifier=mlp,
                            cv=7,
                            store_train_meta_features=True,
                            stratify=True,
                            verbose=3,
                            n_jobs=-1,
                            random_state=seed)

# Evaluate the stacker with 3-fold CV. Note: cross_val_score fits fresh
# clones of sclf on each fold; it does NOT fit sclf itself.
sclf_cv_score = cross_val_score(sclf,
                                df_train[selected_columns].values,
                                y=y,
                                scoring='accuracy',
                                cv=3)
print(f"Mean accuracy {sclf_cv_score.mean(): .4f}")
print(f"+/- {sclf_cv_score.std(): .2f}")

###############################################################################

# BUG FIX: sclf was never fitted (cross_val_score only fits clones), so
# predict() would raise NotFittedError. Fit on the full training data first.
sclf.fit(df_train[selected_columns].values, y)

predictions = sclf.predict(
    df_test[selected_columns].values
)  # Add values attribute to rid of 'feature_names mismatch'
# Map the encoded predictions back to the original label values.
final_pred = le.inverse_transform(predictions)
final_pred = [int(i) for i in final_pred]

print(final_pred)

# Model voting submission
output = pd.DataFrame({'Id': test_ids, 'Cover_Type': final_pred})
output.to_csv('submission48.csv', index=False, header=True)
コード例 #10
0
                     random_state=2018,
                     n_jobs=8)
svc = SVC(kernel='rbf', random_state=2018, probability=True, gamma='auto')
lr = LogisticRegression(max_iter=1000, solver='lbfgs', penalty='l2', n_jobs=8)
models = [rf, xgb, lgb, svc]
# Hand-rolled stacking implementation, scored for comparison below.
y_pred_self, y_prob_self = StackingModels(models=models,
                                          meta_model=lr,
                                          X_train=X_train,
                                          X_test=X_test,
                                          y_train=y_train)
acc = accuracy_score(y_test, y_pred_self)
auc = roc_auc_score(y_test, y_prob_self)
print('MyModel:  ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc))
# Reference implementation: mlxtend StackingCVClassifier, 5-fold CV.
stack_clf = StackingCVClassifier(classifiers=models, meta_classifier=lr,
                                 cv=5).fit(X_train, y_train)
# [:, -1] selects the positive-class probability for the AUC.
y_pred_mxltend, y_prob_mxltend = stack_clf.predict(
    X_test), stack_clf.predict_proba(X_test)[:, -1]
acc = accuracy_score(y_test, y_pred_mxltend)
auc = roc_auc_score(y_test, y_prob_mxltend)
print('Mlxtend:  ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc))

# Synthetic regression benchmark data.
X, y = make_regression(n_samples=5000,
                       n_features=20,
                       n_informative=18,
                       random_state=2018)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=2018)
# BUG FIX: the scaler was previously fit_transform-ed on the test split as
# well (data leakage — test statistics influenced the scaling). Fit on the
# training split only, then apply the same transform to both.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rf = RandomForestRegressor(n_estimators=50,
                           n_jobs=-1)

ensemble = [('ex_cls', ex_cls), ('rf2', rf2_clf), ('rf', rf_clf)]

# Stack the three tree ensembles; the RF meta-classifier sees the base
# models' probabilities plus the original features.
stack = StackingCVClassifier(classifiers=[clf for label, clf in ensemble],
                             meta_classifier=rf_clf,
                             cv=5,
                             use_probas=True,
                             use_features_in_secondary=True,
                             verbose=1)
# HOLD-OUT
X_train, X_valid, y_train, y_valid = train_test_split(X.values,
                                                      y.values,
                                                      train_size=0.8,
                                                      test_size=0.2,
                                                      random_state=42)

stack = stack.fit(X_train, y_train)
pr = stack.predict(X_valid)

# MAE
# NOTE(review): MAE over class labels only makes sense if the labels are
# ordinal — confirm.
y_nump = np.array(y_valid)
mae = mean_absolute_error(pr, y_valid)
print("Mean Absolute Error:", mae)
print("Good predicted: ", np.sum(pr == y_nump), "of: ", y_valid.shape[0])
print("Accuracy Score: ", accuracy_score(pr, y_valid) * 100)

# PREDICTION OF TEST
pr_final_test = stack.predict(X_test_full)
print(pr_final_test.shape)
コード例 #12
0
ファイル: 5_2_train_stack.py プロジェクト: CameleoGrey/Monlan
    ExtraTreesClassifier(n_estimators=1000, max_depth=2, n_jobs=8),
    ExtraTreesClassifier(n_estimators=1000, max_depth=4, n_jobs=8),
    ExtraTreesClassifier(n_estimators=1000, max_depth=10, n_jobs=8),
    ExtraTreesClassifier(n_estimators=1000, max_depth=30, n_jobs=8),
]
# Meta-classifier: another extra-trees model stacked on the base learners.
meta_clf = ExtraTreesClassifier(n_estimators=1000, max_depth=30, n_jobs=8)

# 20-fold stacking; the meta level sees both the base-model probabilities
# and the original features.
stacker = StackingCVClassifier(classifiers=clfList,
                               meta_classifier=meta_clf,
                               use_probas=True,
                               use_features_in_secondary=True,
                               cv=20,
                               random_state=15,
                               verbose=1)

stacker.fit(x_train, y_train)
#y_pred = sclf.predict(x_test)
#score(y_pred, y_test)

#model = load("../models/catboost_model.pkl")
# Score on the validation split, then on the test split.
val_preds = stacker.predict(x_val)
val_scores = get_all_scores(val_preds, y_val)
print(val_scores)
test_preds = stacker.predict(x_test)
test_scores = get_all_scores(test_preds, y_test)
print(test_scores)
probas_test = stacker.predict_proba(x_test)
save(stacker, "../models/ex_stack.pkl")

print("done")
コード例 #13
0
# sklearn's StackingClassifier requires (name, estimator) tuples in
# `estimators` — the bare list previously passed would raise at fit time.
# Names follow the hint in the comment below.
clf = StackingClassifier(estimators=[('knc', gs_cv_knc), ('knn', gs_tfidf_knn)],
                         final_estimator=LogisticRegression(
                             class_weight='balanced',
                             multi_class='multinomial'))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

#  ('knn', gs_tfidf_knn), ('knc', gs_cv_knc)

# mlxtend's StackingCVClassifier takes bare estimators.
sclf = StackingCVClassifier(classifiers=[gs_cv_knc, gs_tfidf_knn],
                            meta_classifier=LogisticRegression(
                                class_weight='balanced',
                                multi_class='multinomial'),
                            random_state=15)
sclf.fit(X_train, y_train)
# BUG FIX: was sclf.predict(y_test) — predict takes the feature matrix.
y_pred = sclf.predict(X_test)
# ENSEMBLE - VoteClassifier:

# All fitted GridSearch models, paired positionally with the short labels
# below. NOTE(review): `gs_tfid_knc` looks like a typo for `gs_tfidf_knc`
# (its label is 'tfidf_knc') — confirm which variable actually exists.
models_list = [
    gs_cv_cnb, gs_tfidf_cnb, gs_cv_knn, gs_tfidf_knn, gs_cv_log, gs_tfidf_log,
    gs_cv_rfc, gs_tfidf_rfc, gs_cv_knc, gs_tfid_knc, gs_cv_sgd, gs_tfidf_sgd
]
models_labels = [
    'cv_cnb', 'tfidf_cnb', 'cv_knn', 'tfidf_knn', 'cv_log', 'tfidf_log',
    'cv_rfc', 'tfidf_rfc', 'cv_knc', 'tfidf_knc', 'cv_sgd', 'tfidf_sgd'
]

# Every 3-model subset to be tried as a voting ensemble.
models_comb = list(itertools.combinations(
    models_list, 3))  #create all possible combinations of models
labels_comb = list(itertools.combinations(
コード例 #14
0
ファイル: stacking_test.py プロジェクト: oxygensu/Titan
# X即特征属性值
X = train_np[:, 1:]

# train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3)
test_df = test_data
test_np = test_df.values
test_x = test_np[:, 0:]

lr = LogisticRegression(C=0.8,penalty='l2',tol=1e-6)
DTree = DecisionTreeClassifier(max_depth=20)
rfc=RandomForestClassifier(n_estimators=5000)

xgbc = XGBClassifier(learning_rate=0.001, n_estimators=5000, max_depth=30, objective='binary:logitraw')
gbc=GradientBoostingClassifier(learning_rate=0.001, n_estimators=5000, max_depth=30)

sclf = StackingCVClassifier(classifiers=[lr,DTree,rfc,SVC(probability=True)], meta_classifier=xgbc, use_probas=True)
sclf.fit(X, y)
print(sclf.score(X, y))

test_id = pd.read_csv('data/origin/test.csv')

predictions = sclf.predict(test_x)
result = pd.DataFrame({'PassengerId':test_id['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
result.to_csv("data/predictions/stacking_test4.csv", index=False)

answer = pd.read_csv('data/predictions/submission.csv')
answer_np = answer['Survived'].values
print('acc = %.5f' % accuracy_score(answer_np, predictions))

# from calculate_acc import calculate_acc
# calculate_acc(predictions)
コード例 #15
0
               "CatBoost": classifier4,
               "ET": classifier5,
               "Stack": sclf}

# Fit every model in the registry in place.
for name, model in classifiers.items():
    model.fit(X_train, y_train)
    classifiers[name] = model

pred = sclf.predict(X_test)

# One DataFrame column of test-set predictions per model.
results = pd.DataFrame()
for name in classifiers:
    results[f"{name}"] = classifiers[name].predict(X_test)

# Keep the ground truth alongside the predictions.
results["Target"] = y_test

# F1 of the stacking ensemble against the true labels.
pred_stack = results['Stack']
score = f1_score(y_test, pred_stack)
コード例 #16
0
# SGD base learner: logistic loss with an elastic-net penalty.
sgd = SGDClassifier(eta0=1, max_iter=1000, tol=0.0001, alpha=0.01, l1_ratio=1.0, learning_rate='adaptive', loss='log', penalty='elasticnet')


# set up the meta classifier (level 2 model)
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier
np.random.seed(RANDOM_SEED)
lr = LogisticRegression(max_iter=1000, class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')
sclf = StackingCVClassifier(classifiers=[knn, rf, nb, svc, sgd, lgbm], 
                            use_probas=True,
                            use_features_in_secondary=True,
                            meta_classifier=lr,
                            cv=6)

sclf.fit(train, targets)
preds = sclf.predict(test)
print(preds)
# NOTE(review): exit(0) makes everything below unreachable dead code.
exit(0)


# Set up K-Fold cross validation and predictions
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

num_folds = 6
folds = KFold(n_splits=num_folds, shuffle=True)

# Accumulators for per-fold test predictions and the running AUC.
test_result = np.zeros(len(test))
auc_score = 0
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, targets)):
# Dispatch on run mode: single models (optionally all of them in parallel),
# a custom ensemble, or the stacking meta-classifier.
if(SINGLE):
    if(ALL):
        # Fit and evaluate every classifier in parallel threads.
        Preds = Parallel(n_jobs=-1, verbose=1, backend="threading")(delayed(pf.Classification_Model)(data_training=vec_training, target_training=out_train,
                           data_testing=vec_testing, Classifier=Model[1], target_testing=out_test, ModelName=Model[0], accur=True, grph=False, setClass=clas, show=False) for Model in ListAllClassifiers)
    else:
        # Only classifier #10 from the list.
        Preds = pf.Classification_Model(data_training=vec_training, target_training=out_train, data_testing=vec_testing,
                                    Classifier=ListAllClassifiers[10][1], target_testing=out_test, ModelName=ListAllClassifiers[10][0],
                                    accur=True, grph=True, setClass=clas, show=False)
elif(META==False):
    # Custom-ensemble path.
    Preds = pf.Classification_Model(data_training=vec_training, target_training=out_train, data_testing=vec_testing,
                                    Classifier=EnsembleCustom[0][1], target_testing=out_test, ModelName=EnsembleCustom[0][0],
                                    accur=True, grph=True, setClass=clas, show=False)

else:
    # Stacking meta-classifier path: fit, predict, and plot a class report.
    MetaClass.fit(vec_training, out_train)
    Preds = MetaClass.predict(vec_testing)
    pf.ClassReport_Graph(Classif=MetaClass, Data_train=vec_training, Target_train=out_train, Data_test=vec_testing, Target_test=out_test,
                         Class=clas, ModelName='Stacking CV Classifier', Accur=True, Predict=Preds)

runingTime = timeit.default_timer() - tStart #Stopping clock and getting time spent
print("Fitting and predictions done in %0.4fs."%runingTime)
print("="*100)

pf.Get_ConfusionMatrix(TrueLabels=out_test, PredictedLabels=Preds, Classes=clas, Normal=True, Title='Confusion matrix', ColorMap='rainbow', FigSize=(30,30), save=False)

#r = pd.DataFrame({"SGDC":ResultPreds[0], "Bernoulli":ResultPreds[1], "Multinomial":ResultPreds[2], "RandomForest":ResultPreds[3], "ExtraTrees":ResultPreds[4], "GradientBoosting":ResultPreds[5], "AdaBoosting":ResultPreds[6]})
#
##################################################################################################################################
'''                                DEFAULT PARAMETERS FOR THE DIFFERENT CLASSIFIERS AVAILABLE                                '''
##################################################################################################################################
"""
コード例 #18
0
# Slope-adjusted distance to hydrology (Euclidean norm of the two components).
test['slope_hyd'] = (test['Horizontal_Distance_To_Hydrology']**2 +
                     test['Vertical_Distance_To_Hydrology']**2)**0.5
test.slope_hyd = test.slope_hyd.map(
    lambda x: 0 if np.isinf(x) else x)  # remove infinite value if any

#Mean distance to Amenities
test['Mean_Amenities'] = (test.Horizontal_Distance_To_Fire_Points +
                          test.Horizontal_Distance_To_Hydrology +
                          test.Horizontal_Distance_To_Roadways) / 3
#Mean Distance to Fire and Water
test['Mean_Fire_Hyd'] = (test.Horizontal_Distance_To_Fire_Points +
                         test.Horizontal_Distance_To_Hydrology) / 2

# Use every engineered feature except the target and the row id.
feature = [col for col in train.columns if col not in ['Cover_Type', 'Id']]
X_train = train[feature]
X_test = test[feature]
c1 = ensemble.ExtraTreesClassifier(n_estimators=150, bootstrap=True)
c2 = ensemble.RandomForestClassifier(n_estimators=150, bootstrap=True)
c3 = XGBClassifier()
# NOTE(review): LinearSVC exposes no predict_proba; use_probas applies to
# the base classifiers, but confirm this meta-model choice is intended.
meta = svm.LinearSVC()
etc = StackingCVClassifier(classifiers=[c1, c2, c3],
                           use_probas=True,
                           meta_classifier=meta)

etc.fit(X_train.values, train['Cover_Type'].values)
sub = pd.DataFrame({
    "Id": test['Id'],
    "Cover_Type": etc.predict(X_test.values)
})
sub.to_csv("stackcv_linearsvc.csv", index=False)
コード例 #19
0
# Voting ensemble: soft voting over the base models with equal weights.
ensemble = VotingClassifier(estimators, voting='soft', weights=[1, 1, 1])
ensemble.fit(X_train, y_train)
pred = ensemble.predict(X_test)
print("predicted values----------:", pred)
pickle.dump(ensemble, open('ensemble-clf.sav', 'wb'))
# pred_op = ensemble.predict(otpt)
# print("Predicted values:" ,pred_op)
print('fscore:{0:.3f}'.format(f1_score(y_test, pred, average='micro')))

#meta classifier ensemble
stack = StackingCVClassifier(classifiers=[mlp, xgb, rf],
                             meta_classifier=lr,
                             use_probas=True)
stack.fit(X_train.values, y_train.values)
pred2 = stack.predict(X_test.values)
print("predicted values: ", pred2)
print('fscore:{0:.3f}'.format(f1_score(y_test, pred2, average='micro')))
from sklearn.metrics import confusion_matrix
# NOTE(review): this matrix uses `pred` (the voting ensemble), not `pred2`
# (the stacker) — confirm which model it is meant to describe.
confusion_lr = confusion_matrix(y_test, pred)
pickle.dump(stack, open('stack-clf.sav', 'wb'))
print(confusion_lr)

####################################################################################################################
# #REPORT AND PLOT MICRO-AVERAGE ROC AUC FOR EACH MODEL
# from sklearn.preprocessing import label_binarize
# import matplotlib.pyplot as plt
# from itertools import cycle
# from sklearn.multiclass import OneVsRestClassifier
# from scipy import interp
# # Binarize the output
コード例 #20
0
ファイル: stacking.py プロジェクト: spareribs/kaggleSpareribs
# x_train = x_train.reset_index(drop=True)
# x_vali = x_vali.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
# y_vali = y_vali.reset_index(drop=True)
"""=====================================================================================================================
2 模型融合;
学习参考:https://blog.csdn.net/LAW_130625/article/details/78573736
"""

# Pull the pre-built estimators out of the clfs registry.
lr_clf = clfs["lr"]  # meta_classifier
svm_clf = clfs["svm_ploy"]
rf_clf = clfs["rf"]
xgb_clf = clfs["xgb"]
lgb_clf = clfs["lgb"]

# NOTE(review): lr_clf is both a base classifier and the meta-classifier —
# confirm that is intended.
sclf = StackingCVClassifier(
    classifiers=[lr_clf, svm_clf, rf_clf, xgb_clf, lgb_clf],
    meta_classifier=lr_clf,
    use_probas=True,
    verbose=3)

sclf.fit(x_train, y_train)

print("测试模型 & 模型参数如下:\n{0}".format(sclf))
print("=" * 20)
# Training-set metrics only — these are optimistic estimates.
pre_train = sclf.predict(x_train)
print("训练集正确率: {0:.4f}".format(sclf.score(x_train, y_train)))
print("训练集f1分数: {0:.4f}".format(f1_score(y_train, pre_train)))
# NOTE(review): AUC is computed from hard labels, not predict_proba — verify.
print("训练集auc分数: {0:.4f}".format(roc_auc_score(y_train, pre_train)))
コード例 #21
0
# In[ ]:

# NOTE(review): clf1 appears twice in the classifier list — possibly a
# typo for another clf; confirm against the cells defining clf1..clf3.
stacker = SCVC(classifiers=[clf1, clf2, clf3, clf1],
               meta_classifier=meta_clf,
               use_probas=True,
               use_features_in_secondary=True)

# In[ ]:

# Impute missing values with the TRAIN median for both splits, avoiding
# leakage of test statistics.
for c in train.columns:
    train[c] = train[c].fillna(train[c].median())
    test[c] = test[c].fillna(train[c].median())
stacker.fit(train.values, np.array(Y))

# In[ ]:

my_prediction = stacker.predict(test.values)

# In[ ]:

# PassengerId,Survived
submission = pd.DataFrame()
submission['PassengerId'] = test.index.tolist()
submission['Survived'] = my_prediction

# In[ ]:

submission.to_csv("submission.csv", index=False)

# In[ ]:
コード例 #22
0
                                              np.around(score, 3)))

print('> Fitting stack')

# 5-fold stacking over five base models; the RF meta-classifier sees both
# the predicted probabilities and the original features.
stack = StackingCVClassifier(
    classifiers=[ab_clf, rf_clf, xgb_clf, et_clf, lg_clf],
    meta_classifier=rf_clf,
    cv=5,
    stratify=True,
    shuffle=True,
    use_probas=True,
    use_features_in_secondary=True,
    verbose=1,
    random_state=12345,
    n_jobs=-1)

stack = stack.fit(X_train, y_train)

X_test = np.array(X_test)
print('> Making predictions')
pred = stack.predict(X_test)
# BUG FIX: classification_report expects (y_true, y_pred); the arguments
# were swapped, which transposes precision and recall in the report.
print(classification_report(y_test, pred, labels=None))

#predictions = pd.Series(pred, index=X_test.index, dtype=y_train.dtype)

# ======================================================================== #
# Drop zero-variance features (threshold=0 keeps anything that varies).
sel = VarianceThreshold(threshold=0)
df_train_new = sel.fit_transform(df_train)
#sel.get_support(df_train)
sel.get_support(indices=True)
コード例 #23
0
# Soft-voting ensemble, used below as the stacking meta-classifier.
eclf = VotingClassifier(estimators=[('rf',rf),('lr',lr),('gb',gb)],voting='soft',
                        weights=[3,2,3])

# Building and running the StackingClassifier on the test data
from mlxtend.classifier import StackingCVClassifier
sclf=StackingCVClassifier(classifiers=[rf,lr,gb,et,gnb,svc,knn,xgb,ada,mlp,lda,qda],
                          use_features_in_secondary=True,
                          use_probas=True,
                        meta_classifier=eclf)
# Cross-validated metrics; each call refits the whole stack five times.
cmetrics=[]
cmetrics.append(cross_val_score(sclf,X.values,y.values,cv=5,scoring='accuracy').mean())
cmetrics.append(cross_val_score(sclf,X.values,y.values,cv=5,scoring='precision').mean())
cmetrics.append(cross_val_score(sclf,X.values,y.values,cv=5,scoring='recall').mean())
cmetrics.append(cross_val_score(sclf,X.values,y.values,cv=5,scoring='roc_auc').mean())
# Final fit on all data, then predict the held-out Xt.
sclf.fit(X.values,y.values)
pred=sclf.predict(Xt.values)

# plotting ROC-Curve
# [:,1] selects the positive-class probability.
pred_proba=sclf.predict_proba(Xt.values)[:,1]
fpr, tpr, threshold = roc_curve(yt, pred_proba)
roc_auc=auc(fpr,tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('ROC_curve_test.png',bbox_inches='tight')
plt.clf()
コード例 #24
0
# Build and fit the stacking ensemble.
from mlxtend.classifier import StackingCVClassifier

base_models = [ab_clf, et_clf, lg_clf, bag_clf, rf_clf]
stack = StackingCVClassifier(classifiers=base_models,
                             meta_classifier=rf_clf,
                             cv=10,
                             stratify=True,
                             shuffle=True,
                             use_probas=True,
                             use_features_in_secondary=True,
                             verbose=0,
                             random_state=randomstate)
stack = stack.fit(x, y)

print("Completed modeling!")

# Predict the submission set, keeping its index and the label dtype.
y_predict = pd.Series(stack.predict(x_predict),
                      index=x_predict.index,
                      dtype=y.dtype)

print("Completed predictions!")

# Write the submission file.
output = pd.DataFrame({'Id': Ids,
                       'Cover_Type': y_predict})
output.to_csv('submission.csv', index=False)

# Render a download link when running inside a notebook.
from IPython.display import FileLink
FileLink(r'submission.csv')