Example #1
0
def stacking(_stacking_model_list, _final_clf, _metric, X_train, X_val,
             y_train, y_val, X_test, _cv):
    """Cross-validate a stacked ensemble, plot its ROC curve and predict.

    Args:
        _stacking_model_list: base classifiers handed to StackingCVClassifier.
        _final_clf: classifier used as the meta classifier of the stack.
        _metric: scoring argument forwarded to ``cross_val_score``.
        X_train, y_train: data used for cross-validation and the final fit.
        X_val, y_val: data used for the ROC plot.
        X_test: data to produce the returned predictions for.
        _cv: ``cv`` argument forwarded to ``cross_val_score``.

    Returns:
        Column 1 of ``predict_proba(X_test)`` from the fitted stack.

    Side effects:
        Prints the cross-validation scores, saves the ROC figure under
        ``Result/`` and shows it.
    """
    # Might wanna consider remove _final_clf from the _stacking_model_list
    sclf = StackingCVClassifier(classifiers=_stacking_model_list,
                                use_probas=True,
                                meta_classifier=_final_clf,
                                random_state=42)
    scores = model_selection.cross_val_score(sclf,
                                             X_train,
                                             y_train,
                                             cv=_cv,
                                             scoring=_metric)
    print('Cross-validated score:', scores)
    print('-' * 20)

    # Assuming model_selection is sklearn.model_selection: cross_val_score
    # fits clones of the estimator, so ``sclf`` itself is still unfitted at
    # this point. Fit it explicitly before asking it for probabilities.
    sclf.fit(X_train, y_train)

    y_probas = sclf.predict_proba(X_val)
    skplt.metrics.plot_roc_curve(y_val, y_probas)
    plt.savefig(r'Result/' + sclf.__class__.__name__ + 'stacking.png')
    plt.show()

    return sclf.predict_proba(X_test)[:, 1]
Example #2
0
# Building and running the StackingClassifier on the test data
from mlxtend.classifier import StackingCVClassifier

# Base learners feed both their probabilities and the raw features to the
# meta classifier (eclf).
base_learners = [rf, lr, gb, et, gnb, svc, knn, xgb, ada, mlp, lda, qda]
sclf = StackingCVClassifier(
    classifiers=base_learners,
    use_features_in_secondary=True,
    use_probas=True,
    meta_classifier=eclf,
)

# Mean 5-fold score per metric, in the order accuracy, precision, recall,
# roc_auc.
cmetrics = [
    cross_val_score(sclf, X.values, y.values, cv=5,
                    scoring=metric_name).mean()
    for metric_name in ('accuracy', 'precision', 'recall', 'roc_auc')
]

# Final fit on all of X/y, then hard predictions for the test features.
sclf.fit(X.values, y.values)
pred = sclf.predict(Xt.values)

# plotting ROC-Curve
pred_proba = sclf.predict_proba(Xt.values)[:, 1]
fpr, tpr, threshold = roc_curve(yt, pred_proba)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('ROC_curve_test.png', bbox_inches='tight')
plt.clf()

perf = pd.read_csv('performance_estimates.csv')
metrics = [accuracy_score(yt, pred)]
Example #3
0
# Average the stack's test-set probabilities over shuffled K-fold refits and
# report the mean validation AUC across the folds.
num_folds = 6
folds = KFold(n_splits=num_folds, shuffle=True)

test_result = np.zeros(len(test))
auc_score = 0

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, targets)):
    print("Fold: ", fold_ + 1)

    X_train, y_train = train.iloc[trn_idx], targets.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx], targets.iloc[val_idx]

    sclf.fit(X_train.values, y_train.values)

    # The stack is fitted on plain arrays, so predict on plain arrays too
    # rather than mixing ndarray fitting with DataFrame prediction.
    y_pred = sclf.predict_proba(X_valid.values)
    auc = roc_auc_score(y_valid, y_pred[:, 1])
    print(auc)
    auc_score += auc

    preds = sclf.predict_proba(test.values)
    test_result += preds[:, 1]

# print the average AUC across the folds and compute the final results on the test data
auc_score = auc_score / folds.n_splits
print("AUC score: ", auc_score)
test_result = test_result / folds.n_splits

# create the submission
submission = pd.DataFrame({
  'Id' : test['Id'],
class StackingDemo(object):
    """Stacking walkthrough on the first 100 iris samples.

    Call order: ``data_prepare`` -> ``function_set`` ->
    ``goodness_of_function`` (grid search) and/or ``pick_the_best_function``
    (fixed hyper-parameters, prints the hold-out ROC AUC).
    """

    def __init__(self):
        # data prepare
        self.__iris = None
        self.__X = None
        self.__y = None
        self.__train = None
        self.__train_label = None
        self.__test = None
        self.__test_label = None

        # function set
        self.__params = None

        self.__lr = None
        self.__gb = None
        self.__rf = None
        self.__sclf = None
        self.__grid = None

    def __build_stack(self):
        """Return a stack over the current lr/gb/rf with lr as meta model."""
        return StackingCVClassifier(
            classifiers=[self.__lr, self.__gb, self.__rf],
            meta_classifier=self.__lr,
            use_probas=True,
            cv=5,
            use_features_in_secondary=True,
            verbose=1)

    def data_prepare(self):
        """Load iris, keep rows 0..99 and make a shuffled 80/20 split."""
        self.__iris = load_iris()
        self.__X = self.__iris.data[0:100]
        self.__y = self.__iris.target[0:100]

        split = train_test_split(self.__X,
                                 self.__y,
                                 test_size=0.2,
                                 shuffle=True)
        train_x, test_x, train_y, test_y = split
        self.__train, self.__test = train_x, test_x
        self.__train_label, self.__test_label = train_y, test_y

    def function_set(self):
        """Create the search grid, the base models, the stack and the search."""
        # param
        # Note: the parameter names must be spelled exactly like this.
        self.__params = {
            "logisticregression__C":
            list(np.linspace(start=0.1, stop=10, num=5)),
            "gradientboostingclassifier__learning_rate":
            list(np.linspace(start=0.1, stop=1, num=10)),
            "randomforestclassifier__n_estimators":
            list(range(5, 16)),
            "meta-logisticregression__C":
            list(np.linspace(start=0.1, stop=10, num=5)),
        }

        # model
        self.__lr = LogisticRegression()
        self.__gb = GradientBoostingClassifier()
        self.__rf = RandomForestClassifier()
        self.__sclf = self.__build_stack()

        self.__grid = GridSearchCV(estimator=self.__sclf,
                                   param_grid=self.__params,
                                   cv=5,
                                   refit=True)

    def goodness_of_function(self):
        """Run the grid search on the training split and print the winner."""
        self.__grid.fit(self.__train, self.__train_label)
        print("Best parameters: %s" % self.__grid.best_params_)
        print("Accuracy: %.2f" % self.__grid.best_score_)

    def pick_the_best_function(self):
        """Refit the stack with fixed settings and print the test ROC AUC."""
        self.__lr = LogisticRegression(C=0.1)
        self.__gb = GradientBoostingClassifier(learning_rate=0.1)
        self.__rf = RandomForestClassifier(n_estimators=5)
        self.__sclf = self.__build_stack()
        self.__sclf.fit(self.__train, self.__train_label)

        positive_proba = self.__sclf.predict_proba(self.__test)[:, 1]
        print(roc_auc_score(self.__test_label, positive_proba))
class StackingBaseline(object):
    """Stacked-classifier baseline built from three CSV files in ``path``.

    Reads ``application_train.csv``, ``application_test.csv`` and
    ``sample_submission.csv``. Call order: ``data_prepare`` ->
    ``numeric_handle`` -> ``categorical_handle`` -> ``model_fit`` ->
    ``model_predict``.
    """

    # Destination the original code always wrote to; kept as the default.
    _DEFAULT_SUBMISSION_PATH = (
        '/Users/David/Desktop/0.Home default risk/submission/stack_baseline')

    def __init__(self, *, path):
        """Store the data directory; nothing is loaded until data_prepare()."""
        self.__path = path
        self.__application_train = None
        self.__application_test = None
        self.__sample_submission = None

        # data prepare
        self.__application_train_feature = None
        self.__application_train_label = None
        self.__application_test_feature = None

        self.__categorical_columns = None
        self.__numeric_columns = None

        # numeric handle
        # categorical handle
        self.__encoder = None

        # model fit
        self.__lr = None
        self.__ef = None
        self.__rf = None
        self.__gb = None
        self.__xgb = None
        self.__sclf = None

    def data_prepare(self):
        """Load the CSVs, drop the id column and split features from label."""
        self.__application_train = pd.read_csv(
            os.path.join(self.__path, "application_train.csv"))
        self.__application_test = pd.read_csv(
            os.path.join(self.__path, "application_test.csv"))
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__path, "sample_submission.csv"))

        self.__application_train = self.__application_train.drop("SK_ID_CURR",
                                                                 axis=1)
        self.__application_test = self.__application_test.drop("SK_ID_CURR",
                                                               axis=1)

        # Work on independent copies: the handlers below assign into these
        # frames, which must not be a slice of (or alias for) the raw data.
        feature_names = [
            i for i in self.__application_train.columns if i != "TARGET"
        ]
        self.__application_train_feature = (
            self.__application_train[feature_names].copy())
        self.__application_train_label = self.__application_train["TARGET"]
        self.__application_test_feature = self.__application_test.copy()

        self.__categorical_columns = self.__application_train_feature.select_dtypes(
            include=["object"]).columns.tolist()
        self.__numeric_columns = [
            i for i in self.__application_train_feature.columns
            if i not in self.__categorical_columns
        ]

    def numeric_handle(self):
        """Replace missing numeric values with the sentinel -999.0."""
        # Same sentinel that model_fit passes to XGBClassifier as ``missing``.
        for frame in (self.__application_train_feature,
                      self.__application_test_feature):
            frame[self.__numeric_columns] = frame[
                self.__numeric_columns].fillna(-999.0)

    def categorical_handle(self):
        """Fill missing categories and encode categorical columns."""
        # Fill train AND test so a missing value is the same "missing"
        # category on both sides (the test frame used to be left unfilled).
        for frame in (self.__application_train_feature,
                      self.__application_test_feature):
            frame[self.__categorical_columns] = frame[
                self.__categorical_columns].fillna("missing")

        self.__encoder = LeaveOneOutEncoder()
        self.__encoder.fit(
            self.__application_train_feature[self.__categorical_columns],
            self.__application_train_label)
        # NOTE(review): assuming this is category_encoders' LeaveOneOutEncoder,
        # transform() without y encodes the training rows the same way as
        # unseen data - confirm whether transform(X, y) was intended here.
        self.__application_train_feature[
            self.__categorical_columns] = self.__encoder.transform(
                self.__application_train_feature[self.__categorical_columns])
        self.__application_test_feature[
            self.__categorical_columns] = self.__encoder.transform(
                self.__application_test_feature[self.__categorical_columns])

    def model_fit(self):
        """Fit a 3-fold stack of four tree models under logistic regression."""
        self.__ef = ExtraTreesClassifier(n_jobs=-1)
        self.__rf = RandomForestClassifier(n_jobs=-1)
        self.__lr = LogisticRegression()
        self.__gb = GradientBoostingClassifier()
        self.__xgb = XGBClassifier(n_jobs=-1, missing=-999.0)
        self.__sclf = StackingCVClassifier(
            classifiers=[self.__ef, self.__rf, self.__gb, self.__xgb],
            meta_classifier=self.__lr,
            use_probas=True,
            cv=3)
        self.__sclf.fit(self.__application_train_feature.values,
                        self.__application_train_label.values)

    def model_predict(self, output_path=None):
        """Write the column-1 probabilities into the submission and save it.

        Args:
            output_path: CSV destination. ``None`` (default) keeps the
                original hard-coded location.
        """
        destination = (self._DEFAULT_SUBMISSION_PATH
                       if output_path is None else output_path)
        self.__sample_submission["TARGET"] = np.clip(
            self.__sclf.predict_proba(
                self.__application_test_feature.values)[:, 1], 0, 1)
        self.__sample_submission.to_csv(destination, index=False)
Example #6
0
# sclf = StackingCVClassifier(classifiers=[xgb, xtrees, rf, gb], meta_classifier=lr, use_probas=True, cv=5, verbose=2)
# sclf.fit(X_train.values, y_train.values)
# sclf_y_pred_proba = sclf.predict_proba(X_test.values)[:,1]
# gini_norm(y_test, sclf_y_pred_proba)
# # 0.2777: not much better than cv 3...

# Try out some more regularization for Logit

# Same four base models as before, now under a more strongly regularized
# logistic-regression meta model (C=0.1), 3-fold stacking.
base_models = [xgb, xtrees, rf, gb]
regularized_logit = LogisticRegression(C=0.1)
sclf = StackingCVClassifier(
    classifiers=base_models,
    meta_classifier=regularized_logit,
    use_probas=True,
    cv=3,
    verbose=2,
)
sclf.fit(X_train.values, y_train.values)
sclf_y_pred_proba = sclf.predict_proba(X_test.values)[:, 1]
gini_norm(y_test, sclf_y_pred_proba)
# 0.2710, 0.2718

# Bare expressions: inspect the fitted meta classifier (values are only
# displayed when run interactively).
sclf.meta_clf_.coef_

sclf.meta_clf_.intercept_

sclf.meta_clf_.n_iter_

# __max_iter=300
# class_weight='balanced'
# penalty='l1'__

# sclf = StackingCVClassifier(classifiers=[xgb, xtrees, rf, gb], meta_classifier=LogisticRegression(max_iter=300, class_weight='balanced'), use_probas=True, cv=3, verbose=2)
# sclf.fit(X_train.values, y_train.values)
    1,
    0,
    1,
    1,
    0,
    1
])
# Meta level: isotonic-calibrated EnsembleRegression on top of the pipelines.
metaClassifier = CalibratedClassifierCV(EnsembleRegression(
    x0, list(clfs.keys()), le.classes_),
                                        method='isotonic',
                                        cv=META_FOLDS)
# use_clones=False: the pipeline objects themselves are fitted, not copies.
sclf = StackingCVClassifier(classifiers=pipes,
                            meta_classifier=metaClassifier,
                            use_clones=False,
                            use_probas=True,
                            cv=FOLDS,
                            verbose=1)
sclf.fit(data.values, labelsEncoded, groups=None, **weightsPerClassifier)
print('StackingCV classifier is fitted in ' + str(datetime.now() - start))

# Predict the submission file and time it.
start = datetime.now()
test = pd.read_csv('data/test_users_norm.csv').fillna(NA_CONST)
Xid = test.pop('id')
saveResult(Xid, sclf.predict_proba(test.values), le, 'predict/stacking.csv')
print('Submission predict/stacking.csv is predicted in ' +
      str(datetime.now() - start))

# This score is computed on the data the stack was fitted on, so it is
# reported as a train-set score (it was previously mislabelled "Test set").
trainPredicted = sclf.predict_proba(data.values)
print('Train set nDCG5 score: ' + str(nDCG5(labelsEncoded, trainPredicted)))
print('Total time: ' + str(datetime.now() - totalStart))
                     n_jobs=8)
svc = SVC(kernel='rbf', random_state=2018, probability=True, gamma='auto')
lr = LogisticRegression(max_iter=1000, solver='lbfgs', penalty='l2', n_jobs=8)
models = [rf, xgb, lgb, svc]

# Score the StackingModels helper on the test split.
y_pred_self, y_prob_self = StackingModels(
    models=models,
    meta_model=lr,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
)
acc = accuracy_score(y_test, y_pred_self)
auc = roc_auc_score(y_test, y_prob_self)
print('MyModel:  ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc))

# Score StackingCVClassifier with the same base models and meta model.
stack_clf = StackingCVClassifier(classifiers=models,
                                 meta_classifier=lr,
                                 cv=5)
stack_clf = stack_clf.fit(X_train, y_train)
y_pred_mxltend = stack_clf.predict(X_test)
y_prob_mxltend = stack_clf.predict_proba(X_test)[:, -1]
acc = accuracy_score(y_test, y_pred_mxltend)
auc = roc_auc_score(y_test, y_prob_mxltend)
print('Mlxtend:  ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc))

# Synthetic regression problem with a 75/25 train/test split.
X, y = make_regression(n_samples=5000,
                       n_features=20,
                       n_informative=18,
                       random_state=2018)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=2018)
# Fit the scaler on the training split only and reuse it for the test split.
# Mapping fit_transform over both refitted the scaler on the test data, so the
# two splits were scaled with different parameters.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rf = RandomForestRegressor(n_estimators=50,
Example #9
0
    ExtraTreesClassifier(n_estimators=1000, max_depth=2, n_jobs=8),
    ExtraTreesClassifier(n_estimators=1000, max_depth=4, n_jobs=8),
    ExtraTreesClassifier(n_estimators=1000, max_depth=10, n_jobs=8),
    ExtraTreesClassifier(n_estimators=1000, max_depth=30, n_jobs=8),
]
# Meta classifier for the stack (an ExtraTrees model despite the name).
lr = ExtraTreesClassifier(n_estimators=1000, max_depth=30, n_jobs=8)

model = StackingCVClassifier(
    classifiers=clfList,
    use_probas=True,
    use_features_in_secondary=True,
    meta_classifier=lr,
    cv=20,
    random_state=15,
    verbose=1,
)

model.fit(x_train, y_train)
#y_pred = sclf.predict(x_test)
#score(y_pred, y_test)

#model = load("../models/catboost_model.pkl")
# Report scores on the validation split first, then on the test split.
for eval_features, eval_labels in ((x_val, y_val), (x_test, y_test)):
    y_pred = model.predict(eval_features)
    scores = get_all_scores(y_pred, eval_labels)
    print(scores)
probas_test = model.predict_proba(x_test)
save(model, "../models/ex_stack.pkl")

print("done")
                     validation_fraction=0.1,
                     verbose=False,
                     warm_start=False)

# Meta classifier: bagged L1-regularized logistic regression.
# solver='liblinear' is spelled out because the L1 penalty needs a solver
# that supports it; scikit-learn releases whose default solver is lbfgs
# reject penalty='l1' otherwise.
lr = BaggingClassifier(LogisticRegression(random_state=RANDOM_SEED,
                                          penalty='l1',
                                          solver='liblinear',
                                          C=0.1),
                       max_samples=0.8,
                       max_features=0.8)

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf5, clf7, clf8],
                            use_probas=True,
                            meta_classifier=lr)

sclf.fit(train_cm_x.values, train_cm_y.values)
# Keep the positive-class column for every row. The previous ``[:-1]`` slice
# dropped the last row of the probability matrix instead of picking a column.
predict_y = sclf.predict_proba(test_cm.values)[:, 1]

df = pd.DataFrame(predict_y)
df.to_csv("predicted_y.csv")
print('5-fold cross validation:\n')

# Cross-validated ROC AUC for every base model and for the stack itself.
for clf, label in zip([clf1, clf2, clf3, clf5, clf7, clf8, sclf], [
        'KNN', 'Extreme gradient boosting', 'bagging Logistic Regression',
        'Linear SVC', 'Extra Tree', 'Neural Network', 'StackingClassifier'
]):

    scores = cross_val_score(clf,
                             train_cm_x.values,
                             train_cm_y.values,
                             cv=5,
                             scoring='roc_auc')
def stacking_classifier(
    train,
    validation,
    refit='yes',
    use_saved_model='no',
    save_model='yes',
    to_plot='yes',
    meta_leaner_parameters=None,
    stacking_cv_parameters=None):
    """Build (or reload) a stacked click model and score it on validation.

    Features are every column of ``train`` / ``validation`` except
    ``click``, ``bidprice`` and ``payprice``; the label is ``click``.

    Args:
        train: frame used to fit the stack.
        validation: frame used for the AUC and the returned predictions.
        refit: ``'yes'`` to refit a reloaded stack on ``train``; only read
            when ``use_saved_model`` is not ``'no'``.
        use_saved_model: ``'no'`` builds a new stack from the saved rf, erf
            and xgb models; anything else loads ``models/stacked_model.pkl``.
        save_model: ``'yes'`` dumps the resulting stack to
            ``models/stacked_model.pkl``.
        to_plot: ``'yes'`` calls ``plot_ROC_curve`` on the validation result.
        meta_leaner_parameters: overrides for the XGBoost meta learner;
            missing keys fall back to the built-in defaults.
        stacking_cv_parameters: overrides for the StackingCVClassifier
            options; missing keys fall back to the built-in defaults. The
            ``'refit'`` key is accepted but not forwarded to the stack.

    Returns:
        ``(model, proba)`` where ``proba`` is column 1 of
        ``model.predict_proba`` on the validation features.
    """
    # Defaults live inside the function so no mutable object is shared
    # between calls through the signature.
    meta_params = {
        'max_depth': 20,
        "n_estimators": 20,
        "learning_rate": 0.05,
        'silent': False,
        'n_jobs': 3,
        'subsample': 1,
        'objective': 'binary:logistic',
        'colsample_bytree': 1,
        'eval_metric': "auc",
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': 500
    }
    # Only these keys are handed to the meta learner.
    meta_keys = tuple(meta_params)
    meta_params.update(meta_leaner_parameters or {})

    stacking_params = {
        'use_probas': False,
        'use_features_in_secondary': True,
        'cv': 5,
        'store_train_meta_features': True,
        'refit': True
    }
    stacking_params.update(stacking_cv_parameters or {})

    models_dir = os.path.join(os.getcwd(), "models")
    stacked_model_path = os.path.join(models_dir, "stacked_model.pkl")
    non_feature_columns = ['click', 'bidprice', 'payprice']

    def _features(frame):
        # Feature matrix: everything except the label and the price columns.
        return frame.drop(non_feature_columns, axis=1).values

    validation_features = _features(validation)

    # NOTE: joblib.load unpickles arbitrary objects; only point this at
    # model files you trust.
    if use_saved_model == 'no':

        # Only the base models that actually enter the stack are loaded.
        rf_model = joblib.load(os.path.join(models_dir, "rf_model.pkl"))
        erf_model = joblib.load(os.path.join(models_dir, "erf_model.pkl"))
        xgb_model = joblib.load(os.path.join(models_dir, "xgb_model.pkl"))

        meta_learner = xgboost.XGBClassifier(
            **{key: meta_params[key] for key in meta_keys})

        model = StackingCVClassifier(
            classifiers=[rf_model, erf_model, xgb_model],
            meta_classifier=meta_learner,
            use_probas=stacking_params['use_probas'],
            use_features_in_secondary=stacking_params[
                'use_features_in_secondary'],
            store_train_meta_features=stacking_params[
                'store_train_meta_features'],
            cv=stacking_params['cv'])

        model = model.fit(_features(train), train['click'].values)

    else:

        # Load from saved files
        model = joblib.load(stacked_model_path)

        if refit == 'yes':
            model = model.fit(_features(train), train['click'].values)

    prediction = model.predict_proba(validation_features)

    # Whether to save the model
    if save_model == 'yes':

        print('Saving the stacked model to the disc.')
        joblib.dump(model, stacked_model_path, compress=9)

    # Print scores
    print("AUC: %0.5f for Stacking Model" %
          (roc_auc_score(validation['click'], prediction[:, 1])))

    if to_plot == 'yes':

        plot_ROC_curve(validation['click'], prediction[:, 1])

    return model, prediction[:, 1]


####################### END ########################
def main():
    """Stack per-experiment predictions and write a test-set submission.

    Loads holdout and test prediction CSVs for the enabled experiments,
    standardizes the features, appends a one-hot quality encoding, fits a
    StackingCVClassifier on the holdout data and writes the stack's
    column-1 test probabilities next to this file.

    The printed scores are computed on the same holdout data the stack was
    fitted on (no separate validation split exists here), so they are
    in-sample and optimistic. The stack's score is the one used in the
    output file name.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    # Preprocessing switches (previously literal ``if True`` / ``if False``).
    use_standard_scaler = True
    use_pca = False
    append_quality = True

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    fnames_for_checksum = [name + "cauc" for name in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]

    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    if use_standard_scaler:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if use_pca:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    auc_cv = []

    classifier1 = LGBMClassifier()
    classifier2 = CatBoostClassifier()
    classifier3 = LogisticRegression()
    classifier4 = CalibratedClassifierCV()
    classifier5 = LinearDiscriminantAnalysis()

    sclf = StackingCVClassifier(
        classifiers=[
            classifier1, classifier2, classifier3, classifier4, classifier5
        ],
        shuffle=False,
        use_probas=True,
        cv=4,
        # meta_classifier=SVC(degree=2, probability=True),
        meta_classifier=LogisticRegression(solver="lbfgs"),
    )

    # NOTE(review): with an integer ``cv`` the groups are presumably ignored
    # by the splitter; pass a group-aware splitter (e.g. GroupKFold) as
    # ``cv`` if image-level grouping is required - confirm intent.
    sclf.fit(x, y, groups=image_ids)

    # The stack fits clones by default, so classifier1..5 themselves stay
    # unfitted; the fitted copies are exposed in order as ``sclf.clfs_``.
    base_names = [
        "LGBMClassifier",
        "CatBoostClassifier",
        "LogisticRegression",
        "CalibratedClassifierCV",
        "LinearDiscriminantAnalysis",
    ]
    classifiers = dict(zip(base_names, sclf.clfs_))
    classifiers["Stack"] = sclf

    # Get results (in-sample: scored on the holdout data used for fitting;
    # the previous code referenced undefined x_valid / y_valid).
    for key, fitted in classifiers.items():
        y_pred = fitted.predict_proba(x)[:, 1]
        score = alaska_weighted_auc(y, y_pred)
        print(key, score)
        if key == "Stack":
            auc_cv.append(score)

    # Making prediction on test set
    y_test = sclf.predict_proba(x_test)[:, 1]

    df["Label"] = y_test
    df.to_csv(os.path.join(output_dir,
                           f"stacking_{np.mean(auc_cv):.4f}_{checksum}.csv"),
              index=False)
Example #13
0
#Method 2
# Three tree-based base learners under a logistic-regression meta classifier.
xgb_settings = dict(learning_rate=0.5,
                    n_estimators=300,
                    max_depth=5,
                    gamma=0,
                    subsample=0.8,
                    verbose=1)
clf1 = XGBClassifier(**xgb_settings)
#clf1 = XGBClassifier(learning_rate =0.5,n_estimators=300,max_depth=5,gamma=0,subsample=0.8)
clf2 = RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion="entropy")
clf3 = ExtraTreesClassifier(n_jobs=-1, n_estimators=5, criterion="entropy")
lr = LogisticRegression(n_jobs=-1, C=8)
sclf = StackingClassifier(
    classifiers=[clf1, clf2, clf3],
    meta_classifier=lr,
    verbose=100,
)

# for clf, label in zip([clf1, clf2, clf3, sclf],
#                       ['XGBoost',
#                        'Random Forest',
#                        'Extra Tree',
#                        'StackingClassifier']):
#     scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
#     print("Accuracy: %0.2f (+/- %0.2f) [%s]"
#           % (scores.mean(), scores.std(), label))

sclf.fit(X_train, y_train)
print("training finished")
#y_pre = sclf.predict(X_test)
# Score the column-1 probabilities on the test split with ROC AUC.
y_pre = sclf.predict_proba(X_test)[:, 1]
test_roc = roc_auc_score(y_test, y_pre)
print("roc:{0:.3f}".format(test_roc))