Example #1
def stacking3Model(model1, model2, metamodel, xtr, ytr, xts, yts):
    model = StackingClassifier(classifiers=[model1, model2], meta_classifier=metamodel)
    train, test = scaling(xtr, xts, MaxAbsScaler())
    model.fit(train, ytr)
    predict = model.predict(test)
    acc = accuracy_score(yts, predict)
    return acc, predict
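# The `scaling` helper used above is project-local and not shown on this
# page. A minimal sketch, assuming it fits the given scaler on the training
# split only and applies it to both splits:
def scaling(xtr, xts, scaler):
    train = scaler.fit_transform(xtr)
    test = scaler.transform(xts)
    return train, test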
def classifier_stacking(data_file, alertgroup_name, classifier_list):
    classifiers = {'KNN': KNeighborsClassifier(),
                   # n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=1
                   # 'LR': LogisticRegression(),
                   'RF': RandomForestClassifier(),
                   # n_estimators=60, max_depth=13, min_samples_split=120, min_samples_leaf=20, random_state=10
                   'DT': tree.DecisionTreeClassifier(),
                   # criterion='gini', splitter='random', max_features=None, max_depth=13, min_samples_leaf=2
                   'GBDT': GradientBoostingClassifier()
                   # loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, max_depth=3, verbose=0, presort='auto'
                   # 'XGB': xgboost_classifier
                   }
    all_data = pd.read_csv(data_file, sep=',', dtype=str)
    for alertgroup, group in all_data.groupby('alertgroup'):
        if alertgroup == alertgroup_name:
            train_x, test_x, train_y, test_y = get_data(group, split=True)
            arr_x = train_x.values
            arr_y = train_y.values
            max_fs = 0
            best_model = None
            stratified_folder = StratifiedKFold(n_splits=3, shuffle=False)

            # pick the stacking model with the best F1 score across the CV
            # folds; fold-local names keep the held-out test split intact
            for train_index, val_index in stratified_folder.split(arr_x, arr_y):
                fold_train_x = arr_x[train_index]
                fold_train_y = arr_y[train_index]
                fold_val_x = arr_x[val_index]
                fold_val_y = arr_y[val_index]
                classifiers_list = [classifiers[cl] for cl in classifier_list]
                stack_model = StackingClassifier(classifiers=classifiers_list, use_probas=True,
                                                 average_probas=True, meta_classifier=classifiers['RF'])

                stack_model.fit(fold_train_x, fold_train_y)
                predict = stack_model.predict(fold_val_x)
                fbetascore = fbeta_score(fold_val_y, predict, beta=1)
                print('f1 score: ' + str(fbetascore))
                if fbetascore > max_fs:
                    max_fs = fbetascore
                    best_model = stack_model

            # evaluate the best fold's model on the held-out test split
            # (the original accidentally reused the last CV fold here)
            predict = best_model.predict(test_x)
            precision = metrics.precision_score(test_y, predict)
            recall = metrics.recall_score(test_y, predict)
            fbetascore = fbeta_score(test_y, predict, beta=0.5)
            accuracy = metrics.accuracy_score(test_y, predict)
            print('final performance:')
            print(alertgroup_name)
            print('precision: %.6f%%' % (100 * precision))
            print('recall: %.6f%%' % (100 * recall))
            print('f0.5 score: %.6f%%' % (100 * fbetascore))
            print('accuracy: %.6f%%' % (100 * accuracy))

            return best_model
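classifier_stacking leans on a project-local `get_data` helper that does not appear on this page. A minimal sketch, under the assumption that it splits one alert group's frame into train/test features and a binary label column (the column names are guesses):

# Hypothetical get_data: split a group's frame into train/test features/labels.
from sklearn.model_selection import train_test_split

def get_data(group, split=True):
    y = group.pop('label')                    # assumed label column name
    X = group.drop(columns=['alertgroup'])    # drop the grouping key
    if split:
        return train_test_split(X, y, test_size=0.3, random_state=0)
    return X, y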
Example #3
def model_processing(X_train, X_test, y_train, y_test):
    log_reg = LogisticRegression(C=0.01, penalty='l2')
    svc = SVC(C=0.7, kernel='linear')
    tree_clf = DecisionTreeClassifier(criterion='entropy',
                                      max_depth=3,
                                      min_samples_leaf=5)
    rf_clf = RandomForestClassifier(n_estimators=70,
                                    criterion='entropy',
                                    max_features='auto',
                                    min_samples_leaf=6)
    xgb = XGBClassifier(gamma=0.3,
                        max_depth=4,
                        min_child_weight=8,
                        reg_alpha=0.05)

    sclf = StackingClassifier(classifiers=[log_reg, svc, tree_clf, rf_clf],
                              meta_classifier=xgb)
    sclf.fit(X_train, y_train)
    y_pred_train = sclf.predict(X_train)
    y_pred = sclf.predict(X_test)

    print('*' * 30, 'scores on the training set')

    accuracy = accuracy_score(y_train, y_pred_train)
    precision = precision_score(y_train, y_pred_train)
    f1 = f1_score(y_train, y_pred_train)
    recall = recall_score(y_train, y_pred_train)
    auc = roc_auc_score(y_train, y_pred_train)
    model_name = 'stacking model (train)'

    print('{} accuracy: {:.2f}'.format(model_name, accuracy))
    print('{} precision: {:.2f}'.format(model_name, precision))
    print('{} F1 score: {:.2f}'.format(model_name, f1))
    print('{} recall: {:.2f}'.format(model_name, recall))
    print('{} AUC score: {:.2f}'.format(model_name, auc))

    print('*' * 30, 'scores on the test set')

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    model_name = 'stacking model (test)'

    print('{} accuracy: {:.2f}'.format(model_name, accuracy))
    print('{} precision: {:.2f}'.format(model_name, precision))
    print('{} F1 score: {:.2f}'.format(model_name, f1))
    print('{} recall: {:.2f}'.format(model_name, recall))
    print('{} AUC score: {:.2f}'.format(model_name, auc))
Example #4
def model_stack2():
    _, test_df, train_label = data_process.get_person_data()
    train_data, test_data = data_process.get_scale_data()
    X_train, X_val, y_train, y_val = train_test_split(train_data,
                                                      train_label,
                                                      test_size=0.2,
                                                      random_state=66)
    id_list = list(test_df.pop('ID'))
    model1 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:linear')
    model2 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:gamma')
    model3 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:tweedie')
    model4 = svm.SVR()
    # all four base models are regressors, so use mlxtend's StackingRegressor
    # (from mlxtend.regressor); StackingClassifier would treat the continuous
    # targets as discrete class labels
    stack_model = StackingRegressor(
        regressors=[model1, model2, model3, model4], meta_regressor=model3)
    stack_model.fit(train_data, train_label)
    yHat = stack_model.predict(test_data)
    result = pd.DataFrame({'id': id_list, 'yhat': yHat})
    result.to_csv('result/result6.csv',
                  index=False,
                  header=None,
                  encoding='utf-8')
Example #5
class ClassifierBlender:
    def __init__(self, x_train, x_test, y_train, y_test=None):
        x_train.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        x_test.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['y'].values
        if y_test is not None:
            self.y_test = y_test['y'].values

    def clf_blend(self):
        # the base models and the meta-model are regressors, so mlxtend's
        # StackingRegressor (from mlxtend.regressor) is the right stacker
        meta_clf = LinearRegression()
        clf1 = model.svm_regressor()
        clf2 = model.randomforest_regressor()
        clf3 = model.xgb_regressor()
        self.blend = StackingRegressor(regressors=[clf1, clf2, clf3],
                                       meta_regressor=meta_clf)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        scores = cross_val_score(self.blend,
                                 X=self.x_train,
                                 y=self.y_train,
                                 cv=10,
                                 verbose=2)
        return scores

    def prediction(self):
        y_pred = self.blend.predict(self.x_test)
        return y_pred
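`model.svm_regressor()` and its siblings come from a project-local `model` module that this page does not include. A minimal sketch under that assumption (the hyperparameters are guesses, not the project's actual code):

# model.py (hypothetical): small factories for the regressors used above
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

def svm_regressor():
    return SVR(kernel='rbf', C=1.0)

def randomforest_regressor():
    return RandomForestRegressor(n_estimators=100, random_state=0)

def xgb_regressor():
    return XGBRegressor(n_estimators=200, learning_rate=0.1)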
Example #6
def stacking_prediction2(m1, m2, meta):
    # model_train, model_test = stacking(clf, Xtrain2, ytrain2, Xtest2)
    # model.fit(model_train, ytrain2)
    tr, ts = scaling(Xtrain2, Xtest2, MaxAbsScaler())
    m = StackingClassifier(classifiers=[m1, m2], meta_classifier=meta)
    m.fit(tr, ytrain2)
    predict_mm = m.predict(ts)
    return predict_mm
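Examples #6 and #10 read `Xtrain2`, `ytrain2`, `Xtest2`, and `ytest` as module-level globals prepared elsewhere. A minimal sketch of that setup; the real feature matrix is project data, so synthetic data stands in here:

# Hypothetical setup for the Xtrain2/ytrain2/Xtest2/ytest globals used by
# stacking_prediction2 and stackingPerformanceEditor.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
Xtrain2, Xtest2, ytrain2, ytest = train_test_split(
    X, y, test_size=0.2, random_state=42)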
Example #7
    def stacking(self):
        train_data, test_data = self.Extract_feature.extract_count()
        from sklearn.svm import SVC
        from sklearn.pipeline import make_pipeline
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
        from sklearn.metrics import classification_report
        import xgboost as xgb
        from mlxtend.classifier import StackingClassifier
        from sklearn import model_selection

        # the original imported SVR but actually used SVC, and named this
        # pipeline `lasso` even though it wraps a support vector classifier
        svc = make_pipeline(SVC(C=2.1, gamma=0.005))
        rforest = make_pipeline(
            RandomForestClassifier(random_state=0, n_estimators=6))
        Gboost = GradientBoostingClassifier(n_estimators=500,
                                            learning_rate=0.01,
                                            max_depth=12,
                                            max_features="sqrt",
                                            min_samples_leaf=15,
                                            min_samples_split=97,
                                            random_state=200)
        model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603,
                                      gamma=10,
                                      learning_rate=0.01,
                                      max_depth=11,
                                      n_estimators=500,
                                      reg_alpha=0.01,
                                      reg_lambda=5,
                                      subsample=0.5213,
                                      seed=1024,
                                      nthread=-1)

        lr = LogisticRegression()
        classifiers = [rforest, svc, Gboost, model_xgb, lr]
        stregr = StackingClassifier(classifiers=classifiers,
                                    meta_classifier=lr)
        stregr.fit(train_data, self.train_label)

        prediction = stregr.predict(test_data)
        classification = classification_report(y_true=self.test_label,
                                               y_pred=prediction)
        print("classification:{}".format(classification))
        print("test set score:{}".format(stregr.score(test_data, self.test_label)))
        for clf, label in zip(
            [rforest, svc, Gboost, lr, model_xgb, stregr],
            ['rf', 'svc', 'gboost', 'lr', 'xgb', 'stackingclassifier']):
            scores = model_selection.cross_val_score(clf,
                                                     train_data,
                                                     self.train_label,
                                                     cv=3,
                                                     scoring='accuracy')
            print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
                  (scores.mean(), scores.std(), label))
Example #8
def Stacking_model(x_train, x_test, y_train, y_test):
    t1 = time()
    dt = DecisionTreeClassifier(
        criterion="gini",  # build the decision tree model
        splitter="best",
        max_depth=10)
    # lr = LogisticRegression(C=1, penalty='l1')  # logistic regression classifier
    gbdt = GradientBoostingClassifier(loss='deviance',
                                      learning_rate=0.01,
                                      n_estimators=2000,
                                      subsample=0.8,
                                      max_features=1,
                                      max_depth=10,
                                      verbose=2)
    rf = RandomForestClassifier(n_estimators=30, max_depth=15)
    xgbst = XGBClassifier(silent=0,
                          nthread=4,
                          learning_rate=0.1,
                          min_child_weight=1,
                          max_depth=5,
                          gamma=0,
                          subsample=0.8,
                          max_delta_step=0,
                          colsample_bytree=0.8,
                          reg_lambda=1,
                          n_estimators=2000,
                          seed=27)

    sclf = StackingClassifier(classifiers=[dt, gbdt, rf],
                              use_probas=False,
                              average_probas=False,
                              meta_classifier=xgbst)
    sclf.fit(x_train, y_train)
    t2 = time()
    y_train_p = sclf.predict(x_train)
    y_test_p = sclf.predict(x_test)

    print('----Stacking----')
    print("Train set accuracy score: {:.5f}".format(
        accuracy_score(y_train_p, y_train)))
    print("Test set accuracy score: {:.5f}".format(
        accuracy_score(y_test_p, y_test)))
    print('Time: {:.1f} s'.format(t2 - t1))
    return y_test_p
Example #9
def data_ensemble(cancer_type,feat):
	data_dir = "/home/ubuntu/cancer/"
	data_file = data_dir + cancer_type + "_matrix.csv"
	features = data_dir + cancer_type + "_output.txt"
	output_file = data_dir + cancer_type + "_accuracy.txt"
	file = open(features, "r")
	o_file = open(output_file, "w")
	line = file.readline()
	line = file.readline()
	df = pd.read_csv(data_file)
	df = shuffle(df)
	file_ids=df.pop('file_id')
	y = df.pop('label').values
	dataf=df.pop(line[:-1])
	#dataframe consisting of only important features
	for x in range(feat):
		line = file.readline()
		dataf=np.column_stack((dataf,df.pop(line[:-1])))
	X=normalize(dataf)
	X=scale(X)
	pca = PCA()
	X = pca.fit_transform(X)  # the original fit PCA but never applied the transform
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
	# multiple classifiers
	clf1 = RandomForestClassifier(random_state=1,n_estimators=100)
	clf2 = GradientBoostingClassifier(n_estimators=1200,subsample=0.5,random_state=3)
	clf3 = SVC(gamma='auto')
	clf4 = KNeighborsClassifier(n_neighbors=1)
	clf5 = DecisionTreeClassifier(random_state=0)
	lr = LogisticRegression(solver='lbfgs')
	#stacking for data ensemble
	sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5], meta_classifier=lr)
	clf1.fit(X_train,y_train)
	clf2.fit(X_train,y_train)
	clf3.fit(X_train,y_train)
	clf4.fit(X_train,y_train)
	clf5.fit(X_train,y_train)
	sclf.fit(X_train,y_train)
	y_test_predict=sclf.predict(X_test)
	precision = precision_score(y_test, y_test_predict)
	accuracy = accuracy_score(y_test, y_test_predict)
	f1 = f1_score(y_test, y_test_predict)
	recall = recall_score(y_test, y_test_predict)
	scores = [precision, accuracy, f1, recall]
	labels = ['RF', 'GBDT', 'SVM', 'KNN', 'DT', 'Stacking']
	clf_list = [clf1, clf2, clf3, clf4, clf5, sclf]
	# score calculation
	for clf, label in zip(clf_list, labels):
		y_test_predict = clf.predict(X_test)
		tn, fp, fn, tp = confusion_matrix(y_test, y_test_predict).ravel()
		specificity = tn / (tn+fp)
		recall = tp / (tp+fn)
		precision = tp / (tp+fp)
		accuracy = (tp + tn) / (tp+tn+fp+fn)
		f1 = 2*tp / (2*tp+fp+fn)
		o_file.write("\nAccuracy: %.2f [%s] \nPrecision: %.2f [%s] \nRecall: %.2f [%s] \nF1 score: %.2f [%s] \nSpecificity: %.2f [%s]\n" % (accuracy, label, precision, label, recall, label, f1, label, specificity, label))
	file.close()
	o_file.close()
Example #10
def stackingPerformanceEditor():
    # renamed for clarity: the original bound a RandomForest to `svm_clf`,
    # stored the model in `acc`, and the accuracy in `pred`
    nb_clf = GaussianNB()
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=400, random_state=5)
    mlp_clf = MLPClassifier(hidden_layer_sizes=(500, 500))

    stack_clf = StackingClassifier(classifiers=[nb_clf, rf_clf, mlp_clf],
                                   meta_classifier=rf_clf)
    stack_clf.fit(Xtrain2, ytrain2)
    acc = accuracy_score(ytest, stack_clf.predict(Xtest2))
    return acc
Example #11
 def stacking(self):
     from sklearn.svm import SVC
     from sklearn.pipeline import make_pipeline
     from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
     from sklearn.metrics import f1_score
     from lightgbm import LGBMClassifier
     import xgboost as xgb
     from mlxtend.classifier import StackingClassifier
     svc = make_pipeline(SVC(kernel='rbf', C=2.8, gamma=2))
     rf = RandomForestClassifier(random_state=590, n_estimators=6)
     GBoost = GradientBoostingClassifier(n_estimators=500,
                                         learning_rate=0.01,
                                         max_depth=12,
                                         max_features='sqrt',
                                         min_samples_leaf=15,
                                         min_samples_split=97,
                                         random_state=200)
     # the deprecated xgboost silent= flag is omitted here
     model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603,
                                   gamma=10,
                                   learning_rate=0.01,
                                   max_depth=11,
                                   min_child_weight=1.7817,
                                   n_estimators=500,
                                   reg_alpha=0.01,
                                   reg_lambda=5,
                                   subsample=0.5213,
                                   seed=1024,
                                   nthread=-1)
     # no objective='regression' here: LGBMClassifier selects a
     # classification objective itself, and a regression objective
     # would be wrong for a classifier
     model_lgb = LGBMClassifier(num_leaves=5,
                                learning_rate=0.05,
                                n_estimators=550,
                                max_bin=25,
                                bagging_fraction=1,
                                bagging_freq=5,
                                feature_fraction=0.7,
                                feature_fraction_seed=9,
                                bagging_seed=9,
                                min_data_in_leaf=42,
                                min_sum_hessian_in_leaf=40)
     base_clfs = [rf, svc, GBoost, model_lgb, model_xgb]
     stregr = StackingClassifier(classifiers=base_clfs,
                                 meta_classifier=model_xgb,
                                 verbose=1)
     stregr.fit(self.X_train, self.y_train)
     print(
         "the model is stregr and the valid's f1 is: ",
         f1_score(self.y_test, stregr.predict(self.X_test),
                  average="macro"))
     # print("the model is stregr and the valid's precision_score is: ", precision_score(self.y_test, stregr.predict(self.X_test),average="macro"))
     # print("the model is stregr and the valid's recall_score is: ", recall_score(self.y_test, stregr.predict(self.X_test),average="macro"))
     return stregr
Example #13
def test_predict_meta_features():
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    #  test default (class labels)
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    stclf.fit(X_train, y_train)
    test_meta_features = stclf.predict(X_test)
    assert test_meta_features.shape == (X_test.shape[0], )
Example #14
def stack_model(X_train, Y_train, X_test, expert_model, n_estimator):
    # note: this uses sklearn.ensemble.StackingClassifier
    # (estimators=/final_estimator=), not the mlxtend API
    estimators = [('DT', DecisionTreeClassifier()), ('MLP', MLPClassifier())]
    if expert_model == "DT":
        model = StackingClassifier(estimators=estimators,
                                   final_estimator=DecisionTreeClassifier())
    elif expert_model == "MLP":
        model = StackingClassifier(estimators=estimators,
                                   final_estimator=MLPClassifier())
    else:
        raise ValueError("expert_model must be 'DT' or 'MLP'")
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    return Y_pred
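A hypothetical call on synthetic data (the imports below are the ones stack_model itself assumes are available at module level; note that `n_estimator` is accepted but never used inside the function):

# Hypothetical usage with a decision tree as the final estimator.
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
y_pred = stack_model(X_tr, y_tr, X_te, expert_model="DT", n_estimator=10)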
Example #16
 def stacking():
     clf1 = KNeighborsClassifier(n_neighbors=1)
     clf2 = RandomForestClassifier(random_state=1)
     clf3 = GaussianNB()
     lr = LogisticRegression()
     stack = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                meta_classifier=lr)
     stack.fit(X_train, y_train)
     y_pred_class = stack.predict(X_test)
     print("\n########### Stacking ###############")
     score_arr = evalClassModel(stack, y_test, y_pred_class)
     accuracy_score = score_arr[0]
     method_dict["Stacking"] = accuracy_score * 100
     rmse_dict["Stacking"] = stack
     time_elapsed = log_event()
     score_arr.append(time_elapsed)
     return score_arr
Example #17
def fit_n_models(clf, n_clfs, X_train, y_train, X_test, y_test, save_dir):
    search = True
    y_true = np.where(y_test > 1, 0, 1)
    p_n = np.zeros((len(X_test), n_clfs))
    score = np.zeros(n_clfs)
    i = 0
    if search:
        clf_name = clf  # remember the requested name; clf is rebuilt each round
        while True:
            if clf_name == 'stacking':
                # first-level classifiers
                clf_children = [
                    GaussianNB() for _ in range(16)
                ]

                # meta-classifier
                clf_meta = LogisticRegression()
                clf = StackingClassifier(
                    classifiers=clf_children,
                    meta_classifier=clf_meta,
                    use_probas=True
                )
            elif clf_name == 'naive_bayes':
                clf = GaussianNB()
            elif clf_name == 'forest':
                clf = RandomForestClassifier()
            elif clf_name == 'boosting':
                clf = GradientBoostingClassifier()
            elif clf_name == 'svm':
                clf = SVC()

            clf.fit(X_train.values, y_train.values[:, 0])
            p = clf.predict(X_test.values)

            y_pred = np.where(p > 1, 0, 1)
            p_n[:, i] = p
            score[i] = recall_score(y_true=y_true, y_pred=y_pred)

            print('recall score :', score[i])

            pickle.dump(clf, open("%s/clf%03d.pkl"
                                  % (save_dir, i), "wb"))
            i += 1

            if i == n_clfs:
                break
    return p_n, score
Example #18
def stacking():
    # Building and fitting 
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    stack = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
    stack.fit(X_train, y_train)
    
    # make class predictions for the testing set
    y_pred_class = stack.predict(X_test)
    
    print('########### Stacking ###############')
    
    accuracy_score = evalClassModel(stack, y_test, y_pred_class, True)

    #Data for final graph
    methodDict['Stacking'] = accuracy_score * 100
Example #19
def model_stack():
    # shuffle=True is required for random_state to have any effect
    kf = KFold(n_splits=10, shuffle=True, random_state=5)
    train_df, test_df, train_label = data_process.get_person_data()
    train_df.drop('ID', axis=1, inplace=True)
    train_data = train_df.values
    X_train, X_val, y_train, y_val = train_test_split(train_data,
                                                      train_label,
                                                      test_size=0.2,
                                                      random_state=66)
    id_list = list(test_df.pop('ID'))
    test_data = test_df.values
    model1 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:linear')
    model2 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:gamma')
    model3 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:tweedie')
    model4 = RandomForestRegressor()
    model5 = GradientBoostingRegressor()
    # model6 = AdaBoostRegressor(n_estimators=200)
    # the base models and the meta-model are all regressors, so use
    # mlxtend's StackingRegressor rather than StackingClassifier
    stack_model = StackingRegressor(
        regressors=[model1, model2, model4, model5], meta_regressor=model3)
    train_data = np.array(train_data)
    yHat_list = []
    for i, (train_index, test_index) in enumerate(kf.split(train_data)):
        new_train_data = train_data[train_index]
        new_train_label = [train_label[i] for i in train_index]
        stack_model.fit(new_train_data, new_train_label)
        yHat = stack_model.predict(test_data)
        yHat_list.append(yHat)
    yHat_list = np.array(yHat_list)
    yHat_list = yHat_list.mean(axis=0)
    result = pd.DataFrame({'id': id_list, 'yhat': yHat_list})
    result.to_csv('result/result17.csv',
                  index=False,
                  header=None,
                  encoding='utf-8')
Example #20
def stacking_clf(model, X_train, y_train, X_test, y_test):
    voting_clf_hard = VotingClassifier(estimators=[
        ('Logistic Regression', model[0]), ('Decision Tree 1', model[1]),
        ('Decision Tree 2', model[2])
    ],
                                       voting='hard')

    voting_clf_hard.fit(X_train, y_train)
    y_pred_hard = voting_clf_hard.predict(X_train)
    y_pred_hard_test = voting_clf_hard.predict(X_test)
    X_trainN = np.concatenate((X_train, pd.DataFrame(y_pred_hard)), axis=1)
    X_testN = np.concatenate((X_test, pd.DataFrame(y_pred_hard_test)), axis=1)

    stacking_clf1 = StackingClassifier(
        classifiers=model, meta_classifier=LogisticRegression(random_state=9))
    stacking_clf1.fit(X_trainN, y_train)
    y_pred = stacking_clf1.predict(X_testN)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
Example #21
class Blend:
    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['y'].values
        self.y_test = y_test['y'].values

    def blending(self):
        meta_clf = LogisticRegression()
        clf1 = model.svm_classifier()
        clf2 = model.dt_classifier()
        # clf3 = model.xgb_classifier()
        self.blend = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=meta_clf)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        scores = cross_val_score(self.blend, X=self.x_train, y=self.y_train, cv=5,
                                 verbose=2)
        return scores

    def prediction(self):
        y_pred = self.blend.predict(self.x_test)
        return y_pred
Example #22
model_log = LogisticRegression(penalty='l2',
                               C=10,
                               multi_class='multinomial',
                               class_weight='balanced',
                               solver='newton-cg',
                               )

model_sta = StackingClassifier(
        classifiers=[model_RF,model_SVM, model_Adb], 
        meta_classifier=model_log,
        )

model_sta.fit(X_train,y_train)


y_sta_pred = model_sta.predict(X_test)


print('The sta accuracy is:', accuracy_score(y_test, y_sta_pred))
print('The sta precision is:', metrics.precision_score(y_test, y_sta_pred, average='macro'))
print('The sta recall is:', metrics.recall_score(y_test, y_sta_pred, average='macro'))
print('The sta f1 score is:', metrics.f1_score(y_test, y_sta_pred, average='macro'))



print('The sta confusion matrix is:\n',confusion_matrix(y_test,y_sta_pred,labels=[1,2,3,4,5,6]))
print('The sta precision, recall, f1 score are:\n',classification_report(y_test,y_sta_pred))


Example #23
baseline6 = GaussianNB()
baseline7 = SVC(kernel='rbf', class_weight='balanced')
lr = XGBClassifier()

stackmodel = StackingClassifier(classifiers=[
    baseline1, baseline2, baseline3, baseline4, baseline5, baseline6, baseline7
],
                                meta_classifier=lr)

#%%
for basemodel, label in zip([
        baseline1, baseline2, baseline3, baseline4, baseline5, baseline6,
        baseline7, stackmodel
], [
        'xgboost', 'lightgbm', 'Random Forest', 'Catboost', 'AdaBoost',
        'GaussianNB', 'SVC', 'stack'
]):

    scores = model_selection.cross_val_score(basemodel,
                                             train,
                                             target,
                                             cv=5,
                                             scoring='accuracy')
    # the original computed the scores but never reported them
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))
#%%
stackmodel.fit(train, target)
predict = stackmodel.predict(test)
#%%
print('data saving')
predict = pd.DataFrame(predict)
predict.to_csv('./data/stacking.csv', encoding='utf-8', index=None)
Example #24
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))


sclf.fit(X_train, y)


# # Produce output
# Here I will just use the stacking classifier I just built.

# Predict values
prediction = sclf.predict(X_test)

# Build output dataframe
out_df = pd.DataFrame({
    'PassengerId': test_passenger_id,
    'Survived': prediction.astype(int)
})

# Write to CSV
out_df.to_csv('titanic-result.csv', index=False)


Example #25
def main():
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import f1_score
    from skynet import DATA_PATH
    from skynet.data_handling import read_learning_data
    from skynet.data_handling.preprocessing import PreProcessor
    from skynet.data_handling import get_init_response
    from skynet.data_handling import split_time_series
    from mlxtend.classifier import StackingClassifier

    icao = "RJFK"

    train = read_learning_data(DATA_PATH +
                               "/pickle/learning/skynet/train_%s.pkl" % icao)
    test = read_learning_data(DATA_PATH +
                              "/pickle/learning/skynet/test_%s.pkl" % icao)

    data = pd.concat([train, test]).reset_index(drop=True)

    preprocess = PreProcessor(norm=False, binary=False)
    preprocess.fit(data.iloc[:, :-1], data.iloc[:, -1])

    data = pd.concat([preprocess.X_train, preprocess.y_train], axis=1)
    date = data["date"].values
    spdata = split_time_series(data, date, level="month", period=2)

    for key in spdata:
        ext = spdata[key]

        target = get_init_response()
        feats = [f for f in ext.keys() if not (f in target + ["date"])]

        X = ext[feats]
        ss = StandardScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.keys())
        y = ext[target]
        X, y = balanced(X, y)

        spX, spy = preprocess.split(X, y, n_folds=5)
        for k in spy:
            print(np.unique(spy[k]))

        X = pd.concat([spX[n] for n in spX if n != 0]).reset_index(drop=True)
        y = pd.concat([spy[n] for n in spy if n != 0]).reset_index(drop=True)

        X_test = spX[0].reset_index(drop=True)
        y_test = spy[0].reset_index(drop=True)

        from sklearn.ensemble import RandomForestClassifier
        clf1 = RandomForestClassifier(max_features=2)
        clf2 = SkySVM()
        meta = LogisticRegression()

        # training
        # (note: the data here is not balanced)
        sta = SkyStacking((clf1, clf2), meta)
        sta.fit(X, y)
        p = sta.predict(X_test)

        clf1.fit(X.values, y.values[:, 0])
        print(np.array(X.keys())[np.argsort(clf1.feature_importances_)[::-1]])
        p_rf = clf1.predict(X_test.values)

        # mlxtendのstacking
        sc = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=meta)
        sc.fit(X.values, y.values[:, 0])
        p_sc = sc.predict(X_test.values)

        y_test = np.where(y_test.values[:, 0] > 1, 0, 1)
        p = np.where(p > 1, 0, 1)
        p_rf = np.where(p_rf > 1, 0, 1)
        p_sc = np.where(p_sc > 1, 0, 1)

        f1 = f1_score(y_true=y_test, y_pred=p)
        print("stacking", f1)

        f1_rf = f1_score(y_true=y_test, y_pred=p_rf)
        print("random forest", f1_rf)

        f1_sc = f1_score(y_true=y_test, y_pred=p_sc)
        print("stacked classifier", f1_sc)

        break  # only process the first time-series split
Example #26
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(hidden_layer_sizes=5)
# presort was removed from scikit-learn, so it is omitted here
dt = tree.DecisionTreeClassifier(max_depth=25,
                                 min_samples_leaf=15)
bag = BaggingClassifier(qda1, n_estimators=5)
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=10)
stack = StackingClassifier(
    classifiers=[gaus, lr, neigh, grad, gb, qda, lda, bag, rf],
    use_probas=True,
    verbose=2,
    meta_classifier=lr,
    use_features_in_secondary=False)
#no lda
stack.fit(Xtrain2, ytrain)
ypred = stack.predict(Xtrain2)
ypred1 = stack.predict(Xtest2)
ypred2 = stack.predict(test2)
print(accuracy_score(ytrain, ypred))
print(accuracy_score(ytest, ypred1))
print(f1_score(ytrain, ypred, average='micro'))
print(f1_score(ytest, ypred1, average='micro'))
print(f1_score(ytrain, ypred, average='macro'))
print(f1_score(ytest, ypred1, average='macro'))

Example #27
def model_training_stack(x_train, y_train, cross_val, y_name, n_maj=None, n_min=None):
    last_precision = 0
    recall_list = []
    fscore_list = []
    MAcc_list = []
    precision_list = []
    for epoch in range(epochs):
        # the cross_val flag specifies whether the model is evaluated on a
        # cross-validation set or on the blind test set
        if cross_val:
            # split the training data into 20 stratified shuffle splits (80/20),
            # not 5 folds as the original comment claimed
            ss = StratifiedShuffleSplit(n_splits=20, test_size=0.2, random_state=epoch * 11)


            index = 0
            for train_index, test_index in ss.split(x_train, y_train):

                X_train = x_train[train_index]
                Y_train = y_train[train_index]
                X_test = x_train[test_index]
                Y_test = y_train[test_index]

                # invoke the parameter tuning functions
                rf_params = parameter_tuning_rf(X_train, Y_train)
                svm_params = parameter_tuning_svm(X_train, Y_train)
                ada_params = parameter_tuning_ada(X_train, Y_train)
                #gdbt_params = parameter_tuning_gbdt(X,Y)

                rf = RandomForestClassifier(n_estimators=rf_params['n_estimators'], max_depth=rf_params['max_depth'],
                                            n_jobs=-1,
                                            random_state=0)
                svr = svm.SVC(C=svm_params['C'], kernel='rbf', gamma=svm_params['gamma'], random_state=0, probability=True)

                ada = AdaBoostClassifier(n_estimators=ada_params['n_estimators'], learning_rate=ada_params['learning_rate'],
                                         algorithm='SAMME.R')

                #gdbt = GradientBoostingClassifier(learning_rate=gdbt_params['learning_rate'],n_estimators=gdbt_params['n_estimators'],
                                                     #max_depth=gdbt_params['max_depth'],subsample=gdbt_params['subsample'],random_state = 0)

                lr = LogisticRegression(C=1,max_iter=500)

                clfs = [svr]

                sclf = StackingClassifier(classifiers=clfs,use_probas=True,average_probas=False,
                                          meta_classifier=lr)
                sclf.fit(X_train, Y_train)
                y_predict = sclf.predict(X_test)
                precision, recall, f_score, _ = precision_recall_fscore_support(Y_test, y_predict, pos_label=1,average='binary')
                c_mat = confusion_matrix(Y_test, y_predict)
                MAcc = caculate_MAcc(c_mat)
                #if precision > precision_list[index]:
                    #joblib.dump(sclf,'/home/deep/heart_science/model/sclf.model')
                precision_list.append(precision)
                recall_list.append(recall)
                fscore_list.append(f_score)
                MAcc_list.append(MAcc)
                index += 1


        '''
        if np.mean(precision_list) > last_precision:
            print(precision_list)
            last_precision = np.mean(precision_list)
            print('best precision is:',np.mean(precision_list))
            print('best recall is',np.mean(recall_list))
            print('best f-score is',np.mean(fscore_list))
            print('best MAcc is',np.mean(MAcc_list))
        '''


    #return sclf
    return np.mean(precision_list),np.mean(recall_list),np.mean(fscore_list),np.mean(MAcc_list)
Example #28
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

################## load packages #####################
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline

################## load data #####################
iris = datasets.load_iris()
x, y = iris.data, iris.target

################## define classifier #####################

pipe1 = make_pipeline(ColumnSelector(cols=(0, 1)), LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(2, 3)), LogisticRegression())

sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                          meta_classifier=LogisticRegression())

################## fit and predict #####################
sclf.fit(x, y)

print(sclf.predict(x))

########### predict class probability ###########
print(sclf.predict_proba(x))
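A quick sanity check one might add (this evaluation is not part of the original example): estimate the stack's accuracy by cross-validation on the same iris data.

# Not in the original example: 5-fold CV accuracy of the stack on iris.
from sklearn.model_selection import cross_val_score

print(cross_val_score(sclf, x, y, cv=5, scoring='accuracy').mean())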
Example #29
x_train, x_test, y_train, y_test = train_test_split(data,
                                                    target,
                                                    test_size=0.1,
                                                    random_state=5)

print(len(x_train))
print(len(x_test))

#reg = LogisticRegression()
tree = DecisionTreeClassifier()
bayes = GaussianNB()
meta = KNeighborsClassifier(n_neighbors=10)

model = StackingClassifier(classifiers=[tree, bayes],
                           use_probas=True,
                           meta_classifier=meta)
model.fit(x_train, y_train)

y_predicted = model.predict(x_test)
# note: this scores hard class labels with MSE; accuracy_score would be the
# more conventional metric for a classifier
print(mean_squared_error(y_test, y_predicted))
Example #30
sub = pd.DataFrame({'name': ids, 'poi': rf_pred})
#sub['Survived'] = sub['Survived'].map(lambda x:1 if x>0.5 else 0)
sub.to_csv('test_rf.csv', index=False)

from mlxtend.classifier import StackingClassifier
meta_estimator = GradientBoostingClassifier(tol=100,
                                            subsample=0.75,
                                            n_estimators=250,
                                            max_features='sqrt',
                                            max_depth=6,
                                            learning_rate=0.03)

stacking = StackingClassifier(classifiers=[gdbt, rf, lr],
                              meta_classifier=meta_estimator)
stacking.fit(train_X, train_Y)
stacking_pred = stacking.predict(test_X)
sub = pd.DataFrame({'name': ids, 'poi': stacking_pred})
sub['poi'] = sub['poi'].map(lambda x: 0.99 if x else 0.0)
sub.to_csv('test_stacking.csv', index=False)

##
from sklearn.model_selection import GridSearchCV
param_test1 = {'n_estimators': range(20, 300, 10)}
gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(
    tol=100, subsample=0.75, max_features=11, max_depth=6, learning_rate=0.03),
                        param_grid=param_test1,
                        scoring='roc_auc',
                        cv=5)  # iid= was removed from GridSearchCV in newer sklearn

gsearch1.fit(train_X, train_Y)
print(gsearch1.best_params_, gsearch1.best_score_)
Example #31
X_test = lda.transform(X_test)

bclf = LogisticRegression()

#random forest,knn,svm
clfs = [
    RandomForestClassifier(n_estimators=10,
                           criterion='entropy',
                           random_state=0),
    KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    SVC(kernel='linear', random_state=0)
]
sl = StackingClassifier(classifiers=clfs, meta_classifier=bclf)
sl.fit(X_train, y_train)

y_pred = sl.predict(X_test)

accuracy0 = metrics.accuracy_score(y_test, y_pred)
print("Random Forest, KNN, SVM:", accuracy0)

cm = confusion_matrix(y_test, y_pred)
print(cm)

#knn, decision tree, svm
clfs = [
    DecisionTreeClassifier(criterion='entropy'),
    KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    SVC(kernel='linear', random_state=0)
]
sl = StackingClassifier(classifiers=clfs, meta_classifier=bclf)
sl.fit(X_train, y_train)

# sclf was constructed by a second snippet whose opening line is missing;
# only the trailing arguments survive:
# sclf = StackingClassifier(classifiers=[...], use_probas=True,
#                           average_probas=True, meta_classifier=clf_lg)
label = ['stacking']
sclf.fit(X_train_standar, y_train)
score_stacking = cross_val_score(sclf,
                                 X_train_standar,
                                 y_train,
                                 scoring='accuracy')
score_f1_stacking = cross_val_score(sclf, X_train_standar, y_train, scoring='f1')  # F1 scores; computed but not used further
score_mean_sclf = score_stacking.mean()
print('stacking final score\'s mean is % .2f' % score_mean_sclf)

print('accuracy: %.2f (+/- %.2f) [%s]' %
      (score_stacking.mean(), score_stacking.std(), label))

result_stacking = sclf.predict(X_test_stander)
result_stacking_proba = sclf.predict_proba(X_test_stander)
clf_stacking_test_score = sclf.score(X_test_stander, y_test)

# precision_recall_curve expects continuous scores, not hard class labels,
# so reuse the positive-class probabilities computed above
precision, recall, thresholds = precision_recall_curve(
    y_test, result_stacking_proba[:, 1])
pred_at_08 = result_stacking_proba[:, 1] >= 0.8  # positive when P(y=1) >= 0.8
print(classification_report(y_test, pred_at_08, target_names=['0', '1']))

# ==============================================================================
# model persistence
# os.chdir(u'D:\【01】行健金融\【01】数据中心\【05】数据分析项目\【03】2018\May\规则引擎_分期商城_风控+授信')
# joblib.dump(sclf, 'stackingpkl.pkl')
# joblib.dump(scaler, 'scaler.pkl')

# ==============================================================================