def stacking3Model(model1, model2, metamodel, xtr, ytr, xts, yts):
    """Fit a two-base-model stack with `metamodel` as the meta-learner.

    The train/test features are scaled together with MaxAbsScaler via the
    module-level `scaling` helper. Returns (accuracy, test_predictions).
    """
    model = StackingClassifier(classifiers=[model1, model2],
                               meta_classifier=metamodel)
    # Scale train and test with the same fitted scaler.
    train, testt = scaling(xtr, xts, MaxAbsScaler())
    model.fit(train, ytr)
    # FIX: predict once and reuse — the original called model.predict(testt)
    # twice, doubling the (potentially expensive) inference work.
    predict = model.predict(testt)
    acc = accuracy_score(yts, predict)
    return acc, predict
def classifer_stacking(data_file, alertgroup_name, classifier_list):
    """Train a stacking ensemble for one alert group read from `data_file`.

    classifier_list holds short names ('KNN'/'RF'/'DT'/'GBDT') selecting base
    learners; the RF instance doubles as the meta-classifier. Returns the
    best model found across CV folds (by F-beta on the fold's test slice).
    """
    # Candidate base learners, keyed by the short names accepted in classifier_list.
    classifiers = {'KNN': KNeighborsClassifier(),  # n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=1
                   # 'LR': LogisticRegression(),
                   'RF': RandomForestClassifier(),  # n_estimators=60,max_depth=13,min_samples_split=120,min_samples_leaf=20,random_state=10
                   'DT': tree.DecisionTreeClassifier(),  # criterion='gini',splitter=random,max_features=None,max_depth=13,min_samples_leaf=2
                   'GBDT': GradientBoostingClassifier()  # loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1,max_depth=3,verbose=0,presort='auto'
                   # 'XGB':xgboost_classifier
                   }
    all_data = pd.read_csv(data_file, sep=',', dtype=str)
    for alertgroup, group in all_data.groupby('alertgroup'):
        if alertgroup == alertgroup_name:
            train_x, test_x, train_y, test_y = get_data(group, split=True)
            arr_x = train_x.values
            arr_y = train_y.values
            max_fs = 0
            best_model = None
            # NOTE(review): modern sklearn spells this n_splits; n_folds is the
            # long-removed pre-0.18 keyword. random_state with shuffle=False is
            # also rejected by current versions.
            stratified_folder = StratifiedKFold(n_folds=3, random_state=0, shuffle=False)
            # NOTE(review): StratifiedKFold.split requires y too; split(train_x)
            # alone raises TypeError — likely meant split(arr_x, arr_y).
            for train_index, test_index in stratified_folder.split(train_x):
                # NOTE(review): these rebind the outer train_x/test_x, so the
                # hold-out split from get_data is lost after the first fold.
                train_x = arr_x[train_index]
                train_y = arr_y[train_index]
                test_x = arr_x[test_index]
                test_y = arr_y[test_index]
                classifiers_list = [classifiers[cl] for cl in classifier_list]
                stack_model = StackingClassifier(classifiers=classifiers_list, use_probas=True, average_probas=True, meta_classifier=classifiers['RF'])
                stack_model.fit(train_x, train_y)
                predict = stack_model.predict(test_x)
                # NOTE(review): beta=1 means F1 although the print says "f2score";
                # beta is keyword-only (beta=...) in modern sklearn.
                fbetascore = fbeta_score(test_y, predict, 1)
                print(' f2score:' + str(fbetascore))
                if fbetascore > max_fs:
                    max_fs = fbetascore
                    best_model = stack_model
            stack_model = best_model
            # NOTE(review): final metrics use the LAST fold's test slice, not an
            # untouched hold-out set — confirm this is intended.
            predict = stack_model.predict(test_x)
            precision = metrics.precision_score(test_y, predict)
            recall = metrics.recall_score(test_y, predict)
            fbetascore = fbeta_score(test_y, predict, 0.5)
            accuracy = metrics.accuracy_score(test_y, predict)
            print('final performance:')
            print(alertgroup_name)
            print('precision: %.6f' % (100 * precision))
            print('recall: %.6f' % (100 * recall))
            print('f0.5score: %.6f' % (100 * fbetascore))
            print('accuracy: %.6f%%' % (100 * accuracy))
            return best_model
def model_processing(X_train, X_test, y_train, y_test):
    """Train a 4-base-model stack (LR, SVC, DT, RF) with an XGBoost
    meta-learner and print accuracy/precision/F1/recall/AUC for both the
    training and the test split. No return value — output is printed only.
    """
    log_reg = LogisticRegression(C=0.01, penalty='l2')
    svc = SVC(C=0.7, kernel='linear')
    tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
    rf_clf = RandomForestClassifier(n_estimators=70, criterion='entropy', max_features='auto', min_samples_leaf=6)
    xgb = XGBClassifier(gamma=0.3, max_depth=4, min_child_weight=8, reg_alpha=0.05)
    sclf = StackingClassifier(classifiers=[log_reg, svc, tree_clf, rf_clf], meta_classifier=xgb)
    sclf.fit(X_train, y_train)

    def _report(model_name, header, y_true, y_hat):
        # One scoring/printing pass; the original duplicated this block verbatim
        # for the train and the test split.
        print('*' * 30, header)
        print('{} 精确度 (accuracy):{:.2f}'.format(model_name, accuracy_score(y_true, y_hat)))
        print('{} 准确度(precision):{:.2f}'.format(model_name, precision_score(y_true, y_hat)))
        print('{} F1 Score :{:.2f}'.format(model_name, f1_score(y_true, y_hat)))
        print('{} 召回率(recall Score):{:.2f}'.format(model_name, recall_score(y_true, y_hat)))
        print('{} auc Score:{:.2f}'.format(model_name, roc_auc_score(y_true, y_hat)))

    _report('堆叠模型-训练集', '在训练集上的得分', y_train, sclf.predict(X_train))
    _report('堆叠模型', '在测试集上的得分', y_test, sclf.predict(X_test))
def model_stack2():
    """Stack three XGB regressors plus an SVR (meta: the tweedie XGB) on the
    scaled person data and write test predictions to result/result6.csv.

    FIX: removed the train_test_split whose four outputs (X_train, X_val,
    y_train, y_val) were never used — dead computation.
    """
    _, test_df, train_label = data_process.get_person_data()
    train_data, test_data = data_process.get_scale_data()
    id_list = list(test_df.pop('ID'))  # 'ID' column becomes the submission key
    # NOTE(review): regressors inside mlxtend's StackingClassifier only work by
    # duck-typing; model3 is both a base learner and the meta learner — confirm
    # this is intended (StackingRegressor would be the proper tool).
    model1 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8, learning_rate=0.25, objective='reg:linear')
    model2 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8, learning_rate=0.25, objective='reg:gamma')
    model3 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8, learning_rate=0.25, objective='reg:tweedie')
    model4 = svm.SVR()
    stack_model = StackingClassifier(
        classifiers=[model1, model2, model3, model4], meta_classifier=model3)
    stack_model.fit(train_data, train_label)
    yHat = stack_model.predict(test_data)
    result = pd.DataFrame({'id': id_list, 'yhat': yHat})
    result.to_csv('result/result6.csv', index=False, header=None, encoding='utf-8')
class ClassifierBlender:
    """Blend three regressors (SVR, RF, XGB from the project `model` module)
    under a LinearRegression meta-model via mlxtend's StackingClassifier."""

    def __init__(self, x_train, x_test, y_train, y_test=None):
        # Drop bookkeeping columns left over from a CSV round-trip.
        x_train.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        x_test.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['y'].values
        # BUG FIX: the original tested `self.y_train is not None`, which is
        # always true here, so passing y_test=None crashed with an
        # AttributeError. Guard on the optional y_test argument itself.
        self.y_test = y_test['y'].values if y_test is not None else None

    def clf_blend(self):
        """Fit the stacking blend on the training data and return it."""
        mete_clf = LinearRegression()
        clf1 = model.svm_regressor()
        clf2 = model.randomforest_regressor()
        clf3 = model.xgb_regressor()
        self.blend = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=mete_clf)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        """10-fold cross-validation scores of the fitted blend on train data."""
        scores = cross_val_score(self.blend, X=self.x_train, y=self.y_train, cv=10, verbose=2)
        return scores

    def prediction(self):
        """Predict targets for the held-out test features."""
        y_pred = self.blend.predict(self.x_test)
        return y_pred
def stacking_prediction2(m1, m2, meta):
    """MaxAbs-scale the module-level Xtrain2/Xtest2 split, fit a two-base
    stack with `meta` as the meta-classifier, and return test predictions."""
    scaled_train, scaled_test = scaling(Xtrain2, Xtest2, MaxAbsScaler())
    stack = StackingClassifier(classifiers=[m1, m2], meta_classifier=meta)
    stack.fit(scaled_train, ytrain2)
    return stack.predict(scaled_test)
def stacking(self):
    """Stack RF / SVC / GBDT / XGB base learners under a LogisticRegression
    meta-classifier on count features; print a classification report, the
    test-set score, and 3-fold CV accuracy for every model.

    FIX: the original did `from sklearn.svm import SVR` but then called
    SVC(...) — a NameError unless SVC happened to be imported at module level.
    Unused local imports (SVR, scalers, lightgbm, scipy) were pruned.
    """
    train_data, test_data = self.Extract_feature.extract_count()
    from sklearn.svm import SVC
    from sklearn.pipeline import make_pipeline
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    import xgboost as xgb
    from mlxtend.classifier import StackingClassifier
    from sklearn import model_selection
    # Renamed from the misleading `lasso`: it is an SVC pipeline.
    svc_pipe = make_pipeline(SVC(C=2.1, gamma=0.005))
    rforest = make_pipeline(
        RandomForestClassifier(random_state=0, n_estimators=6))
    Gboost = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01,
                                        max_depth=12, max_features="sqrt",
                                        min_samples_leaf=15, min_samples_split=97,
                                        random_state=200)
    model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=10,
                                  learning_rate=0.01, max_depth=11,
                                  n_estimators=500, reg_alpha=0.01, reg_lambda=5,
                                  subsample=0.5213, seed=1024, nthread=-1)
    lr = LogisticRegression()  # doubles as base learner and meta-classifier
    classifiers = [rforest, svc_pipe, Gboost, model_xgb, lr]
    stregr = StackingClassifier(classifiers=classifiers, meta_classifier=lr)
    stregr.fit(train_data, self.train_label)
    prediction = stregr.predict(test_data)
    classification = classification_report(y_true=self.test_label, y_pred=prediction)
    print("classification:{}".format(classification))
    print("测试集的score:{}".format(stregr.score(test_data, self.test_label)))
    for clf, label in zip(
            [rforest, svc_pipe, Gboost, lr, model_xgb, stregr],
            ['rf', 'svr', 'gboost', 'lr', 'xgb', 'stackingclassifier']):
        scores = model_selection.cross_val_score(clf, train_data, self.train_label, cv=3, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
def Stacking_model(x_train, x_test, y_train, y_test):
    """Stack DT + GBDT + RF with an XGBoost meta-classifier; print train/test
    accuracy and the fit time, then return the test-set predictions."""
    fit_start = time()
    base_learners = [
        DecisionTreeClassifier(criterion="gini", splitter="best", max_depth=10),
        GradientBoostingClassifier(loss='deviance', learning_rate=0.01,
                                   n_estimators=2000, subsample=0.8,
                                   max_features=1, max_depth=10, verbose=2),
        RandomForestClassifier(n_estimators=30, max_depth=15),
    ]
    meta = XGBClassifier(silent=0, nthread=4, learning_rate=0.1,
                         min_child_weight=1, max_depth=5, gamma=0,
                         subsample=0.8, max_delta_step=0, colsample_bytree=0.8,
                         reg_lambda=1, n_estimators=2000, seed=27)
    sclf = StackingClassifier(classifiers=base_learners,
                              use_probas=False, average_probas=False,
                              meta_classifier=meta)
    sclf.fit(x_train, y_train)
    fit_seconds = time() - fit_start  # timing covers the fit only
    train_pred = sclf.predict(x_train)
    test_pred = sclf.predict(x_test)
    print('----Stacking----')
    print("Train set accuracy score: {:.5f}".format(
        accuracy_score(train_pred, y_train)))
    print("Test set accuracy score: {:.5f}".format(
        accuracy_score(test_pred, y_test)))
    print('Time: {:.1f} s'.format(fit_seconds))
    return test_pred
def data_ensemble(cancer_type, feat):
    """Stack five classifiers on the top (feat+1) features of a cancer matrix
    and append per-model confusion-matrix metrics to <cancer_type>_accuracy.txt.
    """
    data_dir = "/home/ubuntu/cancer/"
    data_file = data_dir + cancer_type + "_matrix.csv"
    features = data_dir + cancer_type + "_output.txt"
    output_file = data_dir + cancer_type + "_accuracy.txt"
    # NOTE(review): neither handle is ever closed — a `with` block would be safer.
    file = open(features, "r")
    o_file = open(output_file, "w")
    line = file.readline()  # skip header line
    line = file.readline()  # first ranked feature name
    df = pd.read_csv(data_file)
    df = shuffle(df)
    file_ids = df.pop('file_id')
    y = df.pop('label').values
    dataf = df.pop(line[:-1])  # dataframe consisting of only important features
    for x in range(feat):
        line = file.readline()
        dataf = np.column_stack((dataf, df.pop(line[:-1])))
    X = normalize(dataf)
    X = scale(X)
    # NOTE(review): PCA is fitted but its transform is never applied to X.
    pca = PCA()
    pca.fit(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
    # multiple classifiers
    clf1 = RandomForestClassifier(random_state=1, n_estimators=100)
    clf2 = GradientBoostingClassifier(n_estimators=1200, subsample=0.5, random_state=3)
    clf3 = SVC(gamma='auto')
    clf4 = KNeighborsClassifier(n_neighbors=1)
    clf5 = DecisionTreeClassifier(random_state=0)
    lr = LogisticRegression(solver='lbfgs')
    # stacking for data ensemble
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5], meta_classifier=lr)
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train, y_train)
    clf4.fit(X_train, y_train)
    clf5.fit(X_train, y_train)
    sclf.fit(X_train, y_train)
    y_test_predict = sclf.predict(X_test)
    precision = precision_score(y_test, y_test_predict)
    accuracy = accuracy_score(y_test, y_test_predict)
    f1 = f1_score(y_test, y_test_predict)
    recall = recall_score(y_test, y_test_predict)
    # NOTE(review): `scores` is never used after this point.
    scores = [precision, accuracy, f1, recall]
    label = ['RF', 'GBDT', 'SVM', 'KNN', 'DT', 'Stacking']
    clf_list = [clf1, clf2, clf3, clf4, clf5, sclf]
    # score calculation
    # NOTE(review): the loop variable `label` shadows the list of the same name.
    for clf, label in zip(clf_list, label):
        y_test_predict = clf.predict(X_test)
        # Derive all metrics from the binary confusion matrix counts.
        tn, fp, fn, tp = confusion_matrix(y_test, y_test_predict).ravel()
        specificity = tn / (tn + fp)
        recall = tp / (tp + fn)
        precision = tp / (tp + fp)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        f1 = 2 * tp / (2 * tp + fp + fn)
        o_file.write("\nAccuracy: %.2f [%s] \nPrecision: %.2f [%s] \nRecall: %.2f [%s] \nF1 score: %.2f [%s] \nSpecificity: %.2f [%s]\n" % (accuracy, label, precision, label, recall, label, f1, label, specificity, label))
def stackingPerformanceEditor():
    """Fit an NB/RF/MLP stack (RF doubles as meta-classifier) on the
    module-level Xtrain2/ytrain2 and return accuracy on Xtest2.

    FIX: removed the unused `label` list; renamed `svm_clf` → `rf_clf`
    (it is a RandomForest, not an SVM).
    """
    nb_clf = GaussianNB()
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=400, random_state=5)
    mlp_clf = MLPClassifier(hidden_layer_sizes=(500, 500))
    stack = StackingClassifier(classifiers=[nb_clf, rf_clf, mlp_clf],
                               meta_classifier=rf_clf)
    stack.fit(Xtrain2, ytrain2)
    # NOTE(review): scored against global `ytest` while the features come from
    # Xtest2 — confirm those globals describe the same split.
    return accuracy_score(ytest, stack.predict(Xtest2))
def stacking(self):
    """Stack RF / SVC / GBDT / LGBM / XGB under an XGBoost meta-classifier,
    print the macro-F1 on the held-out split, and return the fitted stack."""
    from sklearn.svm import SVC
    from sklearn.pipeline import make_pipeline
    # NOTE(review): the scaler imports below are unused in this method.
    from sklearn.preprocessing import RobustScaler, MinMaxScaler
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from lightgbm import LGBMClassifier
    import xgboost as xgb
    from mlxtend.classifier import StackingClassifier
    import scipy as sc  # NOTE(review): unused
    svc = make_pipeline(SVC(kernel='rbf', C=2.8, gamma=2))
    rf = RandomForestClassifier(random_state=590, n_estimators=6)
    GBoost = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01,
                                        max_depth=12, max_features='sqrt',
                                        min_samples_leaf=15,
                                        min_samples_split=97, random_state=200)
    model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=10,
                                  learning_rate=0.01, max_depth=11,
                                  min_child_weight=1.7817, n_estimators=500,
                                  reg_alpha=0.01, reg_lambda=5,
                                  subsample=0.5213, silent=1, seed=1024,
                                  nthread=-1)
    # NOTE(review): objective='regression' on a *classifier* — verify LightGBM
    # accepts this; it looks copied from a regression setup.
    model_lgb = LGBMClassifier(objective='regression', num_leaves=5,
                               learning_rate=0.05, n_estimators=550,
                               max_bin=25, bagging_fraction=1, bagging_freq=5,
                               feature_fraction=0.7, feature_fraction_seed=9,
                               bagging_seed=9, min_data_in_leaf=42,
                               min_sum_hessian_in_leaf=40)
    # NOTE(review): model_xgb is both a base learner and the meta-classifier.
    regressors = [rf, svc, GBoost, model_lgb, model_xgb]
    stregr = StackingClassifier(classifiers=regressors,
                                meta_classifier=model_xgb, verbose=1)
    stregr.fit(self.X_train, self.y_train)
    print(
        "the model is stregr and the valid's f1 is: ",
        f1_score(self.y_test, stregr.predict(self.X_test), average="macro"))
    # print("the model is stregr and the valid's precision_score is: ", precision_score(self.y_test, stregr.predict(self.X_test),average="macro"))
    # print("the model is stregr and the valid's recall_score is: ", recall_score(self.y_test, stregr.predict(self.X_test),average="macro"))
    return stregr
def model_processing(X_train, X_test, y_train, y_test):
    """Train a stacked classifier (LR/SVC/DT/RF bases, XGBoost meta-learner)
    and print accuracy/precision/F1/recall/AUC for the train and test splits."""
    base_learners = [
        LogisticRegression(C=0.01, penalty='l2'),
        SVC(C=0.7, kernel='linear'),
        DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5),
        RandomForestClassifier(n_estimators=70, criterion='entropy',
                               max_features='auto', min_samples_leaf=6),
    ]
    meta_learner = XGBClassifier(gamma=0.3, max_depth=4, min_child_weight=8, reg_alpha=0.05)
    sclf = StackingClassifier(classifiers=base_learners, meta_classifier=meta_learner)
    sclf.fit(X_train, y_train)
    # Score the same way on both splits; predictions are computed up front.
    for header, name, truth, guess in (
            ('在训练集上的得分', '堆叠模型-训练集', y_train, sclf.predict(X_train)),
            ('在测试集上的得分', '堆叠模型', y_test, sclf.predict(X_test))):
        print('*' * 30, header)
        print('{} 精确度 (accuracy):{:.2f}'.format(name, accuracy_score(truth, guess)))
        print('{} 准确度(precision):{:.2f}'.format(name, precision_score(truth, guess)))
        print('{} F1 Score :{:.2f}'.format(name, f1_score(truth, guess)))
        print('{} 召回率(recall Score):{:.2f}'.format(name, recall_score(truth, guess)))
        print('{} auc Score:{:.2f}'.format(name, roc_auc_score(truth, guess)))
def test_predict_meta_features():
    """predict() on a fitted stack yields one class label per test row."""
    base_learners = [KNeighborsClassifier(), GaussianNB()]
    meta = LogisticRegression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # default behaviour: meta-features are the predicted class labels
    stclf = StackingClassifier(classifiers=base_learners,
                               meta_classifier=meta,
                               store_train_meta_features=True)
    stclf.fit(X_train, y_train)
    assert stclf.predict(X_test).shape == (X_test.shape[0],)
def test_predict_meta_features():
    """Fitted stack's predict() output is 1-D with len == number of test rows."""
    knn, gnb, lr = KNeighborsClassifier(), GaussianNB(), LogisticRegression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # default configuration: class labels feed the meta-classifier
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    stclf.fit(X_train, y_train)
    predictions = stclf.predict(X_test)
    assert predictions.shape == (X_test.shape[0],)
def stack_model(X_train, Y_train, X_test, expert_model, n_estimator):
    """Stack a DT and an MLP (sklearn StackingClassifier API) and return
    predictions for X_test.

    expert_model selects the final estimator: "DT" or "MLP".
    n_estimator is accepted for interface compatibility but is unused here.

    Raises:
        ValueError: if expert_model is not "DT" or "MLP" (the original left
        `model` unbound in that case and crashed later with a NameError).
    """
    estimators = [('DT', DecisionTreeClassifier()), ('MLP', MLPClassifier())]
    if expert_model == "DT":
        final_estimator = DecisionTreeClassifier()
    elif expert_model == "MLP":
        final_estimator = MLPClassifier()
    else:
        raise ValueError("expert_model must be 'DT' or 'MLP', got %r" % (expert_model,))
    model = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    return Y_pred
def stacking():
    """KNN + RF + NB stacked under LogisticRegression on the module-level
    split; records the accuracy and elapsed time, returns the score array."""
    base_learners = [KNeighborsClassifier(n_neighbors=1),
                     RandomForestClassifier(random_state=1),
                     GaussianNB()]
    stack = StackingClassifier(classifiers=base_learners,
                               meta_classifier=LogisticRegression())
    stack.fit(X_train, y_train)
    predictions = stack.predict(X_test)
    print("\n########### Stacking ###############")
    score_arr = evalClassModel(stack, y_test, predictions)
    # first entry of the score array is the accuracy
    method_dict["Stacking"] = score_arr[0] * 100
    rmse_dict["Stacking"] = stack
    score_arr.append(log_event())
    return score_arr
def fit_n_models(clf, n_clfs, X_train, y_train, X_test, y_test, save_dir):
    """Fit `n_clfs` fresh classifiers of the requested kind, pickle each one,
    and collect raw predictions plus recall scores.

    Args:
        clf: one of 'stacking', 'naive_bayes', 'forest', 'boosting', 'svm'.
        X_train/y_train/X_test: pandas objects (.values is used); y is the
            first column.
        y_test: labels; values > 1 are treated as the negative class.
        save_dir: directory for the pickled models (clf000.pkl, ...).

    Returns:
        (p_n, score): raw predictions per model (len(X_test) x n_clfs) and
        the per-model recall.

    BUG FIX: the original overwrote the `clf` string with the fitted
    estimator on the first iteration, so iterations 1..n-1 re-fit the SAME
    object instead of training fresh models. A new model is now built per
    iteration; the pickle handle is also closed via `with`.
    """
    y_true = np.where(y_test > 1, 0, 1)  # binarize: labels > 1 → negative
    p_n = np.zeros((len(X_test), n_clfs))
    score = np.zeros(n_clfs)

    def _build(kind):
        # Construct a brand-new estimator of the requested kind.
        if kind == 'stacking':
            return StackingClassifier(
                classifiers=[GaussianNB() for _ in range(16)],  # level-0 learners
                meta_classifier=LogisticRegression(),           # meta-learner
                use_probas=True)
        if kind == 'naive_bayes':
            return GaussianNB()
        if kind == 'forest':
            return RandomForestClassifier()
        if kind == 'boosting':
            return GradientBoostingClassifier()
        if kind == 'svm':
            return SVC()
        raise ValueError("unknown classifier kind: %r" % (kind,))

    for i in range(n_clfs):
        model = _build(clf)
        model.fit(X_train.values, y_train.values[:, 0])
        p = model.predict(X_test.values)
        p_n[:, i] = p
        score[i] = recall_score(y_true=y_true, y_pred=np.where(p > 1, 0, 1))
        print('recall score :', score[i])
        with open("%s/clf%03d.pkl" % (save_dir, i), "wb") as fh:
            pickle.dump(model, fh)
    return p_n, score
def stacking():
    """Build and fit a KNN + RF + NB stack with an LR meta-learner, then
    record its test accuracy in the module-level methodDict."""
    meta = LogisticRegression()
    learners = [KNeighborsClassifier(n_neighbors=1),
                RandomForestClassifier(random_state=1),
                GaussianNB()]
    stack = StackingClassifier(classifiers=learners, meta_classifier=meta)
    stack.fit(X_train, y_train)
    # class predictions for the held-out testing set
    y_pred_class = stack.predict(X_test)
    print('########### Stacking ###############')
    accuracy_score = evalClassModel(stack, y_test, y_pred_class, True)
    # feed the final comparison chart
    methodDict['Stacking'] = accuracy_score * 100
def model_stack():
    """10-fold bagged stacking of XGB/RF/GBDT regressors (meta: tweedie XGB);
    the per-fold test predictions are averaged and written to
    result/result17.csv.

    FIXES: KFold(random_state=...) without shuffle=True raises ValueError on
    modern sklearn (and the seed was meaningless without shuffling) — set
    shuffle=True explicitly. Also removed the train_test_split whose four
    outputs were never used.
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=5)
    train_df, test_df, train_label = data_process.get_person_data()
    train_df.drop('ID', axis=1, inplace=True)
    train_data = train_df.values
    id_list = list(test_df.pop('ID'))  # submission key column
    test_data = test_df.values
    # NOTE(review): regressors inside mlxtend's StackingClassifier work only by
    # duck-typing; model3 doubles as base learner and meta learner.
    model1 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8, learning_rate=0.25, objective='reg:linear')
    model2 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8, learning_rate=0.25, objective='reg:gamma')
    model3 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8, learning_rate=0.25, objective='reg:tweedie')
    model4 = RandomForestRegressor()
    model5 = GradientBoostingRegressor()
    stack_model = StackingClassifier(
        classifiers=[model1, model2, model4, model5], meta_classifier=model3)
    train_data = np.array(train_data)
    yHat_list = []
    # Refit on each fold's training part; average the test-set predictions.
    for train_index, _ in kf.split(train_data):
        fold_data = train_data[train_index]
        fold_label = [train_label[i] for i in train_index]
        stack_model.fit(fold_data, fold_label)
        yHat_list.append(stack_model.predict(test_data))
    yHat = np.array(yHat_list).mean(axis=0)
    result = pd.DataFrame({'id': id_list, 'yhat': yHat})
    result.to_csv('result/result17.csv', index=False, header=None, encoding='utf-8')
def stacking_clf(model, X_train, y_train, X_test, y_test):
    """Append hard-voting predictions of the three models as an extra feature
    column, fit a stack (LogisticRegression meta) on the augmented matrices,
    and return the test-set accuracy."""
    voter = VotingClassifier(estimators=[
        ('Logistic Regression', model[0]),
        ('Decision Tree 1', model[1]),
        ('Decision Tree 2', model[2])
    ], voting='hard')
    voter.fit(X_train, y_train)
    train_votes = voter.predict(X_train)
    test_votes = voter.predict(X_test)
    # Augment both splits with the vote outcome as one more feature.
    augmented_train = np.concatenate((X_train, pd.DataFrame(train_votes)), axis=1)
    augmented_test = np.concatenate((X_test, pd.DataFrame(test_votes)), axis=1)
    stack = StackingClassifier(
        classifiers=model,
        meta_classifier=LogisticRegression(random_state=9))
    stack.fit(augmented_train, y_train)
    return accuracy_score(y_test, stack.predict(augmented_test))
class Blend:
    """Two-classifier stack (project SVM + DT) blended by a
    LogisticRegression meta-model."""

    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_train = x_train
        self.x_test = x_test
        # labels arrive as single-column DataFrames keyed by 'y'
        self.y_train = y_train['y'].values
        self.y_test = y_test['y'].values

    def blending(self):
        """Fit the stacking blend on the training data and return it."""
        meta_learner = LogisticRegression()
        base_learners = [model.svm_classifier(), model.dt_classifier()]
        self.blend = StackingClassifier(classifiers=base_learners,
                                        meta_classifier=meta_learner)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        """5-fold cross-validation scores of the blend on the training data."""
        return cross_val_score(self.blend, X=self.x_train, y=self.y_train,
                               cv=5, verbose=2)

    def prediction(self):
        """Predict labels for the held-out test features."""
        return self.blend.predict(self.x_test)
# Meta-learner: multinomial logistic regression with balanced class weights.
model_log = LogisticRegression(penalty='l2', C=10, multi_class='multinomial',
                               class_weight='balanced', solver='newton-cg',
                               )
# Stack the previously built RF / SVM / AdaBoost models under the LR meta-learner.
model_sta = StackingClassifier(
    classifiers=[model_RF, model_SVM, model_Adb],
    meta_classifier=model_log,
)
model_sta.fit(X_train, y_train)
y_sta_pred = model_sta.predict(X_test)
# NOTE(review): this bare call discards its result (same value printed below).
accuracy_score(y_test, y_sta_pred)
print('The sta accuracy is:', accuracy_score(y_test, y_sta_pred))
print('The sta precision is:', metrics.precision_score(y_test, y_sta_pred, average='macro'))
print('The sta recall is::', metrics.recall_score(y_test, y_sta_pred, average='macro'))
print('The sta f1 score is:', metrics.f1_score(y_test, y_sta_pred, average='macro'))
# Six-class confusion matrix with an explicit label order.
print('The sta confusion matrix is:\n', confusion_matrix(y_test, y_sta_pred, labels=[1, 2, 3, 4, 5, 6]))
print('The sta precision, recall, f1 score are:\n', classification_report(y_test, y_sta_pred))
# Remaining base learners (baseline1-5 are defined earlier in the file).
baseline6 = GaussianNB()
baseline7 = SVC(kernel='rbf', class_weight='balanced')
# Meta-learner; named `lr` although it is actually an XGBClassifier.
lr = XGBClassifier()
stackmodel = StackingClassifier(classifiers=[
    baseline1, baseline2, baseline3, baseline4, baseline5, baseline6, baseline7
], meta_classifier=lr)
#%%
# NOTE(review): the cross-val scores are computed but never printed or stored;
# only the last iteration's `scores` survives the loop.
for basemodel, label in zip([
        baseline1, baseline2, baseline3, baseline4, baseline5, baseline6,
        baseline7, stackmodel
], [
        'xgboost', 'lightgbm', 'Random Forest', 'Catboost', 'AdaBoost',
        'GaussianNB', 'SVC', 'stack'
]):
    scores = model_selection.cross_val_score(basemodel, train, target, cv=5, scoring='accuracy')
#%%
# Fit the full stack on all training data and predict the submission set.
stackmodel.fit(train, target)
predict = stackmodel.predict(test)
#%%
print('data saving')
predict = pd.DataFrame(predict)
predict.to_csv('./data/stacking.csv', encoding='utf-8', index=None)
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) # In[44]: sclf.fit(X_train, y) # # Produce output # Here I will just use the stacking classifier I just built. # In[17]: # Predict values prediction = sclf.predict(X_test) # Build output dataframe out_df = pd.DataFrame({ 'PassengerId': test_passenger_id, 'Survived': prediction.astype(int) }) # Write to CSV out_df.to_csv('titanic-result.csv', index=False) #
def main():
    """Compare the project's SkyStacking against a plain RF and mlxtend's
    StackingClassifier on one month-split of RJFK airport data, printing the
    binarized F1 for each. Only the first split is processed (break below)."""
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import f1_score
    from skynet import DATA_PATH
    from skynet.data_handling import read_learning_data
    from skynet.data_handling.preprocessing import PreProcessor
    from skynet.data_handling import get_init_response
    from skynet.data_handling import split_time_series
    from mlxtend.classifier import StackingClassifier
    icao = "RJFK"
    train = read_learning_data(DATA_PATH + "/pickle/learning/skynet/train_%s.pkl" % icao)
    test = read_learning_data(DATA_PATH + "/pickle/learning/skynet/test_%s.pkl" % icao)
    # Recombine train+test and re-preprocess the whole set.
    data = pd.concat([train, test]).reset_index(drop=True)
    preprocess = PreProcessor(norm=False, binary=False)
    preprocess.fit(data.iloc[:, :-1], data.iloc[:, -1])
    data = pd.concat([preprocess.X_train, preprocess.y_train], axis=1)
    date = data["date"].values
    # Two-month time-series splits.
    spdata = split_time_series(data, date, level="month", period=2)
    for key in spdata:
        ext = spdata[key]
        target = get_init_response()
        feats = [f for f in ext.keys() if not (f in target + ["date"])]
        X = ext[feats]
        ss = StandardScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.keys())
        y = ext[target]
        X, y = balanced(X, y)
        # Fold 0 becomes the test fold; the rest are training data.
        spX, spy = preprocess.split(X, y, n_folds=5)
        for k in spy:
            print(np.unique(spy[k]))
        X = pd.concat([spX[n] for n in spX if n != 0]).reset_index(drop=True)
        y = pd.concat([spy[n] for n in spy if n != 0]).reset_index(drop=True)
        X_test = spX[0].reset_index(drop=True)
        y_test = spy[0].reset_index(drop=True)
        from sklearn.ensemble import RandomForestClassifier
        clf1 = RandomForestClassifier(max_features=2)
        clf2 = SkySVM()
        meta = LogisticRegression()
        # training — note: data is not re-balanced here (translated comment)
        sta = SkyStacking((clf1, clf2), meta)
        sta.fit(X, y)
        p = sta.predict(X_test)
        # Plain RF baseline; print features sorted by importance (descending).
        clf1.fit(X.values, y.values[:, 0])
        print(np.array(X.keys())[np.argsort(clf1.feature_importances_)[::-1]])
        p_rf = clf1.predict(X_test.values)
        # mlxtend stacking for comparison
        sc = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=meta)
        sc.fit(X.values, y.values[:, 0])
        p_sc = sc.predict(X_test.values)
        # Binarize labels/predictions: values > 1 count as the negative class.
        y_test = np.where(y_test.values[:, 0] > 1, 0, 1)
        p = np.where(p > 1, 0, 1)
        p_rf = np.where(p_rf > 1, 0, 1)
        p_sc = np.where(p_sc > 1, 0, 1)
        f1 = f1_score(y_true=y_test, y_pred=p)
        print("stacking", f1)
        f1_rf = f1_score(y_true=y_test, y_pred=p_rf)
        print("random forest", f1_rf)
        f1_sc = f1_score(y_true=y_test, y_pred=p_sc)
        print("stacked classifier", f1_sc)
        # Only the first time-split is evaluated.
        if True:
            break
# Additional base learners for the big ensemble below
# (gaus, lr, neigh, grad, gb, qda, qda1 are defined earlier in the file).
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(hidden_layer_sizes=5)
# NOTE(review): `presort` was removed from DecisionTreeClassifier in sklearn 0.24.
dt = tree.DecisionTreeClassifier(max_depth=25, min_samples_leaf=15, presort=True)
bag = BaggingClassifier(qda1, n_estimators=5)
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=10)
stack = StackingClassifier(
    classifiers=[gaus, lr, neigh, grad, gb, qda, lda, bag, rf],
    use_probas=True,
    verbose=2,
    meta_classifier=lr,
    use_features_in_secondary=False)
#no lda
stack.fit(Xtrain2, ytrain)
ypred = stack.predict(Xtrain2)
ypred1 = stack.predict(Xtest2)
ypred2 = stack.predict(test2)
# NOTE(review): Python 2 print statements — this snippet will not parse
# under Python 3, unlike the rest of the file.
print accuracy_score(ytrain, ypred)
print accuracy_score(ytest, ypred1)
print f1_score(ytrain, ypred, average='micro')
print f1_score(ytest, ypred1, average='micro')
print f1_score(ytrain, ypred, average='macro')
print f1_score(ytest, ypred1, average='macro')
# NOTE(review): the block below duplicates the predictions and accuracy
# prints just computed above.
ypred = stack.predict(Xtrain2)
ypred1 = stack.predict(Xtest2)
ypred2 = stack.predict(test2)
print accuracy_score(ytrain, ypred)
print accuracy_score(ytest, ypred1)
def model_training_stack(x_train, y_train, cross_val, y_name, n_maj=None, n_min=None):
    """Repeatedly (module-level `epochs` times) run stratified shuffle splits,
    tune and fit a probability-stacked SVM with an LR meta-classifier, and
    return the mean (precision, recall, f-score, MAcc) over all splits.

    cross_val selects the cross-validation path; n_maj/n_min are accepted but
    unused here.
    """
    last_precision = 0
    recall_list = []
    fscore_list = []
    MAcc_list = []
    precision_list = []
    for epoch in range(epochs):  # `epochs` is defined at module level
        # cross_val flag is used to specify if the model is used to test on a
        # cross validation set or the blind test set
        if cross_val:
            # splits the training data to perform 5-fold cross validation
            ss = StratifiedShuffleSplit(n_splits=20, test_size=0.2, random_state=epoch*11)
            for train_index, test_index in ss.split(x_train, y_train):
                # NOTE(review): `index` is reset on every split, so it only
                # ever reaches 1 before being zeroed again.
                index = 0
                X_train = x_train[train_index]
                Y_train = y_train[train_index]
                X_test = x_train[test_index]
                Y_test = y_train[test_index]
                # invoke the parameter tuning function
                rf_params = parameter_tuning_rf(X_train, Y_train)
                svm_params = parameter_tuning_svm(X_train, Y_train)
                ada_params = parameter_tuning_ada(X_train, Y_train)
                #gdbt_params = parameter_tuning_gbdt(X,Y)
                # NOTE(review): rf and ada are built from the tuned params but
                # never added to `clfs` below — only the SVM is stacked, so the
                # rf/ada tuning above is wasted work.
                rf = RandomForestClassifier(n_estimators=rf_params['n_estimators'],
                                            max_depth=rf_params['max_depth'],
                                            n_jobs=-1, random_state=0)
                svr = svm.SVC(C=svm_params['C'], kernel='rbf',
                              gamma=svm_params['gamma'], random_state=0,
                              probability=True)
                ada = AdaBoostClassifier(n_estimators=ada_params['n_estimators'],
                                         learning_rate=ada_params['learning_rate'],
                                         algorithm='SAMME.R')
                #gdbt = GradientBoostingClassifier(learning_rate=gdbt_params['learning_rate'],n_estimators=gdbt_params['n_estimators'],
                #                                  max_depth=gdbt_params['max_depth'],subsample=gdbt_params['subsample'],random_state = 0)
                lr = LogisticRegression(C=1, max_iter=500)
                clfs = [svr]
                sclf = StackingClassifier(classifiers=clfs, use_probas=True,
                                          average_probas=False,
                                          meta_classifier=lr)
                sclf.fit(X_train, Y_train)
                # intialize the random forest classifier
                y_predict = sclf.predict(X_test)
                precision, recall, f_score, _ = precision_recall_fscore_support(
                    Y_test, y_predict, pos_label=1, average='binary')
                c_mat = confusion_matrix(Y_test, y_predict)
                MAcc = caculate_MAcc(c_mat)
                #if precision > precision_list[index]:
                #joblib.dump(sclf,'/home/deep/heart_science/model/sclf.model')
                precision_list.append(precision)
                recall_list.append(recall)
                fscore_list.append(f_score)
                MAcc_list.append(MAcc)
                index += 1
    '''
    if np.mean(precision_list) > last_precision:
        print(precision_list)
        last_precision = np.mean(precision_list)
        print('best precision is:',np.mean(precision_list))
        print('best recall is',np.mean(recall_list))
        print('best f-score is',np.mean(fscore_list))
        print('best MAcc is',np.mean(MAcc_list))
    '''
    #return sclf
    return np.mean(precision_list),np.mean(recall_list),np.mean(fscore_list),np.mean(MAcc_list)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- ################## load packages ##################### from sklearn import datasets from sklearn.linear_model import LogisticRegression from mlxtend.classifier import StackingClassifier from mlxtend.feature_selection import ColumnSelector from sklearn.pipeline import make_pipeline ################## load data ##################### iris = datasets.load_iris() x, y = iris.data, iris.target ################## define classifier ##################### pipe1 = make_pipeline(ColumnSelector(cols=(0, 1)), LogisticRegression()) pipe2 = make_pipeline(ColumnSelector(cols=(2, 3)), LogisticRegression()) sclf = StackingClassifier(classifiers=[pipe1, pipe2], meta_classifier=LogisticRegression()) ################## fit and predict ##################### sclf.fit(x, y) print(sclf.predict(x)) ########### predict class probability ########### print(sclf.predict_proba(x))
# In[5]: x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=5) # In[6]: print(len(x_train)) print(len(x_test)) # In[7]: #reg = LogisticRegression() tree = DecisionTreeClassifier() bayes = GaussianNB() meta = KNeighborsClassifier(n_neighbors=10) # In[8]: model = StackingClassifier(classifiers=[tree, bayes], use_probas=True, meta_classifier=meta) model.fit(x_train, y_train) # In[9]: y_predicted = model.predict(x_test) print(mean_squared_error(y_test, y_predicted))
sub = pd.DataFrame({'name': ids, 'poi': rf_pred}) #sub['Survived'] = sub['Survived'].map(lambda x:1 if x>0.5 else 0) sub.to_csv('test_rf.csv', index=False) from mlxtend.classifier import StackingClassifier meta_estimator = GradientBoostingClassifier(tol=100, subsample=0.75, n_estimators=250, max_features='sqrt', max_depth=6, learning_rate=0.03) stacking = StackingClassifier(classifiers=[gdbt, rf, lr], meta_classifier=meta_estimator) stacking.fit(train_X, train_Y) stacking_pred = stacking.predict(test_X) sub = pd.DataFrame({'name': ids, 'poi': stacking_pred}) sub['poi'] = sub['poi'].map(lambda x: 0.99 if x == True else 0.0) sub.to_csv('test_stacking.csv', index=False) ## from sklearn.model_selection import GridSearchCV param_test1 = {'n_estimators': range(20, 300, 10)} gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier( tol=100, subsample=0.75, max_features=11, max_depth=6, learning_rate=0.03), param_grid=param_test1, scoring='roc_auc', iid=False, cv=5) gsearch1.fit(train_X, train_Y)
# Project test features into the LDA space fitted earlier on the training data.
X_test = lda.transform(X_test)
bclf = LogisticRegression()  # meta-classifier shared by both ensembles below

#random forest,knn,svm
clfs = [
    RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
    KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    SVC(kernel='linear', random_state=0)
]
sl = StackingClassifier(classifiers=clfs, meta_classifier=bclf)
sl.fit(X_train, y_train)
y_pred = sl.predict(X_test)
accuracy0 = metrics.accuracy_score(y_test, y_pred)
print("Random Forest, KNN, SVM:", accuracy0)
cm = confusion_matrix(y_test, y_pred)
print(cm)

#knn, decision tree, svm
clfs = [
    DecisionTreeClassifier(criterion='entropy'),
    KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    SVC(kernel='linear', random_state=0)
]
sl = StackingClassifier(classifiers=clfs, meta_classifier=bclf)
# NOTE(review): evaluation of this second ensemble presumably continues past
# this chunk of the file.
sl.fit(X_train, y_train)
# NOTE(review): this chunk opens mid-call — the `sclf = StackingClassifier(`
# line lives in an earlier part of the file not shown here.
average_probas=True, meta_classifier=clf_lg)
label = ['stacking']
sclf.fit(X_train_standar, y_train)
score_stacking = cross_val_score(sclf, X_train_standar, y_train, scoring='accuracy')
# NOTE(review): this F1 cross-val result is computed but discarded.
cross_val_score(sclf, X_train_standar, y_train, scoring='f1')
score_mean_sclf = score_stacking.mean()
print('stacking final score\'s mean is % .2f' % score_mean_sclf)
print('accuracy: %.2f (+/- %.2f) [%s]' % (score_stacking.mean(), score_stacking.std(), label))
result_stacking = sclf.predict(X_test_stander)
result_stacking_proba = sclf.predict_proba(X_test_stander)
clf_stacking_test_score = sclf.score(X_test_stander, y_test)
# NOTE(review): this uses X_test while everything else uses X_test_stander —
# confirm which split is intended.
precision, recall, thresholds = precision_recall_curve(y_test, sclf.predict(X_test))
# Treat predicted P(class 1) >= 0.8 as a positive call for the report below.
report = result_stacking_proba[:, 1] >= 0.8
print(classification_report(y_test, report, target_names=['0', '1']))
# ==============================================================================
# Model persistence (disabled): originally dumped sclf and the scaler via joblib.
# os.chdir(u'D:\【01】行健金融\【01】数据中心\【05】数据分析项目\【03】2018\May\规则引擎_分期商城_风控+授信')
# joblib.dump(sclf, 'stackingpkl.pkl')
# joblib.dump(scaler, 'scaler.pkl')
# ==============================================================================