def test_predict_meta_features():
    """Check that predict() on a fitted StackingCVClassifier returns one
    class label per test sample (the default, non-probability output)."""
    # Base learners and the level-2 (meta) learner.
    base_knn = KNeighborsClassifier()
    base_gnb = GaussianNB()
    meta_lr = LogisticRegression(multi_class='ovr', solver='liblinear')

    X_train, X_test, y_train, y_test = train_test_split(
        X_iris, y_iris, test_size=0.3)

    # test default (class labels)
    stclf = StackingCVClassifier(classifiers=[base_knn, base_gnb],
                                 meta_classifier=meta_lr,
                                 store_train_meta_features=True)
    stclf.fit(X_train, y_train)
    predicted = stclf.predict(X_test)
    assert predicted.shape == (X_test.shape[0],)
def stack(self, X, y, test_X):
    """Model ensembling via stacking.

    :param X: training feature set, array or list
    :param y: ground-truth label set, array or list
    :param test_X: test feature set, array or list
    :return: result_Y: predictions for the test data
    """
    logging.info('------Stacking之后的模型效果')
    sclf = StackingCVClassifier(classifiers=self.clfArr,
                                meta_classifier=self.lr,
                                cv=4)
    # sclf = StackingClassifier(classifiers=self.clfArr,meta_classifier=self.lr,verbose=1)
    # mlxtend expects numpy arrays, so coerce list inputs up front.
    features = np.array(X)
    labels = np.array(y).flatten()
    sclf.fit(features, labels)
    result_Y = sclf.predict(test_X)
    # Report a 5-fold CV accuracy for the whole stacked model.
    scores = model_selection.cross_val_score(sclf, features, labels,
                                             cv=5, scoring='accuracy')
    print('The Accuracy , mean: {:.5f} , std:+/- {:.5f}'.format(scores.mean(),
                                                                scores.std()))
    return result_Y
# Level-1 base learners for the stacking ensemble.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion="entropy")
ada = AdaBoostClassifier(n_estimators=75, learning_rate=1.5)
etc = ExtraTreesClassifier(n_jobs=-1, n_estimators=5, criterion="entropy")
# lr = LogisticRegression(n_jobs=-1, C=100) # meta classifier, 2 trees, c=100 is used in stacking2.pkl
lr = LogisticRegression(n_jobs=-1, C=8)  # meta classifier
# use_probas=True: the meta-classifier sees predicted class probabilities
# rather than hard labels.
sclf = StackingCVClassifier(classifiers=[ada, rfc, etc], meta_classifier=lr,
                            use_probas=True, verbose=3)
# X and y here are the training features/labels built earlier in this script.
sclf.fit(X, y)
print("training finished")
# Load and preprocess the held-out test set (the KDD-style "corrected" file).
df = pd.read_csv(r'data/corrected', header=None, names=__ATTR_NAMES)
df = processing.merge_sparse_feature(df)
# one hot encoding
df = processing.one_hot(df)
# y labels mapping
df = processing.map2major5(df)
with open(r'data/selected_feat_names.pkl', 'rb') as f:
    selected_feat_names = pickle.load(f)
print("test data loaded")
# NOTE(review): X and y are rebound to the *test* data from here on.
X = df[selected_feat_names].values
y = df['attack_type'].values
y_rf = sclf.predict(X)
print("stacking results:")
cost_based_scoring.score(y, y_rf, True)
# NOTE(review): this excerpt begins mid-statement; `solver='lbfgs')` closes a
# classifier constructor opened above this chunk.
solver='lbfgs')
clf6 = svm.SVC(C=2, gamma=0.1)
lr = LogisticRegression()  # meta-classifier of the stack
# clf1, clf2, clf3,clf4,clf5,clf6
# NOTE(review): clf3 is deliberately left out of the classifier list below.
sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf4, clf5, clf6],
                            meta_classifier=lr)
# for clf, label in zip(
#         [clf1, clf2, clf3,clf4,clf5,clf6, sclf],
#         ['xgb', 'lgb', 'catboost','RF','LR','svc', 'StackingClassifier']):
#
#     scores = model_selection.cross_val_score(clf, data_tr, label_tr, cv=3, scoring='accuracy')
#     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
sclf.fit(data_tr, label_tr)
label_te_true = np.array(label_te)
pre = sclf.predict(data_te)
# Predict for the target-industry data and dump the results to CSV.
zhizaoye = sclf.predict(zhiyaoye_pre_data)
print(zhizaoye)
number = data4['TICKER_SYMBOL'].values
dataframe = pd.DataFrame({'股票编号': number, '房地产业': zhizaoye})
dataframe.to_csv("房地产业-预测结果.csv", index=False, encoding='GBK')
# Evaluation on the held-out split.
print("the stacking model auc: %.4g" % metrics.roc_auc_score(label_te_true, pre))
print(classification_report(label_te_true, pre))
print("stacking auc值为:", roc_auc_score(label_te_true, pre))
# ROC curve plotting
# NOTE(review): the curve is computed from hard class predictions rather than
# probabilities, so it has a single operating point -- confirm intent.
fpr1, tpr1, threshold1 = roc_curve(label_te_true, pre)
plt.plot(fpr1, tpr1, color='red')
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.xlim([0.0, 1.0])
# NOTE(review): this excerpt starts inside a list literal; the entries below
# complete a `clfs = [...]` of level-1 base learners opened above this chunk.
xgb.XGBClassifier(max_depth=6, n_estimators=100, num_round=5),
RandomForestClassifier(n_estimators=100, max_depth=6, oob_score=True),
GradientBoostingClassifier(learning_rate=0.3, max_depth=6, n_estimators=100)
]
clf2 = LogisticRegression(C=0.5, max_iter=100)  # meta-classifier
#============================================================================#
from mlxtend.classifier import StackingClassifier, StackingCVClassifier
# Plain stacking: base learners are fitted on the full training set.
sclf = StackingClassifier(classifiers=clfs, meta_classifier=clf2)
sclf.fit(X_train, Y_train)
print(sclf.score(X_train, Y_train))  # training accuracy (optimistic)
sclf_pre = sclf.predict(X_test)
sclf_sub = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": sclf_pre
})
sclf_sub.to_csv("../data/sclf_sub.csv", index=False)
#===============================================================================#
# Cross-validated stacking: out-of-fold predictions feed the meta-classifier.
sclf2 = StackingCVClassifier(classifiers=clfs, meta_classifier=clf2, cv=5)
# StackingCVClassifier expects numpy arrays, hence the conversions below.
x = np.array(X_train)
y = np.array(Y_train).flatten()
sclf2.fit(x, y)
print(sclf2.score(x, y))
sclf2_pre = sclf2.predict(np.array(X_test))
sclf2_sub = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": sclf2_pre
})
sclf2_sub.to_csv("../data/sclf2_sub.csv", index=False)
# NOTE(review): this excerpt begins mid-call; `show=False)` closes a
# pf.Classification_Model(...) invocation opened above this chunk.
show=False)
elif (META == False):
    # Single custom-ensemble model via the project's Classification_Model
    # helper; no held-out targets are supplied here.
    HeldOutDataPredictions = pf.Classification_Model(
        data_training=vec_training,
        target_training=out_train,
        data_testing=vec_testing,
        Classifier=EnsembleCustom[0][1],
        target_testing=None,
        ModelName=EnsembleCustom[0][0],
        accur=False,
        grph=False,
        setClass=clas,
        show=False)
else:
    # Stacking meta-classifier path: fit on the training vectors, then
    # predict the held-out set directly.
    MetaClass.fit(vec_training, out_train)
    HeldOutDataPredictions = MetaClass.predict(vec_testing)
runingTime = timeit.default_timer() - tStart  # Stopping clock and getting time spent
print("Fitting and predictions done in %0.4fs." % runingTime)
print("=" * 100)
""" PRINTING THE PREDICTIONS MADE AND SAVING CSV FILE """
# Assemble id + predicted category and write them out via the project helper.
Preds = pd.DataFrame({"Category": HeldOutDataPredictions})
Results = pd.concat([dataTest["id"], Preds], axis=1, sort=False)
print(Results)
pf.Write_File_DF(Data_Set=Results,
                 File_Name="Predictions_Group_4",
                 separation=",",
                 head=True,
                 ind=False)
def main_leave_one_week(offline, mall_ids=-1, save_offline_predict=False):
    """Train/evaluate a per-mall stacking model, validating on the last week
    of the training data (Python 2 script).

    :param offline: if True only the offline evaluation runs; otherwise
        online test-set predictions are produced and saved as well.
    :param mall_ids: -1 to use every mall, or an iterable of mall ids.
    :param save_offline_predict: dump per-mall offline predictions to CSV.
    """
    model_name = "stack_balance_strong_matrix_lonlat_wh"
    train_all = load_train()
    test_all = load_testA()
    shop_info = load_shop_info()
    if mall_ids == -1:
        mall_ids = shop_info.mall_id.unique()
    offline_predicts = {}
    all_rowid = {}
    offline_reals = {}
    all_predicts = {}
    for _index, mall_id in enumerate(mall_ids):
        print "train: ", mall_id, " {}/{}".format(_index + 1, len(mall_ids))
        shops = shop_info[shop_info.mall_id == mall_id].shop_id.unique()
        train = train_all[train_all.mall_id == mall_id]
        test = test_all[test_all.mall_id == mall_id]
        # y label encoder
        y = train.shop_id.values
        label_encoder = LabelEncoder().fit(y)
        y = label_encoder.transform(y)
        num_class = len(shops)
        print "num_class", num_class
        # all wifi matrix
        df, train_cache, test_cache = get_wifi_cache2(mall_id)
        train_matrix_origin_all = train_cache[2]
        test_matrix_origin_all = test_cache[2]
        test_index = test_cache[0]
        # choose_strong_wifi_index
        # Keep only wifi columns with a strong-enough signal (thresholds -90
        # and 6 -- presumably dBm cutoff / min occurrences; confirm against
        # the helper's definition).
        strong_wifi_index = choose_strong_wifi_index(-90, 6, train_matrix_origin_all)
        train_strong_matrix = train_matrix_origin_all[:, strong_wifi_index]
        test_strong_matrix = test_matrix_origin_all[:, strong_wifi_index]
        # train valid split and get index
        _train_index, _valid_index = get_last_one_week_index(train)
        # weekday and hour
        preprocess_basic_time(train)
        preprocess_basic_time(test)
        preprocess_basic_wifi(train)
        preprocess_basic_wifi(test)
        train_time_features = train[["weekday", "hour", "is_weekend"]].values
        test_time_features = test[["weekday", "hour", "is_weekend"]].values
        train_wh_features = train[["weekday", "hour"]].values
        test_wh_features = test[["weekday", "hour"]].values
        # whether a wifi connection exists (1/0 per row)
        train_connect_wifi = (
            train.basic_wifi_info.map(lambda x: len(x[1])).values > 0).astype(int).reshape(-1, 1)
        test_connect_wifi = (
            test.basic_wifi_info.map(lambda x: len(x[1])).values > 0).astype(int).reshape(-1, 1)
        # number of wifi networks detected
        train_search_wifi_size = train.basic_wifi_info.map(
            lambda x: x[0]).values.reshape(-1, 1)
        test_search_wifi_size = test.basic_wifi_info.map(
            lambda x: x[0]).values.reshape(-1, 1)
        # lon lat
        train_lonlats = train[["longitude", "latitude"]].values
        test_lonlats = test[["longitude", "latitude"]].values
        # concatenate train/test features
        train_matrix = np.concatenate(
            [
                train_strong_matrix,
                train_lonlats,
                train_wh_features,
                # train_connect_wifi,
                # train_search_wifi_size
            ],
            axis=1)
        test_matrix = np.concatenate(
            [
                test_strong_matrix,
                test_lonlats,
                test_wh_features,
                # test_connect_wifi,
                # test_search_wifi_size
            ],
            axis=1)
        # train valid get
        _train_x = train_matrix[_train_index]
        _train_y = y[_train_index]
        _valid_x = train_matrix[_valid_index]
        _valid_y = y[_valid_index]

        # stack base model
        def get_model1():
            model1 = RandomForestClassifier(n_estimators=500,
                                            n_jobs=-1,
                                            class_weight="balanced")
            return model1

        def get_model2():
            model2 = OneVsRestClassifier(estimator=RandomForestClassifier(
                n_estimators=188, n_jobs=-1, class_weight="balanced"))
            return model2

        # stack meta model
        def get_meta_model():
            meta_model = RandomForestClassifier(n_estimators=777,
                                                n_jobs=-1,
                                                class_weight="balanced")
            return meta_model

        # stack cv
        cv = 3
        # offline
        # expansion train
        _x, _y = expansion(_train_x, _train_y, cv)
        stack = StackingCVClassifier([get_model1(), get_model2()],
                                     get_meta_model(),
                                     use_probas=True,
                                     use_features_in_secondary=True,
                                     cv=cv)
        stack.fit(_x, _y)
        best_predict = stack.predict(_valid_x)
        predict = label_encoder.inverse_transform(best_predict)
        offline_predicts[mall_id] = predict
        _real_y = label_encoder.inverse_transform(_valid_y)
        offline_reals[mall_id] = _real_y
        print mall_id + "'s acc is", acc(predict, _real_y)
        # online
        if not offline:
            # expansion train: refit the stack on the full mall data.
            _x, _y = expansion(train_matrix, y, cv)
            stack = StackingCVClassifier(
                [get_model1(), get_model2()],
                get_meta_model(),
                use_probas=True,
                use_features_in_secondary=True,
                cv=cv)
            stack.fit(_x, _y)
            predict = stack.predict(test_matrix)
            predict = label_encoder.inverse_transform(predict)
            all_predicts[mall_id] = predict
            all_rowid[mall_id] = test_all[np.in1d(test_all.index, test_index)].row_id.values
    # offline acc result
    result = {}
    for _mall_id in mall_ids:
        _acc = acc(offline_predicts[_mall_id], offline_reals[_mall_id])
        print _mall_id + "'s acc is", _acc
        result[_mall_id] = _acc
        if save_offline_predict:
            pd.DataFrame({
                "predict": offline_predicts[_mall_id],
                "real": offline_reals[_mall_id]
            }).to_csv("../result/offline_predict/{}.csv".format(_mall_id),
                      index=None)
    # NOTE(review): the names below look swapped -- `all_predict` is built from
    # offline_reals and `all_true` from offline_predicts.  acc() is presumably
    # symmetric, but confirm before relying on the labels.
    all_predict = np.concatenate(offline_reals.values())
    all_true = np.concatenate(offline_predicts.values())
    _acc = acc(all_predict, all_true)
    print "all acc is", _acc
    if len(mall_ids) < 50:
        # Partial runs stop here so an incomplete result is never saved.
        exit(1)
    result["all_acc"] = _acc
    path = "../result/offline/{}".format(model_name)
    save_acc(result, path, None)
    # online save result
    if not offline:
        all_rowid = np.concatenate(all_rowid.values())
        all_predict = np.concatenate(all_predicts.values())
        result = pd.DataFrame(data={
            "row_id": all_rowid,
            "shop_id": all_predict
        })
        result.sort_values(by="row_id", inplace=True)
        path = "../result/online/{}".format(model_name)
        save_result(result, path, None)
scvc = StackingCVClassifier(classifiers=[dtc, rfc, etc], meta_classifier=lr, use_probas=True, verbose=0) # 关于数据的数据,一般是结构化数据(如存储在数据库里的数据,规定了字段的长度、类型等) # meta_classifier : 关于分类器的分类器,通常是主分类器的代理,用于提供附加的数据预处理 # use_probas : If True, trains meta-classifier based on predicted probabilities instead of class labels. # verbose>2: Changes verbose param of the underlying regressor to self.verbose - 2 输出计算过程,赘言 start_time = time.time() scvc = scvc.fit( train_x.values, train_y["label"].values) #stack对输入要求是numpy.array, 所以pandas.df必须转换,即.values end_time = time.time() print("StackingCVClassifier, training finished, using : %.2f s" % (end_time - start_time)) predict_y = scvc.predict(test_x) #print classification_report(test_y["label"].values, predict_y) print "score : " + str( cost_based_scoring.score(test_y["label"].values, predict_y, show=False)) print "---------- ----------" ''' 20180524 最终输出 train data loaded test data loaded LogisticRegression : LogisticRegression, training finished, using : 105.72 s score : 0.491240366654 ---------- ---------- DecisionTreeClassifier : DecisionTreeClassifier, training finished, using : 2.71 s score : 0.23169543676
# NOTE(review): this excerpt starts inside a StackingCVClassifier(...) call;
# the keyword arguments below complete a constructor opened above this chunk.
use_probas=True,
meta_classifier=mlp,
cv=7,
store_train_meta_features=True,
stratify=True,
verbose=3,
n_jobs=-1,
random_state=seed)
# 3-fold CV accuracy of (cloned copies of) the stack.
sclf_cv_score = cross_val_score(sclf,
                                df_train[selected_columns].values,
                                y=y,
                                scoring='accuracy',
                                cv=3)
print(f"Mean accuracy {sclf_cv_score.mean(): .4f}")
print(f"+/- {sclf_cv_score.std(): .2f}")
###############################################################################
# NOTE(review): predict() assumes `sclf` was fitted somewhere above this
# excerpt -- cross_val_score alone does not fit the instance itself.
predictions = sclf.predict(
    df_test[selected_columns].values
)  # Add values attribute to rid of 'feature_names mismatch'
# Map encoded labels back to the original Cover_Type values.
final_pred = le.inverse_transform(predictions)
final_pred = [int(i) for i in final_pred]
print(final_pred)
# Model voting submission
output = pd.DataFrame({'Id': test_ids, 'Cover_Type': final_pred})
output.to_csv('submission48.csv', index=False, header=True)
# NOTE(review): excerpt starts mid-constructor; the arguments below close a
# model built above this chunk.
random_state=2018,
n_jobs=8)
svc = SVC(kernel='rbf', random_state=2018, probability=True, gamma='auto')
lr = LogisticRegression(max_iter=1000, solver='lbfgs', penalty='l2', n_jobs=8)
models = [rf, xgb, lgb, svc]
# Hand-rolled stacking implementation (project helper) ...
y_pred_self, y_prob_self = StackingModels(models=models,
                                          meta_model=lr,
                                          X_train=X_train,
                                          X_test=X_test,
                                          y_train=y_train)
acc = accuracy_score(y_test, y_pred_self)
auc = roc_auc_score(y_test, y_prob_self)
print('MyModel: ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc))
# ... compared against mlxtend's StackingCVClassifier on the same data.
stack_clf = StackingCVClassifier(classifiers=models, meta_classifier=lr,
                                 cv=5).fit(X_train, y_train)
y_pred_mxltend, y_prob_mxltend = stack_clf.predict(
    X_test), stack_clf.predict_proba(X_test)[:, -1]
acc = accuracy_score(y_test, y_pred_mxltend)
auc = roc_auc_score(y_test, y_prob_mxltend)
print('Mlxtend: ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc))
# Synthetic regression benchmark for the regressor comparison that follows.
X, y = make_regression(n_samples=5000,
                       n_features=20,
                       n_informative=18,
                       random_state=2018)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=2018)
# NOTE(review): fit_transform is applied to the test split too, i.e. the
# scaler is re-fitted on test data (leakage); transform() would be correct.
X_train, X_test = map(scaler.fit_transform, [X_train, X_test])
rf = RandomForestRegressor(n_estimators=50,
# NOTE(review): excerpt starts mid-constructor; `n_jobs=-1)` closes a
# classifier defined above this chunk.
n_jobs=-1)
ensemble = [('ex_cls', ex_cls), ('rf2', rf2_clf), ('rf', rf_clf)]
# Stack the labelled ensemble members; rf_clf doubles as the meta-learner.
stack = StackingCVClassifier(classifiers=[clf for label, clf in ensemble],
                             meta_classifier=rf_clf,
                             cv=5,
                             use_probas=True,
                             use_features_in_secondary=True,
                             verbose=1)
# HOLD-OUT
X_train, X_valid, y_train, y_valid = train_test_split(X.values,
                                                      y.values,
                                                      train_size=0.8,
                                                      test_size=0.2,
                                                      random_state=42)
stack = stack.fit(X_train, y_train)
pr = stack.predict(X_valid)
# MAE
# NOTE(review): mean_absolute_error over class labels is only meaningful if
# the labels are ordinal -- confirm intent.
y_nump = np.array(y_valid)
mae = mean_absolute_error(pr, y_valid)
print("Mean Absolute Error:", mae)
print("Good predicted: ", np.sum(pr == y_nump), "of: ", y_valid.shape[0])
print("Accuracy Score: ", accuracy_score(pr, y_valid) * 100)
# PREDICTION OF TEST
pr_final_test = stack.predict(X_test_full)
print(pr_final_test.shape)
# NOTE(review): excerpt starts inside a list literal of base learners
# (`clfList = [...]`) opened above this chunk.
ExtraTreesClassifier(n_estimators=1000, max_depth=2, n_jobs=8),
ExtraTreesClassifier(n_estimators=1000, max_depth=4, n_jobs=8),
ExtraTreesClassifier(n_estimators=1000, max_depth=10, n_jobs=8),
ExtraTreesClassifier(n_estimators=1000, max_depth=30, n_jobs=8),
]
# Meta-learner (named `lr` but actually another ExtraTrees model).
lr = ExtraTreesClassifier(n_estimators=1000, max_depth=30, n_jobs=8)
model = StackingCVClassifier(classifiers=clfList,
                             use_probas=True,
                             use_features_in_secondary=True,
                             meta_classifier=lr,
                             cv=20,
                             random_state=15,
                             verbose=1)
model.fit(x_train, y_train)
#y_pred = sclf.predict(x_test)
#score(y_pred, y_test)
#model = load("../models/catboost_model.pkl")
# Validation- and test-set metrics via the project scoring helper.
y_pred = model.predict(x_val)
scores = get_all_scores(y_pred, y_val)
print(scores)
y_pred = model.predict(x_test)
scores = get_all_scores(y_pred, y_test)
print(scores)
probas_test = model.predict_proba(x_test)
save(model, "../models/ex_stack.pkl")
print("done")
# Plain sklearn stacking of the two grid-searched KNN pipelines.
clf = StackingClassifier(estimators=[gs_cv_knc, gs_tfidf_knn],
                         final_estimator=LogisticRegression(
                             class_weight='balanced',
                             multi_class='multinomial'))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# ('knn', gs_tfidf_knn), ('knc', gs_cv_knc)
# Same ensemble through mlxtend's cross-validated stacking.
sclf = StackingCVClassifier(classifiers=[gs_cv_knc, gs_tfidf_knn],
                            meta_classifier=LogisticRegression(
                                class_weight='balanced',
                                multi_class='multinomial'),
                            random_state=15)
sclf.fit(X_train, y_train)
# BUG(review): predict is called on y_test (the labels); it should almost
# certainly be sclf.predict(X_test).
y_pred = sclf.predict(y_test)
# ENSEMBLE - VoteClassifier:
models_list = [
    gs_cv_cnb, gs_tfidf_cnb, gs_cv_knn, gs_tfidf_knn, gs_cv_log,
    gs_tfidf_log, gs_cv_rfc, gs_tfidf_rfc, gs_cv_knc, gs_tfid_knc,
    gs_cv_sgd, gs_tfidf_sgd
]
models_labels = [
    'cv_cnb', 'tfidf_cnb', 'cv_knn', 'tfidf_knn', 'cv_log', 'tfidf_log',
    'cv_rfc', 'tfidf_rfc', 'cv_knc', 'tfidf_knc', 'cv_sgd', 'tfidf_sgd'
]
#create all possible combinations of models
models_comb = list(itertools.combinations(
    models_list, 3))
labels_comb = list(itertools.combinations(
# Feature matrix: every column after the label column.
X = train_np[:, 1:]
# train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3)

# Test-set features (all columns of the test frame).
holdout_np = test_data.values
holdout_x = holdout_np[:, 0:]

# Level-1 learners plus an XGBoost meta-learner.
logit = LogisticRegression(C=0.8, penalty='l2', tol=1e-6)
tree = DecisionTreeClassifier(max_depth=20)
forest = RandomForestClassifier(n_estimators=5000)
meta_xgb = XGBClassifier(learning_rate=0.001, n_estimators=5000, max_depth=30,
                         objective='binary:logitraw')
gbc = GradientBoostingClassifier(learning_rate=0.001, n_estimators=5000,
                                 max_depth=30)

stacker = StackingCVClassifier(
    classifiers=[logit, tree, forest, SVC(probability=True)],
    meta_classifier=meta_xgb,
    use_probas=True)
stacker.fit(X, y)
print(stacker.score(X, y))

# Build the Kaggle-style submission frame.
test_id = pd.read_csv('data/origin/test.csv')
predictions = stacker.predict(holdout_x)
result = pd.DataFrame({'PassengerId': test_id['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("data/predictions/stacking_test4.csv", index=False)

# Compare against a previous submission file.
answer_np = pd.read_csv('data/predictions/submission.csv')['Survived'].values
print('acc = %.5f' % accuracy_score(answer_np, predictions))
# from calculate_acc import calculate_acc
# calculate_acc(predictions)
# NOTE(review): excerpt starts inside a dict literal mapping model names to
# classifier instances (opened above this chunk).
"CatBoost": classifier4,
"ET": classifier5,
"Stack": sclf}
# Train classifiers
for key in classifiers:
    # Get classifier
    classifier = classifiers[key]
    # Fit classifier
    classifier.fit(X_train, y_train)
    # Save fitted classifier
    classifiers[key] = classifier
# NOTE(review): `pred` below is never used again -- `pred_stack` further down
# comes from the results frame instead.
pred = sclf.predict(X_test)
# Get results
results = pd.DataFrame()
for key in classifiers:
    # Make prediction on test set
    y_pred = classifiers[key].predict(X_test)
    # Save results in pandas dataframe object
    results[f"{key}"] = y_pred
# Add the test set to the results object
results["Target"] = y_test
pred_stack = results['Stack']
score = f1_score(y_test, pred_stack)
sgd = SGDClassifier(eta0=1, max_iter=1000, tol=0.0001, alpha=0.01, l1_ratio=1.0, learning_rate='adaptive', loss='log', penalty='elasticnet') # set up the meta classifier (level 2 model) from sklearn.linear_model import LogisticRegression from mlxtend.classifier import StackingCVClassifier np.random.seed(RANDOM_SEED) lr = LogisticRegression(max_iter=1000, class_weight='balanced', penalty='l1', C=0.1, solver='liblinear') sclf = StackingCVClassifier(classifiers=[knn, rf, nb, svc, sgd, lgbm], use_probas=True, use_features_in_secondary=True, meta_classifier=lr, cv=6) sclf.fit(train, targets) preds = sclf.predict(test) print(preds) exit(0) # Set up K-Fold cross validation and predictions from sklearn.metrics import roc_auc_score from sklearn.model_selection import KFold num_folds = 6 folds = KFold(n_splits=num_folds, shuffle=True) test_result = np.zeros(len(test)) auc_score = 0 for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, targets)):
# Dispatch: run every registered classifier, one hand-picked classifier, the
# custom ensemble, or the stacking meta-classifier, per the SINGLE/ALL/META
# flags set above this excerpt.
if (SINGLE):
    if (ALL):
        # Fit/predict every classifier in parallel via the project helper.
        Preds = Parallel(n_jobs=-1, verbose=1, backend="threading")(
            delayed(pf.Classification_Model)(data_training=vec_training,
                                             target_training=out_train,
                                             data_testing=vec_testing,
                                             Classifier=Model[1],
                                             target_testing=out_test,
                                             ModelName=Model[0],
                                             accur=True,
                                             grph=False,
                                             setClass=clas,
                                             show=False)
            for Model in ListAllClassifiers)
    else:
        # Single hand-picked classifier (index 10 of the registry).
        Preds = pf.Classification_Model(data_training=vec_training,
                                        target_training=out_train,
                                        data_testing=vec_testing,
                                        Classifier=ListAllClassifiers[10][1],
                                        target_testing=out_test,
                                        ModelName=ListAllClassifiers[10][0],
                                        accur=True,
                                        grph=True,
                                        setClass=clas,
                                        show=False)
elif (META == False):
    Preds = pf.Classification_Model(data_training=vec_training,
                                    target_training=out_train,
                                    data_testing=vec_testing,
                                    Classifier=EnsembleCustom[0][1],
                                    target_testing=out_test,
                                    ModelName=EnsembleCustom[0][0],
                                    accur=True,
                                    grph=True,
                                    setClass=clas,
                                    show=False)
else:
    # Stacking meta-classifier path.
    MetaClass.fit(vec_training, out_train)
    Preds = MetaClass.predict(vec_testing)
    pf.ClassReport_Graph(Classif=MetaClass,
                         Data_train=vec_training,
                         Target_train=out_train,
                         Data_test=vec_testing,
                         Target_test=out_test,
                         Class=clas,
                         ModelName='Stacking CV Classifier',
                         Accur=True,
                         Predict=Preds)
runingTime = timeit.default_timer() - tStart  #Stopping clock and getting time spent
print("Fitting and predictions done in %0.4fs." % runingTime)
print("=" * 100)
pf.Get_ConfusionMatrix(TrueLabels=out_test,
                       PredictedLabels=Preds,
                       Classes=clas,
                       Normal=True,
                       Title='Confusion matrix',
                       ColorMap='rainbow',
                       FigSize=(30, 30),
                       save=False)
#r = pd.DataFrame({"SGDC":ResultPreds[0], "Bernoulli":ResultPreds[1], "Multinomial":ResultPreds[2], "RandomForest":ResultPreds[3], "ExtraTrees":ResultPreds[4], "GradientBoosting":ResultPreds[5], "AdaBoosting":ResultPreds[6]})
#
##################################################################################################################################
''' DEFAULT PARAMETERS FOR THE DIFFERENT CLASSIFIERS AVAILABLE '''
################################################################################################################################## """
# Slope-adjusted (Euclidean) distance to hydrology.
test['slope_hyd'] = (test['Horizontal_Distance_To_Hydrology']**2 +
                     test['Vertical_Distance_To_Hydrology']**2)**0.5
test.slope_hyd = test.slope_hyd.map(
    lambda x: 0 if np.isinf(x) else x)  # remove infinite value if any
#Mean distance to Amenities
test['Mean_Amenities'] = (test.Horizontal_Distance_To_Fire_Points +
                          test.Horizontal_Distance_To_Hydrology +
                          test.Horizontal_Distance_To_Roadways) / 3
#Mean Distance to Fire and Water
test['Mean_Fire_Hyd'] = (test.Horizontal_Distance_To_Fire_Points +
                         test.Horizontal_Distance_To_Hydrology) / 2
# Features = every column except the target and the row id.
feature = [col for col in train.columns if col not in ['Cover_Type', 'Id']]
X_train = train[feature]
X_test = test[feature]
# Tree-based base learners with a linear-SVM meta-learner.
c1 = ensemble.ExtraTreesClassifier(n_estimators=150, bootstrap=True)
c2 = ensemble.RandomForestClassifier(n_estimators=150, bootstrap=True)
c3 = XGBClassifier()
meta = svm.LinearSVC()
# use_probas=True requires predict_proba on the *base* learners only, so a
# LinearSVC meta-learner is acceptable here.
etc = StackingCVClassifier(classifiers=[c1, c2, c3],
                           use_probas=True,
                           meta_classifier=meta)
etc.fit(X_train.values, train['Cover_Type'].values)
sub = pd.DataFrame({
    "Id": test['Id'],
    "Cover_Type": etc.predict(X_test.values)
})
sub.to_csv("stackcv_linearsvc.csv", index=False)
#voting ensemlbe ensemble = VotingClassifier(estimators, voting='soft', weights=[1, 1, 1]) ensemble.fit(X_train, y_train) pred = ensemble.predict(X_test) print("predicted values----------:", pred) pickle.dump(ensemble, open('ensemble-clf.sav', 'wb')) # pred_op = ensemble.predict(otpt) # print("Predicted values:" ,pred_op) print('fscore:{0:.3f}'.format(f1_score(y_test, pred, average='micro'))) #meta classifier ensemble stack = StackingCVClassifier(classifiers=[mlp, xgb, rf], meta_classifier=lr, use_probas=True) stack.fit(X_train.values, y_train.values) pred2 = stack.predict(X_test.values) print("predicted values: ", pred2) print('fscore:{0:.3f}'.format(f1_score(y_test, pred2, average='micro'))) from sklearn.metrics import confusion_matrix confusion_lr = confusion_matrix(y_test, pred) pickle.dump(stack, open('stack-clf.sav', 'wb')) print(confusion_lr) #################################################################################################################### # #REPORT AND PLOT MICRO-AVERAGE ROC AUC FOR EACH MODEL # from sklearn.preprocessing import label_binarize # import matplotlib.pyplot as plt # from itertools import cycle # from sklearn.multiclass import OneVsRestClassifier # from scipy import interp # # Binarize the output
# x_train = x_train.reset_index(drop=True) # x_vali = x_vali.reset_index(drop=True) y_train = y_train.reset_index(drop=True) # y_vali = y_vali.reset_index(drop=True) """===================================================================================================================== 2 模型融合; 学习参考:https://blog.csdn.net/LAW_130625/article/details/78573736 """ lr_clf = clfs["lr"] # meta_classifier svm_clf = clfs["svm_ploy"] rf_clf = clfs["rf"] xgb_clf = clfs["xgb"] lgb_clf = clfs["lgb"] sclf = StackingCVClassifier( classifiers=[lr_clf, svm_clf, rf_clf, xgb_clf, lgb_clf], meta_classifier=lr_clf, use_probas=True, verbose=3) sclf.fit(x_train, y_train) print("测试模型 & 模型参数如下:\n{0}".format(sclf)) print("=" * 20) pre_train = sclf.predict(x_train) print("训练集正确率: {0:.4f}".format(sclf.score(x_train, y_train))) print("训练集f1分数: {0:.4f}".format(f1_score(y_train, pre_train))) print("训练集auc分数: {0:.4f}".format(roc_auc_score(y_train, pre_train)))
# In[ ]: stacker = SCVC(classifiers=[clf1, clf2, clf3, clf1], meta_classifier=meta_clf, use_probas=True, use_features_in_secondary=True) # In[ ]: for c in train.columns: train[c] = train[c].fillna(train[c].median()) test[c] = test[c].fillna(train[c].median()) stacker.fit(train.values, np.array(Y)) # In[ ]: my_prediction = stacker.predict(test.values) # In[ ]: # PassengerId,Survived submission = pd.DataFrame() submission['PassengerId'] = test.index.tolist() submission['Survived'] = my_prediction # In[ ]: submission.to_csv("submission.csv", index=False) # In[ ]:
# NOTE(review): excerpt starts mid-statement; the line below closes a print/
# logging call opened above this chunk.
np.around(score, 3)))
print('> Fitting stack')
stack = StackingCVClassifier(
    classifiers=[ab_clf, rf_clf, xgb_clf, et_clf, lg_clf],
    meta_classifier=rf_clf,
    cv=5,
    stratify=True,
    shuffle=True,
    use_probas=True,
    use_features_in_secondary=True,
    verbose=1,
    random_state=12345,
    n_jobs=-1)
stack = stack.fit(X_train, y_train)
X_test = np.array(X_test)
print('> Making predictions')
pred = stack.predict(X_test)
# NOTE(review): classification_report expects (y_true, y_pred); the arguments
# here are swapped, so per-class precision and recall are interchanged.
print(classification_report(pred, y_test, labels=None))
#predictions = pd.Series(pred, index=X_test.index, dtype=y_train.dtype)
# ======================================================================== #
# Drop zero-variance features from the training frame.
sel = VarianceThreshold(threshold=0)
df_train_new = sel.fit_transform(df_train)
#sel.get_support(df_train)
sel.get_support(indices=True)
eclf = VotingClassifier(estimators=[('rf',rf),('lr',lr),('gb',gb)],voting='soft', weights=[3,2,3]) # Building and running the StackingClassifier on the test data from mlxtend.classifier import StackingCVClassifier sclf=StackingCVClassifier(classifiers=[rf,lr,gb,et,gnb,svc,knn,xgb,ada,mlp,lda,qda], use_features_in_secondary=True, use_probas=True, meta_classifier=eclf) cmetrics=[] cmetrics.append(cross_val_score(sclf,X.values,y.values,cv=5,scoring='accuracy').mean()) cmetrics.append(cross_val_score(sclf,X.values,y.values,cv=5,scoring='precision').mean()) cmetrics.append(cross_val_score(sclf,X.values,y.values,cv=5,scoring='recall').mean()) cmetrics.append(cross_val_score(sclf,X.values,y.values,cv=5,scoring='roc_auc').mean()) sclf.fit(X.values,y.values) pred=sclf.predict(Xt.values) # plotting ROC-Curve pred_proba=sclf.predict_proba(Xt.values)[:,1] fpr, tpr, threshold = roc_curve(yt, pred_proba) roc_auc=auc(fpr,tpr) plt.title('Receiver Operating Characteristic') plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc) plt.legend(loc = 'lower right') plt.plot([0, 1], [0, 1],'r--') plt.xlim([0, 1]) plt.ylim([0, 1]) plt.ylabel('True Positive Rate') plt.xlabel('False Positive Rate') plt.savefig('ROC_curve_test.png',bbox_inches='tight') plt.clf()
# Fitting stack from mlxtend.classifier import StackingCVClassifier stack = StackingCVClassifier(classifiers=[ab_clf, et_clf, lg_clf, bag_clf, rf_clf], meta_classifier=rf_clf, cv=10, stratify=True, shuffle=True, use_probas=True, use_features_in_secondary=True, verbose=0, random_state=randomstate) stack = stack.fit(x, y) print("Completed modeling!") #make predictions y_predict = stack.predict(x_predict) y_predict = pd.Series(y_predict, index=x_predict.index, dtype=y.dtype) print("Completed predictions!") # Save predictions to a file for submission output = pd.DataFrame({'Id': Ids, 'Cover_Type': y_predict}) output.to_csv('submission.csv', index=False) #create a link to download the file from IPython.display import FileLink FileLink(r'submission.csv')