def stacking():
    # Building and fitting
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    stack = StackingClassifier(classifiers=[clf1, clf2, clf3],
                               meta_classifier=lr)
    stack.fit(X_train, y_train)

    # Make class predictions for the testing set
    y_pred_class = stack.predict(X_test)

    print('########### Stacking ###############')

    accuracy_score = evalClassModel(stack, y_test, y_pred_class, True)

    # Data for final graph
    methodDict['Stacking'] = accuracy_score * 100


stacking()
def runExperiment(bases, experimentName):
    metric = 'precision'
    meta_classifier = GaussianNB()
    input = x
    for i in [10, 15, 20]:
        experiment = '\n*** Stacking - {} - {} base classifiers ***'.format(
            experimentName, i)

        # Build a pool of at least i pipelines, cycling through the base
        # learners, each fit on a random subset of columns
        base_classifiers = []
        while len(base_classifiers) < i:
            for b in bases:
                pipe = make_pipeline(ColumnSelector(cols=getRandomCols()), b)
                base_classifiers.append(pipe)

        test_scores = {'score': [], 'diversity': []}
        for j in range(10):
            ensemble = StackingClassifier(classifiers=base_classifiers,
                                          meta_classifier=meta_classifier)
            cv_scores = cross_validate(
                ensemble, input, y,
                scoring={'score': metric, 'diversity': diversity},
                cv=KFold(n_splits=10))
            test_scores['score'].append(cv_scores['test_score'].mean())
            test_scores['diversity'].append(cv_scores['test_diversity'].mean())

        report(experiment, test_scores)
def runExperiment(base_classifiers, experimentName):
    if len(base_classifiers) != 5:
        raise ValueError('You must provide exactly 5 base classifiers')
    metric = 'precision'
    meta_classifier = GaussianNB()
    input = x
    for i in [10, 15, 20]:
        # Replicate the 5 base classifiers until roughly i copies exist
        classifiers = []
        for j in range(1, i // len(base_classifiers) + 1):
            classifiers.extend(base_classifiers)
        experiment = '\n*** Stacking - {} - {} base classifiers ***'.format(
            experimentName, len(classifiers))

        test_scores = {'score': [], 'diversity': []}
        for j in range(10):
            # The replicated list is what the experiment name reports, so it
            # (not the raw base_classifiers) is passed to the ensemble
            ensemble = StackingClassifier(classifiers=classifiers,
                                          meta_classifier=meta_classifier)
            cv_scores = cross_validate(
                ensemble, input, y,
                scoring={'score': metric, 'diversity': diversity},
                cv=KFold(n_splits=10))
            test_scores['score'].append(cv_scores['test_score'].mean())
            test_scores['diversity'].append(cv_scores['test_diversity'].mean())

        report(experiment, test_scores)
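# Hedged sketch: the `diversity` scorer referenced in both runExperiment
# variants above is defined elsewhere in the project. A custom scorer of
# that shape would typically be built with sklearn.metrics.make_scorer;
# the disagreement measure below is purely illustrative, not the authors'
# actual metric.
from sklearn.metrics import make_scorer
import numpy as np

def _disagreement(y_true, y_pred):
    # Toy stand-in: fraction of samples the ensemble misclassifies
    return float(np.mean(np.asarray(y_true) != np.asarray(y_pred)))

diversity_example = make_scorer(_disagreement)
# e.g. scoring={'score': 'precision', 'diversity': diversity_example}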
def __init__(self, models_instructions):
    '''models_instructions should be a list of tuples [(Model, list_of_features)]'''
    from mlxtend.classifier import StackingClassifier
    from mlxtend.feature_selection import ColumnSelector
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold

    # Newer scikit-learn requires shuffle=True whenever random_state is set
    self.cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    self.models_instructions = models_instructions
    models = [
        Pipeline([('ColumnSelect', ColumnSelector(v[1])),
                  ('Model', v[0].clf)])
        for v in models_instructions
    ]
    self.models = models
    self.clf_stack = Model(clf=StackingClassifier(
        classifiers=models,
        meta_classifier=LogisticRegression()),
        name='Stacked ensemble')
def stacking(self, x_train, y_train, meta_clf_label=None, save=True,
             name='stack_model.pkl'):
    meta_clf = self.model_util.get_meta_model(meta_clf_label)
    meta_param = self.model_util.get_meta_param_set(meta_clf_label)
    stack_clf = StackingClassifier(
        classifiers=self.get_basic_models(self.selected_models),
        use_probas=False,
        average_probas=False,
        meta_classifier=meta_clf)
    self.best_model = self.model_param_tune(x_train, y_train, stack_clf,
                                            meta_param)
    if save:
        self.model_util.save_model(
            self.best_model, os.path.join(self.model_save_path, name))
    return self.best_model
def get_model(model_type): if model_type == "knn": model = KNeighborsClassifier() elif model_type == "naive_bayes": model = MultinomialNB() elif model_type == "logistic_regression": model = LogisticRegression() elif model_type == "svm": model = SVC(kernel='linear') elif model_type == "decision_tree": model = DecisionTreeClassifier() elif model_type == "adaboost": model = AdaBoostClassifier() elif model_type == "random_forest": model = RandomForestClassifier(n_estimators=300) elif model_type == "gbdt": model = GradientBoostingClassifier() elif model_type == "xgboost": model = XGBClassifier() elif model_type == "mlp": model = MLPClassifier() elif model_type == 'bagging': clf1 = LogisticRegression(random_state=0) clf2 = XGBClassifier(random_state=0) clf3 = SVC(random_state=0, kernel='linear', probability=True) clf4 = MLPClassifier(random_state=0) model = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4], weights=[1, 2, 2, 1], voting='soft', verbose=2) elif model_type == 'stacking': clf1 = XGBClassifier(random_state=0) clf2 = SVC(random_state=0, kernel='linear', probability=True) clf3 = MLPClassifier(random_state=0) lr = LogisticRegression() model = StackingClassifier(classifiers=[clf1, clf2, clf3], use_probas=True, average_probas=False, meta_classifier=lr) return model
def test_mlxtend():
    data = load_iris()
    X = data.data    # shape (150, 4)
    y = data.target  # shape (150,)
    clfs = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=-1,
                               criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1,
                             criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                   max_depth=6, n_estimators=100)
    ]
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=clfs, meta_classifier=lr)
    scores = model_selection.cross_val_score(sclf, X, y, cv=3,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), 'stacking'))

    print('3-fold cross validation:\n')
    # Labels now match the actual base learners above
    for clf, label in zip(clfs, ['RF (gini)', 'RF (entropy)',
                                 'ExtraTrees (gini)', 'ExtraTrees (entropy)',
                                 'GradientBoosting']):
        scores = model_selection.cross_val_score(clf, X, y, cv=3,
                                                 scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), label))
def NN_Stacking(X_train, X_test, y_train, y_test):
    model_name = 'NN Stacking'
    input_dim = len(X_train.columns)

    # Meta-learner: with use_probas=True and average_probas=False the
    # meta-features are n_classifiers * n_classes wide, hence
    # input_dim=38 * 4 (38 classes, 4 base models) for clf1
    # create_model_NN_1HiddenLayer(input_dim=38*4, nodes_l1=200, dropout_l1=0.2)
    clf1 = KerasClassifier(build_fn=create_model_NN_1HiddenLayer,
                           input_dim=38 * 4,
                           epochs=30,
                           batch_size=1000,
                           verbose=2)
    clf2 = KerasClassifier(build_fn=create_model_NN_2HiddenLayers,
                           input_dim=input_dim,
                           epochs=15,
                           batch_size=1000,
                           verbose=2)
    clf3 = KerasClassifier(build_fn=create_model_NN_3HiddenLayers,
                           input_dim=input_dim,
                           epochs=15,
                           batch_size=1000,
                           verbose=2)
    clf4 = KerasClassifier(build_fn=create_model_NN_4HiddenLayers,
                           input_dim=input_dim,
                           epochs=15,
                           batch_size=1000,
                           verbose=2)

    # Four base learners (clf2 appears twice, which keeps the meta input
    # width at 4 base models); clf1 serves as the meta-classifier
    sclf = StackingClassifier(classifiers=[clf2, clf2, clf3, clf4],
                              use_probas=True,
                              average_probas=False,
                              meta_classifier=clf1)

    start = datetime.now()
    print('Fitting %s...' % model_name)
    model = sclf.fit(X_train, y_train)  # fit() returns the fitted sclf
    end = datetime.now()
    print('Finished fitting {} in {} seconds'.format(model_name,
                                                     str(end - start)))
    return sclf, model
def stacking_model2(self, X_train, X_test, y_train, bst_xgb, bst_forest,
                    bst_gradient, bst_lgb):
    '''
    Combine four algorithms.
    :param X_train: training set
    :param X_test: test set
    :param y_train: training labels
    :param bst_xgb: XGBoost model with its best parameters
    :param bst_forest: random forest model with its best parameters
    :param bst_gradient: gradient boosting model with its best parameters
    :param bst_lgb: LightGBM model with its best parameters
    :return: predicted probabilities for the positive class
    '''
    lr = linear_model.LogisticRegression(random_state=7)
    sclf = StackingClassifier(
        classifiers=[bst_xgb, bst_forest, bst_gradient, bst_lgb],
        use_probas=True,
        average_probas=False,
        meta_classifier=lr)
    sclf.fit(X_train, y_train)
    predictions = sclf.predict_proba(X_test)[:, 1]
    return predictions
def test_get_params():
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)

    got = sorted(list({s.split('__')[0] for s in sclf.get_params().keys()}))
    expect = ['average_probas',
              'classifiers',
              'gaussiannb',
              'kneighborsclassifier',
              'meta-logisticregression',
              'meta_classifier',
              'randomforestclassifier',
              'store_train_meta_features',
              'use_clones',
              'use_features_in_secondary',
              'use_probas',
              'verbose']
    assert got == expect, got
def get_model(model_type): if model_type == "logistic_regression": model = LogisticRegression() # 快,准确率一般。val mean acc:0.91 elif model_type == "random_forest": model = RandomForestClassifier( n_estimators=300) # 速度还行,准确率一般。val mean acc:0.93125 elif model_type == "decision_tree": model = DecisionTreeClassifier() # 速度快,准确率低。val mean acc:0.62 elif model_type == "knn": model = KNeighborsClassifier() # 速度一般,准确率低。val mean acc:0.675 elif model_type == "bayes": model = MultinomialNB() # 速度快,准确率低。val mean acc:0.62 elif model_type == "xgboost": model = XGBClassifier() # 速度慢,准确率高。val mean acc:0.95 elif model_type == "svm": model = SVC(kernel='linear', probability=True) # 速度慢,准确率高,val mean acc:0.945 elif model_type == 'mlp': model = MLPClassifier() # 速度一般,准确率一般。val mean acc:0.89125 elif model_type == 'ensemble': clf1 = LogisticRegression(random_state=0) clf2 = XGBClassifier(random_state=0) clf3 = SVC(random_state=0, kernel='linear', probability=True) clf4 = MLPClassifier(random_state=0) model = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4], weights=[1, 2, 2, 1], voting='soft', verbose=2) elif model_type == 'stack': clf1 = XGBClassifier(random_state=0) clf2 = SVC(random_state=0, kernel='linear', probability=True) clf3 = MLPClassifier(random_state=0) lr = LogisticRegression() model = StackingClassifier(classifiers=[clf1, clf2, clf3], use_probas=True, average_probas=False, meta_classifier=lr) return model
def main():
    data_train, data_test = loadData()
    data_train = brain(data_train)
    data_test = brain(data_test)
    data_train = data_train.drop(['AgePclass'], axis=1)
    data_test = data_test.drop(['AgePclass'], axis=1)

    models = {
        'LogisticReg': LogisticRegression(max_iter=500, tol=0.0001,
                                          penalty='l2', solver='lbfgs'),
        'svc': SVC(max_iter=200, kernel='rbf', gamma=0.5, C=5),
        'KNN': KNeighborsClassifier(n_neighbors=9),
        'LinearSvc': LinearSVC(max_iter=250, penalty='l2', C=0.5),
        'decisionTree': DecisionTreeClassifier(max_depth=4),
        'randomTree': RandomForestClassifier(n_estimators=100, max_depth=3,
                                             max_features=4,
                                             min_samples_leaf=20,
                                             random_state=0),
        'gbdt': GradientBoostingClassifier(n_estimators=500, max_depth=3,
                                           learning_rate=0.1, random_state=0),
        'adaboost': AdaBoostClassifier(n_estimators=300, learning_rate=0.75,
                                       random_state=0),
        'extract': ExtraTreesClassifier(n_estimators=250, n_jobs=-1,
                                        max_depth=5, random_state=0),
        'gnb': GaussianNB(),
    }

    stackModel = StackingClassifier(classifiers=[models['decisionTree'],
                                                 models['gbdt'],
                                                 models['adaboost'],
                                                 models['extract']],
                                    meta_classifier=models['randomTree'])

    # for key in models:
    #     scores, clf = train(data_train, models[key])
    #     print("model: {0} scores: {1}".format(key, scores))
    # clf = SVC(max_iter=200, kernel='rbf', gamma=0.5, C=5)
    # parameters = {"n_estimators": [200, 500, 800, 1000],
    #               'min_samples_leaf': [2, 4, 6],
    #               "max_depth": [3, 5, 8], "random_state": [0]}
    # clf = GridSearchCV(models['randomTree'], parameters)
    clf = train(data_train, models['randomTree'])
    test(data_test, clf)
def test_not_fitted():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_probas=True,
                              meta_classifier=meta)

    assert_raises(NotFittedError,
                  "This StackingClassifier instance is not fitted yet."
                  " Call 'fit' with appropriate arguments"
                  " before using this method.",
                  sclf.predict,
                  iris.data)

    assert_raises(NotFittedError,
                  "This StackingClassifier instance is not fitted yet."
                  " Call 'fit' with appropriate arguments"
                  " before using this method.",
                  sclf.predict_proba,
                  iris.data)

    assert_raises(NotFittedError,
                  "This StackingClassifier instance is not fitted yet."
                  " Call 'fit' with appropriate arguments"
                  " before using this method.",
                  sclf.predict_meta_features,
                  iris.data)
def stacking():
    from sklearn import datasets
    from sklearn import model_selection
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import (RandomForestClassifier,
                                  GradientBoostingClassifier)
    from mlxtend.classifier import StackingClassifier
    # import xgboost as xgb
    import numpy as np

    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    clf1 = KNeighborsClassifier(n_neighbors=5)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    clf4 = GradientBoostingClassifier(n_estimators=200, max_depth=6)
    lr = LogisticRegression()
    # sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
    #                           meta_classifier=lr)
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              use_probas=True,
                              meta_classifier=lr)

    print('3-fold cross validation:\n')
    for clf, label in zip([clf1, clf2, clf3, clf4, sclf],
                          ['KNN', 'Random Forest', 'Naive Bayes',
                           'GradientBoostingClassifier',
                           'StackingClassifier']):
        scores = model_selection.cross_val_score(clf, X, y, cv=3,
                                                 scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), label))
def model_select_param(self, model='LR'):
    print('Model Select Parameters')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    lr = self.model_init(model)
    clf1 = self.model_init('KNN')
    clf2 = self.model_init('RFC')
    clf3 = self.model_init('GNB')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)
    params = {'kneighborsclassifier__n_neighbors': [1, 5],
              'randomforestclassifier__n_estimators': [10, 50],
              'meta-logisticregression__C': [0.1, 10.0]}
    # The original passed the undefined name param_grid here
    gs = GridSearchCV(sclf, param_grid=params, cv=5, scoring='roc_auc',
                      n_jobs=5)
    data = self.train.values.copy()
    label = self.train_label['label'].values.copy()
    label = label.reshape(label.shape[0])
    gs.fit(data, label)
    print('Model: {0}'.format(model))
    print(gs.cv_results_)  # grid_scores_ was removed in scikit-learn 0.20
    print(gs.best_score_)
    print(gs.best_params_)
    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    auc_train = self.model_cross_validation(model, gs.best_params_)
    auc_test = self.model_test(model, gs.best_params_)
    return auc_train, auc_test
def run_glass_experiments_ensemble(data):
    glass_X = data.drop(['Id', 'Type'], axis=1)
    glass_y = data.loc[:, 'Type']

    bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=100)
    boosting = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),
                                  n_estimators=100)
    stacking = StackingClassifier(
        classifiers=[DecisionTreeClassifier() for _ in range(100)],
        meta_classifier=DecisionTreeClassifier())
    random_forest = RandomForestClassifier(n_estimators=100)
    xgboost = xgb.XGBClassifier()
    decision_tree = DecisionTreeClassifier()

    methods = {
        ' Bagging': bagging,
        ' Boosting': boosting,
        ' Stacking': stacking,
        ' Random Forest': random_forest,
        ' XGBoost': xgboost,
        ' Decision Tree': decision_tree
    }

    results = list()
    for method in methods:
        results_model = cross_validate(methods[method],
                                       glass_X,
                                       y=glass_y,
                                       cv=10,
                                       scoring=['accuracy'],
                                       return_train_score=True)
        results_model['method'] = method
        results_model['fold'] = np.arange(1, 11)
        results.append(pd.DataFrame(results_model))
    return pd.concat(results)
def stack_models(features, labels):
    """Stack GB and NB."""
    # The old cross_validation-style StratifiedShuffleSplit(labels, 100, ...)
    # is updated to the model_selection API; the splitter is passed as cv
    kfold = StratifiedShuffleSplit(n_splits=100, random_state=42)
    clf1 = GradientBoostingClassifier(learning_rate=0.1,
                                      min_samples_leaf=1,
                                      n_estimators=400,
                                      min_weight_fraction_leaf=0,
                                      min_samples_split=4,
                                      max_depth=2,
                                      random_state=42)
    # clf2 = AdaBoostClassifier()
    clf3 = GaussianNB()
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=[clf1, clf3], meta_classifier=lr)
    for clf, label in zip([clf1, clf3, sclf],
                          ['GB', 'Naive Bayes', 'StackingClassifier']):
        scores = model_selection.cross_val_score(clf, features, labels,
                                                 cv=kfold, scoring='f1')
        # Report F1, matching the scoring argument
        print("F1: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), label))
def test_get_params():
    np.random.seed(123)
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    lr = LogisticRegression(solver='liblinear', multi_class='ovr')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)

    got = sorted(list({s.split('__')[0] for s in sclf.get_params().keys()}))
    expect = ['average_probas',
              'classifiers',
              'drop_last_proba',
              'gaussiannb',
              'kneighborsclassifier',
              'meta_classifier',
              'randomforestclassifier',
              'store_train_meta_features',
              'use_clones',
              'use_features_in_secondary',
              'use_probas',
              'verbose']
    assert got == expect, got
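# Hedged follow-up: the top-level names asserted above are exactly the
# prefixes usable as grid-search keys. In this newer mlxtend version the
# meta-learner is addressed via 'meta_classifier__<param>' (the older
# release tested earlier used 'meta-logisticregression__<param>').
# *_demo names below are illustrative, not part of the test suite.
from sklearn.model_selection import GridSearchCV

sclf_demo = StackingClassifier(
    classifiers=[KNeighborsClassifier(n_neighbors=1),
                 RandomForestClassifier(n_estimators=10),
                 GaussianNB()],
    meta_classifier=LogisticRegression(solver='liblinear', multi_class='ovr'))
param_grid_demo = {'kneighborsclassifier__n_neighbors': [1, 5],
                   'meta_classifier__C': [0.1, 10.0]}
gs_demo = GridSearchCV(estimator=sclf_demo, param_grid=param_grid_demo, cv=5)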
# Bagged KNN
knn = BaggingClassifier(KNeighborsClassifier(),
                        n_estimators=40,
                        max_samples=0.5,
                        max_features=0.5)
knn.fit(X_train, y_train)
knn_predict = knn.predict(data_test)

# Stacking ensemble: random forest, gradient boosting trees, KNN
clf1 = rfc
clf2 = gbc
clf3 = knn
lr = LogisticRegression()
slf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
slf.fit(X_train, y_train)

# Cross-validation (sklearn.cross_validation was removed; cross_val_score
# now lives in sklearn.model_selection)
scores = cross_val_score(gbm, X, y, cv=5)

# Scores
dtc_score = dtc.score(X_test, y_test)
rfc_score = rfc.score(X_test, y_test)
# gbc_score = cross_val_score(gbc, X, y, cv=10)
gbc_score = gbc.score(X_test, y_test)
gbm_score = gbm.score(X_test, y_test)
knn_score = knn.score(X_test, y_test)
slf_score = slf.score(X_test, y_test)

# Print results
# lm_clf = BaggingClassifier(
#     LogisticRegression(), max_features=1.0, max_samples=0.2)
knn_clf = BaggingClassifier(KNeighborsClassifier(n_jobs=-1),
                            n_estimators=3,
                            max_features=1.0,
                            max_samples=0.3,
                            n_jobs=-1)
# nb_clf = BaggingClassifier(
#     GaussianNB(), n_estimators=3, max_features=1.0, max_samples=0.3,
#     n_jobs=-1)
# svc_clf = BaggingClassifier(
#     SVC(), n_estimators=3, max_features=1.0, max_samples=0.3, n_jobs=-1)
rf_clf = RandomForestClassifier(n_estimators=110,
                                max_depth=5,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                random_state=34,
                                n_jobs=-1)
st_clf = StackingClassifier(classifiers=[knn_clf, rf_clf],
                            meta_classifier=LogisticRegression(),
                            use_probas=True,
                            average_probas=False)

print('3-fold cross validation:\n')
for clf, label in zip([knn_clf, rf_clf, st_clf],
                      ['KNN', 'Random Forest', 'StackingClassifier']):
    scores = model_selection.cross_val_score(clf,
                                             train_data_pca,
                                             train_data.values[:, 0],
                                             cv=3,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

# ret = bagging_clf.predict(test_data.values)
predictionsxgb = xgb.predict(X_test)

import operator
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='adam', activation='tanh', random_state=0)
modelmlp = mlp.fit(X_train, y_train)
predictionmlp = mlp.predict(X_test)

# 4. Stacked Classifier
X = features1
y = target
clf1 = adb
clf2 = dtc
clf3 = svm1
meta = LogisticRegression()
# clf1-clf3 are the base learners and the logistic regression is the
# meta-classifier (the original swapped these two roles)
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=meta)
for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['clf1', 'clf2', 'clf3', 'SC']):
    scores = model_selection.cross_val_score(clf, X, y, cv=6,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

# 5. Data Visualization
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools

gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))
for clf, lab, grd in zip([clf1, clf2, clf3, sclf],
                         ['AdaBoost', 'Decision Tree Classifier',
                          'Support Vector Machine', 'Stacking Classifier'],
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    # Loop body completed following the standard mlxtend plotting pattern;
    # the original snippet was truncated here
    ax = plt.subplot(gs[grd[0], grd[1]])
    plot_decision_regions(X=X, y=y, clf=clf)
    plt.title(lab)
# --------- 5. Stacking ---------
from sklearn.datasets import load_iris

iris = load_iris()
X, Y = iris.data[:, 1:3], iris.target

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np

knn_clf = KNeighborsClassifier(n_neighbors=2)
RF_clf = RandomForestClassifier(random_state=1)
baye_clf = GaussianNB()
lr = LogisticRegression()
stack_clf = StackingClassifier(classifiers=[knn_clf, RF_clf, baye_clf],
                               meta_classifier=lr)

print('3-fold cross validation:\n')
# The stacked model itself is scored last (the original looped over lr
# instead of stack_clf despite the 'StackingClassifier' label)
for clf, label in zip([knn_clf, RF_clf, baye_clf, stack_clf],
                      ["KNN", "Random Forest", "Naive Bayes",
                       "StackingClassifier"]):
    scores = cross_val_score(clf, X, Y, cv=3, scoring='accuracy')
    print('Accuracy:%0.2f(+/-%0.2f)[%s]'
          % (scores.mean(), scores.std(), label))

# --------- 5. Stacking (on generated data) ---------
from sklearn import datasets
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier)
# sklearn.cross_validation was removed; these now live in model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
# sklearn.datasets.samples_generator was removed; import from sklearn.datasets
from sklearn.datasets import make_blobs
# make_blobs is a clustering-data generator; parameter details:
# https://blog.csdn.net/kevinelstri/article/details/52622960

'''Create the training dataset'''
data, target = make_blobs(n_samples=50000, centers=2, random_state=0,
                          cluster_std=0.60)
# baseline1 (xgboost) and baseline2 (lightgbm) are defined earlier
baseline3 = RandomForestClassifier(n_estimators=500,
                                   oob_score=True,
                                   n_jobs=1,
                                   random_state=1)
baseline4 = CatBoostClassifier(iterations=500,
                               depth=6,
                               learning_rate=0.033,
                               loss_function='Logloss',
                               logging_level='Verbose')
baseline5 = AdaBoostClassifier()
baseline6 = GaussianNB()
baseline7 = SVC(kernel='rbf', class_weight='balanced')
lr = XGBClassifier()
stackmodel = StackingClassifier(classifiers=[
    baseline1, baseline2, baseline3, baseline4, baseline5, baseline6,
    baseline7
], meta_classifier=lr)

#%%
for basemodel, label in zip([
        baseline1, baseline2, baseline3, baseline4, baseline5, baseline6,
        baseline7, stackmodel
], [
        'xgboost', 'lightgbm', 'Random Forest', 'Catboost', 'AdaBoost',
        'GaussianNB', 'SVC', 'stack'
]):
    # The original snippet is truncated mid-call; the call is closed here
    # with its remaining defaults
    scores = model_selection.cross_val_score(basemodel, train, target, cv=5)
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_soft_cv['test_score'].mean()*100)) print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_soft_cv['test_score'].std()*100*3)) print('-'*10)z """## Stacking #### ML Extend """ # Commented out IPython magic to ensure Python compatibility. from mlxtend.classifier import StackingClassifier lgbm_cl = LGBMClassifier(random_state=seed) rf_cl = RandomForestClassifier(10, random_state=seed) gdb_cl = GradientBoostingClassifier(random_state=seed) logreg = LogisticRegression() sclf = StackingClassifier(classifiers=[lgbm_cl, rf_cl,gdb_cl], meta_classifier=logreg) scores = model_selection.cross_val_score(sclf, data1_x_bin, data[Target], cv=3, scoring='f1') print("Accuracy: %0.2f (+/- %0.2f)" # % (scores.mean(), scores.std())) """#### VecStack""" #1st level model X_train, X_test, y_train, y_test = train_test_split(data1_x_bin, data[Target], test_size=0.2) models = [lgbm_cl,rf_cl,gdb_cl] S_train, S_test = stacking(models, X_train, y_train, X_test, regression = False, metric = metrics.f1_score, n_folds = 4 ,
# ======================== Adaboost END =========================

# ======================== Stacking =========================
model_log = LogisticRegression(penalty='l2',
                               C=10,
                               multi_class='multinomial',
                               class_weight='balanced',
                               solver='newton-cg')
model_sta = StackingClassifier(
    classifiers=[model_RF, model_SVM, model_Adb],
    meta_classifier=model_log,
)
model_sta.fit(X_train, y_train)
y_sta_pred = model_sta.predict(X_test)

print('The sta accuracy is:', accuracy_score(y_test, y_sta_pred))
print('The sta precision is:',
      metrics.precision_score(y_test, y_sta_pred, average='macro'))
print('The sta recall is:',
      metrics.recall_score(y_test, y_sta_pred, average='macro'))
print('The sta f1 score is:',
      metrics.f1_score(y_test, y_sta_pred, average='macro'))
def main():
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import f1_score
    from skynet import DATA_PATH
    from skynet.data_handling import read_learning_data
    from skynet.data_handling.preprocessing import PreProcessor
    from skynet.data_handling import get_init_response
    from skynet.data_handling import split_time_series
    from mlxtend.classifier import StackingClassifier

    icao = "RJFK"

    train = read_learning_data(DATA_PATH +
                               "/pickle/learning/skynet/train_%s.pkl" % icao)
    test = read_learning_data(DATA_PATH +
                              "/pickle/learning/skynet/test_%s.pkl" % icao)
    data = pd.concat([train, test]).reset_index(drop=True)

    preprocess = PreProcessor(norm=False, binary=False)
    preprocess.fit(data.iloc[:, :-1], data.iloc[:, -1])
    data = pd.concat([preprocess.X_train, preprocess.y_train], axis=1)

    date = data["date"].values
    spdata = split_time_series(data, date, level="month", period=2)

    for key in spdata:
        ext = spdata[key]
        target = get_init_response()
        feats = [f for f in ext.keys() if not (f in target + ["date"])]

        X = ext[feats]
        ss = StandardScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.keys())
        y = ext[target]
        X, y = balanced(X, y)

        spX, spy = preprocess.split(X, y, n_folds=5)
        for k in spy:
            print(np.unique(spy[k]))

        X = pd.concat([spX[n] for n in spX if n != 0]).reset_index(drop=True)
        y = pd.concat([spy[n] for n in spy if n != 0]).reset_index(drop=True)
        X_test = spX[0].reset_index(drop=True)
        y_test = spy[0].reset_index(drop=True)

        from sklearn.ensemble import RandomForestClassifier
        clf1 = RandomForestClassifier(max_features=2)
        clf2 = SkySVM()
        meta = LogisticRegression()

        # Training
        # (Note: not balanced)
        sta = SkyStacking((clf1, clf2), meta)
        sta.fit(X, y)
        p = sta.predict(X_test)

        clf1.fit(X.values, y.values[:, 0])
        print(np.array(X.keys())[np.argsort(clf1.feature_importances_)[::-1]])
        p_rf = clf1.predict(X_test.values)

        # Stacking via mlxtend
        sc = StackingClassifier(classifiers=[clf1, clf2],
                                meta_classifier=meta)
        sc.fit(X.values, y.values[:, 0])
        p_sc = sc.predict(X_test.values)

        y_test = np.where(y_test.values[:, 0] > 1, 0, 1)
        p = np.where(p > 1, 0, 1)
        p_rf = np.where(p_rf > 1, 0, 1)
        p_sc = np.where(p_sc > 1, 0, 1)

        f1 = f1_score(y_true=y_test, y_pred=p)
        print("stacking", f1)
        f1_rf = f1_score(y_true=y_test, y_pred=p_rf)
        print("random forest", f1_rf)
        f1_sc = f1_score(y_true=y_test, y_pred=p_sc)
        print("stacked classifier", f1_sc)

        if True:
            break
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=lr)

print('3-fold cross validation:\n')
for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['KNN', 'Random Forest', 'Naive Bayes',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, y, cv=3,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
def get_pipeline(classifier_name='BDT'):
    """Function to get classifier pipeline."""
    steps = []
    if classifier_name == 'RF':
        classifier = RandomForestClassifier(
            n_estimators=100, max_depth=6, n_jobs=20,
            # n_estimators=100, max_depth=7, min_samples_leaf=150, n_jobs=20,
            random_state=2)
        # steps.append was missing here; without it Pipeline(steps) fails
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost':
        classifier = XGBClassifier(n_estimators=125,
                                   nthread=10,
                                   silent=True,
                                   seed=2)
        steps.append(('classifier', classifier))  # was missing
    elif classifier_name == 'Ada':
        classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                                        n_estimators=100,
                                        learning_rate=0.1,
                                        random_state=2)
        # classifier = AdaBoostClassifier(n_estimators=50, learning_rate=0.1,
        #                                 random_state=2)
        steps.append(('classifier', classifier))  # was missing
    # elif classifier_name in ['GBDT', 'BDT']:
    #     classifier = GradientBoostingClassifier(
    #         loss='exponential', max_depth=3, n_estimators=100,
    #         random_state=2)
    #     # classifier = GradientBoostingClassifier(loss='deviance',
    #     #     max_depth=3, n_estimators=500, random_state=2)
    elif classifier_name == 'BDT_comp_IC79.2010':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=4,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC79.2010_2-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=4,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC86.2012_2-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=4,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC86.2012_3-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=3,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC86.2012_4-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=2,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif 'CustomClassifier' in classifier_name:
        hyperparams_str = classifier_name.split('_')[1:]
        assert len(hyperparams_str) == 3, (
            'Expected 3 CustomClassifier hyperparams, got {}.'.format(
                len(hyperparams_str)))
        p = float(hyperparams_str[0])
        neighbor_weight = float(hyperparams_str[1])
        num_groups = int(hyperparams_str[2])
        classifier = CustomClassifier(p=p,
                                      neighbor_weight=neighbor_weight,
                                      num_groups=num_groups,
                                      random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'RF_comp_IC86.2012_4-groups':
        classifier = RandomForestClassifier(max_depth=10,
                                            n_estimators=500,
                                            random_state=2,
                                            n_jobs=10)
        steps.append(('classifier', classifier))
    elif classifier_name == 'SVC_comp_IC86.2012_2-groups':
        classifier = SVC(C=0.5, random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'SVC_comp_IC86.2012_4-groups':
        classifier = SVC(C=0.5, random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'LinearSVC_comp_IC86.2012_2-groups':
        classifier = LinearSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'LinearSVC_comp_IC86.2012_4-groups':
        classifier = LinearSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'NuSVC_comp_IC86.2012_4-groups':
        classifier = NuSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost_comp_IC86.2012_2-groups':
        classifier = XGBClassifier(learning_rate=0.05,
                                   max_depth=7,
                                   n_estimators=150,
                                   # subsample=0.75,
                                   random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost_comp_IC86.2012_4-groups':
        classifier = XGBClassifier(max_depth=2,
                                   n_estimators=100,
                                   # subsample=0.75,
                                   random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'LogisticRegression_comp_IC86.2012_4-groups':
        classifier = LogisticRegression(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'linecut_comp_IC86.2012_4-groups':
        classifier = LineCutClassifier()
        steps.append(('classifier', classifier))
    elif classifier_name == 'stacking_comp_IC86.2012_4-groups':
        classifiers = [SVC(random_state=2),
                       LinearSVC(random_state=2),
                       GradientBoostingClassifier(loss='deviance',
                                                  max_depth=2,
                                                  n_estimators=100,
                                                  random_state=2),
                       ]
        classifier = StackingClassifier(classifiers,
                                        meta_classifier=LogisticRegression())
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'voting_comp_IC86.2012_4-groups':
        # classifiers = [SVC(random_state=2),
        #                LinearSVC(random_state=2),
        #                GradientBoostingClassifier(loss='deviance',
        #                                           max_depth=2,
        #                                           n_estimators=100,
        #                                           random_state=2),
        #                ]
        estimators = [
            ('SVC', SVC(random_state=2)),
            # ('LinearSVC', LinearSVC(random_state=2)),
            ('LogisticRegression', LogisticRegression(random_state=2)),
            # ('BDT', GradientBoostingClassifier(loss='deviance',
            #                                    max_depth=2,
            #                                    n_estimators=100,
            #                                    random_state=2)),
            ('xgboost', XGBClassifier(max_depth=3,
                                      booster='gblinear',
                                      n_estimators=100,
                                      random_state=2))
        ]
        classifier = VotingClassifier(estimators, voting='hard')
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'RF_energy_IC79.2010':
        classifier = RandomForestRegressor(n_estimators=100,
                                           max_depth=8,
                                           n_jobs=10,
                                           random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'RF_energy_IC86.2012':
        classifier = RandomForestRegressor(n_estimators=100,
                                           max_depth=7,
                                           n_jobs=10,
                                           random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost_energy_IC86.2012':
        classifier = XGBRegressor(n_estimators=75,
                                  booster='gblinear',
                                  # subsample=0.75,
                                  random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'linearregression_energy_IC86.2012':
        reg = make_pipeline(PolynomialFeatures(2),
                            # StandardScaler(),
                            LinearRegression(),
                            )
        return reg
    elif classifier_name == 'SGD_comp_IC86.2012_2-groups':
        # clf = make_pipeline(StandardScaler(),
        #                     SGDClassifier(random_state=2, n_jobs=1),
        #                     )
        clf = make_pipeline(StandardScaler(),
                            SGDClassifier(loss='hinge',
                                          alpha=1e-3,
                                          max_iter=50,
                                          tol=1e-3,
                                          shuffle=True,
                                          random_state=2),
                            )
        return clf
    else:
        raise ValueError(
            '{} is not a valid classifier name'.format(classifier_name))

    # pipeline = Pipeline([
    #     # ('scaler', StandardScaler()),
    #     # ('pca', PCA(n_components=4, random_state=2)),
    #     # ('lda', LinearDiscriminantAnalysis(n_discriminants=6)),
    #     ('classifier', classifier)])
    pipeline = Pipeline(steps)
    return pipeline
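# Hypothetical usage sketch for get_pipeline() above; X_train/y_train are
# placeholders for the analysis's real feature and label arrays.
stack_pipeline = get_pipeline('stacking_comp_IC86.2012_4-groups')
# stack_pipeline.fit(X_train, y_train)
# composition_preds = stack_pipeline.predict(X_test)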
zero_one_losses_list = []

print('-' * 105)
print('Bagging Classifier:')
clf = BaggingClassifier(
    base_estimator=LogisticRegression(class_weight={0: 1, 1: 9},
                                      warm_start=True),
    n_estimators=3)
clf.fit(train_features, train_labels)
pred = clf.predict(test_features)
print("F1 Score:")
print(precision_recall_fscore_support(test_labels, pred, average='micro')[2])
fpr, tpr, thresholds = roc_curve(test_labels, pred)
print("Area Under Receiver Operating Characteristic Curve (ROC):")
print(auc(fpr, tpr))

print('\nStacking Classifier:')
clf = StackingClassifier(
    classifiers=[
        LogisticRegression(class_weight={0: 1, 1: 9}, warm_start=True),
        LogisticRegression(class_weight={0: 1, 1: 9}, warm_start=True),
        LogisticRegression(class_weight={0: 1, 1: 9}, warm_start=True),
    ],
    meta_classifier=LogisticRegression())
clf.fit(train_features, train_labels)
pred = clf.predict(test_features)
print("F1 Score:")
print(precision_recall_fscore_support(test_labels, pred, average='micro')[2])
fpr, tpr, thresholds = roc_curve(test_labels, pred, pos_label=1)
print("Area Under Receiver Operating Characteristic Curve (ROC):")
print(auc(fpr, tpr))

print('-' * 105)
print("Classifiers Used: AdaBoost, RandomForest, Gradient Booster, "
      "Logistic Regression")
print("\nTrain Start Time : {}".format(time.time()))
clf_AdaBoost = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(class_weight={0: 1, 1: 4}),
    n_estimators=5)
clf_RandomForest = RandomForestClassifier(class_weight={0: 1, 1: 11})
clf_LogisticReg = LogisticRegression(class_weight={0: 1, 1: 9},
                                     warm_start=True)
scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy',
                         pre_dispatch=4)
print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(),
                                            label))

# Let's try another strategy: Stacking

# In[ ]:

lgr = LogisticRegression()
sclf = StackingClassifier(
    classifiers=[RFC_best, logitR, LDA, GBC_best, votingC],
    meta_classifier=lgr)

# cv=5 below, so this is 5-fold cross validation
print('5-fold cross validation:\n')

for clf, label in zip([RFC_best, logitR, LDA, GBC_best, votingC, sclf],
                      ['rfc', 'logitR', 'lda', 'gbc', 'voting', 'stacking']):
    scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(),
                                                label))

# If we ran an ANOVA we couldn't say one model predicts better than another,
# so let's take the voting classifier as the final model...

# ### 6.3 Prediction

# #### 6.3.1 Predict and Submit results