def __train_stacks__(self, train_input, test_input, train_output, test_output):
    scaler = MinMaxScaler()
    scaler.fit(train_input)
    scaled_train_input = scaler.transform(train_input)
    scaled_test_input = scaler.transform(test_input)
    clf1 = MLPClassifier(hidden_layer_sizes=[50, 50, 50, 50], activation='logistic')
    clf2 = DecisionTreeClassifier()
    clf3 = SVC()
    clf4 = QuadraticDiscriminantAnalysis()
    clf5 = KNeighborsClassifier()
    clf6 = RandomForestClassifier()
    clf7 = LinearSVC()
    clf8 = LinearDiscriminantAnalysis()
    # Homogeneous first-level stacks, each blended by a different meta-classifier.
    sclf1 = StackingClassifier(classifiers=[clf2, clf2, clf2, clf2], meta_classifier=clf6)
    sclf2 = StackingClassifier(classifiers=[clf7, clf7, clf7, clf7], meta_classifier=clf3)
    sclf3 = StackingClassifier(classifiers=[clf8, clf8, clf8, clf8], meta_classifier=clf4)
    sclf4 = StackingClassifier(classifiers=[clf5, clf5, clf5, clf5], meta_classifier=clf5)
    # Second-level stack of stacks, then boosted with AdaBoost.
    main_clf = StackingClassifier(classifiers=[sclf1, sclf2, sclf3, sclf4], meta_classifier=clf1)
    ada_main = AdaBoostClassifier(main_clf)
    # Cross-validated training score, computed for reference only.
    training_scr = np.mean(cross_val_score(ada_main, scaled_train_input, train_output, cv=4))
    ada_main.fit(scaled_train_input, train_output)
    pred_output = ada_main.predict(scaled_test_input)
    return ada_main, pred_output
def fit(self, x, y):
    '''Fit the base classifiers and the stacking meta-classifier.'''
    model_list = []
    basic_cls = ['logistic', 'knn', 'svm', 'dt', 'rf', 'adaBoost', 'gbm', 'xgb', 'bp']
    for model_name in self.listModelName:
        if model_name in basic_cls:
            cls = cls_model(model_name, isGridSearch=self.isGridSearch)
            # If the user supplied a custom parameter range, apply it to the model.
            if model_name in self.dict_para:
                cls.set_parameters(self.dict_para[model_name])
            # Fit the base model.
            cls.fit(x, y)
            model_list.append(cls.cls_model)
            self.train_model[model_name] = cls
    if self.meta_reg == 'logistic':
        meta_cls = linear_model.LogisticRegression()
    elif self.meta_reg == 'knn':
        meta_cls = KNeighborsClassifier()
    self.stack = StackingClassifier(classifiers=model_list, meta_classifier=meta_cls)
    self.stack.fit(x.values, y.values.reshape(len(y)))
def __init__(self, alpha=0.1, n_jobs=-1, max_features='sqrt',
             n_estimators=1000, RandomForest=True, KMeansFeatures=True,
             NaiveBayes=True):
    """
    INPUT:
    - alpha = Additive Laplace smoothing parameter for NaiveBayes
    - n_jobs = Number of jobs to run RFC on
    - max_features = Number of features to consider on RFC
    - n_estimators = Number of trees in RFC
    - RandomForest = Bool, run RFC
    - KMeansFeatures = Bool, include K-means features in RFC
    - NaiveBayes = Bool, run MNB

    ATTRIBUTES:
    - RFC = Random Forest Classifier
    - MNB = Multinomial Naive Bayes Classifier
    - LogR = Logistic Regression meta-classifier
    - STK = Stacking Classifier blending RFC and MNB on predicted probabilities
    """
    self.RFC = RandomForestClassifier(n_jobs=n_jobs,
                                      max_features=max_features,
                                      n_estimators=n_estimators)
    self.MNB = MultinomialNB(alpha=alpha)
    self.LogR = LogisticRegression()
    self.STK = StackingClassifier(classifiers=[self.RFC, self.MNB],
                                  meta_classifier=self.LogR,
                                  use_probas=True)
    self.RandomForest = RandomForest
    self.KMeansFeatures = KMeansFeatures
    self.NaiveBayes = NaiveBayes
def create_stacked(dataset, x_train, y_train):
    # Encode string class labels as integer indices.
    for i, y in enumerate(dataset.y_true):
        dataset.y_true[i] = dataset.class_labels.index(y)
    for i, y in enumerate(y_train):
        y_train[i] = dataset.class_labels.index(y)
    dataset.class_labels = list(range(len(dataset.class_labels)))
    clf1 = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42)
    clf2 = KNeighborsClassifier(n_neighbors=10)
    clf3 = GaussianNB()
    clf4 = MLPClassifier(activation='relu', max_iter=100000,
                         hidden_layer_sizes=(50, 50, 50, 50, 50))
    clf5 = MLPClassifier(activation='relu', max_iter=1000000,
                         hidden_layer_sizes=(500, 500))
    clf6 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    clf_meta = LogisticRegression()
    clf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5, clf6],
                             meta_classifier=clf_meta,
                             use_probas=True)
    clf.fit(x_train, y_train)
    return clf
def test_StackingClassifier_avg_vs_concat():
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    sclf1 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=True,
                               meta_classifier=lr1)
    sclf1.fit(X, y)
    r1 = sclf1.predict_meta_features(X[:2])
    assert r1.shape == (2, 3)
    assert_almost_equal(np.sum(r1[0]), 1.0, decimal=6)
    assert_almost_equal(np.sum(r1[1]), 1.0, decimal=6)

    sclf2 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=False,
                               meta_classifier=lr1)
    sclf2.fit(X, y)
    r2 = sclf2.predict_meta_features(X[:2])
    assert r2.shape == (2, 6)
    assert_almost_equal(np.sum(r2[0]), 2.0, decimal=6)
    assert_almost_equal(np.sum(r2[1]), 2.0, decimal=6)
    assert np.array_equal(r2[0][:3], r2[0][3:])
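# Shape arithmetic behind the asserts above (a worked check added here, not
# part of the original test): with two base classifiers on 3-class iris data,
# averaging the predicted probabilities yields n_classes = 3 meta-features
# summing to 1, while concatenating yields n_classifiers * n_classes = 6
# meta-features summing to 2.
n_classifiers, n_classes = 2, 3
assert n_classes == 3                   # r1.shape[1] when average_probas=True
assert n_classifiers * n_classes == 6   # r2.shape[1] when average_probas=False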
class ClassifierBlender:
    def __init__(self, x_train, x_test, y_train, y_test=None):
        x_train.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        x_test.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['y'].values
        if y_test is not None:
            self.y_test = y_test['y'].values

    def clf_blend(self):
        # Caveat: these base models are regressors; for regression targets,
        # mlxtend's StackingRegressor is the matching class.
        meta_clf = LinearRegression()
        clf1 = model.svm_regressor()
        clf2 = model.randomforest_regressor()
        clf3 = model.xgb_regressor()
        self.blend = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                        meta_classifier=meta_clf)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        scores = cross_val_score(self.blend, X=self.x_train, y=self.y_train,
                                 cv=10, verbose=2)
        return scores

    def prediction(self):
        y_pred = self.blend.predict(self.x_test)
        return y_pred
def stacking_clf(train_x, train_y):
    clf1 = RandomForestClassifier(n_estimators=300, max_features="sqrt",
                                  min_samples_split=20, min_samples_leaf=15,
                                  max_depth=6, bootstrap=True, n_jobs=8)
    clf2 = svm.SVC(C=10)  # defined but not included in the stack below
    clf3 = xgb.XGBClassifier(n_estimators=300, learning_rate=0.1, n_jobs=8,
                             objective="multi:softmax", colsample_bylevel=0.8,
                             reg_lambda=1, max_depth=6, min_child_weight=1)
    clf4 = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1,
                                      min_samples_split=20, min_samples_leaf=15,
                                      max_depth=6, max_features="sqrt")
    clf5 = LogisticRegression(penalty='l2', C=100, multi_class='ovr')
    sclf = StackingClassifier(classifiers=[clf1, clf3, clf4],
                              meta_classifier=clf5)
    sclf.fit(train_x, train_y)
    return sclf
def model_cross_validation(self, model, best_params):
    print('Model Cross Validation')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    lr = self.model_init(model)
    clf1 = self.model_init('KNN')
    clf2 = self.model_init('RFC')
    clf3 = self.model_init('GNB')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
    sclf.set_params(**best_params)
    train_data = self.train.values.copy()
    train_label = self.train_label['label'].values.copy()
    train_label = train_label.reshape(train_label.shape[0])
    scores = cross_val_score(sclf, train_data, train_label, cv=5,
                             scoring='roc_auc', n_jobs=3)
    print(sclf)
    print(scores)
    print(np.mean(scores))
    print('Model: {0} ; Train: {1}'.format(model, np.mean(scores)))
    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return np.mean(scores)
def run(cls) -> StackingClassifier:
    """Run a StackingClassifier using all registered models."""
    sc = cls()
    X, y = sc.load_train()

    # Define the StackingClassifier using all registered models.
    classifiers = [Model() for Model in sc._models
                   if Model.__name__ != 'DumbModel']
    clf = StackingClassifier(classifiers=classifiers,
                             meta_classifier=LogisticRegression(),
                             verbose=1,
                             average_probas=False,
                             use_probas=True)

    # Run cross-val to get an idea of what to expect for the final output
    # scores = cross_val_score(clf, X.copy(), y.copy(), scoring='neg_log_loss', cv=2)
    # print('\n---------\nCross validation (3) --> StackingClassifier - Avg Log Loss: {:.8f} - STD: {:.4f}\n---------'
    #       .format(scores.mean(), scores.std()))

    # Finally, refit clf to the entire dataset
    print('Fitting Stacking Classifier to entire training dataset...')
    clf.fit(X.copy(), y.copy())
    return clf
def stacking(classifier, optimalRF, optimalKNN, task=0):
    if task == 0:
        # Sweep the meta-KNN neighbour count from 1 to 50.
        for x in range(1, 51):
            classifier.predict(StackingClassifier(
                classifiers=[RandomForestClassifier(optimalRF, n_jobs=8),
                             KNeighborsClassifier(optimalKNN, n_jobs=8)],
                meta_classifier=KNeighborsClassifier(x, n_jobs=8)))
    elif task == 2:
        classifier.predict(StackingClassifier(
            classifiers=[RandomForestClassifier(optimalRF, n_jobs=8),
                         KNeighborsClassifier(optimalKNN, n_jobs=8)],
            meta_classifier=tree.DecisionTreeClassifier()))
    elif task == 5:
        classifier.predict(StackingClassifier(
            classifiers=[RandomForestClassifier(optimalRF, n_jobs=8),
                         KNeighborsClassifier(optimalKNN, n_jobs=8)],
            meta_classifier=svm.SVC(kernel='linear')))
    elif task == 6:
        classifier.predict(StackingClassifier(
            classifiers=[RandomForestClassifier(optimalRF, n_jobs=8),
                         KNeighborsClassifier(optimalKNN, n_jobs=8)],
            meta_classifier=svm.SVC(kernel='rbf')))
def stacking2():
    from sklearn.datasets import load_iris
    from mlxtend.classifier import StackingClassifier
    from mlxtend.feature_selection import ColumnSelector
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn import model_selection

    iris = load_iris()
    X = iris.data
    y = iris.target
    # Each pipeline trains a logistic regression on a different column subset.
    pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
    pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())
    sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                              meta_classifier=LogisticRegression(),
                              use_features_in_secondary=True,
                              store_train_meta_features=True)
    sclf.fit(X, y)
    scores = model_selection.cross_val_score(sclf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
def model_stack2():
    from mlxtend.regressor import StackingRegressor

    _, test_df, train_label = data_process.get_person_data()
    train_data, test_data = data_process.get_scale_data()
    X_train, X_val, y_train, y_val = train_test_split(train_data, train_label,
                                                      test_size=0.2,
                                                      random_state=66)
    id_list = list(test_df.pop('ID'))
    model1 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8,
                              learning_rate=0.25, objective='reg:linear')
    model2 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8,
                              learning_rate=0.25, objective='reg:gamma')
    model3 = gbt.XGBRegressor(n_estimators=1000, subsample=0.8,
                              learning_rate=0.25, objective='reg:tweedie')
    model4 = svm.SVR()
    # The base models and the meta-model are regressors with a continuous
    # target, so mlxtend's StackingRegressor is the matching class here
    # (StackingClassifier expects discrete class labels).
    stack_model = StackingRegressor(regressors=[model1, model2, model3, model4],
                                    meta_regressor=model3)
    stack_model.fit(train_data, train_label)
    yHat = stack_model.predict(test_data)
    result = pd.DataFrame({'id': id_list, 'yhat': yHat})
    result.to_csv('result/result6.csv', index=False, header=None, encoding='utf-8')
def stacking(para, X, y):
    stack_lvl_0 = StackingClassifier(classifiers=para["lvl_0"],
                                     meta_classifier=para["top"])
    stack_lvl_1 = StackingClassifier(classifiers=para["lvl_1"],
                                     meta_classifier=stack_lvl_0)
    scores = cross_val_score(stack_lvl_1, X, y, cv=3)
    return scores.mean()
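# A minimal usage sketch for the nested-stacking helper above; the `para`
# layout and the iris data are illustrative assumptions, not part of the
# original code. The inner stack (lvl_0 blended by top) serves as the
# meta-classifier of the outer stack (lvl_1).
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X_iris, y_iris = load_iris(return_X_y=True)
para = {
    "lvl_0": [GaussianNB(), KNeighborsClassifier()],    # inner base learners
    "top": LogisticRegression(),                        # meta-classifier of the inner stack
    "lvl_1": [DecisionTreeClassifier(), GaussianNB()],  # outer base learners
}
print(stacking(para, X_iris, y_iris))  # mean 3-fold CV accuracy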
def blending(self):
    meta_clf = LogisticRegression()
    clf1 = model.svm_classifier()
    clf2 = model.dt_classifier()
    # reg3 = model.xgb_classifier()
    self.blend = StackingClassifier(classifiers=[clf1, clf2],
                                    meta_classifier=meta_clf)
    self.blend.fit(self.x_train, self.y_train)
    return self.blend
def Stacking(self):
    meta_clf = LogisticRegression()
    self.stacking = StackingClassifier(classifiers=[self.svm, self.tree,
                                                    self.bayes, self.knn,
                                                    self.xgb],
                                       meta_classifier=meta_clf)
    self.stacking.fit(self.X, self.y)
def stacking_prediction2(m1, m2, meta):
    # model_train, model_test = stacking(clf, Xtrain2, ytrain2, Xtest2)
    # model.fit(model_train, ytrain2)
    # Scale the module-level train/test splits, then fit the stack.
    tr, ts = scaling(Xtrain2, Xtest2, MaxAbsScaler())
    m = StackingClassifier(classifiers=[m1, m2], meta_classifier=meta)
    m.fit(tr, ytrain2)
    predict_mm = m.predict(ts)
    return predict_mm
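# Hedged usage sketch for stacking_prediction2: the function reads the
# module-level Xtrain2, ytrain2, Xtest2 and the scaling() helper, so this
# call assumes those globals are already populated elsewhere in the module.
# The estimator choices are illustrative.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

stacked_preds = stacking_prediction2(GaussianNB(),
                                     RandomForestClassifier(n_estimators=100),
                                     meta=LogisticRegression())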
def test_weight_unsupported_no_weight():
    # This is okay since we do not pass sample weight
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    sclf.fit(X, y)
def test_multivariate_class():
    np.random.seed(123)
    meta = KNeighborsClassifier()
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    y_pred = sclf.fit(X, y2).predict(X)
    ca = .973
    assert round((y_pred == y2).mean(), 3) == ca
def data_ensemble(cancer_type, feat):
    data_dir = "/home/ubuntu/cancer/"
    data_file = data_dir + cancer_type + "_matrix.csv"
    features = data_dir + cancer_type + "_output.txt"
    output_file = data_dir + cancer_type + "_accuracy.txt"
    feat_file = open(features, "r")
    o_file = open(output_file, "w")
    line = feat_file.readline()  # skip the header line
    line = feat_file.readline()  # first important feature name
    df = pd.read_csv(data_file)
    df = shuffle(df)
    file_ids = df.pop('file_id')
    y = df.pop('label').values
    # Build an array consisting of only the important features.
    dataf = df.pop(line[:-1])
    for x in range(feat):
        line = feat_file.readline()
        dataf = np.column_stack((dataf, df.pop(line[:-1])))
    X = normalize(dataf)
    X = scale(X)
    pca = PCA()
    pca.fit(X)  # fitted for inspection only; X is not transformed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                        random_state=42)
    # Multiple base classifiers
    clf1 = RandomForestClassifier(random_state=1, n_estimators=100)
    clf2 = GradientBoostingClassifier(n_estimators=1200, subsample=0.5, random_state=3)
    clf3 = SVC(gamma='auto')
    clf4 = KNeighborsClassifier(n_neighbors=1)
    clf5 = DecisionTreeClassifier(random_state=0)
    lr = LogisticRegression(solver='lbfgs')
    # Stacking for the data ensemble
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5],
                              meta_classifier=lr)
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train, y_train)
    clf4.fit(X_train, y_train)
    clf5.fit(X_train, y_train)
    sclf.fit(X_train, y_train)
    y_test_predict = sclf.predict(X_test)
    precision = precision_score(y_test, y_test_predict)
    accuracy = accuracy_score(y_test, y_test_predict)
    f1 = f1_score(y_test, y_test_predict)
    recall = recall_score(y_test, y_test_predict)
    scores = [precision, accuracy, f1, recall]
    labels = ['RF', 'GBDT', 'SVM', 'KNN', 'DT', 'Stacking']
    clf_list = [clf1, clf2, clf3, clf4, clf5, sclf]
    # Score calculation for each classifier
    for clf, label in zip(clf_list, labels):
        y_test_predict = clf.predict(X_test)
        tn, fp, fn, tp = confusion_matrix(y_test, y_test_predict).ravel()
        specificity = tn / (tn + fp)
        recall = tp / (tp + fn)
        precision = tp / (tp + fp)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        f1 = 2 * tp / (2 * tp + fp + fn)
        o_file.write("\nAccuracy: %.2f [%s] \nPrecision: %.2f [%s] \nRecall: %.2f [%s] "
                     "\nF1 score: %.2f [%s] \nSpecificity: %.2f [%s]\n"
                     % (accuracy, label, precision, label, recall, label,
                        f1, label, specificity, label))
def test_verbose():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_probas=True,
                              meta_classifier=meta,
                              verbose=3)
    sclf.fit(iris.data, iris.target)
def stackingPerformanceEditor():
    nb_clf = GaussianNB()
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=400,
                                    random_state=5)
    mlp_clf = MLPClassifier(hidden_layer_sizes=(500, 500))
    label = ["NB", "RF", "MLP"]
    # The random forest doubles as the meta-classifier here.
    sclf = StackingClassifier(classifiers=[nb_clf, rf_clf, mlp_clf],
                              meta_classifier=rf_clf)
    sclf.fit(Xtrain2, ytrain2)
    pred = accuracy_score(ytest, sclf.predict(Xtest2))
    return pred
def test_weight_unsupported_no_weight():
    # This is okay since we do not pass sample weight
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    sclf.fit(X, y)
def test_multivariate_class():
    np.random.seed(123)
    meta = KNeighborsClassifier()
    clf1 = RandomForestClassifier()
    clf2 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    y_pred = sclf.fit(X, y2).predict(X)
    ca = .973
    assert round((y_pred == y2).mean(), 3) == ca
def test_sample_weight():
    # Make sure that:
    #    prediction with weight
    # != prediction with no weight
    # == prediction with weight ones
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    prob1 = sclf.fit(X, y, sample_weight=w).predict_proba(X)

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    prob2 = sclf.fit(X, y, sample_weight=None).predict_proba(X)

    maxdiff = np.max(np.abs(prob1 - prob2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    prob3 = sclf.fit(X, y, sample_weight=np.ones(len(y))).predict_proba(X)

    maxdiff = np.max(np.abs(prob2 - prob3))
    assert maxdiff < 1e-3, "max diff is %.4f" % maxdiff
def stacking(self):
    from sklearn.svm import SVC
    from sklearn.pipeline import make_pipeline
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from lightgbm import LGBMClassifier
    import xgboost as xgb
    from mlxtend.classifier import StackingClassifier

    svc = make_pipeline(SVC(kernel='rbf', C=2.8, gamma=2))
    rf = RandomForestClassifier(random_state=590, n_estimators=6)
    GBoost = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01,
                                        max_depth=12, max_features='sqrt',
                                        min_samples_leaf=15, min_samples_split=97,
                                        random_state=200)
    model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=10,
                                  learning_rate=0.01, max_depth=11,
                                  min_child_weight=1.7817, n_estimators=500,
                                  reg_alpha=0.01, reg_lambda=5,
                                  subsample=0.5213, silent=1,
                                  seed=1024, nthread=-1)
    # Caveat: objective='regression' is a LightGBM regression objective; a
    # classification objective (the LGBMClassifier default) is likely intended.
    model_lgb = LGBMClassifier(objective='regression', num_leaves=5,
                               learning_rate=0.05, n_estimators=550,
                               max_bin=25, bagging_fraction=1, bagging_freq=5,
                               feature_fraction=0.7, feature_fraction_seed=9,
                               bagging_seed=9, min_data_in_leaf=42,
                               min_sum_hessian_in_leaf=40)
    classifiers = [rf, svc, GBoost, model_lgb, model_xgb]
    stregr = StackingClassifier(classifiers=classifiers,
                                meta_classifier=model_xgb, verbose=1)
    stregr.fit(self.X_train, self.y_train)
    print("stacking model validation f1:",
          f1_score(self.y_test, stregr.predict(self.X_test), average="macro"))
    # print("stacking model validation precision:", precision_score(self.y_test, stregr.predict(self.X_test), average="macro"))
    # print("stacking model validation recall:", recall_score(self.y_test, stregr.predict(self.X_test), average="macro"))
    return stregr
def train_model(self, X, y):
    print('>> KFOLD Iteration <<')
    # Define models
    m1 = CatBoostClassifier(custom_loss=['Accuracy'], random_seed=42,
                            logging_level='Silent')
    m2 = AdaBoostClassifier(n_estimators=500)
    m3 = XGBClassifier()
    meta = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
    model = StackingClassifier(classifiers=[m1, m2, m3], meta_classifier=meta)
    model = model.fit(X, y)
    return model
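# A hedged sketch of the fold loop implied by the ">> KFOLD Iteration <<"
# banner above; the 5-fold split, the `trainer` instance, and the
# X_all/y_all arrays are illustrative assumptions, not part of the
# original class.
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=0)
fold_models = [trainer.train_model(X_all[tr_idx], y_all[tr_idx])
               for tr_idx, _ in kf.split(X_all)]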
def test_train_meta_features_():
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)
    train_meta_features = stclf.train_meta_features_
    assert train_meta_features.shape == (X_train.shape[0], 2)
def train3():
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
    pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())
    sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                              meta_classifier=LogisticRegression())
    sclf.fit(x, y)
def test_verbose():
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_probas=True,
                              meta_classifier=meta,
                              verbose=3)
    X, y = iris_data()
    sclf.fit(X, y)
def test_weight_unsupported():
    # Error since KNN does not support sample_weight
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])
    with pytest.raises(TypeError):
        sclf.fit(X, y, sample_weight=w)
def test_predict_meta_features():
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # test default (class labels)
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    stclf.fit(X_train, y_train)
    test_meta_features = stclf.predict(X_test)
    assert test_meta_features.shape == (X_test.shape[0],)
def test_train_meta_features_():
    np.random.seed(123)
    knn = KNeighborsClassifier()
    lr = LogisticRegression(solver='liblinear', multi_class='ovr')
    gnb = GaussianNB()
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)
    train_meta_features = stclf.train_meta_features_
    assert train_meta_features.shape == (X_train.shape[0], 2)
def test_StackingClassifier_avg_vs_concat():
    np.random.seed(123)
    lr1 = LogisticRegression()
    sclf1 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=True,
                               meta_classifier=lr1)
    sclf1.fit(X, y)
    r1 = sclf1._predict_meta_features(X[:2])
    assert r1.shape == (2, 3)
    assert_almost_equal(np.sum(r1[0]), 1.0, places=6)
    assert_almost_equal(np.sum(r1[1]), 1.0, places=6)

    sclf2 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=False,
                               meta_classifier=lr1)
    sclf2.fit(X, y)
    r2 = sclf2._predict_meta_features(X[:2])
    assert r2.shape == (2, 6)
    assert_almost_equal(np.sum(r2[0]), 2.0, places=6)
    assert_almost_equal(np.sum(r2[1]), 2.0, places=6)
    assert np.array_equal(r2[0][:3], r2[0][3:])
def test_use_features_in_secondary_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_features_in_secondary=True,
                              meta_classifier=meta)
    sclf.fit(X, y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(X[idx])[:, 0]
    expect = np.array([0.911, 0.829, 0.885])
    np.testing.assert_almost_equal(y_pred, expect, 3)
def test_weight_unsupported():
    # Error since KNN does not support sample_weight
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])
    with pytest.raises(TypeError):
        sclf.fit(X, y, sample_weight=w)
def test_use_features_in_secondary_sparse_input_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    sclf = StackingClassifier(classifiers=[clf1],
                              use_features_in_secondary=True,
                              meta_classifier=meta)
    sclf.fit(sparse.csr_matrix(X), y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(sparse.csr_matrix(X[idx]))[:, 0]
    expect = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(y_pred, expect, 3)
def test_use_features_in_secondary_predict_proba():
    np.random.seed(123)
    X, y = iris_data()
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr',
                              random_state=1)
    clf1 = RandomForestClassifier(n_estimators=10, random_state=1)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_features_in_secondary=True,
                              meta_classifier=meta)
    sclf.fit(X, y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(X[idx])[:, 0]
    expect = np.array([0.916, 0.828, 0.889])
    np.testing.assert_almost_equal(y_pred, expect, 3)
def model_processing(X_train, X_test, y_train, y_test):
    log_reg = LogisticRegression(C=0.01, penalty='l2')
    svc = SVC(C=0.7, kernel='linear')
    tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=3,
                                      min_samples_leaf=5)
    rf_clf = RandomForestClassifier(n_estimators=70, criterion='entropy',
                                    max_features='auto', min_samples_leaf=6)
    xgb = XGBClassifier(gamma=0.3, max_depth=4, min_child_weight=8, reg_alpha=0.05)
    sclf = StackingClassifier(classifiers=[log_reg, svc, tree_clf, rf_clf],
                              meta_classifier=xgb)
    sclf.fit(X_train, y_train)
    y_pred_train = sclf.predict(X_train)
    y_pred = sclf.predict(X_test)

    print('*' * 30, 'Scores on the training set')
    accuracy = accuracy_score(y_train, y_pred_train)
    precision = precision_score(y_train, y_pred_train)
    f1 = f1_score(y_train, y_pred_train)
    recall = recall_score(y_train, y_pred_train)
    # Note: AUC here is computed on hard label predictions, not probabilities.
    auc = roc_auc_score(y_train, y_pred_train)
    model_name = 'Stacking model (train set)'
    print('{} accuracy: {:.2f}'.format(model_name, accuracy))
    print('{} precision: {:.2f}'.format(model_name, precision))
    print('{} F1 score: {:.2f}'.format(model_name, f1))
    print('{} recall: {:.2f}'.format(model_name, recall))
    print('{} AUC: {:.2f}'.format(model_name, auc))

    print('*' * 30, 'Scores on the test set')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    model_name = 'Stacking model (test set)'
    print('{} accuracy: {:.2f}'.format(model_name, accuracy))
    print('{} precision: {:.2f}'.format(model_name, precision))
    print('{} F1 score: {:.2f}'.format(model_name, f1))
    print('{} recall: {:.2f}'.format(model_name, recall))
    print('{} AUC: {:.2f}'.format(model_name, auc))
def test_get_params():
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)
    got = sorted(list({s.split('__')[0] for s in sclf.get_params().keys()}))
    expect = ['average_probas',
              'classifiers',
              'gaussiannb',
              'kneighborsclassifier',
              'meta-logisticregression',
              'meta_classifier',
              'randomforestclassifier',
              'store_train_meta_features',
              'use_features_in_secondary',
              'use_probas',
              'verbose']
    assert got == expect, got
def test_get_params():
    np.random.seed(123)
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    lr = LogisticRegression(solver='liblinear', multi_class='ovr')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)
    got = sorted(list({s.split('__')[0] for s in sclf.get_params().keys()}))
    expect = ['average_probas',
              'classifiers',
              'drop_last_proba',
              'gaussiannb',
              'kneighborsclassifier',
              'meta_classifier',
              'randomforestclassifier',
              'store_train_meta_features',
              'use_clones',
              'use_features_in_secondary',
              'use_probas',
              'verbose']
    assert got == expect, got
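# The double-underscore keys exposed by get_params() are what make the whole
# stack tunable with scikit-learn's search tools; a minimal sketch (the
# estimator choices and grid values here are illustrative assumptions):
from sklearn.model_selection import GridSearchCV

sclf_demo = StackingClassifier(
    classifiers=[KNeighborsClassifier(n_neighbors=1),
                 RandomForestClassifier(n_estimators=10),
                 GaussianNB()],
    meta_classifier=LogisticRegression(solver='liblinear', multi_class='ovr'))
param_grid = {'kneighborsclassifier__n_neighbors': [1, 5],
              'randomforestclassifier__n_estimators': [10, 50],
              'meta_classifier__C': [0.1, 10.0]}
grid = GridSearchCV(estimator=sclf_demo, param_grid=param_grid, cv=5)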
def model_test(self, model, best_params):
    print('Model Test')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    lr = self.model_init(model)
    clf1 = self.model_init('KNN')
    clf2 = self.model_init('RFC')
    clf3 = self.model_init('GNB')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
    sclf.set_params(**best_params)
    train_data = self.train.values.copy()
    train_label = self.train_label['label'].values.copy()
    sclf.fit(train_data, train_label)
    # The stack itself exposes no coef_/feature_importances_; inspect the
    # fitted meta-classifier instead (mlxtend stores it as meta_clf_).
    if model.upper() == 'LR':
        coef = sclf.meta_clf_.coef_.reshape(sclf.meta_clf_.coef_.shape[1])
        ind = coef.argsort()
        att = self.train.columns[ind[-30:]].tolist()
        print(att)
    elif model.upper() == 'RFC':
        imp = sclf.meta_clf_.feature_importances_
        print(imp)
        ind = imp.argsort()
        att = self.train.columns[ind[-30:]].tolist()
        print(att)
    elif model.upper() == 'XGB':
        imp = sclf.meta_clf_.feature_importances_
        print(imp)
        ind = imp.argsort()
        att = self.train.columns[ind[-30:]].tolist()
        print(att)
    test_data = self.test.values.copy()
    test_label = self.test_label['label'].values.copy()
    test_label = test_label.reshape(test_label.shape[0])
    res_proba = sclf.predict_proba(test_data)
    res_auc = roc_auc_score(test_label, res_proba[:, 1])
    print('Model: {0} ; Test: {1}'.format(model, res_auc))
    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return res_auc
def test_StackingClassifier_drop_last_proba():
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    sclf1 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               drop_last_proba=False,
                               meta_classifier=lr1)
    sclf1.fit(X, y)
    r1 = sclf1.predict_meta_features(X[:2])
    assert r1.shape == (2, 6)

    sclf2 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               drop_last_proba=True,
                               meta_classifier=lr1)
    sclf2.fit(X, y)
    r2 = sclf2.predict_meta_features(X[:2])
    assert r2.shape == (2, 4), r2.shape

    sclf3 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               drop_last_proba=True,
                               meta_classifier=lr1)
    sclf3.fit(X[0:100], y[0:100])  # only 2 classes
    r3 = sclf3.predict_meta_features(X[:2])
    assert r3.shape == (2, 2), r3.shape
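# Shape arithmetic behind the drop_last_proba asserts (a worked check added
# here, not part of the original test): since class probabilities sum to 1,
# the last column per classifier is redundant and can be dropped, shrinking
# the meta-feature width from n_classifiers * n_classes to
# n_classifiers * (n_classes - 1).
assert 2 * 3 == 6        # r1: keep all probability columns
assert 2 * (3 - 1) == 4  # r2: drop one column per classifier
assert 2 * (2 - 1) == 2  # r3: binary subset, one column per classifier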
def predictor_ev():
    print("Building Neural Net classifiers for devices with events")
    n_input = X_train_ev.shape[1]
    n_train = X_train_ev.shape[0]

    # Legacy Keras 1.x-era API (output_dim, W_regularizer, nb_epoch).
    from keras.models import Sequential
    from keras.layers import Dense, Activation
    from keras.layers.core import Dropout
    from keras.layers.advanced_activations import PReLU
    from keras.regularizers import l2
    from keras.optimizers import Adadelta
    from keras.optimizers import SGD
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.callbacks import ModelCheckpoint

    def create_model(n_hidden_layers=1, nodes=[50], reg=1.0,
                     dropouts=[.5], acts=['relu']):
        n_in = n_input
        model = Sequential()
        for i in range(n_hidden_layers):
            n_out = nodes[i]
            dropout = dropouts[i]
            act = acts[i]
            model.add(Dense(output_dim=n_out, input_dim=n_in, W_regularizer=l2(reg)))
            model.add(Activation(act))
            model.add(Dropout(dropout))
            n_in = n_out
        model.add(Dense(output_dim=12, W_regularizer=l2(reg)))
        model.add(Activation("softmax"))
        # Compile model
        adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)
        sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy', optimizer=adadelta,
                      metrics=['accuracy'])
        return model

    class KerasClassifier2(KerasClassifier):
        def __init__(self, build_fn, fn_args, random_state=0, nb_epoch=10,
                     batch_size=500, verbose=2):
            self.random_state = random_state
            self.nb_epoch = nb_epoch
            self.batch_size = batch_size
            self.verbose = verbose
            super(KerasClassifier2, self).__init__(build_fn, **fn_args)
            self.classes_ = np.arange(12)
            self.n_classes_ = 12
            self.model = build_fn(**fn_args)

        def fit(self, X, y, sample_weight=None):
            return super(KerasClassifier2, self).fit(
                X, indicator(y), verbose=self.verbose,
                sample_weight=sample_weight,
                validation_data=(X_cv_ev, indicator(y_cv_ev)),
                nb_epoch=self.nb_epoch, batch_size=self.batch_size)

        def predict_proba(self, X):
            return super(KerasClassifier2, self).predict_proba(
                X, batch_size=500, verbose=0)

        def predict(self, X):
            # Return probabilities so the stacker's meta-features stay continuous.
            return super(KerasClassifier2, self).predict_proba(
                X, batch_size=500, verbose=0)

    nn1_args = {'n_hidden_layers': 2, 'nodes': [600, 400], 'reg': 1.8,
                'dropouts': [.3, .4], 'acts': ['relu', 'relu']}
    nn2_args = {'n_hidden_layers': 3, 'nodes': [300, 100, 50], 'reg': 2.0,
                'dropouts': [.2, .4, .5], 'acts': ['relu', 'relu', 'relu']}
    nn3_args = {'n_hidden_layers': 4, 'nodes': [1001, 511, 245, 99], 'reg': 2.0,
                'dropouts': [.2, .3, .2, .3], 'acts': ['relu', 'relu', 'relu', 'relu']}
    nn4_args = {'n_hidden_layers': 1, 'nodes': [500], 'reg': 1.2,
                'dropouts': [.25], 'acts': ['relu']}
    nn5_args = {'n_hidden_layers': 5, 'nodes': [1343, 1012, 757, 539, 117],
                'reg': 2.5, 'dropouts': [.2, .3, .4, .4, .4],
                'acts': ['relu', 'relu', 'relu', 'relu', 'relu']}
    clfNN1 = KerasClassifier2(create_model, nn1_args, random_state=5, nb_epoch=5)
    clfNN2 = KerasClassifier2(create_model, nn2_args, random_state=23, nb_epoch=11)
    clfNN3 = KerasClassifier2(create_model, nn3_args, random_state=710, nb_epoch=6)
    clfNN4 = KerasClassifier2(create_model, nn4_args, random_state=5072, nb_epoch=6)
    clfNN5 = KerasClassifier2(create_model, nn5_args, random_state=2016, nb_epoch=12)

    print("Building XGBoost classifiers for devices with events")
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster": "gblinear",
        "max_depth": 6,
        "eval_metric": "mlogloss",
        "eta": 0.07,
        "silent": 1,
        "alpha": 3.5,
    }

    class XGBClassifier2(xgb.XGBClassifier):
        def __init__(self, max_depth=xgb_params['max_depth'],
                     objective='multi:softprob', missing=None,
                     learning_rate=xgb_params['eta'], n_estimators=40,
                     subsample=1, reg_alpha=xgb_params['alpha'],
                     seed=2016, booster='gblinear'):
            super(XGBClassifier2, self).__init__(
                max_depth=max_depth, seed=seed, objective=objective,
                missing=missing, learning_rate=learning_rate,
                n_estimators=n_estimators, subsample=subsample,
                reg_alpha=reg_alpha)
            self.booster = xgb_params['booster']

        def fit(self, X, y):
            super(XGBClassifier2, self).fit(
                X.tocsc(), y, eval_metric='mlogloss',
                eval_set=[(X_cv_ev.tocsc(), y_cv_ev)])

    gbm1 = XGBClassifier2(seed=0, booster='gblinear', n_estimators=28)
    gbm2 = XGBClassifier2(seed=6, booster='gblinear', n_estimators=28)
    gbm3 = XGBClassifier2(seed=151, booster='gbtree', n_estimators=28)
    gbm4 = XGBClassifier2(seed=1047, booster='gbtree', n_estimators=28)
    gbm5 = XGBClassifier2(seed=22, booster='dart', n_estimators=28)

    print("Building Logistic Regression classifier for devices with events")
    clfLR = LogisticRegression(C=.02, random_state=2016,
                               multi_class='multinomial', solver='newton-cg')

    # Combine results of classifiers
    print("Stacking classifiers for devices with events")
    clf_ls = [gbm1, gbm2, gbm3, gbm4, gbm5,
              clfNN1, clfNN2, clfNN3, clfNN4, clfNN5, clfLR]
    meta = LogisticRegression()
    stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1)
    stack.fit(X_train_ev, y_train_ev)
    print(log_loss(y_cv_ev, stack.predict_proba(X_cv_ev)))
    y_pred_ev = stack.predict_proba(X_test_ev)
    # y_pre = (pred_prob_nn+y_pre)/2.0
    return y_pred_ev
# KNN
# clfKNN = KNeighborsClassifier(n_neighbors=5)
# clfKNN.fit(X_train_noev, y_train_noev)
# print(log_loss(y_cv_noev, clfKNN.predict_proba(X_cv_noev)))

# NB
# clfNB = MultinomialNB(alpha=1.0)
# clfNB.fit(X_train_noev, y_train_noev)
# print(log_loss(y_cv_noev, clfNB.predict_proba(X_cv_noev)))

# Combine results of classifiers
print("Stacking classifiers for devices with no events")
clf_ls = [gbm1, gbm2, gbm3, gbm4, gbm5,
          clfNN1, clfNN2, clfNN3, clfNN4, clfNN5, clfLR]
meta = LogisticRegression()
stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1)
stack.fit(X_train_noev, y_train_noev)
print(log_loss(y_cv_noev, stack.predict_proba(X_cv_noev)))
y_pred_noev = stack.predict_proba(X_test_noev)
# y_pre = (pred_prob_nn+y_pre)/2.0
# return y_pred_noev

y_pred_ev = predictor_ev()
# y_pred_noev = predictor_noev()

# Write results: stack the event and no-event device rows, then save.
result = pd.DataFrame(np.vstack((y_pred_ev, y_pred_noev)), columns=le.classes_)
result["device_id"] = test_dev
result = result.set_index("device_id")
result.to_csv('stacking_1.gz', index=True,
              compression='gzip')  # gzip compression assumed from the .gz suffix