def main():
    np.random.seed(0)
    train_X, train_y, test_X, test_y = load_data()

    # The grid searches below were run once to find good hyperparameters;
    # the winning values are hardcoded afterwards to save time on reruns.

    # find the best params for rfc
    # param_grid = {'n_estimators': np.arange(10, 21), 'max_features': ['sqrt']}
    # g = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1').fit(train_X, train_y)
    # print('best random forest params:', g.best_params_)
    # rfc = g.best_estimator_
    rfc = RandomForestClassifier(max_features='sqrt', n_estimators=13)

    # find the best params for ada
    # param_grid = {'base_estimator': [DecisionTreeClassifier(max_depth=1)],
    #               'n_estimators': np.arange(30, 41)}
    # g = GridSearchCV(AdaBoostClassifier(), param_grid, scoring='f1').fit(train_X, train_y)
    # print('best adaboost params:', g.best_params_)
    # ada = g.best_estimator_
    ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                             n_estimators=32)

    # find the best params for lr
    # param_grid = {'C': 10. ** np.arange(-4, 5), 'penalty': ['l1', 'l2'],
    #               'solver': ['liblinear'], 'max_iter': [1000]}
    # g = GridSearchCV(LogisticRegression(), param_grid, scoring='f1').fit(train_X, train_y)
    # print('best lr params:', g.best_params_)
    # lr = g.best_estimator_
    lr = LogisticRegression(C=10.0, max_iter=1000, penalty='l1', solver='liblinear')

    # find the best params for svc
    # param_grid = {'C': 10. ** np.arange(-4, 5), 'kernel': ['rbf']}
    # g = GridSearchCV(SVC(), param_grid, scoring='f1').fit(train_X, train_y)
    # print('best svc params:', g.best_params_)
    # svc = g.best_estimator_
    svc = SVC(C=1000.0, kernel='rbf')

    print('Q3.1')
    stac = StackingClassifier(estimators=[('random forest', rfc),
                                          ('adaboost', ada),
                                          ('logistic regression', lr),
                                          ('svc', svc)],
                              final_estimator=LogisticRegression())
    kf = KFold(n_splits=10)
    f1s = []
    for learn_ix, val_ix in kf.split(train_X, train_y):
        learn_X, learn_y = train_X[learn_ix, :], train_y[learn_ix]
        val_X, val_y = train_X[val_ix, :], train_y[val_ix]
        stac.fit(learn_X, learn_y)
        val_preds = stac.predict(val_X)
        f1s.append(metrics.f1_score(val_y, val_preds))
    print('average validation f1 score:', np.mean(f1s))
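    # The held-out split from load_data() is never used above. A minimal
    # hedged addition that could close out main(): refit the stacked model on
    # the full training set and report F1 once on the untouched test set.
    stac.fit(train_X, train_y)
    test_preds = stac.predict(test_X)
    print('test f1 score:', metrics.f1_score(test_y, test_preds))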
def test_stacking_classifier_error(y, params, type_err, msg_err): with pytest.raises(type_err, match=msg_err): clf = StackingClassifier(**params, cv=3) clf.fit(scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0]))
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state=42)))
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
clf.fit(X_train, y_train).score(X_test, y_test)

############################################################################
# Checking scikit-learn compatibility of an estimator
# ---------------------------------------------------
# Developers can check the compatibility of their scikit-learn compatible
# estimators using :func:`~utils.estimator_checks.check_estimator`. For
# instance, ``check_estimator(LinearSVC())`` passes.
#
# We now provide a ``pytest`` specific decorator which allows ``pytest``
# to run all checks independently and report the checks that are failing.
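############################################################################
# The decorator referred to above is ``parametrize_with_checks``. A minimal
# sketch of its use (the test function is collected and run by pytest, one
# parametrized case per check):
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.estimator_checks import parametrize_with_checks


@parametrize_with_checks([LogisticRegression(), DecisionTreeRegressor()])
def test_sklearn_compatible_estimator(estimator, check):
    check(estimator)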
# {'n_estimators': 100,
#  'base_estimator__min_samples_split': 0.2,
#  'base_estimator__max_depth': 1,
#  'base_estimator__class_weight': {0: 2, 1: 1}}
## GradientBoostingClassifier ##
# {'subsample': 0.8, 'max_features': 0.8, 'init__min_samples_split': 0.4}
search.score(X_test, y_test)
search.best_params_

clf.fit(X_train, y_train)
clf.score(X_test, y_test)
list(X_train.columns[np.argsort(-clf.feature_importances_)])
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

from sklearn.ensemble import StackingClassifier

clf = StackingClassifier(
    n_jobs=-1,
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=200)),
        ('abc', AdaBoostClassifier(n_estimators=200))
    ])
clf.fit(X_train, y_train)
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
], ) def test_stacking_regressor_error(y, params, type_err, msg_err): with pytest.raises(type_err, match=msg_err): reg = StackingRegressor(**params, cv=3) reg.fit(scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0])) @pytest.mark.parametrize( "estimator, X, y", [ ( StackingClassifier(estimators=[ ("lr", LogisticRegression(random_state=0)), ("svm", LinearSVC(random_state=0)), ]), X_iris[:100], y_iris[:100], ), # keep only classes 0 and 1 ( StackingRegressor(estimators=[ ("lr", LinearRegression()), ("svm", LinearSVR(random_state=0)), ]), X_diabetes, y_diabetes, ), ], ids=["StackingClassifier", "StackingRegressor"], )
def Stacking(self):
    estimators3 = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('svm', SVC())]
    estimators2 = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('svm', SVC())]
    estimators1 = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5))]
    # used by the SVM + KNN branch below, so it must combine SVC and KNN
    # (not duplicate the rf + svm list)
    estimators4 = [
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('svm', SVC())]
    try:
        if (self.svmStackingcheckBox.isChecked()
                and self.rfcStackingcheckBox.isChecked()
                and self.knnStackingcheckBox.isChecked()):
            estimators = estimators3
        elif (self.svmStackingcheckBox.isChecked()
              and self.rfcStackingcheckBox.isChecked()):
            estimators = estimators2
        elif (self.rfcStackingcheckBox.isChecked()
              and self.knnStackingcheckBox.isChecked()):
            estimators = estimators1
        elif (self.svmStackingcheckBox.isChecked()
              and self.knnStackingcheckBox.isChecked()):
            estimators = estimators4
        else:
            return
        clf = StackingClassifier(estimators=estimators,
                                 final_estimator=LogisticRegression())
        stackingAccuracy = clf.fit(self.X_train, self.y_train).score(
            self.X_test, self.y_test)
        self.accuracyEnsembleLBL.setText(str(stackingAccuracy))
    except Exception as e:
        print(e)
def defineBestModelPipeline(df, target, categorical_columns, numeric_columns):
    # Splitting original data into Train and Test
    x_train, x_test, y_train, y_test = train_test_split(df, target,
                                                        test_size=0.1,
                                                        random_state=42)
    y_train = y_train.to_numpy()  # Transforming training targets into numpy arrays
    y_test = y_test.to_numpy()    # Transforming test targets into numpy arrays

    # If desired, we can balance training classes using one of the functions below
    # Obtaining balanced data for modeling using Random Under Sampling
    x_train, y_train = balancingClassesRus(x_train, y_train)
    # Obtaining balanced data for modeling using SMOTEENN
    #x_train, y_train = balancingClassesSmoteenn(x_train, y_train)
    # Obtaining balanced data for modeling using SMOTE
    #x_train, y_train = balancingClassesSmote(x_train, y_train)

    # 1st -> Numeric Transformers
    # Here we create several different data transformation pipelines to be
    # applied to our numeric features
    numeric_transformer_1 = Pipeline(
        steps=[('imp', IterativeImputer(max_iter=30, random_state=42)),
               ('scaler', MinMaxScaler())])
    numeric_transformer_2 = Pipeline(
        steps=[('imp', IterativeImputer(max_iter=20, random_state=42)),
               ('scaler', StandardScaler())])
    numeric_transformer_3 = Pipeline(
        steps=[('imp', SimpleImputer(strategy='mean')),
               ('scaler', MinMaxScaler())])
    numeric_transformer_4 = Pipeline(
        steps=[('imp', SimpleImputer(strategy='median')),
               ('scaler', StandardScaler())])

    # 2nd -> Categorical Transformer
    # Although I chose not to do it here, you can also create different data
    # transformation pipelines for your categorical features.
    categorical_transformer = Pipeline(
        steps=[('frequent', SimpleImputer(strategy='most_frequent')),
               ('onehot', OneHotEncoder(use_cat_names=True))])

    # 3rd -> Combining both numerical and categorical pipelines
    # Here we create different ColumnTransformers, each one with a different
    # numerical transformation
    data_transformations_1 = ColumnTransformer(transformers=[
        ('num', numeric_transformer_1, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)])
    data_transformations_2 = ColumnTransformer(transformers=[
        ('num', numeric_transformer_2, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)])
    data_transformations_3 = ColumnTransformer(transformers=[
        ('num', numeric_transformer_3, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)])
    data_transformations_4 = ColumnTransformer(transformers=[
        ('num', numeric_transformer_4, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)])

    # And finally, we apply these different data transformations inside
    # RandomizedSearchCV, trying to find the best imputing strategy, the best
    # feature engineering strategy and the best model with its respective
    # parameters. Below, we just need to initialize a Pipeline object with
    # any transformations we want on each of the steps.
    pipe = Pipeline(steps=[
        # Initializing the data transformation step with any of the above
        ('data_transformations', data_transformations_1),
        # Initializing the feature engineering step with any desired method
        ('feature_eng', PCA()),
        # Initializing the modeling step of the pipeline with any model object
        ('clf', SVC())
    ])
    # memory='cache_folder' -> Used to optimize memory when needed

    # Now we define the grid of parameters that RandomizedSearchCV will use.
    # It randomly chooses options for each step from the dictionaries
    # ('data_transformations', 'feature_eng', 'clf' and the 'clf__*'
    # parameters). At the end of its iterations, RandomizedSearchCV returns
    # the best options. The data-transformation and feature-engineering
    # choices are shared by every model, so we define them once.
    shared_steps = {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ]
    }
    # Note: entries of a parameter *list* are sampled verbatim, so a frozen
    # scipy distribution must not sit inside a list next to None (it would be
    # passed to the estimator as-is); explicit depth values are used instead.
    params_grid = [
        dict(shared_steps,
             clf=[KNeighborsClassifier()],
             clf__n_neighbors=stats.randint(1, 30),
             clf__metric=['minkowski', 'euclidean']),
        dict(shared_steps,
             clf=[LogisticRegression()],
             clf__penalty=['l1', 'l2'],
             # liblinear supports both the l1 and l2 penalties sampled above
             clf__solver=['liblinear'],
             clf__C=stats.uniform(0.01, 10)),
        dict(shared_steps,
             clf=[SVC()],
             clf__C=stats.uniform(0.01, 1),
             clf__gamma=stats.uniform(0.01, 1)),
        dict(shared_steps,
             clf=[DecisionTreeClassifier()],
             clf__criterion=['gini', 'entropy'],
             clf__max_features=[None, 'sqrt', 'log2'],
             clf__max_depth=[None, 1, 2, 3, 4]),
        dict(shared_steps,
             clf=[RandomForestClassifier()],
             clf__n_estimators=stats.randint(10, 175),
             clf__max_features=[None, 'sqrt', 'log2'],
             clf__max_depth=[None, 1, 2, 3, 4],
             clf__random_state=stats.randint(1, 49)),
        dict(shared_steps,
             clf=[ExtraTreesClassifier()],
             clf__n_estimators=stats.randint(10, 150),
             clf__max_features=[None, 'sqrt', 'log2'],
             clf__max_depth=[None, 1, 2, 3, 4, 5]),
        dict(shared_steps,
             clf=[GradientBoostingClassifier()],
             clf__n_estimators=stats.randint(10, 100),
             clf__learning_rate=stats.uniform(0.01, 0.7),
             clf__max_depth=[None, 1, 2, 3, 4, 5]),
        dict(shared_steps,
             clf=[LGBMClassifier()],
             clf__n_estimators=stats.randint(1, 100),
             clf__learning_rate=stats.uniform(0.01, 0.7),
             clf__max_depth=[None, 1, 2, 3, 4, 5]),
        dict(shared_steps,
             clf=[XGBClassifier()],
             clf__n_estimators=stats.randint(5, 125),
             clf__eta=stats.uniform(0.01, 1),
             clf__max_depth=[None, 1, 2, 3, 4, 5],
             clf__gamma=stats.uniform(0.01, 1)),
        dict(shared_steps,
             clf=[
                 StackingClassifier(estimators=[
                     ('svc', SVC(C=1, gamma=1)),
                     ('rf', RandomForestClassifier(max_depth=7,
                                                   max_features=None,
                                                   n_estimators=60,
                                                   n_jobs=-1,
                                                   random_state=42)),
                     ('xgb', XGBClassifier(eta=0.6, gamma=0.7,
                                           max_depth=None, n_estimators=30))
                 ], final_estimator=LogisticRegression(C=1))
             ]),
        dict(shared_steps,
             clf=[
                 VotingClassifier(estimators=[
                     ('gbt', GradientBoostingClassifier(learning_rate=0.8,
                                                        max_depth=None,
                                                        n_estimators=30)),
                     ('lgbm', LGBMClassifier(n_estimators=30,
                                             learning_rate=0.6,
                                             max_depth=None)),
                     ('xgb', XGBClassifier(eta=0.8, gamma=0.8,
                                           max_depth=None, n_estimators=40))
                 ], voting='soft')
             ])
    ]

    # Now we fit a RandomizedSearchCV to search over the grid of parameters
    # defined above
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    best_model_pipeline = RandomizedSearchCV(pipe, params_grid, n_iter=500,
                                             scoring=metrics, refit='accuracy',
                                             n_jobs=-1, cv=5, random_state=42)
    best_model_pipeline.fit(x_train, y_train)

    # At last, we check the final results
    print("\n\n#---------------- Best Data Pipeline found in RandomizedSearchCV ----------------#\n\n",
          best_model_pipeline.best_estimator_[0])
    print("\n\n#---------------- Best Feature Engineering technique found in RandomizedSearchCV ----------------#\n\n",
          best_model_pipeline.best_estimator_[1])
    print("\n\n#---------------- Best Classifier found in RandomizedSearchCV ----------------#\n\n",
          best_model_pipeline.best_estimator_[2])
    print("\n\n#---------------- Best Estimator's average Accuracy Score on CV (validation set) ----------------#\n\n",
          best_model_pipeline.best_score_)

    return x_train, x_test, y_train, y_test, best_model_pipeline
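# A minimal usage sketch for the function above, assuming df, target and the
# two column lists are already defined: run the search, then score the refit
# best estimator once on the held-out test split it returns.
x_train, x_test, y_train, y_test, best_model_pipeline = defineBestModelPipeline(
    df, target, categorical_columns, numeric_columns)
print("Test accuracy:",
      best_model_pipeline.best_estimator_.score(x_test, y_test))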
def stacking(model_listS): clf = StackingClassifier(estimators=model_listS, final_estimator=LogisticRegression()) doBestModel(model(clf))
    ('ada', AdaBoostClassifier(n_estimators=100)),
    ('network', MLPClassifier(solver='lbfgs', random_state=1,
                              activation='tanh', alpha=1e-6,
                              hidden_layer_sizes=(10, 30, 5))),
    # ('knn', KNeighborsClassifier(n_neighbors=10)),
    ('log', LogisticRegression(C=500, penalty="l2", max_iter=300, tol=0.1)),
    ('bagging', BaggingClassifier(DecisionTreeClassifier(min_samples_split=0.03),
                                  max_samples=0.8, max_features=0.8))
]
NN_model = StackingClassifier(estimators=estimators, final_estimator=gdbt_clf,
                              cv=5)
# NN_model = MLPClassifier(solver='lbfgs', random_state=1, activation='tanh',
#                          alpha=1e-6, hidden_layer_sizes=(10, 30, 5),
#                          max_iter=500)
NN_model = NN_model.fit(train, target)
pred1 = NN_model.predict(X_valid)
C2 = confusion_matrix(y_valid, pred1, labels=[0, 1, 2])
print(C2)

# plot
sns.set()
f, ax = plt.subplots()
sns.heatmap(C2, annot=True, ax=ax)
ax.set_title('confusion matrix for stacking')
ax.set_xlabel('predicted value')
ax.set_ylabel('true value')
# Prepare dataframes
features, labels = return_features_labels("data/titanic.csv")
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, train_size=0.8, random_state=42, shuffle=True)

# Init estimators
# Create learners per layer
layer_one_estimators = [
    ('rf_1', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('knn_1', KNeighborsClassifier(n_neighbors=5))]
layer_two_estimators = [
    ('dt_2', DecisionTreeClassifier()),
    ('rf_2', RandomForestClassifier(n_estimators=50, random_state=42)),
]
layer_two = StackingClassifier(estimators=layer_two_estimators,
                               final_estimator=LogisticRegression())

# Create the final model by stacking layer one on top of layer two
model = StackingClassifier(estimators=layer_one_estimators,
                           final_estimator=layer_two)
model.fit(features_train, labels_train)
labels_pred = model.predict(features_test)
skplt.metrics.plot_confusion_matrix(labels_test, labels_pred, normalize=True)
plt.show()
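# Because layer_two is itself a StackingClassifier passed as final_estimator,
# layer one's cross-validated predictions become layer two's input features.
# A small sketch to inspect that, assuming the fitted model above: transform()
# exposes the meta-features layer one feeds upward.
meta_features = model.transform(features_test)
print("meta-feature matrix shape:", meta_features.shape)
print("held-out accuracy:", model.score(features_test, labels_test))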
clf3param = model_KNNs[U_featurenames[i]].get_params() clf3 = KNeighborsClassifier(n_neighbors=clf3param['n_neighbors'], algorithm=clf3param['algorithm'], leaf_size=clf3param['leaf_size'], p=2, metric='minkowski') clf4param = model_SVMs[U_featurenames[i]].get_params() clf4 = svm.SVC(C=clf4param['C'], kernel=clf4param['kernel'], gamma=clf4param['gamma'], decision_function_shape='ovo', random_state=0) estimator = [('NB', clf1), ('RF', clf2), ('KNN', clf3), ('SVM', clf4)] clf5 = StackingClassifier(estimators=estimator, final_estimator=LogisticRegressionCV( cv=5, random_state=0), stack_method='auto', n_jobs=-1) score = cross_val_score(clf5, U_data[:, i].reshape(-1, 1), WHO, scoring='accuracy', cv=ShuffleSplit(n_splits=5, test_size=0.1, random_state=0), n_jobs=-1) Stackscores.append((round(np.mean(score), 4), U_featurenames[i])) model_stack = clf5.fit(U_data[:, i].reshape(-1, 1), WHO) model_stacks[U_featurenames[i]] = model_stack print("Stack %d done. " % (i)) Stackscores_sorted = sorted(Stackscores, reverse=True)
best_cols = Importances.nlargest(20).index X = df[best_cols] X = scale.fit_transform(X) # ----- VotingClassifier ----- # vote = VotingClassifier([('ExtraRFC', ExtraRFC), ('RFC', RFC), ('GB', GB), ('Adaboost', Adaboost), ('XGBoost', XGBoost), ('LGBM', LGBM)]) scores = cross_val_score(vote, X, label, cv=10) print(f''' VotingClassifier: mean: {pp.round(pp.mean(scores), 3)} | STD: {pp.round(pp.std(scores), 2)} ''') # ----- StackingClassifier ----- # stack = StackingClassifier([('ExtraRFC', ExtraRFC), ('RFC', RFC), ('GB', GB), ('Adaboost', Adaboost), ('XGBoost', XGBoost), ('LGBM', LGBM)]) scores = cross_val_score(stack, X, label, cv=10) print(f''' StackingClassifier: mean: {pp.round(pp.mean(scores), 3)} | STD: {pp.round(pp.std(scores), 2)} ''') ''' VotingClassifier: mean: 0.862 | STD: 0.03 StackingClassifier: mean: 0.862 | STD: 0.03 ''' # ----- BaggingClassifier: StackingClassifier ----- #
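# The "BaggingClassifier: StackingClassifier" header above has no body. A
# minimal hedged sketch of what it presumably did, reusing this snippet's
# stack, X, label, pp and cross_val_score: wrap the stacking ensemble in a
# BaggingClassifier and score it the same way.
bag = BaggingClassifier(stack, n_estimators=10, random_state=42)
scores = cross_val_score(bag, X, label, cv=10)
print(f'''
Bagging(Stacking): mean: {pp.round(pp.mean(scores), 3)} | STD: {pp.round(pp.std(scores), 2)}
''')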
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

filename = '../../datasets/bankloan_classification_train.csv'
names = ['age', 'loanamount', 'status']
df = read_csv(filename, names=names)
array = df.values
inputx = array[:, 0:2]
outputy = array[:, 2]

estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svr', make_pipeline(StandardScaler(),
                                    LinearSVC(random_state=42)))]
thismodel = StackingClassifier(estimators=estimators,
                               final_estimator=LogisticRegression())
print(thismodel.fit(inputx, outputy))

filename = '../../datasets/bankloan_classification_test.csv'
names = ['age', 'loanamount']
newdataframe = read_csv(filename, names=names)
array = newdataframe.values
testinputz = array[0:4, 0:2]
print(newdataframe)
res = thismodel.predict(testinputz)
reslist = []
for val in res:
    if val == 0:
        reslist.append("WillNotPay")
    else:
        reslist.append("WillPay")
print(reslist)
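# Beyond the hard labels above, the stacked model's logistic-regression final
# estimator also exposes class probabilities. A minimal sketch, assuming the
# fitted thismodel and testinputz from this snippet:
proba = thismodel.predict_proba(testinputz)
for row, p in zip(testinputz, proba):
    # p[1] is the predicted probability of class 1 ("WillPay")
    print(row, "P(WillPay) = %.3f" % p[1])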
def getFitness(individual, X, y):
    """ Feature subset fitness function """
    if individual.count(0) != len(individual):
        # get indices with value 0 (the features to drop)
        cols = [index for index in range(len(individual))
                if individual[index] == 0]
        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # apply a classification algorithm: uncomment exactly ONE candidate.
        # Assigning them all in sequence would only keep the last one, and the
        # meta-estimators (ClassifierChain, MultiOutputClassifier, OneVsOne/
        # OneVsRest/OutputCodeClassifier, StackingClassifier, VotingClassifier)
        # cannot even be constructed without a base estimator argument.
        # clf = AdaBoostClassifier()
        # clf = BaggingClassifier()
        # clf = BernoulliNB()
        # clf = CalibratedClassifierCV()
        # clf = CategoricalNB()
        # clf = ComplementNB()
        # clf = DecisionTreeClassifier()
        # clf = DummyClassifier()
        # clf = ExtraTreeClassifier()
        # clf = ExtraTreesClassifier()
        # clf = GaussianNB()
        # clf = GaussianProcessClassifier()
        # clf = GradientBoostingClassifier()
        # clf = HistGradientBoostingClassifier()
        # clf = KNeighborsClassifier()
        # clf = LabelPropagation()
        # clf = LabelSpreading()
        # clf = LinearDiscriminantAnalysis()
        # clf = LinearSVC()
        # clf = LogisticRegressionCV()
        # clf = MLPClassifier()
        # clf = MultinomialNB()
        # clf = NearestCentroid()
        # clf = NuSVC()
        # clf = PassiveAggressiveClassifier()
        # clf = Perceptron()
        # clf = QuadraticDiscriminantAnalysis()
        # clf = RadiusNeighborsClassifier()
        # clf = RandomForestClassifier()
        # clf = RidgeClassifier()
        # clf = RidgeClassifierCV()
        # clf = SGDClassifier()
        # clf = SVC()
        clf = LogisticRegression()

        clf.fit(X_subset, y)
        # fitness = mean 5-fold cross-validated accuracy on the subset
        return (np.mean(cross_val_score(clf, X_subset, y, cv=5)),)
    else:
        return (0,)
class Classifier(object):
    def __init__(self, in_model_code, db, y_col="party",
                 label_col="county_fips", where_clauses=None,
                 data_view="master_data", year_col="year", year_test=2020):
        self.db = db
        self.mc = in_model_code
        self.drop_cols = db.query(ModelDropCol).filter_by(
            model_code_id=self.mc.id).all()
        where = self.db.query(ModelWhereClause).filter_by(
            model_code=self.mc).all()
        if where:
            self.where = " where " + (" and ".join([wc.sql for wc in where]))
        else:
            self.where = ""
        self.engine_string = database_string
        self.query = f"select * from {data_view}{self.where}"
        self.df = pandas.read_sql_query(
            self.query,
            database_string).drop(columns=[dc.column for dc in self.drop_cols])
        self.y = self.df[y_col].to_numpy()
        self.x = self.df.drop(columns=y_col).to_numpy()
        self.model_obj = self.db.query(Model).filter_by(
            model_code=self.mc).first()
        if not self.model_obj:
            rf = RandomForestClassifier(n_estimators=10, random_state=42)
            svr = make_pipeline(
                StandardScaler(),
                LinearSVC(random_state=42, dual=False, max_iter=1000))
            knn = KNeighborsClassifier(n_neighbors=3)
            nb = GaussianNB()
            classifiers = [("rf", rf), ("svr", svr), ("knn", knn), ("nb", nb)]
            self.model = StackingClassifier(
                estimators=classifiers, final_estimator=LogisticRegression())
            self.accuracy = None
            self.model_obj = Model(model_code=self.mc, accuracy=self.accuracy)
            self.db.add(self.model_obj)
            self.train()
            self.save()
        else:
            self.model = pickle.loads(self.model_obj.model_object)
            self.accuracy = self.model_obj.accuracy

    def train(self):
        x_train, x_test, y_train, y_test = train_test_split(self.x, self.y,
                                                            test_size=0.33)
        self.model.fit(x_train, y_train)
        self.accuracy = self.model.score(x_test, y_test)

    def save(self):
        self.model_obj.model_object = pickle.dumps(self.model)
        self.model_obj.accuracy = self.accuracy
        self.db.commit()

    def predict(self, fips, in_file_path=None):
        """
        Currently hard-coded to predict for 2020, i.e. the latest election for
        which all data is available but on which the model was not trained.
""" if "2020" in self.mc.id: raise IOError( "Must be a non-2020 model code to predict 2020 results.") year = 2020 logging.info(f"Selecting {self.mc.id} model ({self.mc.description})") if fips in ["ALL", "*"]: and_clause = "" logging.info("Predicting all counties...") all_counties = True else: and_clause = f" and county_fips = {fips}" all_counties = False max_year = self.db.execute( f"select max(year) from ({self.query})").scalar() search_year = max_year - 4 data = pandas.read_sql_query( f"select * from ({self.query}) where year = '{search_year}'{and_clause}", self.engine_string).drop( columns=[dc.column for dc in self.drop_cols]) fields = list(data.columns) county_fips_idx = None for i, f in enumerate(fields): if f == "county_fips": county_fips_idx = i - 1 break y = data["party"].to_numpy() x = data.drop(columns=["party"]).to_numpy() predictions = self.model.predict(x) out_predictions = [] fips_to_county = {} logging.info("Predictions:") i = 0 for val in x: pred = predictions[i] county_id = str(int(val[county_fips_idx])).zfill(6) if county_id in fips_to_county: county = fips_to_county[county_id] else: county = self.db.query(County).filter_by(id=county_id).first() fips_to_county[county_id] = county logging.info(f"{county.name} ({county.id}): {pred}") out_predictions.append({ "party_prediction": pred, "county_fips": county_id, "county_name": county.name, "state_fips": county.state.id, "state_code": county.state.code }) i += 1 if in_file_path: logging.info(f"Writing output to {in_file_path}") out_cols = [ "party_prediction", "county_fips", "county_name", "state_fips", "state_code" ] with open(in_file_path, "w") as csv_file: writer = csv.DictWriter(csv_file, fieldnames=out_cols) writer.writeheader() writer.writerows(out_predictions) return out_predictions
    y, test_size=0.3, random_state=0)

# Feature Scaling
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
# use transform (not fit_transform) on the test set so it is scaled with the
# statistics learned from the training set
x_test = sc.transform(x_test)

# Building Model
estimators = [('sgd', SGDClassifier(loss='modified_huber', random_state=0)),
              ('knn', make_pipeline(StandardScaler(),
                                    KNeighborsClassifier(n_neighbors=11)))]
clf = StackingClassifier(estimators=estimators,
                         final_estimator=KNeighborsClassifier(n_neighbors=11))
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
acc = accuracy_score(y_test, y_pred)
print("accuracy score %0.2f%%" % (acc * 100))

# ROC and AUC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

clf_probs = clf.predict_proba(x_test)
clf_probs = clf_probs[:, 1]
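# The roc_auc_score/roc_curve imports above are never used in the snippet.
# A minimal completion sketch for the positive class, assuming binary labels
# in y_test and matplotlib.pyplot imported as plt:
auc = roc_auc_score(y_test, clf_probs)
print("ROC AUC %0.3f" % auc)
fpr, tpr, _ = roc_curve(y_test, clf_probs)
plt.plot(fpr, tpr, label="stacking (AUC = %0.3f)" % auc)
plt.plot([0, 1], [0, 1], linestyle="--", label="chance")
plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.legend()
plt.show()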
# Build the group of base classifiers
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                  random_state=args.randomseed)),
    ('svm', SVC(kernel='sigmoid', random_state=args.randomseed)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, n_jobs=-1,
                              random_state=args.randomseed)),
    ('lgb', lgb.LGBMClassifier(boosting_type='goss', n_estimators=100,
                               n_jobs=-1, random_state=args.randomseed)),
    ('nb', GaussianNB()),  # Gaussian naive Bayes
    ('knn', KNeighborsClassifier(n_jobs=-1))  # k-nearest neighbors
]
# Build the meta-classifier
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(n_jobs=-1,
                                       random_state=args.randomseed),
    cv=5)

print('\nClassifier parameters:', clf)

# Feature ranking: look up the scoring-function object by the name passed on
# the command line
fs = getattr(sys.modules[__name__], args.feature)
# Rank the features
model_fs = SelectKBest(fs, k="all").fit(X, y)
# Sort feature weights in descending order
fs_idxs = np.argsort(-model_fs.scores_)

print('\nStarting cross validating after feature selection...\n')

# Incremental-feature prediction over the ranked features
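# The incremental loop itself is not shown above. A minimal sketch of what it
# presumably does, assuming X is a numpy array and cross_val_score is
# imported: score the stacking classifier on the top-k ranked features for
# growing k.
for k in range(1, X.shape[1] + 1):
    X_topk = X[:, fs_idxs[:k]]
    scores = cross_val_score(clf, X_topk, y, cv=5, scoring='accuracy',
                             n_jobs=-1)
    print('top %d features: mean accuracy %.4f' % (k, scores.mean()))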
print(len(pipelines))
print(pipe1)
print(pipe2)
print(pipe3)
print(pipe4)
print(pipe5)
print(pipe6)
print(pipe7)
print(pipe8)
print(pipe9)
print(pipe10)

"""Stacking with kNN as the final estimator"""
kNN_sclf = StackingClassifier(estimators=pipelines,
                              final_estimator=KNeighborsClassifier(),
                              cv=stratifiedKfold)
kNN_sclf_scores = cross_val_score(kNN_sclf, X, y, cv=stratifiedKfold)
kNN_sclf_scores = np.array(kNN_sclf_scores)
print('accuracy: %0.4f (+/- %0.4f)' %
      (kNN_sclf_scores.mean(), kNN_sclf_scores.std()))

"""Stacking with Gaussian Naive Bayes as the final estimator"""
gaussian_nb_sclf = StackingClassifier(estimators=pipelines,
                                      final_estimator=GaussianNB(),
                                      cv=stratifiedKfold)
gaussian_nb_sclf_scores = cross_val_score(gaussian_nb_sclf, X, y,
                                          cv=stratifiedKfold)
def run(dataset, config):
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    # useful to disable multicore, regardless of the dataset config
    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = StackingClassifier(
            estimators=[
                ('rf', RandomForestClassifier(n_jobs=n_jobs,
                                              random_state=config.seed,
                                              **estimators_params['rf'])),
                ('gbm', GradientBoostingClassifier(random_state=config.seed,
                                                   **estimators_params['gbm'])),
                # the default hinge loss has no predict_proba, which
                # stack_method='predict_proba' below requires of every base
                # estimator; a probabilistic loss is set here but can still be
                # overridden via _linear_params
                ('linear', SGDClassifier(n_jobs=n_jobs,
                                         random_state=config.seed,
                                         **{'loss': 'modified_huber',
                                            **estimators_params['linear']})),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf', RandomForestRegressor(n_jobs=n_jobs,
                                             random_state=config.seed,
                                             **estimators_params['rf'])),
                ('gbm', GradientBoostingRegressor(random_state=config.seed,
                                                  **estimators_params['gbm'])),
                ('linear', SGDRegressor(random_state=config.seed,
                                        **estimators_params['linear'])),
                ('svc', LinearSVR(random_state=config.seed,
                                  **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            # LinearRegression takes no random_state parameter
            final_estimator=LinearRegression(n_jobs=n_jobs),
            n_jobs=n_jobs,
            **training_params)

    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
#StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))]), #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], GradientBoostingClassifier(random_state=RS)), #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], MLPClassifier(random_state=RS, hidden_layer_sizes=[100]*5)), #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], KNeighborsClassifier()), #RandomForestClassifier(random_state=RS, n_estimators=1000), #BalancedRandomForestClassifier(random_state=RS, n_estimators=1000), #OneVsRestClassifier(GradientBoostingClassifier(random_state=RS)), #OneVsOneClassifier(GradientBoostingClassifier(random_state=RS, n_estimators=1000)), #OneVsOneClassifier(RandomForestClassifier(random_state=RS, n_estimators=1000)), OneVsOneClassifier( BalancedRandomForestClassifier(random_state=RS, n_estimators=1000)), StackingClassifier( [('rs', OneVsOneClassifier( GradientBoostingClassifier(random_state=RS, n_estimators=1000)))], OneVsOneClassifier( BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))), #StackingClassifier([('rs', OneVsOneClassifier(RandomForestClassifier(random_state=RS, n_estimators=1000)))], OneVsOneClassifier(GradientBoostingClassifier(random_state=RS, n_estimators=1000))) #OneVsRestClassifier(MLPClassifier(hidden_layer_sizes= [100]*5, random_state=RS)), #OneVsOneClassifier(MLPClassifier(hidden_layer_sizes= [100]*5, random_state=RS)), #OneVsRestClassifier(SVC(decision_function_shape='ovr', random_state=RS)), #OneVsOneClassifier(SVC(decision_function_shape='ovo', random_state=RS)), #RandomForestClassifier(random_state=RS, n_estimators=1000, min_samples_leaf=3), #RandomForestClassifier(random_state=RS, n_estimators=1000, criterion='entropy'), #LinearDiscriminantAnalysis(), ##QuadraticDiscriminantAnalysis(), #BalancedBaggingClassifier(random_state=RS), ]
build_audit(GBDTLRClassifier(XGBClassifier(n_estimators = 17, random_state = 13), LogisticRegression()), "XGBLRAudit") build_audit(GBDTLRClassifier(XGBRFClassifier(n_estimators = 7, max_depth = 6, random_state = 13), SGDClassifier(loss = "log", penalty = "elasticnet", random_state = 13)), "XGBRFLRAudit") build_audit(EstimatorProxy(GradientBoostingClassifier(loss = "exponential", init = None, random_state = 13)), "GradientBoostingAudit") build_audit(HistGradientBoostingClassifier(max_iter = 71, random_state = 13), "HistGradientBoostingAudit") build_audit(LGBMClassifier(objective = "binary", n_estimators = 37), "LGBMAudit", predict_params = {"num_iteration" : 17}, predict_proba_params = {"num_iteration" : 17}, num_iteration = 17) build_audit(LinearDiscriminantAnalysis(solver = "lsqr"), "LinearDiscriminantAnalysisAudit") build_audit(LinearSVC(penalty = "l1", dual = False, random_state = 13), "LinearSVCAudit", with_proba = False) build_audit(LogisticRegression(multi_class = "multinomial", solver = "newton-cg", max_iter = 500), "MultinomialLogisticRegressionAudit") build_audit(LogisticRegressionCV(cv = 3, multi_class = "ovr"), "OvRLogisticRegressionAudit") build_audit(BaggingClassifier(LogisticRegression(), n_estimators = 3, max_features = 0.5, random_state = 13), "LogisticRegressionEnsembleAudit") build_audit(GaussianNB(), "NaiveBayesAudit") build_audit(OneVsRestClassifier(LogisticRegression()), "OneVsRestAudit") build_audit(EstimatorProxy(RandomForestClassifier(n_estimators = 10, min_samples_leaf = 3, random_state = 13)), "RandomForestAudit", flat = True) build_audit(RidgeClassifierCV(), "RidgeAudit", with_proba = False) build_audit(BaggingClassifier(RidgeClassifier(random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "RidgeEnsembleAudit") build_audit(StackingClassifier([("lda", LinearDiscriminantAnalysis(solver = "lsqr")), ("lr", LogisticRegression())], final_estimator = GradientBoostingClassifier(n_estimators = 11, random_state = 13)), "StackingEnsembleAudit") build_audit(SVC(gamma = "auto"), "SVCAudit", with_proba = False) build_audit(VotingClassifier([("dt", DecisionTreeClassifier(random_state = 13)), ("nb", GaussianNB()), ("lr", LogisticRegression())], voting = "soft", weights = [3, 1, 2]), "VotingEnsembleAudit") build_audit(XGBClassifier(objective = "binary:logistic", importance_type = "weight", random_state = 13), "XGBAudit", predict_params = {"ntree_limit" : 71}, predict_proba_params = {"ntree_limit" : 71}, byte_order = "LITTLE_ENDIAN", charset = "US-ASCII", ntree_limit = 71) build_audit(XGBRFClassifier(objective = "binary:logistic", n_estimators = 31, max_depth = 5, random_state = 13), "XGBRFAudit") audit_X, audit_y = load_audit("Audit") def build_audit_cat(classifier, name, with_proba = True, fit_params = {}): marital_mapping = { "Married-spouse-absent" : "Married" } mapper = DataFrameMapper( [([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] + [(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] + [(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
import os

import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier

TRAIN_DATA_PATH = os.getenv("TRAIN_DATA_PATH")
TEST_DATA_PATH = os.getenv("TEST_DATA_PATH")

train_data = pd.read_csv(TRAIN_DATA_PATH)
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
sc = StandardScaler()
X_tr = sc.fit_transform(X_train)

level_0 = list()
# note: the name 'RF' is kept from the original even though the estimator is
# an ExtraTreesClassifier
level_0.append(('RF', ExtraTreesClassifier(n_estimators=1000)))
level_0.append(('LR', LogisticRegression(max_iter=7000)))
level_1 = LinearDiscriminantAnalysis()
model = StackingClassifier(estimators=level_0, final_estimator=level_1, cv=4)
model.fit(X_tr, y_train)

test_data = pd.read_csv(TEST_DATA_PATH)
X_te = sc.transform(test_data)
submission = model.predict(X_te)
submission = pd.DataFrame(submission)
submission.to_csv('submission.csv', header=['class'], index=False)
               RandomForestClassifier(random_state=123456, n_jobs=-1,
                                      max_depth=50, n_estimators=400,
                                      verbose=2)),
              ('xgboost',
               xgb.XGBClassifier(predictor='cpu_predictor', n_gpus=0,
                                 n_jobs=-1, n_estimators=700, eta=0.1,
                                 max_depth=10, verbose=2))]

stacking = StackingClassifier(estimators=estimators,
                              final_estimator=LogisticRegression(),
                              cv=5, verbose=2)

#stacking, y_test_stacking = validacion_cruzada(stacking, X, y, skf)

# Retrain on the full dataset. Note that the score reported below is computed
# on the training data, so it will look better than on test.
t = time.time()
clf = stacking
clf = clf.fit(X, y)
tiempo = time.time() - t
#plotImp(clf, selec, X.shape[1])

y_pred_tra = clf.predict(X)
print("F1 score (train): {:.4f}, time: {:6.2f} seconds".format(
    f1_score(y, y_pred_tra, average='micro'), tiempo))
from sklearn.ensemble import StackingClassifier, StackingRegressor from sklearn.ensemble import VotingClassifier, VotingRegressor X, y = load_iris(return_X_y=True) X_r, y_r = load_diabetes(return_X_y=True) @pytest.mark.parametrize( "X, y, estimator", [ ( *make_classification(n_samples=10), StackingClassifier(estimators=[ ("lr", LogisticRegression()), ("svm", LinearSVC()), ("rf", RandomForestClassifier()), ]), ), ( *make_classification(n_samples=10), VotingClassifier(estimators=[ ("lr", LogisticRegression()), ("svm", LinearSVC()), ("rf", RandomForestClassifier()), ]), ), ( *make_regression(n_samples=10), StackingRegressor(estimators=[ ("lr", LinearRegression()),
def test_stacking_classifier_drop_estimator(): # prescale the data to avoid convergence warning without using a pipeline # for later assert X_train, X_test, y_train, _ = train_test_split(scale(X_iris), y_iris, stratify=y_iris, random_state=42) estimators = [("lr", "drop"), ("svc", LinearSVC(random_state=0))] rf = RandomForestClassifier(n_estimators=10, random_state=42) clf = StackingClassifier(estimators=[("svc", LinearSVC(random_state=0))], final_estimator=rf, cv=5) clf_drop = StackingClassifier(estimators=estimators, final_estimator=rf, cv=5) clf.fit(X_train, y_train) clf_drop.fit(X_train, y_train) assert_allclose(clf.predict(X_test), clf_drop.predict(X_test)) assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test)) assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
"model": StackingClassifier( estimators=[ ( "lgbm", best_models["lgbm"]["model"] .set_params( **{"under__sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}} ) .set_params(**best_models["lgbm"]["parameters"]), ), ( "random_forest", best_models["random_forest"]["model"] .set_params( **{"under__sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}} ) .set_params(**best_models["random_forest"]["parameters"]), ), ( "xgboost", best_models["xgboost"]["model"] .set_params( **{"under__sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}} ) .set_params(**best_models["xgboost"]["parameters"]), ), ( "extratree", best_models["extratree"]["model"] .set_params( **{"under__sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}} ) .set_params(**best_models["extratree"]["parameters"]), ), ( "histgradientboosting", best_models["histgradientboosting"]["model"] .set_params( **{"sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}} ) .set_params(**best_models["histgradientboosting"]["parameters"]), ), ( "balanced_rf", best_models["balanced_rf"]["model"] .set_params( **{"sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}} ) .set_params(**best_models["balanced_rf"]["parameters"]), ), ], final_estimator=imb.pipeline.Pipeline( steps=[ ( "under", RandomUnderSampler( sampling_strategy={5: int(0.11 * (4 / 9) * 76647)} ), ), # 4/9 because of double cross-validation, 3 fold for BayesSearchCV and 3 fold for final_estimator. ("model", LGBMClassifier(n_jobs=-1, boosting_type="gbdt")), ] ), verbose=1, n_jobs=-1, cv=3, ),
def test_stacking_classifier_iris(cv, final_estimator, passthrough): # prescale the data to avoid convergence warning without using a pipeline # for later assert X_train, X_test, y_train, y_test = train_test_split(scale(X_iris), y_iris, stratify=y_iris, random_state=42) estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())] clf = StackingClassifier( estimators=estimators, final_estimator=final_estimator, cv=cv, passthrough=passthrough, ) clf.fit(X_train, y_train) clf.predict(X_test) clf.predict_proba(X_test) assert clf.score(X_test, y_test) > 0.8 X_trans = clf.transform(X_test) expected_column_count = 10 if passthrough else 6 assert X_trans.shape[1] == expected_column_count if passthrough: assert_allclose(X_test, X_trans[:, -4:]) clf.set_params(lr="drop") clf.fit(X_train, y_train) clf.predict(X_test) clf.predict_proba(X_test) if final_estimator is None: # LogisticRegression has decision_function method clf.decision_function(X_test) X_trans = clf.transform(X_test) expected_column_count_drop = 7 if passthrough else 3 assert X_trans.shape[1] == expected_column_count_drop if passthrough: assert_allclose(X_test, X_trans[:, -4:])
def build_stacking(
    models,
    base_model="LogisticRegression",
    base_model_params=None,
    cv=5,
    passthrough=False,
):
    """
    Function to build a simple stacking composed of models loaded in above dicts.

    Parameters
    -------------------
    models: list
        Models to use as base estimators.
    base_model: str
        Model to use as final estimator.
    base_model_params: dict
        Dict containing the parameters for the final estimator.
    cv: int
        The number of splits for a StratifiedKFold (k).
    passthrough: bool
        Whether or not to fit the final estimator with the data as well as
        with the base estimators' predictions.

    Returns
    -------------------
    A StackingClassifier.
    """
    # guard against the default None before expanding into **kwargs
    base_model_params = dict(base_model_params or {})
    base_models = [
        (m, best_models[m]["model"].set_params(**best_models[m]["parameters"]))
        for m in models
    ]
    if base_model == "LogisticRegression":
        final_estimator = imb.pipeline.Pipeline(
            steps=[
                (
                    "under",
                    RandomUnderSampler(
                        sampling_strategy={5: int(0.11 * (4 / 5) * 76647)}
                    ),
                ),
                ("model", LogisticRegression().set_params(**base_model_params)),
            ]
        )
    elif base_model == "LGBM":
        final_estimator = imb.pipeline.Pipeline(
            steps=[
                (
                    "under",
                    RandomUnderSampler(
                        sampling_strategy={5: int(0.11 * (4 / 5) * 76647)}
                    ),
                ),
                ("model", LGBMClassifier(**base_model_params)),
            ]
        )
    stacking = StackingClassifier(
        estimators=base_models,
        final_estimator=final_estimator,
        cv=cv,  # the documented k for the internal StratifiedKFold
        n_jobs=-1,
        passthrough=passthrough,
        verbose=1,
    )
    return stacking
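# A minimal usage sketch for build_stacking (hypothetical model keys; assumes
# the best_models dict and the X_train/y_train, X_test/y_test arrays exist):
stack = build_stacking(
    models=["lgbm", "random_forest"],
    base_model="LogisticRegression",
    base_model_params={"max_iter": 1000},
    cv=5,
    passthrough=False,
)
stack.fit(X_train, y_train)
print("held-out score:", stack.score(X_test, y_test))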
'estimators': [('lr', LinearRegression()), ('cor', LinearSVR())], 'final_estimator': NoWeightRegressor() }, TypeError, 'does not support sample weight')]) def test_stacking_regressor_error(y, params, type_err, msg_err): with pytest.raises(type_err, match=msg_err): reg = StackingRegressor(**params, cv=3) reg.fit(scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0])) @pytest.mark.parametrize( "estimator, X, y", [ (StackingClassifier( estimators=[('lr', LogisticRegression( random_state=0)), ('svm', LinearSVC(random_state=0))]), X_iris[:100], y_iris[:100]), # keep only classes 0 and 1 (StackingRegressor(estimators=[( 'lr', LinearRegression()), ('svm', LinearSVR(random_state=0))]), X_diabetes, y_diabetes) ], ids=['StackingClassifier', 'StackingRegressor']) def test_stacking_randomness(estimator, X, y): # checking that fixing the random state of the CV will lead to the same # results estimator_full = clone(estimator) estimator_full.set_params( cv=KFold(shuffle=True, random_state=np.random.RandomState(0))) estimator_drop = clone(estimator)
def test_stacking_64(self): self._common_classifier([ lambda: StackingClassifier([('a', LogisticRegression()), ('b', LogisticRegression())]) ], "StackingClassifier")