def run_ensemble(X_train, X_val, y_train, y_val, df_test):
    """Fit a two-level stacking ensemble and rank the test set with it.

    The outer stack combines a random forest and an RBF SVC. Its final
    estimator is itself a stack: a single KNN whose output feeds an XGBoost
    classifier. The validation accuracy is printed, then the fitted model is
    passed to ``test_results``.

    Returns whatever ``test_results(df_test, alg="ensemble", model=...)``
    returns.
    """
    # Imports stay local to the function, as in the original code.
    from sklearn.ensemble import StackingClassifier, RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    import xgboost as xgb

    # Second level: KNN -> XGBoost.
    meta_stack = StackingClassifier(
        estimators=[('knn', KNeighborsClassifier(n_neighbors=6))],
        final_estimator=xgb.XGBClassifier(objective="binary:logistic",
                                          random_state=42),
    )

    # First level: random forest + RBF SVC, combined by the stack above.
    base_learners = [
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(C=1, gamma=1e-6, kernel='rbf')),
    ]
    stacked = StackingClassifier(estimators=base_learners,
                                 final_estimator=meta_stack)
    stacked.fit(X_train, y_train)

    val_predictions = stacked.predict(X_val)
    print(accuracy_score(y_val, val_predictions))

    return test_results(df_test, alg="ensemble", model=stacked)
def rank_stacking_classifer(X, Y):
    """Fit and return a stacking classifier on ``(X, Y)``.

    Base learners are a random forest, gradient-boosted trees and AdaBoost;
    a logistic regression combines their outputs.
    """
    base_learners = [
        ('rf', RandomForestClassifier(n_jobs=20)),
        ('gbdt', GradientBoostingClassifier()),
        # The key says "Regressor" although the estimator is a classifier;
        # it is kept unchanged because the fitted model exposes it by name.
        ('AdaBoostRegressor', AdaBoostClassifier()),
    ]
    stack = StackingClassifier(estimators=base_learners,
                               final_estimator=LogisticRegression())
    return stack.fit(X, Y)
def fit_model(self):
    """
    09. Fit our stacked classifier model.

    Builds a StackingClassifier from ``self.estimators`` with a
    LogisticRegressionCV final estimator, fits it on the training data,
    stores the fitted model serialized with cloudpickle in ``self.mdl`` and
    moves on to ``self.compute_roc``.
    """
    # ``Cs`` is passed by keyword: LogisticRegressionCV parameters are
    # keyword-only in current scikit-learn, so the positional form
    # ``LogisticRegressionCV(10)`` raises TypeError there.
    mdl = StackingClassifier(estimators=self.estimators,
                             final_estimator=LogisticRegressionCV(Cs=10))

    # NOTE(review): X_val / Y_val / val_sample_weight / early_stopping_rounds
    # are not part of scikit-learn's StackingClassifier.fit signature;
    # presumably StackingClassifier here is a project-specific variant that
    # accepts them - confirm against the import.
    mdl.fit(self.X_train,
            self.y_train,
            X_val=self.X_val,
            Y_val=self.y_val,
            sample_weight=self.w_train,
            val_sample_weight=self.w_val,
            early_stopping_rounds=100)

    # Stored as bytes rather than as the live estimator object.
    self.mdl = cloudpickle.dumps(mdl)
    self.next(self.compute_roc)
def test_stacking_classifier_sparse_passthrough(fmt):
    """Passthrough on a sparse ``X`` keeps the features, sparsity and format."""
    X_sparse = sparse.coo_matrix(scale(X_iris)).asformat(fmt)
    X_train, X_test, y_train, _ = train_test_split(
        X_sparse, y_iris, random_state=42
    )

    stack = StackingClassifier(
        estimators=[('lr', LogisticRegression()), ('svc', LinearSVC())],
        final_estimator=RandomForestClassifier(n_estimators=10,
                                               random_state=42),
        cv=5,
        passthrough=True,
    )
    stack.fit(X_train, y_train)
    stacked_features = stack.transform(X_test)

    # The last four columns are the original (passed-through) iris features.
    assert_allclose_dense_sparse(X_test, stacked_features[:, -4:])
    assert sparse.issparse(stacked_features)
    assert X_test.format == stacked_features.format
def stacking_predictor(row):
    """
    Train the stacking model on our data and predict a single patient.

    The base layer is a random forest, a logistic regression, a KNN and an
    RBF SVM; the final estimator is a logistic regression. The model is
    refitted on every call from ``data/data.csv`` (after ``clean_data``)
    using the seven selected features.

    Parameters:
        row: the seven feature values of one patient, in the order of the
            feature columns listed below.

    Returns:
        The string ``"patient <label>"`` where ``<label>`` is the predicted
        diagnosis formatted as an integer.
    """
    training_data = clean_data(pd.read_csv("data/data.csv"))

    feature_columns = [
        'radius_mean', 'texture_mean', 'area_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'smoothness_mean'
    ]
    features = training_data[feature_columns]
    target = training_data['diagnosis']

    # Only the training part of the split is used; the held-out 20% is
    # discarded, as it was in the original code.
    x_train, _, y_train, _ = train_test_split(features,
                                              target,
                                              test_size=0.2,
                                              random_state=42)

    # Fit on plain arrays (no column names) because prediction receives a
    # bare list; ``y`` is a 1-D array, so no manual flattening is needed.
    X = x_train.to_numpy()
    y = y_train.to_numpy().ravel()

    estimators = [('random_forest',
                   RandomForestClassifier(n_estimators=5, random_state=42)),
                  ('logistic_regr',
                   LogisticRegression(solver="lbfgs", max_iter=1460)),
                  ('knn', KNeighborsClassifier(n_neighbors=5)),
                  ('svm_rbf', SVC(kernel='rbf', gamma=4, C=10000))]
    stacking_classifier = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression(), cv=5)

    # Fit the stacking model with our own data and the selected 7 features.
    stacking_classifier.fit(X, y)

    # predict() returns a 1-element array; index it explicitly instead of
    # relying on the deprecated implicit array-to-scalar conversion of "%d".
    predicted_label = stacking_classifier.predict([row])[0]
    return '%s %d' % ("patient", predicted_label)
def ensembler(self, method='voting'):
    """
    Combine all the classifiers in ``self.clfs`` with an ensemble method.

    ``method`` is either ``'voting'`` (VotingClassifier) or ``'stacking'``
    (StackingClassifier). The ensemble is fitted on ``self.X_train`` /
    ``self.y_train`` and returned. Any other ``method`` returns ``None``.
    """
    if method not in ('voting', 'stacking'):
        return None

    named_classifiers = list(self.clfs.items())
    if method == 'voting':
        ensemble = VotingClassifier(estimators=named_classifiers)
    else:
        ensemble = StackingClassifier(estimators=named_classifiers)

    ensemble.fit(self.X_train, self.y_train)
    return ensemble
def predict(self):
    """Train a stacking model on ``self.train_data`` and write a submission.

    The last column of ``self.train_data`` is the target; all other columns
    are features. Features are standardized, a random forest and a logistic
    regression are stacked under an SVC, and the predictions for
    ``self.test_data`` are written to ``submission.csv`` under the header
    ``quality``. Nothing is returned.
    """
    features = self.train_data.iloc[:, :-1]
    target = self.train_data.iloc[:, -1]

    # The scaler is fitted on the training features only and reused for test.
    scaler = StandardScaler().fit(features)
    features_scaled = scaler.transform(features)

    base_learners = [
        ('RF', RandomForestClassifier(n_estimators=700)),
        ('LR', LogisticRegression(max_iter=6000)),
    ]
    stack = StackingClassifier(estimators=base_learners,
                               final_estimator=SVC(C=1.2),
                               cv=4)
    stack.fit(features_scaled, target)

    predictions = stack.predict(scaler.transform(self.test_data))
    pd.DataFrame(predictions).to_csv('submission.csv',
                                     header=['quality'],
                                     index=False)
def main():
    """Train a stacking classifier with subject-grouped CV and write predictions.

    Reads ``features.txt``, ``X_train.csv``, ``X_test.csv``, ``y_train.csv``
    and ``subject_train.csv`` from the directory given by the parsed
    ``data_path`` argument, fits a stack of random forest / SVC / KNN under a
    logistic regression, and saves the test predictions (shifted back to
    1-based labels) to ``baseline.txt``.
    """
    args = parse_arguments()

    # params
    DATA_DIR = args.data_path
    num_folds = args.fold
    seed = 1234

    # setup data
    with open(DATA_DIR + '/features.txt') as f:
        features_name = [line.strip() for line in f]
    # Replace every non-alphanumeric character so names are safe identifiers.
    features_name = [
        "".join(c if c.isalnum() else "_" for c in str(x))
        for x in features_name
    ]

    X_train = pd.read_csv(DATA_DIR + '/X_train.csv', names=features_name)
    X_test = pd.read_csv(DATA_DIR + '/X_test.csv', names=features_name)
    y_train = pd.read_csv(DATA_DIR + '/y_train.csv',
                          names=['activity_label'])
    subject_train = pd.read_csv(DATA_DIR + '/subject_train.csv',
                                names=['subject_id'])

    # Make the labels zero-based.
    y_train['activity_label'] = y_train['activity_label'] - 1
    # Estimators expect a 1-D target; a one-column DataFrame triggers a
    # DataConversionWarning in scikit-learn.
    labels = y_train['activity_label'].to_numpy()
    groups = subject_train['subject_id'].to_numpy()

    # set up models
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          random_state=seed)),
                  ('svr', SVC(probability=True, random_state=seed)),
                  ('knn', KNeighborsClassifier())]
    final_estimator = LogisticRegression(random_state=seed)

    # Folds keep every subject's rows together. The splits are materialized
    # because a generator can only be consumed once.
    kf = GroupKFold(n_splits=num_folds)
    cv_idx = list(kf.split(X=X_train, groups=groups))

    clf = StackingClassifier(estimators=estimators,
                             final_estimator=final_estimator,
                             cv=cv_idx)

    # train
    clf.fit(X_train, labels)

    # make submission (back to 1-based labels)
    test_preds = clf.predict(X_test)
    submit = test_preds + 1
    # NOTE(review): savetxt's default format writes labels in scientific
    # notation (e.g. 1.0e+00); pass fmt='%d' if plain integers are expected.
    np.savetxt('baseline.txt', submit)
class stacked_model(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Thin scikit-learn style wrapper around ``StackingClassifier``.

    Parameters
    ----------
    base_models : list of (name, estimator) tuples
        First-level estimators, forwarded as ``estimators``.
    meta_model : estimator
        Second-level estimator, forwarded as ``final_estimator``.
    n_folds : int or None
        Forwarded as ``cv`` to ``StackingClassifier``.
    """

    def __init__(self, base_models=None, meta_model=None, n_folds=None):
        # Parameters are stored untouched, as scikit-learn's get_params /
        # clone machinery requires.
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Build the underlying stack, fit it on ``(X, y)`` and return self."""
        # The former ``level0`` copy of ``base_models`` was never used; the
        # estimators are forwarded directly.
        self.get_stacking_ = StackingClassifier(estimators=self.base_models,
                                                final_estimator=self.meta_model,
                                                cv=self.n_folds)
        self.get_stacking_.fit(X, y)
        return self

    def predict(self, X):
        """Return the fitted stack's predictions for ``X``."""
        return self.get_stacking_.predict(X)
def test_stacking_classification():
    """Smoke test: an XGBoost + scaled LinearSVC stack fits and scores on iris.

    There are no assertions; the test passes if nothing raises.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_iris
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import make_pipeline
    from sklearn.ensemble import StackingClassifier

    features, target = load_iris(return_X_y=True)

    # NOTE(review): ``xgb`` is not imported here, unlike the other names;
    # presumably it is imported at module level - confirm.
    base_learners = [
        ('gbm', xgb.sklearn.XGBClassifier()),
        ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state=42))),
    ]
    stack = StackingClassifier(estimators=base_learners,
                               final_estimator=LogisticRegression())

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        random_state=42)
    stack.fit(X_train, y_train).score(X_test, y_test)
def stack_ensemble():
    '''
    Train and evaluate a StackingClassifier on WOE-encoded, min-max scaled data.

    Parameters: N/A (X_train, X_test, y_train and y_test are read from the
        enclosing scope)

    Returns: N/A

    Outputs: prints accuracy, F1 score, confusion matrix and
        classification report for the test set
    '''
    # Encode, then scale; both transformers are fitted on training data only.
    encoder = WOEEncoder()
    train_encoded = encoder.fit_transform(X_train, y_train)
    test_encoded = encoder.transform(X_test)

    scaler = MinMaxScaler()
    train_scaled = pd.DataFrame(scaler.fit_transform(train_encoded, y_train))
    test_scaled = pd.DataFrame(scaler.transform(test_encoded))

    base_learners = [
        ('linSVC', LinearSVC()),
        ('bayes', GaussianNB()),
        ('knn', KNeighborsClassifier()),
        ('rfc', RandomForestClassifier()),
    ]
    # A logistic regression acts as the meta learner.
    stack = StackingClassifier(estimators=base_learners,
                               final_estimator=LogisticRegression(),
                               cv=3)
    stack.fit(train_scaled, y_train)
    predictions = stack.predict(test_scaled)

    print('Stack Accuracy :', accuracy_score(y_test, predictions))
    print('stack F1 :', f1_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
def ensemble_(feat, tar, split):
    """Compare single models with voting, bagging and stacking ensembles.

    The data is split (``split`` is the test fraction), min-max scaled with
    statistics from the training part, and then:

    * KNN and random forest are tuned with 5-fold grid search,
    * a logistic regression is fitted as is,
    * hard-voting, bagging and stacking ensembles are fitted.

    The best grid-search parameters and every test-set score are printed.
    Nothing is returned.
    """
    x_tr, x_te, y_tr, y_te = train_test_split(feat,
                                              tar,
                                              test_size=split,
                                              shuffle=True)
    scaler = MinMaxScaler()
    scaler.fit(x_tr)
    x_tr = scaler.transform(x_tr)
    x_te = scaler.transform(x_te)

    knn_gs = GridSearchCV(KNeighborsClassifier(),
                          {'n_neighbors': np.arange(1, 25)},
                          cv=5)
    knn_gs.fit(x_tr, y_tr)
    knn_best = knn_gs.best_estimator_
    print(knn_gs.best_params_)

    rf_gs = GridSearchCV(RandomForestClassifier(),
                         {'n_estimators': [50, 100, 200, 300, 400]},
                         cv=5)
    rf_gs.fit(x_tr, y_tr)
    rf_best = rf_gs.best_estimator_
    print(rf_gs.best_params_)

    log_reg = LogisticRegression()
    log_reg.fit(x_tr, y_tr)

    print('knn: {}'.format(knn_best.score(x_te, y_te)))
    print('rf: {}'.format(rf_best.score(x_te, y_te)))
    print('log_reg: {}'.format(log_reg.score(x_te, y_te)))

    estimators = [('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]

    ensemble = VotingClassifier(estimators, voting='hard')
    ensemble.fit(x_tr, y_tr)
    print("ensemble voting score: ", str(ensemble.score(x_te, y_te)))

    # The base estimator is passed positionally: the keyword was renamed from
    # ``base_estimator`` to ``estimator`` and the old name has been removed,
    # while the first positional slot works across versions.
    ensemble_bagging = BaggingClassifier(RandomForestClassifier(),
                                         n_estimators=10)
    ensemble_bagging.fit(x_tr, y_tr)
    print("ensemble bagging score: ", str(ensemble_bagging.score(x_te, y_te)))

    ensemble_stacking = StackingClassifier(estimators, LogisticRegression())
    ensemble_stacking.fit(x_tr, y_tr)
    print("ensemble stacking score: ",
          str(ensemble_stacking.score(x_te, y_te)))
def Model_1(train, test):
    '''
    Trains the model and saves the predictions in CSV files.

    train : Training set; its 'Sequence' and 'label' columns are used
    test : Test set; its 'Sequence' and 'ID' columns are used

    Writes "Submission_1.csv" (ID + probability of the second class) and
    "Predictions_1.csv" (ID + predicted label). Returns nothing.
    '''

    def _split_sequences(sequences):
        # One feature per position: each sequence becomes a list of characters.
        return [list(s) for s in sequences]

    # Preprocessing
    # A single encoder is fitted on the training sequences and reused for the
    # test set, so both matrices share the same columns. Fitting a second
    # encoder on the test set can yield different or misaligned columns.
    # Characters unseen during training are encoded as all zeros.
    # ``.toarray()`` replaces the removed ``sparse=False`` keyword.
    encoder = OneHotEncoder(handle_unknown='ignore')
    X_train = encoder.fit_transform(
        _split_sequences(train['Sequence'])).toarray()
    X_test = encoder.transform(_split_sequences(test['Sequence'])).toarray()
    Y_train = train['label']

    # Balance the classes by under-sampling, then shuffle.
    X_train, Y_train = RandomUnderSampler(random_state=100).fit_resample(
        X_train, Y_train)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=100)

    # Training
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          max_depth=45,
                                          min_samples_leaf=7,
                                          random_state=100)),
                  ('mlp', MLPClassifier(max_iter=200, random_state=100)),
                  ('knn', KNeighborsClassifier(n_neighbors=4))]
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(random_state=100),
        n_jobs=-1,
        verbose=1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_pred = clf.predict(X_test)
    # Column 1 holds the probability of the second class in clf.classes_.
    Y_prob = clf.predict_proba(X_test)[:, 1]

    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_1.csv", index=False)
    # The same frame is reused with hard labels for the second file.
    result["Label"] = Y_pred
    result.to_csv("Predictions_1.csv", index=False)
def test_stacking_classifier_drop_estimator():
    """A stack with a 'drop' placeholder behaves like one without that entry."""
    # The data is pre-scaled (no pipeline) to avoid convergence warnings.
    X_train, X_test, y_train, _ = train_test_split(scale(X_iris),
                                                   y_iris,
                                                   stratify=y_iris,
                                                   random_state=42)
    estimators_with_drop = [('lr', 'drop'),
                            ('svc', LinearSVC(random_state=0))]
    rf = RandomForestClassifier(n_estimators=10, random_state=42)

    reference = StackingClassifier(
        estimators=[('svc', LinearSVC(random_state=0))],
        final_estimator=rf,
        cv=5)
    with_drop = StackingClassifier(estimators=estimators_with_drop,
                                   final_estimator=rf,
                                   cv=5)

    reference.fit(X_train, y_train)
    with_drop.fit(X_train, y_train)

    # Every output of the two stacks must agree.
    for method in ("predict", "predict_proba", "transform"):
        assert_allclose(getattr(reference, method)(X_test),
                        getattr(with_drop, method)(X_test))
def main():
    """Fit a five-model stacking classifier and print its weighted F1 score.

    Data comes from ``load_data()``. The base models are a random forest, an
    SVC, Gaussian naive Bayes, KNN and a decision tree; the final estimator is
    StackingClassifier's default.
    """
    np.random.seed(0)
    train_X, train_y, test_X, test_y = load_data()

    # Stacking models:
    base_models = [('rfc', RandomForestClassifier()),
                   ('svm', SVC()),
                   ('gnb', GaussianNB()),
                   ('knc', KNeighborsClassifier()),
                   ('dtc', DecisionTreeClassifier())]

    # The default final_estimator is LogisticRegression
    sc = StackingClassifier(estimators=base_models)

    # fit the model on the training data
    sc.fit(train_X, train_y)

    # predict
    y_pred = sc.predict(test_X)

    # f1_score expects (y_true, y_pred). With average='weighted' the class
    # weights come from the true labels, so the order changes the result.
    print(f"f1 score = {f1_score(test_y, y_pred, average='weighted')}")
def test_stacking():
    """IREP and RIPPER can be used as base estimators of a StackingClassifier.

    The dataset is one-hot encoded, split, and each rule learner is stacked
    with a second estimator under a logistic regression. Each stack's score
    must differ from the score of a lone decision tree.
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    # One-hot encode the categorical features; numeric ones are copied as is.
    df = DF.copy()
    numeric_cols = df.select_dtypes("number").columns
    categorical_cols = [
        col for col in df.columns
        if col not in numeric_cols and col != CLASS_FEAT
    ]
    dum_df = pd.get_dummies(df[categorical_cols])
    for col in numeric_cols:
        dum_df[col] = df[col]
    dum_df[CLASS_FEAT] = df[CLASS_FEAT]

    sktrain, sktest = df_shuffled_split(dum_df, random_state=42)
    # Labels come from the same frames as the features. The previous code
    # read them from unrelated ``train`` / ``test`` objects, which is only
    # correct if those happen to hold the same rows in the same order.
    sktrain_x, sktrain_y = sktrain.drop(CLASS_FEAT, axis=1), sktrain[CLASS_FEAT]
    sktest_x, sktest_y = sktest.drop(CLASS_FEAT, axis=1), sktest[CLASS_FEAT]

    # Baseline: a single decision tree.
    lone_tree = DecisionTreeClassifier(random_state=42)
    lone_tree.fit(sktrain_x, sktrain_y)
    lone_tree_score = lone_tree.score(sktest_x, sktest_y)

    # IREP stacked with an SVC (registered under the key "tree").
    irep_partner = SVC(random_state=42)
    irep_stack_estimators = [("irep", irep), ("tree", irep_partner)]
    irep_stack = StackingClassifier(estimators=irep_stack_estimators,
                                    final_estimator=LogisticRegression())
    irep_stack.fit(sktrain_x, sktrain_y)
    irep_stack_score = irep_stack.score(sktest_x, sktest_y)
    assert irep_stack_score != lone_tree_score

    # RIPPER stacked with a decision tree.
    rip_tree = DecisionTreeClassifier(random_state=42)
    rip_stack_estimators = [("rip", rip), ("tree", rip_tree)]
    rip_stack = StackingClassifier(estimators=rip_stack_estimators,
                                   final_estimator=LogisticRegression())
    rip_stack.fit(sktrain_x, sktrain_y)
    rip_stack_score = rip_stack.score(sktest_x, sktest_y)
    assert rip_stack_score != lone_tree_score
def train(X, y):
    """Cross-validate, then fit and return a stacking classifier on ``(X, y)``.

    The stack combines a random forest, quadratic discriminant analysis and
    XGBoost (hyper-parameters come from ``params`` / ``xgb_params`` in the
    enclosing scope) and uses StackingClassifier's default final estimator.
    The mean cross-validation result, times 100, is printed before the final
    fit on all the data.
    """
    splitter = StratifiedShuffleSplit(n_splits=2,
                                      test_size=0.8,
                                      random_state=42)

    # HistGradientBoosting / GradientBoosting / a lone XGBoost model were
    # earlier alternatives; the stack below is the active model.
    base_learners = [
        ("RandomForest", RandomForestClassifier(**params)),
        ("Quadrant", QuadraticDiscriminantAnalysis()),
        ("XGB", XGBClassifier(**xgb_params)),
    ]
    stack = StackingClassifier(estimators=base_learners, n_jobs=-1)

    print("Train & Cross validation".center(40, '-'))
    # NOTE(review): the argument order (splitter, X, y, model) is not
    # scikit-learn's cross_validate signature; presumably a project helper.
    fold_results = cross_validate(splitter, X, y, stack)
    print(np.mean(fold_results, axis=0) * 100)

    stack.fit(X, y)
    return stack
def test_stacking_classifier():
    '''
    Regression test for https://github.com/koaning/scikit-lego/issues/501

    Thresholder clones the model it wraps, which produces an unfitted model
    where no predict_proba exists. There are no asserts: the test only checks
    that fitting and predicting do not raise.
    '''
    base_learners = [("dummy",
                      DummyClassifier(strategy="constant", constant=0))]

    # Random features and a random boolean target.
    X = np.random.normal(0, 1, (100, 3))
    y = np.random.normal(0, 1, (100, )) < 0

    stack = StackingClassifier(estimators=base_learners,
                               final_estimator=DummyClassifier(
                                   strategy="constant", constant=0))
    stack.fit(X, y)

    thresholder = Thresholder(stack, threshold=0.2)
    thresholder.fit(X, y)
    thresholder.predict(X)
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    """End-to-end check of StackingClassifier on iris, with and without 'lr'.

    Verifies the prediction methods run, the score exceeds 0.8, and that
    ``transform`` yields the expected number of columns (plus the four
    original features when ``passthrough`` is on), both for the full stack
    and after dropping the logistic regression.
    """
    n_iris_features = 4

    # The data is pre-scaled (no pipeline) to avoid convergence warnings.
    X_train, X_test, y_train, y_test = train_test_split(scale(X_iris),
                                                        y_iris,
                                                        stratify=y_iris,
                                                        random_state=42)

    def check_transform(model, n_meta_columns):
        # Meta-feature columns come first; passthrough appends the raw ones.
        transformed = model.transform(X_test)
        expected = n_meta_columns + (n_iris_features if passthrough else 0)
        assert transformed.shape[1] == expected
        if passthrough:
            assert_allclose(X_test, transformed[:, -n_iris_features:])

    stack = StackingClassifier(
        estimators=[("lr", LogisticRegression()), ("svc", LinearSVC())],
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    stack.fit(X_train, y_train)
    stack.predict(X_test)
    stack.predict_proba(X_test)
    assert stack.score(X_test, y_test) > 0.8
    check_transform(stack, 6)

    # Same checks with the logistic regression dropped.
    stack.set_params(lr="drop")
    stack.fit(X_train, y_train)
    stack.predict(X_test)
    stack.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        stack.decision_function(X_test)
    check_transform(stack, 3)
def ensemble_predictions(members, X_te, params):
    """Combine the predictions of already-fitted ensemble members.

    Parameters
    ----------
    members : sequence of fitted classifiers
        Each must expose ``predict_proba``.
    X_te : array-like
        Samples to predict on.
    params : dict
        ``params["type"]`` selects the strategy:

        * ``"weighted"`` - weighted average of the members' probabilities,
          using ``params["weights"]`` (one weight per member).
        * ``"stacked"`` - a logistic regression is trained on the members'
          outputs for ``params["X_tr"]`` (a DataFrame) and ``params["y_tr"]``.

    Returns
    -------
    The ensemble's class probabilities for ``X_te``.

    Raises
    ------
    AssertionError
        If ``params["type"]`` is neither ``"weighted"`` nor ``"stacked"``.
    """
    assert params["type"] in ("weighted", "stacked")

    if params["type"] == "weighted":
        member_probas = np.array(
            [model.predict_proba(X_te) for model in members])
        # mean across ensemble members
        return np.average(member_probas, weights=params["weights"], axis=0)

    estimators = [(f'expert_{i}', member) for i, member in enumerate(members)]
    # Only the final estimator should be fitted here. With the default ``cv``
    # StackingClassifier refits clones of every member; ``cv="prefit"``
    # reuses the fitted members as they are.
    # NOTE(review): with prefit members, X_tr should be data they were not
    # trained on, otherwise the final estimator overfits.
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=LogisticRegression(),
                             cv="prefit")
    X_tr = params["X_tr"]
    print(X_tr.columns.tolist())
    y_tr = params["y_tr"]
    clf.fit(X_tr, y_tr.values.ravel())
    return clf.predict_proba(X_te)
def test_stacking_classifier_drop_column_binary_classification():
    """In binary classification each base estimator contributes one column."""
    features, target = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(
        scale(features), target, stratify=target, random_state=42
    )

    # both classifiers implement 'predict_proba' and will both drop one column
    proba_estimators = [('lr', LogisticRegression()),
                        ('rf', RandomForestClassifier(random_state=42))]
    stack = StackingClassifier(estimators=proba_estimators, cv=3)
    stack.fit(X_train, y_train)
    assert stack.transform(X_test).shape[1] == 2

    # LinearSVC does not implement 'predict_proba' and will not drop one column
    mixed_estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    stack.set_params(estimators=mixed_estimators)
    stack.fit(X_train, y_train)
    assert stack.transform(X_test).shape[1] == 2
def Stacking(self):
    """Fit a stacking classifier from the ticked base models and show its accuracy.

    The rf / knn / svm check boxes select the base estimators. When at least
    two are ticked, a StackingClassifier with a logistic regression as final
    estimator is fitted on the training data and its test accuracy is written
    to ``accuracyEnsembleLBL``. With fewer than two ticked nothing happens.
    Any exception is printed rather than raised, so a failing fit does not
    propagate to the caller.
    """
    # (check box, estimator name, factory), in the order rf, knn, svm used by
    # the previous hard-coded lists. Factories give each fit fresh estimators.
    candidates = [
        (self.rfcStackingcheckBox, 'rf',
         lambda: RandomForestClassifier(n_estimators=10, random_state=42)),
        (self.knnStackingcheckBox, 'knn',
         lambda: KNeighborsClassifier(n_neighbors=5)),
        (self.svmStackingcheckBox, 'svm', lambda: SVC()),
    ]
    try:
        # Building the list from the ticked boxes fixes the svm+knn case,
        # which previously stacked rf+svm instead.
        estimators = [(name, make_estimator())
                      for check_box, name, make_estimator in candidates
                      if check_box.isChecked()]
        if len(estimators) < 2:
            return

        clf = StackingClassifier(estimators=estimators,
                                 final_estimator=LogisticRegression())
        stackingAccuracy = clf.fit(self.X_train, self.y_train).score(
            self.X_test, self.y_test)
        self.accuracyEnsembleLBL.setText(str(stackingAccuracy))
    except Exception as a:
        # Best-effort handler: report the problem and carry on.
        print(a)
def stackingClassifier(Feature_train, y_train, Feature_test):
    """Fit a two-layer stacking classifier and return its test predictions.

    Layer one is a decision tree and a KNN. Their outputs feed a second
    stack (decision tree + SVC under a logistic regression), which acts as
    the final estimator.
    """
    # The keys 'rf_1' / 'rf_2' are kept although the estimators behind them
    # are a decision tree and an SVC.
    first_layer = [
        ('rf_1', DecisionTreeClassifier(max_depth=6, max_features=15)),
        ('knn_1', KNeighborsClassifier(n_neighbors=35)),
    ]
    second_layer = [
        ('dt_2', DecisionTreeClassifier(max_depth=6, max_features=15)),
        ('rf_2', svm.SVC()),
    ]

    inner_stack = StackingClassifier(estimators=second_layer,
                                     final_estimator=LogisticRegression())
    outer_stack = StackingClassifier(estimators=first_layer,
                                     final_estimator=inner_stack)

    fitted = outer_stack.fit(Feature_train, y_train)
    return fitted.predict(Feature_test)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split

# Stack a random forest and a scaled LinearSVC under a logistic regression and
# score the result on a stratified hold-out split of iris.
# NOTE(review): load_iris, RandomForestClassifier, LinearSVC and
# LogisticRegression are not imported in this excerpt; presumably they are
# imported earlier in the file.
X, y = load_iris(return_X_y=True)
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svr', make_pipeline(StandardScaler(),
                                    LinearSVC(random_state=42)))]
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)
# The score is a bare expression: it is computed but not stored.
clf.fit(X_train, y_train).score(X_test, y_test)

# %%
# Permutation-based feature importance
# ------------------------------------
#
# The :func:`inspection.permutation_importance` can be used to get an
# estimate of the importance of each feature, for any fitted estimator:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Synthetic data: 5 features, 3 of them informative. X and y are rebound here.
X, y = make_classification(random_state=0, n_features=5, n_informative=3)
# Stacking classifier over TF-IDF features.
# NOTE(review): ``estimators``, ``pos_filtered_data`` and ``labels`` are
# defined outside this excerpt.
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())
vectorizer = TfidfVectorizer()

# Show the first document with each inner token list joined into a string.
print([[" ".join(i) for i in p] for p in pos_filtered_data][0])

# Each document is flattened into one string (inner lists joined, then the
# resulting pieces joined) before TF-IDF vectorization.
data = vectorizer.fit_transform(
    [" ".join([" ".join(i) for i in p]) for p in pos_filtered_data])

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    labels,
                                                    test_size=0.33,
                                                    random_state=42)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))

# #### doc2vec with KNN
# (disabled experiment, kept for reference)
# print(pos_filtered_data[0])
# glued_data = []
# for item in pos_filtered_data:
#     new_item = []
#     for sent in item:
#         new_item.append(" ".join(sent))
#     glued_data.append(". ".join(new_item))
# print(glued_data[0])
# ############################################################ HistGradientBoostingClassifier clf_hgbc = HistGradientBoostingClassifier() clf_hgbc.fit(x_train, y_train) hgbc_pred = clf_hgbc.predict(x_test) hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred) # ############################################################ # ############################################################ LogisticRegression clf_lr = LogisticRegression() clf_lr.fit(x_train, y_train) clf_pred = clf_lr.predict(x_test) lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred) # ############################################################ # ############################################################ StackingClassifier clf_sc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()) clf_sc.fit(x_train, y_train) clf_pred = clf_sc.predict(x_test) sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred) # ############################################################ # ############################################################ VotingClassifier clf_vc = VotingClassifier(estimators=[("knn", clf_knn), ('adab', clf_adab), ('rfc', clf_rfc), ('gnc', clf_gbc), ("bc", clf_bc), ("etc", clf_etc), ("hgbc", clf_hgbc), ('xgb', clf_xgb), ("lr", clf_lr)], voting='soft') clf_vc.fit(x_train, y_train) clf_pred = clf_vc.predict(x_test) vc_matrices = evaluate_preds(clf_vc, x_test, y_test, clf_pred)
# Base learners for the stack: the tuned GBM and LightGBM models. The XGBoost
# and random-forest entries are disabled.
# NOTE(review): the grid-search objects, X_tr / y_tr and X_val / y_val are
# defined outside this excerpt.
estimators = [
    ('gbm', grid_search_gbm.best_estimator_),
    #('xgb', grid_search_xgb.best_estimator_),
    ('lgbm', lgbm_grid.best_estimator_)
]
#('rf', grid_search_rf.best_estimator_)]

# In[257]:

# The final estimator is trained on the base learners' predict_proba outputs.
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(
        random_state=20202020),  # logreg is better than gbm
    stack_method='predict_proba')
clf.fit(X_tr, y_tr)

# In[258]:

# Validation ROC AUC from column 1 of predict_proba. The last line is a bare
# expression, so the value is not stored.
results = clf.predict_proba(X_val)[:, 1]
act = y_val.array
roc_auc_score(act, results)

# #### 71.246 best on validation
# with lgbm and gbm as base learners

# ## CatBoost (left out of Stack model - takes forever to train)

# In[111]:
), ( "model", LGBMClassifier(n_jobs=-1, boosting_type="gbdt").set_params( **{ k.replace("final_estimator__model__", ""): v for k, v in params.items() }), ), ]), verbose=1, n_jobs=-1, cv=3, ) best_model = model.fit(X_train, y_train) preds = best_model.predict(X_test) print("loggeando movidas") mlflow.log_metrics( metrics={ "f1": f1_score(y_test, preds, average="macro"), "precision": precision_score(y_test, preds, average="macro"), "recall": recall_score(y_test, preds, average="macro"), "accuracy": accuracy_score(y_test, preds), "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"), "f2": fbeta_score(y_test, preds, beta=2, average="macro"), }) best_params = params for param in best_params.keys(): mlflow.log_param(param, best_params[param])
def test_stacking_classifier_error(y, params, type_err, msg_err):
    """Invalid ``params`` / ``y`` make StackingClassifier raise the expected error.

    The classifier is built with ``cv=3`` and fitted on scaled iris with unit
    sample weights; ``type_err`` matching ``msg_err`` must be raised.
    """
    with pytest.raises(type_err, match=msg_err):
        # Construction stays inside the context so errors raised there count.
        stack = StackingClassifier(**params, cv=3)
        unit_weights = np.ones(X_iris.shape[0])
        stack.fit(scale(X_iris), y, sample_weight=unit_weights)
much better than if we simply transform those outputs to [0,1] according to a threshold ''' X = pd.DataFrame({'Yamnet': y_predicted_yamnet, 'SVM': pd.Series(y_pred_svm)}) estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)), ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state=42)))] clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()) X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) clf.fit(X_train, y_train).score(X_test, y_test) # y_test == y_real.iloc[X_test.index] y_pred_combined = clf.predict_proba( X_test)[:, 1] # The probability of getting the output as 1 (cough) Confusion_Matrix(y_test, y_pred_combined, pred_prob=True) y_pred_combined = clf.predict_proba(X)[:, 1] y_real, y_predicted_combined = Confusion_Matrix(y, y_pred_combined, pred_prob=True) X_new = pd.DataFrame({'Yamnet': [0], 'SVM': [0.95]}) clf.predict_proba(X_new)[:, 1] # Import Joblib Module from Scikit Learn import joblib