def test_stacking_classifier_iris(cv, final_estimator):
    """Smoke-test StackingClassifier on iris, before and after dropping 'lr'.

    Checks that predict / predict_proba run, that the test score exceeds 0.8,
    and that ``transform`` yields 6 columns with both base estimators and
    3 columns once ``lr`` has been set to ``'drop'``.
    """
    # Scale up front instead of using a pipeline: this avoids convergence
    # warnings and keeps the inputs as plain arrays for the later asserts.
    scaled_iris = scale(X_iris)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_iris, y_iris, stratify=y_iris, random_state=42
    )

    base_learners = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    stack = StackingClassifier(
        estimators=base_learners, final_estimator=final_estimator, cv=cv
    )

    stack.fit(X_train, y_train)
    stack.predict(X_test)
    stack.predict_proba(X_test)
    assert stack.score(X_test, y_test) > 0.8

    stacked_features = stack.transform(X_test)
    assert stacked_features.shape[1] == 6

    # Refit with the logistic-regression base estimator disabled.
    stack.set_params(lr='drop')
    stack.fit(X_train, y_train)
    stack.predict(X_test)
    stack.predict_proba(X_test)
    if final_estimator is None:
        # The default final estimator (LogisticRegression) exposes
        # decision_function, so the stack must expose it as well.
        stack.decision_function(X_test)

    stacked_features = stack.transform(X_test)
    assert stacked_features.shape[1] == 3
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    """Smoke-test StackingClassifier on iris, including ``passthrough``.

    With ``passthrough`` enabled the transformed output is expected to carry
    the original input as its last 4 columns, in addition to the stacked
    predictions (6 columns with both base estimators, 3 after dropping 'lr').
    """
    # Scale up front instead of using a pipeline: this avoids convergence
    # warnings and lets X_test be compared directly against the passthrough
    # columns below.
    scaled_iris = scale(X_iris)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_iris, y_iris, stratify=y_iris, random_state=42
    )

    base_learners = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    stack = StackingClassifier(
        estimators=base_learners,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )

    stack.fit(X_train, y_train)
    stack.predict(X_test)
    stack.predict_proba(X_test)
    assert stack.score(X_test, y_test) > 0.8

    stacked_features = stack.transform(X_test)
    if passthrough:
        expected_column_count = 10
    else:
        expected_column_count = 6
    assert stacked_features.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, stacked_features[:, -4:])

    # Refit with the logistic-regression base estimator disabled.
    stack.set_params(lr='drop')
    stack.fit(X_train, y_train)
    stack.predict(X_test)
    stack.predict_proba(X_test)
    if final_estimator is None:
        # The default final estimator (LogisticRegression) exposes
        # decision_function, so the stack must expose it as well.
        stack.decision_function(X_test)

    stacked_features = stack.transform(X_test)
    if passthrough:
        expected_column_count_drop = 7
    else:
        expected_column_count_drop = 3
    assert stacked_features.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, stacked_features[:, -4:])
def run_ensemble(X_train, X_val, y_train, y_val, df_test):
    """Fit a two-level stacking ensemble and rank the test frame with it.

    The outer stack combines a random forest and an RBF SVC; its final
    estimator is itself a stack (KNN feeding an XGBoost classifier).
    Validation accuracy is printed, and the value returned by
    ``test_results`` for ``df_test`` is passed back to the caller.
    """
    from sklearn.ensemble import StackingClassifier, RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    import xgboost as xgb

    # Second level: KNN predictions fed into an XGBoost classifier.
    booster = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    final_layer = StackingClassifier(
        estimators=[('knn', KNeighborsClassifier(n_neighbors=6))],
        final_estimator=booster,
    )

    # First level: random forest + SVC, stacked through `final_layer`.
    first_level = [
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(C=1, gamma=1e-6, kernel='rbf')),
    ]
    model = StackingClassifier(estimators=first_level, final_estimator=final_layer)
    model.fit(X_train, y_train)

    val_predictions = model.predict(X_val)
    print(accuracy_score(y_val, val_predictions))

    return test_results(df_test, alg="ensemble", model=model)
def Model_Ensemble(best_score_param_estimator, xtrain, xtest, ytrain, ytest):
    """Stack the tuned models and print test-set metrics.

    ``best_score_param_estimator`` is indexed 0..4; each entry is unpacked as
    a ``(best_score, best_param, estimator)`` triple. Entries 1..4 supply the
    base models and entry 0 supplies the final estimator; in every case the
    model is read from ``estimator.get_params()['model']``.
    """
    from sklearn.ensemble import StackingClassifier

    # Only the estimator of each triple is needed here.
    searches = [
        estimator
        for _, _, estimator in (best_score_param_estimator[k] for k in range(5))
    ]

    # Base estimators, listed from entry 4 down to entry 1.
    estimators = [
        ('estimator_%d' % rank, searches[rank - 1].get_params()['model'])
        for rank in (5, 4, 3, 2)
    ]
    meta_model = searches[0].get_params()['model']

    sc = StackingClassifier(estimators=estimators, final_estimator=meta_model)
    sc.fit(xtrain, ytrain)
    y_test_pred = sc.predict(xtest)

    print(
        "--------------Model Ensemble----------------------------------------------------------------"
    )
    print("Accuracy:", '{:1.4f}'.format(accuracy_score(ytest, y_test_pred)))
    print("")
    rounded_metrics = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("f1-score:", f1_score),
    )
    for label, metric in rounded_metrics:
        print(label, round(metric(ytest, y_test_pred), 4))
        print("")
    print(classification_report(ytest, y_test_pred))
    print("")
    print(confusion_matrix(ytest, y_test_pred))
def main():
    """Train a 5-model stacking ensemble and print its weighted F1 on the test set.

    Base models: random forest, RBF SVM, AdaBoost over depth-3 trees,
    liblinear logistic regression and Gaussian naive Bayes. The meta learner
    is a logistic regression fit on 10-fold out-of-fold predictions.
    """
    np.random.seed(0)
    train_X, train_y, test_X, test_y = load_data()

    # Level-0 (base) models.
    dtc = DecisionTreeClassifier(max_depth=3)
    level0 = [
        ('rf', RandomForestClassifier(n_estimators=150, max_depth=5)),
        ('svm', SVC(C=1, kernel='rbf')),
        ('ADA', AdaBoostClassifier(n_estimators=100, base_estimator=dtc,
                                   learning_rate=0.1)),
        ('lr', LogisticRegression(solver='liblinear')),
        ('bayes', GaussianNB()),
    ]
    # Level-1 (meta) learner.
    level1 = LogisticRegression()

    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)
    model.fit(train_X, train_y)

    y_pred = model.predict(test_X)
    # f1_score expects (y_true, y_pred). With average='weighted' the class
    # weights come from the first argument, so the order matters.
    F1_score = metrics.f1_score(test_y, y_pred, average='weighted')
    print("ANS 3.1 - F1_score of model with stacking different models is: " +
          str(F1_score))
def model_stacking_clf(X_train, y_train, X_test, y_test):
    """Fit a PassiveAggressive + RandomForest stack and score it on the test set.

    @param: X_train - a numpy matrix containing features for training data
            (e.g. TF-IDF matrix)
    @param: y_train - a numpy array containing labels for each training sample
    @param: X_test - a numpy matrix containing features for test data
            (e.g. TF-IDF matrix)
    @param: y_test - a numpy array containing labels for each test sample
    @return: tuple ``(accuracy, weighted_f1)`` computed on the test set
    """
    estimators = [
        # Labels now match the models they name (they were 'rf' / 'svr').
        ('pa', PassiveAggressiveClassifier(n_jobs=-1, C=0.001,
                                           loss='squared_hinge',
                                           max_iter=1000, tol=1e-06)),
        ('rf', make_pipeline(
            # 'sqrt' is what max_features='auto' meant for classifiers;
            # 'auto' is no longer accepted by recent scikit-learn releases.
            RandomForestClassifier(n_estimators=1000, random_state=42,
                                   criterion='gini', bootstrap=True,
                                   max_features='sqrt'))),
    ]
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(penalty='l1', solver='saga',
                                           max_iter=500))
    clf.fit(X_train, y_train)

    y_predicted = clf.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_predicted)
    rf_f1 = f1_score(y_test, y_predicted, average="weighted")
    return rf_accuracy, rf_f1
def stacking_classifier(best_logistic_regression, best_knn_classifier,
                        best_gaussian_nb, best_decision_tree_classifier,
                        best_random_forest_classifier, x_train, x_test,
                        y_train, y_test):
    """Stack the tuned KNN, decision-tree and naive-Bayes models.

    The tuned logistic regression is the final estimator. Train and test
    scores are printed, and the test predictions are handed to
    ``confusion_matrix_graph`` and ``roc_curve_graph``.

    ``best_random_forest_classifier`` is accepted for signature
    compatibility but is not part of the stack.
    """
    from sklearn.ensemble import StackingClassifier

    estimators = [
        ('knn_classifier_cv', best_knn_classifier),
        ('dct_cv', best_decision_tree_classifier),
        ('gaussian_nb_cv', best_gaussian_nb),
    ]
    # sklearn's StackingClassifier has no `shuffle` / `use_probas` keywords,
    # so passing them raises TypeError. `stack_method='predict_proba'` is the
    # way to stack class probabilities; the default CV already does not shuffle.
    final_stacking_classifier = StackingClassifier(
        estimators=estimators,
        stack_method='predict_proba',
        final_estimator=best_logistic_regression)
    final_stacking_classifier.fit(x_train, y_train)

    print("Stacking Classifier Training Score {}".format(
        final_stacking_classifier.score(x_train, y_train)))
    print("Stacking Classifier Testing Score {}\n".format(
        final_stacking_classifier.score(x_test, y_test)))

    y_predict = final_stacking_classifier.predict(x_test)
    classification_model = 'Stacking Classifier'
    confusion_matrix_graph(y_test, y_predict, classification_model)
    roc_curve_graph(y_test, y_predict, classification_model)
def stacking(self):
    """Fit a random-forest + scaled-LinearSVC stack and print test metrics.

    Trains on ``self.X_train`` / ``self.y_train`` and prints, in order, the
    confusion matrix, the accuracy and the classification report computed on
    ``self.X_test`` / ``self.y_test``.
    """
    from sklearn.ensemble import StackingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import make_pipeline

    forest = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,
                                    n_jobs=-1)
    # The linear SVC is wrapped in a pipeline so it sees standardized inputs.
    scaled_svc = make_pipeline(StandardScaler(), LinearSVC(random_state=42))

    stack = StackingClassifier(
        estimators=[('rf', forest), ('svr', scaled_svc)],
        final_estimator=LogisticRegression(),
    )
    stack.fit(self.X_train, self.y_train)
    y_pred = stack.predict(self.X_test)

    print(confusion_matrix(self.y_test, y_pred))
    acc = accuracy_score(self.y_test, y_pred)
    report = classification_report(self.y_test, y_pred)
    print(acc)
    print(report)
def stacking_classifier(train_x, train_y, test_x):
    """Fit a LightGBM / ExtraTrees / RandomForest / LogReg stack and predict.

    Args:
        train_x: training features.
        train_y: training labels.
        test_x: features to predict on.

    Returns:
        The class predictions of the fitted stack for ``test_x``.
    """
    # Only import what the active estimators need. The previous unused
    # `rgf` import made that package a hard dependency for no reason.
    import lightgbm as lgb
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import StackingClassifier
    from sklearn.linear_model import LogisticRegression

    # 'application' is an alias of LightGBM's `objective`, and 'classifier'
    # is not one of its objective names. Leaving the objective unset lets
    # LGBMClassifier choose binary / multiclass from the labels.
    lgb_params = {'metric': 'auc'}
    et_params = {
        'n_estimators': 20,
        'max_features': 0.5,
        'max_depth': 18,
        'min_samples_leaf': 4,
        'n_jobs': -1,
    }
    rf_params = {
        'n_estimators': 20,
        'max_features': 0.2,
        'max_depth': 25,
        'min_samples_leaf': 4,
        'n_jobs': -1,
    }

    estimators = [
        ('lgb', lgb.LGBMClassifier(**lgb_params)),
        ('et', ExtraTreesClassifier(**et_params)),
        ('rf', RandomForestClassifier(**rf_params)),
        ('lr', LogisticRegression()),
    ]
    model_stack = StackingClassifier(estimators=estimators,
                                     final_estimator=LogisticRegression(),
                                     verbose=1)
    model_stack.fit(train_x, train_y)
    pred = model_stack.predict(test_x)
    return pred
def stacking(X_train, y_train, X_test, y_test, model1, model2):
    """Stack ``model1`` and ``model2`` and report test-set classification metrics.

    No final estimator is passed, so StackingClassifier uses its default.
    The status messages are printed in Russian. The fitted stacking
    classifier is returned.
    """
    print('Обучение алгоритма Stacking...\n')

    base_models = [('kmeans', model1), ('svc', model2)]
    stack = StackingClassifier(estimators=base_models)
    stack.fit(X_train, y_train)

    report = classification_report(y_test, stack.predict(X_test))
    print('Отчет классификации для метода Stacking: \n', report)
    return stack
def run():
    """Train an XGBoost + GBDT stack on the Australian credit data and plot its ROC.

    Reads ``./data/australian.csv`` (whitespace separated, no header; column
    14 is the target), holds out 40% stratified for testing, prints the
    classification report, ROC AUC and accuracy, then shows the ROC curve.
    """
    import pandas as pd
    import matplotlib.pyplot as pyplot
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier
    from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import StackingClassifier

    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    df = pd.read_table("./data/australian.csv", sep=r'\s+', header=None)
    y = df[14]
    X = df.drop(columns=14)

    # Split features and target into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, stratify=y, test_size=0.4)

    estimators = [('xgb', XGBClassifier()),
                  ('gbdt', GradientBoostingClassifier(random_state=1))]
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=LogisticRegression())
    clf.fit(X_train, y_train)

    y_pred_test = clf.predict(X_test)
    print(classification_report(y_test, y_pred_test))

    # Keep probabilities for the positive outcome only.
    clf_probs = clf.predict_proba(X_test)[:, 1]
    clf_auc = roc_auc_score(y_test, clf_probs)
    print('ensemble: ROC AUC=%.3f' % (clf_auc))
    print("accuracy_score is %.3f" %
          (accuracy_score(y_test, y_pred_test, normalize=True)))

    # ROC curve for the ensemble.
    clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_probs)
    pyplot.plot(clf_fpr, clf_tpr, marker='.', label='Ensemble')
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    pyplot.show()
def model_stack(X_train, y_train, X_test, y_test):
    """Fit an XGBoost + LightGBM stack and score it on the test set.

    Returns:
        tuple ``(accuracy, weighted_f1)`` computed on ``X_test`` / ``y_test``.
    """
    estimators = [('xgb', XGBClassifier()), ('lgb', lgb.LGBMClassifier())]
    model = StackingClassifier(estimators=estimators,
                               final_estimator=LogisticRegression())
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # The two metrics were previously assigned to each other's names, so the
    # returned tuple was (f1, accuracy) despite the variable names.
    stack_accuracy = accuracy_score(y_test, y_pred)
    stack_f1 = f1_score(y_test, y_pred, average='weighted')
    return stack_accuracy, stack_f1
def test_stacking_classifier_drop_estimator():
    """A 'drop' placeholder must behave as if the estimator were never listed.

    Fits one stack with only ``svc`` and one with ``('lr', 'drop')`` plus the
    same ``svc``, then checks that predict, predict_proba and transform
    agree between the two.
    """
    # Scale up front instead of using a pipeline, to avoid convergence
    # warnings while keeping plain arrays for the comparisons below.
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )

    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    with_placeholder = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))]
    without_placeholder = [('svc', LinearSVC(random_state=0))]

    clf = StackingClassifier(estimators=without_placeholder,
                             final_estimator=rf, cv=5)
    clf_drop = StackingClassifier(estimators=with_placeholder,
                                  final_estimator=rf, cv=5)
    clf.fit(X_train, y_train)
    clf_drop.fit(X_train, y_train)

    for method_name in ('predict', 'predict_proba', 'transform'):
        expected = getattr(clf, method_name)(X_test)
        actual = getattr(clf_drop, method_name)(X_test)
        assert_allclose(expected, actual)
def stacking_predictor(row):
    """Train the stacking model on the bundled data and classify one patient.

    The base layer is a random forest, a logistic regression, a KNN and an
    RBF SVM; the final estimator is a logistic regression. The model is
    trained on 80% of ``data/data.csv`` using 7 selected features.

    Args:
        row: the 7 feature values of a single patient, in the order of the
            feature list below.

    Returns:
        The string ``"patient <label>"`` where ``<label>`` is the predicted
        class formatted with ``%d``.

    NOTE(review): the model is re-read and re-fit on every call.
    """
    our_trained_data = clean_data(pd.read_csv("data/data.csv"))

    x = our_trained_data[[
        'radius_mean', 'texture_mean', 'area_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'smoothness_mean'
    ]]
    y = our_trained_data[['diagnosis']]
    # Only the training part of the split is used here.
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.2,
                                              random_state=42)

    X = x_train.values.tolist()
    # y_train is a one-column frame: flatten it to a plain list of labels.
    y = y_train.values.ravel().tolist()

    estimators = [('random_forest',
                   RandomForestClassifier(n_estimators=5, random_state=42)),
                  ('logistic_regr',
                   LogisticRegression(solver="lbfgs", max_iter=1460)),
                  ('knn', KNeighborsClassifier(n_neighbors=5)),
                  ('svm_rbf', SVC(kernel='rbf', gamma=4, C=10000))]
    Stacking_classifier = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression(), cv=5)

    # Fit the stacking model with our own data and the selected 7 features.
    Stacking_classifier.fit(X, y)

    # predict() returns an array with one entry per input row. Take the
    # scalar explicitly instead of relying on the implicit array-to-int
    # conversion inside '%d'.
    single_predicted_result = Stacking_classifier.predict([row])[0]
    return ('%s %d' % ("patient", single_predicted_result))
def predict(self):
    """Fit the stack on ``self.train_data`` and write predictions for ``self.test_data``.

    The last column of ``self.train_data`` is the target; all other columns
    are features. Features are standardized with statistics from the training
    data, and the predictions are written to ``submission.csv`` under a
    ``quality`` header.
    """
    features = self.train_data.iloc[:, :-1]
    target = self.train_data.iloc[:, -1]

    scaler = StandardScaler().fit(features)
    scaled_features = scaler.transform(features)

    base_models = [
        ('RF', RandomForestClassifier(n_estimators=700)),
        ('LR', LogisticRegression(max_iter=6000)),
    ]
    meta_model = SVC(C=1.2)
    model = StackingClassifier(estimators=base_models,
                               final_estimator=meta_model, cv=4)
    model.fit(scaled_features, target)

    # Reuse the training scaler for the test features.
    scaled_test = scaler.transform(self.test_data)
    submission = pd.DataFrame(model.predict(scaled_test))
    submission.to_csv('submission.csv', header=['quality'], index=False)
def main():
    """Train a stacking classifier with subject-grouped CV and write a submission.

    Reads the feature names, training / test features, training labels and
    training subject ids from ``args.data_path``. It fits RF + SVC + KNN
    stacked through a logistic regression, using ``GroupKFold`` on the
    subject ids, and writes the 1-based predictions to ``baseline.txt``.
    """
    args = parse_arguments()

    # params
    DATA_DIR = args.data_path
    num_folds = args.fold
    seed = 1234

    # setup data
    with open(DATA_DIR + '/features.txt') as f:
        features_name = [x.strip() for x in f]
    # Replace every non-alphanumeric character so the names are plain identifiers.
    features_name = [
        "".join(c if c.isalnum() else "_" for c in str(x))
        for x in features_name
    ]

    X_train = pd.read_csv(DATA_DIR + '/X_train.csv', names=features_name)
    X_test = pd.read_csv(DATA_DIR + '/X_test.csv', names=features_name)
    y_train = pd.read_csv(DATA_DIR + '/y_train.csv', names=['activity_label'])
    subject_train = pd.read_csv(DATA_DIR + '/subject_train.csv',
                                names=['subject_id'])

    # Make the labels zero-based.
    y_train['activity_label'] = y_train['activity_label'] - 1
    # Work with 1-D label / group vectors. Passing the single-column
    # DataFrames makes scikit-learn warn about column-vector targets and
    # hands GroupKFold a 2-D group array.
    labels = y_train['activity_label']
    groups = subject_train['subject_id']

    # set up models
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          random_state=seed)),
                  ('svr', SVC(probability=True, random_state=seed)),
                  ('knn', KNeighborsClassifier())]
    final_estimator = LogisticRegression(random_state=seed)

    kf = GroupKFold(n_splits=num_folds)
    # Materialise the splits so the same folds can be iterated more than once.
    cv_idx = list(kf.split(X_train, labels, groups=groups))
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=final_estimator,
                             cv=cv_idx)

    # train
    clf.fit(X_train, labels)

    # make submission (shift labels back to 1-based)
    test_preds = clf.predict(X_test)
    submit = test_preds + 1
    np.savetxt('baseline.txt', submit)
def Stacking(x, y, time_split_sample=time_split_sample, split=0.2):
    """Compare grid-searched final estimators on top of one fixed base layer.

    For each candidate meta-learner in ``lv2`` a ``GridSearchCV`` over its
    parameter grid is used as the final estimator of a StackingClassifier.
    The stack is fit on the training split, scored on the test split with
    ``ScoreFunc`` and passed to ``PredictedReturn``, whose plot is shown.

    ``split`` is accepted but not used by this function.

    Returns:
        ``(score_rst, aum_rst)``: the per-candidate results of ``ScoreFunc``
        and ``PredictedReturn``, concatenated column-wise.
    """
    X_train, X_test, y_train, y_test = Train_Test_Split(
        x, y, time=time_split_sample)

    # Fixed base layer shared by every candidate meta-learner.
    estimators = [
        ('DecisionTree',
         tree.DecisionTreeClassifier(class_weight='balanced', max_depth=3)),
        ('SVC', SVC()),
        ('NB', GaussianNB()),
    ]

    # Candidate final estimators: (label, estimator, parameter grid).
    lv2 = [
        ('DecisionTree',
         tree.DecisionTreeClassifier(class_weight='balanced', max_depth=5),
         [{'criterion': ['gini'],
           'splitter': ['best', 'random'],
           'max_depth': [2, 6, 8, 12],
           'min_samples_split': [3, 5]}]),
        ('NB', GaussianNB(), [{'var_smoothing': [1e-9, 1e-11]}]),
        ('Logist', LogisticRegression(multi_class='multinomial'),
         [{'penalty': ['l1', 'l2'], 'C': [0.001, .009, 0.01]}]),
    ]

    score_columns = []
    aum_columns = []
    for label, meta_estimator, param_grid in lv2:
        searched_meta = model_selection.GridSearchCV(
            meta_estimator,
            param_grid,
            scoring=scorer['f0.5_macro'],
            cv=model_selection.StratifiedKFold(
                n_splits=10, shuffle=True, random_state=2020))
        stack = StackingClassifier(estimators=estimators,
                                   final_estimator=searched_meta)
        stack.fit(X_train, y_train)

        # Keep the test index so downstream helpers can align on it.
        y_pred = pd.Series(stack.predict(X_test), index=X_test.index)

        scores = ScoreFunc(y_test, y_pred)
        scores.name = label
        print(scores)

        aum = PredictedReturn(y_pred, method=plot_method, title=label)
        plt.show()

        aum_columns.append(aum)
        score_columns.append(scores)

    score_rst = pd.concat(score_columns, axis=1)
    aum_rst = pd.concat(aum_columns, axis=1)
    aum_rst.columns = [label for label, _, _ in lv2]
    aum_rst.plot(title='Stacking Final_estimator GridSearch')
    return score_rst, aum_rst
def stackingClassifier(Feature_train, y_train, Feature_test):
    """Fit a two-layer stack and return its predictions for ``Feature_test``.

    Layer one (decision tree + KNN) feeds a final estimator that is itself a
    stack (decision tree + SVC combined by a logistic regression).
    """
    # Inner stack, used as the final estimator of the outer one.
    layer_two = StackingClassifier(
        estimators=[
            ('dt_2', DecisionTreeClassifier(max_depth=6, max_features=15)),
            ('rf_2', svm.SVC()),
        ],
        final_estimator=LogisticRegression(),
    )

    outer_stack = StackingClassifier(
        estimators=[
            ('rf_1', DecisionTreeClassifier(max_depth=6, max_features=15)),
            ('knn_1', KNeighborsClassifier(n_neighbors=35)),
        ],
        final_estimator=layer_two,
    )
    return outer_stack.fit(Feature_train, y_train).predict(Feature_test)
class stacked_model(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Thin estimator wrapper around scikit-learn's StackingClassifier.

    Parameters are stored untouched in ``__init__``; the underlying
    StackingClassifier is built and fit in ``fit`` and kept in
    ``get_stacking_``.
    """

    def __init__(self, base_models=None, meta_model=None, n_folds=None):
        # base_models: sequence of (name, estimator) pairs for the base layer.
        self.base_models = base_models
        # meta_model: final estimator combining the base predictions.
        self.meta_model = meta_model
        # n_folds: forwarded as StackingClassifier's `cv` argument.
        self.n_folds = n_folds

    def fit(self, X, y):
        """Build the StackingClassifier from the stored parameters and fit it."""
        # The base models are passed straight through. The previous loop
        # copied them into a list that was never used.
        self.get_stacking_ = StackingClassifier(estimators=self.base_models,
                                                final_estimator=self.meta_model,
                                                cv=self.n_folds)
        self.get_stacking_.fit(X, y)
        return self

    def predict(self, X):
        """Return the fitted stack's predictions for ``X``."""
        return self.get_stacking_.predict(X)
def Model_1(train, test):
    '''
    Trains the model and saves the predictions in CSV files.

    train : Training set (columns 'Sequence' and 'label' are read)
    test : Test set (columns 'Sequence' and 'ID' are read)

    Writes "Submission_1.csv" (ID + probability of the second class) and
    "Predictions_1.csv" (ID + predicted label).
    '''
    # Preprocessing: one-hot encode every position of each sequence.
    # The encoder is fit on the training sequences only and reused for the
    # test set, so both matrices share the same columns. Fitting a second
    # encoder on the test data can yield a different column layout.
    # handle_unknown='ignore' encodes symbols unseen in training as all-zero.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_train = encoder.fit_transform([list(s) for s in train['Sequence']])
    X_test = encoder.transform([list(s) for s in test['Sequence']])
    Y_train = train['label']

    # Balance the classes by undersampling, then shuffle.
    X_train, Y_train = RandomUnderSampler(random_state=100).fit_resample(
        X_train, Y_train)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=100)

    # Training
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          max_depth=45,
                                          min_samples_leaf=7,
                                          random_state=100)),
                  ('mlp', MLPClassifier(max_iter=200, random_state=100)),
                  ('knn', KNeighborsClassifier(n_neighbors=4))]
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(random_state=100),
        n_jobs=-1,
        verbose=1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_pred = clf.predict(X_test)
    Y_prob = [x[1] for x in clf.predict_proba(X_test)]

    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_1.csv", index=False)
    # Reuse the same frame for the hard-label file.
    result["Label"] = Y_pred
    result.to_csv("Predictions_1.csv", index=False)
def stack_ensemble():
    '''
    Fit and evaluate a StackingClassifier on WOE-encoded, min-max-scaled data.

    Reads the module-level X_train, y_train, X_test and y_test.

    Parameters: N/A
    Returns: N/A
    Outputs: accuracy, F1, confusion_matrix, classification_report (printed)
    '''
    # Encode with statistics from the training data, then reuse on test.
    woe = WOEEncoder()
    train_encoded = woe.fit_transform(X_train, y_train)
    test_encoded = woe.transform(X_test)

    min_max = MinMaxScaler()
    train_scaled = pd.DataFrame(min_max.fit_transform(train_encoded, y_train))
    test_scaled = pd.DataFrame(min_max.transform(test_encoded))

    base_learners = [
        ('linSVC', LinearSVC()),
        ('bayes', GaussianNB()),
        ('knn', KNeighborsClassifier()),
        ('rfc', RandomForestClassifier()),
    ]
    # Logistic regression combines the base learners' outputs.
    stk_model = StackingClassifier(estimators=base_learners,
                                   final_estimator=LogisticRegression(),
                                   cv=3)
    stk_model.fit(train_scaled, y_train)
    stk_pred = stk_model.predict(test_scaled)

    print('Stack Accuracy :', accuracy_score(y_test, stk_pred))
    print('stack F1 :', f1_score(y_test, stk_pred))
    print(confusion_matrix(y_test, stk_pred))
    print(classification_report(y_test, stk_pred))
def main():
    """Train a default-meta-learner stack of five models and print its weighted F1.

    Base models: random forest, SVC, Gaussian naive Bayes, KNN and a decision
    tree. No final estimator is passed, so StackingClassifier uses its
    default (LogisticRegression).
    """
    np.random.seed(0)
    train_X, train_y, test_X, test_y = load_data()

    base_models = [('rfc', RandomForestClassifier()), ('svm', SVC()),
                   ('gnb', GaussianNB()), ('knc', KNeighborsClassifier()),
                   ('dtc', DecisionTreeClassifier())]
    sc = StackingClassifier(estimators=base_models)

    # fit the model on the training data
    sc.fit(train_X, train_y)

    # predict
    y_pred = sc.predict(test_X)

    # f1_score expects (y_true, y_pred). With average='weighted' the class
    # weights come from the first argument, so the order matters.
    print(f"f1 score = {f1_score(test_y, y_pred, average='weighted')}")
# TF-IDF features: every document in `pos_filtered_data` is a sequence of
# token lists; tokens are joined into sentences and sentences into one string.
vectorizer = TfidfVectorizer()

# Debug output: the sentence strings of the first document.
print([[" ".join(tokens) for tokens in doc] for doc in pos_filtered_data][0])

data = vectorizer.fit_transform(
    " ".join(" ".join(tokens) for tokens in doc) for doc in pos_filtered_data
)

X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.33, random_state=42
)

clf.fit(X_train, y_train)
test_predictions = clf.predict(X_test)
print(classification_report(y_test, test_predictions))

# #### doc2vec with KNN
# print(pos_filtered_data[0])
# glued_data = []
# for item in pos_filtered_data:
#     new_item = []
#     for sent in item:
#         new_item.append(" ".join(sent))
#     glued_data.append(". ".join(new_item))
# print(glued_data[0])
# documents = [TaggedDocument(doc[1], [i]) for i, doc in enumerate(glued_data)]
# ---------------------------------------------------------------------------
# HistGradientBoostingClassifier
# Each section fits one model on the training split, predicts the test split
# and stores the result of `evaluate_preds` for that model.
clf_hgbc = HistGradientBoostingClassifier().fit(x_train, y_train)
hgbc_pred = clf_hgbc.predict(x_test)
hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred)

# ---------------------------------------------------------------------------
# LogisticRegression
clf_lr = LogisticRegression().fit(x_train, y_train)
clf_pred = clf_lr.predict(x_test)
lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred)

# ---------------------------------------------------------------------------
# StackingClassifier over the shared `estimators` list
clf_sc = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
).fit(x_train, y_train)
clf_pred = clf_sc.predict(x_test)
sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred)

# ---------------------------------------------------------------------------
# VotingClassifier (soft voting over all individual models)
clf_vc = VotingClassifier(
    estimators=[
        ("knn", clf_knn),
        ('adab', clf_adab),
        ('rfc', clf_rfc),
        ('gnc', clf_gbc),
        ("bc", clf_bc),
        ("etc", clf_etc),
        ("hgbc", clf_hgbc),
        ('xgb', clf_xgb),
        ("lr", clf_lr),
    ],
    voting='soft',
).fit(x_train, y_train)
clf_pred = clf_vc.predict(x_test)
vc_matrices = evaluate_preds(clf_vc, x_test, y_test, clf_pred)
# ---------------------------------------------------------------------------
}) result.to_csv('result_xgb.csv', index=False) from sklearn.ensemble import VotingClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVC alg2 = SVC(probability=True, random_state=29, C=11, gamma=0.05) rf_clf = RandomForestClassifier() mv_clf = VotingClassifier(estimators=[('lr', clf), ('xgb', model), ('svc', alg2), ('rf', rf_clf)], voting='hard') mv_clf.fit(X, y) predictions = mv_clf.predict(test_feature) result = pd.DataFrame({ 'PassengerId': test['PassengerId'], 'Survived': predictions.astype(np.int32) }) result.to_csv("result_voting.csv", index=False) from sklearn.ensemble import StackingClassifier stacking_clf = StackingClassifier(estimators=[('xgb', model), ('svc', alg2), ('rf', rf_clf)], final_estimator=clf) stacking_clf.fit(X, y) predictions = stacking_clf.predict(test_feature) result = pd.DataFrame({ 'PassengerId': test['PassengerId'], 'Survived': predictions.astype(np.int32) }) result.to_csv('result_stacking.csv', index=False)
class ModelFactory(object):
    """Builds, trains, persists and queries a stacking model for heart-failure data.

    Typical order of use: construct, ``genDataSet()``, ``getModel()``, then
    the ``get*Res`` / ``predict`` helpers.
    """

    def __init__(self):
        """Load the CSV, select ``selectedFeatures`` and standardize them."""
        self.model = None
        self.dataset = pd.read_csv(
            "./heart_failure_clinical_records_dataset.csv")
        self.X_ori = self.dataset.drop(
            columns=['DEATH_EVENT'])[selectedFeatures]
        self.y = self.dataset['DEATH_EVENT']
        col_names = list(self.X_ori.columns)

        # NOTE(review): the scaler is fit on the full dataset before any
        # train/val/test split, so held-out rows influence the scaling.
        self.stdScaler = preprocessing.StandardScaler()
        self.stdScaler.fit(self.X_ori)
        self.X = self.stdScaler.transform(self.X_ori)
        self.X = pd.DataFrame(self.X, columns=col_names)

        # Filled in by genDataSet().
        self.X_val = None
        self.y_val = None
        self.X_train = None
        self.y_train = None
        self.X_test = None
        # Was `self.t_test`, a typo: genDataSet() and getModelTestRes() use y_test.
        self.y_test = None

    def getModel(self):
        """Create the base models, fit the stack and dump it to ./temp/model.joblib."""
        self.__genDecisionTree()
        self.__genBoostTree()
        self.__genLR()
        self.__genSVM()
        self.__genKNN()
        self.__genRF()
        # Only LR and KNN are stacked; the SVM is the final estimator.
        self.model = StackingClassifier(estimators=[('LR', self.LR),
                                                    ('KNN', self.KNN)],
                                        final_estimator=self.SVM)
        self.model.fit(self.X_train, self.y_train)

        path = "./temp/model.joblib"
        # Context manager: the file is closed even if dump() raises.
        with open(path, "wb") as modelDump:
            dump(self.model, modelDump)

    def printValidationSet(self):
        """Print the validation features and labels."""
        print(self.X_val)
        print(self.y_val)

    def genDataSet(self, train=0.8, test=0.1, val=0.1):
        """Split the scaled data into train / validation / test parts.

        ``train``, ``test`` and ``val`` are fractions of the whole dataset.
        """
        # First carve off everything that is not training data ...
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=1 - train, random_state=2)
        # ... then divide that remainder between validation and test.
        self.X_val, self.X_test, self.y_val, self.y_test = train_test_split(
            self.X_test,
            self.y_test,
            test_size=test / (test + val),
            random_state=2)

    def __genBoostTree(self):
        rng = np.random.RandomState(42)
        self.BT = AdaBoostClassifier(DecisionTreeClassifier(max_depth=None),
                                     n_estimators=10,
                                     random_state=rng)

    def __genDecisionTree(self):
        self.DT = DecisionTreeClassifier()

    def __genKNN(self):
        self.KNN = KNeighborsClassifier(n_neighbors=18)

    def __genRF(self):
        self.RF = RandomForestClassifier(n_estimators=10)

    def __genGNB(self):
        self.GNB = GaussianNB()

    def __genLR(self):
        self.LR = LogisticRegression(solver='liblinear',
                                     max_iter=1000,
                                     penalty='l1',
                                     C=0.01)

    def __genSVM(self):
        self.SVM = SVC(kernel='linear', C=1e2, gamma=1e-04, probability=True)

    def getModelTestRes(self):
        """Return a string describing the accuracy on the test split."""
        y_pred = self.model.predict(self.X_test)
        # NOTE(review): `acc` is a tuple, so its repr (parentheses and
        # quotes included) ends up in the returned text. Kept as-is because
        # callers may already display or parse this exact format.
        acc = ("Accuracy:", metrics.accuracy_score(self.y_test, y_pred))
        return ('Test score: {}'.format(acc))

    def getModetValRes(self):
        """Return a string describing the accuracy on the validation split."""
        y_pred = self.model.predict(self.X_val)
        # Same tuple-in-string format as getModelTestRes().
        acc = ("Accuracy:", metrics.accuracy_score(self.y_val, y_pred))
        return ('Validation score: {}'.format(acc))

    def predict(self, feature):
        """Predict with the fitted model on the selected columns of ``feature``."""
        # NOTE(review): the model is trained on standardized features
        # (self.X), but `feature` is passed through without scaling here.
        # Confirm that callers scale it with self.stdScaler beforehand.
        selected = feature[selectedFeatures]
        return self.model.predict(selected)
# %% Build pipeline
# Fit the scaler and the label encoder on the training split only, then apply
# both to the training and development splits.
scaler = StandardScaler().fit(X_train)
encoder = LabelEncoder().fit(y_train)

X_train = scaler.transform(X_train)
y_train = encoder.transform(y_train)
X_dev = scaler.transform(X_dev)
y_dev = encoder.transform(y_dev)

# %%
# Two strongly regularized linear models, stacked through gradient boosting.
estimators = [
    ('svm', LinearSVC(C=0.0001)),
    ('log', LogisticRegression(penalty='l2', C=0.001, max_iter=1000)),
]
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=GradientBoostingClassifier(),
)
clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_dev = clf.predict(X_dev)

# Accuracy, and macro-averaged recall (reported as "uar").
train_acc = clf.score(X_train, y_train)
dev_acc = clf.score(X_dev, y_dev)
train_uar = recall_score(y_train, pred_train, average='macro')
dev_uar = recall_score(y_dev, pred_dev, average='macro')

print(f"train_acc = {train_acc:.2f}, dev_acc = {dev_acc:.2f}")
print(f"train_uar = {train_uar:.2f}, dev_uar = {dev_uar:.2f}")
"""
train_acc = 0.83, dev_acc = 0.47
train_uar = 0.83, dev_uar = 0.47
"""
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Data locations come from the environment.
TRAIN_DATA_PATH = os.getenv("TRAIN_DATA_PATH")
TEST_DATA_PATH = os.getenv("TEST_DATA_PATH")

# The last column of the training file is the target; the rest are features.
train_data = pd.read_csv(TRAIN_DATA_PATH)
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

sc = StandardScaler()
X_tr = sc.fit_transform(X_train)

# Base layer (extra trees + logistic regression), combined by LDA.
level_0 = [
    ('RF', ExtraTreesClassifier(n_estimators=1000)),
    ('LR', LogisticRegression(max_iter=7000)),
]
level_1 = LinearDiscriminantAnalysis()
model = StackingClassifier(estimators=level_0, final_estimator=level_1, cv=4)
model.fit(X_tr, y_train)

# Scale the test data with the training statistics and write the predictions.
test_data = pd.read_csv(TEST_DATA_PATH)
X_te = sc.transform(test_data)
submission = pd.DataFrame(model.predict(X_te))
submission.to_csv('submission.csv', header=['class'], index=False)
def run(dataset, config):
    """Fit a scikit-learn stacking ensemble on ``dataset`` and report its predictions.

    Framework parameters whose key starts with ``_`` are treated as private:
    ``_n_jobs`` overrides the core count and ``_<name>_params`` supplies
    extra keyword arguments for the estimator called ``<name>``
    (rf, gbm, linear, svc, final). All remaining framework parameters are
    forwarded to the stacking estimator itself.

    Returns whatever ``result(...)`` returns for the predictions made on the
    encoded test split.
    """
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    # Useful to disable multicore, regardless of the dataset config.
    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = StackingClassifier(
            estimators=[
                ('rf',
                 RandomForestClassifier(n_jobs=n_jobs,
                                        random_state=config.seed,
                                        **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingClassifier(random_state=config.seed,
                                            **estimators_params['gbm'])),
                ('linear',
                 SGDClassifier(n_jobs=n_jobs,
                               random_state=config.seed,
                               **estimators_params['linear'])),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf',
                 RandomForestRegressor(n_jobs=n_jobs,
                                       random_state=config.seed,
                                       **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingRegressor(random_state=config.seed,
                                           **estimators_params['gbm'])),
                ('linear',
                 SGDRegressor(random_state=config.seed,
                              **estimators_params['linear'])),
                ('svc',
                 LinearSVR(random_state=config.seed,
                           **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            # LinearRegression takes no `random_state`; passing one raised
            # TypeError and made the whole regression branch unusable.
            final_estimator=LinearRegression(n_jobs=n_jobs,
                                             **estimators_params['final']),
            n_jobs=n_jobs,
            **training_params)

    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  # base estimators + the final estimator
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
class Classifier(object):
    """Stacking classifier for a model code, persisted through the database session.

    On construction the training data is loaded from ``data_view`` (filtered
    by the model code's stored where-clauses, minus its configured drop
    columns). If a ``Model`` row already exists for the model code, the
    pickled estimator stored on it is loaded; otherwise a new stacking
    ensemble is trained and saved.
    """

    def __init__(self,
                 in_model_code,
                 db,
                 y_col="party",
                 label_col="county_fips",
                 where_clauses=None,
                 data_view="master_data",
                 year_col="year",
                 year_test=2020):
        """Load the data for ``in_model_code`` and load or train its model.

        Args:
            in_model_code: model-code record; ``id`` and ``description`` are read.
            db: database session used for queries, ``add`` and ``commit``.
            y_col: name of the target column in the loaded data.
            label_col: accepted for interface compatibility; currently unused.
            where_clauses: accepted for interface compatibility; currently
                unused (filters come from ``ModelWhereClause`` rows).
            data_view: table or view the training data is selected from.
            year_col: accepted for interface compatibility; currently unused.
            year_test: accepted for interface compatibility; currently unused.
        """
        self.db = db
        self.mc = in_model_code
        # Remembered so predict() drops the same target column as training.
        self.y_col = y_col

        self.drop_cols = db.query(ModelDropCol).filter_by(
            model_code_id=self.mc.id).all()

        where = self.db.query(ModelWhereClause).filter_by(
            model_code=self.mc).all()
        if where:
            self.where = " where " + (" and ".join([wc.sql for wc in where]))
        else:
            self.where = ""

        self.engine_string = database_string
        self.query = f"select * from {data_view}{self.where}"
        self.df = pandas.read_sql_query(
            self.query,
            database_string).drop(columns=[dc.column for dc in self.drop_cols])
        self.y = self.df[y_col].to_numpy()
        self.x = self.df.drop(columns=y_col).to_numpy()

        self.model_obj = self.db.query(Model).filter_by(
            model_code=self.mc).first()
        if not self.model_obj:
            # No stored model yet: build, train and persist a new ensemble.
            rf = RandomForestClassifier(n_estimators=10, random_state=42)
            svr = make_pipeline(
                StandardScaler(),
                LinearSVC(random_state=42, dual=False, max_iter=1000))
            knn = KNeighborsClassifier(n_neighbors=3)
            nb = GaussianNB()
            classifiers = [("rf", rf), ("svr", svr), ("knn", knn), ("nb", nb)]
            self.model = StackingClassifier(
                estimators=classifiers, final_estimator=LogisticRegression())
            self.accuracy = None
            self.model_obj = Model(model_code=self.mc, accuracy=self.accuracy)
            self.db.add(self.model_obj)
            self.train()
            self.save()
        else:
            # SECURITY NOTE: pickle.loads executes arbitrary code embedded in
            # the blob; this is only safe while the models table is trusted.
            self.model = pickle.loads(self.model_obj.model_object)
            self.accuracy = self.model_obj.accuracy

    def train(self):
        """Fit the model on a 67/33 split and record the held-out accuracy."""
        x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                            self.y,
                                                            test_size=0.33)
        self.model.fit(x_train, y_train)
        self.accuracy = self.model.score(x_test, y_test)

    def save(self):
        """Pickle the model onto its ``Model`` row, store accuracy, and commit."""
        self.model_obj.model_object = pickle.dumps(self.model)
        self.model_obj.accuracy = self.accuracy
        self.db.commit()

    def predict(self, fips, in_file_path=None):
        """Predict the party outcome per county.

        Currently hard coded to predict for 2020, or the latest election in
        which all data is available, but not trained on. Feature rows are
        taken from the year four before the latest year in the model's data.

        Args:
            fips: ``"ALL"`` or ``"*"`` to predict every county, otherwise a
                county FIPS code (int or numeric string).
            in_file_path: optional path; when given, predictions are also
                written there as CSV.

        Returns:
            A list of dicts with keys ``party_prediction``, ``county_fips``,
            ``county_name``, ``state_fips`` and ``state_code``.

        Raises:
            IOError: if the model code id contains ``"2020"``.
            ValueError: if ``fips`` is neither a wildcard nor an integer code.
        """
        if "2020" in self.mc.id:
            raise IOError(
                "Must be a non-2020 model code to predict 2020 results.")
        logging.info(f"Selecting {self.mc.id} model ({self.mc.description})")

        if fips in ("ALL", "*"):
            and_clause = ""
            logging.info("Predicting all counties...")
        else:
            # The value is interpolated into SQL below, so only a plain
            # integer is allowed through.
            try:
                fips_code = int(fips)
            except (TypeError, ValueError) as err:
                raise ValueError(
                    "fips must be 'ALL', '*' or an integer county FIPS code, "
                    f"got {fips!r}") from err
            and_clause = f" and county_fips = {fips_code}"

        max_year = self.db.execute(
            f"select max(year) from ({self.query})").scalar()
        search_year = max_year - 4
        data = pandas.read_sql_query(
            f"select * from ({self.query}) where year = '{search_year}'{and_clause}",
            self.engine_string).drop(
                columns=[dc.column for dc in self.drop_cols])

        # Drop the same target column that training dropped so the feature
        # layout matches what the model was fitted on.
        features = data.drop(columns=[self.y_col])
        predictions = self.model.predict(features.to_numpy())
        # Read the county codes by column name rather than by a position
        # derived from the pre-drop column order.
        raw_fips_values = features["county_fips"].to_numpy()

        out_predictions = []
        fips_to_county = {}
        logging.info("Predictions:")
        for pred, raw_fips in zip(predictions, raw_fips_values):
            county_id = str(int(raw_fips)).zfill(6)
            if county_id in fips_to_county:
                county = fips_to_county[county_id]
            else:
                county = self.db.query(County).filter_by(id=county_id).first()
                fips_to_county[county_id] = county
            logging.info(f"{county.name} ({county.id}): {pred}")
            out_predictions.append({
                "party_prediction": pred,
                "county_fips": county_id,
                "county_name": county.name,
                "state_fips": county.state.id,
                "state_code": county.state.code
            })

        if in_file_path:
            logging.info(f"Writing output to {in_file_path}")
            out_cols = [
                "party_prediction", "county_fips", "county_name", "state_fips",
                "state_code"
            ]
            # newline="" lets the csv module control line endings; without it
            # Windows output gets a blank line after every row.
            with open(in_file_path, "w", newline="") as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=out_cols)
                writer.writeheader()
                writer.writerows(out_predictions)
        return out_predictions