def boost_elasticnet(X_train, X_test, y_train, y_test):
    """Grid-search LogitBoost settings and tabulate test confusion matrices.

    A ``LogitBoost`` model (``random_state=0``) is fitted on the training
    data for every combination of ensemble size (50, 100, 150) and learning
    rate (0.5, 0.75, 1), then used to predict the test data.

    Note: despite its name, this function does not fit an elastic-net model;
    the name is kept so existing callers keep working.

    Args:
        X_train: Training features, passed to ``LogitBoost.fit``.
        X_test: Test features, passed to ``LogitBoost.predict``.
        y_train: Training labels, passed to ``LogitBoost.fit``.
        y_test: True test labels. The confusion matrix is computed with
            ``labels=[0, 1]``, so the labels are expected to be 0 and 1.

    Returns:
        pandas.DataFrame: One row per (estimators, learning rate) pair, in
        grid order, with columns ``'Estimators'``, ``'Learning Rate'`` and
        ``'Confusion Matrix'``. The last column holds the list
        ``[tn, fp, fn, tp]`` for that pair.
    """
    columns = ['Estimators', 'Learning Rate', 'Confusion Matrix']
    estimators = [50, 100, 150]
    rates = [0.5, 0.75, 1]

    rows = []
    for n_est in estimators:
        for rate in rates:
            model = LogitBoost(n_estimators=n_est, learning_rate=rate,
                               random_state=0)
            model.fit(X_train, y_train)
            predicted_labels = model.predict(X_test)
            # ravel() flattens the 2x2 matrix row by row: tn, fp, fn, tp.
            tn, fp, fn, tp = confusion_matrix(
                y_test, predicted_labels, labels=[0, 1]).ravel()
            rows.append([n_est, rate, [tn, fp, fn, tp]])

    # Build the frame in one step: DataFrame.append copied the whole frame
    # for every row and no longer exists in pandas >= 2.0.
    return pd.DataFrame(rows, columns=columns)
def _toy_dataset_test(load_func, test_size=(1. / 3), random_state=0,
                      min_score_train=0.9, min_score_test=0.9):
    """Run a LogitBoost classification check on a scikit-learn toy dataset.

    The dataset returned by ``load_func`` is split into stratified, shuffled
    training/testing samples. For ``bootstrap`` both on and off, a LogitBoost
    model is fitted and checked for minimum train/test accuracy, output
    shapes of ``predict_proba``/``decision_function``, agreement between the
    last staged result and the regular method, and the shape and sign of
    ``contributions``.

    Parameters
    ----------
    load_func : callable
        Zero-argument loader returning an object with ``data``, ``target``
        and ``target_names`` attributes.
    test_size : float
        Forwarded to ``train_test_split``.
    random_state : int
        Seed used for both the split and the LogitBoost models.
    min_score_train, min_score_test : float
        Minimum accepted accuracy on the training / testing sample.

    Raises
    ------
    AssertionError
        If any of the checks fails.
    """
    dataset = load_func()
    class_names = dataset.target_names
    n_classes = len(class_names)
    features = dataset.data
    # Use the class names themselves (not the integer codes) as labels
    labels = class_names[dataset.target]

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, shuffle=True, stratify=labels,
        random_state=random_state)

    n_test = X_test.shape[0]
    # Binary tasks give a 1-D decision function; multiclass gives one column
    # per class
    if n_classes == 2:
        expected_decision_shape = (n_test, )
    else:
        expected_decision_shape = (n_test, n_classes)

    def final_stage(stages):
        """Materialize a staged generator and return its last element."""
        return np.asarray(list(stages))[-1]

    for bootstrap in (True, False):
        model = LogitBoost(bootstrap=bootstrap, random_state=random_state)
        model.fit(X_train, y_train)

        # Accuracy on both samples must reach the requested minimum
        score_train = model.score(X_train, y_train)
        score_test = model.score(X_test, y_test)
        assert score_train >= min_score_train, \
            ("Failed with bootstrap=%s: training score %.3f less than %.3f"
             % (bootstrap, score_train, min_score_train))
        assert score_test >= min_score_test, \
            ("Failed with bootstrap=%s: testing score %.3f less than %.3f"
             % (bootstrap, score_test, min_score_test))

        # Output shapes of the probability and decision function methods
        proba = model.predict_proba(X_test)
        decision = model.decision_function(X_test)
        assert proba.shape == (n_test, n_classes)
        assert decision.shape == expected_decision_shape

        # The last staged result must agree with the regular method
        last_predict = final_stage(model.staged_predict(X_test))
        last_proba = final_stage(model.staged_predict_proba(X_test))
        last_decision = final_stage(model.staged_decision_function(X_test))
        last_score = final_stage(model.staged_score(X_test, y_test))

        np.testing.assert_equal(last_predict, model.predict(X_test))
        np.testing.assert_almost_equal(last_proba,
                                       model.predict_proba(X_test))
        np.testing.assert_almost_equal(last_decision,
                                       model.decision_function(X_test))
        np.testing.assert_almost_equal(last_score,
                                       model.score(X_test, y_test))

        # One non-negative contribution per estimator in the ensemble
        contrib = model.contributions(X_train)
        assert contrib.shape == (model.n_estimators, )
        assert np.all(contrib >= 0)
# Label, lay out and show the current figure (titled as a t-SNE plot of the
# training data), then close it.
plt.title("t-SNE plot of the training data")
plt.xlabel("1st embedding axis")
plt.ylabel("2nd embedding axis")
plt.legend(loc="best", frameon=True, shadow=True)
plt.tight_layout()
plt.show()
plt.close()

# Fit a LogitBoost ensemble with 200 estimators and a fixed seed.
# NOTE(review): LogisticRegression is a classifier; presumably LogitBoost
# expects a regressor as its base estimator -- TODO confirm against the
# LogitBoost documentation.
lboost = LogitBoost(base_estimator=LogisticRegression(), n_estimators=200, random_state=0)
lboost.fit(X_train, y_train)

# Predict on both samples and report overall accuracy.
y_pred_train = lboost.predict(X_train)
y_pred_test = lboost.predict(X_test)
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Training accuracy: %.4f" % accuracy_train)
print("Test accuracy: %.4f" % accuracy_test)

# Print the classification reports for both samples.
report_train = classification_report(y_train, y_pred_train)
report_test = classification_report(y_test, y_pred_test)
print("Training\n%s" % report_train)
print("Testing\n%s" % report_test)

# 1-based indices 1..n_estimators, and the training scores yielded by
# staged_score (presumably one per boosting iteration -- TODO confirm).
iterations = np.arange(1, lboost.n_estimators + 1)
staged_accuracy_train = list(lboost.staged_score(X_train, y_train))
def main(plot=True, M=8, n_fold=10):
    """Compare AdaBoost and LogitBoost on the scaled abalone dataset.

    For T = 100, 200, ..., 100*M, both classifiers are cross-validated on
    the training file. For each classifier the T with the lowest mean
    cross-validation error is selected, the model is refit on the whole
    training set with that T, and train/test errors are printed.

    The data is read from ``datasets/abalone_train_scaled.txt`` and
    ``datasets/abalone_test_scaled.txt`` (comma-separated, label in the last
    column of the training file).

    :param plot: whether to plot the cross-validation errors; the figure is
        saved to ``B.i_cverror.png``
    :param M: maximum T to search would be 100*M
    :param n_fold: number of folds in cross validation
    :return: None
    :raises ValueError: if ``M`` is smaller than 1
    """
    if M < 1:
        # Without at least one candidate T there is nothing to stack/compare.
        raise ValueError("M must be at least 1, got %s" % M)

    train_file, test_file = "datasets/abalone_train_scaled.txt", "datasets/abalone_test_scaled.txt"
    train = _load_delimited(train_file)
    test = _load_delimited(test_file)

    # The last training column is the label; the same column index is used
    # to split the test file.
    x_dim = train.shape[1] - 1
    x_train, y_train = train[:, :x_dim], train[:, x_dim]
    x_test, y_test = test[:, :x_dim], test[:, x_dim]

    aboost_fold_errors = []
    lboost_fold_errors = []
    for multiple in range(1, M + 1):
        T = multiple * 100
        print("\nT = %s\t" % T)

        # decision stump is the default base estimator
        aboost = AdaBoostClassifier(n_estimators=T, random_state=0)
        lboost = LogitBoost(n_estimators=T, random_state=0)

        # get n_fold cross validation scores
        aboost_cv_results = cross_validate(aboost, x_train, y_train, cv=n_fold)
        lboost_cv_results = cross_validate(lboost, x_train, y_train, cv=n_fold)

        # compute error by 1 - accuracy (one value per fold)
        aboost_fold_errors.append(1 - aboost_cv_results['test_score'])
        lboost_fold_errors.append(1 - lboost_cv_results['test_score'])

    # Rows index the candidate T values, columns index the folds.
    aboost_train_cverror = np.stack(aboost_fold_errors)
    lboost_train_cverror = np.stack(lboost_fold_errors)
    print(aboost_train_cverror)
    print(lboost_train_cverror)

    # find the T that gives least error (the best cross-validation accuracy)
    a_train_cverror_mean, a_train_cverror_std, best_T_aboost = \
        _summarize_cv_errors(aboost_train_cverror)
    print("----------------------\n",\
          "AdaBoost iteration number T = %s\n"%(best_T_aboost), \
          "----------------------\n")

    l_train_cverror_mean, l_train_cverror_std, best_T_lboost = \
        _summarize_cv_errors(lboost_train_cverror)
    print("----------------------\n", \
          "LogitBoost iteration number T = %s\n" % (best_T_lboost), \
          "----------------------\n")

    print('Now train with the best T=T* and eval on the test set\n')
    # Train on the whole train set
    aboost = AdaBoostClassifier(n_estimators=best_T_aboost, random_state=0)
    aboost.fit(x_train, y_train)
    lboost = LogitBoost(n_estimators=best_T_lboost, random_state=0)
    lboost.fit(x_train, y_train)

    # Evaluate on the train and test sets
    a_error_train, a_error_test = _error_rates(
        aboost, x_train, y_train, x_test, y_test)
    print("AdaBoost train error: %s test error: %s" % (a_error_train, a_error_test))
    l_error_train, l_error_test = _error_rates(
        lboost, x_train, y_train, x_test, y_test)
    print("LogitBoost train error: %s test error: %s"%(l_error_train, l_error_test))

    if plot:
        plt.figure()
        x_values = range(100, M*100+1, 100)
        # Mean error per T with a +/- one standard deviation band.
        plt.plot(x_values, a_train_cverror_mean, label="AdaBoost")
        plt.fill_between(x_values,
                         a_train_cverror_mean + a_train_cverror_std,
                         a_train_cverror_mean - a_train_cverror_std,
                         alpha=0.5, edgecolor='blue', facecolor='blue')
        plt.plot(x_values, l_train_cverror_mean, label="LogitBoost")
        plt.fill_between(x_values,
                         l_train_cverror_mean + l_train_cverror_std,
                         l_train_cverror_mean - l_train_cverror_std,
                         alpha=0.5, edgecolor='#FF9848', facecolor='#FF9848')
        plt.xlabel('T (number of iterations/classifiers)')
        # Label follows the actual fold count instead of a hard-coded 10.
        plt.ylabel('%sfold cross validation train error' % n_fold)
        plt.legend()
        plt.ylim(0, 0.5)
        plt.savefig('B.i_cverror.png')


def _load_delimited(path):
    """Load a comma-separated numeric text file as a 2-D float array."""
    return np.loadtxt(path, delimiter=",", ndmin=2)


def _summarize_cv_errors(cv_errors):
    """Return per-T mean and std over folds, and the T (multiple of 100) with the lowest mean."""
    mean, std = cv_errors.mean(axis=1), cv_errors.std(axis=1)
    best_T = int(mean.argmin() + 1) * 100
    return mean, std, best_T


def _error_rates(model, x_train, y_train, x_test, y_test):
    """Return (train error, test error), each 1 - accuracy, for a fitted model."""
    error_train = 1 - accuracy_score(y_train, model.predict(x_train))
    error_test = 1 - accuracy_score(y_test, model.predict(x_test))
    return error_train, error_test