def test_fit_sample_weight():
    """Check that a warning is raised if sample_weight is passed to fit()."""
    logitboost = LogitBoost()
    with pytest.warns(RuntimeWarning):
        logitboost.fit(X_simple, y_simple_binary,
                       sample_weight=np.ones(len(X_simple)))
def test_bad_base_estimator():
    """Tests for errors raised when the base estimator is bad."""
    # LogitBoost base estimators should be regressors, not classifiers
    base_estimator = DecisionTreeClassifier()

    # Validation is done at fitting, not at initialization
    logitboost = LogitBoost(base_estimator)
    with pytest.raises(ValueError):
        logitboost.fit(X_simple, y_simple_binary)
def test_feature_importances_():
    """Check that the feature_importances_ attribute behaves as expected."""
    # DecisionTreeRegressor supports feature_importances_
    logitboost = LogitBoost(DecisionTreeRegressor())

    # Binary classification should work
    logitboost.fit(X_simple, y_simple_binary)
    assert logitboost.feature_importances_.shape == (np.shape(X_simple)[1],)

    # Multiclass classification should currently fail
    logitboost.fit(X_simple, y_simple_multiclass)
    with pytest.raises(NotImplementedError):
        _ = logitboost.feature_importances_

    # Ridge doesn't support feature_importances_
    logitboost = LogitBoost(Ridge())

    # Even binary classification shouldn't work
    logitboost.fit(X_simple, y_simple_binary)
    with pytest.raises(AttributeError):
        _ = logitboost.feature_importances_

    # Check that the feature_importances_ attribute identifies bad features
    X, y = load_breast_cancer(return_X_y=True)

    # Add a useless constant feature column to X: it should be the least
    # important
    X = np.column_stack((X, np.zeros(len(X))))
    logitboost = LogitBoost(random_state=0)
    logitboost.fit(X, y)
    feature_importances = logitboost.feature_importances_
    dummy_importance = feature_importances[-1]
    assert dummy_importance == min(feature_importances)
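# As a rough cross-check on the dummy-feature assertion above, permutation
# importance from scikit-learn can be used model-agnostically.  This is only a
# sketch (the helper name and n_repeats are arbitrary, and it is not part of
# the original test suite): shuffling a constant column leaves X unchanged, so
# its permutation importance should be exactly zero.
from sklearn.inspection import permutation_importance


def _check_dummy_feature_unimportant(fitted_model, X, y):
    result = permutation_importance(fitted_model, X, y, n_repeats=5,
                                    random_state=0)
    # The constant column appended last should contribute nothing.
    assert np.isclose(result.importances_mean[-1], 0.0)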
def boost_elasticnet(X_train, X_test, y_train, y_test):
    """Grid-search LogitBoost hyperparameters and record confusion matrices.

    Args:
        X_train, X_test, y_train, y_test: train/test features and labels.

    Returns:
        DataFrame: one row per (n_estimators, learning_rate) combination,
        with the flattened confusion matrix (tn, fp, fn, tp) on the test set.
    """
    rows = []
    # alphas would parameterize an elastic-net SGDClassifier base estimator
    # (see the commented-out loop below); it is unused with plain LogitBoost.
    alphas = [0.0001, 0.001, 0.01]  # , 0.1, 1]
    estimators = [50, 100, 150]
    rates = [0.5, 0.75, 1]
    # for al in alphas:
    #     estimator = SGDClassifier(loss='log', alpha=al, penalty='l1',
    #                               random_state=0)
    for n_est in estimators:
        for rate in rates:
            booster = LogitBoost(n_estimators=n_est, learning_rate=rate,
                                 random_state=0)
            booster.fit(X_train, y_train)
            predicted_labels = booster.predict(X_test)
            tn, fp, fn, tp = confusion_matrix(y_test, predicted_labels,
                                              labels=[0, 1]).ravel()
            convert_matrix = [tn, fp, fn, tp]
            rows.append([n_est, rate, convert_matrix])

    # DataFrame.append is deprecated; build the frame from the rows instead.
    df = pd.DataFrame(rows,
                      columns=['Estimators', 'Learning Rate',
                               'Confusion Matrix'])
    return df
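# For context, a minimal way to exercise boost_elasticnet (an illustration,
# not part of the original: the dataset and split parameters are assumptions)
# could reuse the breast-cancer data with a stratified train/test split.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0)
results = boost_elasticnet(X_train, X_test, y_train, y_test)
print(results.sort_values('Learning Rate'))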
    alpha=0.7,
)
plt.title("t-SNE plot of the training data")
plt.xlabel("1st embedding axis")
plt.ylabel("2nd embedding axis")
plt.legend(loc="best", frameon=True, shadow=True)
plt.tight_layout()
plt.show()
plt.close()

lboost = LogitBoost(base_estimator=LogisticRegression(), n_estimators=200,
                    random_state=0)
lboost.fit(X_train, y_train)

y_pred_train = lboost.predict(X_train)
y_pred_test = lboost.predict(X_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Training accuracy: %.4f" % accuracy_train)
print("Test accuracy: %.4f" % accuracy_test)

report_train = classification_report(y_train, y_pred_train)
report_test = classification_report(y_test, y_pred_test)
print("Training\n%s" % report_train)
print("Testing\n%s" % report_test)
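# In addition to the accuracy scores and classification reports above, a
# confusion matrix on the held-out split is a quick sanity check.  This is a
# small sketch (not in the original script) reusing y_test and y_pred_test.
from sklearn.metrics import confusion_matrix

cm_test = confusion_matrix(y_test, y_pred_test)  # rows: true, columns: predicted
print("Test confusion matrix:\n%s" % cm_test)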
def _toy_dataset_test(load_func, test_size=(1. / 3), random_state=0,
                      min_score_train=0.9, min_score_test=0.9):
    """Create a classification unit test from a scikit-learn toy dataset."""
    # Fetch the dataset
    data = load_func()
    X = data.data
    y = data.target_names[data.target]

    # Distinct classes
    classes = data.target_names
    n_classes = len(classes)

    # Binary/multiclass classification indicator
    is_binary = (n_classes == 2)

    # Shuffle data and split it into training/testing samples
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y,
                         random_state=random_state)

    for bootstrap in (True, False):
        # Fit a LogitBoost model
        logitboost = LogitBoost(bootstrap=bootstrap,
                                random_state=random_state)
        logitboost.fit(X_train, y_train)

        # Compute accuracy scores and assert minimum accuracy
        score_train = logitboost.score(X_train, y_train)
        score_test = logitboost.score(X_test, y_test)
        assert score_train >= min_score_train, \
            ("Failed with bootstrap=%s: training score %.3f less than %.3f"
             % (bootstrap, score_train, min_score_train))
        assert score_test >= min_score_test, \
            ("Failed with bootstrap=%s: testing score %.3f less than %.3f"
             % (bootstrap, score_test, min_score_test))

        # Get probabilities and the decision function
        predict_proba = logitboost.predict_proba(X_test)
        decision_function = logitboost.decision_function(X_test)

        # predict_proba() should always return (n_samples, n_classes)
        assert predict_proba.shape == (X_test.shape[0], n_classes)

        # decision_function() shape depends on the classification task
        if is_binary:
            assert decision_function.shape == (X_test.shape[0],)
        else:
            assert decision_function.shape == (X_test.shape[0], n_classes)

        # Check that the last item of a staged method is the same as a
        # regular method
        staged_predict = np.asarray(list(logitboost.staged_predict(X_test)))
        staged_predict_proba = \
            np.asarray(list(logitboost.staged_predict_proba(X_test)))
        staged_decision_function = \
            np.asarray(list(logitboost.staged_decision_function(X_test)))
        staged_score = \
            np.asarray(list(logitboost.staged_score(X_test, y_test)))
        np.testing.assert_equal(staged_predict[-1],
                                logitboost.predict(X_test))
        np.testing.assert_almost_equal(staged_predict_proba[-1],
                                       logitboost.predict_proba(X_test))
        np.testing.assert_almost_equal(staged_decision_function[-1],
                                       logitboost.decision_function(X_test))
        np.testing.assert_almost_equal(staged_score[-1],
                                       logitboost.score(X_test, y_test))

        # contributions() should return one non-negative number for each
        # estimator in the ensemble
        contrib = logitboost.contributions(X_train)
        assert contrib.shape == (logitboost.n_estimators,)
        assert np.all(contrib >= 0)
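# Concrete tests would then just bind the helper above to specific loaders; a
# minimal sketch assuming scikit-learn's toy datasets and the default score
# thresholds (these two test functions are illustrative, not from the original).
from sklearn.datasets import load_breast_cancer, load_iris


def test_breast_cancer():
    """Binary classification on the breast cancer dataset."""
    _toy_dataset_test(load_breast_cancer)


def test_iris():
    """Multiclass classification on the iris dataset."""
    _toy_dataset_test(load_iris)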
    subsample=1,          # use all samples to build each tree
    colsample_bytree=1,   # use all features to build each tree
    scale_pos_weight=1,   # handle class imbalance
    random_state=27,      # random seed
    silent=0,
)
xg.fit(feature_train_balance, label_train_balance)
xg_pred = xg.predict_proba(feature_test_balance)[:, 1]
xg_evaluation = valid.evaluate(
    label_test_balance, xg_pred, save_path="../data/xg_evaluation.json"
)
plot_evaluation(label_test_balance, xg_pred, "../figure", method="XG")

#%%
lb = LogitBoost(n_estimators=200, random_state=0)  # base_estimator=LogisticRegression()
lb.fit(feature_train_balance, label_train_balance)
lb_pred = lb.predict_proba(feature_test_balance)[:, 1]
lb_evaluation = valid.evaluate(
    label_test_balance, lb_pred, save_path="../data/lb_evaluation.json"
)
plot_evaluation(label_test_balance, lb_pred, "../figure", method="LB")

#%%
from feature import valid

#%%
# Auto-tune model for pandemic
# 1. XGBoost
# 2. XGBoost - additive learning
# 3. LogisticBoosting - additive learning
# 4. dummy Logistic
# 5. Transfer Logistic
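#%%
# To pick n_estimators less arbitrarily, LogitBoost's staged prediction API
# (staged_predict_proba) can score every intermediate ensemble size on the
# held-out set without refitting.  This cell is a sketch, not in the original,
# assuming the same balanced arrays and binary 0/1 labels.
import numpy as np
from sklearn.metrics import roc_auc_score

stage_aucs = [
    roc_auc_score(label_test_balance, proba[:, 1])
    for proba in lb.staged_predict_proba(feature_test_balance)
]
best_stage = int(np.argmax(stage_aucs)) + 1
print("Best test AUC %.4f reached after %d estimators"
      % (max(stage_aucs), best_stage))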
def main(plot=True, M=8, n_fold=10):
    """
    :param plot: whether to plot the train and test result
    :param M: maximum T to search would be 100*M
    :param n_fold: number of folds in cross validation
    :return: None
    """
    train_file, test_file = "datasets/abalone_train_scaled.txt", \
                            "datasets/abalone_test_scaled.txt"
    train = sparse.csr_matrix(np.loadtxt(train_file, delimiter=","))
    test = sparse.csr_matrix(np.loadtxt(test_file, delimiter=","))
    m = train.shape[0]  # 3133
    x_dim = train.shape[1] - 1  # 10
    x_train, y_train = train[:, :x_dim].toarray(), \
                       train[:, x_dim].toarray().squeeze()
    x_test, y_test = test[:, :x_dim].toarray(), \
                     test[:, x_dim].toarray().squeeze()
    # print(x_train.shape, y_train.shape, x_test.shape)
    # print(x_train)

    aboost_train_cverror = list(np.ones(M))
    lboost_train_cverror = list(np.ones(M))
    for multiple in range(1, M + 1):
        T = multiple * 100
        print("\nT = %s\t" % T)

        # Set AdaBoost parameters
        # decision stump is the default base estimator
        aboost = AdaBoostClassifier(n_estimators=T, random_state=0)

        # Set LogitBoost parameters
        lboost = LogitBoost(n_estimators=T, random_state=0)

        # Get 10-fold cross-validation error
        aboost_cv_results = cross_validate(aboost, x_train, y_train, cv=n_fold)
        lboost_cv_results = cross_validate(lboost, x_train, y_train, cv=n_fold)

        # Compute error by 1 - accuracy
        aboost_train_cverror[multiple - 1] = 1 - aboost_cv_results['test_score']
        lboost_train_cverror[multiple - 1] = 1 - lboost_cv_results['test_score']

    aboost_train_cverror = np.stack(aboost_train_cverror)
    lboost_train_cverror = np.stack(lboost_train_cverror)
    print(aboost_train_cverror)
    print(lboost_train_cverror)

    # Find the T that gives the least error (best cross-validation accuracy)
    a_train_cverror_mean, a_train_cverror_std = \
        aboost_train_cverror.mean(axis=1), aboost_train_cverror.std(axis=1)
    argmin = a_train_cverror_mean.flatten().argmin()
    best_T_aboost = int(argmin + 1) * 100
    print("----------------------\n",
          "AdaBoost iteration number T = %s\n" % best_T_aboost,
          "----------------------\n")

    # Find the T that gives the least error (best cross-validation accuracy)
    l_train_cverror_mean, l_train_cverror_std = \
        lboost_train_cverror.mean(axis=1), lboost_train_cverror.std(axis=1)
    argmin = l_train_cverror_mean.flatten().argmin()
    best_T_lboost = int(argmin + 1) * 100
    print("----------------------\n",
          "LogitBoost iteration number T = %s\n" % best_T_lboost,
          "----------------------\n")

    print('Now train with the best T=T* and evaluate on the test set\n')

    # Train on the whole train set
    aboost = AdaBoostClassifier(n_estimators=best_T_aboost, random_state=0)
    aboost.fit(x_train, y_train)
    lboost = LogitBoost(n_estimators=best_T_lboost, random_state=0)
    lboost.fit(x_train, y_train)

    # Test on the test set
    y_pred_train = aboost.predict(x_train)
    y_pred_test = aboost.predict(x_test)
    a_error_train = 1 - accuracy_score(y_train, y_pred_train)
    a_error_test = 1 - accuracy_score(y_test, y_pred_test)
    print("AdaBoost train error: %s test error: %s"
          % (a_error_train, a_error_test))

    y_pred_train = lboost.predict(x_train)
    y_pred_test = lboost.predict(x_test)
    l_error_train = 1 - accuracy_score(y_train, y_pred_train)
    l_error_test = 1 - accuracy_score(y_test, y_pred_test)
    print("LogitBoost train error: %s test error: %s"
          % (l_error_train, l_error_test))

    if plot:
        plt.figure()
        x_values = range(100, M * 100 + 1, 100)
        plt.plot(x_values, a_train_cverror_mean, label="AdaBoost")
        plt.fill_between(x_values,
                         a_train_cverror_mean + a_train_cverror_std,
                         a_train_cverror_mean - a_train_cverror_std,
                         alpha=0.5, edgecolor='blue', facecolor='blue')
        plt.plot(x_values, l_train_cverror_mean, label="LogitBoost")
        plt.fill_between(x_values,
                         l_train_cverror_mean + l_train_cverror_std,
                         l_train_cverror_mean - l_train_cverror_std,
                         alpha=0.5, edgecolor='#FF9848', facecolor='#FF9848')
        plt.xlabel('T (number of iterations/classifiers)')
        plt.ylabel('10-fold cross-validation train error')
        plt.legend()
        plt.ylim(0, 0.5)
        plt.savefig('B.i_cverror.png')
def model_comp(X_train, X_test, y_train, y_test, title=""):
    xgboost_model = XGBClassifier(learning_rate=0.01, max_depth=3,
                                  n_estimators=700, random_state=8)
    gradient_boost_model = GradientBoostingClassifier(learning_rate=0.01,
                                                      max_depth=4,
                                                      max_features='log2',
                                                      min_samples_leaf=4,
                                                      n_estimators=280,
                                                      subsample=0.25,
                                                      random_state=8)
    random_forest_model = RandomForestClassifier(n_estimators=300, max_depth=3,
                                                 verbose=1, random_state=8)
    svm_model = SVC(kernel='poly', probability=True, verbose=1, random_state=8)
    knn_model = KNeighborsClassifier(n_neighbors=3)
    elm_model = MLPClassifier(hidden_layer_sizes=(80,), activation='logistic',
                              learning_rate_init=0.01, verbose=1)
    adaboost_model = AdaBoostClassifier(n_estimators=300, learning_rate=0.01,
                                        random_state=8)
    logitboost_model = LogitBoost(n_estimators=300, learning_rate=0.01,
                                  random_state=8)

    xgboost_model.fit(X_train, y_train)
    gradient_boost_model.fit(X_train, y_train)
    random_forest_model.fit(X_train, y_train)
    svm_model.fit(X_train, y_train)
    knn_model.fit(X_train, y_train)
    elm_model.fit(X_train, y_train)
    adaboost_model.fit(X_train, y_train)
    logitboost_model.fit(X_train, y_train)

    p_random_forest = random_forest_model.predict_proba(X_test)
    p_gradient_boost = gradient_boost_model.predict_proba(X_test)
    p_xgboost = xgboost_model.predict_proba(X_test)
    p_svm = svm_model.predict_proba(X_test)
    p_knn = knn_model.predict_proba(X_test)
    p_elm = elm_model.predict_proba(X_test)
    p_adaboost = adaboost_model.predict_proba(X_test)
    p_logitboost = logitboost_model.predict_proba(X_test)

    random_forest_ll = log_loss(y_test, p_random_forest)
    gradient_boost_ll = log_loss(y_test, p_gradient_boost)
    xgboost_ll = log_loss(y_test, p_xgboost)
    svm_ll = log_loss(y_test, p_svm)
    knn_ll = log_loss(y_test, p_knn)
    elm_ll = log_loss(y_test, p_elm)
    adaboost_ll = log_loss(y_test, p_adaboost)
    logitboost_ll = log_loss(y_test, p_logitboost)

    prntstr = (
        "\n" + title
        + "\nLength of test data: " + str(len(y_test))
        + "\n------------------"
        + "\nGradient Boost Log Loss " + str(gradient_boost_ll)
        + "\nRandom Forest Log Loss " + str(random_forest_ll)
        + "\nXGBoost Log Loss " + str(xgboost_ll)
        + "\n------------------"
        + "\nSVM Log Loss " + str(svm_ll)
        + "\nKNN Log Loss " + str(knn_ll)
        + "\nELM Log Loss " + str(elm_ll)
        + "\nAdaBoost Log Loss " + str(adaboost_ll)
        + "\nLogitBoost Log Loss " + str(logitboost_ll)
    )
    print(prntstr)
    write_to_file(prntstr)

    return xgboost_model, random_forest_model, adaboost_model
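# A minimal sketch of calling the comparison above (the dataset, split
# parameters, and title are placeholders, not from the original; write_to_file
# must be defined elsewhere in the module).
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=8)
xgb_model, rf_model, ada_model = model_comp(
    X_train, X_test, y_train, y_test, title="Breast cancer baseline comparison")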