def train_and_test(X_train, y_train, X_test, y_test):
    '''
    Tune the AdaBoost ensemble size with a grid search and print test metrics.

    Runs a 3-fold cross-validated grid search over ``n_estimators`` (scored by
    accuracy) on the training data, prints the best parameter set and the mean
    cross-validation accuracy of every candidate, then prints a classification
    report and the accuracy of the selected model on the test data.

    :param X_train: training features
    :param y_train: training tags
    :param X_test: testing features
    :param y_test: testing tags
    :return: None (all results are printed)

    Refs:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
    '''
    # Parameter tuning search scheme: one candidate per ensemble size, tried
    # in this order.
    param_grid = [{"n_estimators": [10, 50, 100, 300, 500, 1000]}]

    # Search for optimized parameters and fit the model.
    # NOTE: `iid` is deliberately not passed. It was deprecated in
    # scikit-learn 0.22 and removed in 0.24 (passing it raises TypeError);
    # since 0.22 the score is the plain average across folds, which is what
    # `iid=False` used to request.
    clf = GridSearchCV(
        ada(n_estimators=500),  # starting value; overridden by the grid
        param_grid,
        scoring='accuracy',
        cv=3)
    clf.fit(X_train, y_train)

    # Best parameters
    print('Best params set found on training set:\n', clf.best_params_)

    # Mean cross-validation score of every candidate
    print('\nGrid (mean accuracy) scores on training set:\n')
    means = clf.cv_results_['mean_test_score']
    for mean, params in zip(means, clf.cv_results_['params']):
        print("%0.3f for %r" % (mean, params))

    print('\nDetailed classification report:\n')
    y_pred = clf.predict(X_test)
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    print(classification_report(y_test, y_pred))
    # https://scikit-learn.org/stable/modules/classes.html#classification-metrics
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
def adaboost_test():
    """Compare one decision tree against two AdaBoost implementations.

    Generates a synthetic binary classification problem, holds out 30% of it
    for validation, and prints the validation score of:

    * a single ``DecisionTreeClassifier``,
    * the ``AdaBoostClassifier`` imported from ``ml.ensemble``,
    * scikit-learn's ``AdaBoostClassifier``,

    with both ensembles using 50 trees of the same depth as the single tree.
    """
    from sklearn.datasets import make_classification
    from sklearn.ensemble import AdaBoostClassifier as ada
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    # Experiment settings.
    seed = 19
    sample_count = 5000
    feature_count = 10
    informative_count = 8
    clusters_per_class = 1
    tree_depth = 3

    features, labels = make_classification(
        n_samples=sample_count,
        n_features=feature_count,
        n_informative=informative_count,
        n_redundant=0,
        n_clusters_per_class=clusters_per_class,
        n_classes=2,
        random_state=seed,
        class_sep=1.0)

    # Pack features f1..fN and the label 'y' into one frame; stacking with
    # the float features turns the label into floats, so cast it back.
    column_names = ['f%d' % idx for idx in range(1, feature_count + 1)] + ['y']
    frame = pd.DataFrame(
        np.hstack((features, labels.reshape(-1, 1))),
        columns=column_names)
    frame['y'] = frame['y'].astype('int')
    X = frame.drop('y', axis=1)
    y = frame['y']

    X_train, X_vali, y_train, y_vali = train_test_split(
        X, y, test_size=0.3, random_state=seed)

    # Baseline: a single shallow tree.
    single_tree = DecisionTreeClassifier(max_depth=tree_depth)
    single_tree.fit(X_train, y_train)
    print('DecisionTreeClassifier score: ', single_tree.score(X_vali, y_vali))

    # AdaBoost from ml.ensemble (imported here, after the baseline has run).
    from ml.ensemble import AdaBoostClassifier
    custom_boost = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=tree_depth), n_estimators=50)
    custom_boost.fit(X_train, y_train)
    print('AdaBoostClassifier score: ', custom_boost.score(X_vali, y_vali))

    # scikit-learn's AdaBoost with the same weak learner and ensemble size.
    sklearn_boost = ada(
        DecisionTreeClassifier(max_depth=tree_depth), n_estimators=50)
    sklearn_boost.fit(X_train, y_train)
    print('sklearn adaboost score: ', sklearn_boost.score(X_vali, y_vali))
def train_and_test(X_train, y_train, X_test, y_test):
    '''
    Grid-search the AdaBoost ensemble size and print evaluation results.

    A 3-fold cross-validated grid search over ``n_estimators`` (scored by
    accuracy) is fitted on the training data. The best parameter set and the
    mean cross-validation accuracy of each candidate are printed, followed by
    a classification report and the accuracy of the selected model on the
    test data.

    :param X_train: training features
    :param y_train: training tags
    :param X_test: testing features
    :param y_test: testing tags
    :return: None (all results are printed)

    Refs:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
    '''
    # Candidate ensemble sizes, evaluated in this order.
    param_grid = [{"n_estimators": [10, 50, 100, 300, 500, 1000]}]

    # NOTE: `iid` is deliberately not passed. It was deprecated in
    # scikit-learn 0.22 and removed in 0.24 (passing it raises TypeError);
    # since 0.22 the score is the plain average across folds, which is what
    # `iid=False` used to request.
    clf = GridSearchCV(
        ada(n_estimators=500),  # starting value; overridden by the grid
        param_grid,
        scoring='accuracy',
        cv=3)
    clf.fit(X_train, y_train)

    print('Best params set found on training set:\n', clf.best_params_)

    print('\nGrid (mean accuracy) scores on training set:\n')
    means = clf.cv_results_['mean_test_score']
    for mean, params in zip(means, clf.cv_results_['params']):
        print("%0.3f for %r" % (mean, params))

    print('\nDetailed classification report:\n')
    y_pred = clf.predict(X_test)
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    print(classification_report(y_test, y_pred))
    # https://scikit-learn.org/stable/modules/classes.html#classification-metrics
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
# Predictors are every column except the 'Class' label. `drop` already
# returns a new frame, so no prior self-selection of all columns is needed.
allData_X = allData.drop('Class', axis=1)

# Record the feature list and the split proportions in the results file.
result_file.write("\nUsing features:\n" + str(allData_X.axes[1].tolist()))
result_file.write("\n\nTrain Split: " + str(1 - test_split))
result_file.write("\nTest Split: " + str(test_split))

allData_y = allData['Class']

train_X, test_X, train_y, test_y = train_test_split(allData_X,
                                                    allData_y,
                                                    test_size=test_split,
                                                    random_state=0)

# Per-sample training weights: 1 where the label is 0, `weight + 1` where it
# is 1 (presumably 'Class' is a 0/1 label; verify against the data).
weight_list = (train_y * weight + 1)

# L1 regularisation needs a solver that supports it. The default solver
# changed to 'lbfgs' in scikit-learn 0.22, and 'lbfgs' rejects penalty='l1'
# with a ValueError, so pin 'liblinear' (the previous default) explicitly.
lr = LogisticRegression(penalty='l1', solver='liblinear')
ab_clf = ada(DecisionTreeClassifier(max_depth=1), n_estimators=50)
rf_clf = rfc(n_estimators=50)

print("Starting Logistic Regression")
lr.fit(train_X, train_y, sample_weight=weight_list)
predictions = lr.predict(test_X)

# Test-set metrics for the logistic regression model.
lr_recall_score = recall_score(test_y, predictions)
lr_report = classification_report(test_y, predictions)
lr_f1 = f1_score(test_y, predictions)

# The report goes both to stdout and to the results file.
result_file.write("\n\nLogistic Regression:")
print(lr_report)
result_file.write("\n" + lr_report)
print("Finished Logistic Regression")
# Take the best estimator found by the grid search as the final model.
final_model = grid_search.best_estimator_

# Evaluate the best model on the test data: fit it on the training split and
# score the second column of its predicted probabilities with ROC AUC.
final_model.fit(train_features, train_labels)
preds = final_model.predict_proba(test_features)[:, 1]
baseline_auc88 = roc_auc_score(y_true=test_labels, y_score=preds)

print(
    'The final tuned KNN_model scores {:.5f} ROC AUC on the test set.'.format(
        baseline_auc88
    )
)

# In[ ]:

# ------<Mode9: adaboost>----------
from sklearn.ensemble import AdaBoostClassifier as ada

# Establish a baseline model with default hyperparameters.
base_ada = ada()

# Show the default hyperparameters.
hyperparameters = base_ada.get_params()
print(hyperparameters)

# 10-fold cross-validated ROC AUC on the training data.
ada_scores = cross_val_score(
    estimator=base_ada,
    X=train_features,
    y=train_labels,
    scoring='roc_auc',
    cv=10,
)
print('The mean AUC for AdaBoost is:', ada_scores.mean())

# Fit on the full training data, then make actual class predictions.
base_ada.fit(train_features, train_labels)
ada_predictions = base_ada.predict(test_features)
y = np.load('data/y_boston.npy') X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) regressors = [ lr(), bay(), rr(alpha=.5, random_state=0), l(alpha=0.1, random_state=0), ll(), knn(), ard(), rfr(random_state=0, n_estimators=100), SVR(gamma='scale', kernel='rbf'), rcv(fit_intercept=False), en(random_state=0), dtr(random_state=0), ada(random_state=0), gbr(random_state=0) ] print('unscaled:', br) for reg in regressors: reg.fit(X_train, y_train) rmse, name = get_error(reg, X_test, y_test) name = reg.__class__.__name__ print(name + '(rmse):', end=' ') print(rmse) print() print('scaled:', br) scaler = StandardScaler() X_train_std = scaler.fit_transform(X_train) X_test_std = scaler.fit_transform(X_test) for reg in regressors: