Example #1
def run(dbdata, dbdata2, knn, param_range, title):
    global counter

    # columns 0-31 hold the features; column 32 holds the class label
    X_train = dbdata.iloc[:, np.arange(32)]
    y_train = dbdata.iloc[:, 32]
    X_test = dbdata2.iloc[:, np.arange(32)]
    y_test = dbdata2.iloc[:, 32]

    ##### Graph the best results obtained from the grid search
    knn.fit(X_train, y_train)
    y_train_pred = knn.predict(X_train)  # predictions on the training set
    y_pred = knn.predict(X_test)  # predictions on the test set

    ###############################
    ########Validation Curve#######
    ###############################
    # train_scores, test_scores = validation_curve(
    #     knn, X_train, y_train, param_name="n_neighbors", param_range=param_range,
    #     cv=5, scoring="neg_brier_score", n_jobs=-1)
    # train_scores_mean = np.mean(train_scores, axis=1)
    # train_scores_std = np.std(train_scores, axis=1)
    # test_scores_mean = np.mean(test_scores, axis=1)
    # test_scores_std = np.std(test_scores, axis=1)

    # plt.title("Brier Score for KNN (Monks%d)" % counter)
    # plt.xlabel('N-neighbours')
    # plt.ylabel('Loss')
    # lw = 2
    # #plt.ylim(0,1)
    # plt.plot(param_range, train_scores_mean, label="Training score",
    #             color="darkorange", lw=lw)
    # plt.plot(param_range, test_scores_mean, label="Testing score",
    #             color="navy", lw=lw)

    # plt.legend(loc="best")

    ########################################
    #################Learning curve#########
    ########################################

    plot_learning_curve(knn,
                        title,
                        X_train,
                        y_train,
                        ylim=(0, 1.01),
                        cv=5,
                        n_jobs=4)

    ############################################

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('Training Score is ', accuracy_score(y_train, y_train_pred))
    print('Testing Score is ', accuracy_score(y_test, y_pred))
    print('Training Error is ', mean_squared_error(y_train, y_train_pred))
    print('Testing Error is ', mean_squared_error(y_test, y_pred))
    plt.savefig('img/KNN/Student%d_learning' % counter)
    counter += 1
    plt.clf()
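
All of these snippets call a project-local plot_learning_curve helper rather than a library function. A minimal sketch of such a helper, built on sklearn.model_selection.learning_curve; the signature and the (plt, test_scores) return value are inferred from the calls on this page, and the plotting details are assumptions:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, scoring=None,
                        train_sizes=np.linspace(0.1, 1.0, 5)):
    # mean train/validation scores over increasing training-set sizes
    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, scoring=scoring,
        train_sizes=train_sizes)
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.plot(sizes, train_scores.mean(axis=1), 'o-', color="darkorange",
             label="Training score")
    plt.plot(sizes, test_scores.mean(axis=1), 'o-', color="navy",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt, test_scores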
Example #2
def run(dbdata, dbdata2, DTclassifier, param_range, title):
    global counter

    X_train = dbdata.iloc[:, [1, 2, 3, 4, 5, 6]]
    y_train = dbdata.iloc[:, 0]

    X_test = dbdata2.iloc[:, [1, 2, 3, 4, 5, 6]]
    y_test = dbdata2.iloc[:, 0]

    ######

    DTclassifier.fit(X_train, y_train)
    y_train_pred = DTclassifier.predict(X_train)  # predictions on the training set
    y_pred = DTclassifier.predict(X_test)  # predictions on the test set

    ########################
    ########VALIDATION CURVE#######
    ########################
    # train_scores, test_scores = validation_curve(
    #     DTclassifier, X_train, y_train, param_name="random_state", param_range=param_range,
    #     cv=3, scoring="neg_brier_score", n_jobs=-1)
    # train_scores_mean = np.mean(train_scores, axis=1)
    # train_scores_std = np.std(train_scores, axis=1)
    # test_scores_mean = np.mean(test_scores, axis=1)
    # test_scores_std = np.std(test_scores, axis=1)

    # plt.title("Brier Score of DTC(MONKS%d)" % counter)
    # plt.xlabel('Random State')
    # plt.ylabel("Loss")
    # lw = 2
    # #plt.ylim(0,1)
    # plt.plot(param_range, train_scores_mean, label="Training score",
    #             color="darkorange", lw=lw)
    # plt.plot(param_range, test_scores_mean, label="Testing score",
    #             color="navy", lw=lw)

    # plt.legend(loc="best")
    # ##################################
    ########################################
    ########################################
    #################LEARNING CURVE#########
    ########################################

    plot_learning_curve(DTclassifier,
                        title,
                        X_train,
                        y_train,
                        ylim=(0, 1.01),
                        cv=5,
                        n_jobs=4)

    print('Training Score is ', accuracy_score(y_train, y_train_pred))
    print('Testing Score is ', accuracy_score(y_test, y_pred))
    print('Training Error is ', mean_squared_error(y_train, y_train_pred))
    print('Testing Error is ', mean_squared_error(y_test, y_pred))
    plt.savefig('img/DTC/Students%d' % counter)
    counter += 1
    plt.clf()
Example #3
def run(dbdata, dbdata2, svclassifier, param_range, title):
    global counter

    X_train = dbdata.iloc[:, np.arange(32)]
    y_train = dbdata.iloc[:, 32]
    X_test = dbdata2.iloc[:, np.arange(32)]
    y_test = dbdata2.iloc[:, 32]

    ##### Graph the best results obtained from the grid search
    svclassifier.fit(X_train, y_train)
    y_train_pred = svclassifier.predict(X_train)  # predictions on the training set
    y_pred = svclassifier.predict(X_test)  # predictions on the test set

    ########################
    ########VALIDATION CURVE#######
    ########################

    # train_scores, test_scores = validation_curve(
    #     svclassifier, X_train, y_train, param_name="C", param_range=param_range,
    #     cv=3, scoring="accuracy", n_jobs=-1)
    # train_scores_mean = np.mean(train_scores, axis=1)
    # train_scores_std = np.std(train_scores, axis=1)
    # test_scores_mean = np.mean(test_scores, axis=1)
    # test_scores_std = np.std(test_scores, axis=1)

    # plt.title("Accuracy Score of SVM(MONKS%d)" % counter)
    # plt.xlabel('C')
    # plt.ylabel("Accuracy")
    # lw = 2
    # plt.ylim(0,1)
    # plt.plot(param_range, train_scores_mean, label="Training score",
    #             color="darkorange", lw=lw)
    # plt.plot(param_range, test_scores_mean, label="Testing score",
    #             color="navy", lw=lw)

    # plt.legend(loc="best")
    ########################################
    ########################################
    #################LEARNING CURVE#########
    ########################################

    plot_learning_curve(svclassifier,
                        title,
                        X_train,
                        y_train,
                        ylim=(0, 1.01),
                        cv=5,
                        n_jobs=4)

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('Training Score is ', accuracy_score(y_train, y_train_pred))
    print('Testing Score is ', accuracy_score(y_test, y_pred))
    print('Training Error is ', mean_squared_error(y_train, y_train_pred))
    print('Testing Error is ', mean_squared_error(y_test, y_pred))
    plt.savefig('img/SVM/Students%d' % counter)
    counter += 1
    plt.clf()
Example #4
def mlp_regression(X, y, cv):
    parameters = {'alpha': 10.0**-np.arange(1, 7)}
    score_func = make_scorer(pearson_cor, greater_is_better=True)
    mlp = MLPRegressor(max_iter=800,
                       hidden_layer_sizes=(200, 200),
                       activation='tanh')
    best_mlp, best_params_mlp = cross_val(mlp,
                                          params=parameters,
                                          X_train=X,
                                          y_train=y,
                                          score=score_func,
                                          cv=cv,
                                          n_jobs=-1)
    title = r"Learning curves (MLP regression)"
    plt, test_scores = plot_learning_curve(best_mlp,
                                           title,
                                           X,
                                           y,
                                           ylim=(0.0, 1.0),
                                           cv=cv,
                                           n_jobs=4,
                                           scoring=score_func)
    plt.show()
    print("best mlp:", best_mlp)
    print("best para:", best_params_mlp)
    return best_mlp, test_scores
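
The pearson_cor metric wrapped by make_scorer above is user-defined and not shown on this page. A plausible minimal definition, assuming it returns the Pearson correlation coefficient between targets and predictions:

from scipy.stats import pearsonr

def pearson_cor(y_true, y_pred):
    # assumed behaviour: return Pearson r between targets and predictions
    r, _ = pearsonr(y_true, y_pred)
    return r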
Example #5
def support_machine_classification(X, y, cv):
    parameters = [
        {
            'kernel': ['linear'],
            'C': [0.1, 1, 10, 100, 1000]
        },
        {
            'kernel': ['rbf'],
            'C': [0.1, 0.2, 0.25, 0.35, 0.5, 1, 10, 100, 400, 1000, 2500],
            'gamma': [0.01, 0.5, 1, 5, 10, 100]
        },
        # {'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'degree': [3, 4], 'gamma': [0.01, 1, 5, 10, 100]}
    ]
    # define a scoring function
    score_func = make_scorer(pearson_cor, greater_is_better=True)
    svc = SVC()
    # Get the best model through CV
    best_svc, best_params_clf = cross_val(svc,
                                          params=parameters,
                                          X_train=X,
                                          y_train=y,
                                          score=score_func,
                                          cv=cv,
                                          n_jobs=-1)

    title_clf = r"Learning Curves (SVC, rbf kernel)"
    plot_learning_curve(best_svc,
                        title_clf,
                        X,
                        y,
                        ylim=(0.0, 1.0),
                        cv=cv,
                        n_jobs=4,
                        scoring=score_func)
    plt.show()
    print("best svc:", best_svc)
    print("best para:", best_params_clf)
    return best_svc
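
cross_val is another project-local helper. A minimal sketch, assuming it simply wraps GridSearchCV and returns the refit best estimator together with its parameter dict:

from sklearn.model_selection import GridSearchCV

def cross_val(estimator, params, X_train, y_train, score, cv, n_jobs):
    # hypothetical helper: exhaustive grid search, refit on the full data
    search = GridSearchCV(estimator, params, scoring=score, cv=cv,
                          n_jobs=n_jobs)
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_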
Example #6
def random_forest(X, y, cv):
    score_func = make_scorer(pearson_cor, greater_is_better=True)
    rfg = ExtraTreesRegressor(max_features=8)  # note: ExtraTrees, despite the "Random Forest" title below
    title = r"Learning curves (Random Forest)"
    plt, test_scores = plot_learning_curve(rfg,
                                           title,
                                           X,
                                           y,
                                           ylim=(0.0, 1.0),
                                           cv=cv,
                                           n_jobs=4,
                                           scoring=score_func)
    plt.show()
    return rfg, test_scores
Example #7
    # Use parameters from either the hyperparameter optimization, or manually selected parameters...
    params = best_params

    print "Generating SGDClassifier model with parameters: ", params
    sgd = SGDClassifier(**params)

    print 'Plot learning curve...'
    cv = ShuffleSplit(X.shape[0],
                      n_iter=25,
                      test_size=0.2,
                      random_state=np.random.randint(0, 123456789))
    title = "SGDClassifier: ", params
    learningcurve.plot_learning_curve(sgd,
                                      title,
                                      X,
                                      y,
                                      ylim=(0.5, 1.0),
                                      cv=cv,
                                      n_jobs=-1)

    test_data = X_test.values
    Xt = test_data[:, 1::]
    yt = test_data[:, 0]

    print "Training model with", train_data.shape[0], "examples"
    print "Testing model with", test_data.shape[0], "examples"
    print "Submitting predicted labels for", submit_df.shape[0], "records"

    test_scores = []
    # Using the optimal parameters, predict the survival of the labeled test set
    for i in range(5):
    #==============================================================================================================

Example #8
    # Use parameters from either the hyperparameter optimization, or manually selected parameters...
    params = params_score
    #params = best_params

    #############################################################################################################
    # Model generation/validation
    #
    print "Generating RandomForestClassifier model with parameters: ", params
    forest = RandomForestClassifier(n_jobs=-1, oob_score=True, **params)

    print "\nCalculating Learning Curve..."
    title = "RandomForestClassifier with hyperparams: ", params
    midpoint, diff = \
         learningcurve.plot_learning_curve(forest, title, X, y, (0.6, 1.01), cv=8, n_jobs=-1, plot=True)
    #print "Midpoint:", midpoint
    #print "Diff:", diff

    print "\nGenerating ROC curve 5 times to get mean AUC with class weights..."
    aucs = []
    for i in range(5):
        aucs.append(roc_auc.generate_roc_curve(forest, X, y, survived_weight))
    auc_mean = ("%.3f" % (np.mean(aucs))).lstrip('0')
    auc_std = ("%.3f" % (np.std(aucs))).lstrip('0')
    auc_lower = ("%.3f" % (np.mean(aucs) - np.std(aucs))).lstrip('0')
    print "ROC - Area under curve:", auc_mean, "and stddev:", auc_std

    print "\nFitting model 5 times to get mean OOB score using full training data with class weights..."
    test_scores = []
    # Using the optimal parameters, predict the survival of the labeled test set 10 times
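
roc_auc.generate_roc_curve is also project-local; the snippet only shows that it takes a classifier, the full training data, and per-sample weights, and returns an AUC value. A rough sketch under those assumptions:

from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split

def generate_roc_curve(clf, X, y, sample_weight):
    # hypothetical: hold out a split, fit with sample weights, return test AUC
    X_tr, X_te, y_tr, y_te, w_tr, _ = train_test_split(
        X, y, sample_weight, test_size=0.25)
    clf.fit(X_tr, y_tr, sample_weight=w_tr)
    scores = clf.predict_proba(X_te)[:, 1]
    fpr, tpr, _ = roc_curve(y_te, scores)
    return auc(fpr, tpr)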
Example #9
    params = params_score
    #params = best_params



    #############################################################################################################
    # Model generation/validation
    #
    print "Generating RandomForestClassifier model with parameters: ", params
    forest = RandomForestClassifier(n_jobs=-1, oob_score=True, **params)


    print "\nCalculating Learning Curve..."
    title = "RandomForestClassifier with hyperparams: ", params
    midpoint, diff = \
         learningcurve.plot_learning_curve(forest, title, X, y, (0.6, 1.01), cv=8, n_jobs=-1, plot=True, verbose=10)
    #print "Midpoint:", midpoint
    #print "Diff:", diff


    print "\nGenerating ROC curve 5 times to get mean AUC with class weights..."
    aucs = []
    for i in range(5):
        aucs.append(roc_auc.generate_roc_curve(forest, X, y, survived_weight))
    auc_mean = ("%.3f" % (np.mean(aucs))).lstrip('0')
    auc_std = ("%.3f" % (np.std(aucs))).lstrip('0')
    auc_lower = ("%.3f" % (np.mean(aucs) - np.std(aucs))).lstrip('0')
    print("ROC - Area under curve:", auc_mean, "and stddev:", auc_std)


    print "\nFitting model 5 times to get mean OOB score using full training data with class weights..."
Example #10
# random_search.fit(train_data[0::,1::], train_data[0::,0])
# best_params = report(random_search.grid_scores_)
#==================================================================================================================

# Plot the learning curve for the model
# sklearn.model_selection.ShuffleSplit replaces the removed cross_validation API
cv = sklearn.model_selection.ShuffleSplit(n_splits=10,
                                          train_size=0.7,
                                          test_size=0.3,
                                          random_state=np.random.randint(
                                              0, 123456789))
title = "Learning Curves (BernoulliNB)"
bnb = naive_bayes.BernoulliNB()
learningcurve.plot_learning_curve(bnb,
                                  title,
                                  X,
                                  y, (0.6, 0.9),
                                  cv=cv,
                                  n_jobs=1)

# Using the optimal parameters, predict the survival of the test set
print('Predicting...')
bnb = naive_bayes.BernoulliNB()
bnb.fit(train_data[0::, 1::], train_data[0::, 0])
output = bnb.predict(test_data).astype(int)

# write results
# text mode with newline='' is required for csv.writer under Python 3
predictions_file = open(
    "data/results/naivebayes_bernoulli" + str(int(time.time())) + ".csv",
    "w", newline="")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId", "Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
Example #11
    sys.exit()

    #==============================================================================================================
    # print 'Hyperparameter optimization via GridSearchCV...'
    # grid_search = GridSearchCV(svc, rbf_params, cv=20, n_jobs=-1, verbose=2)
    # grid_search.fit(X, y)
    # best_params = report(grid_search.grid_scores_)
    #==============================================================================================================
    
    
    # Plot the learning curve for the model with the best parameters
    print('Plotting learning curve...')
    cv = ShuffleSplit(n_splits=20, test_size=0.33, random_state=np.random.randint(0, 123456789))
    title = "SVC(RBF): %s" % (best_params,)
    svc = SVC(**best_params)
    learningcurve.plot_learning_curve(svc, title, X, y, ylim=(0.5, 1.0), cv=cv, n_jobs=-1)
    
    sys.exit()
    
    # Using the optimal parameters, predict the survival of the test set
    print('Predicting test set...')
    #==================================================================================================================
    # for train_ix, val_ix in cv:
    #     sgd.fit(X[train_ix], y[train_ix])
    #     val_pred = sgd.predict(X[val_ix])
    #     print "cross val accuracy score: ", metrics.accuracy_score(y[val_ix], val_pred)
    #==================================================================================================================
    svc.fit(X, y)
    output = svc.predict(test_data).astype(int)
    
     
Example #12
    grid_search = GridSearchCV(forest, params, n_jobs=-1)
    grid_search.fit(X, y)
    # note: newer scikit-learn exposes cv_results_ instead of grid_scores_
    best_params = scorereport.report(grid_search.grid_scores_)

    # Use parameters from either the hyperparameter optimization, or manually selected parameters...
    params = best_params

    print("Generating RandomForestClassifier model with parameters: ", params)
    forest = RandomForestClassifier(n_jobs=-1, **params)

    print("Plot Learning Curve...")
    cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.25,
                                      random_state=np.random.randint(0, 123456789))
    title = "RandomForestClassifier with hyperparams: %s" % (params,)
    learningcurve.plot_learning_curve(forest, title, X, y, (0.6, 1.01), cv=cv, n_jobs=-1)

    test_data = X_test[X_test.Gender == gender].drop('Gender', axis=1).values
    Xt = test_data[:, 1::]
    yt = test_data[:, 0]

    print("Training", gender, "model with", train_data.shape[0], "examples")
    print("Testing", gender, "model with", test_data.shape[0], "examples")
    print("Submitting predicted labels for", submit_df.shape[0], "records")

    test_scores = []
    # Using the optimal parameters, predict the survival of the labeled test set
    for i in range(5):
        print("Predicting test set for submission...")
        forest.fit(X, y)
Example #13
from lassocv_util import lasso_cv_x, load_data
from learningcurve import plot_learning_curve
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LassoCV, Lasso, RidgeCV
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    elf_lr = LogisticRegression(max_iter=5000, solver="liblinear", C=1)

    x, y, df = load_data('merged_file_f.csv')
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1215)

    lasso_x_train, lasso_x_test, lasso_f_columns = lasso_cv_x(
        x_train, x_test, y_train, y_test)
    plot_learning_curve(elf_lr,
                        lasso_x_train,
                        y_train,
                        model_name='Logistic Regression')
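
lasso_cv_x comes from the local lassocv_util module and is not shown. A speculative sketch, assuming it performs LassoCV-based feature selection and returns the reduced train/test matrices plus the selected column names:

from sklearn.linear_model import LassoCV

def lasso_cv_x(x_train, x_test, y_train, y_test):
    # hypothetical: keep only features with nonzero LassoCV coefficients;
    # y_test is accepted to match the call site but unused here
    lasso = LassoCV(cv=5).fit(x_train, y_train)
    selected = x_train.columns[lasso.coef_ != 0]
    return x_train[selected], x_test[selected], selected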
Example #14
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    model.fit(X_train, y_train)
    t = model.predict(X_test)

    print(msg + " Test score: " + str(r2_score(y_test, t)))

# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax = sns.boxplot(data=results)
ax.set_xticklabels(names)
plt.show()
# fig.savefig('plot/algorithm_compare.png', format='png', dpi=1200)

# Draw learning curve of each models
import learningcurve
for name, model in models:
    title = "Learning Curves (" + name + ")"
    estimator = model
    p1 = learningcurve.plot_learning_curve(estimator,
                                           title,
                                           X_train,
                                           y_train,
                                           ylim=(0.1, 1.01),
                                           cv=kfold,
                                           n_jobs=4)
Example #15
        grid_search.fit(X, y)
        best_params = scorereport.report(grid_search.grid_scores_)

        # Use parameters from either the hyperparameter optimization, or manually selected parameters...
        params = best_params

        print "Generating RandomForestClassifier model with parameters: ", params
        forest = RandomForestClassifier(n_jobs=-1, **params)

        print "Plot Learning Curve..."
        cv = cross_validation.ShuffleSplit(train_data.shape[0], n_iter=5, test_size=0.25, \
                                           random_state=np.random.randint(0,123456789))
        title = "RandomForestClassifier with hyperparams: ", params
        learningcurve.plot_learning_curve(forest,
                                          title,
                                          X,
                                          y, (0.6, 1.01),
                                          cv=cv,
                                          n_jobs=-1)

        test_data = X_test[X_test.Gender == gender].drop('Gender',
                                                         axis=1).values
        Xt = test_data[:, 1::]
        yt = test_data[:, 0]

        print "Training", gender, "model with", train_data.shape[0], "examples"
        print "Testing", gender, "model with", test_data.shape[0], "examples"
        print "Submitting predicted labels for", submit_df.shape[0], "records"

        test_scores = []
        # Using the optimal parameters, predict the survival of the labeled test set
        for i in range(5):
Example #16
######

import learningcurve
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm

title = "Learning Curves (RandomForestRegressor)"
# Cross validation with 10 iterations to get smoother mean test and train
# score curves, each time with 10% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

estimator = RandomForestRegressor(n_estimators=best_n,
                                  max_features=best_f)
p1 = learningcurve.plot_learning_curve(estimator,
                                       title,
                                       X_train,
                                       y_train,
                                       ylim=(0.1, 1.01),
                                       cv=cv,
                                       n_jobs=4)
p1.grid()
p1.savefig('LearningCurve_rf.png', format='png', dpi=1200)

title = "Learning Curves (SVM, RBF, C = 100)"
# SVC is more expensive so we do a lower number of CV iterations:

estimator = svm.SVR(C=best_c, epsilon=best_e, kernel='rbf')
p2 = learningcurve.plot_learning_curve(estimator,
                                       title,
                                       X_train,
                                       y_train, (0.5, 1.01),
                                       cv=10,
                                       n_jobs=4)
#           "binarize": np.random.rand()}
# # run randomized search to find the optimal parameters
# n_iter_search = 50
# bnb = naive_bayes.BernoulliNB()
# random_search = RandomizedSearchCV(bnb, param_distributions=params, n_iter=n_iter_search)
# random_search.fit(train_data[0::,1::], train_data[0::,0])
# best_params = report(random_search.grid_scores_)
#==================================================================================================================


# Plot the learning curve for the model
cv = sklearn.model_selection.ShuffleSplit(n_splits=100, train_size=0.7, test_size=0.3,
                                          random_state=np.random.randint(0, 123456789))
title = "Learning Curves (GaussianNB)"
gnb = naive_bayes.GaussianNB()
learningcurve.plot_learning_curve(gnb, title, X, y, (0.6, 0.9), cv=cv, n_jobs=1)


# Using the optimal parameters, predict the survival of the test set
print('Predicting...')
gnb.fit(train_data[0::, 1::], train_data[0::, 0])
output = gnb.predict(test_data).astype(int)
  
# write results
predictions_file = open("data/results/naivebayes_gaussian" + str(int(time.time())) + ".csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print('Done.')
Example #18
    # Get the best model through CV
    best_svr, best_params_rg = cross_val(svr,
                                         params=parameters,
                                         X_train=X_rg_scaled,
                                         y_train=y_rg,
                                         score=score_func,
                                         cv=cv,
                                         n_jobs=-1)
    print("best svr:", best_svr)
    print("best para:", best_params_rg)

    title_rg = r"Learning Curves (SVR, rbf kernel)"
    plot_learning_curve(best_svr,
                        title_rg,
                        X_rg_scaled,
                        y_rg,
                        ylim=(0.0, 0.8),
                        cv=cv,
                        n_jobs=4,
                        scoring=score_func)
    plt.show()

    # X_train, X_test, y_train, y_test = train_test_split(X_rg_scaled, y_rg, test_size=0.33, random_state=42)
    X_train = X_rg_scaled[100:]
    y_train = y_rg[100:]
    X_test = X_rg_scaled[:100]
    y_test = y_rg[:100]
    best_svr.fit(X_train, y_train)

    y_predict = best_svr.predict(X_test)
    print(pearson_cor(y_test, y_predict))