def learning_curve(self, X1, Y1, dataset_name, kernel):

        title = "Learning Curve for {} Dataset(supportVectorMachine)".format(
            dataset_name)
        cv = StratifiedKFold(n_splits=10, random_state=42)
        X_train, X_test, y_train, y_test = train_test_split(X1,
                                                            Y1,
                                                            test_size=0.3)

        #  Linear Kernel
        if kernel == "linear":
            C_param1 = self.GridSearchCV1(X_train, y_train, dataset_name)
            estimator1 = Pipeline([('Scale', StandardScaler()),
                                   ('clf', LinearSVC(C=C_param1))])
            plot_learning_curve(estimator1, title, X1, Y1, ylim=None, cv=cv)

            return estimator1

        # RBF Kernel
        elif kernel == "rbf":
            C_param2 = self.GridSearchCV2(X_train, y_train, dataset_name)

            estimator2 = Pipeline([('Scale', StandardScaler()),
                                   ('clf', SVC(kernel='rbf', C=C_param2))])
            plot_learning_curve(estimator2, title, X1, Y1, ylim=None, cv=cv)

            return estimator2
    def learning_curve(self, X1, Y1, param, dataset_name):

        title = "Learning Curve for {} Dataset (Neural Network)".format(dataset_name)
        X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3)
        estimator = self.gridSearchCV(X_train, y_train, param, dataset_name)
        plot_learning_curve(estimator, title, X1, Y1, ylim=None, cv=5)
        plt.show()
        return estimator
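Most of the scikit-learn examples on this page call a project-local plot_learning_curve helper defined elsewhere in each repository (the exact signature varies by project). A minimal sketch of such a helper, assuming it wraps sklearn.model_selection.learning_curve and matches the (estimator, title, X, y, ylim, cv) signature used above, could look like this:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    # Compute train/validation scores for increasing training-set sizes.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.grid()
    plt.plot(train_sizes, train_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt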
Example No. 3
def scoreModel(classifiers, X, y, testX, testy, scoring,
               outputDir, params, scoreType='baseline',
               dsname=''):
    fitClassifiers = {}
    scores = []
    names = []
    for classifier in classifiers:
        clf, _ = A1.getClfParams(classifier)
        if params is not None:
            # Remove classifier prefix from params
            p = {k.replace('classifier__', ''): v for k, v in params[classifier].items()}
            clf.set_params(**p)

        print('{}: Generating {} learning curve'
              .format(classifier, scoreType))
        print('{}: hyperparameters: '.format(classifier), clf.get_params())
        util.plot_learning_curve(classifier, clf, X,
                                 y, scoring,
                                 savedir=outputDir,
                                 scoreType=scoreType)

        # SVM and ANN need a training epoch graph
        if classifier == 'kernelSVM' or classifier == 'ann':
            util.plotValidationCurve(clf, X, y,
                                     scoring=scoring,
                                     paramName='max_iter',
                                     paramRange=range(100, 2000, 100),
                                     savedir=outputDir,
                                     clfName='{}-{}'.format(classifier, scoreType),
                                     cv=3)

        # To score the model, fit with given parameters and predict
        print('{}: Retraining with best parameters on entire training set'
              .format(classifier))
        pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                                   ('classifier', clf)])
        start_time = timeit.default_timer()
        pipeline.fit(X, y)
        total_time = timeit.default_timer() - start_time
        print('{}: Training took {} seconds'.format(classifier, total_time))
        ypred = pipeline.predict(testX)
        fitClassifiers[classifier] = pipeline
        scores.append(f1_score(testy, ypred))
        names.append(classifier)

        # Generate confusion matrix
        print('{}: Scoring predictions against test set'
              .format(classifier))
        util.confusionMatrix(classifier, testy, ypred,
                             savedir=outputDir,
                             scoreType=scoreType)

        plt.close('all')

    util.plotBarScores(scores, names, '', outputDir, phaseName=scoreType)
    plt.close('all')
    return fitClassifiers
Example No. 4
def draw_learning_curve_2():
    title = "Learning Curve for Optical Digit Recognition Dataset (Decision Tree)"
    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X2, Y2, test_size=0.3)
    max_depth, min_samples_leaf = getParametersFromGridSearchCV(64, X_train, y_train)
    estimator = DecisionTreeClassifier(max_depth=max_depth, random_state=100, min_samples_leaf=min_samples_leaf)
    plot_learning_curve(estimator, title, X2, Y2, ylim=None, cv=cv)

    plt.show()
    def learning_curve(self, X1, Y1, dataset_name, d):

        title = "Learning Curve for {} Dataset (Boosting)".format(dataset_name)
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3)
        n_estimators = self.GridSearchCV(X_train, y_train, dataset_name, d)
        estimator = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=d, random_state=42),
                                       n_estimators=n_estimators)
        plot_learning_curve(estimator, title, X1, Y1, ylim=None, cv=cv)
        plt.show()
        return estimator
Example No. 6
File: ada.py  Project: ghanley/OMSCS
def run_ada(X, y, X_train, X_test, y_train, y_test, title):
  # param_range = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
  # param_range = [1,5,10,15,20,25,30,35,40,45,50]
  # param_name = 'n_estimators'

  param_range = [.0001, .001, .01, .1, 1, 10]
  param_name = 'learning_rate'

  # data = load_digits()

  # X = data.data
  # y = data.target

  dt = tree.DecisionTreeClassifier(max_depth=10)
  # class sklearn.ensemble.AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
  ada = AdaBoostClassifier(dt)


  util.plot_learning_curve(ada, title + " ADA LC", X, y, cv=5, n_jobs=-1)


  train_scores, test_scores = validation_curve(ada, X, y, param_name=param_name,
                                               param_range=param_range, cv=5,
                                               scoring="accuracy", n_jobs=-1)


  # Calculate mean and standard deviation for training set scores
  train_mean = np.mean(train_scores, axis=1)
  train_std = np.std(train_scores, axis=1)

  # Calculate mean and standard deviation for test set scores
  test_mean = np.mean(test_scores, axis=1)
  test_std = np.std(test_scores, axis=1)

  plt.figure()
  # Plot mean accuracy scores for training and test sets
  # plt.plot(param_range, train_mean, 'o-', label="Training score", color="g")
  # plt.plot(param_range, test_mean, 'o-', label="Cross-validation score", color="r")
  lw = 2
  plt.semilogx(param_range, train_mean, label="Training score",
                  color="darkorange", lw=lw)
  # plt.plot(param_range, train_mean, 'o-', label="Training score", color="g")
  plt.semilogx(param_range, test_mean, label="Cross-validation score",
                  color="navy", lw=lw)
  # plt.plot(param_range, test_mean, 'o-', label="Cross-validation score", color="r")



  # Create plot
  plt.title("Ada Boost "+title+ " Validation")
  plt.xlabel(param_name)
  plt.ylabel("Accuracy Score")
  plt.tight_layout()
  plt.legend(loc="best")
  plt.savefig(title+'ADAvalidation.png')
  plt.figure()
Example No. 7
    def learning_curve(self, X1, Y1, dataset_name):

        title = "Learning Curve for {} Dataset(KNN)".format(dataset_name)
        cv = StratifiedKFold(n_splits=10, random_state=42)
        X_train, X_test, y_train, y_test = train_test_split(X1,
                                                            Y1,
                                                            test_size=0.3)
        n_neighbors = self.GridSearchCV(X_train, y_train, dataset_name)

        estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
        plot_learning_curve(estimator, title, X1, Y1, ylim=None, cv=cv)

        plt.show()
        return estimator
Example No. 8
def Boosting(X_train,
             X_test,
             y_train,
             y_test,
             data_name,
             lc_y_min=0.4,
             lc_y_max=1.01):
    # Train Model and Predict
    # dt1 = DecisionTreeClassifier(criterion="gini", max_depth=1)
    # dt2 = DecisionTreeClassifier(criterion="gini", max_depth=2)
    dt3 = DecisionTreeClassifier(criterion="gini", max_depth=3)
    # dt4 = DecisionTreeClassifier(criterion="gini", max_depth=4)
    # dt5 = DecisionTreeClassifier(criterion="gini", max_depth=5)
    param_grid = {
        "base_estimator": [dt3],
        "learning_rate": np.linspace(0.5, 10.0, 20),
        "n_estimators": range(1, 200, 20)
    }

    clf = AdaBoostClassifier()

    # run grid search
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=5,
                               verbose=1,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    best_params = grid_search.best_params_
    print(best_params)
    save_cv(grid_search.cv_results_, 'Boosting', data_name)
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))

    title = 'Learning Curves (Boosting Classifier) - {}'.format(data_name)

    estimator = AdaBoostClassifier(**best_params)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=(lc_y_min, lc_y_max),
                        cv=cv,
                        n_jobs=4)
    plt.savefig('Figs/Boosting-learningcurve-{}'.format(data_name))
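Note that scikit-learn 1.2 renamed AdaBoostClassifier's base_estimator parameter to estimator (the old name was removed in 1.4), so on a newer release the grid above would presumably be written as:

    param_grid = {
        "estimator": [dt3],  # "base_estimator" before scikit-learn 1.2
        "learning_rate": np.linspace(0.5, 10.0, 20),
        "n_estimators": range(1, 200, 20)
    }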
Example No. 9
    def learning_curve(self, X1, Y1, dataset_name):

        title = "Learning Curve for {} Dataset (Decision Tree)".format(
            dataset_name)
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        X_train, X_test, y_train, y_test = train_test_split(X1,
                                                            Y1,
                                                            test_size=0.3)
        max_depth = self.GridSearchCV(X_train, y_train, dataset_name)
        estimator = DecisionTreeClassifier(max_depth=max_depth,
                                           random_state=42)
        plot_learning_curve(estimator, title, X1, Y1, ylim=None, cv=cv)

        plt.show()
        return estimator
Example No. 10
def kNN(X_train, X_test, y_train, y_test, data_name):
    # Train Model and Predict
    Ks = 25
    mean_acc = np.zeros((Ks-1))
    std_acc = np.zeros((Ks-1))
    performance = {}
    performance['mean_fit_time'] = np.zeros((Ks-1))
    performance['mean_score_time'] = np.zeros((Ks-1))
    performance['mean_test_score'] = np.zeros((Ks-1))
    for n in range(1, Ks):
        # Train Model and Predict
        train_start = time.time()
        neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
        train_end = time.time()
        yhat = neigh.predict(X_test)
        test_end = time.time()
        mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)
        std_acc[n-1] = np.std(yhat == y_test)/np.sqrt(yhat.shape[0])
        performance['mean_fit_time'][n-1] = train_end - train_start
        performance['mean_score_time'][n-1] = test_end - train_end
        performance['mean_test_score'][n-1] = mean_acc[n-1]

    plt.title('Parameter Plot - Values for K - {}'.format(data_name))
    plt.plot(range(1, Ks), mean_acc, 'g')
    plt.fill_between(range(1, Ks), mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
    plt.legend(('Accuracy', '+/- 1 std'))
    plt.ylabel('Accuracy')
    plt.xlabel('Number of Neighbors (K)')
    plt.tight_layout()
    plt.savefig('Figs/KNN-param-plot-{}'.format(data_name))
    plt.clf()
    save_cv(performance, 'KNN', data_name)


    print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1)
    print( "The best with K<10 was", mean_acc[0:9].max(), "with k=", mean_acc[0:9].argmax()+1)
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))

    title = 'Learning Curves (kNN Classifier) - {}'.format(data_name)

    estimator = KNeighborsClassifier(n_neighbors=mean_acc.argmax()+1)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator, title, X, y, ylim=(0.4, 1.01), cv=cv, n_jobs=4)
    plt.savefig('Figs/KNN-learningcurve-{}'.format(data_name))
    plt.clf()
Example No. 11
def run_knn(X, y, X_train, X_test, y_train, y_test, title, k):
  knn_learning = KNeighborsClassifier(n_neighbors=k)

  util.plot_learning_curve(knn_learning, title + " KNN LC", X, y, cv=10, n_jobs=-1)


  #search for an optimal value of K for KNN
  #credit https://www.youtube.com/watch?v=6dbrR-WymjI
  param_range = range(1,31)
  k_scores = []

  # for k in k_range:
  #   knn = KNeighborsClassifier(n_neighbors=k)
  #   scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
  #   # print(k, scores.mean())
  #   k_scores.append(scores.mean())
  # print(k_scores)
  knn = KNeighborsClassifier()
  param_name = "n_neighbors"
  train_scores, test_scores = validation_curve(knn, X, y, param_name=param_name,
                                               param_range=param_range, cv=5,
                                               scoring="accuracy", n_jobs=-1)


  # Calculate mean and standard deviation for training set scores
  train_mean = np.mean(train_scores, axis=1)
  train_std = np.std(train_scores, axis=1)

  # Calculate mean and standard deviation for test set scores
  test_mean = np.mean(test_scores, axis=1)
  test_std = np.std(test_scores, axis=1)

  plt.figure()
  # Plot mean accuracy scores for training and test sets
  plt.plot(param_range, train_mean, 'o-', label="Training score", color="g")
  plt.plot(param_range, test_mean, 'o-', label="Cross-validation score", color="r")

  # Plot accuracy bands for training and test sets
  # plt.fill_between(param_range, train_mean - train_std, train_mean + train_std,alpha=.1, color="r")
  # plt.fill_between(param_range, test_mean - test_std, test_mean + test_std,alpha=.1, color="g")

  # Create plot
  plt.title("KNN Validation Curve "+title)
  plt.xlabel("Number of K Neighbors")
  plt.ylabel("Accuracy Score")
  plt.tight_layout()
  plt.legend(loc="best")
  plt.savefig(title+'KNNvalidation.png')
  plt.figure()
Example No. 12
def SVM(X_train, X_test, y_train, y_test, data_name):
    # Train Model and Predict
    # param_grid = {"kernel" : ["sigmoid", "poly", "rbf"],
    #             "C" : [0.1, 0.5, 1.0, 1.5]
    #             }

    param_distributions = {
        "kernel": ["sigmoid", "poly", "rbf"],
        "gamma": np.linspace(0.001, 1.0, 1000)
    }

    # clf = svm.SVC(gamma='scale')
    clf = svm.SVC()

    # run grid search on dataset
    # grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
    grid_search = RandomizedSearchCV(clf,
                                     param_distributions=param_distributions,
                                     cv=2,
                                     n_iter=20,
                                     verbose=1,
                                     n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    best_params = grid_search.best_params_
    print(best_params)
    save_cv(grid_search.cv_results_, 'SVM', data_name)
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))

    title = 'Learning Curves (SVM Classifier) - {}'.format(data_name)

    # estimator = svm.SVC(gamma='scale', **best_params)
    estimator = svm.SVC(**best_params)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=(0.4, 1.01),
                        cv=2,
                        n_jobs=-1)
    plt.savefig('Figs/SVM-learningcurve-{}'.format(data_name))
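save_cv is another project-local helper used throughout these examples to persist search results. A minimal sketch, assuming it simply dumps the cv_results_ dictionary to a CSV (the 'Stats/' directory and file-name pattern here are illustrative assumptions), might be:

import pandas as pd

def save_cv(cv_results, algo_name, data_name):
    # cv_results_ is a dict of equal-length arrays, so it loads straight
    # into a DataFrame that can be compared across runs later.
    df = pd.DataFrame(cv_results)
    df.to_csv('Stats/{}-cv-results-{}.csv'.format(algo_name, data_name), index=False)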
Example No. 13
def ANN(X_train,
        X_test,
        y_train,
        y_test,
        data_name,
        lc_y_min=0.4,
        lc_y_max=1.01):
    # Train Model and Predict
    unique_vals = len(np.unique(y_test))

    clf = MLPClassifier(solver='sgd')

    param_grid = {
        "hidden_layer_sizes": [(10, )],
        "alpha": np.linspace(0.0001, 0.5, 50),
        "momentum": np.linspace(0.1, 1.0, 10)
    }
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=5,
                               verbose=1,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    best_params = grid_search.best_params_
    best_params = {**best_params}
    save_cv(grid_search.cv_results_, 'ANN', data_name)

    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))

    title = 'Learning Curves (ANN Classifier) - {}'.format(data_name)

    estimator = MLPClassifier(solver='sgd', **best_params)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=(lc_y_min, lc_y_max),
                        cv=cv,
                        n_jobs=4)
    plt.savefig('Figs/ANN-learningcurve-{}'.format(data_name))
Example No. 14
# %%
pred_Y = base_model.predict(test_X)
util.print_accuracy_measures(test_Y, pred_Y, label="svm_big_base_clement")

# %%
util.visualize_confusion_matrix(base_model, test_X, test_Y,
                                "svm_big_base_clement_confusion_matrix")

# %%
base_cv_results = cross_validate(base_model, train_X, train_Y, cv=KFold(5))
util.plot_cv_score(base_cv_results, title="svm_big_base_clement_cv_score_bar")

# %%
util.plot_learning_curve(base_model,
                         "svm_big_base_clement_learning_curve",
                         train_X,
                         train_Y,
                         cv=KFold(5),
                         n_jobs=4)

# %%
util.plot_word_cloud(base_model, "svm_big_base_clement_word_cloud")

# %% [markdown]
# <h2> Adding TFIDF </h2>

# %%
tfidf_model = svm.TFIDFSVMModel(ngram=(1, 2))
tfidf_model.fit(train_X, train_Y)

# %%
pred_Y = tfidf_model.predict(test_X)
Example No. 15
def train_svm(filename,
              X_train,
              X_test,
              y_train,
              y_test,
              solver='rbf',
              full_param=False,
              debug=False,
              numFolds=10,
              njobs=-1,
              scalar=1,
              make_graphs=False,
              pSVM={}):
    np.random.seed(1)
    algo = 'SVM'

    start = time.time()
    if len(pSVM) == 0:
        if full_param:
            param_grid = [{
                'kernel': [solver],
                # 0.0001 - Finished for Linear
                # 'max_iter': [-1, 10000, 100000],
                # 'shrinking'   : [True, False], # Seems to just make things faster/slower on larger iterations, I think cutting down 2x is better
                # 'probability' : [True, False],
                'random_state': [1]
            }]
            if solver == 'rbf':
                param_grid[0]['C'] = [
                    0.001
                ]  #, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000, 100000]
                param_grid[0]['gamma'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000,
                    100000
                ]
            elif solver == 'sigmoid':
                param_grid[0]['gamma'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000
                ]
                param_grid[0]['coef0'] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
                param_grid[0]['C'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000,
                    100000
                ]

            elif solver == 'poly':
                param_grid[0]['gamma'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000,
                    100000
                ]
                param_grid[0]['degree'] = [1, 2, 3, 4, 5, 6, 7, 8]
                param_grid[0]['coef0'] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
                param_grid[0]['C'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000,
                    100000
                ]
            elif solver == 'linear':
                param_grid[0]['C'] = [1.0]

        else:
            param_grid = [{
                'kernel': [solver],
                'C': [0.01, 0.1, 1., 10., 100],
                'cache_size': [2000],
                'random_state': [1]
            }]
            if solver == 'poly' or solver == 'linear':
                param_grid = [{
                    'kernel': [solver],
                    'C': [0.001, 0.01, 0.1, 1., 10.],
                    'cache_size': [2000],
                    'random_state': [1]
                }]
        svm_classifier = svm.SVC(probability=True)
        grid_search = GridSearchCV(svm_classifier,
                                   param_grid,
                                   cv=numFolds,
                                   scoring='roc_auc_ovr_weighted',
                                   return_train_score=True,
                                   n_jobs=njobs,
                                   verbose=debug)
        grid_search.fit(X_train, y_train)

        cvres = grid_search.cv_results_
        best_params = grid_search.best_params_

        util.save_gridsearch_to_csv(cvres, algo, filename[:-4], scalar, solver)

        svm_classifier = svm.SVC()
        svm_classifier.set_params(**best_params)
    else:
        svm_classifier = svm.SVC()
        svm_classifier.set_params(**pSVM)

    start = time.time()
    svm_classifier.fit(X_train, y_train)
    print('SVM Fit Time: ', time.time() - start)
    start = time.time()

    y_prob = svm_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train,
                                y_prob,
                                multi_class="ovr",
                                average="weighted")
    print('SVM Train Score Time: ', time.time() - start)

    start = time.time()

    y_prob = svm_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test,
                               y_prob,
                               multi_class="ovr",
                               average="weighted")
    print('SVM Test Score Time: ', time.time() - start)
    test_class = svm.SVC()
    test_class.set_params(**pSVM)

    if make_graphs:
        util.plot_learning_curve(svm_classifier,
                                 algo,
                                 filename[:-4],
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)
        util.compute_vc(algo,
                        'kernel', ['rbf', 'sigmoid', 'poly', 'linear'],
                        X_train,
                        y_train,
                        X_test,
                        y_test,
                        svm_classifier,
                        filename[:-4],
                        test_class,
                        pSVM,
                        log=False,
                        njobs=njobs,
                        debug=debug,
                        smalllegend=True)
        util.svm_rbf_C_Gamma_viz(X_train, y_train, pSVM, njobs, filename[:-4],
                                 train_score)

        # computer Model Complexity/Validation curves
        util.compute_vc(algo,
                        'kernel', ['rbf', 'sigmoid', 'poly', 'linear'],
                        X_train,
                        y_train,
                        X_test,
                        y_test,
                        svm_classifier,
                        filename[:-4],
                        test_class,
                        pSVM,
                        log=False,
                        njobs=njobs)

        util.compute_vc(algo,
                        'C',
                        [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000],
                        X_train,
                        y_train,
                        X_test,
                        y_test,
                        svm_classifier,
                        filename[:-4],
                        test_class,
                        pSVM,
                        log=True,
                        njobs=njobs,
                        debug=debug)
        if solver == 'rbf':
            util.compute_vc(algo,
                            'gamma',
                            [0.0001, 0.001, 0.01, 0.1, 1.0, 5.0, 10.0],
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            svm_classifier,
                            filename[:-4],
                            test_class,
                            pSVM,
                            log=True,
                            njobs=njobs,
                            debug=debug)
        elif solver == 'sigmoid':
            util.compute_vc(
                algo,
                'gamma', [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                X_train,
                y_train,
                X_test,
                y_test,
                svm_classifier,
                filename[:-4],
                test_class,
                pSVM,
                log=True,
                njobs=njobs,
                debug=debug)
            util.compute_vc(algo,
                            'coef0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            svm_classifier,
                            filename[:-4],
                            test_class,
                            pSVM,
                            log=False,
                            njobs=njobs,
                            debug=debug)
        elif solver == 'poly':
            util.compute_vc(
                algo,
                'gamma', [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                X_train,
                y_train,
                X_test,
                y_test,
                svm_classifier,
                filename[:-4],
                test_class,
                pSVM,
                log=True,
                njobs=njobs,
                debug=debug)
            util.compute_vc(algo,
                            'coef0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            svm_classifier,
                            filename[:-4],
                            test_class,
                            pSVM,
                            log=False,
                            njobs=njobs,
                            debug=debug)
            util.compute_vc(algo,
                            'degree', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            svm_classifier,
                            filename[:-4],
                            test_class,
                            pSVM,
                            log=False,
                            njobs=njobs,
                            debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
Example No. 16
                             label="naive_bayes_uni_base_clement")

# %%
util.visualize_confusion_matrix(
    base_model, test_X, test_Y,
    "naive_bayes_uni_base_clement_confusion_matrix")

# %%
base_cv_results = cross_validate(base_model, train_X, train_Y, cv=KFold(5))
util.plot_cv_score(base_cv_results,
                   title="naive_bayes_uni_base_clement_cv_score_bar")

# %%
util.plot_learning_curve(base_model,
                         "naive_bayes_uni_base_clement_learning_curve",
                         train_X,
                         train_Y,
                         cv=KFold(5),
                         n_jobs=4)

# %%
util.plot_word_cloud(base_model, "naive_bayes_uni_base_clement_word_cloud")

# %% [markdown]
# <h2> Adding TFIDF </h2>

# %%
tfidf_model = nb.TFIDFNaiveBayesModel()
tfidf_model.fit(train_X, train_Y)

# %%
pred_Y = tfidf_model.predict(test_X)
Example No. 17
    train_accuracies = []
    test_losses = []
    test_accuracies = []

    # if log_interval < 0, stop log when training
    train_loader, test_loader = load_data(batch_size=batch_size)
    if batch_size > (len(test_loader.dataset) * test_data_part):
        print('Error! batch_size: {} > single_test_batch_size: {}'
              .format(batch_size, len(test_loader.dataset) * test_data_part))
        print('Exit!!!')
        os._exit(0)
    elif batch_size > (len(train_loader.dataset) * train_data_part):
        print('Error! batch_size: {} > single_train_batch_size: {}'
              .format(batch_size, len(train_loader.dataset) * train_data_part))
        print('Exit!!!')
        os._exit(0)

    if model_name == 'CNN_add_2_28_28':
        model = CNN_add_2_28_28()
    elif model_name == 'CNN_add_56_28':
        model = CNN_add_56_28()

    # weight seems useless
    criterion = nn.CrossEntropyLoss(weight=weight)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    init()

    plot_learning_curve(train_losses, train_accuracies, test_losses, test_accuracies)

    over_time = time.perf_counter()
    print('Time Cost: {:.1f}'.format(over_time - start_time))
Example No. 18
                                   # here (1, 84, 84) == (2, 84, 84) == (3, 84, 84) == (4, 84, 84)

        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)  # step() method is overloaded in class RepeatActionAndMaxFrame(gym.Wrapper)
                                                                 # here observation_, reward, done, info is reached after repeating the action 4 times
                                                                 # and (2, 84, 84) == (3, 84, 84) == (4, 84, 84) != (1, 84, 84) for the first loop
                                                                 # i.e queue follows a FIFO method, observation_ is stored in (1, 84, 84)
            score += reward
            if not load_checkpoint:
                agent.store_transition(observation, action, reward, observation_, int(done))  # acts like experience replay
                agent.learn()
            else:
                env.render()
            observation = observation_
            n_steps += 1
        scores.append(score)
        steps_array.append(n_steps)
        avg_score = np.mean(scores[-100:])
        print('episode ',i, ' score: ', score,
                'average score %.1f best score %.1f epsilon %.2f' % 
                (avg_score, best_score, agent.epsilon), 
                'steps ', n_steps)
        if avg_score > best_score:
            if not load_checkpoint:
                agent.save_models()
            best_score = avg_score
        eps_history.append(agent.epsilon)

    plot_learning_curve(steps_array, scores, eps_history, figure_file)
Example No. 19
    env = gym.make("CartPole-v1")
    n_games = 10000
    scores = []
    eps_history = []

    agent = Agent(lr=0.0001,
                  input_dims=env.observation_space.shape,
                  n_actions=env.action_space.n)
    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            next_observation, reward, done, info = env.step(action)
            score += reward
            agent.learn(observation, action, reward, next_observation)
            observation = next_observation
        scores.append(score)
        eps_history.append(agent.epsilon)

        if i % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print(
                'episode ', i, 'score %.1f avg score %.1f epsilon %.2f' %
                (score, avg_score, agent.epsilon))

    filename = 'cartpole_native_dqn.png'
    x = [i + 1 for i in range(n_games)]
    plot_learning_curve(x, scores, eps_history, filename)
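The reinforcement-learning snippets above (and Example No. 30 below) pass (x, scores, eps_history, filename) rather than an estimator. A minimal sketch of that plotting variant, assuming it overlays the epsilon schedule and a 100-episode running average of the score on twin axes, could be:

import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(x, scores, epsilons, filename):
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    # Epsilon (exploration rate) on the left axis.
    ax1.plot(x, epsilons, color='C0')
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Epsilon', color='C0')

    # 100-episode running average of the score on the right axis.
    running_avg = [np.mean(scores[max(0, t - 100):t + 1]) for t in range(len(scores))]
    ax2.plot(x, running_avg, color='C1')
    ax2.set_ylabel('Score (100-episode running average)', color='C1')

    plt.savefig(filename)
    plt.close(fig)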
Example No. 20
digits_data = scale(data)
n_samples, digits_n_features = data.shape
n_digits = len(np.unique(d_labels))
run_cluster(digits_data, d_labels, n_samples, digits_n_features, n_digits,
            "Digits", True)

b_pca = PCA(n_components=7)
b_pca2_results = b_pca.fit_transform(breast_data)
run_cluster(b_pca2_results, b_labels, n_samples, breast_n_features, n_digits,
            "Breast Cancer PCA 7")
clf = MLPClassifier(solver='adam', max_iter=1000, hidden_layer_sizes=(100, 5))
timings['bc']['PCA'] = 0
start = clock()
util.plot_learning_curve(clf,
                         "B Cancer PCA ANN LC",
                         b_pca2_results,
                         b_labels,
                         cv=5,
                         n_jobs=-1)
timings['bc']['PCA'] += clock() - start

b_ica = FastICA(n_components=7)
temp = b_ica.fit_transform(breast_data)
run_cluster(temp, b_labels, n_samples, breast_n_features, n_digits,
            "Breast Cancer ICA 7")
clf = MLPClassifier(solver='adam', max_iter=1000, hidden_layer_sizes=(100, 5))
timings['bc']['ICA'] = 0
start = clock()
util.plot_learning_curve(clf,
                         "B Cancer ICA ANN LC",
                         temp,
                         b_labels,
Example No. 21
    "D:\\project\\peixun\\ai_course_project_px\\1_intro\\4_anli_project_titanic\\Kaggle_Titanic_Chinese\\Kaggle_Titanic-master\\train.csv"
)

# (2) Feature engineering - fill in missing values
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)

# (3) Feature engineering - discretize/one-hot encode the categorical features
df = one_hot_encoding(data_train)
# select specific columns
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
#print(train_df.describe())
train_np = train_df.values  # .as_matrix() was removed in newer pandas
# y is the Survived label
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]

# (5) Build and train the model
clf = RandomForestClassifier(criterion='gini',
                             max_depth=5,
                             n_estimators=5,
                             verbose=2)
#clf.fit(X, y)
#print(clf.predict(y))
# (6) Plot the learning curve
plot_learning_curve(clf, u"学习曲线", X, y)
Example No. 22
    ("scaler", StandardScaler()),
    ("lin_reg", LinearRegression()),
])

# now the augmented dataset with the polynomial expansion (**2) can be fitted
lin_reg = polynomial_regression.fit(X, y)
y_hat = lin_reg.predict(X)

# Interpolate values
n_data_points = 500
x_new = np.linspace(X.min(), X.max(), n_data_points)
f = interp1d(X.ravel(), y_hat, kind="quadratic", axis=0)
y_smooth = f(x_new)

# Plot values versus predicted values
plt.plot(x_new, y_smooth, linestyle="-", color="#AA00AA")
plt.scatter(X, y, c="#00AAAA")
plt.scatter(X, y_hat, c="#FFFF55")
plt.axis([-3, 3, 0, 10])
plt.show()

plot_learning_curve(
    polynomial_regression,
    X,
    y,
    train_sizes=np.linspace(0.1, 1, 50),
    cv=5,
    n_jobs=multiprocessing.cpu_count() - 2,
    scoring="neg_mean_squared_error",
)
                _, train_loss = session.run([train_op, total_loss], {input_tensor: batch_xs})

                # Append loss to the list
                loss_list_train.append(train_loss)

            # Save the model After Completion
            path_prefix = saver.save(session, os.path.join(save_directory, "homework_2"))

            return loss_list_train
    ###########################################################

if not (model_name == 'autoencoder'):
    acc_val_list,ce_val_list,acc_test,ce_test, acc_train_list, ce_train_list, conf_matrix_test, classification_report_test,actual_labels_test,pred_labels_test,best_epoch = main_fun(batch_size, epochs, kernel_size, use_early_stopping, patience_no)

    # Create and plot the learning curve using training and validation sets
    plot_learning_curve(acc_train_list, acc_val_list, ce_train_list, ce_val_list)

    # Save the correct test class labels as pickle file for later analysis
    pickle_out = open("actual_labels_test.pickle","wb")
    pickle.dump(actual_labels_test, pickle_out)
    pickle_out.close()

    # Save predicted test class labels as pickle file for later analysis
    pickle_out = open("pred_labels_test.pickle","wb")
    pickle.dump(pred_labels_test, pickle_out)
    pickle_out.close()

elif model_name=='autoencoder':
    loss_list_train = main_fun(batch_size, epochs, kernel_size, use_early_stopping, patience_no)
    # Calculate average loss
    avg_train_loss = sum(loss_list_train) / len(loss_list_train) 
Example No. 24
def DT(X_train,
       X_test,
       y_train,
       y_test,
       data_name,
       lc_y_min=0.4,
       lc_y_max=1.01):
    # Train Model and Predict
    param_grid = {
        "criterion": ["gini", "entropy"],
        "max_depth": range(1, 50),
    }

    clf = DecisionTreeClassifier()

    # run grid search
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=5,
                               verbose=1,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    best_params = grid_search.best_params_
    print(best_params)
    save_cv(grid_search.cv_results_, 'DT', data_name)

    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))

    title = 'Learning Curves (DT Classifier) - {}'.format(data_name)

    estimator = DecisionTreeClassifier(**best_params)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=(lc_y_min, lc_y_max),
                        cv=cv,
                        n_jobs=4)
    plt.savefig('Figs/DT-learningcurve-{}'.format(data_name))
    plt.clf()

    # Plot param tuning: sweep decision-tree depth from 1 to 25
    max_depth_sweep = 26
    test_mean_acc1 = np.zeros((max_depth_sweep - 1))
    test_std_acc1 = np.zeros((max_depth_sweep - 1))
    train_mean_acc1 = np.zeros((max_depth_sweep - 1))
    train_std_acc1 = np.zeros((max_depth_sweep - 1))
    for n in range(1, max_depth_sweep):
        # Train Model and Predict
        print('Max depth: ', n)
        tree = DecisionTreeClassifier(criterion="gini", max_depth=n)
        tree.fit(X_train, y_train)
        y_hat = tree.predict(X_test)
        y_hat_train = tree.predict(X_train)
        test_mean_acc1[n - 1] = metrics.accuracy_score(y_test, y_hat)
        test_std_acc1[n -
                      1] = np.std(y_hat == y_test) / np.sqrt(y_hat.shape[0])
        train_mean_acc1[n - 1] = metrics.accuracy_score(y_train, y_hat_train)
        train_std_acc1[n - 1] = np.std(y_hat_train == y_train) / np.sqrt(
            y_hat_train.shape[0])

    plt.plot(range(1, max_depth_sweep), test_mean_acc1, 'r')
    plt.fill_between(range(1, max_depth_sweep),
                     test_mean_acc1 - 1 * test_std_acc1,
                     test_mean_acc1 + 1 * test_std_acc1,
                     alpha=0.10)
    plt.plot(range(1, max_depth_sweep), train_mean_acc1, 'm')
    plt.fill_between(range(1, max_depth_sweep),
                     train_mean_acc1 - 1 * train_std_acc1,
                     train_mean_acc1 + 1 * train_std_acc1,
                     alpha=0.10)
    plt.legend(('Test Accuracy - {}'.format(data_name),
                'Training Accuracy - {}'.format(data_name)))
    plt.ylabel('Accuracy')
    plt.xlabel('Decision Tree Depth')
    plt.tight_layout()
    plt.savefig('Figs/DT-depth-{}'.format(data_name))
    plt.clf()
Example No. 25
# %% [markdown]
# <h2> Adding TFIDF </h2>

# %%
tfidf_model = lr.TFIDFLogRegModel()
tfidf_model.fit(train_X, train_Y)

# %%
pred_Y = tfidf_model.predict(test_X)
util.print_accuracy_measures(test_Y, pred_Y, label="log_reg_uni_tfidf_clement")

# %%
util.visualize_confusion_matrix(tfidf_model, test_X, test_Y,
                                "log_reg_uni_tfidf_clement_confusion_matrix")

# %%
tfidf_cv_results = cross_validate(tfidf_model, train_X, train_Y, cv=KFold(5))
util.plot_cv_score(tfidf_cv_results,
                   title="log_reg_uni_tfidf_clement_cv_score_bar")

# %%
util.plot_learning_curve(tfidf_model,
                         "log_reg_uni_tfidf_clement_learning_curve",
                         train_X,
                         train_Y,
                         cv=KFold(5),
                         n_jobs=4)

# %%
util.plot_word_cloud(tfidf_model, "log_reg_uni_tfidf_clement_word_cloud")
Example No. 26
# %%
pred_Y = base_model.predict(test_X)
util.print_accuracy_measures(test_Y, pred_Y, label="log_reg_big_base_comp")

# %%
util.visualize_confusion_matrix(base_model, test_X, test_Y,
                                "log_reg_big_base_comp_confusion_matrix")

# %%
base_cv_results = cross_validate(base_model, train_X, train_Y, cv=KFold(5))
util.plot_cv_score(base_cv_results, title="log_reg_big_base_comp_cv_score_bar")

# %%
util.plot_learning_curve(base_model,
                         "log_reg_big_base_comp_learning_curve",
                         train_X,
                         train_Y,
                         cv=KFold(5),
                         n_jobs=4)

# %%
util.plot_word_cloud(base_model, "log_reg_big_base_comp_word_cloud")

# %% [markdown]
# <h2> Adding TFIDF </h2>

# %%
tfidf_model = lr.TFIDFLogRegModel(ngram=(1, 2))
tfidf_model.fit(train_X, train_Y)

# %%
pred_Y = tfidf_model.predict(test_X)
Example No. 27
def run_cluster(data,
                labels,
                n_samples,
                n_features,
                n_digits,
                title,
                run_extra=False):

    train_score = defaultdict(list)
    train_score['k-means'] = []
    train_score['gmm'] = []

    for i in iterations:
        n_digits = i

        kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=1)
        kmeans.fit_transform(data)
        cluster_feature = kmeans.labels_
        train_score['k-means'].append(
            metrics.v_measure_score(labels, kmeans.labels_))

        gaus = GaussianMixture(n_components=n_digits)
        gaus.fit(data)
        cluster_feature2 = gaus.predict(data)
        train_score['gmm'].append(gaus.bic(data))
        # print('pre', data.shape)
        # print('pre1', cluster_feature.shape)
        if run_extra:
            clf = MLPClassifier(solver='adam',
                                max_iter=1000,
                                hidden_layer_sizes=(100, 5))
            timings[title]['default'] = 0
            start = clock()
            util.plot_learning_curve(clf,
                                     title + " Basic ANN LC",
                                     data,
                                     labels,
                                     cv=5,
                                     n_jobs=-1)
            timings[title]['default'] += clock() - start

        data1 = np.column_stack((data, cluster_feature))
        # print('post1', data.shape)
        # print('EM', cluster_feature2.shape)
        data2 = np.column_stack((data, cluster_feature2))

        if run_extra:
            clf = MLPClassifier(solver='adam',
                                max_iter=1000,
                                hidden_layer_sizes=(100, 5))
            timings[title]['KmeansXtra'] = 0
            start = clock()
            util.plot_learning_curve(clf,
                                     title + " KM Cluster Feature ANN LC",
                                     data1,
                                     labels,
                                     cv=5,
                                     n_jobs=-1)
            timings[title]['KmeansXtra'] += clock() - start

            clf = MLPClassifier(solver='adam',
                                max_iter=1000,
                                hidden_layer_sizes=(100, 5))
            timings[title]['EMxtra'] = 0
            start = clock()
            util.plot_learning_curve(clf,
                                     title + " EM Cluster Feature ANN LC",
                                     data2,
                                     labels,
                                     cv=5,
                                     n_jobs=-1)
            timings[title]['EMxtra'] += clock() - start

    plt.figure()
    ticks = range(len(iterations))

    plt.plot(ticks,
             train_score['k-means'],
             'o-',
             label="Train Score",
             color="g")
    plt.xticks(ticks, iterations)
    plt.title(title + "K Means V-Measure Score")
    plt.xlabel("N Clusters")
    plt.ylabel("V Measure")
    plt.tight_layout()
    plt.legend(loc="best")
    plt.savefig(title + 'KmeansVmeasure.png')

    plt.figure()
    ticks = range(len(iterations))
    plt.plot(ticks, train_score['gmm'], 'o-', label="Train Score", color="g")
    plt.xticks(ticks, iterations)
    plt.title(title + "EM BIC Score")
    plt.xlabel("N Clusters")
    plt.ylabel("BIC")
    plt.tight_layout()
    plt.legend(loc="best")
    plt.savefig(title + 'EMbic.png')
Example No. 28
# %%
pred_Y = base_model.predict(test_X)
util.print_accuracy_measures(test_Y, pred_Y, label="svm_uni_base_comp")

# %%
util.visualize_confusion_matrix(base_model, test_X, test_Y,
                                "svm_uni_base_comp_confusion_matrix")

# %%
base_cv_results = cross_validate(base_model, train_X, train_Y, cv=KFold(5))
util.plot_cv_score(base_cv_results, title="svm_uni_base_comp_cv_score_bar")

# %%
util.plot_learning_curve(base_model,
                         "svm_uni_base_comp_learning_curve",
                         train_X,
                         train_Y,
                         cv=KFold(5),
                         n_jobs=4)

# %%
util.plot_word_cloud(base_model, "svm_uni_base_comp_word_cloud")

# %% [markdown]
# <h2> Adding TFIDF </h2>

# %%
tfidf_model = svm.TFIDFSVMModel()
tfidf_model.fit(train_X, train_Y)

# %%
pred_Y = tfidf_model.predict(test_X)
Example No. 29
# X holds the feature values
X = train_np[:, 1:]

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
print(X.shape)
X = poly.fit_transform(X)
print(X.shape)

# (5) Build and train the model
clf = RandomForestClassifier(criterion='gini',
                             max_depth=1,
                             n_estimators=1,
                             verbose=0)
# (6) Plot the learning curve
plot_learning_curve(clf, u"学习曲线", X, y, cv=10)

# (5) Build and train the model - training-set accuracy improves markedly
clf = RandomForestClassifier(criterion='gini',
                             max_depth=30,
                             n_estimators=1,
                             verbose=0)
# (6) Plot the learning curve

plot_learning_curve(clf, u"学习曲线-", X, y, cv=10)

# (5) Build and train the model - this can moderately reduce overfitting
clf = RandomForestClassifier(criterion='gini',
                             max_depth=6,
                             n_estimators=30,
                             verbose=0)
Example No. 30
    scores = []
    winpct = []
    eps_history = []
    n_episodes = 10000

    agent = Agent(env.observation_space.shape, env.action_space.n)

    n_plays = 5000

    for i in tqdm(range(n_plays)):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            next_observation, reward, done, info = env.step(action)
            score += reward
            agent.learn(observation, action, reward, next_observation)
            observation = next_observation
        scores.append(score)
        eps_history.append(agent.eps)

        if i % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print(
                'episode', i, 'score %.1f avg score %.1f epsilon %.2f' %
                (score, avg_score, agent.eps))

    filename = 'cartpole_naive_dqn.png'
    plot_learning_curve(range(n_plays), scores, eps_history, filename)