Beispiel #1
0
def gridSearch():
    """Grid-search a scaled SVM on the breast-cancer and spam datasets.

    A ``StandardScaler`` + ``SVC`` pipeline is tuned with 5-fold
    cross-validation over three kernel families: linear, polynomial
    (degree 2..10) and RBF (gamma 2**-10 .. 2**0).  The same search object
    is fitted first on the breast-cancer split and then on the spam split;
    after each fit the train score, test score and winning parameters are
    printed.
    """
    search_space = [
        {
            'svm__kernel': ['linear'],
        },
        {
            'svm__kernel': ['poly'],
            'svm__degree': list(range(2, 11)),
        },
        {
            'svm__kernel': ['rbf'],
            'svm__gamma': [2**exponent for exponent in range(-10, 1)],
        },
    ]
    scaled_svm = Pipeline([('scaler', StandardScaler()), ('svm', SVC())])
    search = GridSearchCV(scaled_svm, param_grid=search_space, cv=5)

    # Breast-cancer dataset.
    x_train, y_train, x_test, y_test = split_train_test_breast_cancer()
    search.fit(x_train, y_train)
    cancer_train_acc = search.score(x_train, y_train)
    cancer_test_acc = search.score(x_test, y_test)
    print('best train accuracy for cancer SVM is:', cancer_train_acc)
    print('best test accuracy for cancer SVM is:', cancer_test_acc)
    print('best parameters for cancer SVM is:', search.best_params_)

    # Spam dataset: the same search object is refitted on the new data.
    x_train, y_train, x_test, y_test = split_train_test_spam()
    search.fit(x_train, y_train)
    spam_train_acc = search.score(x_train, y_train)
    spam_test_acc = search.score(x_test, y_test)
    print('best train accuracy for spam SVM is:', spam_train_acc)
    print('best test accuracy for spam SVM is:', spam_test_acc)
    print('best parameters for spam SVM is:', search.best_params_)
def gridSearch():
    """Tune ``n_neighbors`` of a scaled k-NN classifier on both datasets.

    A ``StandardScaler`` + ``KNeighborsClassifier`` pipeline is searched with
    5-fold cross-validation over the odd k values 1..19, first on the
    breast-cancer split and then on the spam split.  For each dataset the
    train score, test score and selected hyperparameters are printed.
    """
    x_train, y_train, x_test, y_test = split_train_test_breast_cancer()

    knn_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier()),
    ])
    candidate_ks = list(range(1, 20, 2))
    search = GridSearchCV(knn_pipeline,
                          [{'knn__n_neighbors': candidate_ks}],
                          cv=5)

    # Breast-cancer dataset.
    search.fit(x_train, y_train)
    cancer_train_acc = search.score(x_train, y_train)
    cancer_test_acc = search.score(x_test, y_test)
    print('best train accuracy for cancer knn classifier is:',
          cancer_train_acc)
    print('best test accuracy for cancer knn classifier is:',
          cancer_test_acc)
    print('best hyperparameters for cancer knn classifier is:',
          search.best_params_)

    # Spam dataset: refit the same search object.
    x_train, y_train, x_test, y_test = split_train_test_spam()
    search.fit(x_train, y_train)
    spam_train_acc = search.score(x_train, y_train)
    spam_test_acc = search.score(x_test, y_test)
    print('------------------------------------------------')
    print('best train accuracy for spam knn classifier is:', spam_train_acc)
    print('best test accuracy for spam knn classifier is:', spam_test_acc)
    print('best hyperparameters for spam knn classifier is:',
          search.best_params_)
def gridsearch():
    """Grid-search decision-tree regularisation on both datasets.

    Three one-parameter grids are tried independently (``max_depth``,
    ``min_samples_leaf`` and ``ccp_alpha``) with 5-fold cross-validation
    scored by accuracy.  The search is fitted on the breast-cancer split and
    then refitted on the spam split; best parameters plus train/test accuracy
    are printed for each.
    """
    search_space = [
        {'max_depth': list(range(2, 11, 2))},
        {'min_samples_leaf': [2**power for power in range(1, 9)]},
        {'ccp_alpha': [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064,
                       0.128]},
    ]
    search = GridSearchCV(tree.DecisionTreeClassifier(),
                          search_space,
                          scoring='accuracy',
                          cv=5)

    # Breast-cancer dataset.
    x_train, y_train, x_test, y_test = split_train_test_breast_cancer()
    cancer_search = search.fit(x_train, y_train)
    print('Best parameter for breast cancer classifier:',
          cancer_search.best_params_)
    cancer_train_acc = cancer_search.score(x_train, y_train)
    cancer_test_acc = cancer_search.score(x_test, y_test)
    print('Training accuracy for best breast cancer classifier :',
          cancer_train_acc)
    print('Test accuracy for best breast cancer classifier :',
          cancer_test_acc)

    # Spam dataset.  fit() is called on the same search object again, so the
    # cancer results above had to be read out before this point.
    x_train, y_train, x_test, y_test = split_train_test_spam()
    spam_search = search.fit(x_train, y_train)
    spam_train_acc = spam_search.score(x_train, y_train)
    spam_test_acc = spam_search.score(x_test, y_test)
    print('--------------------------------------------------------------')
    print('Best parameter for spam classifier:', spam_search.best_params_)
    print('Training accuracy for best spam classifier :', spam_train_acc)
    print('Test accuracy for best spam classifier :', spam_test_acc)
def performance_curve_one_hidden_few_units():
    """Plot accuracy vs. training size for a one-hidden-layer, 4-unit model.

    For each training-set size in the module-level ``num_cancer`` and
    ``num_spam`` sequences, the first ``num`` training rows are standardised,
    ``one_hidden_model`` is run with ``hidden_units=4``, and the train/test
    accuracies it reports are collected.  The two resulting curves are drawn
    side by side (breast cancer on the left, spam on the right).

    ``one_hidden_model`` is defined elsewhere; this function only relies on
    it returning a pair whose second element is ``(train_acc, test_acc)``.
    """
    scaler = StandardScaler()

    def accuracy_curve(split, sizes):
        """Return ``(train_acc, test_acc)`` lists, one entry per size."""
        train_x, train_y, test_x, test_y = split
        train_curve, test_curve = [], []
        for num in sizes:
            # The scaler is refitted on the training prefix only, so the
            # test set never contributes to the normalisation statistics.
            prefix = train_x[:num, :]
            train_norm = scaler.fit(prefix).transform(prefix)
            test_norm = scaler.transform(test_x)
            _, (acc_train, acc_test) = one_hidden_model(train_norm,
                                                        train_y[:num],
                                                        test_norm,
                                                        test_y,
                                                        hidden_units=4)
            train_curve.append(acc_train)
            test_curve.append(acc_test)
        return train_curve, test_curve

    def draw(position, sizes, train_curve, test_curve, title):
        """Draw one train/test accuracy panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(sizes, train_curve, label='train acc')
        plt.plot(sizes, test_curve, label='test acc')
        plt.xlabel('training size')
        plt.ylabel('accuracy')
        plt.legend(loc='upper right')
        plt.title(title)

    acc_train_list_cancer, acc_test_list_cancer = accuracy_curve(
        split_train_test_breast_cancer(), num_cancer)
    acc_train_list_spam, acc_test_list_spam = accuracy_curve(
        split_train_test_spam(), num_spam)

    plt.figure(figsize=(10, 4))
    draw(
        121, num_cancer, acc_train_list_cancer, acc_test_list_cancer,
        'accuracy vs training size for cancer classifier \nwith one hidden layer 4 hidden units'
    )
    draw(
        122, num_spam, acc_train_list_spam, acc_test_list_spam,
        'accuracy vs training size for spam classifier \nwith one hidden layer 4 hidden units'
    )
    plt.show()
def batch_vs_stocastic():
    """Run the batch and the stochastic spam classifiers on the same data.

    The spam split is standardised with statistics taken from the training
    features only, then handed unchanged to ``best_spam_classifier_batch``
    and ``best_spam_classifier_stocastic`` (both defined elsewhere;
    presumably they train with batch vs. stochastic updates - confirm
    against their definitions).
    """
    x_train, y_train, x_test, y_test = split_train_test_spam()

    normalizer = StandardScaler()
    normalizer.fit(x_train)
    spam_data = (
        normalizer.transform(x_train),
        y_train,
        normalizer.transform(x_test),
        y_test,
    )

    best_spam_classifier_batch(*spam_data)
    best_spam_classifier_stocastic(*spam_data)
def post_pruning_boosting_tree_performance():
    """Plot AdaBoost accuracy vs. ensemble size with cost-complexity pruning.

    An ``AdaBoostClassifier`` over a ``DecisionTreeClassifier`` pruned via
    ``ccp_alpha`` is refitted for every ensemble size from 1 to 20, first on
    the breast-cancer split (``ccp_alpha=0.015``) and then on the spam split
    (``ccp_alpha=0.005``).  Train and test accuracy curves for both datasets
    are shown side by side.
    """
    pruning_tree = DecisionTreeClassifier(ccp_alpha=0.015)
    num_trees_list = list(range(1, 21))
    boost_classifier = AdaBoostClassifier(pruning_tree, n_estimators=1)

    def sweep(train_x, train_y, test_x, test_y):
        """Return ``(train_acc, test_acc)`` lists over ``num_trees_list``."""
        train_curve, test_curve = [], []
        for num_trees in num_trees_list:
            boost_classifier.set_params(n_estimators=num_trees)
            boost_classifier.fit(train_x, train_y)
            train_curve.append(boost_classifier.score(train_x, train_y))
            test_curve.append(boost_classifier.score(test_x, test_y))
        return train_curve, test_curve

    def draw(position, train_curve, test_curve, title):
        """Draw one train/test accuracy panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(num_trees_list, train_curve, label='train')
        plt.plot(num_trees_list, test_curve, label='test')
        plt.xlabel('num of trees')
        plt.ylabel('accuracy')
        plt.title(title)
        plt.legend(loc='upper right')

    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    acc_train_cancer_list, acc_test_cancer_list = sweep(
        train_features_cancer, train_labels_cancer, test_features_cancer,
        test_labels_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    # Change the pruning strength on the tree object itself.  The booster
    # holds a reference to this same object, so this is equivalent to the
    # nested ``base_estimator__ccp_alpha`` form, but it does not depend on
    # the constructor argument's name (scikit-learn renamed
    # ``base_estimator`` to ``estimator``).  It only needs doing once.
    pruning_tree.set_params(ccp_alpha=0.005)
    acc_train_spam_list, acc_test_spam_list = sweep(train_features_spam,
                                                    train_labels_spam,
                                                    test_features_spam,
                                                    test_labels_spam)

    plt.figure(figsize=(10, 6))
    draw(
        121, acc_train_cancer_list, acc_test_cancer_list,
        'post-pruning boosting cancer classifier \nperformance vs number of boosting trees'
    )
    draw(
        122, acc_train_spam_list, acc_test_spam_list,
        'post-pruning boosting spam classifier \nperformance vs number of boosting trees'
    )
    plt.show()
def radius_neighbors_r():
    """Plot radius-neighbours accuracy against the radius for both datasets.

    A ``StandardScaler`` + ``RadiusNeighborsClassifier`` pipeline (outliers
    labelled with the most frequent class) is refitted for each radius in
    2**0 .. 2**10, first on the breast-cancer split and then on the spam
    split.  Train and test accuracy are plotted on a logarithmic radius axis.
    """
    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    r_list = [2**exponent for exponent in range(11)]
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('rn', RadiusNeighborsClassifier(outlier_label='most_frequent'))
    ])

    def sweep(train_x, train_y, test_x, test_y):
        """Return ``(train_acc, test_acc)`` lists, one entry per radius."""
        train_curve, test_curve = [], []
        for r in r_list:
            pipe.set_params(rn__radius=r)
            pipe.fit(train_x, train_y)
            train_curve.append(pipe.score(train_x, train_y))
            test_curve.append(pipe.score(test_x, test_y))
        return train_curve, test_curve

    def draw(position, train_curve, test_curve, title):
        """Draw one labelled train/test panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(r_list, train_curve, label='train')
        plt.plot(r_list, test_curve, label='test')
        plt.xscale('log')
        plt.xlabel('radius value')
        plt.ylabel('accuracy')
        plt.title(title)
        # Without a legend the two curves cannot be told apart.
        plt.legend(loc='upper right')

    acc_train_cancer_list, acc_test_cancer_list = sweep(
        train_features_cancer, train_labels_cancer, test_features_cancer,
        test_labels_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    acc_train_spam_list, acc_test_spam_list = sweep(train_features_spam,
                                                    train_labels_spam,
                                                    test_features_spam,
                                                    test_labels_spam)

    plt.figure(figsize=(10, 6))
    # The swept quantity is the radius, not k, so the titles say so.
    draw(121, acc_train_cancer_list, acc_test_cancer_list,
         'radius neighbor cancer \nclassifier performance vs radius')
    draw(122, acc_train_spam_list, acc_test_spam_list,
         'radius neighbor spam \nclassifier performance vs radius')
    plt.show()
def accuracy_vs_num_tree():
    """Plot AdaBoost accuracy against the number of boosted depth-3 trees.

    An ``AdaBoostClassifier`` over ``DecisionTreeClassifier(max_depth=3)`` is
    refitted for every ensemble size from 1 to 100, first on the
    breast-cancer split and then on the spam split.  Train and test accuracy
    curves for both datasets are drawn side by side.
    """
    ensemble_sizes = list(range(1, 101))
    shallow_tree = DecisionTreeClassifier(max_depth=3)

    x_train, y_train, x_test, y_test = split_train_test_breast_cancer()
    booster = AdaBoostClassifier(shallow_tree, n_estimators=1)

    def sweep(train_x, train_y, test_x, test_y):
        """Return ``(train_acc, test_acc)`` lists over ``ensemble_sizes``."""
        train_scores = []
        test_scores = []
        for size in ensemble_sizes:
            booster.set_params(n_estimators=size)
            booster.fit(train_x, train_y)
            train_scores.append(booster.score(train_x, train_y))
            test_scores.append(booster.score(test_x, test_y))
        return train_scores, test_scores

    def draw(position, train_scores, test_scores, heading):
        """Draw one train/test accuracy panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(ensemble_sizes, train_scores, label='train')
        plt.plot(ensemble_sizes, test_scores, label='test')
        plt.xlabel('num of trees')
        plt.ylabel('accuracy')
        plt.title(heading)
        plt.legend(loc='upper right')

    cancer_train, cancer_test = sweep(x_train, y_train, x_test, y_test)

    x_train, y_train, x_test, y_test = split_train_test_spam()
    spam_train, spam_test = sweep(x_train, y_train, x_test, y_test)

    plt.figure(figsize=(10, 6))
    draw(121, cancer_train, cancer_test,
         'cancer accuracy vs number of boosting trees')
    draw(122, spam_train, spam_test,
         'spam accuracy vs number of boosting trees')
    plt.show()
Beispiel #9
0
def performance_vs_gamma():
    """Plot RBF-SVM accuracy against ``gamma`` for both datasets.

    A ``StandardScaler`` + ``SVC(kernel='rbf')`` pipeline is refitted for
    each gamma in 2**-10 .. 2**0, first on the breast-cancer split and then
    on the spam split.  Train and test accuracy are plotted on a logarithmic
    gamma axis, one panel per dataset.
    """
    gamma_list = [2**exponent for exponent in range(-10, 1)]
    pipe = Pipeline([('scaler', StandardScaler()), ('svm', SVC(kernel='rbf'))])

    def sweep(train_x, train_y, test_x, test_y):
        """Return ``(train_acc, test_acc)`` lists, one entry per gamma."""
        train_curve, test_curve = [], []
        for gamma in gamma_list:
            pipe.set_params(svm__gamma=gamma)
            pipe.fit(train_x, train_y)
            train_curve.append(pipe.score(train_x, train_y))
            test_curve.append(pipe.score(test_x, test_y))
        return train_curve, test_curve

    def draw(position, train_curve, test_curve, title):
        """Draw one train/test accuracy panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(gamma_list, train_curve, label='train')
        plt.plot(gamma_list, test_curve, label='test')
        plt.xscale('log')
        plt.xlabel('gamma')
        plt.ylabel('accuracy')
        plt.title(title)
        # The curve labels are only displayed once a legend is requested.
        plt.legend(loc='upper right')

    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    acc_train_cancer_list, acc_test_cancer_list = sweep(
        train_features_cancer, train_labels_cancer, test_features_cancer,
        test_labels_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    acc_train_spam_list, acc_test_spam_list = sweep(train_features_spam,
                                                    train_labels_spam,
                                                    test_features_spam,
                                                    test_labels_spam)

    plt.figure(figsize=(10, 6))
    draw(121, acc_train_cancer_list, acc_test_cancer_list,
         'rbf kernel cancer classifier \nperformance vs gamma')
    draw(122, acc_train_spam_list, acc_test_spam_list,
         'rbf kernel spam classifier \nperformance vs gamma')
    plt.show()
def learning_curve():
    """Plot 3-nearest-neighbour accuracy against training-set size.

    A ``StandardScaler`` + ``KNeighborsClassifier`` pipeline with
    ``n_neighbors=3`` is fitted on growing prefixes of the training data
    (50..500 rows for breast cancer, 400..4000 rows for spam).  Accuracy on
    the training prefix and on the full test set is plotted per dataset.
    """
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('knn', KNeighborsClassifier())])
    # k is fixed for the whole experiment, so set it once rather than on
    # every loop iteration.
    pipe.set_params(knn__n_neighbors=3)
    num_cancer = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
    num_spam = [400, 800, 1200, 1600, 2000, 2400, 2800, 3200, 3600, 4000]

    def sweep(train_x, train_y, test_x, test_y, sizes):
        """Return ``(train_acc, test_acc)`` lists, one entry per size."""
        train_curve, test_curve = [], []
        for num in sizes:
            pipe.fit(train_x[:num, :], train_y[:num])
            train_curve.append(pipe.score(train_x[:num, :], train_y[:num]))
            test_curve.append(pipe.score(test_x, test_y))
        return train_curve, test_curve

    def draw(position, sizes, train_curve, test_curve, title):
        """Draw one labelled train/test panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(sizes, train_curve, label='train')
        plt.plot(sizes, test_curve, label='test')
        plt.xlabel('training size')
        plt.ylabel('accuracy')
        plt.title(title)
        # Without a legend the two curves cannot be told apart.
        plt.legend(loc='upper right')

    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    acc_train_cancer_list, acc_test_cancer_list = sweep(
        train_features_cancer, train_labels_cancer, test_features_cancer,
        test_labels_cancer, num_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    acc_train_spam_list, acc_test_spam_list = sweep(train_features_spam,
                                                    train_labels_spam,
                                                    test_features_spam,
                                                    test_labels_spam,
                                                    num_spam)

    plt.figure(figsize=(10, 6))
    draw(121, num_cancer, acc_train_cancer_list, acc_test_cancer_list,
         'knn cancer classifier \nperformance vs training size')
    draw(122, num_spam, acc_train_spam_list, acc_test_spam_list,
         'knn spam classifier \nperformance vs training size')
    plt.show()
Beispiel #11
0
def spam_classifier_with_different_training_size():
    """Compare two boosted spam classifiers across training-set sizes.

    Two 10-estimator ``AdaBoostClassifier`` ensembles are evaluated on
    growing prefixes of the spam training data (sizes taken from the
    module-level ``num_spam``): one over depth-limited trees
    (``max_depth=3``) and one over cost-complexity-pruned trees
    (``ccp_alpha=0.012``).  Accuracy on the training prefix and on the full
    test set is plotted for each ensemble.
    """
    max_depth_tree = DecisionTreeClassifier(max_depth=3)
    clf = AdaBoostClassifier(max_depth_tree, n_estimators=10)
    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )

    def sweep(model):
        """Return ``(train_acc, test_acc)`` lists for *model* over num_spam."""
        train_curve, test_curve = [], []
        for num in num_spam:
            model.fit(train_features_spam[:num, :], train_labels_spam[:num])
            train_curve.append(
                model.score(train_features_spam[:num, :],
                            train_labels_spam[:num]))
            test_curve.append(
                model.score(test_features_spam, test_labels_spam))
        return train_curve, test_curve

    def draw(position, train_curve, test_curve, title):
        """Draw one train/test accuracy panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(num_spam, train_curve, label='train')
        plt.plot(num_spam, test_curve, label='test')
        plt.xlabel('training size')
        plt.ylabel('accuracy')
        plt.title(title)
        # The curve labels are only displayed once a legend is requested.
        plt.legend(loc='upper right')

    acc_train_list, acc_test_list = sweep(clf)

    pruning_tree = DecisionTreeClassifier(ccp_alpha=0.012)
    clf_pruning = AdaBoostClassifier(pruning_tree, n_estimators=10)
    acc_train_list_post, acc_test_list_post = sweep(clf_pruning)

    plt.figure(figsize=(10, 6))
    draw(
        121, acc_train_list, acc_test_list,
        'boosting spam classifier with pre-pruning trees \nperformance vs training size'
    )
    draw(
        122, acc_train_list_post, acc_test_list_post,
        'boosting spam classifier with post-pruning trees \nperformance vs training size'
    )
    plt.show()
def KNN_vs_k():
    """Plot k-NN accuracy against k for both datasets.

    A ``StandardScaler`` + ``KNeighborsClassifier`` pipeline is refitted for
    each odd k in 1..19, first on the breast-cancer split and then on the
    spam split.  Train and test accuracy are plotted per dataset with one
    x tick per k value.
    """
    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    k_list = list(range(1, 20, 2))
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('knn', KNeighborsClassifier())])

    def sweep(train_x, train_y, test_x, test_y):
        """Return ``(train_acc, test_acc)`` lists, one entry per k."""
        train_curve, test_curve = [], []
        for k in k_list:
            pipe.set_params(knn__n_neighbors=k)
            pipe.fit(train_x, train_y)
            train_curve.append(pipe.score(train_x, train_y))
            test_curve.append(pipe.score(test_x, test_y))
        return train_curve, test_curve

    def draw(position, train_curve, test_curve, title):
        """Draw one labelled train/test panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(k_list, train_curve, label='train')
        plt.plot(k_list, test_curve, label='test')
        plt.xticks(k_list)
        plt.xlabel('k value')
        plt.ylabel('accuracy')
        plt.title(title)
        # Without a legend the two curves cannot be told apart.
        plt.legend(loc='upper right')

    acc_train_cancer_list, acc_test_cancer_list = sweep(
        train_features_cancer, train_labels_cancer, test_features_cancer,
        test_labels_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    acc_train_spam_list, acc_test_spam_list = sweep(train_features_spam,
                                                    train_labels_spam,
                                                    test_features_spam,
                                                    test_labels_spam)

    plt.figure(figsize=(10, 6))
    draw(121, acc_train_cancer_list, acc_test_cancer_list,
         'knn cancer classifier performance vs K')
    draw(122, acc_train_spam_list, acc_test_spam_list,
         'knn spam classifier performance vs K')
    plt.show()
Beispiel #13
0
def rbf_svc():
    """Plot RBF-SVM accuracy against training-set size for both datasets.

    A ``StandardScaler`` + ``SVC(kernel='rbf')`` pipeline is fitted on
    growing prefixes of the training data, with ``gamma=0.5`` for breast
    cancer and ``gamma=0.05`` for spam.  Sizes come from the module-level
    ``num_cancer`` and ``num_spam`` sequences.  Accuracy on the training
    prefix and on the full test set is plotted per dataset.
    """
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm', SVC(kernel='rbf', gamma=5.e-1))])

    def sweep(train_x, train_y, test_x, test_y, sizes):
        """Return ``(train_acc, test_acc)`` lists, one entry per size."""
        train_curve, test_curve = [], []
        for num in sizes:
            pipe.fit(train_x[:num, :], train_y[:num])
            train_curve.append(pipe.score(train_x[:num, :], train_y[:num]))
            test_curve.append(pipe.score(test_x, test_y))
        return train_curve, test_curve

    def draw(position, sizes, train_curve, test_curve, title):
        """Draw one train/test accuracy panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(sizes, train_curve, label='train')
        plt.plot(sizes, test_curve, label='test')
        plt.xlabel('training size')
        plt.ylabel('accuracy')
        plt.title(title)
        plt.legend(loc='upper right')

    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    acc_train_cancer_list, acc_test_cancer_list = sweep(
        train_features_cancer, train_labels_cancer, test_features_cancer,
        test_labels_cancer, num_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    # The spam experiment uses a smaller kernel width parameter.
    pipe.set_params(svm__gamma=5.e-2)
    acc_train_spam_list, acc_test_spam_list = sweep(train_features_spam,
                                                    train_labels_spam,
                                                    test_features_spam,
                                                    test_labels_spam,
                                                    num_spam)

    plt.figure(figsize=(10, 6))
    draw(121, num_cancer, acc_train_cancer_list, acc_test_cancer_list,
         'rbf kernel cancer svm performance \nvs training size')
    draw(122, num_spam, acc_train_spam_list, acc_test_spam_list,
         'rbf kernel spam svm performance \nvs training size')
    plt.show()
def leaf_num_limited_tree():
    """Plot accuracy vs. training size for trees with >= 10 samples per leaf.

    A ``DecisionTreeClassifier(min_samples_leaf=10)`` is fitted on growing
    prefixes of the training data (sizes from the module-level ``num_cancer``
    and ``num_spam``), first for breast cancer and then for spam.  Accuracy
    on the training prefix and on the full test set is plotted per dataset.
    """
    limited_tree = tree.DecisionTreeClassifier(min_samples_leaf=10)

    def sweep(x_train, y_train, x_test, y_test, sizes):
        """Return ``(train_acc, test_acc)`` lists, one entry per size."""
        train_scores = []
        test_scores = []
        for size in sizes:
            limited_tree.fit(x_train[:size, :], y_train[:size])
            train_scores.append(
                limited_tree.score(x_train[:size, :], y_train[:size]))
            test_scores.append(limited_tree.score(x_test, y_test))
        return train_scores, test_scores

    def draw(position, sizes, train_scores, test_scores, heading):
        """Draw one train/test accuracy panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(sizes, train_scores, label='training')
        plt.plot(sizes, test_scores, label='test')
        plt.title(heading)
        plt.xlabel('size of training examples')
        plt.ylabel('accuracy')
        plt.legend(loc='upper right')

    x_train, y_train, x_test, y_test = split_train_test_breast_cancer()
    cancer_train, cancer_test = sweep(x_train, y_train, x_test, y_test,
                                      num_cancer)

    x_train, y_train, x_test, y_test = split_train_test_spam()
    # Same leaf-size limit for the spam run.
    limited_tree.set_params(min_samples_leaf=10)
    spam_train, spam_test = sweep(x_train, y_train, x_test, y_test, num_spam)

    plt.figure(figsize=(10, 6))
    draw(121, num_cancer, cancer_train, cancer_test,
         'breast cancer performance with \nat least 10 examples at leaf nodes')
    draw(122, num_spam, spam_train, spam_test,
         'spam performance with \nat least 10 examples at leaf nodes')
    plt.show()
def full_tree():
    """Plot accuracy vs. training size for a default decision tree.

    A ``DecisionTreeClassifier`` with default settings is fitted on growing
    prefixes of the training data (sizes from the module-level ``num_cancer``
    and ``num_spam``), first for breast cancer and then for spam.  Accuracy
    on the training prefix and on the full test set is plotted per dataset.
    """
    unpruned_tree = tree.DecisionTreeClassifier()

    def sweep(x_train, y_train, x_test, y_test, sizes):
        """Return ``(train_acc, test_acc)`` lists, one entry per size."""
        train_scores = []
        test_scores = []
        for size in sizes:
            unpruned_tree.fit(x_train[:size, :], y_train[:size])
            train_scores.append(
                unpruned_tree.score(x_train[:size, :], y_train[:size]))
            test_scores.append(unpruned_tree.score(x_test, y_test))
        return train_scores, test_scores

    def draw(position, sizes, train_scores, test_scores, heading):
        """Draw one train/test accuracy panel at the given subplot slot."""
        plt.subplot(position)
        plt.plot(sizes, train_scores, label='training')
        plt.plot(sizes, test_scores, label='test')
        plt.title(heading)
        plt.xlabel('size of training examples')
        plt.ylabel('accuracy')
        plt.legend(loc='upper right')

    x_train, y_train, x_test, y_test = split_train_test_breast_cancer()
    cancer_train, cancer_test = sweep(x_train, y_train, x_test, y_test,
                                      num_cancer)

    x_train, y_train, x_test, y_test = split_train_test_spam()
    spam_train, spam_test = sweep(x_train, y_train, x_test, y_test, num_spam)

    plt.figure(figsize=(10, 6))
    draw(121, num_cancer, cancer_train, cancer_test,
         'breast cancer train/test accuracy')
    draw(122, num_spam, spam_train, spam_test, 'spam train/test accuracy')
    plt.show()
def converge_time():
    """Train the zero-hidden-layer model on both datasets and plot its cost.

    For the breast-cancer and the spam split in turn, the features are
    standardised with statistics from the training set, ``zero_hidden_model``
    is run, the train/test accuracies it reports are printed, and the cost
    per iteration is plotted (cancer on the left, spam on the right).

    ``zero_hidden_model`` is defined elsewhere; as used here it returns
    ``((iter_list, cost_list), (train_acc, test_acc))``.

    NOTE(review): no ``plt.show()`` is visible in the reviewed span - the
    function may continue beyond it; confirm before relying on this
    description of where it ends.
    """
    scaler = StandardScaler()
    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    # Fit the scaler on training data only, then apply it to both sets.
    scaler.fit(train_features_cancer)
    train_features_cancer_norm = scaler.transform(train_features_cancer)
    test_features_cancer_norm = scaler.transform(test_features_cancer)
    cost_vs_iter_cancer, acc_cancer = zero_hidden_model(
        train_features_cancer_norm, train_labels_cancer,
        test_features_cancer_norm, test_labels_cancer)
    iter_list_cancer, cost_list_cancer = cost_vs_iter_cancer
    acc_train_cancer, acc_test_cancer = acc_cancer
    print('training accuracy for breast cancer learner is:', acc_train_cancer)
    print('test accuracy for breast cancer learner is:', acc_test_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    # The same scaler object is refitted on the spam training features.
    scaler.fit(train_features_spam)
    train_features_spam_norm = scaler.transform(train_features_spam)
    test_features_spam_norm = scaler.transform(test_features_spam)
    cost_vs_iter_spam, acc_spam = zero_hidden_model(train_features_spam_norm,
                                                    train_labels_spam,
                                                    test_features_spam_norm,
                                                    test_labels_spam)
    iter_list_spam, cost_list_spam = cost_vs_iter_spam
    acc_train_spam, acc_test_spam = acc_spam
    print('training accuracy for spam learner is:', acc_train_spam)
    print('test accuracy for spam learner is:', acc_test_spam)

    # Side-by-side cost curves: breast cancer (left), spam (right).
    plt.figure(figsize=(8, 3.5))
    plt.subplot(121)
    plt.plot(iter_list_cancer, cost_list_cancer)
    plt.title('cost_vs_iter for breast cancer')
    plt.xlabel('iter_num')
    plt.ylabel('entropy loss')
    plt.subplot(122)
    plt.plot(iter_list_spam, cost_list_spam)
    plt.title('cost_vs_iter for spam')
    plt.xlabel('iter_num')
    plt.ylabel('entropy loss')