def gridSearch():
    """Grid-search SVM hyperparameters on the breast-cancer and spam data.

    A ``StandardScaler`` + ``SVC`` pipeline is tuned with ``GridSearchCV``
    (``cv=5``) over three kernel families: linear, polynomial with degree
    2..10, and RBF with gamma 2**-10 .. 2**0.  For each dataset the fitted
    search object is scored on the train and test splits, and the two
    scores plus the selected parameters are printed.
    """
    search_space = [
        {'svm__kernel': ['linear']},
        {'svm__kernel': ['poly'], 'svm__degree': list(range(2, 11))},
        {
            'svm__kernel': ['rbf'],
            'svm__gamma': [2**exponent for exponent in range(-10, 1)],
        },
    ]
    model = Pipeline([('scaler', StandardScaler()), ('svm', SVC())])
    search = GridSearchCV(model, param_grid=search_space, cv=5)

    def tune(load_split):
        # Fit the shared search object on one dataset and report
        # (train score, test score, best parameters).
        x_train, y_train, x_test, y_test = load_split()
        search.fit(x_train, y_train)
        return (search.score(x_train, y_train),
                search.score(x_test, y_test),
                search.best_params_)

    train_acc, test_acc, best = tune(split_train_test_breast_cancer)
    print('best train accuracy for cancer SVM is:', train_acc)
    print('best test accuracy for cancer SVM is:', test_acc)
    print('best parameters for cancer SVM is:', best)

    train_acc, test_acc, best = tune(split_train_test_spam)
    print('best train accuracy for spam SVM is:', train_acc)
    print('best test accuracy for spam SVM is:', test_acc)
    print('best parameters for spam SVM is:', best)
def gridSearch():
    """Grid-search the number of neighbours for a scaled KNN classifier.

    The odd values 1..19 of ``n_neighbors`` are searched with
    ``GridSearchCV`` (``cv=5``) on a ``StandardScaler`` +
    ``KNeighborsClassifier`` pipeline, first on the breast-cancer split
    and then on the spam split.  Train score, test score and the selected
    hyperparameters are printed for each dataset.
    """
    neighbour_counts = list(range(1, 20, 2))
    model = Pipeline([('scaler', StandardScaler()),
                      ('knn', KNeighborsClassifier())])
    search = GridSearchCV(model, [{'knn__n_neighbors': neighbour_counts}],
                          cv=5)

    def tune(load_split):
        # Fit the shared search object on one dataset and report
        # (train score, test score, best parameters).
        x_train, y_train, x_test, y_test = load_split()
        search.fit(x_train, y_train)
        return (search.score(x_train, y_train),
                search.score(x_test, y_test),
                search.best_params_)

    train_acc, test_acc, best = tune(split_train_test_breast_cancer)
    print('best train accuracy for cancer knn classifier is:', train_acc)
    print('best test accuracy for cancer knn classifier is:', test_acc)
    print('best hyperparameters for cancer knn classifier is:', best)

    train_acc, test_acc, best = tune(split_train_test_spam)
    print('------------------------------------------------')
    print('best train accuracy for spam knn classifier is:', train_acc)
    print('best test accuracy for spam knn classifier is:', test_acc)
    print('best hyperparameters for spam knn classifier is:', best)
def gridsearch():
    """Grid-search decision-tree regularisation on both datasets.

    Three independent one-parameter grids are searched with
    ``GridSearchCV`` (``scoring='accuracy'``, ``cv=5``): ``max_depth``,
    ``min_samples_leaf`` and ``ccp_alpha``.  For the breast-cancer split
    and then the spam split, the selected parameters and the train/test
    accuracy of the fitted search object are printed.
    """
    candidate_grids = [
        {'max_depth': list(range(2, 11, 2))},
        {'min_samples_leaf': [2**power for power in range(1, 9)]},
        {'ccp_alpha': [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064,
                       0.128]},
    ]
    search = GridSearchCV(tree.DecisionTreeClassifier(), candidate_grids,
                          scoring='accuracy', cv=5)

    def tune(load_split):
        # ``fit`` returns the search object itself, so the scores and the
        # best parameters are all read from ``search`` after fitting.
        x_train, y_train, x_test, y_test = load_split()
        fitted = search.fit(x_train, y_train)
        return (fitted.best_params_,
                fitted.score(x_train, y_train),
                fitted.score(x_test, y_test))

    best, train_acc, test_acc = tune(split_train_test_breast_cancer)
    print('Best parameter for breast cancer classifier:', best)
    print('Training accuracy for best breast cancer classifier :', train_acc)
    print('Test accuracy for best breast cancer classifier :', test_acc)

    best, train_acc, test_acc = tune(split_train_test_spam)
    print('--------------------------------------------------------------')
    print('Best parameter for spam classifier:', best)
    print('Training accuracy for best spam classifier :', train_acc)
    print('Test accuracy for best spam classifier :', test_acc)
def performance_curve_one_hidden_few_units():
    """Plot accuracy against training-set size for ``one_hidden_model``.

    For every size in the module-level ``num_cancer`` / ``num_spam``
    lists, the first ``size`` training rows are standardised (the scaler
    is fitted on that prefix only), the test features are transformed
    with the same scaler, and ``one_hidden_model`` is called with
    ``hidden_units=4``.  The train/test accuracies it returns are drawn
    in two side-by-side panels, one per dataset.
    """
    scaler = StandardScaler()

    def accuracy_curve(load_split, sizes):
        # Collect (train accuracies, test accuracies) over all sizes.
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for size in sizes:
            prefix = x_train[:size, :]
            scaler.fit(prefix)
            prefix_norm = scaler.transform(prefix)
            x_test_norm = scaler.transform(x_test)
            # First return value is unused here; second is the
            # (train accuracy, test accuracy) pair.
            _, (train_acc, test_acc) = one_hidden_model(
                prefix_norm, y_train[:size], x_test_norm, y_test,
                hidden_units=4)
            train_scores.append(train_acc)
            test_scores.append(test_acc)
        return train_scores, test_scores

    def draw_panel(position, sizes, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(sizes, train_scores, label='train acc')
        plt.plot(sizes, test_scores, label='test acc')
        plt.xlabel('training size')
        plt.ylabel('accuracy')
        plt.legend(loc='upper right')
        plt.title(title)

    cancer_train, cancer_test = accuracy_curve(
        split_train_test_breast_cancer, num_cancer)
    spam_train, spam_test = accuracy_curve(split_train_test_spam, num_spam)

    plt.figure(figsize=(10, 4))
    draw_panel(
        121, num_cancer, cancer_train, cancer_test,
        'accuracy vs trainig size for cancer classifier \nwith one hidden layer 4 hidden units')
    draw_panel(
        122, num_spam, spam_train, spam_test,
        'accuracy vs trainig size for spam classifier \nwith one hidden layer 4 hidden units')
    plt.show()
def batch_vs_stocastic():
    """Run the batch and the stochastic spam classifiers on scaled data.

    The spam split is standardised with a ``StandardScaler`` fitted on
    the training features only, then the same normalised arrays are
    handed to ``best_spam_classifier_batch`` and
    ``best_spam_classifier_stocastic`` in that order.
    """
    x_train, y_train, x_test, y_test = split_train_test_spam()

    normalizer = StandardScaler()
    normalizer.fit(x_train)
    x_train_norm = normalizer.transform(x_train)
    x_test_norm = normalizer.transform(x_test)

    best_spam_classifier_batch(x_train_norm, y_train, x_test_norm, y_test)
    best_spam_classifier_stocastic(x_train_norm, y_train, x_test_norm,
                                   y_test)
def post_pruning_boosting_tree_performance():
    """Plot AdaBoost accuracy vs. number of cost-complexity-pruned trees.

    For 1..20 boosting rounds an ``AdaBoostClassifier`` over a
    ``DecisionTreeClassifier`` is refitted and scored on the train and
    test splits.  The breast-cancer run uses ``ccp_alpha=0.015`` and the
    spam run uses ``ccp_alpha=0.005``.  Both curves are shown in
    side-by-side panels.

    A separate booster is built per dataset rather than changing the tree
    through ``set_params(base_estimator__ccp_alpha=...)``: that nested
    parameter name was renamed to ``estimator`` in newer scikit-learn
    releases, so passing ``ccp_alpha`` to the tree constructor keeps the
    function working independently of the installed version.
    """
    num_trees_list = list(range(1, 21))

    def boosting_curve(load_split, ccp_alpha):
        # Refit a booster for every tree count and collect
        # (train accuracies, test accuracies).
        x_train, y_train, x_test, y_test = load_split()
        booster = AdaBoostClassifier(
            DecisionTreeClassifier(ccp_alpha=ccp_alpha), n_estimators=1)
        train_scores, test_scores = [], []
        for num_trees in num_trees_list:
            booster.set_params(n_estimators=num_trees)
            booster.fit(x_train, y_train)
            train_scores.append(booster.score(x_train, y_train))
            test_scores.append(booster.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(num_trees_list, train_scores, label='train')
        plt.plot(num_trees_list, test_scores, label='test')
        plt.xlabel('num of trees')
        plt.ylabel('accuracy')
        plt.title(title)
        plt.legend(loc='upper right')

    acc_train_cancer_list, acc_test_cancer_list = boosting_curve(
        split_train_test_breast_cancer, 0.015)
    acc_train_spam_list, acc_test_spam_list = boosting_curve(
        split_train_test_spam, 0.005)

    plt.figure(figsize=(10, 6))
    draw_panel(
        121, acc_train_cancer_list, acc_test_cancer_list,
        'post-pruning boosting cancer classifer \nperformance vs number of boosting trees')
    draw_panel(
        122, acc_train_spam_list, acc_test_spam_list,
        'post-pruning boosting spam classifer \nperformance vs number of boosting trees')
    plt.show()
def radius_neighbors_r():
    """Plot radius-neighbours accuracy against the radius on both datasets.

    A ``StandardScaler`` + ``RadiusNeighborsClassifier`` pipeline
    (``outlier_label='most_frequent'``) is refitted for each radius in
    2**0 .. 2**10 and scored on the train and test splits of the
    breast-cancer and spam data.  The curves are drawn on a logarithmic
    x-axis, one panel per dataset.

    The panel titles name the radius as the swept quantity (they
    previously said "vs K", although no neighbour count is varied here),
    and the train/test curves are labelled and shown in a legend so they
    can be told apart.
    """
    r_list = [2**exponent for exponent in range(11)]
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('rn', RadiusNeighborsClassifier(outlier_label='most_frequent')),
    ])

    def radius_curve(load_split):
        # Refit the shared pipeline for every radius and collect
        # (train accuracies, test accuracies).
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for r in r_list:
            pipe.set_params(rn__radius=r)
            pipe.fit(x_train, y_train)
            train_scores.append(pipe.score(x_train, y_train))
            test_scores.append(pipe.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(r_list, train_scores, label='train')
        plt.plot(r_list, test_scores, label='test')
        plt.xscale('log')
        plt.xlabel('radius value')
        plt.ylabel('accuracy')
        plt.title(title)
        plt.legend(loc='upper right')

    acc_train_cancer_list, acc_test_cancer_list = radius_curve(
        split_train_test_breast_cancer)
    acc_train_spam_list, acc_test_spam_list = radius_curve(
        split_train_test_spam)

    plt.figure(figsize=(10, 6))
    draw_panel(121, acc_train_cancer_list, acc_test_cancer_list,
               'radius neighbor cancer \nclassifier performance vs radius')
    draw_panel(122, acc_train_spam_list, acc_test_spam_list,
               'radius neighbor spam \nclassifier performance vs radius')
    plt.show()
def accuracy_vs_num_tree():
    """Plot AdaBoost accuracy vs. number of depth-3 trees on both datasets.

    An ``AdaBoostClassifier`` over ``DecisionTreeClassifier(max_depth=3)``
    is refitted from scratch for every tree count from 1 to 100 and
    scored on the train and test splits.  The breast-cancer and spam
    curves are shown in side-by-side panels.
    """
    tree_counts = list(range(1, 101))
    booster = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                 n_estimators=1)

    def sweep(load_split):
        # Collect (train accuracies, test accuracies) over all tree counts.
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for count in tree_counts:
            booster.set_params(n_estimators=count)
            booster.fit(x_train, y_train)
            train_scores.append(booster.score(x_train, y_train))
            test_scores.append(booster.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(tree_counts, train_scores, label='train')
        plt.plot(tree_counts, test_scores, label='test')
        plt.xlabel('num of trees')
        plt.ylabel('accuracy')
        plt.title(title)
        plt.legend(loc='upper right')

    cancer_train, cancer_test = sweep(split_train_test_breast_cancer)
    spam_train, spam_test = sweep(split_train_test_spam)

    plt.figure(figsize=(10, 6))
    draw_panel(121, cancer_train, cancer_test,
               'cancer accuracy vs number of boosting trees')
    draw_panel(122, spam_train, spam_test,
               'spam accuracy vs number of boosting trees')
    plt.show()
def performance_vs_gamma():
    """Plot RBF-kernel SVM accuracy against gamma on both datasets.

    A ``StandardScaler`` + ``SVC(kernel='rbf')`` pipeline is refitted for
    each gamma in 2**-10 .. 2**0 and scored on the train and test splits
    of the breast-cancer and spam data.  The curves are drawn on a
    logarithmic x-axis, one panel per dataset.

    Each panel shows a legend: the curves carry 'train'/'test' labels,
    which are only displayed once ``plt.legend`` is called.
    """
    gamma_list = [2**exponent for exponent in range(-10, 1)]
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm', SVC(kernel='rbf'))])

    def gamma_curve(load_split):
        # Refit the shared pipeline for every gamma and collect
        # (train accuracies, test accuracies).
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for gamma in gamma_list:
            pipe.set_params(svm__gamma=gamma)
            pipe.fit(x_train, y_train)
            train_scores.append(pipe.score(x_train, y_train))
            test_scores.append(pipe.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(gamma_list, train_scores, label='train')
        plt.plot(gamma_list, test_scores, label='test')
        plt.xscale('log')
        plt.xlabel('gamma')
        plt.ylabel('accuracy')
        plt.title(title)
        plt.legend(loc='upper right')

    acc_train_cancer_list, acc_test_cancer_list = gamma_curve(
        split_train_test_breast_cancer)
    acc_train_spam_list, acc_test_spam_list = gamma_curve(
        split_train_test_spam)

    plt.figure(figsize=(10, 6))
    draw_panel(121, acc_train_cancer_list, acc_test_cancer_list,
               'rbf kernel cancer classifier \nperformance vs gamma')
    draw_panel(122, acc_train_spam_list, acc_test_spam_list,
               'rbf kernel spam classifier \nperformance vs gamma')
    plt.show()
def learning_curve():
    """Plot 3-nearest-neighbour accuracy against training-set size.

    A ``StandardScaler`` + ``KNeighborsClassifier`` pipeline with
    ``n_neighbors=3`` is fitted on growing prefixes of the training split
    (50..500 rows for breast cancer, 400..4000 rows for spam) and scored
    on that prefix and on the full test split.  Both curves are plotted
    per dataset in side-by-side panels.
    """
    cancer_sizes = list(range(50, 501, 50))
    spam_sizes = list(range(400, 4001, 400))
    # n_neighbors is fixed for the whole experiment, so it is set once at
    # construction rather than on every iteration.
    model = Pipeline([('scaler', StandardScaler()),
                      ('knn', KNeighborsClassifier(n_neighbors=3))])

    def size_curve(load_split, sizes):
        # Collect (train accuracies, test accuracies) over all sizes.
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for size in sizes:
            x_part, y_part = x_train[:size, :], y_train[:size]
            model.fit(x_part, y_part)
            train_scores.append(model.score(x_part, y_part))
            test_scores.append(model.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, sizes, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(sizes, train_scores)
        plt.plot(sizes, test_scores)
        plt.xlabel('training size')
        plt.ylabel('accuracy')
        plt.title(title)

    cancer_train, cancer_test = size_curve(split_train_test_breast_cancer,
                                           cancer_sizes)
    spam_train, spam_test = size_curve(split_train_test_spam, spam_sizes)

    plt.figure(figsize=(10, 6))
    draw_panel(121, cancer_sizes, cancer_train, cancer_test,
               'knn cancer classifier \nperformance vs training size')
    draw_panel(122, spam_sizes, spam_train, spam_test,
               'knn spam classifier \nperformance vs training size')
    plt.show()
def spam_classifier_with_different_training_size():
    """Plot boosted-tree spam accuracy against training-set size.

    Two 10-estimator ``AdaBoostClassifier`` models are compared on the
    spam split: one over ``DecisionTreeClassifier(max_depth=3)`` and one
    over ``DecisionTreeClassifier(ccp_alpha=0.012)``.  Each is fitted on
    the first ``num`` training rows for every ``num`` in the module-level
    ``num_spam`` list and scored on that prefix and on the full test
    split.  The results are drawn in side-by-side panels.

    Each panel shows a legend: the curves carry 'train'/'test' labels,
    which are only displayed once ``plt.legend`` is called.
    """
    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = (
        split_train_test_spam())

    def size_curve(base_tree):
        # Boost the given tree and collect (train accuracies, test
        # accuracies) over all training sizes.
        clf = AdaBoostClassifier(base_tree, n_estimators=10)
        train_scores, test_scores = [], []
        for num in num_spam:
            features = train_features_spam[:num, :]
            labels = train_labels_spam[:num]
            clf.fit(features, labels)
            train_scores.append(clf.score(features, labels))
            test_scores.append(
                clf.score(test_features_spam, test_labels_spam))
        return train_scores, test_scores

    def draw_panel(position, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(num_spam, train_scores, label='train')
        plt.plot(num_spam, test_scores, label='test')
        plt.xlabel('training size')
        plt.ylabel('accuracy')
        plt.title(title)
        plt.legend(loc='upper right')

    acc_train_list, acc_test_list = size_curve(
        DecisionTreeClassifier(max_depth=3))
    acc_train_list_post, acc_test_list_post = size_curve(
        DecisionTreeClassifier(ccp_alpha=0.012))

    plt.figure(figsize=(10, 6))
    draw_panel(
        121, acc_train_list, acc_test_list,
        'boosting spam classifier with pre-pruning trees \nperformance vs training size')
    draw_panel(
        122, acc_train_list_post, acc_test_list_post,
        'boosting spam classifier with post-pruning trees \nperformance vs training size')
    plt.show()
def KNN_vs_k():
    """Plot scaled-KNN accuracy against the neighbour count on both datasets.

    A ``StandardScaler`` + ``KNeighborsClassifier`` pipeline is refitted
    for every odd ``n_neighbors`` from 1 to 19 and scored on the train
    and test splits of the breast-cancer and spam data.  One panel per
    dataset is drawn, with an x-tick at every tested value.
    """
    neighbour_counts = list(range(1, 20, 2))
    model = Pipeline([('scaler', StandardScaler()),
                      ('knn', KNeighborsClassifier())])

    def sweep(load_split):
        # Collect (train accuracies, test accuracies) over all k values.
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for k in neighbour_counts:
            model.set_params(knn__n_neighbors=k)
            model.fit(x_train, y_train)
            train_scores.append(model.score(x_train, y_train))
            test_scores.append(model.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(neighbour_counts, train_scores)
        plt.plot(neighbour_counts, test_scores)
        plt.xticks(neighbour_counts)
        plt.xlabel('k value')
        plt.ylabel('accuracy')
        plt.title(title)

    cancer_train, cancer_test = sweep(split_train_test_breast_cancer)
    spam_train, spam_test = sweep(split_train_test_spam)

    plt.figure(figsize=(10, 6))
    draw_panel(121, cancer_train, cancer_test,
               'knn cancer classifier performance vs K')
    draw_panel(122, spam_train, spam_test,
               'knn spam classifier performance vs K')
    plt.show()
def rbf_svc():
    """Plot RBF-kernel SVM accuracy against training-set size.

    A ``StandardScaler`` + ``SVC(kernel='rbf')`` pipeline is fitted on
    growing prefixes of the training split, using ``gamma=0.5`` for
    breast cancer and ``gamma=0.05`` for spam, and scored on that prefix
    and on the full test split.  The sizes come from the module-level
    ``num_cancer`` and ``num_spam`` lists.
    """
    model = Pipeline([('scaler', StandardScaler()),
                      ('svm', SVC(kernel='rbf', gamma=5.e-1))])

    def size_curve(load_split, sizes):
        # Collect (train accuracies, test accuracies) over all sizes
        # using whatever gamma the shared pipeline currently has.
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for size in sizes:
            x_part, y_part = x_train[:size, :], y_train[:size]
            model.fit(x_part, y_part)
            train_scores.append(model.score(x_part, y_part))
            test_scores.append(model.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, sizes, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(sizes, train_scores, label='train')
        plt.plot(sizes, test_scores, label='test')
        plt.xlabel('training size')
        plt.ylabel('accuracy')
        plt.title(title)
        plt.legend(loc='upper right')

    cancer_train, cancer_test = size_curve(split_train_test_breast_cancer,
                                           num_cancer)
    # The spam run uses a smaller gamma than the cancer run.
    model.set_params(svm__gamma=5.e-2)
    spam_train, spam_test = size_curve(split_train_test_spam, num_spam)

    plt.figure(figsize=(10, 6))
    draw_panel(121, num_cancer, cancer_train, cancer_test,
               'rbf kernel cancer smv performance \nvs training size')
    draw_panel(122, num_spam, spam_train, spam_test,
               'rbf kernel spam smv performance \nvs training size')
    plt.show()
def leaf_num_limited_tree():
    """Plot accuracy vs. training size for trees with ``min_samples_leaf=10``.

    A ``DecisionTreeClassifier(min_samples_leaf=10)`` is fitted on
    growing prefixes of the training split (sizes from the module-level
    ``num_cancer`` and ``num_spam`` lists) and scored on that prefix and
    on the full test split.  One panel per dataset is drawn.
    """
    model = tree.DecisionTreeClassifier(min_samples_leaf=10)

    def size_curve(load_split, sizes):
        # Collect (train accuracies, test accuracies) over all sizes.
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for size in sizes:
            x_part, y_part = x_train[:size, :], y_train[:size]
            model.fit(x_part, y_part)
            train_scores.append(model.score(x_part, y_part))
            test_scores.append(model.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, sizes, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(sizes, train_scores, label='training')
        plt.plot(sizes, test_scores, label='test')
        plt.title(title)
        plt.xlabel('size of training examples')
        plt.ylabel('accuracy')
        plt.legend(loc='upper right')

    cancer_train, cancer_test = size_curve(split_train_test_breast_cancer,
                                           num_cancer)
    # Same value as at construction; kept as the per-dataset setting for
    # the spam run.
    model.set_params(min_samples_leaf=10)
    spam_train, spam_test = size_curve(split_train_test_spam, num_spam)

    plt.figure(figsize=(10, 6))
    draw_panel(
        121, num_cancer, cancer_train, cancer_test,
        'breast cancer performance with \nat least 10 examples at leaf nodes')
    draw_panel(122, num_spam, spam_train, spam_test,
               'spam performance with \nat least 10 examples at leaf nodes')
    plt.show()
def full_tree():
    """Plot accuracy vs. training size for a default decision tree.

    A ``DecisionTreeClassifier`` with default settings is fitted on
    growing prefixes of the training split (sizes from the module-level
    ``num_cancer`` and ``num_spam`` lists) and scored on that prefix and
    on the full test split.  One panel per dataset is drawn.
    """
    model = tree.DecisionTreeClassifier()

    def size_curve(load_split, sizes):
        # Collect (train accuracies, test accuracies) over all sizes.
        x_train, y_train, x_test, y_test = load_split()
        train_scores, test_scores = [], []
        for size in sizes:
            x_part, y_part = x_train[:size, :], y_train[:size]
            model.fit(x_part, y_part)
            train_scores.append(model.score(x_part, y_part))
            test_scores.append(model.score(x_test, y_test))
        return train_scores, test_scores

    def draw_panel(position, sizes, train_scores, test_scores, title):
        plt.subplot(position)
        plt.plot(sizes, train_scores, label='training')
        plt.plot(sizes, test_scores, label='test')
        plt.title(title)
        plt.xlabel('size of training examples')
        plt.ylabel('accuracy')
        plt.legend(loc='upper right')

    cancer_train, cancer_test = size_curve(split_train_test_breast_cancer,
                                           num_cancer)
    spam_train, spam_test = size_curve(split_train_test_spam, num_spam)

    plt.figure(figsize=(10, 6))
    draw_panel(121, num_cancer, cancer_train, cancer_test,
               'breast cancer train/test accuracy')
    draw_panel(122, num_spam, spam_train, spam_test,
               'spam train/test accuracy')
    plt.show()
def converge_time():
    """Train ``zero_hidden_model`` on both datasets and plot cost per iteration.

    For the breast-cancer split and then the spam split, the features are
    standardised with a ``StandardScaler`` fitted on the training
    features, and ``zero_hidden_model`` is called on the normalised data.
    Its result is unpacked as ``((iterations, costs), (train accuracy,
    test accuracy))``; the accuracies are printed and the cost curves are
    drawn in side-by-side panels.

    The figure is displayed with ``plt.show()`` at the end; without it
    the plot is built but never shown when run as a plain script.
    """
    scaler = StandardScaler()

    def train_on(load_split):
        # Standardise one dataset, train the model, and return
        # (iterations, costs, train accuracy, test accuracy).
        x_train, y_train, x_test, y_test = load_split()
        scaler.fit(x_train)
        x_train_norm = scaler.transform(x_train)
        x_test_norm = scaler.transform(x_test)
        (iterations, costs), (train_acc, test_acc) = zero_hidden_model(
            x_train_norm, y_train, x_test_norm, y_test)
        return iterations, costs, train_acc, test_acc

    def draw_panel(position, iterations, costs, title):
        plt.subplot(position)
        plt.plot(iterations, costs)
        plt.title(title)
        plt.xlabel('iter_num')
        plt.ylabel('entropy loss')

    iter_list_cancer, cost_list_cancer, acc_train_cancer, acc_test_cancer = (
        train_on(split_train_test_breast_cancer))
    print('training accuracy for breast cancer learner is:', acc_train_cancer)
    print('test accuracy for breast cancer learner is:', acc_test_cancer)

    iter_list_spam, cost_list_spam, acc_train_spam, acc_test_spam = (
        train_on(split_train_test_spam))
    print('training accuracy for spam learner is:', acc_train_spam)
    print('test accuracy for spam learner is:', acc_test_spam)

    plt.figure(figsize=(8, 3.5))
    draw_panel(121, iter_list_cancer, cost_list_cancer,
               'cost_vs_iter for breast cancer')
    draw_panel(122, iter_list_spam, cost_list_spam, 'cost_vs_iter for spam')
    plt.show()