def test5():
    """Print expected vs. actual values for the bias/variance exercise.

    Loads ``mat/ex5data1.mat`` and walks through: regularized linear
    regression cost/gradient, a degree-8 polynomial fit with its learning
    curve, selection of the regularization value from the validation curve,
    the resulting test-set error, and a randomized learning curve.
    Numeric checks are printed as ``expected / actual``; the plotting steps
    only print "Check plot" and have to be inspected by eye.
    """
    data_path = 'mat/ex5data1.mat'
    degree = 8

    print("\n\nTest 5 - Algorithm Tweaks (Bias & Variance)")
    print("Expected / Actual:")

    # Cost and gradient at theta = [1, 1].
    print("\nRegularized Linear Regression: ")
    X, y = ut.read_mat(data_path)
    X = ut.create_design(X)
    ones = np.array([1, 1])
    print("303.993 / ", alg.SSD(ones, X, y, 1))
    gradient = alg.SSD_gradient(ones, X, y, 1)
    print("-15.30 / ", gradient[0])
    print("598.250 / ", gradient[1])

    # Reload the raw arrays; the training/validation/test splits come from
    # the same file.
    print("\nLearning Curve:")
    mat = ut.read_mat_raw(data_path)
    X = mat['X']
    y = mat['y'].reshape(-1)
    Xval = mat['Xval']
    yval = mat['yval'].reshape(-1)
    print("Check plot")
    # The plain (non-polynomial) learning-curve plot is currently disabled:
    # pt.plot_learning_curve(ut.create_design(X), y, ut.create_design(Xval), yval, 0)

    print("\nFitting polynomial regression:")
    X_poly = ut.poly_features(X, degree)
    X_poly, mu, sigma = ut.normalize_features(X_poly)
    X_poly = ut.create_design(X_poly)

    def as_poly_design(features):
        # Apply the training-set mu/sigma to another split, in place on the
        # array returned by poly_features, exactly as for the training data.
        expanded = ut.poly_features(features, degree)
        expanded -= mu
        expanded /= sigma
        return ut.create_design(expanded)

    Xval = as_poly_design(Xval)
    reg = 0.01
    fitted = alg.parametrize_linear(X_poly, y, reg)
    print("Check plot, l =", reg)
    pt.fit_plot(X, y, mu, sigma, fitted, degree)
    pt.plot_learning_curve(X_poly, y, Xval, yval, reg)

    # The validation-curve plot also returns the regularization value that
    # is used for the final fit below.
    print("\nOptimize regularization:")
    print("Check plot")
    reg = pt.plot_validation_curve(X_poly, y, Xval, yval)
    Xtest = mat['Xtest']
    ytest = mat['ytest'].reshape(-1)
    Xtest = as_poly_design(Xtest)
    fitted = alg.parametrize_linear(X_poly, y, reg)
    print("3.8599 / ", alg.SSD(fitted, Xtest, ytest, 0))

    print("\nRandomized learning curve:")
    print("Check plot")
    pt.plot_randomized_learning_curve(X_poly, y, Xval, yval, 0.01)
def plot_learning_curves_across_topics(n_runs, start_idx, stop_idx, estimators_dict, comment=None):
    """Plot, per estimator, the accuracy curve averaged over all topics.

    For every topic in ``texts_vote_lists_truths_by_topic_id`` and every
    entry of ``estimators_dict``, ``n_runs`` accuracy sequences are computed
    in parallel, cut to ``[start_idx:]`` and averaged over the runs.  The
    per-topic means are then averaged over topics and handed to
    ``plot_learning_curve``.  The first and last column of each topic's
    result matrix are additionally dumped under ``pickles/``.

    Args:
        n_runs: number of runs per (topic, estimator) pair.
        start_idx: first sequence index that is kept and plotted.
        stop_idx: passed to the sequence generators; exclusive end of the
            x axis.
        estimators_dict: maps estimator name to
            ``(estimator, args, active_pars)``.  ``active_pars is None``
            selects ``get_accuracy_sequence``, anything else
            ``get_accuracy_sequence_active``.
        comment: optional text appended to the plot title.
    """
    x = np.arange(start_idx, stop_idx)

    # The accumulator has to outlive the topic loop: it collects one mean
    # accuracy sequence per topic so that all topics enter the final average.
    y_by_estimator = dict((name, []) for name in estimators_dict)

    for topic_id, data in texts_vote_lists_truths_by_topic_id.items():
        print('Loading topic %s' % topic_id)
        texts, vote_lists, truths = data

        tfidf = TfidfVectorizer().fit_transform(texts)
        text_similarity = cosine_similarity(tfidf)

        for estimator_name, estimator_and_args in estimators_dict.items():
            print('Calculating for %s' % estimator_name)
            estimator, args, active_pars = estimator_and_args

            # NOTE(review): plot_learning_curves_for_topic passes the tf-idf
            # matrix as an extra argument before text_similarity; confirm
            # which signature the sequence generators currently expect.
            if active_pars is None:
                sequences = Parallel(n_jobs=4)(
                    delayed(get_accuracy_sequence)(
                        estimator, stop_idx, texts, vote_lists, truths,
                        text_similarity, idx, False, *args)
                    for idx in range(n_runs))
            else:
                sequences = Parallel(n_jobs=4)(
                    delayed(get_accuracy_sequence_active)(
                        estimator, stop_idx, texts, vote_lists, truths,
                        text_similarity, active_pars, idx, False, *args)
                    for idx in range(n_runs))

            # Runs that returned None are dropped.
            good_slices = [s[start_idx:] for s in sequences if s is not None]
            if not good_slices:
                print('Topic %s is not represented with estimator %s' % (topic_id, estimator_name))
                continue

            results = np.vstack(good_slices)
            begin_accuracies = results[:, 0]
            end_accuracies = results[:, -1]
            begin_accuracies.dump("pickles/%s-%s-begin-accuracies--.pkl" % (topic_id, estimator_name))
            end_accuracies.dump("pickles/%s-%s-end-accuracies--.pkl" % (topic_id, estimator_name))

            y_by_estimator[estimator_name].append(np.mean(results, axis=0))

    # Average the per-topic means; estimators without any usable topic are
    # left out of the plot.
    result_by_estimator = {}
    for estimator_name, mean_accuracy_sequences in y_by_estimator.items():
        if mean_accuracy_sequences:
            result_by_estimator[estimator_name] = np.mean(np.vstack(mean_accuracy_sequences), axis=0)
        else:
            print("Nope")

    if comment:
        title = 'Across topics, %s runs, %s' % (n_runs, comment)
    else:
        title = 'Across topics, %s runs' % n_runs
    plot_learning_curve(title, x, result_by_estimator, 'Votes sampled', 'Accuracy')
def cross_validate(self, test_size=0.25, n_iter=100):
    """Plot a learning curve for ``self.clf`` and print its CV accuracy.

    A ``ShuffleSplit`` over the rows of ``self.dataset.matrix`` is built
    from ``n_iter`` and ``test_size`` and used both for the learning-curve
    plot and for ``cross_val_score``.  The raw scores are printed, followed
    by their mean and twice their standard deviation.
    """
    features = self.dataset.matrix
    labels = self.dataset.labels

    splitter = cross_validation.ShuffleSplit(
        features.shape[0], n_iter=n_iter, test_size=test_size)

    plot_learning_curve(
        self.clf, "Learning Curves (Logistic Regression)",
        features, labels, cv=splitter, n_jobs=4)

    scores = cross_validation.cross_val_score(self.clf, features, labels, cv=splitter)
    print(scores)
    spread = scores.std() * 2
    print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), spread))
def plot_learning_curves_for_topic(topic_id, n_runs, votes_per_doc, estimators_dict, comment=None):
    """Plot the run-averaged accuracy curve of each estimator for one topic.

    Args:
        topic_id: key into ``texts_vote_lists_truths_by_topic_id``.
        n_runs: number of accuracy sequences computed per estimator.
        votes_per_doc: ``(min, max)`` pair; multiplied by the number of
            documents to obtain the start and stop sequence indices.
        estimators_dict: maps estimator name to
            ``(estimator, args, active_pars)``.  ``active_pars is None``
            selects ``get_accuracy_sequence``, anything else
            ``get_accuracy_sequence_active``.
        comment: optional text appended to the plot title.
    """
    texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
    n_documents = len(texts)

    X = TfidfVectorizer().fit_transform(texts)
    text_similarity = cosine_similarity(X)

    min_votes_per_doc, max_votes_per_doc = votes_per_doc
    start_idx = int(min_votes_per_doc * n_documents)
    stop_idx = int(max_votes_per_doc * n_documents)
    # x axis is expressed in votes per document rather than absolute votes.
    x = np.arange(float(start_idx), float(stop_idx)) / n_documents

    shared = (estimator_independent for estimator_independent in ())  # placeholder removed below
    del shared

    estimator_y = {}
    for estimator_name, (estimator, args, active_pars) in estimators_dict.items():
        print('Calculating for %s' % estimator_name)

        # Both generators take the same leading arguments; the active one
        # additionally receives active_pars right before the run index.
        leading = (estimator, stop_idx, texts, vote_lists, truths, X, text_similarity)
        if active_pars is None:
            task = get_accuracy_sequence
        else:
            task = get_accuracy_sequence_active
            leading = leading + (active_pars,)
        trailing = tuple(args)

        sequences = Parallel(n_jobs=N_CORES)(
            delayed(task)(*(leading + (run_idx, False) + trailing))
            for run_idx in range(n_runs))

        # Runs that returned None are dropped.
        good_slices = [seq[start_idx:] for seq in sequences if seq is not None]
        if not good_slices:
            print('Query %s is not represented with estimator %s' % (topic_id, estimator_name))
            continue

        # Per-run begin/middle/end accuracies are not pickled here (yet).
        estimator_y[estimator_name] = np.mean(np.vstack(good_slices), axis=0)

    if comment:
        title = 'Query %s, %s runs, %s' % (topic_id, n_runs, comment)
    else:
        title = 'Query %s, %s runs' % (topic_id, n_runs)
    plot_learning_curve(title, x, estimator_y, 'Votes per document', 'Accuracy')
def cross_validate(self, test_size=0.25, n_iter=100):
    """Draw the classifier's learning curve, then report shuffled-split CV accuracy.

    The same ``ShuffleSplit`` (``n_iter`` splits, ``test_size`` held out,
    sized by the row count of ``self.dataset.matrix``) drives both the plot
    and the scoring.  Prints the score array and a
    ``mean (+/- 2 * std)`` summary line.
    """
    n_samples = self.dataset.matrix.shape[0]
    cv = cross_validation.ShuffleSplit(n_samples, n_iter=n_iter, test_size=test_size)

    title = "Learning Curves (Logistic Regression)"
    plot_learning_curve(self.clf, title, self.dataset.matrix,
                        self.dataset.labels, cv=cv, n_jobs=4)

    scores = cross_validation.cross_val_score(
        self.clf, self.dataset.matrix, self.dataset.labels, cv=cv)
    print(scores)
    summary = (scores.mean(), scores.std() * 2)
    print('Accuracy: %0.2f (+/- %0.2f)' % summary)
def run_experiment(self, dataset):
    """Run the SVM experiment suite on ``dataset`` and show its plots.

    Steps, in order:
      1. baseline ``SVC()`` classification report on the test split;
      2. fit ``self.learner_model`` (linear kernel, C=15 for
         'Diabetes Data Set'; RBF kernel, C=10 otherwise), print its
         10-fold CV score and test classification report, plot its
         learning curve;
      3. sweep ``C`` and plot train/test accuracy;
      4. time fit+predict for several test-set fractions and plot time and
         accuracy;
      5. grid-search C/gamma/kernel, report the best estimator and plot its
         learning curve.

    Args:
        dataset: object exposing ``dataset_name``, ``randomness``, ``x``,
            ``y``, ``train_x``, ``train_y``, ``test_x`` and ``test_y``.
    """
    # Everything that differs between the two datasets lives here; the
    # pipeline below is shared.
    if dataset.dataset_name == 'Diabetes Data Set':
        kernel, base_c = 'linear', 15.0
        sweep_gamma = 'scale'
        c_values = [10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
        sweep_title = 'SVM: Varying C With Gamma as Scale and Kernel Linear'
        grid_c = [15, 20, 25, 30, 35, 40, 50]
    else:
        kernel, base_c = 'rbf', 10.0
        sweep_gamma = 'auto'
        c_values = np.linspace(5, 15, 10)
        sweep_title = 'SVM: Varying C With Gamma as Auto and Kernel RBF'
        grid_c = [10, 10.5, 10.6, 10.7, 10.8, 10.9, 11]

    def make_tuned_svc():
        # Single place that defines the tuned model, so the timing run
        # below can use an identical but separate instance.
        return SVC(C=base_c, kernel=kernel, degree=3, gamma='scale',
                   random_state=dataset.randomness)

    # 1. Baseline with library defaults.
    baseline = SVC()
    baseline.fit(dataset.train_x, dataset.train_y)
    print(classification_report(dataset.test_y, baseline.predict(dataset.test_x)))

    # 2. Tuned model.
    self.learner_model = make_tuned_svc()
    self.learner_model.fit(dataset.train_x, dataset.train_y)
    scores = cross_val_score(self.learner_model, dataset.x, dataset.y, cv=10)
    print("mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()), end="\n\n")
    predictions = self.learner_model.predict(dataset.test_x)
    print("Classification Report")
    # classification_report expects (y_true, y_pred); the reverse order
    # swaps precision and recall in the printed table.
    print(classification_report(dataset.test_y, predictions))
    curve = plots.plot_learning_curve(self.learner_model, "SVM Learning Curve",
                                      dataset.x, dataset.y, cv=5, n_jobs=4)
    curve.show(block=False)
    plt.show()

    # 3. Train/test accuracy as a function of C.
    train_accuracy = np.empty(len(c_values))
    test_accuracy = np.empty(len(c_values))
    for i, c_value in enumerate(c_values):
        svm = SVC(C=c_value, kernel=kernel, degree=3, gamma=sweep_gamma,
                  random_state=dataset.randomness)
        svm.fit(dataset.train_x, dataset.train_y)
        train_accuracy[i] = svm.score(dataset.train_x, dataset.train_y)
        test_accuracy[i] = svm.score(dataset.test_x, dataset.test_y)

    plt.title(sweep_title)
    plt.plot(c_values, test_accuracy, label='Testing Accuracy')
    plt.plot(c_values, train_accuracy, label='Training Accuracy')
    plt.xticks(np.arange(min(c_values), max(c_values) + 1, 1.0))
    plt.legend()
    plt.xlabel('C')
    plt.ylabel('Accuracy')
    plt.show(block=False)
    plt.show()

    # 4. Fit+predict time and accuracy for growing test fractions.  A
    # separate model is timed so that self.learner_model stays fitted on
    # the real training split instead of the last (smallest) timing split.
    timing_model = make_tuned_svc()
    test_fractions = np.linspace(0.1, 0.9, 5)
    time_taken = np.empty(len(test_fractions))
    accuracy_scores = np.empty(len(test_fractions))
    for i, fraction in enumerate(test_fractions):
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.x, dataset.y, test_size=fraction, random_state=42)
        start_time = time.time()
        timing_model.fit(X_train, y_train)
        y_pred = timing_model.predict(X_test)
        time_taken[i] = time.time() - start_time
        accuracy_scores[i] = accuracy_score(y_test, y_pred)

    plt.title(f'SVM: Varying DataSet Sizes vs Time Taken for Dataset {dataset.dataset_name}')
    plt.plot(test_fractions, time_taken, label='Time Taken Vs TestData Size')
    plt.plot(test_fractions, accuracy_scores, label='Accuracy Vs TestData Size')
    plt.legend()
    plt.xlabel('TestData Size')
    plt.ylabel('Time Taken')
    plt.show(block=False)
    plt.show()
    print(f"Average time taken for SVM algorithms is {np.mean(time_taken)}")

    # 5. Grid search over C, gamma and kernel.
    param_grid = {'C': grid_c,
                  'gamma': ['scale', 'auto'],
                  'kernel': ['linear', 'rbf']}
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, cv=3)
    grid.fit(dataset.train_x, dataset.train_y)
    print(f" Best params for the dataset is {grid.best_params_}")
    print(grid.best_estimator_)
    grid_predictions = grid.best_estimator_.predict(dataset.test_x)
    print(
        f"The classification report for the best estimator is {classification_report(dataset.test_y, grid_predictions)}")
    learning_curve = plots.plot_learning_curve(grid.best_estimator_,
                                               "SVM Learning Curve For best estimator",
                                               dataset.x, dataset.y, cv=5, n_jobs=4)
    learning_curve.show(block=False)
    plt.show()
# create train-test splits X, X_test, y, y_test = train_test_split(data_X, data_y, test_size=0.2, random_state=2018) # plot learning curves with metrics of interest lc_scoring = ['accuracy', 'precision', 'recall', 'roc_auc'] for scoring in lc_scoring: # load default model: Linear SVM with SGD clf_supervised = models.default_model() plt_handle = plots.plot_learning_curve( clf_supervised, 'Supervised Learning Curve (Scorer: {})'.format(scoring), X, y, cv=5, scoring=scoring) plt_handle.show() # train and report test results clf_supervised = models.default_model() clf_supervised.fit(X, y) sup_y_test_preds = clf_supervised.predict(X_test) supervised_results = { 'accuracy': metrics.accuracy(y_test, sup_y_test_preds), 'precision': metrics.precision(y_test, sup_y_test_preds), 'recall': metrics.recall(y_test, sup_y_test_preds), 'gmeans': metrics.g_means(y_test, sup_y_test_preds), 'auc': metrics.auc(y_test, sup_y_test_preds),
# Draw Histogram of errors on test and train plots.dualHist(errors1 = rs.predict(X_test) - y_test, errors2 = rs.predict(X_train) - y_train, label1 = 'test', label2 = 'train', title = 'Prediction Error: test vs train', xlabel = '$', hist_range = [-15,15]) #%% title = 'Learning Curves (Random Forest Regression)' # Cross validation with 100 iterations to get smoother mean test and train # score curves, each time with 20% data randomly selected as a validation set. cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0) plt = plots.plot_learning_curve(rf, title, X, y, cv=cv, n_jobs=5) plt.show() plt.close() #%% # Use classifier from sklearn.ensemble import RandomForestClassifier y = df['classification_y'] # Split df X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, shuffle=False)
def run_experiment(self, dataset):
    """Run the k-NN experiment suite on ``dataset`` and show its plots.

    Steps, in order:
      1. sweep ``n_neighbors`` (40..49 for 'Diabetes Data Set', 6..14
         otherwise) and plot train/test accuracy;
      2. grid-search the k-NN hyper-parameters with 3-fold CV and print the
         best parameters, best CV score and test-set score (for datasets
         other than 'Diabetes Data Set' also the mean cross-validation
         accuracy on the full data);
      3. plot the learning curve of the best estimator on the train split;
      4. time fit+predict of the best estimator for several test-set
         fractions and plot time and accuracy.

    Args:
        dataset: object exposing ``dataset_name``, ``randomness``, ``x``,
            ``y``, ``train_x``, ``train_y``, ``test_x`` and ``test_y``.
    """
    # Everything that differs between the two datasets lives here; the
    # pipeline below is shared.
    if dataset.dataset_name == 'Diabetes Data Set':
        k_range = np.arange(40, 50)
        extra_grid = {
            'leaf_size': [1, 2, 3, 4],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        }
        report_full_cv = False
    else:
        k_range = np.arange(6, 15)
        extra_grid = {}
        report_full_cv = True

    # 1. Train/test accuracy as a function of the number of neighbours.
    train_accuracy = np.empty(len(k_range))
    test_accuracy = np.empty(len(k_range))
    for i, n_neighbors in enumerate(k_range):
        knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform',
                                   metric='manhattan', n_jobs=4)
        knn.fit(dataset.train_x, dataset.train_y)
        train_accuracy[i] = knn.score(dataset.train_x, dataset.train_y)
        test_accuracy[i] = knn.score(dataset.test_x, dataset.test_y)

    plt.title('k-NN: Varying Number of Neighbors')
    plt.plot(k_range, test_accuracy, label='Testing Accuracy')
    plt.plot(k_range, train_accuracy, label='Training Accuracy')
    plt.xticks(np.arange(min(k_range), max(k_range) + 1, 1.0))
    plt.legend()
    plt.xlabel('Number of Neighbors')
    plt.ylabel('Accuracy')
    plt.show(block=False)
    plt.show()

    # 2. Grid search; the neighbour range of the sweep is reused.
    param_grid = {
        'n_neighbors': k_range,
        'weights': ['uniform', 'distance'],
        'metric': ['manhattan', 'euclidean'],
    }
    param_grid.update(extra_grid)
    knn_gscv = GridSearchCV(KNeighborsClassifier(n_jobs=4), param_grid,
                            cv=3, n_jobs=-1, verbose=1)
    knn_gscv.fit(dataset.train_x, dataset.train_y)
    best_knn = knn_gscv.best_estimator_
    print(
        f"Best parameters for this knn algorithm are {knn_gscv.best_params_}"
    )
    print(
        f"Best score for this knn algorithm are {knn_gscv.best_score_}"
    )
    if report_full_cv:
        scores = cross_val_score(best_knn, dataset.x, dataset.y, scoring='accuracy')
        print(f"CV Scores mean GSV Search : {scores.mean()} ")
    print(
        f"Knn GSCV Score {best_knn.score(dataset.test_x, dataset.test_y)}"
    )

    # 3. Learning curve of the selected model.
    learning = plots.plot_learning_curve(best_knn, "Learning Curves (KNN)",
                                         dataset.train_x, dataset.train_y,
                                         cv=5, n_jobs=4)
    learning.show(block=False)
    plt.show()

    # 4. Fit+predict time and accuracy for growing test fractions.  This
    # refits best_knn on each split, which is why it runs last.
    test_fractions = np.linspace(0.1, 0.9, 5)
    time_taken = np.empty(len(test_fractions))
    accuracy_scores = np.empty(len(test_fractions))
    for i, fraction in enumerate(test_fractions):
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.x, dataset.y, test_size=fraction,
            random_state=dataset.randomness)
        start_time = time.time()
        best_knn.fit(X_train, y_train)
        y_pred = best_knn.predict(X_test)
        time_taken[i] = time.time() - start_time
        accuracy_scores[i] = accuracy_score(y_test, y_pred)

    plt.title(
        f'KNN:Time Taken & Accuracy vs TestSet Sizes for {dataset.dataset_name}'
    )
    plt.plot(test_fractions, time_taken, label='Time Taken Vs TestData Size')
    plt.plot(test_fractions, accuracy_scores, label='Accuracy Vs TestData Size')
    plt.legend()
    plt.xlabel('TestData Size')
    plt.ylabel('Time Taken')
    plt.show(block=False)
    plt.show()
    print(
        f"Average time taken for KNN algorithms is {np.mean(time_taken)}"
    )
def run_experiment(self, dataset):
    """Run the decision-tree experiment suite on ``dataset`` and show its plots.

    Steps, in order:
      1. fit an unpruned tree as ``self.learner_model`` and print its test
         accuracy;
      2. fit one tree per cost-complexity alpha on the pruning path (the
         last alpha is left out) and plot train/test accuracy;
      3. print the test accuracy of a tree pruned with a fixed alpha
         (0.01 for 'Diabetes Data Set', 0.00234 otherwise);
      4. grid-search tree hyper-parameters with 5-fold CV, print the best
         parameters and test accuracy, plot the learning curve;
      5. sweep ``max_depth`` and plot train/test accuracy;
      6. time fit+predict of the grid-search winner for several test-set
         fractions and plot time and accuracy.

    Args:
        dataset: object exposing ``dataset_name``, ``randomness``, ``x``,
            ``y``, ``train_x``, ``train_y``, ``test_x`` and ``test_y``.
    """
    # Everything that differs between the two datasets lives here; the
    # pipeline below is shared.
    if dataset.dataset_name == 'Diabetes Data Set':
        pruned_alpha = 0.01
        param_grid = {
            "criterion": ["gini", "entropy"],
            "min_samples_split": [6, 7, 8],
            "max_depth": [5, 10, 15, 18],
            "min_samples_leaf": [1, 2, 4],
            "max_leaf_nodes": [26, 28, 29, 30, 32],
        }
        depth_range = np.arange(15, 25)
    else:
        pruned_alpha = 0.00234
        param_grid = {
            "criterion": ["gini", "entropy"],
            "min_samples_split": [2, 3, 4],
            "max_depth": [5, 6, 7, 8, 10, 12, 14],
            "min_samples_leaf": [2, 3, 4, 6, 8],
            "max_leaf_nodes": [30, 32, 34, 36, 38, 40, 42, 44],
        }
        depth_range = np.arange(5, 15)

    # 1. Unpruned tree.
    self.learner_model = tree.DecisionTreeClassifier(
        random_state=dataset.randomness)
    self.learner_model.fit(dataset.train_x, dataset.train_y)
    y_test_pred = self.learner_model.predict(dataset.test_x)
    print(
        f'Test score for simple tree {accuracy_score(y_test_pred, dataset.test_y)}'
    )

    # 2. Post-pruning: accuracy along the cost-complexity pruning path.
    # The last alpha is excluded from the plot, so no tree is fitted for it.
    path = self.learner_model.cost_complexity_pruning_path(
        dataset.train_x, dataset.train_y)
    plotted_alphas = path.ccp_alphas[:-1]
    train_acc = []
    test_acc = []
    for ccp_alpha in plotted_alphas:
        pruned = tree.DecisionTreeClassifier(
            random_state=dataset.randomness, ccp_alpha=ccp_alpha)
        pruned.fit(dataset.train_x, dataset.train_y)
        train_acc.append(accuracy_score(dataset.train_y, pruned.predict(dataset.train_x)))
        test_acc.append(accuracy_score(dataset.test_y, pruned.predict(dataset.test_x)))

    plt.title('Decision Trees: Varying CCP ALPHAS')
    plt.plot(plotted_alphas, test_acc, label='Testing Accuracy')
    plt.plot(plotted_alphas, train_acc, label='Training Accuracy')
    plt.legend()
    plt.xlabel('CCP Alpha Values')
    plt.ylabel('Accuracy')
    plt.show(block=False)
    plt.show()

    # 3. Tree pruned with the alpha chosen for this dataset.
    fixed_alpha_tree = tree.DecisionTreeClassifier(
        random_state=dataset.randomness, ccp_alpha=pruned_alpha)
    fixed_alpha_tree.fit(dataset.train_x, dataset.train_y)
    y_test_pred = fixed_alpha_tree.predict(dataset.test_x)
    print(
        f'Test score cost complexity pruning {accuracy_score(y_test_pred, dataset.test_y)}'
    )

    # 4. Grid search over pre-pruning hyper-parameters.
    ts_gs = GridSearchCV(estimator=DecisionTreeClassifier(),
                         param_grid=param_grid, cv=5)
    ts_gs.fit(dataset.train_x, dataset.train_y)
    model = ts_gs.best_estimator_
    print("\n\n-- Testing best parameters [Grid]...")
    print(ts_gs.best_params_)
    y_test_pred = model.predict(dataset.test_x)
    print(f'Test score {accuracy_score(y_test_pred, dataset.test_y)}')
    learning_curve = plots.plot_learning_curve(
        model, "Learning Curves (Decision Trees)",
        dataset.x, dataset.y, cv=5, n_jobs=4)
    learning_curve.show()

    # 5. Train/test accuracy as a function of max_depth.
    train_accuracy = np.empty(len(depth_range))
    test_accuracy = np.empty(len(depth_range))
    for i, max_depth in enumerate(depth_range):
        depth_tree = DecisionTreeClassifier(criterion='entropy',
                                            max_depth=max_depth,
                                            min_samples_leaf=2,
                                            min_samples_split=6,
                                            max_leaf_nodes=28)
        depth_tree.fit(dataset.train_x, dataset.train_y)
        train_accuracy[i] = depth_tree.score(dataset.train_x, dataset.train_y)
        test_accuracy[i] = depth_tree.score(dataset.test_x, dataset.test_y)

    print(
        f"Average accuracy for all algorithms {np.mean(test_accuracy)}"
    )
    plt.title('Decision Trees: Varying max_depth')
    plt.plot(depth_range, test_accuracy, label='Testing Accuracy')
    plt.plot(depth_range, train_accuracy, label='Training Accuracy')
    plt.xticks(np.arange(min(depth_range), max(depth_range) + 1, 1.0))
    plt.legend()
    plt.xlabel('max_depth')
    plt.ylabel('Accuracy')
    plt.show(block=False)
    plt.show()

    # 6. Fit+predict time and accuracy for growing test fractions.  This
    # refits the grid-search winner on each split, which is why it runs last.
    test_fractions = np.linspace(0.1, 0.9, 5)
    time_taken = np.empty(len(test_fractions))
    accuracy_scores = np.empty(len(test_fractions))
    for i, fraction in enumerate(test_fractions):
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.x, dataset.y, test_size=fraction,
            random_state=dataset.randomness)
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        time_taken[i] = time.time() - start_time
        accuracy_scores[i] = accuracy_score(y_test, y_pred)

    plt.title(
        f'Decision Trees: Time Taken & Accuracy vs TestData Sizes for {dataset.dataset_name}'
    )
    plt.plot(test_fractions, time_taken, label='Time Taken Vs TestData Size')
    plt.plot(test_fractions, accuracy_scores, label='Accuracy Vs TestData Size')
    plt.legend()
    plt.xlabel('TestData Size')
    plt.ylabel('Time Taken')
    plt.show(block=False)
    plt.show()
    print(
        f"Average time taken for Decision Trees algorithms is {np.mean(time_taken)}"
    )
def run_experiment(self, dataset):
    """Run the AdaBoost experiment suite on ``dataset``.

    Steps, in order:
      1. fit a cost-complexity-pruned decision tree (alpha 0.01 for
         'Diabetes Data Set', 0.00234 otherwise) and an
         ``AdaBoostClassifier`` built on it; print test accuracy and
         classification report for both;
      2. print the mean accuracy of the boosted model under repeated
         stratified 10-fold cross-validation (3 repeats);
      3. grid-search the tree splitter and ``n_estimators`` with 5-fold CV,
         print best parameters, test accuracy and classification report,
         and plot the learning curve of the best estimator.

    Args:
        dataset: object exposing ``dataset_name``, ``randomness``, ``x``,
            ``y``, ``train_x``, ``train_y``, ``test_x`` and ``test_y``.
    """
    # Everything that differs between the two datasets lives here; the
    # pipeline below is shared.
    if dataset.dataset_name == 'Diabetes Data Set':
        ccp_alpha = 0.01
        n_estimators_grid = [50, 100, 1, 2, 10, 20, 30, 40]
    else:
        ccp_alpha = 0.00234
        n_estimators_grid = [50, 100, 150, 200]

    base_tree = tree.DecisionTreeClassifier(random_state=dataset.randomness,
                                            ccp_alpha=ccp_alpha)
    boosted = AdaBoostClassifier(base_tree)

    # The wrapped-estimator parameter is called `estimator` on newer
    # scikit-learn releases and `base_estimator` on older ones; pick the
    # name this installation exposes so the nested grid key resolves.
    inner_name = ('estimator' if 'estimator' in boosted.get_params(deep=False)
                  else 'base_estimator')
    param_grid = {
        inner_name + "__splitter": ["best", "random"],
        "n_estimators": n_estimators_grid,
    }

    # 1. Single pruned tree vs. boosted trees.
    base_tree.fit(dataset.train_x, dataset.train_y)
    boosted.fit(dataset.train_x, dataset.train_y)
    y_1 = base_tree.predict(dataset.test_x)
    y_2 = boosted.predict(dataset.test_x)

    # classification_report expects (y_true, y_pred); the reverse order
    # swaps precision and recall in the printed table.
    print(f"Accuracy of the model regr_1 is {accuracy_score(dataset.test_y, y_1)}")
    print("Classification Report")
    print(classification_report(dataset.test_y, y_1))
    print(f"Accuracy of the model regr_2 is {accuracy_score(dataset.test_y, y_2)}")
    print("Classification Report")
    print(classification_report(dataset.test_y, y_2))

    # 2. Cross-validated accuracy of the boosted model on the full data.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(boosted, dataset.x, dataset.y, scoring='accuracy',
                               cv=cv, n_jobs=-1, error_score='raise')
    print(f"Cross Validation Score is {np.mean(n_scores)}")

    # 3. Grid search.
    grid_search = GridSearchCV(boosted, param_grid=param_grid,
                               scoring='accuracy', cv=5)
    grid_result = grid_search.fit(dataset.train_x, dataset.train_y)
    print(f"Best parameters are {grid_result.best_params_}")
    best_model = grid_result.best_estimator_
    predictions = best_model.predict(dataset.test_x)
    print(f"Accuracy of the model is {accuracy_score(dataset.test_y, predictions)}")
    print("Classification Report of Model")
    print(classification_report(dataset.test_y, predictions))

    learning_curve = plots.plot_learning_curve(best_model,
                                               "Learning Curves (AdaBoost)",
                                               dataset.x, dataset.y, cv=5, n_jobs=4)
    learning_curve.show(block=False)