def q2():
    """Run the per-fold Naive Bayes experiment for every model variant.

    The spambase data is loaded through ``hw3`` and split into ``k = 10``
    folds.  For each of the four model types (indices 0-3, labelled
    Bernoulli, Gaussian, 4-bins and 9-bins) one ``nb.NaiveBayes`` model is
    trained on each of the first ``k - 1`` folds and scored on that same
    fold.  The per-fold models are then merged into one combined model,
    which is scored on the last fold.  Per-fold results go through
    ``print_output`` and the summary through ``print_test_output``.
    """
    model_names = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    k_folds = hw3.partition_folds(spamData, k)
    n_train_folds = k - 1

    for model_type, model_name in enumerate(model_names):
        print('\nModel: {}'.format(model_name))

        # Only model type 0 gets a non-zero alpha for the per-fold models.
        if model_type == 0:
            alpha = .001
        else:
            alpha = 0

        fold_models = []
        train_acc_sum = 0
        for ki in range(n_train_folds):
            fold_model = nb.NaiveBayes(model_type, alpha=alpha)
            truth_rows, data_rows, _, _ = hw3.get_data_and_mus(k_folds[ki])
            fold_model.train(data_rows, truth_rows)

            # Accuracy is measured on the same fold the model was trained on.
            fold_predictions = fold_model.predict(data_rows)
            print(fold_predictions)
            accuracy = hw3.get_accuracy(fold_predictions, truth_rows)
            train_acc_sum += accuracy
            print_output(ki, accuracy)
            fold_models.append(fold_model)

        # The combined model always uses alpha=.001, whatever the model type.
        nb_combined = nb.NaiveBayes(model_type, alpha=.001)
        # Types 0 and 1 are merged with aggregate_model, the binned types
        # (2 and 3) with aggregate_model3.
        aggregate = (nb_combined.aggregate_model if model_type < 2
                     else nb_combined.aggregate_model3)
        aggregate(fold_models)

        # The last fold is the only one no per-fold model was trained on.
        truth_rows, data_rows, _, _ = hw3.get_data_and_mus(k_folds[k - 1])
        test_predictions = nb_combined.predict(data_rows)
        test_accuracy = hw3.get_accuracy(test_predictions, truth_rows)
        mean_train_accuracy = float(train_acc_sum) / n_train_folds
        # NOTE(review): arguments are passed as (test, mean train) here while
        # GaussianNB in this file passes (train, test) -- confirm against the
        # signature of print_test_output.
        print_test_output(test_accuracy, mean_train_accuracy)

    # NOTE(review): the values unpacked here are never read; the call is kept
    # only to preserve the original behaviour -- confirm it can be dropped.
    truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(spamData)
def TreeTest():
    """Fit a depth-2 ``decTree.TreeOptimal`` per fold and print its errors.

    The data returned by ``spamData()`` is split into ``k = 10`` folds.  For
    every fold except the last, a tree is fit on that fold and its error on
    the same fold is printed and recorded.  A separate tree is then fit and
    scored on the last fold.  Finally the mean of the recorded errors is
    printed as the training error, next to the last-fold error.
    """
    k = 10
    spamDat = spamData()
    all_folds = hw3.partition_folds(spamDat, k)

    fold_sizes = []
    fold_errors = []
    for fold_idx in range(len(all_folds) - 1):
        fold = all_folds[fold_idx]
        fold_sizes.append(len(fold))

        truth, f_data = decTree.split_truth_from_data(fold)
        tree = decTree.TreeOptimal(max_depth=2)
        tree.fit(f_data, truth)

        print('Prediction...\n')
        predict = tree.predict(f_data)
        print(predict)
        print(truth)

        # Error is measured on the same rows the tree was fit on.
        fold_error = 1. - hw3.get_accuracy(predict, truth)
        fold_errors.append(fold_error)
        print('Tree error is: {}'.format(fold_error))

    # NOTE(review): the value reported below as "testing error" comes from a
    # tree that is fit on the last fold and then scored on that same fold, so
    # it is not a held-out measurement -- confirm this is intended.
    truth, f_data = decTree.split_truth_from_data(all_folds[k - 1])
    last_tree = decTree.TreeOptimal(max_depth=2)
    last_tree.fit(f_data, truth)
    last_predict = last_tree.predict(f_data)
    last_fold_error = 1. - hw3.get_accuracy(last_predict, truth)

    # Unweighted mean of the per-fold errors; the fold sizes are recorded but
    # only their count is used.
    average_training_error = float(sum(fold_errors)) / len(fold_sizes)
    print('Average training error: {}\nAverage testing error: {}'.format(
        average_training_error, last_fold_error))
def q2_plots(roc_path_template='/Users/Admin/Dropbox/ML/MachineLearning_CS6140/'
                               'CS6140_A_MacLeay/Homeworks/roc_{}.pdf',
             num_points=50):
    """Plot one ROC curve per Naive Bayes model variant.

    For each of the four model types (indices 0-3, labelled Bernoulli,
    Gaussian, 4-bins and 9-bins) an ``nb.NaiveBayes`` model is trained on
    the first fold only.  It then predicts on that same fold at
    ``num_points + 2`` evenly spaced thresholds from 0 to 1 inclusive, and
    each prediction is fed to a ``ROC.ROC`` collector.  The collector
    writes the plot and prints its summary.

    Args:
        roc_path_template: Output path for the plots.  Must contain one
            ``{}`` placeholder, which is filled with the model type index.
            The default is the location that used to be hard-coded.
        num_points: Number of thresholds strictly between 0 and 1; the two
            end points are always added.

    Raises:
        ValueError: If ``num_points`` is negative.
    """
    if num_points < 0:
        # num_points == -1 would otherwise divide by zero below.
        raise ValueError('num_points must be >= 0, got {}'.format(num_points))

    models = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    k_folds = hw3.partition_folds(spamData, k)
    first_fold = k_folds[0]

    for model_type, model_name in enumerate(models):
        roc = ROC.ROC()
        print('\nModel: {}'.format(model_name))

        # Only model type 0 is built with a non-zero alpha.
        alpha = .001 if model_type == 0 else 0
        nb_model = nb.NaiveBayes(model_type, alpha=alpha)
        truth_rows, data_rows, _, _ = hw3.get_data_and_mus(first_fold)
        nb_model.train(data_rows, truth_rows)

        # Sweep the decision threshold over [0, 1]; predictions are made on
        # the training fold itself.
        for ti in range(num_points + 2):
            theta = ti * 1. / (num_points + 1)
            predict = nb_model.predict(data_rows, theta)
            print(predict)
            roc.add_tp_tn(predict, truth_rows, theta)

        roc.plot_ROC(roc_path_template.format(model_type))
        roc.print_info()
def GaussianNB(X, num_features=None, k=10):
    """Cross-validate a Naive Bayes classifier on ``X`` and print accuracies.

    ``X`` is split into ``k`` folds.  For each fold index, a fresh
    classifier is fit on ``hw5u.group_fold(k_folds, ki)`` (presumably all
    folds except ``ki`` -- confirm against ``hw5u``) and scored both on
    that training group and on fold ``ki``.  Per-fold accuracies are
    reported through ``print_output`` and the two averages through
    ``print_test_output``.

    NOTE(review): despite the function name, the classifier that is fit is
    ``BernoulliNB`` -- confirm this is intended.

    Args:
        X: Data set containing both the truth values and the features, in
            the layout ``utils.split_truth_from_data`` expects.
        num_features: If not ``None``, keep only this many features, chosen
            by ``SelectKBest``, before folding.
        k: Number of folds.  Defaults to 10, the previously fixed value.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # With no folds the averages below would divide by zero.
        raise ValueError('k must be >= 1, got {}'.format(k))

    if num_features is not None:
        # Select features on the feature columns only, then re-attach the
        # truth values so the rest of the pipeline sees the same layout.
        y, X = utils.split_truth_from_data(X)
        selector = SelectKBest(k=num_features).fit(X, y)
        X = utils.add_row(selector.transform(X), y)

    k_folds = hw3u.partition_folds(X, k)
    train_acc_sum = 0
    test_acc_sum = 0

    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        # The result is not used; the call is kept in case check_cols has
        # side effects -- TODO confirm and drop if it is pure.
        check_cols(grouped_fold)

        nb_model = BernoulliNB()
        print('len of kfolds {}'.format(len(grouped_fold)))
        truth_rows, data_rows = utils.split_truth_from_data(grouped_fold)
        print('len of data {}'.format(len(data_rows)))
        nb_model.fit(data_rows, truth_rows)

        # Training accuracy: score on the rows the model was fit on.
        predict = nb_model.predict(data_rows)
        accuracy = hw3u.get_accuracy(predict, truth_rows)
        train_acc_sum += accuracy
        print_output(ki, accuracy)

        # Test accuracy: score on fold ki.
        truth_rows, data_rows = utils.split_truth_from_data(k_folds[ki])
        test_predict = nb_model.predict(data_rows)
        test_accuracy = hw3u.get_accuracy(test_predict, truth_rows)
        test_acc_sum += test_accuracy
        print_output(ki, test_accuracy, 'test')

    # NOTE(review): arguments are passed as (train, test) here while q2 in
    # this file passes (test, train) -- confirm against print_test_output.
    print_test_output(float(train_acc_sum) / k, float(test_acc_sum) / k)