def SpamClassifier(features, skclassifier, myclassifier):
    """Compare a custom classifier with a scikit-learn one on the spam data.

    Loads the normalized spam data, optionally restricts it to the requested
    columns, partitions it into 10 folds and holds out fold 0 for testing.
    Both classifiers are wrapped in ``hw7u.KNN`` and the accuracy of each on
    the held-out fold is printed.

    Args:
        features: ``'all'`` to keep every column, otherwise a collection of
            column indices to keep.
        skclassifier: classifier wrapped by ``hw7u.KNN``; reported as "SciKit".
        myclassifier: classifier wrapped by ``hw7u.KNN``; reported as "My".
    """
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    if features != 'all':
        # Only use the features passed in the features array.
        columns = utils.transpose_array(data)
        wanted = set(features)
        # NOTE(review): the truth label is assumed to be the last column, so
        # it is always kept for split_truth_from_data below -- confirm.
        label_index = len(columns) - 1
        selected = [column for index, column in enumerate(columns)
                    if index in wanted or index == label_index]
        # Rebuild the rows from the selected columns, not the full set.
        data = utils.transpose_array(selected)

    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    print('start MyKNN')
    knn = hw7u.KNN(classifier=myclassifier)
    print('start scikit')
    knnsci = hw7u.KNN(classifier=skclassifier)

    print('start my pred')
    y_pred = knn.predict(X_test, X, y)
    my_accuracy = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('My Accuracy: {}'.format(my_accuracy))

    print('start sk pred')
    y_sci = knnsci.predict(X_test, X, y)
    sk_accuracy = accuracy_score(fix_y(y_test), fix_y(y_sci))
    print('SciKit Accuracy: {} My Accuracy: {}'.format(sk_accuracy, my_accuracy))
def runDigitsDensity(n, _i, j):
    """Evaluate the density-mode ``hw7u.MyKNN`` on MNIST features.

    The features are split into 10 folds; fold 0 is the test set. The
    classifier is wrapped in ``OneVsRestClassifier`` and its accuracy on the
    held-out fold is printed together with the true and predicted labels.

    Args:
        n: size argument passed to ``hw6u.load_mnist_features``.
        _i: unused.
        j: index into the metric list
            ``['minkowski', 'cosine', 'gaussian', 'poly2']``.
    """
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    myclf = hw7u.MyKNN(metric=metric[j], density=True)

    df = hw6u.load_mnist_features(n)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    # replace_zeros=False: labels are passed through as loaded.
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    # The builtin ``float`` is what the removed ``np.float`` alias pointed to.
    y, X = np.asarray(y, dtype=float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=float)

    print('my fit')
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print('scikit fit')
    print('my predict')
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print('({})'.format(myacc))
    print(y_test)
    print(y_pred)
    print('My Accuracy: {}'.format(myacc))
def q1():
    """Feature analysis with Adaboost.

    Fits ``adac.AdaboostOptimal`` (100 rounds, depth-1 decision trees with
    ``splitter='best'``) on the training part of fold 0 of the polluted spam
    data, prints the feature ranking derived from the margin fractions, and
    prints the accuracy on the held-out fold.
    """
    spam_rows = utils.load_and_normalize_polluted_spam_data()
    n_folds = 10
    folds = hw3u.partition_folds(spam_rows, n_folds)

    # No real cross-validation here: a single split is enough to obtain
    # feature weights.
    train_rows, test_rows = dl.get_train_and_test(folds, 0)
    y, X = hw4u.split_truth_from_data(train_rows)
    y_test, X_test = hw4u.split_truth_from_data(test_rows)

    def make_stump():
        return DecisionTreeClassifier(max_depth=1, splitter='best')

    booster = adac.AdaboostOptimal(max_rounds=100, do_fast=False,
                                   learner=make_stump)
    booster.fit(X, y)

    fractions = get_margin_fractions(booster, X[0])
    print_ranks(rank(fractions))

    predictions = booster.predict(X_test)
    truth_checked = booster._check_y_not_zero(y_test)
    predictions_checked = booster._check_y_not_zero(predictions)
    print('Accuracy: {}'.format(accuracy_score(truth_checked, predictions_checked)))
def runDigits(n, skclf, myclf):
    """Compare a custom classifier with a scikit-learn one on MNIST features.

    The features are split into 10 folds; fold 0 is the test set. ``myclf``
    is wrapped in ``OneVsRestClassifier``, ``skclf`` is fitted directly, and
    both accuracies on the held-out fold are printed.

    Args:
        n: size argument passed to ``hw6u.load_mnist_features``.
        skclf: estimator with ``fit``/``predict``; reported as "SciKit".
        myclf: estimator wrapped in ``OneVsRestClassifier``; reported as "My".
    """
    df = hw6u.load_mnist_features(n)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    # replace_zeros=False: labels are passed through as loaded.
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    # The builtin ``float`` is what the removed ``np.float`` alias pointed to.
    y, X = np.asarray(y, dtype=float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=float)

    print('my fit')
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print('scikit fit')
    skclf = skclf.fit(X, y)

    print('my predict')
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print('({})'.format(myacc))

    print('scikit predict')
    sk_pred = skclf.predict(X_test)
    print(sk_pred)
    print(y_test)
    print(y_pred)
    print('SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc))
def q1():
    """GDA: run Gaussian Discriminant Analysis on the spambase data.

    Assignment text: use the k folds from the previous problem (1 for
    testing, k-1 for training, for each fold). With 57 real-valued features,
    each of the two Gaussians (one for the + class, one for the - class) has
    a mean vector with 57 components. They either share a common 57x57
    covariance matrix estimated from all training data, or use two separate
    57x57 covariance matrices estimated per class. Looking at the training
    and testing performance, does the Gaussian assumption appear to hold for
    this data set?

    What this function does: for each of the first k-1 folds it trains an
    ``hw3.GDA`` on that fold with the covariance from ``hw3.get_covar``,
    predicts on the same fold and reports the score via ``print_output``.
    """
    # pandas_to_data returns an array of arrays, organised by row.
    spam_rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    n_folds = 10
    folds = hw3.partition_folds(spam_rows, n_folds)

    fitted_models = []
    for fold_index in range(n_folds - 1):
        model = hw3.GDA()
        X, truth = hw3.separate_X_and_y(folds[fold_index])
        covariance = hw3.get_covar(X)
        # Class prior taken as the mean of the truth values.
        model.p_y = float(sum(truth)) / len(truth)
        model.train(X, covariance, truth)

        fold_predictions = model.predict(X)
        fold_score = mystats.get_error(fold_predictions, truth, True)
        print_output(fold_index, fold_score)
        fitted_models.append(model)
def TreeTest():
    """Fit depth-2 ``decTree.TreeOptimal`` trees per fold and print their errors.

    For every fold except the last, a tree is fitted on the fold and scored
    on that same fold; the mean of those errors is printed as the average
    training error. A further tree is fitted and scored on the last fold.

    NOTE(review): the printed "testing error" comes from a tree that was
    fitted on the very fold it is scored on -- confirm this is intended.
    """
    k = 10
    all_folds = hw3.partition_folds(spamData(), k)

    fold_sizes = []
    fold_errors = []
    for fold in all_folds[:-1]:
        fold_sizes.append(len(fold))
        truth, f_data = decTree.split_truth_from_data(fold)
        tree = decTree.TreeOptimal(max_depth=2)
        tree.fit(f_data, truth)
        print('Prediction...\n')
        predicted = tree.predict(f_data)
        print(predicted)
        print(truth)
        fold_error = 1. - hw3.get_accuracy(predicted, truth)
        fold_errors.append(fold_error)
        print('Tree error is: {}'.format(fold_error))

    # Last fold: fit and score on the same data.
    truth, f_data = decTree.split_truth_from_data(all_folds[k - 1])
    last_tree = decTree.TreeOptimal(max_depth=2)
    last_tree.fit(f_data, truth)
    error = 1. - hw3.get_accuracy(last_tree.predict(f_data), truth)

    average_training_error = float(sum(fold_errors)) / len(fold_sizes)
    print('Average training error: {}\nAverage testing error: {}'.format(average_training_error, error))
def q2():
    """Train and evaluate four Naive Bayes variants on the spambase folds.

    For each model type (Bernoulli, Gaussian, 4-bins, 9-bins) one
    ``nb.NaiveBayes`` model is trained per fold on the first k-1 folds and
    scored on its own training fold. The per-fold models are then aggregated
    into one model, which is scored on the last fold.
    """
    model_names = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spam_rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    n_folds = 10
    folds = hw3.partition_folds(spam_rows, n_folds)

    for model_type, model_name in enumerate(model_names):
        print('\nModel: {}'.format(model_name))
        accuracy_total = 0
        fold_models = []
        for fold_index in range(n_folds - 1):
            # Only the first (Bernoulli) model type gets a non-zero alpha.
            if model_type == 0:
                alpha = .001
            else:
                alpha = 0
            fold_model = nb.NaiveBayes(model_type, alpha=alpha)
            truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(folds[fold_index])
            fold_model.train(data_rows, truth_rows)
            predicted = fold_model.predict(data_rows)
            print(predicted)
            fold_accuracy = hw3.get_accuracy(predicted, truth_rows)
            accuracy_total += fold_accuracy
            print_output(fold_index, fold_accuracy)
            fold_models.append(fold_model)

        combined = nb.NaiveBayes(model_type, alpha=.001)
        # The first two model types use aggregate_model, the binned ones
        # use aggregate_model3.
        if model_type < 2:
            combined.aggregate_model(fold_models)
        else:
            combined.aggregate_model3(fold_models)

        truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(folds[n_folds - 1])
        test_accuracy = hw3.get_accuracy(combined.predict(data_rows), truth_rows)
        print_test_output(test_accuracy, float(accuracy_total) / (n_folds - 1))

    # Result is not used afterwards; the call is retained as in the original.
    truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(spam_rows)
def q2_plots():
    """Plot a ROC curve for each of the four Naive Bayes variants.

    For every model type a model is trained on fold 0 and predictions on
    that fold are made for ``num_points + 2`` evenly spaced thresholds in
    [0, 1]. Each set of predictions is added to a ``ROC.ROC`` object, which
    is written to a per-model PDF and summarised with ``print_info``.
    """
    model_names = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spam_rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    n_folds = 10
    num_points = 50
    folds = hw3.partition_folds(spam_rows, n_folds)
    fold_index = 0  # only the first fold is used

    for model_type, model_name in enumerate(model_names):
        roc = ROC.ROC()
        print('\nModel: {}'.format(model_name))

        # Only the first (Bernoulli) model type gets a non-zero alpha.
        if model_type == 0:
            alpha = .001
        else:
            alpha = 0
        model = nb.NaiveBayes(model_type, alpha=alpha)
        truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(folds[fold_index])
        model.train(data_rows, truth_rows)

        for step in range(num_points + 2):
            theta = float(step) / (num_points + 1)
            predicted = model.predict(data_rows, theta)
            print(predicted)
            # Accuracy value is not used further; the call is retained.
            hw3.get_accuracy(predicted, truth_rows)
            roc.add_tp_tn(predicted, truth_rows, theta)

        roc.plot_ROC('/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks/roc_{}.pdf'.format(model_type))
        roc.print_info()
def tests_radius():
    """Compare the radius-based ``hw7u.MyKNN`` with scikit-learn on test data.

    The labels from ``testData()`` are appended to the features as a last
    column, the rows are split into 10 folds with fold 0 held out, and the
    accuracies of ``RadiusNeighborsClassifier`` and ``hw7u.MyKNN`` (both
    wrapped in ``hw7u.KNN``) on the held-out fold are printed.
    """
    radius_choices = [1e-1, 1e-2, 1e-3]
    metric_choices = ['minkowski', 'cosine', 'gaussian', 'poly2']
    chosen_radius = radius_choices[0]
    chosen_metric = metric_choices[0]
    n_folds = 10

    features, labels = testData()
    # Append the labels as the last column, then convert rows to lists.
    stacked = np.concatenate([features, labels.reshape((len(labels), 1))], axis=1)
    rows = [list(row.ravel()) for row in stacked]

    # Kernel function is not used below; the lookup is retained as-is.
    _kernel_fn = speedy.Kernel(ktype=chosen_metric).compute

    print('spam radius is {}'.format(chosen_radius))
    clf = hw7u.MyKNN(radius=chosen_radius, metric=chosen_metric, outlier_label=-1)
    skclf = RadiusNeighborsClassifier(radius=chosen_radius, algorithm='brute',
                                      metric="euclidean", p=2, outlier_label=.5)

    all_folds = hw3u.partition_folds(rows, n_folds)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    print('start scikit')
    knnsci = hw7u.KNN(classifier=skclf)
    print('start MyKNN')
    knn = hw7u.KNN(classifier=clf)

    print('start sk pred')
    y_sci = knnsci.predict(X_test, X, y)
    print('start my pred')
    y_pred = knn.predict(X_test, X, y)
    print(y_pred)

    sk_accuracy = accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci))
    my_accuracy = accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred))
    print('SciKit Accuracy: {} My Accuracy: {}'.format(sk_accuracy, my_accuracy))
def q3():  # Got points off b/c I have 89 accuracy instead of 92
    """Logistic Regression.

    Runs 10-fold cross-validation on the polluted spam data. In each fold a
    ``lm.LogisticRegression`` and an ``hw5u.Ridge`` model are fitted on the
    grouped training folds; predictions are thresholded at 0.5 and the
    train/test accuracies of both models are printed, followed by the
    averages over all folds.
    """
    def to_labels(scores):
        # Threshold raw predictions at 0.5 to obtain 0/1 labels.
        return [1 if p >= .5 else 0 for p in scores]

    data = utils.load_and_normalize_polluted_spam_data()
    k = 10
    k_folds = hw3u.partition_folds(data, k)
    train_acc = []
    test_acc = []
    hw2_train_acc = []
    hw2_test_acc = []

    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        y, X = utils.split_truth_from_data(grouped_fold)
        y_truth, X_test = utils.split_truth_from_data(k_folds[ki])

        clf = lm.LogisticRegression()
        ridge_clf = hw5u.Ridge()
        clf.fit(X, y)
        ridge_clf.fit(X, y)

        y_train = to_labels(clf.predict(X))
        y_test = to_labels(clf.predict(X_test))
        yhat_ridge_train = to_labels(ridge_clf.predict(X))
        yhat_ridge_test = to_labels(ridge_clf.predict(X_test))

        train_acc.append(accuracy_score(y, y_train))
        test_acc.append(accuracy_score(y_truth, y_test))
        hw2_train_acc.append(accuracy_score(y, yhat_ridge_train))
        hw2_test_acc.append(accuracy_score(y_truth, yhat_ridge_test))
        print('Fold {} train acc: {} test acc: {} HW2 ridge train: {} HW2 ridge test: {}'.format(
            ki + 1, train_acc[-1], test_acc[-1], hw2_train_acc[-1], hw2_test_acc[-1]))

    # One placeholder per average: str.format silently ignores surplus
    # positional arguments, which previously hid the ridge test average.
    print('Average acc - Train: {} Test: {} HW2 ridge train: {} HW2 ridge test: {}'.format(
        np.mean(train_acc), np.mean(test_acc), np.mean(hw2_train_acc), np.mean(hw2_test_acc)))
def q1():
    """Run AdaBoost on fold 0 of the spambase data and plot errors and ROC.

    Fold 0 is held out for testing. After ``run_adaboost`` returns the
    fitted model, the training ROC AUC is printed, the test error of every
    stored snapshot is recorded in ``adaboost.adaboost_error_test`` (keyed by
    1-based round number), and three PDFs are written: the local errors, the
    train/test errors, and a ROC curve.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    all_folds = hw3.partition_folds(spamData, k)

    fold_index = 0  # only the first fold is evaluated
    kf_data, kf_test = dl.get_train_and_test(all_folds, fold_index)
    y, X = hw4.split_truth_from_data(kf_data)
    y_test, X_test = hw4.split_truth_from_data(kf_test)
    adaboost = run_adaboost(X, y, X_test, y_test, fold_index)
    predicted = adaboost.predict(X)
    print(roc_auc_score(y, predicted))

    for round_number, snapshot in enumerate(adaboost.snapshots, 1):
        yt_pred = snapshot.predict(X_test)
        # Error is the fraction of *mismatched* predictions; counting the
        # matches would store accuracy under an error key.
        n_wrong = sum(1 for yp, yt in zip(yt_pred, y_test) if yp != yt)
        adaboost.adaboost_error_test[round_number] = float(n_wrong) / len(y_test)

    print(predicted[:20])
    print(y[:20])

    name = 'q1'
    directory = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks'
    path = os.path.join(directory, name + 'hw4errors.pdf')
    tterrpath = os.path.join(directory, name + 'hw4_errors_test_train.pdf')
    print(path)
    plt.Errors([adaboost.local_errors]).plot_all_errors(path)
    plt.Errors([adaboost.adaboost_error, adaboost.adaboost_error_test]).plot_all_errors(tterrpath)

    roc = plt.ROC()
    get_tpr_fpr(adaboost, roc, X_test, y_test, 30)
    roc.plot_ROC(os.path.join(directory, name + 'hw4_roc.pdf'))
def svm_q1(data, classifier=None):
    """Fit a classifier on the training part of fold 0 and print its accuracy.

    The data is partitioned into 10 folds; fold 0 is held out for testing.

    Args:
        data: rows accepted by ``hw3u.partition_folds``.
        classifier: estimator with ``fit``/``predict``. Defaults to a new
            ``svm.SVC()`` created on each call, so no fitted estimator is
            shared between calls.
    """
    if classifier is None:
        classifier = svm.SVC()

    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print('length train: {} length test {}'.format(len(X), len(X_test)))

    clf = classifier
    clf.fit(X, y)
    # Predict each set once and reuse the result for scoring.
    y_pred = clf.predict(X_test)
    train_acc = accuracy_score(fix_y(y), fix_y(clf.predict(X)))
    test_acc = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('train acc: {} test acc: {}'.format(train_acc, test_acc))
def multiclassSVC(classifier, sz=2000):
    """Fit a one-vs-one multiclass classifier on MNIST features.

    The features are split into 10 folds with fold 0 held out. ``classifier``
    is wrapped in ``OneVsOneClassifier`` and the train/test accuracies are
    printed.

    Args:
        classifier: base estimator handed to ``OneVsOneClassifier``.
        sz: size argument passed to ``hw6u.load_mnist_features``.
    """
    df = hw6u.load_mnist_features(sz)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    # replace_zeros=False: labels are passed through as loaded.
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test)
    print('Beginning analysis: {}'.format(X.shape))

    clf = OneVsOneClassifier(classifier).fit(X, y)

    # Predict each set once; the fitted model gives the same answer on
    # every call, so repeating the passes only costs time.
    train_pred = clf.predict(X)
    test_pred = clf.predict(X_test)
    train_acc = accuracy_score(fix_y(train_pred), fix_y(y))
    test_acc = accuracy_score(fix_y(y_test), fix_y(test_pred))

    summary = 'train acc: {} test acc: {}'.format(train_acc, test_acc)
    # The summary line is printed twice to keep the output unchanged.
    print(summary)
    print(summary)
def run(self, data, weights):
    """Train on all folds but the last, then evaluate on the last fold.

    For each training fold a model is fitted and scored on that fold; the
    plain and weighted errors are appended to ``self.training_errors`` and
    ``self.training_errors_weighted``. The model from the final training
    fold is then scored on the last fold, filling ``self.err_matrix``,
    ``self.testing_error`` and ``self.testing_errors_weighted``, after which
    the weight distribution and alpha are updated.

    Args:
        data: rows accepted by ``hw3.partition_folds``.
        weights: weights passed to ``self.weight_errors``.
    """
    test_index = self.number_k_folds - 1
    k_folds = hw3.partition_folds(data, self.number_k_folds)

    for fold_index in range(test_index):
        truth, f_data = split_truth_from_data(k_folds[fold_index])
        model = self.fit(f_data)
        predicted = self.predict(model, f_data)  # {-1, 1}
        fold_errors = self.compute_error_matrix(truth, predicted)
        self.training_errors.append(self.get_error(fold_errors))
        weighted_total = sum(self.weight_errors(fold_errors, weights))
        self.training_errors_weighted.append(weighted_total)

    # ``model`` is whatever the last training iteration produced.
    truth, f_data = utils.split_truth_from_data(k_folds[test_index])
    predicted = self.predict(model, f_data)

    # Error matrix for round computed from test data
    self.err_matrix = self.compute_error_matrix(truth, predicted)
    self.testing_error = self.get_error(self.err_matrix)
    self.testing_errors_weighted = self.weight_errors(self.err_matrix, weights)
    self.set_weight_distribution_and_total()  # Dt(x) and epsilon
    self.set_alpha()
def relief(n):
    """Rank the spam features with a RELIEF-style score; return the top ``n``.

    Uses the training part of fold 0 (of 10) of the normalized spam data.
    For every feature and every sample, the smallest squared difference in
    that feature to a sample of the same class (nearest hit) and to a sample
    of a different class (nearest miss) is found; ``miss - hit`` is added to
    the feature's weight. Samples that have no hit or no miss contribute
    nothing.

    Args:
        n: number of feature indices to return.

    Returns:
        list of int: indices of the ``n`` features with the largest weights,
        best first.
    """
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, _ = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)

    features = np.asarray(X, dtype=float)
    labels = np.asarray(y)
    n_samples, n_features = features.shape
    weights = np.zeros(n_features)

    for j in range(n_features):  # feature
        column = features[:, j]
        for i in range(n_samples):  # data
            sq_diff = (column - column[i]) ** 2
            same = labels == labels[i]
            same[i] = False  # a sample is not its own neighbour
            opposite = labels != labels[i]
            # Skip samples with no same-class or no opposite-class partner
            # instead of failing on a missing distance.
            if same.any() and opposite.any():
                weights[j] += sq_diff[opposite].min() - sq_diff[same].min()
            if i % 1000 == 0:
                print('feature {} of {}, sample {} of {}'.format(j, n_features, i, n_samples))

    print(weights)
    ranked = sorted(zip(weights, range(n_features)), reverse=True)
    # Return the feature indices of the top n, not a single (weight, index) pair.
    return [index for _, index in ranked[:n]]
def tests_density():
    """Fit a scikit-learn Gaussian KDE on the test data and print log-densities.

    The labels from ``testData()`` are appended to the features as a last
    column. A grid search over 20 log-spaced bandwidths in [0.1, 10] picks
    the bandwidth; a ``KernelDensity`` with that bandwidth is fitted on all
    rows but the last 10 and the ``score_samples`` values of the last 10
    rows are printed.

    The function returns ``None``. The KNN comparison that used to follow an
    unconditional ``return`` could never run and has been removed, together
    with the locals that only it used.
    """
    X, y = testData()
    print(X)
    # Append the labels as the last column, then convert rows to lists.
    X = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
    X = [list(x.ravel()) for x in X]

    params = {'bandwidth': np.logspace(-1, 1, 20)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(X)
    bw = grid.best_estimator_.bandwidth
    print("best bandwidth: {0}".format(bw))

    # Refit with the selected bandwidth, holding out the last 10 rows.
    skclf = KernelDensity(bandwidth=bw, kernel='gaussian')
    skclf.fit(X[:-10], y[:-10])
    print(skclf.score_samples(X[-10:]))
def GaussianNB(X, num_features=None):
    """Cross-validate a Naive Bayes classifier, optionally on the k best features.

    Despite its name, the model fitted in every fold is ``BernoulliNB``.

    Args:
        X: data rows that ``utils.split_truth_from_data`` can separate into
            truth values and features.
        num_features: when given, ``SelectKBest(k=num_features)`` reduces the
            features before the folds are built.
    """
    n_folds = 10
    train_total = 0
    test_total = 0

    if num_features is not None:
        y, X = utils.split_truth_from_data(X)
        selector = SelectKBest(k=num_features).fit(X, y)
        X = selector.transform(X)
        # Re-attach the truth values to the reduced feature matrix.
        X = utils.add_row(X, y)

    folds = hw3u.partition_folds(X, n_folds)
    for fold_index in range(n_folds):
        training_rows = hw5u.group_fold(folds, fold_index)
        # Return value is not used; the call is retained as-is.
        check_cols(training_rows)
        model = BernoulliNB()
        print('len of kfolds {}'.format(len(training_rows)))
        truth_rows, data_rows = utils.split_truth_from_data(training_rows)
        print('len of data {}'.format(len(data_rows)))
        model.fit(data_rows, truth_rows)

        train_accuracy = hw3u.get_accuracy(model.predict(data_rows), truth_rows)
        train_total += train_accuracy
        print_output(fold_index, train_accuracy)

        truth_rows, data_rows = utils.split_truth_from_data(folds[fold_index])
        held_out_accuracy = hw3u.get_accuracy(model.predict(data_rows), truth_rows)
        test_total += held_out_accuracy
        print_output(fold_index, held_out_accuracy, 'test')

    print_test_output(float(train_total) / n_folds, float(test_total) / n_folds)
def runSpamDensity(_i, j, features='all'):
    """Evaluate the density-mode ``hw7u.MyKNN`` on the spam data.

    The normalized spam data is split into 10 folds with fold 0 held out;
    the classifier is fitted on the rest and its accuracy on the held-out
    fold is printed.

    Args:
        _i: unused.
        j: index into the metric list
            ``['gaussian', 'poly2', 'cosine_similarity', 'gaussian_density']``.
        features: unused.
    """
    metric_names = ['gaussian', 'poly2', 'cosine_similarity', 'gaussian_density']
    spam_rows = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    n_folds = 10
    folds = hw3u.partition_folds(spam_rows, n_folds)
    train_rows, test_rows = dl.get_train_and_test(folds, 0)
    y, X = hw4u.split_truth_from_data(train_rows)
    y_test, X_test = hw4u.split_truth_from_data(test_rows)
    print(len(X))
    print(len(X_test))

    density_knn = hw7u.MyKNN(metric=metric_names[j], density=True)
    print('start MyKNN')
    density_knn.fit(X, y)

    print('start my pred')
    y_pred = density_knn.predict(X_test)
    print(y_pred)

    my_accuracy = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('2b: My Accuracy: {}'.format(my_accuracy))
def test_partition_data():
    """Print what ``hw3.partition_folds`` returns for ``get_test_data(303)``."""
    sample = get_test_data(303)
    folds = hw3.partition_folds(sample)
    print(folds)