def regression_line_spam_no_libs(): """ Solution for HW1 prob 2 """ print('Homework 1 problem 2 - No Libraries - Regression Line') print('Spam Dataset') spam_data = utils.load_and_normalize_spam_data() test, train = utils.split_test_and_train(spam_data) columns = train.columns[:-1] Y_fit = mystats.linear_regression_points(train[columns], train['is_spam']) #print 'Y_fit' #print Y_fit #for i in range(0, len(Y_fit)): # print str(Y_fit[i]) + ' -- ' + str(train['is_spam'][i]) col_MSE = {} for i, col in enumerate(columns): col_fit = Y_fit[i] + Y_fit[-1] col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['is_spam']) print col_MSE RMSE = np.sqrt(col_MSE.values()) average_MSE = utils.average(col_MSE.values()) average_RMSE = utils.average(RMSE) print 'Average MSE: ' + str(average_MSE) print 'Average RMSE: ' + str(average_RMSE)
def decision_spambase_set_no_libs():
    """ Solution for HW1 prob 1 """
    # Trains a hand-rolled regression decision tree on the spambase training
    # split, then reuses the SAME node object (mutated in place) to score the
    # held-out test split — statement order here is load-bearing.
    print('Homework 1 problem 1 - No Libraries - Regression Decision tree')
    print('Spambase Dataset')
    spam_data = utils.load_and_normalize_spam_data()
    test, train = utils.split_test_and_train(spam_data)
    print str(len(train)) + " # in training set <--> # in test " + str(len(test))
    # Root node starts with every training row marked present (weight 1).
    node = mytree.Node(np.ones(len(train)))
    # Grow the tree to depth 5 against the is_spam label.
    branch_node(node, train, 5, 'is_spam')
    #node.show_children_tree()
    node.show_children_tree(follow=False)
    model = mytree.Tree(node)
    model.print_leaves()
    print 'Trained model error is : ' + str(model.error())
    # Re-point the presence mask at the test rows, then push the test data
    # through the already-trained tree in place.
    node.presence = np.ones(len(test))
    test_node(node, test, 'is_spam')
    test_tree = mytree.Tree(node)
    prediction = test_tree.predict_obj()
    test_tree.print_leaves_test()
    print 'predict sum: ' + str(sum(prediction))
    print 'MSE:' + str(test_tree.error_test())
    # Confusion-matrix counts against the true labels.
    [tp, tn, fp, fn] = mystats.get_performance_stats(test['is_spam'].as_matrix(), prediction)
    print 'TP: {}\tFP: {}\nTN: {}\tFN: {}'.format(tp, fp, tn, fn)
    print 'Accuracy: ' + str(mystats.compute_accuracy(tp,tn, fp,fn))
    print 'MSE: ' + str(mystats.compute_MSE_arrays(prediction, test['is_spam']))
def SpamClassifier(features, skclassifier, myclassifier): data = utils.pandas_to_data(utils.load_and_normalize_spam_data()) k = 10 if features != 'all': # Only use the features passed in the features array new = [] t = utils.transpose_array(data) for i in xrange(len(t)): if i in features: new.append(t[i]) data = utils.transpose_array(t) all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print 'start MyKNN' knn = hw7u.KNN(classifier=myclassifier) print 'start scikit' knnsci = hw7u.KNN(classifier=skclassifier) print 'start my pred' y_pred = knn.predict(X_test, X, y) print 'My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred))) print 'start sk pred' y_sci = knnsci.predict(X_test, X, y) print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred)))
def q1a():
    """SVM on Spam Data
    length train: 4140 length test 461
    train acc: 0.806763285024 test acc: 0.819956616052
    """
    spam = utils.load_and_normalize_spam_data()
    classifier = svm.SVC(kernel='poly')
    svm_q1(utils.pandas_to_data(spam), classifier)
def decision_spambase_set(): """ Solution for HW1 prob 1 """ print('Homework 1 problem 1 - Regression Decision tree') print('Spambase Dataset') spam_data = utils.load_and_normalize_spam_data() test, train = utils.split_test_and_train(spam_data) print str(len(train)) + " # in training set <--> # in test " + str(len(test)) dt = train_decision_tree(train) predicted = test_decision_tree(dt, test) #print predicted #print test['is_spam'] error = mystats.calculate_binary_error(predicted, test['is_spam']) print 'Error: ' + str(error)
def q_1():
    """Run the HW1 model battery on the spam split and print the results.

    The housing experiments are currently disabled; h_results stays empty.
    """
    h_test, h_train = utils.load_and_normalize_housing_set()
    h_results = []
    s_test, s_train = utils.split_test_and_train(utils.load_and_normalize_spam_data())
    s_results = [
        dec_or_reg_tree(s_train, s_test, "is_spam"),          # works .845 - .86
        linear_reg_errors(s_train, s_test, "is_spam"),        # works .8609 - .903
        linear_reg_errors(s_train, s_test, "is_spam", True),  # works .8416 - .8543
        k_folds_linear_gd(s_train, s_test, "is_spam"),        # does not work .6114 - .6114
        logistic_gd(s_train, s_test, "is_spam"),              # returns perfect... 1- 1
    ]
    print_results_1(s_results, h_results)
def testLogisticGradient(): """ logistic gradient descent """ df_test, df_train = utils.split_test_and_train(utils.load_and_normalize_spam_data()) Y = 'is_spam' binary = utils.check_binary(df_train[Y]) model = gd.logistic_gradient(df_train, df_train[Y], .1, max_iterations=5) #print model #raw_input() predict = gd.predict(df_train, model, binary, True) print predict error_train = mystats.get_error(predict, df_train[Y], binary) #raw_input() predict = gd.predict(df_test, model, binary, True) print predict error_test = mystats.get_error(predict, df_test[Y], binary) print 'error train {} error_test {}'.format(error_train, error_test) return [error_train, error_test]
def relief(n):
    """Score spam features with a Relief-style weight over one training fold.

    For every feature j and every sample i, finds the nearest "hit" (same
    class) and nearest "miss" (opposite class) measured along feature j
    alone, and accumulates miss-distance minus hit-distance into weights[j].
    """
    max_iters = 1  # NOTE(review): unused — exactly one pass is hard-coded below
    j = 0
    i = 1
    n_neighbors = [1, 3, 7]  # NOTE(review): unused in this function
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]).compute  # NOTE(review): built but never used
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)  # NOTE(review): test split unused
    loops = 0
    weights = np.zeros(len(X[0]))
    loops += 1
    n_features = len(X[0])
    n_samples = len(X)
    for j in range(n_features):  #feature
        for i in range(n_samples):  # data
            closest_same = None
            closest_opp = None
            # Linear scan for the nearest hit/miss along feature j only
            # (squared 1-D distance), skipping the sample itself.
            for z_i in range(n_samples):
                if z_i == i:
                    continue
                diff = (X[z_i][j] - X[i][j]) ** 2
                if y[z_i] == y[i]:  # same
                    if closest_same is None or diff < closest_same:
                        closest_same = diff
                else:  # opp
                    if closest_opp is None or diff < closest_opp:
                        closest_opp = diff
            # assumes every class appears at least twice in the fold, else
            # closest_same/closest_opp stays None and this raises — TODO confirm
            weights[j] += (-closest_same + closest_opp)
            if i % 1000 == 0:
                print 'feature {} of {}, sample {} of {}'.format(j, n_features, i, n_samples)
    print weights
    # NOTE(review): `[:n][1]` returns the single (weight, index) pair ranked
    # second, not the top-n features — looks like a bug; verify intent before
    # relying on the return value.
    return sorted(zip(weights, range(len(weights))), reverse=True)[:n][1]
def runSpamDensity(_i, j, features='all'): metric = ['gaussian', 'poly2', 'cosine_similarity', 'gaussian_density'] data = utils.pandas_to_data(utils.load_and_normalize_spam_data()) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print(len(X)) print(len(X_test)) myclassifier = hw7u.MyKNN(metric=metric[j], density=True) print 'start MyKNN' myclassifier.fit(X, y) #print 'start scikit' #knnsci = skclassifier.fit(X, y) print 'start my pred' y_pred = myclassifier.predict(X_test) print(y_pred) #print 'start sk pred' #y_sci = knnsci.score(X_test) #print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred))) print '2b: My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred)))
def load_and_normalize_spambase():
    """Thin alias for utils.load_and_normalize_spam_data()."""
    return utils.load_and_normalize_spam_data()
def analyze_spambase_hw1():
    """ HW1 - problem 2 """
    # 10-fold cross-validation over the normalized spam data.
    mystats.k_folds(utils.load_and_normalize_spam_data(), 10)
def q2_kernel_poly():
    """SMO-based SVM with a polynomial kernel on the spam data.

    c = 1, tol = 1-e2, passes = 1
    """
    # BUG FIX: the `data = ...` line was commented out, so svm_q1 was called
    # with an undefined name and this function raised NameError at runtime.
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    svm_q1(data, mysvm.SVC(mysvm.SMO, Kernel('poly')))
def q2():
    """SMO-based SVM with a linear kernel on the spam data.

    c= 1, tol = 1-e2, passes = 1
    train acc: 0.904830917874 test acc: 0.917570498915
    """
    spam = utils.load_and_normalize_spam_data()
    classifier = mysvm.SVC(mysvm.SMO, Kernel('linear'))
    svm_q1(utils.pandas_to_data(spam), classifier)