def test_area():
    """Print the black amount of one rectangle over the toy image.

    Uses the value returned by ``test_count_black()``, requests rectangle
    coordinates from ``hw5u.get_rect_coords`` sized to its length, and prints
    that value, the first rectangle, and ``hw5u.get_black_amt`` for the pair.
    """
    black_counts = test_count_black()
    rects = hw5u.get_rect_coords(1, size=len(black_counts))
    # Hand-written rectangles left over from manual checks; the trailing
    # number is presumably the expected result for each (TODO confirm):
    #   [[[1, -1], [2, -1], [1, 1], [2, 1]]]  # 0
    #   [[[0, -1], [1, -1], [0, 1], [1, 1]]]  # .5
    print(black_counts)
    first_rect = rects[0]
    print(first_rect)
    print(hw5u.get_black_amt(black_counts, first_rect))
def q5():
    """Train and score an ECOC classifier on rectangle features of MNIST.

    Steps:
      1. Load the MNIST training set from ``data/HW5/haar`` under the current
         working directory and divide the pixel values by 128.
      2. Run ``hw5u.count_black`` on each of the first ``limit`` images.
      3. For every rectangle from ``hw5u.get_rect_coords(100)`` append the
         horizontal and vertical differences returned by
         ``hw5u.get_features``, giving one feature row per image:
         rectangle_1_horizontal_difference, rectangle_1_vertical_difference,
         rectangle_2_horizontal_difference, ...
      4. Pass the features and labels to ``save``, split them with
         ``hw5u.split_test_and_train(data, .2)``, and fit an
         ``ECOCClassifier`` (exhaustive encoding) whose base learner is
         ``AdaboostOptimal`` over depth-1 decision trees, 200 rounds max.
      5. Print the accuracy on the train and test sets.

    Results noted from earlier runs:
        1000                   - train 1.000, test 0.851
        12,000 (20% of 60,000) - train 0.923, test 0.905

    Reference: http://colah.github.io/posts/2014-10-Visualizing-MNIST/
    """
    path = os.path.join(os.getcwd(), 'data/HW5/haar')
    limit = 60000
    images, labels = load_mnist('training', path=path)
    images /= 128.0
    # Keep the labels aligned with the images that are actually processed.
    # Without this, lowering ``limit`` pairs ``limit`` feature rows with the
    # labels of the whole data set.
    labels = labels[:limit]

    print('processing images')
    black = [hw5u.count_black(img) for img in images[:limit]]
    print('finished processing')

    rects = hw5u.get_rect_coords(100)
    features = []
    for img_black in black:
        row = []
        for rect in rects:
            h_diff, v_diff = hw5u.get_features(img_black, rect)
            row.extend((h_diff, v_diff))
        features.append(row)
    save(features, labels)

    # Each image is one row; the truth label is attached before splitting.
    data = utils.add_row(features, labels)
    data_split = hw5u.split_test_and_train(data, .2)
    data_test, data_train = data_split[0], data_split[1]
    y_train, X_train = utils.split_truth_from_data(data_train)
    y_test, X_test = utils.split_truth_from_data(data_test)

    cls = ec.ECOCClassifier(
        learner=lambda: adac.AdaboostOptimal(
            learner=lambda: DecisionTreeClassifier(max_depth=1),
            max_rounds=200),
        verbose=True,
        encoding_type='exhaustive').fit(X_train, y_train)

    for set_name, X_set, y_set in [('train', X_train, y_train),
                                   ('test', X_test, y_test)]:
        print("Set: {}. Accuracy: {:.3f}".format(
            set_name, accuracy_score(y_set, cls.predict(X_set))))
def q3():
    """Compare logistic regression with the HW2 ridge model on polluted spam.

    Loads the data with ``utils.load_and_normalize_polluted_spam_data()``,
    partitions it into 10 folds, and for each fold fits both
    ``lm.LogisticRegression`` and ``hw5u.Ridge`` on the remaining folds.
    Predictions are thresholded at 0.5 and scored with ``accuracy_score`` on
    the training folds and on the held-out fold. Per-fold accuracies and
    their averages are printed; nothing is returned.
    """
    # Grader note kept from the original author: points were taken off
    # because this reached 89 accuracy instead of 92.

    def to_labels(predictions):
        # Map raw predictions to 0/1 labels with a 0.5 cut-off.
        return [1 if p >= .5 else 0 for p in predictions]

    data = utils.load_and_normalize_polluted_spam_data()
    k = 10
    k_folds = hw3u.partition_folds(data, k)
    train_acc = []
    test_acc = []
    hw2_train_acc = []
    hw2_test_acc = []
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        y, X = utils.split_truth_from_data(grouped_fold)
        y_truth, X_test = utils.split_truth_from_data(k_folds[ki])

        clf = lm.LogisticRegression()
        ridge_clf = hw5u.Ridge()
        clf.fit(X, y)
        ridge_clf.fit(X, y)

        y_train = to_labels(clf.predict(X))
        y_test = to_labels(clf.predict(X_test))
        yhat_ridge_train = to_labels(ridge_clf.predict(X))
        yhat_ridge_test = to_labels(ridge_clf.predict(X_test))

        train_acc.append(accuracy_score(y, y_train))
        test_acc.append(accuracy_score(y_truth, y_test))
        hw2_train_acc.append(accuracy_score(y, yhat_ridge_train))
        hw2_test_acc.append(accuracy_score(y_truth, yhat_ridge_test))
        print('Fold {} train acc: {} test acc: {} HW2 ridge train: {} HW2 ridge test: {}'.format(
            ki + 1, train_acc[-1], test_acc[-1],
            hw2_train_acc[-1], hw2_test_acc[-1]))

    # The summary previously had three placeholders for four values, so the
    # mean ridge test accuracy was silently dropped by str.format.
    print('Average acc - Train: {} Test: {} HW2 ridge train: {} HW2 ridge test: {}'.format(
        np.mean(train_acc), np.mean(test_acc),
        np.mean(hw2_train_acc), np.mean(hw2_test_acc)))
def test_count_black():
    """Print a 4x4 toy 0/1 grid and return ``hw5u.count_black`` of it."""
    toy_image = [
        [0, 0, 0, 0],
        [1, 1, 0, 0],
        [0, 0, 1, 1],
        [1, 1, 0, 0],
    ]
    print(toy_image)
    return hw5u.count_black(toy_image)
def test_rectangle():
    """Print one scaled MNIST image of digit 4 and its ``count_black`` result.

    Loads only the digit-4 training images from ``data/HW5/haar`` under the
    current working directory, takes the image at index 7, divides it by 128
    in place, and prints both the image and ``hw5u.count_black`` of it.
    """
    data_dir = os.path.join(os.getcwd(), 'data/HW5/haar')
    images, _labels = load_mnist('training', digits=[4], path=data_dir)
    sample = images[7]
    sample /= 128.0
    black_counts = hw5u.count_black(sample)
    print(sample)
    print(black_counts)
def GaussianNB(X, num_features=None):
    """Run 10-fold cross-validation of a naive Bayes model and print accuracy.

    NOTE: despite the function name, the model fitted in every fold is
    ``BernoulliNB``, not a Gaussian naive Bayes model.

    Args:
        X: data set with the truth values still attached; they are separated
            with ``utils.split_truth_from_data``.
        num_features: when not ``None``, ``SelectKBest(k=num_features)`` is
            fitted on the whole data set and only the selected features are
            kept before the folds are built.

    Returns:
        None. Per-fold train/test accuracy is reported through
        ``print_output`` and the averages through ``print_test_output``.
    """
    k = 10
    if num_features is not None:
        y, X = utils.split_truth_from_data(X)
        selector = SelectKBest(k=num_features).fit(X, y)
        X = selector.transform(X)
        X = utils.add_row(X, y)
    k_folds = hw3u.partition_folds(X, k)

    train_acc_sum = 0
    test_acc_sum = 0
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        # The result was never used by the original code; the call is kept
        # in case check_cols has side effects (TODO confirm, then remove).
        check_cols(grouped_fold)
        nb_model = BernoulliNB()
        print('len of kfolds {}'.format(len(grouped_fold)))

        # Fit and score on the k-1 training folds.
        truth_rows, data_rows = utils.split_truth_from_data(grouped_fold)
        print('len of data {}'.format(len(data_rows)))
        nb_model.fit(data_rows, truth_rows)
        accuracy = hw3u.get_accuracy(nb_model.predict(data_rows), truth_rows)
        train_acc_sum += accuracy
        print_output(ki, accuracy)

        # Score on the held-out fold.
        truth_rows, data_rows = utils.split_truth_from_data(k_folds[ki])
        test_accuracy = hw3u.get_accuracy(nb_model.predict(data_rows),
                                          truth_rows)
        test_acc_sum += test_accuracy
        print_output(ki, test_accuracy, 'test')

    print_test_output(float(train_acc_sum) / k, float(test_acc_sum) / k)
def test_random_color():
    """Return the value produced by ``hw5u.random_color()``."""
    color = hw5u.random_color()
    return color
def test_plot():
    """Pass ``hw5u.get_rect_coords(10)`` to ``hw5u.show_rectangles``.

    The file name ``test_rects.png`` is handed over as ``fname``; presumably
    the plot is written there (TODO confirm against ``hw5u``).
    """
    hw5u.show_rectangles(hw5u.get_rect_coords(10), fname='test_rects.png')