def q3():  # Got points off b/c I have 89 accuracy instead of 92
    """Logistic regression vs. the HW2 ridge model, 10-fold cross-validation.

    Loads the data returned by
    ``utils.load_and_normalize_polluted_spam_data()`` and partitions it into
    10 folds. For each fold index ``ki`` both models are fitted on
    ``hw5u.group_fold(k_folds, ki)`` (presumably every fold except ``ki`` --
    confirm against hw5u) and scored on that grouped data ("train") and on
    ``k_folds[ki]`` ("test"). Predictions are thresholded at 0.5 to obtain
    0/1 labels before accuracy is computed.

    Prints the four accuracies for every fold, followed by the mean of each
    of the four accuracy series. Returns None.
    """
    def to_labels(scores):
        # Map raw model outputs onto 0/1 class labels.
        return [1 if s >= .5 else 0 for s in scores]

    data = utils.load_and_normalize_polluted_spam_data()
    k = 10
    k_folds = hw3u.partition_folds(data, k)
    train_acc = []
    test_acc = []
    hw2_train_acc = []
    hw2_test_acc = []
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        y, X = utils.split_truth_from_data(grouped_fold)
        y_truth, X_test = utils.split_truth_from_data(k_folds[ki])

        # Alternatives tried previously: lm.LogisticRegression(penalty="l1"),
        # lm.Lasso(alpha=.5), lm.RidgeClassifier(alpha=.1).
        clf = lm.LogisticRegression()
        ridge_clf = hw5u.Ridge()
        clf.fit(X, y)
        ridge_clf.fit(X, y)

        y_train = to_labels(clf.predict(X))
        y_test = to_labels(clf.predict(X_test))
        yhat_ridge_train = to_labels(ridge_clf.predict(X))
        yhat_ridge_test = to_labels(ridge_clf.predict(X_test))

        train_acc.append(accuracy_score(y, y_train))
        test_acc.append(accuracy_score(y_truth, y_test))
        hw2_train_acc.append(accuracy_score(y, yhat_ridge_train))
        hw2_test_acc.append(accuracy_score(y_truth, yhat_ridge_test))
        print('Fold {} train acc: {} test acc: {} HW2 ridge train: {}  HW2 ridge test: {}'.format(
            ki+1, train_acc[-1], test_acc[-1], hw2_train_acc[-1], hw2_test_acc[-1]))

    # One placeholder per mean: the previous three-placeholder string
    # silently dropped the ridge test accuracy.
    print('Average acc - Train: {}  Test: {}  HW2 ridge train: {}  HW2 ridge test: {}'.format(
        np.mean(train_acc), np.mean(test_acc),
        np.mean(hw2_train_acc), np.mean(hw2_test_acc)))
def q5():
    """ECOC for image analysis.

    Loads the 'training' set via ``load_mnist`` from ``data/HW5/haar`` under
    the current working directory, builds one feature row per image (two
    values per rectangle: horizontal and vertical difference), splits the
    rows into test/train sets and fits an exhaustive-code ECOC classifier
    whose base learner is AdaboostOptimal over depth-1 decision trees.
    Prints the accuracy on both sets. Returns None.

    Recorded results:
        1000
            Set: train. Accuracy: 1.000
            Set: test. Accuracy: 0.851
        12,000 (20% of 60,000)
            Set: train. Accuracy: 0.923
            Set: test. Accuracy: 0.905

    Reference: http://colah.github.io/posts/2014-10-Visualizing-MNIST/
    """
    path = os.path.join(os.getcwd(), 'data/HW5/haar')
    limit = 60000
    images, labels = load_mnist('training', path=path)
    # NOTE(review): in-place division keeps the array's dtype; if load_mnist
    # returns an integer array the result depends on the NumPy version --
    # confirm this is the intended scaling.
    images /= 128.0

    print('processing images')
    black = [hw5u.count_black(img) for img in images[:limit]]
    print('finished processing')

    rects = hw5u.get_rect_coords(100)

    # Each image is a row in table X. Columns are
    # rectangle_1_horizontal_difference, rectangle_1_vertical_difference,
    # rectangle_2_horizontal_difference, ...
    X = []
    for counts in black:
        row = []
        for rect in rects:
            h_diff, v_diff = hw5u.get_features(counts, rect)
            row.extend((h_diff, v_diff))
        X.append(row)
    save(X, labels)

    data = utils.add_row(X, labels)
    # Index 0 of the split is used as the test set, index 1 as the train set.
    data_test, data_train = hw5u.split_test_and_train(data, .2)[:2]

    y_train, X_train = utils.split_truth_from_data(data_train)
    y_test, X_test = utils.split_truth_from_data(data_test)

    # Base learner factories: boosted decision stumps. (LogisticRegression
    # was the earlier base learner.)
    stump_factory = lambda: DecisionTreeClassifier(max_depth=1)
    booster_factory = lambda: adac.AdaboostOptimal(learner=stump_factory,
                                                   max_rounds=200)
    ecoc = ec.ECOCClassifier(learner=booster_factory,
                             verbose=True,
                             encoding_type='exhaustive')
    cls = ecoc.fit(X_train, y_train)

    evaluation_sets = (('train', X_train, y_train),
                       ('test', X_test, y_test))
    for set_name, features, truth in evaluation_sets:
        score = accuracy_score(truth, cls.predict(features))
        print("Set: {}. Accuracy: {:.3f}".format(set_name, score))
def GaussianNB(X, num_features=None):
    """Cross-validate a naive Bayes classifier on ``X`` with 10 folds.

    NOTE(review): despite the function name, the model fitted in every fold
    is ``BernoulliNB`` -- confirm which model was intended.

    Args:
        X: dataset in the form accepted by ``utils.split_truth_from_data``
            and ``hw3u.partition_folds``.
        num_features: when not None, only that many features are kept,
            chosen by ``SelectKBest``. The selector is fitted on the whole
            dataset before it is partitioned into folds.

    For each fold the model is fitted on ``hw5u.group_fold(k_folds, ki)``
    and scored on that data and on ``k_folds[ki]``; each accuracy is passed
    to ``print_output`` and the two averages to ``print_test_output``.
    Returns None.
    """
    k = 10
    if num_features is not None:
        y, X = utils.split_truth_from_data(X)
        X = SelectKBest(k=num_features).fit(X, y).transform(X)
        X = utils.add_row(X, y)

    k_folds = hw3u.partition_folds(X, k)
    train_acc_sum = 0
    test_acc_sum = 0
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        # NOTE(review): the result was only consumed by the earlier custom
        # nb.NaiveBayes model; the call is kept in case check_cols has side
        # effects -- confirm and remove if it does not.
        check_cols(grouped_fold)
        nb_model = BernoulliNB()
        print('len of kfolds {}'.format(len(grouped_fold)))

        # Accuracy on the data the model was fitted on.
        truth_rows, data_rows = utils.split_truth_from_data(grouped_fold)
        print('len of data {}'.format(len(data_rows)))
        nb_model.fit(data_rows, truth_rows)
        predict = nb_model.predict(data_rows)
        accuracy = hw3u.get_accuracy(predict, truth_rows)
        train_acc_sum += accuracy
        print_output(ki, accuracy)

        # Accuracy on the held-out fold.
        truth_rows, data_rows = utils.split_truth_from_data(k_folds[ki])
        test_predict = nb_model.predict(data_rows)
        test_accuracy = hw3u.get_accuracy(test_predict, truth_rows)
        test_acc_sum += test_accuracy
        print_output(ki, test_accuracy, 'test')

    print_test_output(float(train_acc_sum)/k, float(test_acc_sum)/k)
 def run(self, data, weights):
     """Fit on each of the first k-1 folds, then evaluate on the last fold.

     ``data`` is partitioned into ``self.number_k_folds`` folds. For every
     fold except the last, a model is fitted on that fold and scored on the
     same fold; the error and the summed weighted error are appended to
     ``self.training_errors`` and ``self.training_errors_weighted``. The
     model fitted on the final training fold is then scored on the last
     fold and the results are stored in ``self.err_matrix``,
     ``self.testing_error`` and ``self.testing_errors_weighted``. Finally
     ``self.set_weight_distribution_and_total()`` and ``self.set_alpha()``
     are called.

     Args:
         data: dataset handed to ``hw3.partition_folds``.
         weights: passed to ``self.weight_errors`` with each error matrix.

     Raises:
         ValueError: if ``self.number_k_folds`` is less than 2, because no
             model would be fitted before the evaluation step.
     """
     last = self.number_k_folds - 1
     if last < 1:
         # Without at least one training fold `model` would never be bound.
         raise ValueError(
             'number_k_folds must be at least 2, got {}'.format(self.number_k_folds))

     k_folds = hw3.partition_folds(data, self.number_k_folds)
     for k in xrange(last):
         fold = k_folds[k]
         # Same helper as the evaluation step below, referenced the same way.
         truth, f_data = utils.split_truth_from_data(fold)
         model = self.fit(f_data)
         predicted = self.predict(model, f_data)  # {-1, 1}
         err_matrix = self.compute_error_matrix(truth, predicted)
         self.training_errors.append(self.get_error(err_matrix))
         self.training_errors_weighted.append(sum(self.weight_errors(err_matrix, weights)))

     fold = k_folds[last]
     truth, f_data = utils.split_truth_from_data(fold)
     predicted = self.predict(model, f_data)
     # Error matrix for round computed from test data
     self.err_matrix = self.compute_error_matrix(truth, predicted)
     self.testing_error = self.get_error(self.err_matrix)
     self.testing_errors_weighted = self.weight_errors(self.err_matrix, weights)
     self.set_weight_distribution_and_total()  # Dt(x) and epsilon
     self.set_alpha()