def q1(): """ feature analysis with Adaboost """ #spamData = hw3u.pandas_to_data(hw3u.load_and_normalize_spambase()) spamData = utils.load_and_normalize_polluted_spam_data() k = 10 all_folds = hw3u.partition_folds(spamData, k) col_errs = [] kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) # We're not actually cross-validating anything -- we just want feature weights #X = np.concatenate([X, X_test], axis=0) #y = np.concatenate([y, y_test], axis=0) #adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='random')) adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='best')) #adaboost = adac.AdaboostOptimal(max_rounds=10, do_fast=False, learner=hw4u.TreeOptimal) adaboost.fit(X, y) margin_fractions = get_margin_fractions(adaboost, X[0]) #margin_fractions_v = hw5u.get_margin_fractions_validate(adaboost, X, y) #print col_errs ranked = rank(margin_fractions) print_ranks(ranked) pred = adaboost.predict(X_test) print 'Accuracy: {}'.format(accuracy_score(adaboost._check_y_not_zero(y_test), adaboost._check_y_not_zero(pred)))
def q3(): # Got points off b/c I have 89 accuracy instead of 92 """ Logistic Regression """ data = utils.load_and_normalize_polluted_spam_data() k = 10 k_folds = hw3u.partition_folds(data, k) train_acc = [] test_acc = [] hw2_train_acc = [] hw2_test_acc = [] for ki in range(k): grouped_fold = hw5u.group_fold(k_folds, ki) y, X = utils.split_truth_from_data(grouped_fold) y_truth, X_test = utils.split_truth_from_data(k_folds[ki]) clf = lm.LogisticRegression() #penalty="l1") ridge_clf = hw5u.Ridge() #clf = lm.Lasso(alpha=.5) #clf = lm.RidgeClassifier(alpha=.1) clf.fit(X, y) ridge_clf.fit(X, y) y_train = [1 if p >= .5 else 0 for p in clf.predict(X)] y_test = [1 if p >= .5 else 0 for p in clf.predict(X_test)] yhat_ridge_train = [1 if p >= .5 else 0 for p in ridge_clf.predict(X)] yhat_ridge_test = [1 if p >= .5 else 0 for p in ridge_clf.predict(X_test)] train_acc.append(accuracy_score(y, y_train)) test_acc.append(accuracy_score(y_truth, y_test)) hw2_train_acc.append(accuracy_score(y, yhat_ridge_train)) hw2_test_acc.append(accuracy_score(y_truth, yhat_ridge_test)) print 'Fold {} train acc: {} test acc: {} HW2 ridge train: {} HW2 ridge test: {}'.format(ki+1, train_acc[-1], test_acc[-1], hw2_train_acc[-1], hw2_test_acc[-1]) print 'Average acc - Train: {} Test: {} HW2 ridge: {}'.format(np.mean(train_acc), np.mean(test_acc), np.mean(hw2_train_acc), np.mean(hw2_test_acc))
def q2():
    """Run Gaussian Naive Bayes on the polluted spam data.

    Notable property of this dataset: the standard deviation of some
    columns is 0, which GaussianNB must tolerate. Uses the top 100
    features. Prints results via GaussianNB; returns nothing.
    """
    spam = utils.load_and_normalize_polluted_spam_data()
    GaussianNB(spam, num_features=100)