def q6():
    """Bagging - sample with replacement.

    Loads the normalized spambase data, fits a ``bag.Bagging`` ensemble
    (``max_rounds=100``, ``sample_size=1000``) whose base learner is a
    depth-3 ``DecisionTreeClassifier``, and prints the misclassification
    rate on the first fold returned by ``hw4.partition_folds(spamData, .4)``.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    truth, features = hw4.split_truth_from_data(spamData)

    ensemble = bag.Bagging(
        max_rounds=100,
        sample_size=1000,
        learner=lambda: DecisionTreeClassifier(max_depth=3),
    )
    ensemble.fit(features, truth)

    # NOTE(review): the ensemble above is fit on all of spamData, and the
    # evaluation fold below is partitioned from that same spamData, so the
    # reported "testing" error appears to be measured on rows the model has
    # already seen -- confirm this is intended.
    folds = hw4.partition_folds(spamData, .4)
    held_truth, held_features = hw4.split_truth_from_data(folds[0])
    predicted = ensemble.predict(held_features)

    # Run both label vectors through the ensemble's own label check so they
    # are compared in the same representation.
    held_truth = ensemble._check_y(held_truth)
    predicted = ensemble._check_y(predicted)

    mismatches = sum(1 for guess, actual in zip(predicted, held_truth)
                     if guess != actual)
    test_error = float(mismatches) / len(held_truth)
    print('Final testing error: {}'.format(test_error))
def q2():
    """Boosting on UCI datasets.

    Loads the crx dataset via ``dl.data_q3_crx()`` and, for each value of
    ``i`` in 5, 10, ..., 80, calls ``hw4.partition_folds(crx, i / 100.0)``.
    Fold 0 of the result is used for training and fold 1 for testing.
    AdaBoost is run through ``run_adaboost`` (tagged ``'q2_crx'``) and the
    misclassification rate on the test fold is printed for every setting.

    To run on the vote dataset instead, load it with ``dl.data_q3_vote()``.
    """
    crx = dl.data_q3_crx()

    for i in range(5, 85, 5):
        # partition_folds receives a fraction (0.05 .. 0.80), not a percent.
        percent = float(i) / 100
        all_folds = hw4.partition_folds(crx, percent)
        kf_train = all_folds[0]
        kf_test = all_folds[1]

        y, X = hw4.split_truth_from_data(kf_train)
        y_test, X_test = hw4.split_truth_from_data(kf_test)

        adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx')

        # Run both label vectors through the model's own label check so they
        # are compared in the same representation.
        yt_pred = adaboost._check_y(adaboost.predict(X_test))
        y_test = adaboost._check_y(y_test)

        wrong = sum(1 for predicted, actual in zip(yt_pred, y_test)
                    if predicted != actual)
        round_err = float(wrong) / len(y_test)

        # NOTE(review): `percent` is a fraction, so this prints e.g. "0.05%"
        # for the 5% split. Output is left unchanged; confirm whether the
        # label should show `i` instead.
        print('Error at {}%: Test: {}'.format(percent, round_err))