def main():
    for file in ['data/breast-cancer-assignment5.txt',
                 'data/german-assignment5.txt']:
        data, labels, types = load_matrix_from_txt(file)
        splices = k_fold_split(10, data, labels)

        accuracies = []
        for i in range(10):
            train_indexes = splices[i][0]
            test_indexes = splices[i][1]
            train_data = np.copy(data[train_indexes])
            train_label = np.copy(labels[train_indexes])
            test_data = np.copy(data[test_indexes])
            test_label = np.copy(labels[test_indexes])

            boost = AdaBoost()
            boost.train(train_data, train_label, types)
            class_result = boost.test(test_data)

            accuracy = compute_accuracy(class_result, test_label)
            accuracies.append(accuracy)
            print 'accuracy: %f' % accuracy

        print('file: {}, mean: {}, std: {}'.format(
            file, np.mean(accuracies), np.std(accuracies)))
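# Illustrative sketch only: the snippet above relies on helpers named
# k_fold_split and compute_accuracy that are not shown here. A minimal
# version consistent with how they are called (names, signatures, and the
# random shuffling are assumptions, not the original implementation):

import numpy as np


def k_fold_split(k, data, labels):
    """Return a list of (train_indexes, test_indexes) pairs, one per fold."""
    indexes = np.random.permutation(len(labels))
    folds = np.array_split(indexes, k)
    return [(np.concatenate(folds[:i] + folds[i + 1:]), folds[i])
            for i in range(k)]


def compute_accuracy(predicted, expected):
    """Fraction of predictions that match the true labels."""
    return np.mean(np.asarray(predicted) == np.asarray(expected))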
def main():
    data = np.loadtxt(open("/Users/rio512hsu/dataset/MachineLearningTechniques" +
                           "/hw2_adaboost_train.csv", "rb"),
                      delimiter=" ")
    X = data[:, :-1]
    y = data[:, -1]
    u = np.ones(X.shape[0]) / X.shape[0]
    clf = DecisionStump().fit(X, y, u)
    # Q12
    print clf.getEin()

    # Q13
    adaboost = AdaBoost(DecisionStump).fit(X, y, 300)
    # print adaboost.predict(X)
    print np.sum(adaboost.predict(X) != y)

    # Q17
    test = np.loadtxt(open("/Users/rio512hsu/dataset/" +
                           "MachineLearningTechniques/" +
                           "hw2_adaboost_test.csv"),
                      delimiter=' ')
    X_test = test[:, :-1]
    y_test = test[:, -1]
    print np.sum(clf.predict(X) != y) / float(test.shape[0])

    # Q18
    print np.sum(adaboost.predict(X_test) != y_test) / float(test.shape[0])
    return 0
def testTitanicCARTAdaBoost():
    print('-' * 30, '\ntestTitanicCARTAdaBoost\n', '-' * 30)
    trd = pd.read_csv('Titanic_dataset/train.csv')

    # drop useless (and, optionally, continuous) features
    # for i in ["PassengerId", "Name", "Ticket", "Cabin", "Age", "SibSp", "Parch", "Fare"]:
    for i in ["PassengerId", "Name", "Ticket", "Cabin"]:
        trd.pop(i)
    trd = trd.dropna()  # drop nan values

    # convert non-digits to digits
    trd = pd.get_dummies(trd, columns=['Sex'])
    Embarked_map = {val: idx for idx, val in enumerate(trd['Embarked'].unique())}
    trd['Embarked'] = trd['Embarked'].map(Embarked_map)
    if DEBUG:
        print(trd[:5])

    # create train data
    trdd = trd.sample(frac=0.4)

    # using "Survived" as labels
    trl = trd.pop("Survived")
    trl[trl == 0] = -1
    trll = trdd.pop("Survived")
    trll[trll == 0] = -1

    # training tree
    t = AdaBoost(CART_weight_classifier, 10)
    t.fit(trdd, trll)

    # prediction
    pred = t.predict(trd)
    print('Acc.: ', np.sum(pred == trl.reset_index(drop=True)) / trl.shape[0])
def q_8(self):
    tx, ty = ex4_tools.generate_data(5000, noise_ratio=0)
    x, y = ex4_tools.generate_data(200, noise_ratio=0)
    self.a_boost = AdaBoost(WL=ex4_tools.DecisionStump, T=500)
    self.a_boost.train(tx, ty)
    training_errs, test_errs = self.get_ab_errs(tx, ty, x, y)
    self.plt_q_8(training_errs, test_errs)
def get_ab_errors(self, tx, ty, x, y):
    errors = []
    for t in self.ts:
        a_boost = AdaBoost(WL=ex4_tools.DecisionStump, T=t)
        a_boost.train(tx, ty)
        errors.append(a_boost.error(x, y, t))
    return errors
def main():
    '''Load data, split data, create the AdaBoost algorithm with a decision
    stump, calculate errors, save the final file.'''
    classifier = AdaBoost(DecisionStump)
    num_sets = 50
    T = 100
    percentage = 0.9
    all_errors_train = []
    all_errors_test = []
    aver_error_train = []
    aver_error_test = []

    # split data in the # of datasets
    split_data(percentage, num_sets)

    # run for all datasets, with boosting iterations = T
    for i in range(num_sets):
        data_split_train = './data/bupa_train' + str(i) + ".txt"
        data_split_test = './data/' + "bupa_test" + str(i) + ".txt"
        X_train, Y_train = load_data(data_split_train)
        X_test, Y_test = load_data(data_split_test)
        score_train, score_test = classifier.run_adaboost(X_train, Y_train, T, X_test)
        error_train = calculate_error(T, score_train, Y_train)
        error_test = calculate_error(T, score_test, Y_test)
        all_errors_train.append(error_train)
        all_errors_test.append(error_test)
def adaboost_avg_run_new(max_classes, avg_num_of_run, training_set, testing_set):
    all_error_list = []

    # because datasets sometimes place the class attribute at the end or even
    # at the beginning or the middle, we'll separate the attribute vector from
    # the class-label. also note that this is the way scikit-learn does it.
    # train_x: the attribute vector; train_y: the class_label
    (train_x, train_y) = split_attribute_and_label(training_set)
    (test_x, test_y) = split_attribute_and_label(testing_set)
    # print(len(train_x))
    train_subset_num = int(len(train_y) * 0.2)

    our_ada_training_errors = {}
    our_ada_testing_errors = {}

    # init dict of num classifier to error list
    for i in range(1, max_classes + 1):
        our_ada_training_errors[i] = []
        our_ada_testing_errors[i] = []

    # run ada num_runs times
    for i in range(avg_num_of_run):
        ada_obj = AdaBoost(max_classes, train_subset_num, THRESHOLD, ETA,
                           UPPER_BOUND, ETA_WEIGHTS, False)
        ada_obj.fit_with_errors(train_x, train_y, test_x, test_y)
        for j in range(max_classes):
            our_ada_training_errors[j + 1].append(ada_obj.training_error[j])
            our_ada_testing_errors[j + 1].append(ada_obj.testing_error[j])

    for cl in range(1, max_classes + 1):
        scikit_error = []
        for i in range(avg_num_of_run):
            pada = perceptron.Perceptron(max_iter=UPPER_BOUND, verbose=0,
                                         random_state=None, fit_intercept=True,
                                         eta0=ETA)
            bdt = AdaBoostClassifier(pada, algorithm="SAMME", n_estimators=cl)
            bdt.fit(train_x, train_y)
            result_list = bdt.predict(test_x)
            scikit_error.append(calculate_error(test_y, result_list))

        errors = ErrorWrapper(
            cl,
            sum(our_ada_training_errors[cl]) / len(our_ada_training_errors[cl]),
            sum(our_ada_testing_errors[cl]) / len(our_ada_testing_errors[cl]),
            sum(scikit_error) / len(scikit_error))
        all_error_list.append(errors)

        print("Train avg for %s %s" % (cl, errors.train_error))
        print("Testing avg for %s %s" % (cl, errors.test_error))
        print("Scikit adaboost avg for %s %s" % (cl, errors.scikit_error))

    return all_error_list
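# Illustrative sketch only: ErrorWrapper and calculate_error are helpers the
# snippet above calls but does not define. A minimal version consistent with
# how they are used there (class name, field names, and the two-argument
# calculate_error signature are assumptions):

class ErrorWrapper:
    """Bundle the averaged errors measured for a given number of classifiers."""

    def __init__(self, num_classifiers, train_error, test_error, scikit_error):
        self.num_classifiers = num_classifiers
        self.train_error = train_error
        self.test_error = test_error
        self.scikit_error = scikit_error


def calculate_error(true_labels, predicted_labels):
    """Misclassification rate: fraction of predictions disagreeing with the labels."""
    mistakes = sum(1 for t, p in zip(true_labels, predicted_labels) if t != p)
    return mistakes / float(len(true_labels))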
def testBupaData(self):
    X, Y = load_bupa_dataset()
    classifier = AdaBoost(DecisionStump)
    for t in [100, 200, 300, 400, 500]:
        score = classifier.test_on_training_set(X, Y, t)
        roc = pyroc.ROCData(zip(Y, score))
        auc = roc.auc()
        print auc
        self.failUnless(auc > .9)
def q_9(self):
    tx, ty = ex4_tools.generate_data(5000, noise_ratio=0)
    x, y = ex4_tools.generate_data(200, noise_ratio=0)
    i = 1
    for t in self.ts:
        a_boost = AdaBoost(WL=ex4_tools.DecisionStump, T=t)
        a_boost.train(tx, ty)
        plt.subplot(2, 3, i)
        ex4_tools.decision_boundaries(a_boost, x, y, t)
        i += 1
    plt.show()
def q_10(self):
    tx, ty = ex4_tools.generate_data(5000, noise_ratio=0)
    x, y = ex4_tools.generate_data(200, noise_ratio=0)
    errors = self.get_ab_errors(tx, ty, x, y)
    min_t = np.argmin(errors)
    a_boost = AdaBoost(WL=ex4_tools.DecisionStump, T=self.ts[min_t])
    a_boost.train(tx, ty)
    ex4_tools.decision_boundaries(a_boost, tx, ty, self.ts[min_t])
    plt.title("min error is " + str(errors[min_t]) + " with " +
              str(self.ts[min_t]) + " classifiers")
    plt.show()
def Q10():
    X, y = generate_data(1000, 0)
    T = [5, 10, 50, 100, 200, 500]
    i = int(np.argmin(Q9()))
    T_min = T[i]
    optimal_h = AdaBoost(DecisionStump, T_min)
    optimal_h.train(X, y)
    decision_boundaries(optimal_h, X, y, T_min)
    plt.title('Decision boundaries for the T that minimizes the test error')
    plt.savefig('Q10')
    plt.show()
def Q9():
    X, y = generate_data(300, 0)
    h = AdaBoost(DecisionStump, 500)
    h.train(X, y)
    err = [0] * len(T)
    f = plt.figure(figsize=(10, 10))
    for i, t in enumerate(T):
        f.add_subplot(3, 2, i + 1)
        err[i] = h.error(X, y, t)
        decision_boundaries(h, X, y, t)
    plt.savefig('Q9')
    plt.show()
    return np.array(err)
def crossValidateAdaboost(inputFile, outputFile, nIterations):
    ticTacToe = TicTacToe(inputFile)
    avgEin = np.zeros(nIterations)
    avgEout = np.zeros(nIterations)
    for k in range(ticTacToe.N_FOLDS):
        ticTacToe.createTrainAndTestSets(k)
        adaboost = AdaBoost(ticTacToe)
        Ein, Eout = adaboost.train(ticTacToe, nIterations)
        avgEin = np.sum([avgEin, Ein], axis=0)
        avgEout = np.sum([avgEout, Eout], axis=0)
        print('--------------------------------------')
    return avgEin / ticTacToe.N_FOLDS, avgEout / ticTacToe.N_FOLDS
def adaboost_avg_run(max_classes, avg_num_of_run, training_set, testing_set):
    testing_error_list = []
    all_error_list = []

    # because datasets sometimes place the class attribute at the end or even
    # at the beginning or the middle, we'll separate the attribute vector from
    # the class-label. also note that this is the way scikit-learn does it.
    # train_x: the attribute vector; train_y: the class_label
    (train_x, train_y) = split_attribute_and_label(training_set)
    (test_x, test_y) = split_attribute_and_label(testing_set)
    # print(len(train_x))
    train_subset_num = int(len(train_y) * 0.2)

    for cl in range(1, max_classes + 1, 2):
        train_error = []
        testing_error = []
        scikit_error = []
        for i in range(avg_num_of_run):
            ada_obj = AdaBoost(cl, train_subset_num, THRESHOLD, ETA,
                               UPPER_BOUND, ETA_WEIGHTS, False)
            ada_obj.fit(train_x, train_y)

            hypothesis_list = ada_obj.predict(train_x)
            mistakes = ada_obj.xor_tuples(train_y, hypothesis_list)
            error_rate_train = classifier_error_rate(mistakes)

            hypothesis_list = ada_obj.predict(test_x)
            mistakes = ada_obj.xor_tuples(test_y, hypothesis_list)
            error_rate_test = classifier_error_rate(mistakes)

            train_error.append(error_rate_train)
            testing_error.append(error_rate_test)

            pada = perceptron.Perceptron(max_iter=UPPER_BOUND, verbose=0,
                                         random_state=None, fit_intercept=True,
                                         eta0=ETA)
            bdt = AdaBoostClassifier(pada, algorithm="SAMME", n_estimators=cl)
            bdt.fit(train_x, train_y)
            result_list = bdt.predict(test_x)
            scikit_error.append(calculate_error(test_y, result_list))

        errors = ErrorWrapper(cl,
                              sum(train_error) / len(train_error),
                              sum(testing_error) / len(testing_error),
                              sum(scikit_error) / len(scikit_error))
        all_error_list.append(errors)

        print("Train avg for %s %s" % (cl, errors.train_error))
        print("Testing avg for %s %s" % (cl, errors.test_error))
        testing_error_list.append(
            (sum(testing_error) / len(testing_error)) * 100)
        print("Scikit adaboost avg for %s %s" % (cl, errors.scikit_error))

    # return testing_error_list
    return all_error_list
def main():
    classifier = AdaBoost(DecisionStump)
    num_sets = 50
    T = 100
    percentage = 0.9
    all_errors_train = []
    all_errors_test = []
    aver_error_train = []
    aver_error_test = []

    split_data(percentage, num_sets)

    for i in range(num_sets):
        data_split_train = './data/bupa_train' + str(i) + ".txt"
        data_split_test = './data/' + "bupa_test" + str(i) + ".txt"
        X_train, Y_train = load_data(data_split_train)
        X_test, Y_test = load_data(data_split_test)
        score_train, score_test = classifier.run_adaboost(X_train, Y_train, T, X_test)
        error_train = calculate_error(T, score_train, Y_train)
        error_test = calculate_error(T, score_test, Y_test)
        all_errors_train.append(error_train)
        all_errors_test.append(error_test)

    # calculates the average errors
    for j in range(T):
        a_e_train = 0
        a_e_test = 0
        for i in range(num_sets):
            a_e_train += all_errors_train[i][j]
            a_e_test += all_errors_test[i][j]
        aver_error_train.append(a_e_train / num_sets)
        aver_error_test.append(a_e_test / num_sets)

    save_result_final(aver_error_train, 'train')
    save_result_final(aver_error_test, 'test')

    dataset_here = "./data/bupa.data"
    X_all, Y_all = load_data(dataset_here)
    score_optional = classifier.run_adaboost(X_all, Y_all, T, None, True)
    save_result_final(score_optional, 'empirical')
def main(): X_train = np.array([ [1.0, 2.1], [2.0, 1.1], [1.3, 1.0], [1.0, 1.0], [2.0, 1.0] ]) y_train = np.array([1.0, 1.0, -1.0, -1.0, 1.0]).reshape((-1, 1)) model = AdaBoost(verbose=1) model.fit(X_train, y_train) X_test = np.array([ [5, 5], [0, 0] ]) y_predict = model.predict(X_test) print('predict result: ', y_predict)
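# Illustrative sketch only: the AdaBoost class used above is not shown in this
# collection. Below is a minimal, self-contained version of discrete AdaBoost
# with exhaustive one-dimensional decision stumps, for reference; the class
# name, method names, and internals are assumptions and do not describe the
# implementation used by the snippet above.

import numpy as np


class SimpleAdaBoost:
    def __init__(self, n_rounds=50):
        self.n_rounds = n_rounds
        self.stumps = []   # list of (feature, threshold, polarity)
        self.alphas = []   # weight of each stump in the final vote

    def fit(self, X, y):
        """X: (n, d) array; y: flat array of +/-1 labels."""
        n, d = X.shape
        w = np.full(n, 1.0 / n)               # sample weights
        for _ in range(self.n_rounds):
            best, best_err = None, np.inf
            # exhaustive search for the stump with lowest weighted error
            for j in range(d):
                for thr in np.unique(X[:, j]):
                    for pol in (1, -1):
                        pred = pol * np.where(X[:, j] <= thr, 1, -1)
                        err = np.sum(w[pred != y])
                        if err < best_err:
                            best_err, best = err, (j, thr, pol)
            best_err = max(best_err, 1e-10)   # avoid division by zero
            alpha = 0.5 * np.log((1 - best_err) / best_err)
            j, thr, pol = best
            pred = pol * np.where(X[:, j] <= thr, 1, -1)
            w *= np.exp(-alpha * y * pred)    # re-weight: boost the mistakes
            w /= w.sum()
            self.stumps.append(best)
            self.alphas.append(alpha)
        return self

    def predict(self, X):
        agg = np.zeros(X.shape[0])
        for (j, thr, pol), alpha in zip(self.stumps, self.alphas):
            agg += alpha * pol * np.where(X[:, j] <= thr, 1, -1)
        return np.sign(agg)


# Example usage on toy data like the snippet above (assuming flat +/-1 labels):
# clf = SimpleAdaBoost(n_rounds=10).fit(X_train, y_train.ravel())
# print(clf.predict(X_test))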
def Q3():  # AdaBoost
    path = "/cs/usr/kotek/PycharmProjects/iml_ex4/SynData/"
    X_train, y_train = read_from_txt(path + "X_train.txt", path + "y_train.txt")
    X_val, y_val = read_from_txt(path + "X_val.txt", path + "y_val.txt")
    X_test, y_test = read_from_txt(path + "X_test.txt", path + "y_test.txt")

    # -------- First part --------
    T = np.arange(5, 105, step=5)
    T = np.append(T, np.array([200]))
    training_err = np.zeros(len(T))
    validation_err = np.zeros(len(T))

    # adaBoost uses a weighted trainer (WL)
    WL = ex4_tools.DecisionStump
    for i in range(len(T)):
        adaboost = AdaBoost(WL, T[i])
        adaboost.train(X_train, y_train)
        training_err[i] = adaboost.error(X_train, y_train)
        validation_err[i] = adaboost.error(X_val, y_val)

    plt.plot(T, training_err, label="train error")
    plt.plot(T, validation_err, label="validation error")
    plt.legend()
    plt.show()
    # ------------------------ #

    # -------- Second part --------
    decision_T = [1, 5, 10, 100, 200]
    plt.figure()
    plt.ion()
    for idx, t in enumerate(decision_T):
        adaboost = AdaBoost(WL, t)
        adaboost.train(X_train, y_train)
        plt.subplot(2, 3, idx + 1)
        ex4_tools.decision_boundaries(adaboost, X_train, y_train, "T=" + str(t))
    plt.show()
    plt.pause(5)
def Q3():  # AdaBoost
    T = [1, 5, 10, 50, 100, 200]
    T_loop = [1, 5, 10]
    train_err = []
    valid_err = []
    plt.figure("decisions of the learned classifiers for T")
    num_graph = 0
    for i in range(3, 41):
        T_loop.append(i * 5)
    for t in T_loop:
        ada_boost = AdaBoost(DecisionStump, t)
        ada_boost.train(x_train, y_train)
        if t in T:
            num_graph += 1
            plt.subplot(3, 2, num_graph)
            decision_boundaries(ada_boost, x_train, y_train, "T = %d" % t)
        train_err.append(ada_boost.error(x_train, y_train))
        valid_err.append(ada_boost.error(x_val, y_val))

    plt.figure("training error and the validation error")
    plt.plot(T_loop, train_err, 'ro-', hold=False, label="Training Error")
    plt.plot(T_loop, valid_err, 'go-', label="Validation Error")
    plt.legend()
    plt.show()

    ''' find the T min, and plot it with training error '''
    plt.figure("decision boundaries of T min, with the training data")
    T_hat = 5 * np.argmin(valid_err)
    ada_boost = AdaBoost(DecisionStump, T_hat)
    ada_boost.train(x_train, y_train)
    test_err = ada_boost.error(x_test, y_test)
    decision_boundaries(ada_boost, x_train, y_train, "T = %d" % T_hat)
    plt.show()
    print("The value of T that minimizes the validation error is: ", T_hat)
    print("the test error of the corresponding classifier is: ", test_err)
    return
def Q3():  # AdaBoost
    print("Q3")
    print("===============================================")
    T = [None] * 41
    T[0] = 1
    for i in range(5, 201, 5):
        T[i // 5] = i
    classifiers = [None] * 41
    train_err = [None] * 41
    val_err = [None] * 41
    for i in range(len(T)):
        classifiers[i] = AdaBoost(DecisionStump, T[i])
        classifiers[i].train(X_train, y_train)
        train_err[i] = classifiers[i].error(X_train, y_train)
        val_err[i] = classifiers[i].error(X_val, y_val)

    plt.figure(1)
    plt.subplot(3, 2, 1)
    decision_boundaries(classifiers[0], X_train, y_train, "Training Classification T=1")
    plt.subplot(3, 2, 2)
    decision_boundaries(classifiers[1], X_train, y_train, "Training Classification T=5")
    plt.subplot(3, 2, 3)
    decision_boundaries(classifiers[2], X_train, y_train, "Training Classification T=10")
    plt.subplot(3, 2, 4)
    decision_boundaries(classifiers[10], X_train, y_train, "Training Classification T=50")
    plt.subplot(3, 2, 5)
    decision_boundaries(classifiers[20], X_train, y_train, "Training Classification T=100")
    plt.subplot(3, 2, 6)
    decision_boundaries(classifiers[40], X_train, y_train, "Training Classification T=200")
    plt.show()

    plt.figure(2)
    red_patch = mpatches.Patch(color='red', label='Training')
    b_patch = mpatches.Patch(color='blue', label='Validation')
    plt.legend(handles=[red_patch, b_patch])
    plt.plot(T, train_err, 'r', T, val_err, 'b')
    plt.title("Training Error and Validation Error ")
    plt.show()

    T_hat = T[np.argmin(val_err)]
    print("the value of T_hat (T that minimize validation error) is:", T_hat)  # 55
    print("the test error of T_hat is:", classifiers[T_hat // 5].error(X_test, y_test))  # 0.184

    plt.figure(3)
    decision_boundaries(classifiers[T_hat // 5], X_train, y_train, "Training Classification of T_hat")
    plt.show()
    print("===============================================")
    return
def select_classifier(cname, features=None, labels=None, **kwargs):
    if 'svm'.startswith(cname):
        del kwargs['class_weight']
        c = SVC(probability=True, **kwargs)
    elif 'logistic-regression'.startswith(cname):
        c = LogisticRegression()
    elif 'linear-regression'.startswith(cname):
        c = LinearRegression()
    elif 'random-forest'.startswith(cname):
        try:
            c = RandomForest()
        except NameError:
            logging.warning(' Tried to use random forest, but not available.' +
                            ' Falling back on adaboost.')
            cname = 'ada'
    if 'adaboost'.startswith(cname):
        c = AdaBoost(**kwargs)
    if features is not None and labels is not None:
        c = c.fit(features, labels, **kwargs)
    return c
def train(self):
    raise NotImplementedError("Unfinished")

    detection_rate = 0
    from config import EXPECTED_FPR_PRE_LAYYER
    from config import EXPECTED_FPR
    from config import LABEL_NEGATIVE

    cur_fpr = 1.0
    mat = self._mat
    label = self._label

    for i in xrange(self.limit):
        if cur_fpr < EXPECTED_FPR:
            break
        else:
            cache_filename = ADABOOST_CACHE_FILE + str(i)
            if os.path.isfile(cache_filename):
                self.strong_classifier[i] = getCachedAdaBoost(
                    mat=self._mat,
                    label=self._label,
                    filename=cache_filename,
                    limit=ADABOOST_LIMIT)
            else:
                self.strong_classifier[i] = AdaBoost(mat, label, limit=ADABOOST_LIMIT)

            output, fpr = self.strong_classifier[i].train()
            cur_fpr *= fpr
            fp_num = fpr * numpy.count_nonzero(label == LABEL_NEGATIVE)
            self.strong_classifier[i].saveModel(cache_filename)
            mat, label = self.updateTrainingDate(mat, output, fp_num)
            self.classifierNum += 1
def Q_adaboost(noise_ratio):
    X_train, y_train = generate_data(5000, noise_ratio)
    classifier = AdaBoost(DecisionStump, 500)
    classifier.train(X_train, y_train)
    X_test, y_test = generate_data(200, noise_ratio)

    vals = np.arange(1, 501)
    plt.plot(vals, [classifier.error(X_train, y_train, t) for t in vals],
             label='Training Error', lw=1, alpha=0.6)
    plt.plot(vals, [classifier.error(X_test, y_test, t) for t in vals],
             label='Test Error', lw=1, alpha=0.6)
    plt.legend()
    plt.title(
        f'Adaboost Training & Test Error according to T, noise={noise_ratio}')
    plt.show()

    boosts = [5, 10, 50, 100, 200, 500]
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        decision_boundaries(classifier, X_test, y_test, boosts[i])
        plt.title(f'T={boosts[i]}, noise={noise_ratio}')
    plt.show()

    test_errors = [classifier.error(X_test, y_test, t) for t in vals]
    min_t = np.argmin(test_errors)
    min_err = test_errors[min_t]
    # print(min_t, min_err)
    decision_boundaries(classifier, X_train, y_train, min_t)
    plt.title(f'min test_err {min_err} T={min_t} noise {noise_ratio}')
    plt.show()

    decision_boundaries(classifier, X_train, y_train, 499,
                        classifier.D_of_last_iteration)
    plt.title(f'un-normalized weighed sample T=500, noise={noise_ratio}')
    plt.show()

    decision_boundaries(
        classifier, X_train, y_train, 499,
        classifier.D_of_last_iteration / np.max(classifier.D_of_last_iteration) * 100)
    plt.title(f'normalized weighed sample T=500, noise={noise_ratio}')
    plt.show()
def Q8():
    X, y = generate_data(5000, 0)
    h = AdaBoost(DecisionStump, 500)
    h.train(X, y)
    training_err = np.zeros((500,))
    test_err = np.zeros((500,))
    test_set, labels = generate_data(200, 0)
    for t in range(1, 501):
        training_err[t - 1] = h.error(X, y, t)
        test_err[t - 1] = h.error(test_set, labels, t)
    plt.plot(range(500), training_err, label='Training error')
    plt.plot(range(500), test_err, label='Test error')
    plt.title('question 8')
    plt.legend(loc='upper right')
    plt.xlabel('T')
    plt.ylabel('Error rate')
    plt.savefig('Q8')
    plt.show()
def Q17():
    train_images, test_images, train_labels, test_labels = load_images('../Docs/')
    train_images = integral_image(train_images)
    test_images = integral_image(test_images)
    WL, T = WeakImageClassifier, 50
    ada = AdaBoost(WL, T)
    ada.train(train_images, train_labels)
    T_range = np.arange(1, T)
    train_errs = [ada.error(train_images, train_labels, t) for t in T_range]
    test_errs = [ada.error(test_images, test_labels, t) for t in T_range]
    fig = plt.figure()
    fig.suptitle("Train vs Test error, Face Classifier")
    plt.xlabel('# of Hypotheses (T)')
    plt.ylabel('Error rate (%)')
    plt.plot(T_range, train_errs, label='Train Error')
    plt.plot(T_range, test_errs, label='Test Error')
    # plt.ylim(top=0.06)
    plt.legend()
    plt.savefig(FIG_DIR3 + 'q17')
    'TODO complete this function'
def Q8(noise=0.0):
    n_samples_train, n_samples_test, T = 5000, 200, 500
    train_X, train_y = generate_data(n_samples_train, noise)
    test_X, test_y = generate_data(n_samples_test, noise)
    WL = DecisionStump
    ada = AdaBoost(WL, T)
    ada.train(train_X, train_y)
    T_range = np.arange(1, T)
    train_errs = [ada.error(train_X, train_y, t) for t in T_range]
    test_errs = [ada.error(test_X, test_y, t) for t in T_range]
    fig = plt.figure()
    fig.suptitle("Train vs Test error, Adaboost")
    plt.xlabel('# of Hypotheses (T)')
    plt.ylabel('Error rate (%)')
    plt.plot(T_range, train_errs, label='Train Error')
    plt.plot(T_range, test_errs, label='Test Error')
    # plt.ylim(top=0.06)
    plt.legend()
    plt.savefig(FIG_DIR3 + 'q8' +
                ('' if noise == 0 else '_' + str(noise).replace('.', '_')))
    return ada, test_X, test_y, train_X, train_y
    'TODO complete this function'
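# Illustrative sketch only: the error(X, y, t) calls above evaluate the
# ensemble truncated to its first t weak hypotheses, once per t. Assuming the
# trained booster exposes lists of weak learners and their alpha weights
# (attribute names and the predict interface are assumptions), the whole
# error-vs-T curve could be computed in a single pass of partial sums instead
# of re-predicting from scratch for every t:

import numpy as np


def truncated_error_curve(weak_learners, alphas, X, y):
    """Return err[t-1] = error rate of the ensemble using its first t hypotheses."""
    # one weighted prediction row per weak hypothesis (each predict returns +/-1)
    votes = np.array([alpha * h.predict(X)
                      for h, alpha in zip(weak_learners, alphas)])
    partial = np.cumsum(votes, axis=0)   # running weighted vote after t rounds
    preds = np.sign(partial)             # ensemble decision at each truncation
    # ties (sign 0) count as errors against +/-1 labels, which is fine for a sketch
    return np.mean(preds != y, axis=1)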
    for j in range(i * TESTING_SAMPLE_NUM, (i+1) * TESTING_SAMPLE_NUM):
        haarGroup.append(float(tmp[j]))
    Original_Data.append(haarGroup)

Original_Data = numpy.array(Original_Data)
fileObj.close()

fileObj = open(ADABOOST_FILE, "a+")
print "Constructing AdaBoost from existed model data"
tmp = fileObj.readlines()
a = AdaBoost(train=False)
for i in range(0, len(tmp), 4):
    alpha, demention, label, threshold = None, None, None, None
    for j in range(i, i + 4):
        if (j % 4) == 0:
            alpha = float(tmp[j])
        elif (j % 4) == 1:
            demention = int(tmp[j])
        elif (j % 4) == 2:
            label = float(tmp[j])
        elif (j % 4) == 3:
            threshold = float(tmp[j])
from mapReduce import reduce

map(Face, nonFace)
_mat = reduce()
mat = _mat

featureNum, sampleNum = _mat.shape
assert sampleNum == (POSITIVE_SAMPLE + NEGATIVE_SAMPLE)
assert featureNum == FEATURE_NUM

Label_Face = [+1 for i in xrange(POSITIVE_SAMPLE)]
Label_NonFace = [-1 for i in xrange(NEGATIVE_SAMPLE)]
label = numpy.array(Label_Face + Label_NonFace)

cache_filename = ADABOOST_CACHE_FILE + str(0)
if os.path.isfile(cache_filename):
    model = getCachedAdaBoost(mat=_mat, label=label,
                              filename=cache_filename,
                              limit=ADABOOST_LIMIT)
else:
    model = AdaBoost(mat, label, limit=ADABOOST_LIMIT)
    model.train()
    model.saveModel(cache_filename)

print model
def ab_plot(iterate):
    fig, axes = plt.subplots(2, 2)
    # fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50
        n_feats = np.random.randint(2, 100)
        max_depth_d = 1  # np.random.randint(1, 100)
        classifier = np.random.choice([True])  # , False

        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize model
            criterion = np.random.choice(["entropy", "gini"])
            mine_g = AdaBoost(
                n_iter=iterate,
                max_depth=max_depth_d,
                classifier=classifier,
                # learning_rate=1,
                # loss="crossentropy",
                # step_size="constant",
                # split_criterion=criterion,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine_g = GradientBoostedDecisionTree(
                n_iter=iterate,
                # n_trees=n_trees,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="mse",
                step_size="adaptive",
                split_criterion=criterion,
            )

        # fit 'em
        mine_g.fit(X, Y)

        # # get preds on test set
        # y_pred_mine_test_g = mine_g.predict(X_test)
        #
        # loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)
        #
        # if classifier:
        #     entries = [
        #         ("GB", loss_mine_test_g, y_pred_mine_test_g)
        #     ]
        #     (lbl, test_loss, preds) = entries[np.random.randint(1)]
        #     ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
        #     for i in np.unique(Y_test):
        #         ax.scatter(
        #             X_test[preds == i, 0].flatten(),
        #             X_test[preds == i, 1].flatten(),
        #             # s=0.5,
        #         )
        # else:
        #     X_ax = np.linspace(
        #         np.min(X_test.flatten()) - 1, np.max(X_test.flatten()) + 1, 100
        #     ).reshape(-1, 1)
        #     y_pred_mine_test_g = mine_g.predict(X_ax)
        #
        #     ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)  # s=0.5)
        #     ax.plot(
        #         X_ax.flatten(),
        #         y_pred_mine_test_g.flatten(),
        #         # linewidth=0.5,
        #         label="GB".format(n_trees, n_feats, max_depth_d),
        #         color="red",
        #     )
        #     ax.set_title(
        #         "GB: {:.1f}".format(
        #             loss_mine_test_g
        #         )
        #     )
        # ax.legend()
        # ax.xaxis.set_ticklabels([])
        # ax.yaxis.set_ticklabels([])

    # plt.savefig("plot.png", dpi=300)
    plt.show()
Just Enjoy it.
"""

import numpy
import matplotlib.pyplot as pyplot

from adaboost import AdaBoost
from sklearn import datasets

""" Samples for AdaBoost """
Original_Data, Tag = datasets.make_hastie_10_2(n_samples = 200, random_state = 1)
Original_Data = Original_Data.transpose()

for i in range(len(Tag)):
    if Tag[i] == 1:
        pyplot.plot(Original_Data[0][i], Original_Data[1][i], \
                    '+r', markersize = 10)
    else:
        pyplot.plot(Original_Data[0][i], Original_Data[1][i], \
                    '+b', markersize = 10)

pyplot.title("Sample Points")
pyplot.show()

a = AdaBoost(Original_Data, Tag)
a.train(10000)
def main():
    ''' Load data, split data, create the adaboost algorithm with a decision
    stump, calculate errors, save the final file. Since this is a binary
    classifier, we will do this for each of the 4 networks, one at a time. '''
    classification = []

    ada_folder = OUTPUT_FOLDER + 'adaboost/'
    if not os.path.exists(ada_folder):
        os.makedirs(ada_folder)
    output_file = ada_folder + 'results.out'

    with open(output_file, "w") as f:
        f.write("# ADABOOST RESULTS, TRAIN/TEST FRACTION: " + str(PERCENTAGE) + "\n")
        f.write("# Net Norm Set OL? Accu. Train Accu Test\n")

    # choose classifier
    classifier = AdaBoost(DecisionStump)

    # for each normalization:
    for norm in NORM:

        # for each set
        for number in range(1, NUM_SETS+1):

            ''' with with_outlier '''
            with_outlier = True

            # get input and output file paths
            input_train = get_input_data('train', number, norm, with_outlier)
            input_test = get_input_data('test', number, norm, with_outlier)

            # for each network type:
            for net_name in NET_NAMES:

                # get data
                X_train, Y_train = one_against_all.load_data(input_train, net_name)
                X_test, Y_test = one_against_all.load_data(input_test, net_name)

                print 'Calculating adaboost for net ' + net_name + ' with normalization ' + norm + ' and set ' + str(number)
                score_train, score_test = classifier.run_adaboost(X_train, Y_train, T, X_test)
                error_train = calculate_error(T, score_train, Y_train)
                error_test = calculate_error(T, score_test, Y_test)
                error_train_total = sum(error_train)/len(error_train)
                error_test_total = sum(error_test)/len(error_test)

                save_result_final(error_train_total, error_test_total, output_file,
                                  net_name, norm, number, with_outlier)
                classification.append(str(round(error_test_total, 3)) + ', ' + str(norm) + ', ' +
                                      str(number) + ', ' + str(with_outlier)[0] + '\n')

            ''' with no outlier '''
            with_outlier = False

            # get input and output file paths
            input_train = get_input_data('train', number, norm, with_outlier)
            input_test = get_input_data('test', number, norm, with_outlier)

            # for each network type:
            for net_name in NET_NAMES:

                # get data
                X_train, Y_train = one_against_all.load_data(input_train, net_name)
                X_test, Y_test = one_against_all.load_data(input_test, net_name)

                score_train, score_test = classifier.run_adaboost(X_train, Y_train, T, X_test)
                error_train = calculate_error(T, score_train, Y_train)
                error_test = calculate_error(T, score_test, Y_test)
                error_train_total = sum(error_train)/len(error_train)
                error_test_total = sum(error_test)/len(error_test)

                save_result_final(error_train_total, error_test_total, output_file,
                                  net_name, norm, number, with_outlier)
                classification.append(str(round(error_test_total, 3)) + ', ' + str(norm) + ', ' +
                                      str(number) + ', ' + str(with_outlier)[0] + '\n')

    # find best classifiers
    classification.sort()
    with open(output_file + 'good_classification', "w") as f:
        f.write("\n\n\nClassification\n\n")
        for feat in classification:
            f.write(feat + '\n')
        f.write("\n")

    print 'Results saved at ' + ada_folder
    print 'Done!!!'
    [-1], [-1], [+1], [+1], [+1], [+1], [+1], [+1], [+1], [+1],
    [+1], [+1], [+1], [+1], [+1], [+1], [+1], [+1], [+1], [+1],
    [+1], [+1], [+1], [-1], [-1], [-1], [-1], [-1], [-1], [-1],
    [-1], [-1], [-1], [+1], [+1], [+1]]).transpose()
Tag = Tag.flatten()

for i in range(len(Tag)):
    if Tag[i] == 1:
        pyplot.plot(Original_Data[0][i], Original_Data[1][i], \
                    '+r', markersize = 10)
    else:
        pyplot.plot(Original_Data[0][i], Original_Data[1][i], \
                    '+b', markersize = 10)

a = AdaBoost(Original_Data, Tag)
a.train(100)

TestCase = [[0.55, 1.1, 5.35, 7.0, 8.5, -1.0, 3.0, 3.0, 4.0, 2, 3],
            [4.4, 2.8, 0.9, -12, -13, -9, -10, -9, -5, 0, 2.5]]
output = a.prediction(TestCase)

for i in range(len(output)):
    if output[i] == 1:
        pyplot.plot(TestCase[0][i], TestCase[1][i], \
                    'or', markersize = 20)
    else:
        pyplot.plot(TestCase[0][i], TestCase[1][i], \
                    'ob', markersize = 20)
    [1], [2], [3], [4], [5],
    [6], [7], [8], [9]]).transpose()

Tag = numpy.array([
    [+1], [+1], [+1],
    [-1], [-1], [-1],
    [+1], [+1], [+1],
    [-1],
]).transpose()
Tag = Tag.flatten()

a = AdaBoost(Original_Data, Tag)
a.train(5)
                       (i + 1) * TESTING_SAMPLE_NUM):
        haarGroup.append(float(tmp[j]))
    Original_Data.append(haarGroup)

Original_Data = numpy.array(Original_Data)
fileObj.close()

fileObj = open(ADABOOST_FILE, "a+")
print "Constructing AdaBoost from existed model data"
tmp = fileObj.readlines()
a = AdaBoost(train=False)
for i in range(0, len(tmp), 4):
    alpha, demention, label, threshold = None, None, None, None
    for j in range(i, i + 4):
        if (j % 4) == 0:
            alpha = float(tmp[j])
        elif (j % 4) == 1:
            demention = int(tmp[j])
        elif (j % 4) == 2:
            label = float(tmp[j])
        elif (j % 4) == 3:
            threshold = float(tmp[j])
def do_adaboost(self):
    if self.flag == 0:
        start = time.time()
        adaboost_instance = AdaBoost(self.train)
        adaboost_instance.create_and_train_classifiers()
        adaboost_instance.write_model(self.model_file)
        end = time.time()
        print 'Training Time :', (end - start) / 60, 'mins'
    else:
        start = time.time()
        adaboost_instance = AdaBoost(None)
        adaboost_instance.load_model(self.model_file)
        test_output = adaboost_instance.test(self.test, self.output_file)
        print test_output['accuracy'], '%'
        end = time.time()
        print 'Testing Time :', (end - start) / 60, 'mins'
def _load_data(name):
    return np.loadtxt(_get_file_path('X_' + name)), np.loadtxt(
        _get_file_path('y_' + name))


if __name__ == '__main__':
    X_train, y_train = _load_data('train')
    X_val, y_val = _load_data('val')
    T_values = range(5, 200, 5)
    validation_error = []
    training_error = []
    for t in T_values:
        ada_boost = AdaBoost(DecisionStump, t)
        ada_boost.train(X_train, y_train)
        validation_error.append(ada_boost.error(X_val, y_val))
        training_error.append(ada_boost.error(X_train, y_train))
    training_error_plot, = plot(T_values, training_error, linestyle='--',
                                label='training_error')
    validation_error_plot, = plot(T_values, validation_error, linestyle='--',
                                  label='validation_error')
    legend(handles=[training_error_plot, validation_error_plot])
    for j in range(i * SAMPLE_NUM, (i+1) * SAMPLE_NUM):
        haarGroup.append(float(tmp[j]))
    Original_Data.append(haarGroup)

Original_Data = numpy.array(Original_Data)
fileObj.close()

SampleDem = Original_Data.shape[0]
SampleNum = Original_Data.shape[1]
assert SampleNum == (POSITIVE_SAMPLE + NEGATIVE_SAMPLE)

Label_Face = [+1 for i in range(POSITIVE_SAMPLE)]
Label_NonFace = [-1 for i in range(NEGATIVE_SAMPLE)]
Label = numpy.array(Label_Face + Label_NonFace)

a = AdaBoost(Original_Data, Label)
try:
    a.train(200)
except KeyboardInterrupt:
    print "You pressed interrupt key. Training process interrupt."
saveModel(a)
        [+1],
        [+1],
        [+1],
    ]
).transpose()
Tag = Tag.flatten()

for i in range(len(Tag)):
    if Tag[i] == 1:
        pyplot.plot(Original_Data[0][i], Original_Data[1][i], "+r", markersize=10)
    else:
        pyplot.plot(Original_Data[0][i], Original_Data[1][i], "+b", markersize=10)

a = AdaBoost(Original_Data, Tag)
a.train(100)

TestCase = [[0.55, 1.1, 5.35], [4.4, 2.8, 0.9]]
output = a.prediction(TestCase)

for i in range(len(output)):
    if output[i] == 1:
        pyplot.plot(TestCase[0][i], TestCase[1][i], "or", markersize=20)
    else:
        pyplot.plot(TestCase[0][i], TestCase[1][i], "ob", markersize=20)

pyplot.show()
def Q6():  # Republican or Democrat?
    print("Q6")
    print("===============================================")
    votes_tmp = np.column_stack((votes, parties))
    training_votes, val_votes, test_votes = np.vsplit(
        votes_tmp[np.random.permutation(votes_tmp.shape[0])], (217, 391))
    training_parties = training_votes[:, 16]
    training_votes = np.delete(training_votes, np.s_[16:17], axis=1)
    val_parties = val_votes[:, 16]
    val_votes = np.delete(val_votes, np.s_[16:17], axis=1)
    test_parties = test_votes[:, 16]
    test_votes = np.delete(test_votes, np.s_[16:17], axis=1)

    adaboost_classifiers = [None] * 5
    dtree_classifiers = [None] * 5
    knn_classifiers = [None] * 5
    adaboost_val_err = [None] * 5
    dtree_val_err = [None] * 5
    knn_val_err = [None] * 5
    T = [1, 25, 50, 100, 200]
    k = [1, 5, 25, 100, 200]
    d = [1, 5, 10, 16, 20]
    for i in range(5):
        dtree_classifiers[i] = DecisionTree(d[i])
        dtree_classifiers[i].train(training_votes, training_parties)
        dtree_val_err[i] = dtree_classifiers[i].error(val_votes, val_parties)

        adaboost_classifiers[i] = AdaBoost(DecisionStump, T[i])
        adaboost_classifiers[i].train(training_votes, training_parties)
        adaboost_val_err[i] = adaboost_classifiers[i].error(val_votes, val_parties)

        knn_classifiers[i] = kNN(k[i])
        knn_classifiers[i].train(training_votes, training_parties)
        knn_val_err[i] = knn_classifiers[i].error(val_votes, val_parties)

    """
    explanation for choosing the parameters for each classifier:
    I trained some classifiers of each type with different parameters and then
    measured the validation error on the validation sample. Then, as I did in
    previous tasks, I chose the parameter that minimizes the validation error
    over the sample and used the classifier with this parameter to measure the
    test error. Here are plots of the validation error of each classifier over
    a range of parameters:
    """
    plt.figure(1)
    plt.subplot(3, 1, 1)
    plt.plot(d, dtree_val_err)
    plt.title("Validation Error of Decision Tree")
    plt.subplot(3, 1, 2)
    plt.plot(T, adaboost_val_err)
    plt.title("Validation Error of Adaboost")
    plt.subplot(3, 1, 3)
    plt.plot(k, knn_val_err)
    plt.title("Validation Error of k Nearest Neighbors")
    plt.show()

    d_hat = d[np.argmin(dtree_val_err)]
    T_hat = T[np.argmin(adaboost_val_err)]
    k_hat = k[np.argmin(knn_val_err)]
    print("Decision Tree: the optimal validation error is: ",
          dtree_val_err[d.index(d_hat)],
          " , and the optimal test error is: ",
          dtree_classifiers[d.index(d_hat)].error(test_votes, test_parties))
    print("Adaboost: the optimal validation error is: ",
          adaboost_val_err[T.index(T_hat)],
          " , and the optimal test error is: ",
          adaboost_classifiers[T.index(T_hat)].error(test_votes, test_parties))
    print("k Nearest Neighbors: the optimal validation error is: ",
          knn_val_err[k.index(k_hat)],
          " , and the optimal test error is: ",
          knn_classifiers[k.index(k_hat)].error(test_votes, test_parties))

    # optional
    dtree1 = DecisionTree(3)
    dtree1.train(votes[:10, :], parties[:10])
    view_dtree(dtree1, feature_names, class_names)
    dtree2 = DecisionTree(6)
    dtree2.train(votes[:150, :], parties[:150])
    view_dtree(dtree2, feature_names, class_names)
    dtree3 = DecisionTree(10)
    dtree3.train(votes, parties)
    view_dtree(dtree3, feature_names, class_names)
    print("===============================================")
    return