def cross():
    """Run 10-fold cross validation of AdaBoost on the vote dataset.

    Prints per-fold progress and the accuracy / error / AUC averaged over
    all folds.
    """
    train, target = load_vote()
    # Map {0, 1} labels to {-1, +1}. np.where stays correct on Python 3,
    # where np.array(map(...)) would wrap the map iterator in a useless
    # 0-d object array.
    target = np.where(np.asarray(target) == 0, -1.0, 1.0)
    k = 10
    train_size = len(train)
    test_index_generator = k_fold_cross_validation(train_size, k)
    fold = 1
    overall_acc = 0
    overall_error = 0
    overall_auc = 0
    for start, end in test_index_generator:
        print("====================Fold %s============" % fold)
        # Rows outside [start, end) form the training split for this fold.
        k_fold_train = np.vstack((train[:start], train[end:]))
        test = train[start:end]
        train_target = np.append(target[:start], target[end:])
        test_target = target[start:end]
        adaboost = AdaBoost(OptimalWeakLearner())
        acc, err, auc = adaboost.boost(k_fold_train, train_target,
                                       test, test_target)
        overall_auc += auc
        overall_acc += acc
        overall_error += err
        fold += 1
    print("Overall test accuracy: %s, overall test error: %s, overall test auc: %s" % (
        overall_acc / k, overall_error / k, overall_auc / k))
def optimal_weak_learner_on_random_data():
    """Train AdaBoost on growing random fractions (5%%..50%%) of spambase.

    For each fraction, samples that share of the training rows without
    replacement, boosts on it, and collects (accuracy, error, auc).
    """
    data, target = load_spambase()
    # Floor division keeps the test-set size an int on Python 3 as well
    # (len(data) / 10 would be a float there).
    train, test, train_target, test_target = train_test_shuffle_split(
        data, target, len(data) // 10)
    # Map {0, 1} labels to {-1, +1}; avoids np.array(map(...)), which
    # produces a 0-d object array on Python 3.
    train_target = np.where(np.asarray(train_target) == 0, -1.0, 1.0)
    test_target = np.where(np.asarray(test_target) == 0, -1.0, 1.0)
    indices = range(len(train))
    param = 0.05
    res = []
    while param < 0.5:
        print("Choose %.2f%% of data" % (param * 100))
        choose_size = int(len(indices) * param)
        # Random subset of the training rows, without replacement.
        choose_indices = random.sample(indices, choose_size)
        X = train[choose_indices]
        Y = train_target[choose_indices]
        adaboost = AdaBoost(OptimalWeakLearner())
        acc, err, auc = adaboost.boost(X, Y, test, test_target)
        res.append((acc, err, auc))
        param += 0.05
    print(res)
def active_learning(self, train, train_target, test, test_target):
    """Uncertainty-sampling active learning with AdaBoost.

    Starts from the first 5%% of the data as the labeled pool, then
    repeatedly boosts on it and moves the least-confident unlabeled
    points (smallest |H(x)|) into the pool until 50%% is labeled.
    Each round's (accuracy, error, auc) is appended to self.result.
    """
    fraction = 0.05
    step = 0.05
    seed_count = int(len(train) * fraction)
    batch_count = int(len(train) * step)
    labeled_x, labeled_y = train[:seed_count], train_target[:seed_count]
    pool_x, pool_y = train[seed_count:], train_target[seed_count:]
    while fraction < 0.5:
        print("labeled data: %.2f%%" % (100.0 * len(labeled_x) / len(train)))
        booster = AdaBoost(OptimalWeakLearner())
        accuracy, error, auc = booster.boost(labeled_x, labeled_y,
                                             test, test_target)
        self.result.append((accuracy, error, auc))
        # |H(x)| closest to zero == least confident prediction: query those.
        margins = np.abs(booster.hypothesis(pool_x))
        order = margins.argsort().tolist()
        picked, kept = order[:batch_count], order[batch_count:]
        labeled_x = np.vstack((labeled_x, pool_x[picked]))
        # Newly queried points get their true labels (an alternative would
        # be the booster's own sign(H) predictions).
        labeled_y = np.append(labeled_y, pool_y[picked])
        pool_x, pool_y = pool_x[kept], pool_y[kept]
        fraction += step
def train(self, x): """ xは0, 1, ..., 9の順で同じ数並んでいるとする """ # 1クラスの数 num = len(x) // self.class_num for j, label in enumerate(tqdm(range(self.class_num))): # labelとotherを結合 # otherは1クラスの数分をランダムで取ってくる label_x = x[label * num:(label + 1) * num] other_x = np.delete( x, [i for i in range(label * num, (label + 1) * num)], axis=0) other_x = other_x[np.random.choice(len(x) - num, num), :] vs_x = np.concatenate([label_x, other_x]) # ラベルがlabelのとき1, それ以外を-1にする vs_y = np.array([1 if i < num else -1 for i in range(num * 2)]) # 学習 # self.model_list[j].train(vs_x, vs_y) # copy_model = copy(self.model).train(vs_x, vs_y) # self.model_list.append(copy_model) ImageSize = 28 binary_SVM = SVM(ImageSize**2) adaboost = AdaBoost(binary_SVM, 10) adaboost.train(vs_x, vs_y) self.model_list.append(adaboost)
def train(self, x): """ xは0, 1, ..., 9の順で同じ数並んでいるとする """ # 1クラスの数 num = len(x) // self.class_num for j, combi in enumerate(tqdm(self.combinations)): # combi[0]とcombi[1]のデータを結合 vs_x = np.concatenate([ x[num * combi[0]:num * (combi[0] + 1)], x[num * combi[1]:num * (combi[1] + 1)] ], axis=0) # combi[0]が+1, combi[1]が-1 vs_y = np.array([1 if i < num else -1 for i in range(num * 2)]) # 学習 # self.model_list[j].train(vs_x, vs_y) ImageSize = 28 binary_SVM = SVM(ImageSize**2) adaboost = AdaBoost(binary_SVM, 10) adaboost.train(vs_x, vs_y) self.model_list.append(adaboost)
def train(self, train, train_target, test, test_target, T=100, percentage=0.5):
    """Train one AdaBoost binary classifier per ECOC code function.

    Args:
        train, train_target: training data; only the first *percentage*
            fraction of rows is used.
        test, test_target: held-out data forwarded to each booster.
        T: number of boosting rounds per code function.
        percentage: fraction of the training set to keep.
    """
    n = self.selected_code.shape[1]
    # Truncate the training set to the requested fraction.
    train = train[:int(len(train) * percentage)]
    train_target = train_target[:int(len(train_target) * percentage)]
    first_time = True
    predictors = None
    for f in range(n):
        # Bug fix: f is an integer column index, so %s (was %f, which
        # printed it as "0.000000").
        print("Run Adaboost on function %s" % f)
        codes = self.selected_code[:, f]
        labels = self.convert_to_binary(train_target, codes)
        test_labels = self.convert_to_binary(test_target, codes)
        learner = OptimalWeakLearner()
        # Reuse the predictors computed in the first iteration — presumably
        # they depend only on the features, not the labels (confirm in
        # OptimalWeakLearner).
        if not first_time:
            learner.set_predictors(predictors)
        adaboost = AdaBoost(learner)
        adaboost.boost(train, labels, test, test_labels, T,
                       calculate_auc=False)
        self.functions.append(adaboost)
        if first_time:
            first_time = False
            predictors = learner.get_predictors()
def random_weak_learner():
    """Run AdaBoost with a random weak learner on spambase for 200 rounds."""
    print('==============Random Weak Learner============')
    train, target = load_spambase()
    # Floor division keeps the split size an int on Python 3 as well.
    train, test, train_target, test_target = train_test_shuffle_split(
        train, target, len(train) // 10)
    # Map {0, 1} labels to {-1, +1}; avoids np.array(map(...)), which
    # produces a 0-d object array on Python 3.
    train_target = np.where(np.asarray(train_target) == 0, -1.0, 1.0)
    test_target = np.where(np.asarray(test_target) == 0, -1.0, 1.0)
    adaboost = AdaBoost(RandomChooseLeaner())
    adaboost.boost(train, train_target, test, test_target, T=200)
def entire():
    """Train AdaBoost (optimal weak learner) on one shuffle split of vote."""
    data, target = load_vote()
    # Floor division keeps the test-set size an int on Python 3 as well.
    train, test, train_target, test_target = train_test_shuffle_split(
        data, target, len(data) // 10)
    # Map {0, 1} labels to {-1, +1}; avoids np.array(map(...)), which
    # produces a 0-d object array on Python 3.
    train_target = np.where(np.asarray(train_target) == 0, -1.0, 1.0)
    test_target = np.where(np.asarray(test_target) == 0, -1.0, 1.0)
    adaboost = AdaBoost(OptimalWeakLearner())
    # Every column of the vote dataset is treated as a discrete feature.
    adaboost.boost(train, train_target, test, test_target,
                   discrete_features=list(range(train.shape[1])))
from boosting import AdaBoost, OptimalDecisionStump

if __name__ == '__main__':
    k = 10
    T = 300
    print('Original Dataset')
    print('Reading data')
    data = np.genfromtxt('../HW1/data/spambase/spambase.data', delimiter=',')
    x, y = data[:, :-1], data[:, -1]

    print('Training model')
    bst = AdaBoost(T, OptimalDecisionStump)
    bst.fit(x, y, test_data=(x, y))

    print('Analyzing')
    # Rank features by the alpha-weighted, label-aligned margin mass of the
    # stumps that split on them.
    score = np.zeros(x.shape[1])
    for t in range(T):
        stump = bst.classifiers[t]
        score[stump.feature] += (y * bst.alpha[t] * stump.predict(x)).sum()
    score /= score.sum()
    print('Top 15 features', np.argsort(score)[::-1][:15])

    pred = bst.predict(x)
    print("Accuracy:", np.mean(np.equal(pred, y)))
TestSampleNum = 100 # テストサンプル総数 ClassNum = 10 # クラス数(今回は10) # ImageSize = 8 # 画像サイズ(今回は縦横ともに8) ImageSize = 28 # TrainingDataFile = './Images/TrainingCompressionSamples/{0:1d}-{1:04d}.png' # TestDataFile = './Images/TestCompressionSamples/{0:1d}-{1:04d}.png' TrainingDataFile = './Images/TrainingSamples/{0:1d}-{1:04d}.png' TestDataFile = './Images/TestSamples/{0:1d}-{1:04d}.png' train_x, train_t = LoadDataset(TrainingDataFile, TrainingSampleNum, ClassNum, ImageSize) test_x, test_t = LoadDataset(TestDataFile, TestSampleNum, ClassNum, ImageSize) # Adaboost binary_SVM = SVM(ImageSize**2) adaboost = AdaBoost(binary_SVM, 5) # multi = one_vs_one(binary_SVM, ClassNum, ImageSize**2) # multi = one_vs_other(binary_SVM, ClassNum, ImageSize**2) multi = one_vs_one(SVM, ClassNum, ImageSize**2) # multi = one_vs_other(adaboost, ClassNum, ImageSize**2) # 学習 multi.train(train_x) # 推測 y = multi.eval(test_x) TestResult(y, test_t, ClassNum)
# NOTE(review): this `return` is the tail of a method whose `def` lies before
# this chunk. Four-corner summed-area-table lookup: the sum over the (w, h)
# window anchored at (x, y), batched over axis 0.
return cum_x[:, x + w, y + h] - cum_x[:, x + w, y] - cum_x[:, x, y + h] + cum_x[:, x, y]


if __name__ == '__main__':
    print('Load MINST')
    # NOTE(review): path 'minst' looks like a typo for 'mnist', but it must
    # match the on-disk directory name — verify before changing.
    x, y = load_mnist(path='minst')
    # Binarize pixels, then build per-image 2-D prefix sums so any rectangle
    # sum becomes four array reads.
    x = x > 0
    cum_x = np.cumsum(np.cumsum(x, axis=1), axis=2)
    print('Traning ECOC')
    n, width, height = x.shape
    features = np.zeros((n, 200))
    # 100 random HAAR rectangles; each contributes a horizontal feature
    # (column i) and a vertical one (column 100 + i).
    for i in range(100):
        # Even width in [8, 20]; height drawn so the rectangle area stays
        # roughly in the 130-170 range.
        w = (np.random.randint(3, 10) + 1) * 2
        h = np.random.choice([k for k in range(int(130 / w), int(170 / w), 2)], 1)[0]
        x_pos = np.random.randint(int(width - w))
        y_pos = np.random.randint(int(height - h))
        haar_h = HAARFeature(x_pos, y_pos, w, h, 'horizontal')
        features[:, i] = haar_h.feature_value(cum_x)
        haar_v = HAARFeature(x_pos, y_pos, w, h, 'vertical')
        features[:, 100 + i] = haar_v.feature_value(cum_x)
    # 50-function ECOC over 200-round AdaBoost binary learners.
    ecoc = ECOC(lambda: AdaBoost(200, OptimalDecisionStump), k=50)
    ecoc.fit(features, y)
    pred = ecoc.predict(features)
    print('Training Accuracy', np.equal(pred, y).mean())
def _plot_learning_curves(adaboost):
    """Plot test/train/round error by boosting round, then the test AUC."""
    test_err_points = np.array(adaboost.test_err_array)
    train_err_points = np.array(adaboost.train_err_array)
    auc_points = np.array(adaboost.test_auc_array)
    round_err_points = np.array(adaboost.weighted_err_array)
    plt.xlabel('Round')
    plt.ylabel('Error Rate')
    plt.plot(test_err_points[:, 0], test_err_points[:, 1], c='r',
             label='Test Error')
    plt.plot(test_err_points[:, 0], train_err_points[:, 1], c='g',
             label='Train Error')
    plt.plot(test_err_points[:, 0], round_err_points[:, 1], c='b',
             label='Round Error')
    plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2,
               mode="expand", borderaxespad=0.)
    plt.show()
    plt.xlabel('Round')
    plt.ylabel('AUC')
    plt.plot(test_err_points[:, 0], auc_points[:, 1], c='r', label='AUC')
    plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2,
               mode="expand", borderaxespad=0.)
    plt.show()


def optimal_weak_learner():
    """10-fold cross validation of AdaBoost + optimal stumps on spambase.

    Plots learning curves and the ROC for the first fold only, then prints
    the accuracy / error / AUC averaged over all folds.
    """
    print('==============Optimal Weak Learner============')
    train, target = load_spambase()
    train, target = shuffle(train, target)
    # Map {0, 1} labels to {-1, +1}. np.where stays correct on Python 3,
    # where np.array(map(...)) would wrap the iterator in a 0-d object array.
    target = np.where(np.asarray(target) == 0, -1.0, 1.0)
    k = 10
    train_size = len(train)
    test_index_generator = k_fold_cross_validation(train_size, k)
    fold = 1
    overall_acc = 0
    overall_error = 0
    overall_auc = 0
    for start, end in test_index_generator:
        print("====================Fold %s============" % fold)
        # Rows outside [start, end) form the training split for this fold.
        k_fold_train = np.vstack((train[:start], train[end:]))
        test = train[start:end]
        train_target = np.append(target[:start], target[end:])
        test_target = target[start:end]
        adaboost = AdaBoost(OptimalWeakLearner())
        # Only the first fold is plotted (replaces the redundant
        # `plot = False; if fold == 1: ... else: plot = False` dance).
        plot = fold == 1
        acc, err, auc = adaboost.boost(k_fold_train, train_target, test,
                                       test_target, plot=plot)
        if plot:
            _plot_learning_curves(adaboost)
        overall_auc += auc
        overall_acc += acc
        overall_error += err
        if fold == 1:
            # ROC of the final hypothesis on the held-out fold.
            hypo = adaboost.hypothesis(test)
            roc_points = roc(test_target, hypo, 1.0, -1.0)
            plt.xlabel('FPR')
            plt.ylabel('TPR')
            plt.xlim(xmin=0)
            plt.ylim(ymin=0)
            plt.scatter(roc_points[:, 1], roc_points[:, 0])
            plt.show()
        fold += 1
    print("Overall test accuracy: %s, overall test error: %s, overall test auc: %s" % (
        overall_acc / k, overall_error / k, overall_auc / k))
# NOTE(review): this chunk is the interior of an active-learning routine whose
# `def` (defining train, train_target, test, test_target, init_size,
# increment_size, percentage, increment) lies outside this view.
init_dataset = train[:init_size]
init_target = train_target[:init_size]
remain_dataset = train[init_size:]
remain_target = train_target[init_size:]

# Active learning
X = init_dataset      # labeled pool (features)
Y = init_target       # labeled pool (targets)
R = remain_dataset    # unlabeled pool (features)
RY = remain_target    # true targets of the unlabeled pool
result = []
while percentage < 0.5:
    print "labeled data: %.2f%%" % (100.0 * len(X) / len(train))
    adaboost = AdaBoost(OptimalWeakLearner())
    acc, err, auc = adaboost.boost(X, Y, test, test_target)
    result.append((acc, err, auc))
    # |H(x)| near zero means the booster is least confident: query those
    # points first (uncertainty sampling).
    H = adaboost.hypothesis(R)
    H_abs = np.abs(H)
    sorted_indices = H_abs.argsort().tolist()
    selected = sorted_indices[:increment_size]
    remained = sorted_indices[increment_size:]
    X = np.vstack((X, R[selected]))
    # Y = np.append(Y, adaboost.sign(H[selected]))
    # Queried points receive their true labels (RY), not predicted ones.
    Y = np.append(Y, RY[selected])
    R = R[remained]
    RY = RY[remained]
    percentage += increment