def testTitanicCARTAdaBoost():
    print('-' * 30, '\ntestTitanicCARTAdaBoost\n', '-' * 30)
    trd = pd.read_csv('Titanic_dataset/train.csv')

    # drop useless (and optionally continuous) features
    # for i in ["PassengerId", "Name", "Ticket", "Cabin", "Age", "SibSp", "Parch", "Fare"]:
    for i in ["PassengerId", "Name", "Ticket", "Cabin"]:
        trd.pop(i)
    trd = trd.dropna()  # drop NaN values

    # convert non-numeric features to numeric codes
    trd = pd.get_dummies(trd, columns=['Sex'])
    Embarked_map = {val: idx for idx, val in enumerate(trd['Embarked'].unique())}
    trd['Embarked'] = trd['Embarked'].map(Embarked_map)
    if DEBUG:
        print(trd[:5])

    # sample 40% of the rows as training data
    trdd = trd.sample(frac=0.4)

    # use "Survived" as labels, remapped from {0, 1} to {-1, +1}
    trl = trd.pop("Survived")
    trl[trl == 0] = -1
    trll = trdd.pop("Survived")
    trll[trll == 0] = -1

    # train the boosted CART ensemble
    t = AdaBoost(CART_weight_classifier, 10)
    t.fit(trdd, trll)

    # predict on the full cleaned dataset and report accuracy
    pred = t.predict(trd)
    print('Acc.: ', np.sum(pred == trl.reset_index(drop=True)) / trl.shape[0])
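
# The test above treats AdaBoost as a wrapper around a weight-aware weak
# learner. As a reference for what that wrapper does, here is a minimal
# sketch of the classic binary AdaBoost fit loop; the factory/interface
# names are illustrative assumptions, not this repo's actual API.
import numpy as np

def adaboost_fit_sketch(X, y, weak_learner_factory, n_rounds=10):
    # y is a numpy array in {-1, +1}, matching the label remapping above
    n = len(y)
    w = np.full(n, 1.0 / n)              # start with uniform sample weights
    learners, alphas = [], []
    for _ in range(n_rounds):
        h = weak_learner_factory()
        h.fit(X, y, sample_weight=w)     # weak learner must honor weights
        pred = h.predict(X)
        err = w[pred != y].sum()         # weighted training error
        if err >= 0.5:                   # no better than chance: stop early
            break
        alpha = 0.5 * np.log((1.0 - err) / max(err, 1e-10))
        w *= np.exp(-alpha * y * pred)   # up-weight misclassified samples
        w /= w.sum()
        learners.append(h)
        alphas.append(alpha)
    return learners, alphas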
def adaboost_avg_run(max_classes, avg_num_of_run, training_set, testing_set):
    testing_error_list = []
    all_error_list = []

    # Datasets sometimes place the class attribute at the end, the beginning,
    # or the middle, so separate the attribute vectors from the class labels
    # up front (this is also the convention scikit-learn uses).
    # train_x: attribute vectors; train_y: class labels
    (train_x, train_y) = split_attribute_and_label(training_set)
    (test_x, test_y) = split_attribute_and_label(testing_set)

    train_subset_num = int(len(train_y) * 0.2)
    for cl in range(1, max_classes + 1, 2):
        train_error = []
        testing_error = []
        scikit_error = []
        for i in range(avg_num_of_run):
            ada_obj = AdaBoost(cl, train_subset_num, THRESHOLD, ETA,
                               UPPER_BOUND, ETA_WEIGHTS, False)
            ada_obj.fit(train_x, train_y)

            hypothesis_list = ada_obj.predict(train_x)
            mistakes = ada_obj.xor_tuples(train_y, hypothesis_list)
            error_rate_train = classifier_error_rate(mistakes)

            hypothesis_list = ada_obj.predict(test_x)
            mistakes = ada_obj.xor_tuples(test_y, hypothesis_list)
            error_rate_test = classifier_error_rate(mistakes)

            train_error.append(error_rate_train)
            testing_error.append(error_rate_test)

            # reference run: scikit-learn AdaBoost over perceptron learners
            pada = perceptron.Perceptron(max_iter=UPPER_BOUND, verbose=0,
                                         random_state=None,
                                         fit_intercept=True, eta0=ETA)
            bdt = AdaBoostClassifier(pada, algorithm="SAMME", n_estimators=cl)
            bdt.fit(train_x, train_y)
            result_list = bdt.predict(test_x)
            scikit_error.append(calculate_error(test_y, result_list))

        errors = ErrorWrapper(cl,
                              sum(train_error) / len(train_error),
                              sum(testing_error) / len(testing_error),
                              sum(scikit_error) / len(scikit_error))
        all_error_list.append(errors)

        print("Train avg for %s %s" % (cl, errors.train_error))
        print("Testing avg for %s %s" % (cl, errors.test_error))
        testing_error_list.append(
            (sum(testing_error) / len(testing_error)) * 100)
        print("Scikit adaboost avg for %s %s" % (cl, errors.scikit_error))

    return all_error_list
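
# split_attribute_and_label is assumed to put the class label in the last
# column of each example; a hypothetical sketch consistent with its use above:
def split_attribute_and_label_sketch(dataset):
    xs = [row[:-1] for row in dataset]   # attribute vectors
    ys = [row[-1] for row in dataset]    # class labels
    return xs, ys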
def main():
    # tiny 2-D toy set with labels in {-1, +1}
    X_train = np.array([
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0]
    ])
    y_train = np.array([1.0, 1.0, -1.0, -1.0, 1.0]).reshape((-1, 1))

    model = AdaBoost(verbose=1)
    model.fit(X_train, y_train)

    X_test = np.array([
        [5, 5],
        [0, 0]
    ])
    y_predict = model.predict(X_test)
    print('predict result: ', y_predict)
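
# On a 2-D toy set like the one above, each boosting round typically fits a
# decision stump: the feature/threshold/polarity triple with the lowest
# weighted error. A hypothetical sketch (not necessarily this repo's code):
def best_stump_sketch(X, y, w):
    best, lowest_err = None, np.inf
    for j in range(X.shape[1]):                      # each feature
        for thresh in np.unique(X[:, j]):            # each candidate split
            for polarity in (1.0, -1.0):             # which side gets +1
                pred = np.where(polarity * (X[:, j] - thresh) >= 0, 1.0, -1.0)
                err = w[pred != y.ravel()].sum()     # weighted error
                if err < lowest_err:
                    lowest_err, best = err, (j, thresh, polarity)
    return best, lowest_err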
def ab_plot(iterate):
    fig, axes = plt.subplots(2, 2)
    fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_feats = np.random.randint(2, 100)
        max_depth_d = 1  # depth-1 stumps
        classifier = np.random.choice([True])  # add False to mix in regression

        if classifier:
            # create a classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            loss = accuracy_score
            mine_g = AdaBoost(
                n_iter=iterate,
                max_depth=max_depth_d,
                classifier=classifier,
            )
        else:
            # create a regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            loss = mean_squared_error
            mine_g = GradientBoostedDecisionTree(
                n_iter=iterate,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="mse",
                step_size="adaptive",
                split_criterion="mse",
            )

        # fit the model
        mine_g.fit(X, Y)

        # get predictions on the test set and plot them
        preds = mine_g.predict(X_test)
        test_loss = loss(preds, Y_test)

        if classifier:
            ax.set_title("GB Accuracy: {:.2f}%".format(test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                )
        else:
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1, np.max(X_test.flatten()) + 1, 100
            ).reshape(-1, 1)
            y_line = mine_g.predict(X_ax)
            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)
            ax.plot(X_ax.flatten(), y_line.flatten(), label="GB", color="red")
            ax.set_title("GB MSE: {:.1f}".format(test_loss))
            ax.legend()

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
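
# Assumed entry point for a quick look at the panels (the argument is the
# number of boosting rounds); not part of the original script:
if __name__ == "__main__":
    ab_plot(50)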
from adaboost import AdaBoost


def create_data():
    # take the first two features of the first 100 iris samples
    # (classes 0 and 1) and remap label 0 to -1 for AdaBoost
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = [
        'sepal length', 'sepal width', 'petal length', 'petal width', 'label'
    ]
    data = np.array(df.iloc[:100, [0, 1, -1]])
    for i in range(len(data)):
        if data[i, -1] == 0:
            data[i, -1] = -1
    return data[:, :2], data[:, -1]


X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# plt.scatter(X[:50, 0], X[:50, 1], label='0')
# plt.scatter(X[50:, 0], X[50:, 1], label='1')
# plt.legend()
# plt.show()

clf = AdaBoost(n_estimators=50, learning_rate=0.2)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
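
# Optional sanity check using the same interface as above: sweep the number
# of weak learners to see how test accuracy responds (illustrative only;
# results vary with the random train/test split).
for n in (10, 50, 100):
    clf_n = AdaBoost(n_estimators=n, learning_rate=0.2)
    clf_n.fit(X_train, y_train)
    print('n_estimators={}: {}'.format(n, clf_n.score(X_test, y_test)))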
# KNN scikit-learn
knn_begin = time.time()
clf2 = KNeighborsClassifier(n_neighbors=5)
clf2.fit(features_train, labels_train)
pred = clf2.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
knn_end = time.time()
print('====== sklearn KNN ======')
print('The sklearn KNN classification accuracy = {}'.format(accuracy))
print('Training and prediction time = {}'.format(knn_end - knn_begin))

# AdaBoost custom implementation
custom_adaboost_begin = time.time()
custom_adaboost = AdaBoost(num_of_hypotheses=100)
custom_adaboost.fit(features_train, labels_train)
pred = custom_adaboost.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
custom_adaboost_end = time.time()
print('====== Custom AdaBoost ======')
print('The custom AdaBoost classification accuracy = {}'.format(accuracy))
print('Training and prediction time = {}'.format(
    custom_adaboost_end - custom_adaboost_begin))

# AdaBoost scikit-learn
adaboost_begin = time.time()
clf3 = AdaBoostClassifier(n_estimators=100)
clf3.fit(features_train, labels_train)
pred = clf3.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
adaboost_end = time.time()
print('====== sklearn AdaBoost ======')