Example #1
# assumed imports for this snippet: numpy plus the repo's own DecisionStump
# and AdaBoost classes (not shown in the excerpt)
import numpy as np


def main():
    data = np.loadtxt(open("/Users/rio512hsu/dataset/MachineLearningTechniques" +
                           "/hw2_adaboost_train.csv", "rb"),
                      delimiter=" ")

    X = data[:, :-1]
    y = data[:, -1]
    u = np.ones(X.shape[0]) / X.shape[0]
    clf = DecisionStump().fit(X, y, u)
    # Q12
    print(clf.getEin())

    # Q13
    adaboost = AdaBoost(DecisionStump).fit(X, y, 300)
    # print adaboost.predict(X)
    print(np.sum(adaboost.predict(X) != y))

    # Q17
    test = np.loadtxt(open("/Users/rio512hsu/dataset/" +
                           "MachineLearningTechniques/" +
                           "hw2_adaboost_test.csv"),
                      delimiter=' ')
    X_test = test[:, :-1]
    y_test = test[:, -1]
    print(np.sum(clf.predict(X_test) != y_test) / float(test.shape[0]))

    # Q18
    print(np.sum(adaboost.predict(X_test) != y_test) / float(test.shape[0]))

    return 0
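
The DecisionStump and AdaBoost classes above come from the example's own repo and are not shown. As a rough guide to the stump interface the call sites assume (fit(X, y, u) with per-example weights u, getEin(), predict(X)), here is a minimal sketch; the class and method names are taken from the calls above, everything inside is an assumption:

import numpy as np

class DecisionStump:
    # axis-aligned stump h(x) = s * sign(x[d] - theta), chosen to minimize
    # the u-weighted 0/1 error (a sketch, not the repo's implementation)
    def fit(self, X, y, u):
        best = (np.inf, 0, 1, 0.0)  # (weighted error, feature, sign, threshold)
        for d in range(X.shape[1]):
            xs = np.sort(X[:, d])
            # candidate thresholds: below the smallest value and at midpoints
            thresholds = np.r_[xs[0] - 1.0, (xs[:-1] + xs[1:]) / 2.0]
            for theta in thresholds:
                for s in (1, -1):
                    pred = s * np.sign(X[:, d] - theta)
                    err = np.sum(u * (pred != y)) / np.sum(u)
                    if err < best[0]:
                        best = (err, d, s, theta)
        self.Ein, self.d, self.s, self.theta = best
        return self

    def getEin(self):
        return self.Ein  # u-weighted in-sample error of the chosen stump

    def predict(self, X):
        return self.s * np.sign(X[:, self.d] - self.theta)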
Example #2
def adaboost_avg_run(max_classes, avg_num_of_run, training_set, testing_set):
    testing_error_list = []
    all_error_list = []

    # datasets may place the class label at the beginning, middle, or end of
    # each record, so separate the attribute vectors from the class labels
    # (the same X / y convention scikit-learn uses)
    # train_x: attribute vectors; train_y: class labels
    (train_x, train_y) = split_attribute_and_label(training_set)
    (test_x, test_y) = split_attribute_and_label(testing_set)
    # print(len(train_x))
    train_subset_num = int(len(train_y) * 0.2)

    for cl in range(1, max_classes + 1, 2):
        train_error = []
        testing_error = []
        scikit_error = []
        for i in range(avg_num_of_run):

            ada_obj = AdaBoost(cl, train_subset_num, THRESHOLD, ETA,
                               UPPER_BOUND, ETA_WEIGHTS, False)
            ada_obj.fit(train_x, train_y)

            hypothesis_list = ada_obj.predict(train_x)
            mistakes = ada_obj.xor_tuples(train_y, hypothesis_list)
            error_rate_train = classifier_error_rate(mistakes)

            hypothesis_list = ada_obj.predict(test_x)
            mistakes = ada_obj.xor_tuples(test_y, hypothesis_list)
            error_rate_test = classifier_error_rate(mistakes)
            train_error.append(error_rate_train)
            testing_error.append(error_rate_test)

            pada = perceptron.Perceptron(max_iter=UPPER_BOUND,
                                         verbose=0,
                                         random_state=None,
                                         fit_intercept=True,
                                         eta0=ETA)

            bdt = AdaBoostClassifier(pada, algorithm="SAMME", n_estimators=cl)
            bdt.fit(train_x, train_y)
            result_list = bdt.predict(test_x)
            scikit_error.append(calculate_error(test_y, result_list))

        errors = ErrorWrapper(cl,
                              sum(train_error) / len(train_error),
                              sum(testing_error) / len(testing_error),
                              sum(scikit_error) / len(scikit_error))

        all_error_list.append(errors)
        print("Train avg for %s   %s" % (cl, errors.train_error))
        print("Testing avg for %s   %s" % (cl, errors.test_error))
        testing_error_list.append(
            (sum(testing_error) / len(testing_error)) * 100)
        print("Scikit adaboost avg for %s   %s" % (cl, errors.scikit_error))

    #return testing_error_list
    return all_error_list
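
The helpers used above (split_attribute_and_label, classifier_error_rate, calculate_error, ErrorWrapper) and the constants THRESHOLD, ETA, UPPER_BOUND and ETA_WEIGHTS live elsewhere in that repo. A plausible minimal sketch of the helpers, assuming each record stores its class label last and that xor_tuples yields one truthy entry per misclassified example:

from collections import namedtuple

ErrorWrapper = namedtuple('ErrorWrapper',
                          ['cl', 'train_error', 'test_error', 'scikit_error'])

def split_attribute_and_label(dataset):
    # records carry the class label in the last position (assumption)
    xs = [record[:-1] for record in dataset]
    ys = [record[-1] for record in dataset]
    return xs, ys

def classifier_error_rate(mistakes):
    # fraction of entries flagged as misclassified
    return sum(1 for m in mistakes if m) / len(mistakes)

def calculate_error(y_true, y_pred):
    return sum(1 for t, p in zip(y_true, y_pred) if t != p) / len(y_true)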
Example #3
def testTitanicCARTAdaBoost():
    print('-' * 30, '\ntestTitanicCARTAdaBoost\n', '-' * 30)
    trd = pd.read_csv('Titanic_dataset/train.csv')
    # drop unused (and optionally continuous) features
    #for i in ["PassengerId", "Name", "Ticket", "Cabin", "Age", "SibSp", "Parch", "Fare"]:
    for i in ["PassengerId", "Name", "Ticket", "Cabin"]:
        trd.pop(i)
    trd = trd.dropna()  # drop nan values
    # convert non-digits to digits
    trd = pd.get_dummies(trd, columns=['Sex'])
    Embarked_map = {
        val: idx
        for idx, val in enumerate(trd['Embarked'].unique())
    }
    trd['Embarked'] = trd['Embarked'].map(Embarked_map)
    if DEBUG: print(trd[:5])
    # create train data
    trdd = trd.sample(frac=0.4)
    # using "Survived" as labels
    trl = trd.pop("Survived")
    trl[trl == 0] = -1
    trll = trdd.pop("Survived")
    trll[trll == 0] = -1
    # training tree
    t = AdaBoost(CART_weight_classifier, 10)
    t.fit(trdd, trll)
    # prediction
    pred = t.predict(trd)
    print('Acc.: ', np.sum(pred == trl.reset_index(drop=True)) / trl.shape[0])
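
The encoding steps above (one-hot Sex, integer-coded Embarked) can be sanity-checked on a toy frame; the values below are made up for illustration:

import pandas as pd

toy = pd.DataFrame({'Sex': ['male', 'female'], 'Embarked': ['S', 'C']})
toy = pd.get_dummies(toy, columns=['Sex'])  # Sex -> Sex_female, Sex_male
toy['Embarked'] = toy['Embarked'].map(
    {val: idx for idx, val in enumerate(toy['Embarked'].unique())})
print(toy)  # one-hot Sex columns plus integer-coded Embarked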
Example #4
def main():
    X_train = np.array([
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0]
    ])

    y_train = np.array([1.0, 1.0, -1.0, -1.0, 1.0]).reshape((-1, 1))
    model = AdaBoost(verbose=1)
    model.fit(X_train, y_train)

    X_test = np.array([
        [5, 5],
        [0, 0]
    ])

    y_predict = model.predict(X_test)
    print('predict result: ', y_predict)
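
For comparison, scikit-learn's AdaBoostClassifier can be run on the same toy arrays (the AdaBoost above is the example's own class; this sketch only shows the sklearn counterpart):

from sklearn.ensemble import AdaBoostClassifier

sk_model = AdaBoostClassifier(n_estimators=10)
sk_model.fit(X_train, y_train.ravel())  # sklearn wants a 1-D label array
print('sklearn predict:', sk_model.predict(X_test))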
Example #5
# KNN scikit-learn
knn_begin = time.time()
clf2 = KNeighborsClassifier(n_neighbors=5)
clf2.fit(features_train, labels_train)
pred = clf2.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
knn_end = time.time()
print('====== sklearn KNN ======')
print('The sklearn KNN classification accuracy = {}'.format(accuracy))
print('Training and prediction time = {}'.format(knn_end - knn_begin))

# AdaBoost custom implementation
custom_adaboost_begin = time.time()
custom_adaboost = AdaBoost(num_of_hypotheses=100)
custom_adaboost.fit(features_train, labels_train)
pred = custom_adaboost.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
custom_adaboost_end = time.time()
print('====== Custom AdaBoost ======')
print('The custom AdaBoost classification accuracy = {}'.format(accuracy))
print('Training and prediction time = {}'.format(custom_adaboost_end - custom_adaboost_begin))

# AdaBoost scikit-learn
adaboost_begin = time.time()
clf3 = AdaBoostClassifier(n_estimators=100)
clf3.fit(features_train, labels_train)
pred = clf3.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
adaboost_end = time.time()
print('====== sklearn AdaBoost ======')
print('The sklearn AdaBoost classification accuracy = {}'.format(accuracy))
print('Training and prediction time = {}'.format(adaboost_end - adaboost_begin))
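
scikit-learn's AdaBoostClassifier boosts depth-1 decision trees (stumps) by default; spelling that out makes the comparison with the custom implementation explicit. A sketch, assuming a recent sklearn (older releases call the argument base_estimator rather than estimator):

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

clf4 = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                          n_estimators=100)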
Example #6
# the snippet opens mid-statement; given the 'boost.fit' calls below, the
# truncated line is presumably the sklearn ensemble's constructor
boost = AdaBoostClassifier(algorithm='SAMME',
                           n_estimators=no_base_classifiers,
                           learning_rate=1.0)

## CV
kf = KFold(n_splits=no_folds)
cv_acc_arr = []
cv_sk_acc_arr = []
for i, (train_ind, test_ind) in enumerate(kf.split(X_train)):
    print("cross split no", i)
    x_tr, x_te = X_train.copy()[train_ind], X_train.copy()[test_ind]
    y_tr, y_te = y_train.copy()[train_ind], y_train.copy()[test_ind]

    f.init(x_tr, y_tr)
    f.train(no_base_classifiers)
    y_predict = f.predict(x_te)
    accuracy = np.mean(y_predict == y_te)
    cv_acc_arr.append(accuracy)

    ## comparing sklearn implementation of boost
    boost.fit(x_tr, y_tr)
    y_pred = boost.predict(x_te)
    accuracy_sk = np.mean(y_pred == y_te)
    cv_sk_acc_arr.append(accuracy_sk)


print("custom AdaBoost mean CV accuracy:", np.mean(cv_acc_arr))
print("sklearn AdaBoost mean CV accuracy:", np.mean(cv_sk_acc_arr))
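
If the labels are imbalanced, StratifiedKFold keeps the class ratios stable across folds; it is a drop-in replacement for the KFold split above (a sketch, reusing the same loop body):

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=no_folds)
for train_ind, test_ind in skf.split(X_train, y_train):
    pass  # identical body to the KFold loop above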
Example #7
    if trainOrTest == 'train':
        myBoost = AdaBoost(300, verbose=False)
        TrainX, TrainY, TrainXID = myBoost.getDataFromFile(train_test_file)
        myBoost.train(TrainX, TrainY)
        pk.dump(myBoost, open(model_file, 'wb'))
        
    if trainOrTest == 'test':
        try:
            myBoost = pk.load(open(model_file, 'rb'))
        except (IOError, pk.UnpicklingError):
            # the original bare except hid the failure and left myBoost
            # undefined below
            print("model file has not been generated")
            raise

        if myBoost.isTrained:
            Xtest, yTest, XtestID = myBoost.getDataFromFile(train_test_file)
            finalPredictions = myBoost.predict(Xtest)
            myBoost.writeToFile(XtestID, finalPredictions, 'output.txt')
            print("Accuracy is:", sum(finalPredictions == yTest) / len(yTest))
        else:
            print("Untrained model being tested")

#train train-data.txt knn_model.txt knn
#test test-data.txt knn_model.txt knn
if model == 'knn' :
    
    if trainOrTest == 'train':
        knn.train(train_test_file,model_file)
        
    if trainOrTest == 'test':
        try:
            myKnn = open(model_file, 'rb')
        except IOError:
            # mirrors the adaboost branch above; the rest of this example
            # is truncated in the source
            print("model file has not been generated")

Example #8
    train_set, validation_set = split_train_validation(dataset,
                                                       validation_set_size)
    num_to_train_on = 10000000
    time_before("training adaboost")
    ab.train_set(dataset[:num_to_train_on])
    time_after("training adaboost")
    time_before("training naive bayes")
    nb.train_set(dataset[:num_to_train_on])
    time_after("training naive bayes")

    kg_validations_nb = []
    kg_validations_ab = []

    for i in validation_set:
        kg_validations_nb.append(nb.predict(*i[1:]) == i[0])
        kg_validations_ab.append(ab.predict(*i[1:]) == i[0])

    # print("Errors nb: %s " % sum([0 if i else 1 for i in kg_validations_nb]))
    print("Errors ab: %s " % sum([0 if i else 1 for i in kg_validations_ab]))

    # import pdb; pdb.set_trace()

    predictions = []

    print("creating predictions...")
    with open(testset, "r") as testfile:
        data = testfile.read()
        lines = data.split('\n')[1:][:num_to_train_on]
        for line in lines:
            if not line:
                continue
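
time_before/time_after and split_train_validation come from this snippet's own utilities. A minimal sketch consistent with how they are called (names from the call sites, bodies assumed):

import time

_t0 = {}

def time_before(label):
    _t0[label] = time.time()
    print('start:', label)

def time_after(label):
    print('%s took %.2fs' % (label, time.time() - _t0[label]))

def split_train_validation(dataset, validation_set_size):
    # hold out the last validation_set_size records for validation
    return dataset[:-validation_set_size], dataset[-validation_set_size:]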
Example #9
# encoding=utf-8
# @Author: wendesi
# @Date:   15-11-16
# @Email:  [email protected]
# @Last modified by:   wendesi
# @Last modified time: 15-11-16

import logging

from generate_dataset import *
from adaboost import AdaBoost

from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    train_features, train_labels, test_features, test_labels = generate_dataset(
        200)

    ada = AdaBoost()
    ada.train(train_features, train_labels)

    print('end train')
    test_predict = ada.predict(test_features)

    score = accuracy_score(test_labels, test_predict)
    print "ada boost the accruacy socre is ", score