def main():
    learner = lambda d, l: adaboost.train(
        stump.random, stump.predict, data=d, labels=l, iters=3000)
    codes, predictors = train('data/8newsgroup/train.trec', 20, learner,
                              adaboost.predict)
    train_error = test('data/8newsgroup/train.trec', codes, predictors)
    test_error = test('data/8newsgroup/test.trec', codes, predictors)
    print "\ntrain_err=%.6f, test_err=%.6f" % (train_error, test_error)
Example #2
    def test_train(self):
        print("Test Train:")
        data_mat, labels_arr = adb.load_simple_data()
        classifies_arr, est_agg = adb.train(data_mat, labels_arr, 9)
        self.assertEqual(len(classifies_arr), 3)
        self.assertEqual(0, classifies_arr[-1]['dim'])
        self.assertEqual('lt', classifies_arr[-1]['ineq'])
        self.assertEqual(0.9, classifies_arr[-1]['thresh'])
        self.assertEqual(0.8958797346, round(classifies_arr[-1]['alpha'], 10))
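# Note: the expected alpha is the standard AdaBoost classifier weight,
# alpha = 0.5 * ln((1 - eps) / eps); 0.5 * ln(6) = 0.8958797346..., i.e. a
# weighted error of 1/7 on that boosting round. Below is a minimal,
# illustrative sketch of how one of the returned stump dicts could be applied,
# assuming only the 'dim', 'thresh' and 'ineq' keys asserted in the test; the
# helper name stump_classify is hypothetical, not part of adb.
import numpy as np

def stump_classify(data_mat, dim, thresh, ineq):
    """Apply a single decision stump: threshold one feature column."""
    data_arr = np.asarray(data_mat, dtype=float)
    pred = np.ones((data_arr.shape[0], 1))          # default every sample to +1
    if ineq == 'lt':
        pred[data_arr[:, dim] <= thresh] = -1.0     # the 'lt' side is labelled -1
    else:
        pred[data_arr[:, dim] > thresh] = -1.0      # the 'gt' side is labelled -1
    return pred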
Example #3
    def test_classify(self):
        print("Test Classify:")
        print("Classify Simple Data:")
        data_mat, labels_arr = adb.load_simple_data()
        classifies_arr, est_agg = adb.train(data_mat, labels_arr, 30)
        pred = adb.classify([[5, 5], [0, 0]], classifies_arr)
        res = np.matrix([[1.], [-1.]])
        self.assertEqual(True, (pred == res).all())

        print("Classify Loaded Data:")
        datArr, labelArr = adb.load_data_set('horseColicTraining2.txt')
        classiferArray, aggClassEst = adb.train(datArr, labelArr, 10)
        testArr, testLabelArr = adb.load_data_set('horseColicTest2.txt')
        prediction10 = adb.classify(testArr, classiferArray)
        errArr = np.mat(np.ones((67, 1)))
        err_rate = errArr[prediction10 != np.mat(testLabelArr).T].sum() / 67
        self.assertEqual(16.0 / 67, err_rate)
        print("Test Error: %f%%" % (err_rate * 100))
        # Plot the ROC curve and compute the AUC
        val_auc = adb.plot_roc(aggClassEst, labelArr)
        self.assertLessEqual(0.8582969635, round(val_auc, 10))
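# For context, adb.classify above can be read as the usual AdaBoost decision
# rule: sum the alpha-weighted outputs of every stored stump and take the
# sign. A minimal sketch under that assumption, reusing the hypothetical
# stump_classify helper from Example #2 (ensemble_classify is an illustrative
# name, not the library function):
import numpy as np

def ensemble_classify(data, classifiers):
    data = np.asarray(data, dtype=float)
    agg = np.zeros((data.shape[0], 1))
    for c in classifiers:
        # each weak learner casts a vote weighted by its alpha
        agg += c['alpha'] * stump_classify(data, c['dim'], c['thresh'], c['ineq'])
    return np.sign(agg)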
Example #4
def active_learning(filenames, random=False):
    data, labels = load_data(filenames, delimiter=",", label_map={0.0:-1.0,1.0:1.0})
    total = len(data)
    state = np.random.get_state()
    np.random.shuffle(data)
    np.random.set_state(state)
    np.random.shuffle(labels)
    step = int(ceil(0.025 * len(labels)))
    sample_data = np.copy(data[-step*2:])
    sample_labels = np.copy(labels[-step*2:])
    data = np.delete(data, np.s_[-step*2:], axis=0)
    labels = np.delete(labels, np.s_[-step*2:])
    for x in range(19):
        assert len(data) + len(sample_data) == total
        assert len(labels) + len(sample_labels) == total
        
        # train using data
        alphas, predictors = adaboost.train(
            stump.random, stump.predict,
            data=sample_data, labels=sample_labels, iters=1000
        )
        
        # calculate the error
        train_error = adaboost.test(alphas, predictors, data=sample_data, labels=sample_labels)
        test_error = adaboost.test(alphas, predictors, data=data, labels=labels)
        sample = (x + 2) * 0.025
        msg = "sample=%.3f, train_err=%.6f, test_err=%.6f"
        print msg % (sample, train_error, test_error)
        
        # pick new sample points: either at random, or (uncertainty sampling)
        # the points the current ensemble is least confident about
        if random:
            sample_data = np.append(sample_data, data[-step:], axis=0)
            sample_labels = np.append(sample_labels, labels[-step:])
            data = np.delete(data, np.s_[-step:], axis=0)
            labels = np.delete(labels, np.s_[-step:])
        else:
            margins = np.absolute(sum([
                alpha * predictor(data)
                for alpha, predictor in zip(alphas, predictors)
            ]))
            least_confident = margins.argsort()[:step]  # smallest |margin| first
            sample_data = np.append(sample_data, data[least_confident], axis=0)
            sample_labels = np.append(sample_labels, labels[least_confident])
            data = np.delete(data, least_confident, axis=0)
            labels = np.delete(labels, least_confident)
def uci(folder):
    data, labels = load_data(folder)
    for step in STEPS:
        state = np.random.get_state()
        np.random.shuffle(data)
        np.random.set_state(state)
        np.random.shuffle(labels)
        i = int(ceil(step * data.shape[DATA_AXIS]))
        alphas, predictors = adaboost.train(stump.random,
                                            stump.predict,
                                            data=data[:i],
                                            labels=labels[:i],
                                            iters=500)
        train_error = adaboost.test(alphas,
                                    predictors,
                                    data=data[:i],
                                    labels=labels[:i])
        test_error = adaboost.test(alphas,
                                   predictors,
                                   data=data[i:],
                                   labels=labels[i:])
        msg = "sample=%.2f, train_err=%.6f, test_err=%.6f"
        print msg % (step, train_error, test_error)
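# Both snippets above call a functional interface: adaboost.train takes a weak
# learner factory (stump.random) and a prediction function (stump.predict) and
# returns parallel lists (alphas, predictors), which adaboost.test and the
# margin computation then consume. The module internals are not shown in these
# examples; the sketch below is one standard way such a loop could look, with
# assumed signatures make_stump(data, labels, weights) and predict(stump, data).
import numpy as np

def train_sketch(make_stump, predict, data, labels, iters):
    labels = np.asarray(labels, dtype=float)
    n = len(labels)
    weights = np.full(n, 1.0 / n)                        # start from uniform sample weights
    alphas, predictors = [], []
    for _ in range(iters):
        weak = make_stump(data, labels, weights)         # build/choose a weak learner
        pred = np.asarray(predict(weak, data), dtype=float)  # +1/-1 predictions
        err = np.clip(weights[pred != labels].sum(), 1e-10, 1 - 1e-10)
        alpha = 0.5 * np.log((1.0 - err) / err)          # standard AdaBoost vote weight
        weights *= np.exp(-alpha * labels * pred)        # up-weight misclassified samples
        weights /= weights.sum()
        alphas.append(alpha)
        predictors.append(lambda d, w=weak: predict(w, d))
    return alphas, predictors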
            # Resolving conflict, if any. Shouldn't be the same
            if column_1 == column_2:
                if column_2 != total_num_of_cols:
                    column_2 += 1
                else:
                    column_2 -= 1
                    
            random_hyp_pairs.append((column_1,column_2))

        # Running train for the different orientations...
        # Each run will return a dict of dicts with key as particular orientation
        # Pipeline the dicts returned from one training stage to another till we finish.
        # Output will be a dict of dict with 4 keys (0, 90, 180, 270)
        
        print("Max iterations set to: ", max_iterations)
        alphas_for_0 = adaboost.train(data, 0, random_hyp_pairs, max_iterations, hyp_alphas)
        alphas_for_90 = adaboost.train(data, 90, random_hyp_pairs, max_iterations, alphas_for_0)
        alphas_for_180 = adaboost.train(data, 180, random_hyp_pairs, max_iterations, alphas_for_90)
        alphas_for_270 = adaboost.train(data, 270, random_hyp_pairs, max_iterations, alphas_for_180)
        
        # Storing model params in a pickle to retain dictionary structure
        pickle.dump(alphas_for_270, f, protocol=pickle.HIGHEST_PROTOCOL)
        end = datetime.datetime.now()

        print("Finished training in", end-start)
        # Training ends....
        f.close()
        
    #----------------------------------#
    #            TESTING               #
    #----------------------------------#
import numpy as np
import adaboost

def createDataSet(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()

    dataSet, labels = [], []
    for line in lines:
        data = line.split()
        dataSet.append(list(map(float, data[:-1])))
        labels.append(int(float(data[-1])))

    return np.array(dataSet), np.array(labels)

trainingSet, trainingLabels = createDataSet('data/horse/training.txt')
stumps = adaboost.train(trainingSet, trainingLabels)

testSet, testLabels = createDataSet('data/horse/test.txt')
res = adaboost.classify(testSet, stumps)
a, b = np.sum(res == testLabels), len(testLabels)
print('Correctness: %d/%d = %.2f%%' % (a, b, a/b * 100))
Example #8
    return mat


train_face_features = features_to_mat(features, train_face_integral_images)
train_face_labels = [1.] * len(train_face_features)

train_non_face_features = features_to_mat(features,
                                          train_non_face_integral_images)
train_non_face_labels = [-1.] * len(train_non_face_features)

test_face_features = features_to_mat(features, test_face_integral_images)
test_non_face_features = features_to_mat(features,
                                         test_non_face_integral_images)

adaboost_classifiers = adaboost.train(
    train_face_features + train_non_face_features,
    train_face_labels + train_non_face_labels, 10)


def output_round(features, ada_classifiers, round):
    best_feature_idx = ada_classifiers[round - 1]['index']
    best_feature = features[best_feature_idx]

    print("\nAdaboost rounds: %d" % round)

    # print image with top feature rectangle
    if best_feature.type == FeatureType.TWO_HORIZONTAL:
        printed_img = draw_feature_2h(open_face(test_face_images[0]),
                                      best_feature)
        print("\nType: TWO_HORIZONTAL")
    elif best_feature.type == FeatureType.TWO_VERTICAL:
    with open(join(mypath, file), "r") as f:
        templist = []
        for line in f:
            for word in line.split():
                if word not in templist and word.isnumeric():  #len(word) > 1
                    templist.append(word)
    templist.append("spmsg" not in file)
    development_mails.append(templist)

print("Training bayes and adaboost...")
for i in range(len(generalHyperParameters)):
    print(i)
    correctBayes = 0
    correctAdaboost = 0
    basic_classifiers_with_weights = adaboost.train(
        trainMailsList, trainHam, trainSpam, sortedIGs,
        generalHyperParameters[i])  #maybe allMailsList as well
    bayes_probabilities = naiveBayes.train(
        sortedIGs, allWords, trainHam, trainSpam,
        generalHyperParameters[i])  #parameters for naive_bayes
    for incoming in development_mails:
        if adaboost.predict(basic_classifiers_with_weights, sortedIGs,
                            incoming,
                            generalHyperParameters[i]) == incoming[-1]:
            correctAdaboost += 1
        if naiveBayes.predict(bayes_probabilities, incoming, sortedIGs,
                              trainHam / (trainHam + trainSpam),
                              1 - trainHam / (trainHam + trainSpam),
                              generalHyperParameters[i]) == incoming[-1]:
            correctBayes += 1
    if correctBayes > maxBayes:
if __name__ == "__main__":
    writer = csv.writer(open("F12.csv", "w", encoding="utf-8", newline=""))
    traindataSet, valdataSet = loadDataSet('newTrain2.csv')
    testdataSet = loadTestSet('newTest2.csv')
    """
    traindataSet = [[1.,2.1, 1.0],
                         [2.,1.1, 1.0],
                         [1.3,1., -1.0],
                         [1.,1., -1.0],
                         [2.,1., 1.0]
    ]
    """
    trainLabels = [row[-1] for row in traindataSet]
    valLabels = [row[-1] for row in valdataSet]
    valdataSet = [row[:-1] for row in valdataSet]
    for depth in range(10, 11):
        for numOfClassifier in range(16, 21):
            print("\nnumofClassifier: " + str(numOfClassifier) + "  depth: " +
                  str(depth))
            weekCartClassList, trainResult = adaboost.train(
                traindataSet, numOfClassifier, depth)
            trainF1 = printResult(trainResult, trainLabels)

            valResult = adaboost.predict(valdataSet, weekCartClassList)
            valF1 = printResult(valResult, valLabels)

            # testResult = adaboost.predict(testdataSet, weekCartClassList)
            # outputResult(testResult, numOfClassifier, depth, trainF1, 0)
            writer.writerow([trainF1, valF1])
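# printResult above returns an F1 score, but its implementation is not shown
# in this snippet. For reference, a minimal F1 computation over +1/-1 labels,
# treating +1 as the positive class (f1_score_sketch is an illustrative
# helper, not part of the example's code):
def f1_score_sketch(predicted, actual, positive=1.0):
    tp = sum(1 for p, a in zip(predicted, actual) if p == positive and a == positive)
    fp = sum(1 for p, a in zip(predicted, actual) if p == positive and a != positive)
    fn = sum(1 for p, a in zip(predicted, actual) if p != positive and a == positive)
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)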
Example #11
from metrics import acc

import numpy as np

cross_validate = True

if __name__ == "__main__":
    # First obtain our training and testing data
    Xt, Yt, Xv = load_validation_data()

    if cross_validate:
        # for cross-validation
        Xt1, Xt2, Yt1, Yt2 = shuffle_split(Xt, Yt)

        classifiers = [
            adaboost.train(Xt1, Yt1),
            extra_randomized_trees.train(Xt1, Yt1),
            gradient_boost.train(Xt1, Yt1),
            random_forest.train(Xt1, Yt1),
            logistic_regression.train(Xt1, Yt1),
        ]

        # Train another classifier on the ensemble's output training predictions
        # for each sample in the training data
        training_predictions = np.mat(
            [[c.predict(sample)[0] for c in classifiers] for sample in Xt1])

        meta_classifier = logistic_regression.train(training_predictions, Yt1)

        # Check results on training data
        print "Accuracy for individual classifiers:", [
def train(data, labels):
    learner = lambda d, l: adaboost.train(
        stump.random, stump.predict, data=d, labels=l, iters=5000)
    return ecoc.train(data, labels, 50, learner, adaboost.predict)
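# ecoc.train above receives a code length (50), a binary learner and a binary
# predictor, which matches the usual error-correcting-output-codes recipe:
# give each class a random bit string, train one binary classifier per bit,
# and decode a sample to the class whose codeword is closest in Hamming
# distance. The sketch below only illustrates that recipe under assumed
# signatures; it is not the actual ecoc module.
import numpy as np

def ecoc_train_sketch(data, labels, code_len, learner):
    classes = np.unique(labels)
    # assign each class a random +1/-1 codeword of length code_len
    codes = {c: np.random.choice([-1.0, 1.0], size=code_len) for c in classes}
    models = []
    for bit in range(code_len):
        # relabel every sample with its class's bit for this column
        bit_labels = np.array([codes[y][bit] for y in labels])
        models.append(learner(data, bit_labels))
    return codes, models

def ecoc_classify_sketch(codes, models, predictor, sample):
    # predicted codeword for one sample, one bit per binary model
    word = np.array([float(np.sign(predictor(m, [sample])[0])) for m in models])
    # decode to the class whose codeword is closest in Hamming distance
    return min(codes, key=lambda c: int(np.sum(codes[c] != word)))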
import cascade

train_x, train_y, test_x, test_y = data_loader.load()

train_f, i_f = feature.get_features(train_x)
test_selection_index = np.concatenate(
    [range(472),
     np.random.choice(19572, 2000, replace=False) + 472])
test_x = test_x[test_selection_index]
test_y = test_y[test_selection_index]

try:
    with open("classifier.pkl", "rb") as f:
        classifier = pickle.load(f)
except:
    classifier = adaboost.train(train_f, train_y, 10)
    with open("classifier.pkl", "wb") as f:
        pickle.dump(classifier, f, protocol=pickle.HIGHEST_PROTOCOL)

# 0.99, 0.4, 0.01
try:
    with open("cascade.pkl", "rb") as f:
        cascade_classifier = pickle.load(f)
except:
    cascade_classifier = cascade.train_cascade(train_f, train_y, 0.99, 0.4,
                                               0.01)
    with open("cascade.pkl", "wb") as f:
        pickle.dump(cascade_classifier, f, protocol=pickle.HIGHEST_PROTOCOL)

test_f, i_f = feature.get_features(test_x)
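# The three constants passed to cascade.train_cascade (0.99, 0.4, 0.01) look
# like the usual Viola-Jones stage parameters: a minimum per-stage detection
# rate, a maximum per-stage false-positive rate, and an overall false-positive
# target. Their exact meaning inside the cascade module is not shown in this
# snippet; the loop below is only a schematic of that standard recipe, with
# the stage-level work injected as hypothetical callables.
def train_cascade_sketch(features, labels, stage_det, stage_fp, target_fp,
                         train_stage, evaluate, filter_negatives):
    """Schematic attentional cascade: keep adding boosted stages until the
    overall false-positive rate drops below the target."""
    stages, overall_fp = [], 1.0
    while overall_fp > target_fp:
        stage = train_stage(features, labels, stage_det, stage_fp)  # e.g. a boosted stump ensemble
        stages.append(stage)
        det_rate, fp_rate = evaluate(stages, features, labels)
        # a full implementation would also lower the stage threshold here
        # until det_rate >= stage_det; omitted in this schematic
        overall_fp *= fp_rate
        # drop negatives the cascade already rejects so later stages
        # concentrate on the hard negatives that still pass
        features, labels = filter_negatives(stages, features, labels)
    return stages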
Example #14
# Designed by Junbo Zhao
# 12/23/2013
# This is a simple demo, training and testing the adaboost classifier

from numpy import *
import data
import adaboost

# The following data files are only used to show you how this program works.
# The two files are generated by function randomData()

# It is better to use your own data to see the power of adaboost!
# Face recognition problems are good for using adaboost.

trainData,label = data.readData('train.txt','train')
#trainData,label = data.loadSimpleData()
testData = data.readData('test.txt','test')
classifier = adaboost.train(trainData,label,150)
adaboost.test(testData,classifier)

input()
from __future__ import print_function,division
import numpy as np
import data
import adaboost

dataMat,labels=data.loadSimpleData()
print("dataMat:",dataMat)
print("labels:",labels)

adaboost.train(dataMat,labels)


