Example #1
def main():
    kernel = c.COSINE
    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel

    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])

    st = time.time()

    # start training
    print('{:.2f} Start training.'.format(time.time() - st))

    for r in (0.15, 0.1):
        clf = kNN.kNN(kernel=kernel, dataset=c.DS_DIGITS)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0], r=r)
        te_pred = clf.predict(te_data[0], r=r)

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

        print('{} Final results with kernel {} and r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, r, tr_acc, te_acc))
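The kNN class used above is not shown in this listing. As a rough sketch of what a radius-based predict (the r parameter) may look like under a cosine-distance kernel, with the API and the nearest-point fallback being assumptions rather than the project's actual implementation:

import numpy as np
from scipy.spatial.distance import cdist

def predict_within_radius(tr_x, tr_y, te_x, r, metric='cosine'):
    # hypothetical sketch: majority vote over training points within distance r
    tr_y = np.asarray(tr_y)
    dists = cdist(te_x, tr_x, metric=metric)  # (n_test, n_train) pairwise distances
    pred = np.zeros(len(te_x))
    for i, row in enumerate(dists):
        neighbors = tr_y[row <= r]
        if len(neighbors) == 0:  # nothing within the radius: fall back to the nearest point
            neighbors = tr_y[[row.argmin()]]
        vals, counts = np.unique(neighbors, return_counts=True)
        pred[i] = vals[counts.argmax()]
    return pred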
Example #2
def main():
    st = time.time()
    # training parameter
    result_path = 'results/PB2_A_spam_polluted_NB_Gaussian.acc'
    model_name = 'spam_'
    train_data_path = 'data/spam_polluted/train/data.pickle'
    test_data_path = 'data/spam_polluted/test/data.pickle'

    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # start training
    print('{:.2f} Building model...'.format(time.time() - st))
    model = m.NBGaussian()
    model.build(tr_data[0], tr_data[1])

    print('{:.2f} Predicting...'.format(time.time() - st))
    tr_pred = model.predict(tr_data[0])
    te_pred = model.predict(te_data[0])

    print('{:.2f} Calculating results...'.format(time.time() - st))
    tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
    te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]


    print('{} Final results. Train acc: {}, Test acc: {}'.format(time.time() - st, tr_acc, te_acc))

    result = {}
    result['TrainingAcc'] = tr_acc
    result['TestingAcc'] = te_acc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
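m.NBGaussian is defined elsewhere in the project. For orientation, a minimal Gaussian Naive Bayes with the same build/predict shape might look like the sketch below; the class name, variance floor, and exact API are assumptions:

import numpy as np

class NBGaussianSketch:
    # hypothetical stand-in for m.NBGaussian: independent per-class Gaussians over features
    def build(self, x, y):
        self.classes = np.unique(y)
        self.prior = {c: np.mean(y == c) for c in self.classes}
        self.mu = {c: x[y == c].mean(axis=0) for c in self.classes}
        self.var = {c: x[y == c].var(axis=0) + 1e-9 for c in self.classes}  # floor avoids division by zero

    def predict(self, x):
        # pick the class with the highest log posterior
        scores = [np.log(self.prior[c])
                  - 0.5 * np.sum(np.log(2 * np.pi * self.var[c]) + (x - self.mu[c]) ** 2 / self.var[c], axis=1)
                  for c in self.classes]
        return self.classes[np.argmax(scores, axis=0)]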
Example #3
def test():

    # load and preprocess training data
    # tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)
    model = loader.load_pickle_file(model_path)
    # te_pred_dict = loader.load_pickle_file(te_pred_dict_path)

    test_pred_dict = {}
    for i in range(9):
        test_pred_dict[i] = {}
        for j in range(i + 1, 10):
            # get the pairwise classifier for classes (i, j)
            clf = model[i][j]
            te_pred = clf.predict(te_data[0])
            test_pred_dict[i][j] = te_pred


    te_n = len(te_data[1])
    te_pred = np.zeros(te_n)

    for i in range(te_n):
        votes = np.zeros((10,), dtype=int)
        for j in range(10):
            for k in range(j):
                votes[j] += 1 if test_pred_dict[k][j][i] == -1 else 0
            for kk in test_pred_dict.get(j, {}):
                votes[j] += 1 if test_pred_dict[j][kk][i] == 1 else 0
        count = np.bincount(votes)
        if count[-1] == 1:
            te_pred[i] = votes.argmax()
        else:
            # tie: break it with the pairwise classifier of the two top-voted classes
            tie_ind = [votes.argmax()]
            cc = 0
            for ind_v, v in enumerate(votes):
                if v == votes.max():
                    if cc == 1:
                        tie_ind.append(ind_v)
                        break
                    else:
                        cc += 1
            te_pred[i] = tie_ind[0] if test_pred_dict[tie_ind[0]][tie_ind[1]][i] == 1 else tie_ind[1]
            print('{} Tie! {} wins.'.format(count[-1], te_pred[i]))


    acc = 0
    for ind_l, l in enumerate(te_data[1]):
        acc += 1 if l == te_pred[ind_l] else 0

    acc /= te_n
    # acc = (te_data[1] == te_pred).sum() / te_n

    print('Acc: {}'.format(acc))
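The nested vote counting above obscures the one-vs-one rule it implements: classifier (i, j) votes for class i on +1 and for class j on -1. An equivalent, more direct tally under the same convention (a sketch, not the project's code):

def tally_votes(pred_dict, sample_idx, n_classes=10):
    # pred_dict[i][j][sample_idx] is the (i, j) pairwise prediction in {-1, +1}
    votes = np.zeros(n_classes, dtype=int)
    for i in pred_dict:
        for j in pred_dict[i]:
            if pred_dict[i][j][sample_idx] == 1:
                votes[i] += 1
            else:
                votes[j] += 1
    return votes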
Example #4
def get_cs(data_path, cs_path):
    # load the DP-computed cheat sheet (cs) if cached; otherwise compute and save it
    cs = None
    if os.path.isfile(cs_path):
        cs = loader.load_pickle_file(cs_path)
        print('CS loaded.')
    else:
        print('Start computing cs.')
        data = loader.load_pickle_file(data_path)
        cs = dp_compute_cs(data[0])
        loader.save(cs_path, cs)
        print('CS saved.')
    return cs
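dp_compute_cs is not shown here. Judging from how the cheat sheet is consumed in Example #8 (per-image rectangle features), it is presumably a summed-area table (integral image) per sample; a minimal sketch under that assumption:

import numpy as np

def dp_compute_cs_sketch(images):
    # hypothetical: cs[r][c] = sum of all pixels in the top-left (r+1) x (c+1) block
    return [np.cumsum(np.cumsum(np.asarray(img), axis=0), axis=1) for img in images]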
Example #5
def main():
    is_sklearn = False
    # kernel = c.COSINE
    # kernel = c.GAUSSIAN
    kernel = c.POLY
    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel
    model_path = 'data/PB1_B_digits_sk_Gaussian_1.model'

    # tr_data_path = 'data\\digits\\tr_f_l.pickle'
    # te_data_path = 'data\\digits\\te_f_l.pickle'
    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])


    models = []
    st = time.time()

    # start training
    print('{:.2f} Start training.'.format(time.time() - st))

    for k in (1, 3, 7):
        if not is_sklearn:
            clf = kNN.kNN(kernel=kernel)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0], k=k)
            te_pred = clf.predict(te_data[0], k=k)
        else:
            clf = KNeighborsClassifier(n_neighbors=k, metric=cosine_distances)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0])
            te_pred = clf.predict(te_data[0])

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        models.append(clf)
        print('{} Final results with kernel {} and k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, k, tr_acc, te_acc))
Example #6
File: PB6_test.py Project: Juncai/CS6140
def main():

    target = 'v2'
    # training parameter
    k = 10  # fold
    layer_thresh = 2
    T = 50
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    tr_data, te_data = Preprocess.get_i_fold(k_folds, 0)
    f_cur = [x[0] for x in tr_data[0]]

    t = dt.DecisionTree()
    if target == 'v1':
        for i in range(100):
            h_y = t.compute_entropy(tr_data[1])
            thresh = threshes[0][30]
            ig = t.compute_ig(f_cur, tr_data[1], thresh, h_y)
    else:
        h_y = t.compute_entropy_v2(tr_data[1])
        thresh = threshes[0][0]
        ig = t.compute_ig_v2(f_cur, tr_data[1], thresh, h_y)
Example #7
def main():
    st = time.time()
    # training parameter
    result_path = 'results/PB4_spam_polluted_missing_NB_Bern.acc'
    model_name = 'spam_'
    mean_path = 'data/spam_polluted_missing/train/f_mean.pickle'
    train_data_path = 'data/spam_polluted_missing/train/data.pickle'
    test_data_path = 'data/spam_polluted_missing/test/data.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # load means
    means = loader.load_pickle_file(mean_path)
    print('{:.2f} Means loaded!'.format(time.time() - st))

    # start training
    roc = []
    auc = 0.0

    tr_n, f_d = np.shape(tr_data[0])
    te_n, = np.shape(te_data[1])
    te_auc = 2.
    round = 0
    model = m.NBBernoulli(means)
    model.build(tr_data[0], tr_data[1])

    training_acc = model.test(tr_data[0], tr_data[1], util.acc)
    # training_cms.append(training_test_res[1])
    testing_acc = model.test(te_data[0], te_data[1], util.acc)
    # testing_cms.append(testing_test_res[1])


    print('Final results. Train acc: {}, Test acc: {}'.format(training_acc, testing_acc))

    result = {}
    result['TrainingAcc'] = training_acc
    result['TestingAcc'] = testing_acc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
Example #8
def abstract_features(data_path, cs_path, rects_path, res_path):
    # get cs
    cs = get_cs(data_path, cs_path)
    rects = loader.load_pickle_file(rects_path)

    # 2 features for each rectangle
    features = []
    for i, ccs in enumerate(cs):
        f = []
        for rect in rects:
            f.extend(compute_feature_with_cs(rect, ccs))
        features.append(f)
        print('{} images finished.'.format(i))

    # combine with labels
    label = loader.load_pickle_file(data_path)[1]
    f_l = [np.array(features), label]
    loader.save(res_path, f_l)

    return f_l
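compute_feature_with_cs is also external to this snippet. With a summed-area table, any rectangle sum costs four lookups, and the "2 features for each rectangle" could then be, say, left/right and top/bottom half differences. A sketch of the four-corner lookup (the ((r0, c0), (r1, c1)) rectangle layout is an assumption):

def rect_sum(cs, r0, c0, r1, c1):
    # inclusive rectangle sum from a summed-area table via the four-corner identity
    total = cs[r1][c1]
    if r0 > 0:
        total -= cs[r0 - 1][c1]
    if c0 > 0:
        total -= cs[r1][c0 - 1]
    if r0 > 0 and c0 > 0:
        total += cs[r0 - 1][c0 - 1]
    return total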
Example #9
def ecoc_test():
    svms = loader.load_pickle_file(model_path)
    te_data = loader.load_pickle_file(te_data_path)
    pred = []

    for f in te_data[0]:
        min_hamming_dist = 1.
        match_label = 0
        code = []
        for s in svms:
            c_pred = s.predict([f])[0]
            code.append(1 if c_pred == 1 else 0)  # replace -1 with 0
        for ind, c in enumerate(ecoc):
            cur_hd = hamming(c, code)
            if cur_hd < min_hamming_dist:
                min_hamming_dist = cur_hd
                match_label = ind
        pred.append(match_label)

    return (pred == te_data[1]).sum() / len(te_data[1])
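Note that scipy.spatial.distance.hamming returns the fraction of mismatched positions, a value in [0, 1], which is why min_hamming_dist starts at 1.:

from scipy.spatial.distance import hamming
hamming([1, 0, 1, 1], [1, 1, 1, 0])  # 0.5: two of the four positions differ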
Example #10
def random_select_data(tr_save_path, sel_tr_save_path, percent):
    all_tr = loader.load_pickle_file(tr_save_path)

    tr_l_ind_dict = {}
    selected_tr_data = [[], []]
    for i in range(10):
        tr_l_ind_dict[i] = [l_ind for l_ind, l in enumerate(all_tr[1]) if l == i]
    for i in range(10):
        i_n = len(tr_l_ind_dict[i])
        pick_n = int(percent * i_n)
        cur_pick_ind = np.random.choice(tr_l_ind_dict[i], pick_n, replace=False).tolist()
        selected_tr_data[0].extend([x for x_ind, x in enumerate(all_tr[0]) if x_ind in cur_pick_ind])
        selected_tr_data[1].extend([y for y_ind, y in enumerate(all_tr[1]) if y_ind in cur_pick_ind])
    loader.save(sel_tr_save_path, selected_tr_data)
Example #11
def ecoc():

    # training parameter
    c = 0.001
    tol = 0.01
    epsilon = 0.001
    # kernel = 'rbf'
    kernel = 'linear'

    # load and preprocess training data
    print('Loading data...')
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # randomly generate ECOC of 50 functions
    num_ecoc = 10
    class_num = 10
    best_ecoc = util.get_ecoc(ecoc_path, num_ecoc, class_num)

    # train 10 svm
    print('Begin training...')
    svms = []  # list of svm classifiers
    function_tr_err = []
    sst = time.time()
    for ind, c_ecoc in enumerate(best_ecoc[1]):
        st = time.time()
        # prepare label
        c_label = [-1 if c_ecoc[l] == 0 else 1 for l in tr_data[1]]
        clf = svm.SVM(C=c, tol=tol, epsilon=epsilon, kernel=kernel)
        clf.fit(tr_data[0], c_label)
        tr_pred = clf.predict(tr_data[0])
        tr_acc = (np.array(c_label) == tr_pred).sum() / tr_data[0].shape[0]
        print('{} Function {} done. Final results. Train acc: {}'.format(time.time() - st, ind, tr_acc))
        svms.append(clf)

    print('{} Training finished.'.format(time.time() - sst))
    loader.save(model_path, svms)
Example #12
def main():
    # training parameter
    result_path = 'results/PB1_B_digits.acc'
    model_name = 'digits_'
    threshes_path = 'data/spambase.threshes'
    tr_data_path = 'data\\digits\\tr_f_l_10r.pickle'
    te_data_path = 'data\\digits\\te_f_l_10r.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    # kernel = 'poly'
    kernel = 'linear'
    tol = 0.01
    c = 0.01

    st = time.time()

    # start training
    print('{} Start training. Kernel: {}'.format(time.time() - st, kernel))
    # clf = svm.SVC(kernel='poly')
    clf = svm.SVC(C=c, kernel=kernel, tol=tol)
    # clf = svm.NuSVC(kernel=kernel)
    clf.fit(tr_data[0], tr_data[1])
    tr_pred = clf.predict(tr_data[0])
    te_pred = clf.predict(te_data[0])

    tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
    te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

    print('{} Final results. Train acc: {}, Test acc: {}'.format(time.time() - st, tr_acc, te_acc))
Example #13
File: PB5_RELIEF.py Project: Juncai/CS6140
def main():
    # training parameter
    is_sklearn = True
    k = 10  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    # Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # training_data[0] = preprocessing.scale(training_data[0])


    # start training
    training_errs = []
    testing_errs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in (0,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        kernel = c.EUCLIDEAN
        # kernel = c.GAUSSIAN
        f_select = True
        best_features_num = 5
        clf = kNN.kNN(kernel=kernel)
        clf.fit(tr_data[0], tr_data[1], f_select=f_select, best_f=best_features_num)
        print("Best features: {}".format(clf.best_f_indices))
        for kk in (1, 2, 3, 7):
            tr_pred = clf.predict(tr_data[0], k=kk)
            te_pred = clf.predict(te_data[0], k=kk)

            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

            print('{} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, kk, tr_acc, te_acc))
Example #14
File: PB2_A_spam.py Project: Juncai/CS6140
def main():
    # training parameter
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # Preprocess.normalize_features_all(Preprocess.shifiat_and_scale, training_data[0])


    # start training
    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()
    for i in (1,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        for r in (2.5, 2.7):
            clf = kNN.kNN(kernel=kernel)
            # clf.fit(training_data[0], training_data[1])
            clf.fit(tr_data[0], tr_data[1])
            # tr_pred = clf.predict(training_data[0], r=r)
            tr_pred = clf.predict(tr_data[0], r=r)
            te_pred = clf.predict(te_data[0], r=r)

            # tr_acc = (training_data[1] == tr_pred).sum() / training_data[0].shape[0]
            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

            testing_accs.append(te_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, i, kernel, r, tr_acc, te_acc))
Example #15
def main():
    # training parameter
    k = 10  # fold
    result_path = "results/PB1_A_spam.acc"
    model_name = "spam_" + str(k) + "fold"
    threshes_path = "data/spambase.threshes"
    data_path = "data/spam/data.pickle"
    # kernel = 'poly'
    kernel = "linear"
    # kernel = 'rbf'
    verbose = False
    tol = 0.01
    c = 0.1

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)

    # normalize
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    print("Preparing k fold data.")
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print("{:3f} Start training. Kernel: {}".format(time.time() - st, kernel))

        clf = svm.SVC(C=c, kernel=kernel, tol=tol, verbose=verbose)
        # clf = svm.NuSVC(kernel=kernel, tol=tol, verbose=verbose)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0])
        te_pred = clf.predict(te_data[0])

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

        print("{:3f} Final results. Train acc: {}, Test acc: {}".format(time.time() - st, tr_acc, te_acc))
Example #16
File: Utilities.py Project: Juncai/CS6140
def get_ecoc(ecoc_path, num_ecoc, class_num):
    if path.isfile(ecoc_path):
        print('Loading the ecoc...')
        best_ecoc = loader.load_pickle_file(ecoc_path)
    else:
        print('Creating the ecoc...')
        best_ecoc = [0, [], []]     # distance, ecoc for training, ecoc for predicting
        for i in range(100):
            n = int(math.pow(2, num_ecoc))
            codes = choice(n, class_num)
            ecoc_func_codes = []
            for i in range(num_ecoc):
                ecoc_func_codes.append([])
            c_ecoc = []
            for c in codes:
                bin_s = ('{0:0' + str(num_ecoc) + 'b}').format(c)
                bin_s = [int(ss) for ss in bin_s]
                c_ecoc.append(bin_s)
                for i in range(num_ecoc):
                    ecoc_func_codes[i].append(bin_s[i])
            c_hamming_dist = 0
            has_same_code = False
            for j in range(len(c_ecoc)):
                for k in range(len(c_ecoc)):
                    if j != k:
                        c_hd = hamming(c_ecoc[j], c_ecoc[k])
                        if c_hd == 0:
                            has_same_code = True
                        c_hamming_dist += c_hd
            if has_same_code:
                continue
            if c_hamming_dist > best_ecoc[0]:
                best_ecoc[0] = c_hamming_dist
                best_ecoc[1] = ecoc_func_codes
                best_ecoc[2] = c_ecoc

        # serialize the best ecoc
        loader.save(ecoc_path, best_ecoc)
    return best_ecoc
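The zero-padded binary conversion is the fragile spot in this function; with the parentheses added above, the format spec is built first and then applied, e.g.:

num_ecoc = 10
('{0:0' + str(num_ecoc) + 'b}').format(5)  # '0000000101' (5 in binary, zero-padded to 10 digits)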
Example #17
    def test_count_black(self):
        rect = ((1, 1), (4, 5))
        cs = []
        css = loader.load_pickle_file('data/digits/')
        pass
Example #18
# NOTE: this snippet begins mid-function; the signature below is a hypothetical reconstruction
def compute_feature_means(features, save_path):
    n, d = np.shape(features)
    means = []
    for i in range(d):

        cur_f = features[:, i]
        means.append(np.nanmean(cur_f))

        # cur_mean = 0
        # for f in features:
        #     if not np.isnan(f[i]):
        #         cur_mean += f[i]
        # means.append(cur_mean / n)
    means = np.array(means)
    loader.save(save_path, means)
    return means


if __name__ == '__main__':
    # generate means for the features, missing
    path = 'data/spam_polluted_missing/train/data.pickle'
    mean_path = 'data/spam_polluted_missing/train/f_mean.pickle'
    features = loader.load_pickle_file(path)[0]
    means = np.nanmean(features, axis=0)
    loader.save(mean_path, means)

    # generate means for the features, polluted
    # path = 'data/spam_polluted/train/data.pickle'
    # mean_path = 'data/spam_polluted/train/f_mean.pickle'
    # features = loader.load_pickle_file(path)[0]
    # means = np.nanmean(features, axis=0)
    # loader.save(mean_path, means)
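np.nanmean averages each column while skipping NaNs, which is exactly what the commented-out manual loop above was doing:

import numpy as np
f = np.array([[1.0, np.nan], [3.0, 4.0]])
np.nanmean(f, axis=0)  # array([2., 4.]) -- the NaN in column 1 is ignored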
Example #19
import os.path as path
import copy
import time

# training parameter
result_path = 'results/8newsgroupECOC_3.acc'
model_name = '8newsgroupECOC_cs'
model_path = 'results/8newsgroup/' + model_name + '.model'
model2_path = 'results/8newsgroup/8newsgroupECOC_cs_2.model'
threshes_path = 'data/8newsgroup/8newsgroup.thresh'
tr_data_path = 'data/8newsgroup/train.data'
te_data_path = 'data/8newsgroup/test.data'
ecoc_path = 'data/8newsgroup/ecoc_cs'

print('Loading boosts...')
boosts = loader.load_pickle_file(model_path)
boosts2 = loader.load_pickle_file(model2_path)

print('Loading the ecoc...')
best_ecoc = loader.load_pickle_file(ecoc_path)


# load and preprocess training data
tr_data = loader.load_pickle_file(tr_data_path)
te_data = loader.load_pickle_file(te_data_path)

Example #20
def main():
    # training parameter
    k = 10  # fold
    layer_thresh = 2
    T = 50
    result_path = 'results/spamDT_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])
        t = dt.DecisionTree()
        t.build(tr_data[0], tr_data[1], threshes, layer_thresh)
        # test the decision tree and compute training/testing acc
        training_errs.append(t.test(tr_data[0], tr_data[1], util.acc))
        testing_errs.append(t.test(te_data[0], te_data[1], util.acc))
        print('Round {} finishes, time used: {}'.format(i, time.time() - st))


    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Fold'] = k
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err

    result['ROC'] = roc
    result['AUC'] = auc



    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
Example #21
File: PB1_plot.py Project: Juncai/CS6140
# result['1stBoostTestingAUC'] = te_auc_1st_boost
# result['1stBoostTestingROC'] = te_roc_1st_boost

# DS_TYPE = 'Random'
DS_TYPE = 'Optimal'

if DS_TYPE == 'Random':
    result_path = 'results/spamRDSBoosting_final.acc.pickle'
else:
    result_path = 'results/spamODSBoosting_final.acc.pickle'

# target = 'auc'
# target = 'errs'
target = 'm_err'

result = loader.load_pickle_file(result_path)
n_round = len(result['1stBoostTestingAUC'])


if target == 'auc':
    auc = result['1stBoostTestingAUC']
    x = [i+1 for i in range(n_round)]

    plt.plot(x, auc, color='red', linestyle='solid')
    plt.title("Adaboost with " + DS_TYPE + "DecisionStump - AUC")
    plt.xlabel("Iteration Step")
    plt.ylabel("AUC")
    plt.show()

if target == 'errs':
    tr_err = result['1stBoostTrainingError']
Example #22
model_name = '8newsgroupECOC_cs_' + wl_type + 'final'
model_path = 'results/8newsgroup/' + model_name + '.model'
threshes_path = 'data/8newsgroup/8newsgroup.thresh'
threshes_path_v2 = 'data/8newsgroup/8newsgroup_f_i.thresh'
tr_data_path = 'data/8newsgroup/train.data'
te_data_path = 'data/8newsgroup/test.data'
ecoc_path = 'data/8newsgroup/ecoc_cs'

# specify weak learner
if wl_type == 'random_':
    wl = ds.RandomDecisionStump
else:
    wl = ds.DecisionStump

# load and preprocess training data
tr_data = loader.load_pickle_file(tr_data_path)
te_data = loader.load_pickle_file(te_data_path)

# load thresholds
threshes = loader.load_pickle_file(threshes_path)
# threshes_v2 = loader.load_pickle_file(threshes_path_v2)

# start training
tr_n = len(tr_data[0])
te_n = len(te_data[1])

# randomly generate ECOC of 20 functions
num_ecoc = 20
if path.isfile(ecoc_path):
    print('Loading the ecoc...')
    best_ecoc = loader.load_pickle_file(ecoc_path)
Example #23
st = time.time()
# training parameter
result_path = "results/PB3_C_spam_polluted_LoR_myRIDGE_final.acc"
model_name = "spam_"
model_path = result_path + ".model"
train_data_path = "data/spam_polluted/train/data.pickle"
test_data_path = "data/spam_polluted/test/data.pickle"

# params
lamda = 0.5
tol = 0.92
normalize_method = prep.zero_mean_unit_var
term_method = util.acc_higher_than_ridge

# load and preprocess training data
tr_data = loader.load_pickle_file(train_data_path)
te_data = loader.load_pickle_file(test_data_path)
print("{:.2f} Data loaded!".format(time.time() - st))

tr_data[0] = tr_data[0].tolist()
te_data[0] = te_data[0].tolist()

# normalize features
prep.normalize_features_all(normalize_method, tr_data[0], te_data[0])
print("{:.2f} Features normalized!".format(time.time() - st))

saved_model = loader.load_pickle_file(model_path)  # load the model
theta = saved_model.theta
is_batch = True
penalty = "l2"  # l2 for RIDGE
alpha = 0.05
Example #24
target = "vote"
k = 10  # fold
round_limit = 100
if target == "crx":
    result_path = "results/crxBoosting_cPercent_final_1.acc"
    model_name = "crx_" + str(k) + "fold"
    threshes_path = "data/crx.threshes"
    data_path = "data/crx_parsed.data"
else:
    result_path = "results/voteBoosting_cPercent_final_1.acc"
    model_name = "vote_" + str(k) + "fold"
    threshes_path = "data/vote.threshes"
    data_path = "data/vote_parsed.data"

# load and preprocess training data
training_data = loader.load_pickle_file(data_path)
print("total data points: {}".format(len(training_data[0])))
# load thresholds
threshes = loader.load_pickle_file(threshes_path)

# start training
training_errs_by_percent = {}
testing_errs_by_percent = {}
auc_by_percent = {}
roc = []
auc = 0.0
k_folds = Preprocess.prepare_k_folds(training_data, k)
percent_list = (5, 10, 15, 20, 30, 50, 80)
for i in range(k):
    tr_data_all, te_data = Preprocess.get_i_fold(k_folds, i)
Example #25
def main():
    # training parameter
    k = 10  # fold
    round_limit = 300
    result_path = 'results/PB1_A_spam_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    te_roc_1st_boost = None
    ranked_f = None
    roc = []
    auc = 0.0

    tr_data = training_data
    tr_n, f_d = np.shape(tr_data[0])
    # TODO prepare distribution
    d = util.init_distribution(len(tr_data[0]))
    # TODO compute thresholds cheat sheet
    thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
    boost = b.Boosting(d)
    training_predict = np.zeros((1, tr_n)).tolist()[0]
    round_tr_err = []
    round_te_err = []
    round_model_err = []
    round = 0
    while round < round_limit:  # and not converged:
        round += 1
        boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
        boost.update_predict(tr_data[0], training_predict)
        c_model_err = boost.model[-1].w_err
        round_model_err.append(c_model_err)
        c_f_ind = boost.model[-1].f_ind
        c_thresh = boost.model[-1].thresh
        c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
        # TODO calculate the AUC for testing results
        round_tr_err.append(c_tr_err)
        print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(round, c_f_ind, c_thresh, c_model_err, c_tr_err, 0, 0))

        training_errs.append(round_tr_err[-1])
    ranked_f = util.get_f_ranking_from_predictions(boost, threshes)


        # break      # for testing

    mean_training_err = np.mean(training_errs)

    print('Final results. Mean Train err: {}, Mean Test err: {}'.format(mean_training_err, 0))
    print('Top 10 features: ')
    print(ranked_f[:10])

    result = {}
    result['Fold'] = k
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['1stBoostTrainingError'] = tr_errs_1st_boost
    result['1stBoostTestingError'] = te_errs_1st_boost
    result['1stBoostModelError'] = round_err_1st_boost
    result['1stBoostTestingAUC'] = te_auc_1st_boost
    result['1stBoostTestingROC'] = te_roc_1st_boost
    result['rankedFeatures'] = ranked_f

    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
Example #26
import GradientBoostedTrees as g
import copy

# training parameter
layer_thresh = 2
R = 10
result_path = 'results/housingGBT_final.err'
model_name = 'housingGBT'
threshes_path = 'data/housing_train.threshes'

# load and preprocess training data
tr_data = loader.load_dataset('data/housing_train.txt')
te_data = loader.load_dataset('data/housing_test.txt')

# load thresholds
threshes = loader.load_pickle_file(threshes_path)

# start training
training_errs = []
testing_errs = []

tr_n, f_d = np.shape(tr_data[0])
round = 1
gbt = g.GradientBoostedTrees()
gbt_label = copy.deepcopy(tr_data[1])
while round <= R:
    # prepare training data
    gbt.add_tree(tr_data[0], gbt_label, threshes, layer_thresh)

    # training error is from newly added tree, testing error is from current GBT
    pred = gbt.trees[-1].batch_predict(tr_data[0])
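The snippet is cut off before the residual update. For squared loss, gradient boosting fits each new tree to the residuals of the current ensemble, so a plausible continuation (the GradientBoostedTrees API is assumed from the calls above) is:

    # hypothetical continuation: shift the regression target to the residuals
    gbt_label = [l - p for l, p in zip(gbt_label, pred)]
    round += 1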
Example #27
def main():
    # training parameter
    target = 'crx'
    # target = 'vote'
    k = 10  # fold
    round_limit = 150

    if target == 'crx':
        result_path = 'results/crxBoosting_final_1.acc'
        model_name = 'crx_' + str(k) + 'fold'
        threshes_path = 'data/crx.threshes'
        data_path = 'data/crx_parsed.data'
    else:
        result_path = 'results/voteBoosting_final.acc'
        model_name = 'vote_' + str(k) + 'fold'
        threshes_path = 'data/vote.threshes'
        data_path = 'data/vote_parsed.data'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(k):
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])
        # TODO prepare distribution
        d = util.init_distribution(len(tr_data[0]))
        # TODO compute thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes_uci(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)
        testing_predict = np.zeros((1, te_n)).tolist()[0]
        training_predict = np.zeros((1, tr_n)).tolist()[0]
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False
        tol = 1e-5
        te_auc = 2.
        round = 0
        while round < round_limit: # and not converged:
            round += 1
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # TODO calculate the AUC for testing results
            # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            # round_tr_err.append(c_tr_err)
            # round_te_err.append(c_te_err)
            # round_te_auc.append(c_te_auc)
            print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f}'.format(round, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err))
            # converged =  abs(c_te_auc - te_auc) / te_auc <= tol
            # te_auc = c_te_auc

        training_errs.append(c_tr_err)
        testing_errs.append(c_te_err)
        # if k == 0:
        #     round_err_1st_boost = round_model_err
        #     tr_errs_1st_boost = round_tr_err
        #     te_errs_1st_boost = round_te_err
            # te_auc_1st_boost = round_te_auc

        # break      # for testing


    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Fold'] = str(k)
    result['Trainingerrs'] = str(training_errs)
    result['MeanTrainingAcc'] = str(mean_training_err)
    result['Testingerrs'] = str(testing_errs)
    result['MeanTestingAcc'] = str(mean_testing_err)
    result['1stBoostTrainingError'] = str(tr_errs_1st_boost)
    result['1stBoostTestingError'] = str(te_errs_1st_boost)
    result['1stBoostModelError'] = str(round_err_1st_boost)
    result['1stBoostTestingAUC'] = str(te_auc_1st_boost)

    # result['ROC'] = str(roc)
    result['AUC'] = str(auc)



    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
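b.Boosting and ds.DecisionStump live elsewhere in the project. For orientation, the standard AdaBoost update that add_model presumably performs on the distribution d (a sketch, not the project's code):

import numpy as np

def adaboost_step(d, y, h_pred):
    # d: distribution over samples; y, h_pred: labels/predictions in {-1, +1}
    w_err = d[y != h_pred].sum()               # weighted error of the new weak learner
    alpha = 0.5 * np.log((1 - w_err) / w_err)  # weight of the weak learner in the ensemble
    d = d * np.exp(-alpha * y * h_pred)        # increase weight on mistakes, decrease on hits
    return alpha, d / d.sum()                  # renormalize to a distribution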
Example #28
def main():
    # training parameter
    round_limit = 50
    result_path = 'results/spamActive_random_final_1.acc'
    model_name = 'spam_active'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # TODO convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    # round_err_1st_boost = None
    # tr_errs_1st_boost = None
    # te_errs_1st_boost = None
    # te_auc_1st_boost = None
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, 5)
    tr_data_pool, te_data = Preprocess.get_i_fold(k_folds, 1)
    data_set = DataSet.DataSet(tr_data_pool)
    data_rates = (5, 10, 15, 20, 30, 50)
    for c in data_rates:
        tr_data = data_set.random_pick(c, False)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])
        # TODO prepare distribution
        d = util.init_distribution(len(tr_data[0]))
        # TODO compute thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)
        testing_predict = np.zeros((1, te_n)).tolist()[0]
        training_predict = np.zeros((1, tr_n)).tolist()[0]
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False
        tol = 1e-5
        te_auc = 2.
        round = 0
        while round < round_limit: # and not converged:
            round += 1
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # TODO calculate the AUC for testing results
            # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            # round_te_auc.append(c_te_auc)
            print('Data {}% Round: {} Feature: {} Threshold: {:.3f} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {}'.format(c, round, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, 0))
            # converged =  abs(c_te_auc - te_auc) / te_auc <= tol
            # te_auc = c_te_auc

        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])
        # break      # for testing


    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print('Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err

    # result['ROC'] = str(roc)
    result['AUC'] = auc



    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
Example #29
def main():
    st = time.time()
    # training parameter
    round_limit = 15
    result_path = 'results/PB1_B_spam_2.acc'
    model_name = 'spam_'
    model_path = result_path + '.model'
    threshes_path = 'data/spambase_polluted.threshes'
    train_data_path = 'data/spam_polluted/train/data.pickle'
    test_data_path = 'data/spam_polluted/test/data.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))
    # TODO convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(tr_data)
    util.replace_zero_label_with_neg_one(te_data)
    print('{:.2f} Label converted!'.format(time.time() - st))

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)
    print('{:.2f} Thresholds loaded!'.format(time.time() - st))
    # start training
    training_errs = []
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    te_roc_1st_boost = None
    ranked_f = None
    roc = []
    auc = 0.0
    thresh_cs = None

    tr_n, f_d = np.shape(tr_data[0])
    te_n, = np.shape(te_data[1])
    # TODO prepare distribution
    d = util.init_distribution(len(tr_data[0]))

    # TODO compute thresholds cheat sheet (not a solution due to huge thresh_cs table)
    # thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
    # print('{:.2f} Thresholds cheat sheet computed!'.format(time.time() - st))

    boost = b.Boosting(d)
    testing_predict = np.zeros((1, te_n)).tolist()[0]
    training_predict = np.zeros((1, tr_n)).tolist()[0]
    round_tr_err = []
    round_te_err = []
    round_model_err = []
    round_te_auc = []
    converged = False
    tol = 1e-5
    te_auc = 2.
    round = 0
    while round < round_limit:  # and not converged:
        round += 1
        boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
        boost.update_predict(tr_data[0], training_predict)
        boost.update_predict(te_data[0], testing_predict)
        c_model_err = boost.model[-1].w_err
        round_model_err.append(c_model_err)
        c_f_ind = boost.model[-1].f_ind
        c_thresh = boost.model[-1].thresh
        c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
        c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
        # TODO calculate the AUC for testing results
        # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
        round_tr_err.append(c_tr_err)
        round_te_err.append(c_te_err)
        # round_te_auc.append(c_te_auc)
        print('{:.2f} Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(time.time() - st, round, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, 0))
        # converged =  abs(c_te_auc - te_auc) / te_auc <= tol
        # te_auc = c_te_auc

    training_errs.append(round_tr_err[-1])
    testing_errs.append(round_te_err[-1])
    # TODO get feature ranking from the predictions
    ranked_f = util.get_f_ranking_from_predictions(boost, threshes)
    round_err_1st_boost = round_model_err
    tr_errs_1st_boost = round_tr_err
    te_errs_1st_boost = round_te_err
    # te_auc_1st_boost = round_te_auc

    # _, te_roc_1st_boost = util.get_auc_from_predict(testing_predict, te_data[1], True)

        # break      # for testing

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print('Final results. Mean Train err: {}, Mean Test err: {}'.format(mean_training_err, mean_testing_err))
    print('Top 10 features: ')
    # print(ranked_f[:10])

    result = {}
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err
    result['1stBoostTrainingError'] = tr_errs_1st_boost
    result['1stBoostTestingError'] = te_errs_1st_boost
    result['1stBoostModelError'] = round_err_1st_boost
    result['1stBoostTestingAUC'] = te_auc_1st_boost
    result['1stBoostTestingROC'] = te_roc_1st_boost
    result['rankedFeatures'] = ranked_f

    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # store the model
    loader.save(model_path, boost)
    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
Example #30
st = time.time()
# training parameter
result_path = 'results/PB3_B_spam_polluted_LoR_RIDGE_sklearn.acc'
model_name = 'spam_'
train_data_path = 'data/spam_polluted/train/data.pickle'
test_data_path = 'data/spam_polluted/test/data.pickle'

# params
lamda = 0.0001
tol = 0.85
normalize_method = prep.zero_mean_unit_var
term_method = util.acc_higher_than

# load and preprocess training data
tr_data = loader.load_pickle_file(train_data_path)
te_data = loader.load_pickle_file(test_data_path)
print('{:.2f} Data loaded!'.format(time.time() - st))

tr_data[0] = tr_data[0].tolist()
te_data[0] = te_data[0].tolist()

# normalize features
prep.normalize_features_all(normalize_method, tr_data[0], te_data[0])
print('{:.2f} Features normalized!'.format(time.time() - st))


# using sklearn
parameters = {'C' : [0.05, 0.04, 0.1, 0.2, 0.3], 'penalty' : ('l2',), 'tol' : (0.06,)}
model = LogisticRegression(C=0.05, penalty='l1', tol=0.08)
clf = grid_search.GridSearchCV(model, parameters)