def main():
    """Evaluate the radius-based kNN classifier on the digits data.

    Loads the pickled train/test ``[features, labels]`` pairs, keeps the
    first column of each label array, normalises the features with
    ``Preprocess.zero_mean_unit_var`` and, for every window radius ``r``,
    fits a ``kNN.kNN`` model with the cosine kernel and prints the train and
    test accuracy.
    """
    kernel = c.COSINE

    # Forward slashes are accepted by Windows as well as POSIX systems; the
    # previous backslash-separated literals only resolved on Windows.
    tr_data_path = 'data/digits/tr_f_l_10.pickle'
    te_data_path = 'data/digits/te_f_l_10.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose the labels and keep the first row (i.e. the first column)
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])

    # start training
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for r in (0.15, 0.1):
        clf = kNN.kNN(kernel=kernel, dataset=c.DS_DIGITS)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0], r=r)
        te_pred = clf.predict(te_data[0], r=r)
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        print('{} Final results with kernel {} and r={}. Train acc: {}, Test acc: {}'.format(
            time.time() - st, kernel, r, tr_acc, te_acc))
def main():
    """Train and score the Gaussian Naive Bayes model on the polluted spam set.

    Loads the pickled train/test ``[features, labels]`` pairs, builds an
    ``m.NBGaussian`` model on the training split, prints train/test accuracy
    and logs both numbers through ``util.write_result_to_file``.
    """
    st = time.time()

    def elapsed():
        # seconds since the script started
        return time.time() - st

    def accuracy(split, predicted):
        # matching predictions divided by the number of feature rows
        return (split[1] == predicted).sum() / split[0].shape[0]

    # training parameter
    result_path = 'results/PB2_A_spam_polluted_NB_Gaussian.acc'
    model_name = 'spam_'
    train_data_path = 'data/spam_polluted/train/data.pickle'
    test_data_path = 'data/spam_polluted/test/data.pickle'

    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(elapsed()))

    # start training
    print('{:.2f} Building model...'.format(elapsed()))
    model = m.NBGaussian()
    model.build(tr_data[0], tr_data[1])

    print('{:.2f} Predicting...'.format(elapsed()))
    tr_pred = model.predict(tr_data[0])
    te_pred = model.predict(te_data[0])

    print('{:.2f} Calculating results...'.format(elapsed()))
    tr_acc = accuracy(tr_data, tr_pred)
    te_acc = accuracy(te_data, te_pred)
    print('{} Final results. Train acc: {}, Test acc: {}'.format(elapsed(), tr_acc, te_acc))

    # log the training result to file
    result = {'TrainingAcc': tr_acc, 'TestingAcc': te_acc}
    util.write_result_to_file(result_path, model_name, result, True)
def test():
    """Score the saved one-vs-one digit classifiers by majority vote.

    Loads the test data and the pairwise models from the module-level
    ``te_data_path`` / ``model_path``. ``model[a][b]`` (``a < b``) is asked
    to predict every test sample; a prediction of ``1`` is counted as a vote
    for class ``a`` and ``-1`` as a vote for class ``b``. The class with the
    most votes wins; when several classes share the maximum, the classifier
    trained on the two lowest-indexed leaders decides. Prints each tie and
    the final accuracy.
    """
    n_classes = 10

    # load the test data and the trained pairwise models
    te_data = loader.load_pickle_file(te_data_path)
    model = loader.load_pickle_file(model_path)
    features, labels = te_data[0], te_data[1]

    # pair_preds[a][b] holds the predictions of the (a, b) classifier
    pair_preds = {}
    for a in range(n_classes - 1):
        pair_preds[a] = {}
        for b in range(a + 1, n_classes):
            pair_preds[a][b] = model[a][b].predict(features)

    te_n = len(labels)
    te_pred = np.zeros(te_n)
    for i in range(te_n):
        # ``np.int`` was removed from NumPy; the builtin ``int`` is equivalent.
        votes = np.zeros(n_classes, dtype=int)
        # Walk every pair once so that the last class also collects its
        # votes (the previous loop stopped at class 8 and never credited 9).
        for a, row in pair_preds.items():
            for b, preds in row.items():
                if preds[i] == 1:
                    votes[a] += 1
                elif preds[i] == -1:
                    votes[b] += 1

        leaders = np.flatnonzero(votes == votes.max())
        if len(leaders) == 1:
            te_pred[i] = leaders[0]
        else:
            # break the tie with the classifier of the first two leaders
            first, second = int(leaders[0]), int(leaders[1])
            te_pred[i] = first if pair_preds[first][second][i] == 1 else second
            print('{} Tie! {} wins.'.format(len(leaders), te_pred[i]))

    correct = sum(1 for ind, label in enumerate(labels) if label == te_pred[ind])
    acc = correct / te_n
    print('Acc: {}'.format(acc))
def get_cs(data_path, cs_path):
    """Return the cheat sheet, loading it from ``cs_path`` when it exists.

    If no file exists at ``cs_path`` the cheat sheet is computed with
    ``dp_compute_cs`` from the first element of the pickle at ``data_path``
    and saved to ``cs_path`` before being returned.
    """
    if os.path.isfile(cs_path):
        cached = loader.load_pickle_file(cs_path)
        print('CS loaded.')
        return cached

    # nothing cached yet: compute it and persist it for the next run
    print('Start compute cs.')
    source = loader.load_pickle_file(data_path)
    fresh = dp_compute_cs(source[0])
    loader.save(cs_path, fresh)
    print('CS saved.')
    return fresh
def main():
    """Evaluate k-nearest-neighbour classification on the digits data.

    Loads the pickled train/test ``[features, labels]`` pairs, keeps the
    first column of each label array, normalises the features with
    ``Preprocess.zero_mean_unit_var`` and, for each ``k`` in ``(1, 3, 7)``,
    fits either the project's ``kNN.kNN`` (default) or scikit-learn's
    ``KNeighborsClassifier`` and prints the train and test accuracy.
    """
    is_sklearn = False
    # kernel = c.COSINE
    # kernel = c.GAUSSIAN
    kernel = c.POLY

    # Forward slashes are accepted by Windows as well as POSIX systems; the
    # previous backslash-separated literals only resolved on Windows.
    # tr_data_path = 'data/digits/tr_f_l.pickle'
    # te_data_path = 'data/digits/te_f_l.pickle'
    tr_data_path = 'data/digits/tr_f_l_10.pickle'
    te_data_path = 'data/digits/te_f_l_10.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose the labels and keep the first row (i.e. the first column)
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])

    # start training
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for k in (1, 3, 7):
        if not is_sklearn:
            clf = kNN.kNN(kernel=kernel)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0], k=k)
            te_pred = clf.predict(te_data[0], k=k)
        else:
            # NOTE(review): scikit-learn documents callable metrics as taking
            # two 1-D vectors, while cosine_distances expects 2-D inputs;
            # confirm this works before enabling the branch.
            clf = KNeighborsClassifier(n_neighbors=k, metric=cosine_distances)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0])
            te_pred = clf.predict(te_data[0])
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        print('{} Final results with kernel {} and k={}. Train acc: {}, Test acc: {}'.format(
            time.time() - st, kernel, k, tr_acc, te_acc))
def main():
    """Run one information-gain computation path of the decision tree.

    Uses fold 0 of a 10-fold split of the spambase data and the first
    feature column only. With ``target == 'v1'`` the original entropy /
    information-gain routines are invoked 100 times; otherwise the ``_v2``
    routines are invoked once. Nothing is printed or returned.
    """
    target = 'v2'

    # training parameter
    k = 10  # fold
    layer_thresh = 2
    T = 50
    threshes_path = 'data/spambase.threshes'

    # load the data set and the precomputed thresholds
    training_data = loader.load_dataset('data/spambase.data')
    threshes = loader.load_pickle_file(threshes_path)

    # only the first fold is used
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    tr_data, te_data = Preprocess.get_i_fold(k_folds, 0)
    labels = tr_data[1]
    f_cur = [row[0] for row in tr_data[0]]

    t = dt.DecisionTree()
    if target != 'v1':
        h_y = t.compute_entropy_v2(labels)
        thresh = threshes[0][0]
        ig = t.compute_ig_v2(f_cur, labels, thresh, h_y)
    else:
        for _ in range(100):
            h_y = t.compute_entropy(labels)
            thresh = threshes[0][30]
            ig = t.compute_ig(f_cur, labels, thresh, h_y)
def main():
    """Train and score the Bernoulli Naive Bayes model on spam data with gaps.

    Loads the pickled train/test ``[features, labels]`` pairs and the
    per-feature means, builds an ``m.NBBernoulli`` model from the means and
    the training split, prints train/test accuracy and logs both numbers
    through ``util.write_result_to_file``.
    """
    st = time.time()

    # training parameter
    result_path = 'results/PB4_spam_polluted_missing_NB_Bern.acc'
    model_name = 'spam_'
    mean_path = 'data/spam_polluted_missing/train/f_mean.pickle'
    train_data_path = 'data/spam_polluted_missing/train/data.pickle'
    test_data_path = 'data/spam_polluted_missing/test/data.pickle'

    # load training and testing data
    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # load means
    means = loader.load_pickle_file(mean_path)
    print('{:.2f} Means loaded!'.format(time.time() - st))

    # bookkeeping kept from the original script (not used further down);
    # the unpacking raises if the features are not 2-D or the labels not 1-D
    roc = []
    auc = 0.0
    tr_n, f_d = np.shape(tr_data[0])
    te_n, = np.shape(te_data[1])
    te_auc = 2.

    # start training
    model = m.NBBernoulli(means)
    model.build(tr_data[0], tr_data[1])
    training_acc = model.test(tr_data[0], tr_data[1], util.acc)
    testing_acc = model.test(te_data[0], te_data[1], util.acc)
    print('Final results. Train acc: {}, Test acc: {}'.format(training_acc, testing_acc))

    # log the training result to file
    result = {'TrainingAcc': training_acc, 'TestingAcc': testing_acc}
    util.write_result_to_file(result_path, model_name, result, True)
def abstract_features(data_path, cs_path, rects_path, res_path):
    """Build rectangle features for every cheat-sheet entry and save them.

    For each entry returned by ``get_cs`` the outputs of
    ``compute_feature_with_cs`` over all rectangles loaded from
    ``rects_path`` are concatenated into one feature row. The rows are
    paired with the labels (second element of the pickle at ``data_path``),
    saved to ``res_path`` and returned as ``[features_array, labels]``.
    """
    cs = get_cs(data_path, cs_path)
    rects = loader.load_pickle_file(rects_path)

    # one row per cheat-sheet entry, all rectangle features side by side
    features = []
    for idx, ccs in enumerate(cs):
        row = [value
               for rect in rects
               for value in compute_feature_with_cs(rect, ccs)]
        features.append(row)
        print('{} rects finished.'.format(idx))

    # combine with labels
    label = loader.load_pickle_file(data_path)[1]
    f_l = [np.array(features), label]
    loader.save(res_path, f_l)
    return f_l
def ecoc_test():
    """Return the test accuracy of the saved ECOC classifiers.

    Loads the classifiers from the module-level ``model_path`` and the test
    data from ``te_data_path``. For each test sample every classifier
    contributes one code bit (``1`` when it predicts ``1``, otherwise ``0``);
    the sample is assigned the index of the code word in the module-level
    ``ecoc`` with the smallest ``hamming`` distance to that code. A code word
    only wins if its distance is strictly below ``1.0``; otherwise label
    ``0`` is kept.

    Returns:
        Fraction of test samples whose predicted label equals the true label.
    """
    svms = loader.load_pickle_file(model_path)
    te_data = loader.load_pickle_file(te_data_path)
    labels = te_data[1]

    pred = []
    for f in te_data[0]:
        # replace -1 with 0 so the code is comparable with the ECOC bits
        code = [1 if s.predict([f])[0] == 1 else 0 for s in svms]

        min_hamming_dist = 1.
        match_label = 0
        for ind, code_word in enumerate(ecoc):
            cur_hd = hamming(code_word, code)
            if cur_hd < min_hamming_dist:
                min_hamming_dist = cur_hd
                match_label = ind
        pred.append(match_label)

    # Count matches element by element: ``pred`` is a plain list, so the
    # previous ``(pred == labels).sum()`` failed when the labels were a list.
    correct = sum(1 for guess, truth in zip(pred, labels) if guess == truth)
    return correct / len(labels)
def random_select_data(tr_save_path, sel_tr_save_path, percent):
    """Save a random, per-class subsample of a pickled training set.

    Args:
        tr_save_path: pickle holding ``[features, labels]``.
        sel_tr_save_path: destination for the selected ``[features, labels]``.
        percent: fraction of each class (labels 0-9) to keep;
            ``int(percent * class_size)`` samples are drawn without
            replacement.

    The selected samples are stored class by class and, within a class, in
    their original order.
    """
    all_tr = loader.load_pickle_file(tr_save_path)
    features, labels = all_tr[0], all_tr[1]

    selected_tr_data = [[], []]
    for digit in range(10):
        digit_inds = [ind for ind, label in enumerate(labels) if label == digit]
        pick_n = int(percent * len(digit_inds))
        # Sorting the picks keeps the original sample order and lets us index
        # directly instead of testing list membership for every sample.
        picked = sorted(np.random.choice(digit_inds, pick_n, replace=False).tolist())
        selected_tr_data[0].extend(features[ind] for ind in picked)
        selected_tr_data[1].extend(labels[ind] for ind in picked)

    loader.save(sel_tr_save_path, selected_tr_data)
def ecoc():
    """Train one SVM per ECOC function and save the list of classifiers.

    Reads the training data from the module-level ``tr_data_path``, obtains
    the code book through ``util.get_ecoc`` and, for every code function,
    relabels the training samples to ``-1``/``1`` according to that
    function's bit for the sample's class, fits an ``svm.SVM`` and prints
    its training accuracy. The classifiers are saved to ``model_path``.
    """
    # training parameter
    c = 0.001
    tol = 0.01
    epsilon = 0.001
    # kernel = 'rbf'
    kernel = 'linear'

    # load training data
    print('Loading data...')
    tr_data = loader.load_pickle_file(tr_data_path)

    # load or randomly generate an ECOC with num_ecoc functions
    num_ecoc = 10
    class_num = 10
    best_ecoc = util.get_ecoc(ecoc_path, num_ecoc, class_num)

    # train one svm per code function
    print('Begin training...')
    svms = []  # list of svm classifiers
    sst = time.time()
    for ind, c_ecoc in enumerate(best_ecoc[1]):
        st = time.time()
        # prepare label: code bit 0 -> -1, code bit 1 -> 1
        c_label = [-1 if c_ecoc[l] == 0 else 1 for l in tr_data[1]]
        clf = svm.SVM(C=c, tol=tol, epsilon=epsilon, kernel=kernel)
        clf.fit(tr_data[0], c_label)
        # predict on the features only (previously the whole
        # [features, labels] pair was passed in)
        tr_pred = clf.predict(tr_data[0])
        tr_acc = (c_label == tr_pred).sum() / tr_data[0].shape[0]
        print('{} Function {} done. Final results. Train acc: {}'.format(time.time() - st, ind, tr_acc))
        svms.append(clf)

    print('{} Training finished.'.format(time.time() - sst))
    loader.save(model_path, svms)
def main():
    """Train an ``svm.SVC`` on the digits data and print its accuracy.

    Loads the pickled train/test ``[features, labels]`` pairs, keeps the
    first column of each label array, fits a linear-kernel ``svm.SVC`` with
    ``C=0.01`` and ``tol=0.01`` and prints the train and test accuracy.
    """
    # Forward slashes are accepted by Windows as well as POSIX systems; the
    # previous backslash-separated literals only resolved on Windows.
    tr_data_path = 'data/digits/tr_f_l_10r.pickle'
    te_data_path = 'data/digits/te_f_l_10r.pickle'

    # load training and testing data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose the labels and keep the first row (i.e. the first column)
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    # training parameter
    # kernel = 'poly'
    kernel = 'linear'
    tol = 0.01
    c = 0.01

    # start training
    st = time.time()
    print('{} Start training. Kernel: {}'.format(time.time() - st, kernel))
    # clf = svm.SVC(kernel='poly')
    clf = svm.SVC(C=c, kernel=kernel, tol=tol)
    # clf = svm.NuSVC(kernel=kernel)
    clf.fit(tr_data[0], tr_data[1])
    tr_pred = clf.predict(tr_data[0])
    te_pred = clf.predict(te_data[0])
    tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
    te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
    print('{} Final results. Train acc: {}, Test acc: {}'.format(time.time() - st, tr_acc, te_acc))
def main():
    """Evaluate kNN with feature selection on one fold of the spam data.

    Splits the pickled spam data into 10 folds, uses fold 0 as the test
    split, fits a Euclidean-kernel ``kNN.kNN`` with feature selection
    (5 best features), prints the selected feature indices and then the
    train/test accuracy for ``k`` in ``(1, 2, 3, 7)``.
    """
    # training parameter
    is_sklearn = True
    k = 10  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load training data (label conversion and normalisation are disabled)
    training_data = loader.load_pickle_file(data_path)

    training_errs = []
    testing_errs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for fold in (0,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, fold)
        tr_x, tr_y = tr_data[0], tr_data[1]
        te_x, te_y = te_data[0], te_data[1]

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        kernel = c.EUCLIDEAN
        # kernel = c.GAUSSIAN
        clf = kNN.kNN(kernel=kernel)
        clf.fit(tr_x, tr_y, f_select=True, best_f=5)
        print("Best features: {}".format(clf.best_f_indices))

        for kk in (1, 2, 3, 7):
            tr_pred = clf.predict(tr_x, k=kk)
            te_pred = clf.predict(te_x, k=kk)
            tr_acc = (tr_y == tr_pred).sum() / tr_x.shape[0]
            te_acc = (te_y == te_pred).sum() / te_x.shape[0]
            print('{} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(
                time.time() - st, kernel, kk, tr_acc, te_acc))
def main():
    """Evaluate the radius-based kNN classifier on one fold of the spam data.

    Normalises all features with ``Preprocess.zero_mean_unit_var``, splits
    the data into 8 folds, uses fold 1 as the test split and, for each
    window radius ``r`` in ``(2.5, 2.7)``, fits a Euclidean-kernel
    ``kNN.kNN`` and prints the train and test accuracy.
    """
    # training parameter
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and normalise the data (label conversion is disabled)
    training_data = loader.load_pickle_file(data_path)
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()

    for fold in (1,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, fold)
        tr_x, tr_y = tr_data[0], tr_data[1]
        te_x, te_y = te_data[0], te_data[1]

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        for r in (2.5, 2.7):
            clf = kNN.kNN(kernel=kernel)
            clf.fit(tr_x, tr_y)
            tr_pred = clf.predict(tr_x, r=r)
            te_pred = clf.predict(te_x, r=r)
            tr_acc = (tr_y == tr_pred).sum() / tr_x.shape[0]
            te_acc = (te_y == te_pred).sum() / te_x.shape[0]
            testing_accs.append(te_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(
                time.time() - st, fold, kernel, r, tr_acc, te_acc))
def main():
    """Train an ``svm.SVC`` on the first fold of the spam data.

    Converts the zero labels with ``util.replace_zero_label_with_neg_one``,
    normalises the features with ``Preprocess.zero_mean_unit_var``, splits
    the data into 10 folds and, on fold 0 only, fits a linear-kernel
    ``svm.SVC`` (``C=0.1``, ``tol=0.01``) and prints the train and test
    accuracy.
    """
    # training parameter
    k = 10  # fold
    data_path = "data/spam/data.pickle"
    # kernel = 'poly'
    kernel = "linear"
    # kernel = 'rbf'
    verbose = False
    tol = 0.01
    c = 0.1

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)
    # normalize
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    print("Preparing k fold data.")
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        # "{:.3f}" prints three decimals; the previous "{:3f}" was a minimum
        # field width of 3 with the default six decimals.
        print("{:.3f} Start training. Kernel: {}".format(time.time() - st, kernel))
        clf = svm.SVC(C=c, kernel=kernel, tol=tol, verbose=verbose)
        # clf = svm.NuSVC(kernel=kernel, tol=tol, verbose=verbose)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0])
        te_pred = clf.predict(te_data[0])
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        print("{:.3f} Final results. Train acc: {}, Test acc: {}".format(time.time() - st, tr_acc, te_acc))
def get_ecoc(ecoc_path, num_ecoc, class_num):
    """Load the ECOC code book from ``ecoc_path`` or create and save one.

    When no file exists, 100 random code books are drawn. Each draws
    ``class_num`` integers below ``2 ** num_ecoc`` and expands them into
    ``num_ecoc``-bit code words. Books containing two identical code words
    are discarded; of the rest, the one with the largest summed pairwise
    ``hamming`` distance is kept and saved.

    Args:
        ecoc_path: pickle file used as cache for the code book.
        num_ecoc: number of code functions (bits per code word).
        class_num: number of classes (code words).

    Returns:
        ``[distance, per_function_bits, per_class_code_words]``. If every
        candidate was discarded the initial ``[0, [], []]`` is saved and
        returned.
    """
    if path.isfile(ecoc_path):
        print('Loading the ecoc...')
        return loader.load_pickle_file(ecoc_path)

    print('Creating the ecoc...')
    best_ecoc = [0, [], []]  # distance, ecoc for training, ecoc for predicting
    n = 2 ** num_ecoc  # number of distinct code words of this length
    # Zero-padded binary with exactly num_ecoc digits. The previous
    # expression applied ``.format`` to the trailing literal only and raised
    # ValueError.
    bit_spec = '0{}b'.format(num_ecoc)

    for _ in range(100):
        codes = choice(n, class_num)

        # one row of bits per class
        c_ecoc = [[int(ch) for ch in format(int(code), bit_spec)] for code in codes]
        # one row per code function: that function's bit for every class
        ecoc_func_codes = [[word[f_ind] for word in c_ecoc] for f_ind in range(num_ecoc)]

        c_hamming_dist = 0
        has_same_code = False
        # every ordered pair (j, k) is visited, so each distance counts twice
        for j in range(len(c_ecoc)):
            for k in range(len(c_ecoc)):
                if j != k:
                    c_hd = hamming(c_ecoc[j], c_ecoc[k])
                    if c_hd == 0:
                        has_same_code = True
                    c_hamming_dist += c_hd

        if has_same_code:
            continue
        if c_hamming_dist > best_ecoc[0]:
            best_ecoc[0] = c_hamming_dist
            best_ecoc[1] = ecoc_func_codes
            best_ecoc[2] = c_ecoc

    # serialize the best ecoc
    loader.save(ecoc_path, best_ecoc)
    return best_ecoc
def test_count_black(self):
    """Unfinished test stub: sets up a rectangle and loads a pickle.

    No assertion is made yet.
    """
    # NOTE(review): 'data/digits/' looks like a directory rather than a
    # pickle file; confirm the intended fixture path.
    cs = []
    rect = ((1, 1), (4, 5))
    css = loader.load_pickle_file('data/digits/')
means = [] for i in range(d): cur_f = features[:, i] means.append(np.nanmean(cur_f)) # cur_mean = 0 # for f in features: # if not np.isnan(f[i]): # cur_mean += f[i] # means.append(cur_mean / n) means = np.array(means) loader.save(save_path, means) return means if __name__ == '__main__': # generate means for the features, missing path = 'data/spam_polluted_missing/train/data.pickle' mean_path = 'data/spam_polluted_missing/train/f_mean.pickle' features = loader.load_pickle_file(path)[0] means = np.nanmean(features, axis=0) loader.save(mean_path, means) # generate means for the features, polluted # path = 'data/spam_polluted/train/data.pickle' # mean_path = 'data/spam_polluted/train/f_mean.pickle' # features = loader.load_pickle_file(path)[0] # means = np.nanmean(features, axis=0) # loader.save(mean_path, means)
import os.path as path import copy import time # training parameter result_path = 'results/8newsgroupECOC_3.acc' model_name = '8newsgroupECOC_cs' model_path = 'results/8newsgroup/' + model_name + '.model' model2_path = 'results/8newsgroup/8newsgroupECOC_cs_2.model' threshes_path = 'data/8newsgroup/8newsgroup.thresh' tr_data_path = 'data/8newsgroup/train.data' te_data_path = 'data/8newsgroup/test.data' ecoc_path = 'data/8newsgroup/ecoc_cs' print('Loading boosts...') boosts = loader.load_pickle_file(model_path) boosts2 = loader.load_pickle_file(model2_path) print('Loading the ecoc...') best_ecoc = loader.load_pickle_file(ecoc_path) # laod and preprocess training data tr_data = loader.load_pickle_file(tr_data_path) te_data= loader.load_pickle_file(te_data_path)
def main():
    """Build a decision tree on the first fold of spambase and log results.

    Splits the data into 10 folds and, for fold 0 only, builds a
    ``dt.DecisionTree`` with the precomputed thresholds, evaluates it on the
    train and test splits with ``util.acc``, prints the per-fold and mean
    values and writes them through ``util.write_result_to_file``.
    """
    # training parameter
    k = 10  # fold
    layer_thresh = 2
    T = 50
    result_path = 'results/spamDT_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'

    # load the data set and the thresholds
    training_data = loader.load_dataset('data/spambase.data')
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    roc = []
    auc = 0.0

    k_folds = Preprocess.prepare_k_folds(training_data, k)
    for fold in range(1):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, fold)
        # the unpacking raises if the features are not 2-D or labels not 1-D
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        t = dt.DecisionTree()
        t.build(tr_data[0], tr_data[1], threshes, layer_thresh)

        # evaluate the tree on both splits
        training_errs.append(t.test(tr_data[0], tr_data[1], util.acc))
        testing_errs.append(t.test(te_data[0], te_data[1], util.acc))
        print('Round {} finishes, time used: {}'.format(fold, time.time() - st))

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    for line in (str(k) + '-fold validation done. Training errs are:',
                 training_errs,
                 'Mean training err is:',
                 mean_training_err,
                 'Testing errs are:',
                 testing_errs,
                 'Mean testing err is:',
                 mean_testing_err):
        print(line)

    # log the training result to file
    result = {
        'Fold': k,
        'Trainingerrs': training_errs,
        'MeanTrainingAcc': mean_training_err,
        'Testingerrs': testing_errs,
        'MeanTestingAcc': mean_testing_err,
        'ROC': roc,
        'AUC': auc,
    }
    util.write_result_to_file(result_path, model_name, result, True)
# result['1stBoostTestingAUC'] = te_auc_1st_boost # result['1stBoostTestingROC'] = te_roc_1st_boost # DS_TYPE = 'Random' DS_TYPE = 'Optimal' if DS_TYPE == 'Random': result_path = 'results/spamRDSBoosting_final.acc.pickle' else: result_path = 'results/spamODSBoosting_final.acc.pickle' # target = 'auc' # target = 'errs' target = 'm_err' result = loader.load_pickle_file(result_path) n_round = len(result['1stBoostTestingAUC']) if target == 'auc': auc = result['1stBoostTestingAUC'] x = [i+1 for i in range(n_round)] plt.plot(x, auc, color='red', linestyle='solid') plt.title("Adaboost with " + DS_TYPE + "DecisionStump - AUC") plt.xlabel("Iteration Step") plt.ylabel("AUC") plt.show() if target == 'errs': tr_err = result['1stBoostTrainingError']
model_name = '8newsgroupECOC_cs_' + wl_type + 'final' model_path = 'results/8newsgroup/' + model_name + '.model' threshes_path = 'data/8newsgroup/8newsgroup.thresh' threshes_path_v2 = 'data/8newsgroup/8newsgroup_f_i.thresh' tr_data_path = 'data/8newsgroup/train.data' te_data_path = 'data/8newsgroup/test.data' ecoc_path = 'data/8newsgroup/ecoc_cs' # specify weak learner if wl_type == 'random_': wl = ds.RandomDecisionStump else : wl = ds.DecisionStump # laod and preprocess training data tr_data = loader.load_pickle_file(tr_data_path) te_data= loader.load_pickle_file(te_data_path) # load thresholds threshes = loader.load_pickle_file(threshes_path) # threshes_v2 = loader.load_pickle_file(threshes_path_v2) # start training tr_n = len(tr_data[0]) te_n = len(te_data[1]) # randomly generate ECOC of 20 functions num_ecoc = 20 if path.isfile(ecoc_path): print('Loading the ecoc...') best_ecoc = loader.load_pickle_file(ecoc_path)
st = time.time() # training parameter result_path = "results/PB3_C_spam_polluted_LoR_myRIDGE_final.acc" model_name = "spam_" model_path = result_path + ".model" train_data_path = "data/spam_polluted/train/data.pickle" test_data_path = "data/spam_polluted/test/data.pickle" # params lamda = 0.5 tol = 0.92 normalize_method = prep.zero_mean_unit_var term_method = util.acc_higher_than_ridge # laod and preprocess training data tr_data = loader.load_pickle_file(train_data_path) te_data = loader.load_pickle_file(test_data_path) print("{:.2f} Data loaded!".format(time.time() - st)) tr_data[0] = tr_data[0].tolist() te_data[0] = te_data[0].tolist() # normalize features prep.normalize_features_all(normalize_method, tr_data[0], te_data[0]) print("{:.2f} Features normalized!".format(time.time() - st)) saved_model = loader.load_pickle_file(model_path) # load the model theta = saved_model.theta is_batch = True penalty = "l2" # l2 for RIDGE alpha = 0.05
target = "vote" k = 10 # fold round_limit = 100 if target == "crx": result_path = "results/crxBoosting_cPercent_final_1.acc" model_name = "crx_" + str(k) + "fold" threshes_path = "data/crx.threshes" data_path = "data/crx_parsed.data" else: result_path = "results/voteBoosting_cPercent_final_1.acc" model_name = "vote_" + str(k) + "fold" threshes_path = "data/vote.threshes" data_path = "data/vote_parsed.data" # laod and preprocess training data training_data = loader.load_pickle_file(data_path) print("total data points: {}".format(len(training_data[0]))) # load thresholds threshes = loader.load_pickle_file(threshes_path) # start training training_errs_by_percent = {} testing_errs_by_percent = {} auc_by_percent = {} roc = [] auc = 0.0 k_folds = Preprocess.prepare_k_folds(training_data, k) percent_list = (5, 10, 15, 20, 30, 50, 80) for i in range(k): tr_data_all, te_data = Preprocess.get_i_fold(k_folds, i)
def main():
    """Boost decision stumps on the full spam data and rank the features.

    Converts the zero labels with ``util.replace_zero_label_with_neg_one``,
    runs 300 boosting rounds of ``ds.DecisionStump`` on the whole data set
    (no test split), prints the per-round errors, derives a feature ranking
    with ``util.get_f_ranking_from_predictions`` and writes the results
    through ``util.write_result_to_file``.
    """
    # training parameter
    k = 10  # fold
    round_limit = 300
    result_path = 'results/PB1_A_spam_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'
    data_path = 'data/spam/data.pickle'

    # load training data and convert labels from {0, 1} to {-1, 1}
    training_data = loader.load_pickle_file(data_path)
    util.replace_zero_label_with_neg_one(training_data)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # result slots that this script never fills in
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    te_roc_1st_boost = None
    auc = 0.0

    tr_data = training_data
    features, labels = tr_data[0], tr_data[1]
    tr_n, f_d = np.shape(features)

    # initial distribution and thresholds cheat sheet
    d = util.init_distribution(len(features))
    thresh_cs = util.pre_compute_threshes(features, labels, threshes)
    boost = b.Boosting(d)

    training_predict = [0.0] * tr_n
    round_tr_err = []
    round_model_err = []
    for rnd in range(1, round_limit + 1):
        boost.add_model(ds.DecisionStump, features, labels, threshes, thresh_cs)
        boost.update_predict(features, training_predict)

        newest = boost.model[-1]
        c_model_err = newest.w_err
        round_model_err.append(c_model_err)
        c_tr_err = util.get_err_from_predict(training_predict, labels)
        round_tr_err.append(c_tr_err)
        print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(
            rnd, newest.f_ind, newest.thresh, c_model_err, c_tr_err, 0, 0))

    training_errs = [round_tr_err[-1]]
    ranked_f = util.get_f_ranking_from_predictions(boost, threshes)

    mean_training_err = np.mean(training_errs)
    print('Final results. Mean Train err: {}, Mean Test err: {}'.format(mean_training_err, 0))
    print('Top 10 features: ')
    print(ranked_f[:10])

    # log the training result to file
    result = {
        'Fold': k,
        'Trainingerrs': training_errs,
        'MeanTrainingAcc': mean_training_err,
        'Testingerrs': testing_errs,
        '1stBoostTrainingError': tr_errs_1st_boost,
        '1stBoostTestingError': te_errs_1st_boost,
        '1stBoostModelError': round_err_1st_boost,
        '1stBoostTestingAUC': te_auc_1st_boost,
        '1stBoostTestingROC': te_roc_1st_boost,
        'rankedFeatures': ranked_f,
        'AUC': auc,
    }
    util.write_result_to_file(result_path, model_name, result, True)
import GradientBoostedTrees as g import copy # training parameter layer_thresh = 2 R = 10 result_path = 'results/housingGBT_final.err' model_name = 'housingGBT' threshes_path = 'data/housing_train.threshes' # laod and preprocess training data tr_data = loader.load_dataset('data/housing_train.txt') te_data = loader.load_dataset('data/housing_test.txt') # load thresholds threshes = loader.load_pickle_file(threshes_path) # start training training_errs = [] testing_errs = [] tr_n, f_d = np.shape(tr_data[0]) round = 1 gbt = g.GradientBoostedTrees() gbt_label = copy.deepcopy(tr_data[1]) while round <= R: # prepare training data gbt.add_tree(tr_data[0], gbt_label, threshes, layer_thresh) # training error is from newly added tree, testing error is from current GBT pred = gbt.trees[-1].batch_predict(tr_data[0])
def main():
    """Run k-fold boosting of decision stumps on the crx or vote data.

    For each of the 10 folds, 150 boosting rounds of ``ds.DecisionStump``
    are run; the train and test error after the last round are recorded.
    The per-fold and mean errors are printed and written (as strings)
    through ``util.write_result_to_file``.
    """
    # training parameter
    target = 'crx'
    # target = 'vote'
    k = 10  # fold
    round_limit = 150

    if target != 'crx':
        result_path = 'results/voteBoosting_final.acc'
        model_name = 'vote_' + str(k) + 'fold'
        threshes_path = 'data/vote.threshes'
        data_path = 'data/vote_parsed.data'
    else:
        result_path = 'results/crxBoosting_final_1.acc'
        model_name = 'crx_' + str(k) + 'fold'
        threshes_path = 'data/crx.threshes'
        data_path = 'data/crx_parsed.data'

    # load training data and thresholds
    training_data = loader.load_pickle_file(data_path)
    threshes = loader.load_pickle_file(threshes_path)

    # result slots that this script never fills in
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    auc = 0.0

    training_errs = []
    testing_errs = []
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    for fold in range(k):
        tr_data, te_data = Preprocess.get_i_fold(k_folds, fold)
        tr_x, tr_y = tr_data[0], tr_data[1]
        te_x, te_y = te_data[0], te_data[1]
        tr_n, f_d = np.shape(tr_x)
        te_n, = np.shape(te_y)

        # initial distribution and thresholds cheat sheet
        d = util.init_distribution(len(tr_x))
        thresh_cs = util.pre_compute_threshes_uci(tr_x, tr_y, threshes)
        boost = b.Boosting(d)

        testing_predict = [0.0] * te_n
        training_predict = [0.0] * tr_n
        round_model_err = []
        for rnd in range(1, round_limit + 1):
            boost.add_model(ds.DecisionStump, tr_x, tr_y, threshes, thresh_cs)
            boost.update_predict(tr_x, training_predict)
            boost.update_predict(te_x, testing_predict)

            newest = boost.model[-1]
            c_model_err = newest.w_err
            round_model_err.append(c_model_err)
            c_tr_err = util.get_err_from_predict(training_predict, tr_y)
            c_te_err = util.get_err_from_predict(testing_predict, te_y)
            print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f}'.format(
                rnd, newest.f_ind, newest.thresh, c_model_err, c_tr_err, c_te_err))

        # keep the errors reached after the last round of this fold
        training_errs.append(c_tr_err)
        testing_errs.append(c_te_err)

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    for line in (str(k) + '-fold validation done. Training errs are:',
                 training_errs,
                 'Mean training err is:',
                 mean_training_err,
                 'Testing errs are:',
                 testing_errs,
                 'Mean testing err is:',
                 mean_testing_err):
        print(line)

    # log the training result to file (every value is stored as a string)
    result = {
        'Fold': str(k),
        'Trainingerrs': str(training_errs),
        'MeanTrainingAcc': str(mean_training_err),
        'Testingerrs': str(testing_errs),
        'MeanTestingAcc': str(mean_testing_err),
        '1stBoostTrainingError': str(tr_errs_1st_boost),
        '1stBoostTestingError': str(te_errs_1st_boost),
        '1stBoostModelError': str(round_err_1st_boost),
        '1stBoostTestingAUC': str(te_auc_1st_boost),
        'AUC': str(auc),
    }
    util.write_result_to_file(result_path, model_name, result)
def main():
    """Boost decision stumps on random subsets of the spambase training pool.

    Uses fold 1 of a 5-fold split as the test set. For every rate in
    ``(5, 10, 15, 20, 30, 50)`` a subset is drawn with
    ``DataSet.random_pick`` and 50 boosting rounds of ``ds.DecisionStump``
    are run; the train/test error after the last round is recorded. The
    errors and their means are printed and written through
    ``util.write_result_to_file``.
    """
    # training parameter
    round_limit = 50
    result_path = 'results/spamActive_random_final_1.acc'
    model_name = 'spam_active'
    threshes_path = 'data/spambase.threshes'

    # load training data and convert labels from {0, 1} to {-1, 1}
    training_data = loader.load_dataset('data/spambase.data')
    util.replace_zero_label_with_neg_one(training_data)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    training_errs = []
    testing_errs = []
    auc = 0.0

    k_folds = Preprocess.prepare_k_folds(training_data, 5)
    tr_data_pool, te_data = Preprocess.get_i_fold(k_folds, 1)
    te_x, te_y = te_data[0], te_data[1]
    data_set = DataSet.DataSet(tr_data_pool)

    for rate in (5, 10, 15, 20, 30, 50):
        tr_data = data_set.random_pick(rate, False)
        tr_x, tr_y = tr_data[0], tr_data[1]
        tr_n, f_d = np.shape(tr_x)
        te_n, = np.shape(te_y)

        # initial distribution and thresholds cheat sheet
        d = util.init_distribution(len(tr_x))
        thresh_cs = util.pre_compute_threshes(tr_x, tr_y, threshes)
        boost = b.Boosting(d)

        testing_predict = [0.0] * te_n
        training_predict = [0.0] * tr_n
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        for rnd in range(1, round_limit + 1):
            boost.add_model(ds.DecisionStump, tr_x, tr_y, threshes, thresh_cs)
            boost.update_predict(tr_x, training_predict)
            boost.update_predict(te_x, testing_predict)

            newest = boost.model[-1]
            c_model_err = newest.w_err
            round_model_err.append(c_model_err)
            c_tr_err = util.get_err_from_predict(training_predict, tr_y)
            c_te_err = util.get_err_from_predict(testing_predict, te_y)
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            print('Data {}% Round: {} Feature: {} Threshold: {:.3f} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {}'.format(
                rate, rnd, newest.f_ind, newest.thresh, c_model_err, c_tr_err, c_te_err, 0))

        # keep the errors reached after the last round for this rate
        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    for line in ('Training errs are:',
                 training_errs,
                 'Mean training err is:',
                 mean_training_err,
                 'Testing errs are:',
                 testing_errs,
                 'Mean testing err is:',
                 mean_testing_err):
        print(line)

    # log the training result to file
    result = {
        'Trainingerrs': training_errs,
        'MeanTrainingAcc': mean_training_err,
        'Testingerrs': testing_errs,
        'MeanTestingAcc': mean_testing_err,
        'AUC': auc,
    }
    util.write_result_to_file(result_path, model_name, result, True)
def main():
    """Train boosted decision stumps on the polluted spam data set.

    Loads the pickled train/test splits and the pickled thresholds, converts
    the labels with ``util.replace_zero_label_with_neg_one``, then adds
    ``round_limit`` ``ds.DecisionStump`` models to a ``b.Boosting`` ensemble,
    logging the weighted model error and the train/test error of every round.

    Afterwards the feature ranking is derived from the ensemble, the ensemble
    is saved to ``model_path`` and the collected results are written to
    ``result_path`` through ``util.write_result_to_file``.
    """
    st = time.time()

    # training parameters
    round_limit = 15
    result_path = 'results/PB1_B_spam_2.acc'
    model_name = 'spam_'
    model_path = result_path + '.model'
    threshes_path = 'data/spambase_polluted.threshes'
    train_data_path = 'data/spam_polluted/train/data.pickle'
    test_data_path = 'data/spam_polluted/test/data.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(tr_data)
    util.replace_zero_label_with_neg_one(te_data)
    print('{:.2f} Label converted!'.format(time.time() - st))

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)
    print('{:.2f} Thresholds loaded!'.format(time.time() - st))

    training_errs = []
    testing_errs = []
    # AUC is not computed by this experiment; the placeholder is still logged
    auc = 0.0
    # The thresholds cheat sheet is deliberately not pre-computed here (the
    # table is too large for this data set), so None is handed to the stumps.
    thresh_cs = None

    # the unpacking doubles as a shape check: 2-D features, 1-D labels
    tr_n, _ = np.shape(tr_data[0])
    te_n, = np.shape(te_data[1])

    # initial distribution over the training samples
    d = util.init_distribution(len(tr_data[0]))
    boost = b.Boosting(d)

    # running predictions, passed to boost.update_predict every round
    testing_predict = np.zeros(te_n).tolist()
    training_predict = np.zeros(tr_n).tolist()
    round_tr_err = []
    round_te_err = []
    round_model_err = []

    for round_no in range(1, round_limit + 1):
        boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
        boost.update_predict(tr_data[0], training_predict)
        boost.update_predict(te_data[0], testing_predict)

        newest = boost.model[-1]
        c_model_err = newest.w_err
        round_model_err.append(c_model_err)
        c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
        c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
        round_tr_err.append(c_tr_err)
        round_te_err.append(c_te_err)
        # TODO calculate the AUC for testing results; 0 is printed meanwhile
        print('{:.2f} Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(time.time() - st, round_no, newest.f_ind, newest.thresh, c_model_err, c_tr_err, c_te_err, 0))

    # keep only the errors reached after the last boosting round
    training_errs.append(round_tr_err[-1])
    testing_errs.append(round_te_err[-1])

    # feature ranking derived from the trained ensemble
    ranked_f = util.get_f_ranking_from_predictions(boost, threshes)

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print('Final results. Mean Train err: {}, Mean Test err: {}'.format(mean_training_err, mean_testing_err))
    # NOTE: only the header is printed; listing ranked_f[:10] is disabled.
    print('Top 10 features: ')

    # NOTE: the 'Mean...Acc' keys hold mean *error* values; the key names are
    # kept unchanged so existing result files stay comparable.
    result = {
        'Trainingerrs': training_errs,
        'MeanTrainingAcc': mean_training_err,
        'Testingerrs': testing_errs,
        'MeanTestingAcc': mean_testing_err,
        '1stBoostTrainingError': round_tr_err,
        '1stBoostTestingError': round_te_err,
        '1stBoostModelError': round_model_err,
        # AUC/ROC of the run are not computed, so None is recorded
        '1stBoostTestingAUC': None,
        '1stBoostTestingROC': None,
        'rankedFeatures': ranked_f,
        'AUC': auc,
    }

    # store the model
    loader.save(model_path, boost)

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
st = time.time() # training parameter result_path = 'results/PB3_B_spam_polluted_LoR_RIDGE_sklearn.acc' model_name = 'spam_' train_data_path = 'data/spam_polluted/train/data.pickle' test_data_path = 'data/spam_polluted/test/data.pickle' # params lamda = 0.0001 tol = 0.85 normalize_method = prep.zero_mean_unit_var term_method = util.acc_higher_than # laod and preprocess training data tr_data = loader.load_pickle_file(train_data_path) te_data = loader.load_pickle_file(test_data_path) print('{:.2f} Data loaded!'.format(time.time() - st)) tr_data[0] = tr_data[0].tolist() te_data[0] = te_data[0].tolist() # normalize features prep.normalize_features_all(normalize_method, tr_data[0], te_data[0]) print('{:.2f} Features normalized!'.format(time.time() - st)) # using sklearn parameters = {'C' : [0.05, 0.04, 0.1, 0.2, 0.3], 'penalty' : ('l2',), 'tol' : (0.06,)} model = LogisticRegression(C=0.05, penalty='l1', tol=0.08) clf = grid_search.GridSearchCV(model, parameters)