def main():
    """Train and evaluate linear regression on the housing dataset.

    Loads the train/test splits, normalizes features with shift-and-scale,
    fits a LinearRegression model, prints train/test MSE, and logs the
    result (including theta) to the result file.
    """
    # training parameters
    result_path = 'results/housingLiR_1.mse'
    model_name = 'housing_shiftAndScale'
    # normalization = Preprocess.zero_mean_unit_var
    normalization = Preprocess.shift_and_scale
    # cols_not_norm = (0,7,12)
    cols_not_norm = []  # feature columns excluded from normalization

    # load and preprocess training data
    training_data = loader.load_dataset('data/housing_train.txt')
    testing_data = loader.load_dataset('data/housing_test.txt')
    Preprocess.normalize_features_all(normalization, training_data[0], testing_data[0], cols_not_norm)

    # start training
    model = rm.LinearRegression()
    model.build(training_data[0], training_data[1])
    training_mse = model.test(training_data[0], training_data[1], util.mse)
    testing_mse = model.test(testing_data[0], testing_data[1], util.mse)

    # fixed: Python 2 print statements converted to print() calls for
    # consistency with the rest of the file, which is Python 3
    print('Error for training data is:')
    print(training_mse)
    print('Error for testing data is:')
    print(testing_mse)

    result = {}
    result['TrainingMSE'] = str(training_mse)
    result['TestingMSE'] = str(testing_mse)
    result['Theta'] = str(model.theta)

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
def main():
    """Gaussian naive Bayes on the polluted spam dataset.

    Loads pickled train/test splits, fits the model, scores accuracy on
    both splits, and logs the outcome to the result file.
    """
    start = time.time()

    # training parameters
    result_path = 'results/PB2_A_spam_polluted_NB_Gaussian.acc'
    model_name = 'spam_'
    train_data_path = 'data/spam_polluted/train/data.pickle'
    test_data_path = 'data/spam_polluted/test/data.pickle'

    train_set = loader.load_pickle_file(train_data_path)
    test_set = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - start))

    # fit the model
    print('{:.2f} Building model...'.format(time.time() - start))
    nb_model = m.NBGaussian()
    nb_model.build(train_set[0], train_set[1])

    # predict on both splits
    print('{:.2f} Predicting...'.format(time.time() - start))
    train_pred = nb_model.predict(train_set[0])
    test_pred = nb_model.predict(test_set[0])

    # accuracy = fraction of matching labels
    print('{:.2f} Calculating results...'.format(time.time() - start))
    train_acc = (train_set[1] == train_pred).sum() / train_set[0].shape[0]
    test_acc = (test_set[1] == test_pred).sum() / test_set[0].shape[0]
    print('{} Final results. Train acc: {}, Test acc: {}'.format(time.time() - start, train_acc, test_acc))

    result = {'TrainingAcc': train_acc, 'TestingAcc': test_acc}

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
def main():
    """Bernoulli naive Bayes on the spam dataset with missing features.

    Per-feature means (loaded from a pickle) are passed to the model,
    which presumably uses them as binarization thresholds — TODO confirm
    in NBBernoulli.  Prints train/test accuracy and logs the result.
    """
    st = time.time()

    # training parameters
    result_path = 'results/PB4_spam_polluted_missing_NB_Bern.acc'
    model_name = 'spam_'
    mean_path = 'data/spam_polluted_missing/train/f_mean.pickle'
    train_data_path = 'data/spam_polluted_missing/train/data.pickle'
    test_data_path = 'data/spam_polluted_missing/test/data.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # load per-feature means
    means = loader.load_pickle_file(mean_path)
    print('{:.2f} Means loaded!'.format(time.time() - st))

    # start training
    # (removed dead locals from the original: roc, auc, tr_n/f_d, te_n,
    # te_auc and round were assigned but never used)
    model = m.NBBernoulli(means)
    model.build(tr_data[0], tr_data[1])
    training_acc = model.test(tr_data[0], tr_data[1], util.acc)
    testing_acc = model.test(te_data[0], te_data[1], util.acc)
    print('Final results. Train acc: {}, Test acc: {}'.format(training_acc, testing_acc))

    result = {}
    result['TrainingAcc'] = training_acc
    result['TestingAcc'] = testing_acc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
def main():
    """AdaBoost with decision stumps on the polluted spam dataset.

    Loads pickled train/test splits, converts {0,1} labels to {-1,1},
    boosts stumps for `round_limit` rounds, then logs the per-round
    errors, a feature ranking, and the final model.
    """
    st = time.time()

    # training parameters
    round_limit = 15
    result_path = 'results/PB1_B_spam_2.acc'
    model_name = 'spam_'
    model_path = result_path + '.model'
    threshes_path = 'data/spambase_polluted.threshes'
    train_data_path = 'data/spam_polluted/train/data.pickle'
    test_data_path = 'data/spam_polluted/test/data.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(tr_data)
    util.replace_zero_label_with_neg_one(te_data)
    print('{:.2f} Label converted!'.format(time.time() - st))

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)
    print('{:.2f} Thresholds loaded!'.format(time.time() - st))

    # start training
    training_errs = []
    testing_errs = []
    # AUC/ROC tracking is disabled below; these stay None in the log
    te_auc_1st_boost = None
    te_roc_1st_boost = None
    auc = 0.0
    # thresholds cheat sheet deliberately not precomputed here (the table
    # would be huge for this feature count); None makes the stumps evaluate
    # thresholds on the fly
    thresh_cs = None

    tr_n, f_d = np.shape(tr_data[0])
    te_n, = np.shape(te_data[1])

    # prepare initial (uniform) sample distribution
    d = util.init_distribution(len(tr_data[0]))
    # thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
    # print('{:.2f} Thresholds cheat sheet computed!'.format(time.time() - st))
    boost = b.Boosting(d)
    testing_predict = np.zeros((1, te_n)).tolist()[0]
    training_predict = np.zeros((1, tr_n)).tolist()[0]
    round_tr_err = []
    round_te_err = []
    round_model_err = []

    round_num = 0  # renamed from `round` to avoid shadowing the builtin
    while round_num < round_limit:
        round_num += 1
        # fit one more weighted stump and fold it into the running predictions
        boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
        boost.update_predict(tr_data[0], training_predict)
        boost.update_predict(te_data[0], testing_predict)
        c_model_err = boost.model[-1].w_err  # weighted error of the newest stump
        round_model_err.append(c_model_err)
        c_f_ind = boost.model[-1].f_ind
        c_thresh = boost.model[-1].thresh
        c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
        c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
        # AUC computation disabled for speed:
        # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
        round_tr_err.append(c_tr_err)
        round_te_err.append(c_te_err)
        print('{:.2f} Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(time.time() - st, round_num, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, 0))

    # record final-round errors
    training_errs.append(round_tr_err[-1])
    testing_errs.append(round_te_err[-1])
    # get feature ranking from the predictions
    ranked_f = util.get_f_ranking_from_predictions(boost, threshes)
    round_err_1st_boost = round_model_err
    tr_errs_1st_boost = round_tr_err
    te_errs_1st_boost = round_te_err
    # te_auc_1st_boost = round_te_auc
    # _, te_roc_1st_boost = util.get_auc_from_predict(testing_predict, te_data[1], True)

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)
    # fixed: the original format string contained a raw newline inside a
    # single-quoted literal (a SyntaxError); joined into one literal
    print('Final results. Mean Train err: {}, Mean Test err: {}'.format(mean_training_err, mean_testing_err))
    print('Top 10 features: ')
    # print(ranked_f[:10])

    result = {}
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err
    result['1stBoostTrainingError'] = tr_errs_1st_boost
    result['1stBoostTestingError'] = te_errs_1st_boost
    result['1stBoostModelError'] = round_err_1st_boost
    result['1stBoostTestingAUC'] = te_auc_1st_boost
    result['1stBoostTestingROC'] = te_roc_1st_boost
    result['rankedFeatures'] = ranked_f
    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # store the model
    loader.save(model_path, boost)
    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
def main():
    # Active-learning style experiment: AdaBoost with decision stumps on
    # spambase, trained on randomly-picked subsets of growing size
    # (5%..50% of a training pool) and evaluated on one fixed held-out fold.

    # training parameter
    round_limit = 50
    result_path = 'results/spamActive_random_final_1.acc'
    model_name = 'spam_active'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')

    # convert labels from {0, 1} to {-1, 1} for boosting
    util.replace_zero_label_with_neg_one(training_data)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    # round_err_1st_boost = None
    # tr_errs_1st_boost = None
    # te_errs_1st_boost = None
    # te_auc_1st_boost = None
    roc = []
    auc = 0.0

    # 5 folds: fold 1 becomes the fixed test set, the rest the training pool
    k_folds = Preprocess.prepare_k_folds(training_data, 5)
    tr_data_pool, te_data = Preprocess.get_i_fold(k_folds, 1)
    data_set = DataSet.DataSet(tr_data_pool)
    data_rates = (5, 10, 15, 20, 30, 50)  # percentages of the pool to train on

    for c in data_rates:
        # randomly pick c% of the pool; the False flag presumably means the
        # picked samples are not removed from the pool — TODO confirm in
        # DataSet.random_pick
        tr_data = data_set.random_pick(c, False)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        # prepare distribution (uniform over the picked training samples)
        d = util.init_distribution(len(tr_data[0]))
        # compute thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)
        testing_predict = np.zeros((1, te_n)).tolist()[0]
        training_predict = np.zeros((1, tr_n)).tolist()[0]
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False  # AUC convergence check below is commented out
        tol = 1e-5
        te_auc = 2.  # sentinel larger than any real AUC
        round = 0
        while round < round_limit:  # and not converged:
            round += 1
            # fit one more weighted stump and fold it into the predictions
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err  # weighted error of the newest stump
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind      # feature the stump split on
            c_thresh = boost.model[-1].thresh    # threshold the stump chose
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # calculate the AUC for testing results (disabled)
            # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            # round_te_auc.append(c_te_auc)
            print('Data {}% Round: {} Feature: {} Threshold: {:.3f} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {}'.format(c, round, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, 0))
            # converged = abs(c_te_auc - te_auc) / te_auc <= tol
            # te_auc = c_te_auc
        # record the final-round errors for this data rate
        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])
        # break  # for testing

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)
    print('Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err
    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
def main():
    """Single decision-tree run on spambase using the first fold of a
    k-fold split; reports accuracy on both splits and logs the result."""
    # training parameters
    k = 10  # fold
    layer_thresh = 2
    T = 50  # kept from the original; unused below
    result_path = 'results/spamDT_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for fold_ind in range(1):  # only the first fold is evaluated
        fold_start = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, fold_ind)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])
        tree = dt.DecisionTree()
        tree.build(tr_data[0], tr_data[1], threshes, layer_thresh)
        # test the model and record accuracy on both splits
        training_errs.append(tree.test(tr_data[0], tr_data[1], util.acc))
        testing_errs.append(tree.test(te_data[0], te_data[1], util.acc))
        print('Round {} finishes, time used: {}'.format(fold_ind, time.time() - fold_start))

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)
    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {
        'Fold': k,
        'Trainingerrs': training_errs,
        'MeanTrainingAcc': mean_training_err,
        'Testingerrs': testing_errs,
        'MeanTestingAcc': mean_testing_err,
        'ROC': roc,
        'AUC': auc,
    }

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
# NOTE(review): tail fragment of a k-fold training routine — the enclosing
# `def` (and the definitions of mean_training_acc, testing_accs, etc.) lies
# outside this chunk.
# Fixes: converted Python 2 print statements to print() calls so the chunk is
# consistent with the rest of the file (Python 3), and corrected the 'AOC'
# typo in the user-facing message to 'AUC' (the value printed is `auc`).
print(mean_training_acc)
print('Testing accs are:')
print(testing_accs)
print('Mean testing acc is:')
print(mean_testing_acc)
print('Mean Training Confusion Matrix is:')
print(mean_training_cm)
print('Mean Testing Confusion Matrix is:')
print(mean_testing_cm)
print('AUC for fold 1 is:')
print(auc)

result = {}
result['Fold'] = str(k)
result['TrainingAccs'] = str(training_accs)
result['MeanTrainingAcc'] = str(mean_training_acc)
result['TestingAccs'] = str(testing_accs)
result['MeanTestingAcc'] = str(mean_testing_acc)
result['TrainingCMs'] = str(training_cms)
result['TestingCMs'] = str(testing_cms)
result['MeanTrainingCM'] = str(mean_training_cm)
result['MeanTestingCM'] = str(mean_testing_cm)
result['ROC'] = str(roc)
result['AUC'] = str(auc)

# log the training result to file
util.write_result_to_file(result_path, model_name, result)
def main():
    """AdaBoost with decision stumps on the full spam dataset (no held-out
    test split); logs per-round training error and a feature ranking."""
    # training parameters
    k = 10  # fold
    round_limit = 300
    result_path = 'results/PB1_A_spam_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)
    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []  # stays empty: no test split in this run
    # never assigned below; logged as None placeholders in the result
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    te_roc_1st_boost = None
    auc = 0.0

    tr_data = training_data
    tr_n, f_d = np.shape(tr_data[0])

    # prepare initial (uniform) sample distribution
    d = util.init_distribution(len(tr_data[0]))
    # compute thresholds cheat sheet
    thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
    boost = b.Boosting(d)
    training_predict = np.zeros((1, tr_n)).tolist()[0]
    round_tr_err = []
    round_model_err = []

    round_num = 0  # renamed from `round` to avoid shadowing the builtin
    while round_num < round_limit:
        round_num += 1
        # fit one more weighted stump and fold it into the running predictions
        boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
        boost.update_predict(tr_data[0], training_predict)
        c_model_err = boost.model[-1].w_err  # weighted error of the newest stump
        round_model_err.append(c_model_err)
        c_f_ind = boost.model[-1].f_ind
        c_thresh = boost.model[-1].thresh
        c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
        round_tr_err.append(c_tr_err)
        print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(round_num, c_f_ind, c_thresh, c_model_err, c_tr_err, 0, 0))

    # record the final-round training error
    training_errs.append(round_tr_err[-1])
    # get feature ranking from the predictions
    ranked_f = util.get_f_ranking_from_predictions(boost, threshes)

    mean_training_err = np.mean(training_errs)
    # fixed: the original format string contained a raw newline inside a
    # single-quoted literal (a SyntaxError); joined into one literal
    print('Final results. Mean Train err: {}, Mean Test err: {}'.format(mean_training_err, 0))
    print('Top 10 features: ')
    print(ranked_f[:10])

    result = {}
    result['Fold'] = k
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['1stBoostTrainingError'] = tr_errs_1st_boost
    result['1stBoostTestingError'] = te_errs_1st_boost
    result['1stBoostModelError'] = round_err_1st_boost
    result['1stBoostTestingAUC'] = te_auc_1st_boost
    result['1stBoostTestingROC'] = te_roc_1st_boost
    result['rankedFeatures'] = ranked_f
    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
def main():
    """AdaBoost with decision stumps on the UCI crx (or vote) dataset,
    evaluated with k-fold cross validation; logs per-fold final errors."""
    # training parameters
    target = 'crx'
    # target = 'vote'
    k = 10  # fold
    round_limit = 150

    # per-dataset paths
    if target == 'crx':
        result_path = 'results/crxBoosting_final_1.acc'
        model_name = 'crx_' + str(k) + 'fold'
        threshes_path = 'data/crx.threshes'
        data_path = 'data/crx_parsed.data'
    else:
        result_path = 'results/voteBoosting_final.acc'
        model_name = 'vote_' + str(k) + 'fold'
        threshes_path = 'data/vote.threshes'
        data_path = 'data/vote_parsed.data'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    # never assigned below; logged as None placeholders in the result
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    auc = 0.0

    k_folds = Preprocess.prepare_k_folds(training_data, k)
    for i in range(k):
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        # prepare initial (uniform) sample distribution
        d = util.init_distribution(len(tr_data[0]))
        # compute thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes_uci(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)
        testing_predict = np.zeros((1, te_n)).tolist()[0]
        training_predict = np.zeros((1, tr_n)).tolist()[0]

        # removed dead locals from the original (round_* accumulators,
        # converged/tol/te_auc): all of their uses were commented out
        round_num = 0  # renamed from `round` to avoid shadowing the builtin
        while round_num < round_limit:
            round_num += 1
            # fit one more weighted stump and fold it into the predictions
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err  # weighted error of the newest stump
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f}'.format(round_num, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err))
        # record the final-round errors for this fold
        training_errs.append(c_tr_err)
        testing_errs.append(c_te_err)
        # break  # for testing

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)
    # fixed: the original format string contained a raw newline inside a
    # single-quoted literal (a SyntaxError); joined into one literal
    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Fold'] = str(k)
    result['Trainingerrs'] = str(training_errs)
    result['MeanTrainingAcc'] = str(mean_training_err)
    result['Testingerrs'] = str(testing_errs)
    result['MeanTestingAcc'] = str(mean_testing_err)
    result['1stBoostTrainingError'] = str(tr_errs_1st_boost)
    result['1stBoostTestingError'] = str(te_errs_1st_boost)
    result['1stBoostModelError'] = str(round_err_1st_boost)
    result['1stBoostTestingAUC'] = str(te_auc_1st_boost)
    # result['ROC'] = str(roc)
    result['AUC'] = str(auc)

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
def main():
    """AdaBoost with optimal decision stumps on spambase; evaluates the
    first fold of a k-fold split and tracks per-round test error and AUC."""
    # training parameters
    k = 10  # fold
    round_limit = 100
    result_path = 'results/spamODSBoosting_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)
    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    te_roc_1st_boost = None
    auc = 0.0

    k_folds = Preprocess.prepare_k_folds(training_data, k)
    for i in range(1):  # only the first fold is evaluated
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        # prepare initial (uniform) sample distribution
        d = util.init_distribution(len(tr_data[0]))
        # compute thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)
        testing_predict = np.zeros((1, te_n)).tolist()[0]
        training_predict = np.zeros((1, tr_n)).tolist()[0]
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False  # computed each round; early stop itself is disabled
        tol = 1e-5
        te_auc = 2.  # sentinel larger than any real AUC

        round_num = 0  # renamed from `round` to avoid shadowing the builtin
        while round_num < round_limit:  # and not converged:
            round_num += 1
            # fit one more weighted stump and fold it into the predictions
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err  # weighted error of the newest stump
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # calculate the AUC for testing results
            c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            round_te_auc.append(c_te_auc)
            print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(round_num, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, c_te_auc))
            # relative AUC change; only updates state while the loop
            # condition's convergence test is commented out
            converged = abs(c_te_auc - te_auc) / te_auc <= tol
            te_auc = c_te_auc

        # record the final-round errors for this fold
        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])
        if i == 0:
            round_err_1st_boost = round_model_err
            tr_errs_1st_boost = round_tr_err
            te_errs_1st_boost = round_te_err
            te_auc_1st_boost = round_te_auc
            _, te_roc_1st_boost = util.get_auc_from_predict(testing_predict, te_data[1], True)
        # break  # for testing

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)
    # fixed: the original format string contained a raw newline inside a
    # single-quoted literal (a SyntaxError); joined into one literal
    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Fold'] = k
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err
    result['1stBoostTrainingError'] = tr_errs_1st_boost
    result['1stBoostTestingError'] = te_errs_1st_boost
    result['1stBoostModelError'] = round_err_1st_boost
    result['1stBoostTestingAUC'] = te_auc_1st_boost
    result['1stBoostTestingROC'] = te_roc_1st_boost
    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)