def main():
    """Compare the two entropy / information-gain implementations of dt.DecisionTree.

    Loads the spambase data and its pre-computed thresholds, takes fold 0
    of a 10-fold split, and evaluates either compute_ig ('v1', repeated
    100 times as a timing loop) or compute_ig_v2 on the first feature
    column.
    """
    # training parameters
    target = 'v2'
    k = 10  # fold
    layer_thresh = 2
    T = 50
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # load pre-computed candidate thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # take fold 0 of a k-fold split as the working training set
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    tr_data, te_data = Preprocess.get_i_fold(k_folds, 0)

    # values of the first feature, one per training sample
    f_cur = [x[0] for x in tr_data[0]]

    tree = dt.DecisionTree()
    if target == 'v1':
        # v1 path repeats the computation to make timing measurable
        for _ in range(100):
            h_y = tree.compute_entropy(tr_data[1])
            thresh = threshes[0][30]
            ig = tree.compute_ig(f_cur, tr_data[1], thresh, h_y)
    else:
        h_y = tree.compute_entropy_v2(tr_data[1])
        thresh = threshes[0][0]
        ig = tree.compute_ig_v2(f_cur, tr_data[1], thresh, h_y)
def main():
    """Run kNN with feature selection on the pickled spam data.

    Uses fold 0 of a 10-fold split, fits a Euclidean-distance kNN that
    keeps the 5 best features, then reports train/test accuracy for
    several neighbour counts.
    """
    # training parameters
    # (removed unused locals: is_sklearn, result_path, model_name,
    #  training_errs, testing_errs — nothing in this script read them)
    k = 10  # fold
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)
    # Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in (0,):  # only fold 0 is evaluated
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        kernel = c.EUCLIDEAN
        # kernel = c.GAUSSIAN
        f_select = True
        best_features_num = 5
        clf = kNN.kNN(kernel=kernel)
        clf.fit(tr_data[0], tr_data[1], f_select=f_select, best_f=best_features_num)
        print("Best features: {}".format(clf.best_f_indices))

        # evaluate the fitted model for several neighbour counts
        for kk in (1, 2, 3, 7):
            tr_pred = clf.predict(tr_data[0], k=kk)
            te_pred = clf.predict(te_data[0], k=kk)
            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
            print('{} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, kk, tr_acc, te_acc))
def main():
    """Evaluate radius-based kNN on the pickled spam data (fold 1 only)."""
    # training parameters
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    # start training
    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()

    for i in (1,):  # only fold 1 is evaluated
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        print('{:.2f} Start training.'.format(time.time() - st))

        # sweep the neighbourhood radius
        for r in (2.5, 2.7):
            model = kNN.kNN(kernel=kernel)
            model.fit(tr_data[0], tr_data[1])
            train_pred = model.predict(tr_data[0], r=r)
            test_pred = model.predict(te_data[0], r=r)
            train_acc = (tr_data[1] == train_pred).sum() / tr_data[0].shape[0]
            test_acc = (te_data[1] == test_pred).sum() / te_data[0].shape[0]
            testing_accs.append(test_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, i, kernel, r, train_acc, test_acc))
def main():
    """Train sklearn's SVC on the spam data and report fold-0 accuracy.

    Labels are remapped from {0, 1} to {-1, 1} and features are
    normalised to zero mean / unit variance before fitting.
    """
    # training parameters
    # (removed unused locals: result_path, model_name, threshes_path —
    #  nothing in this script read them)
    k = 10  # fold
    data_path = "data/spam/data.pickle"
    # kernel = 'poly'
    kernel = "linear"
    # kernel = 'rbf'
    verbose = False
    tol = 0.01
    c = 0.1  # SVC regularisation strength C

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)
    # normalize
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    print("Preparing k fold data.")
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):  # only fold 0 is evaluated
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        # fix: '{:3f}' (min-width 3, default precision) was clearly
        # intended as '{:.3f}' (3 decimal places)
        print("{:.3f} Start training. Kernel: {}".format(time.time() - st, kernel))
        clf = svm.SVC(C=c, kernel=kernel, tol=tol, verbose=verbose)
        # clf = svm.NuSVC(kernel=kernel, tol=tol, verbose=verbose)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0])
        te_pred = clf.predict(te_data[0])
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        print("{:.3f} Final results. Train acc: {}, Test acc: {}".format(time.time() - st, tr_acc, te_acc))
def main():
    """Active-learning style boosting experiment on the spambase data.

    For increasing random subsets of a training pool (5%..50% of fold 1's
    complement), runs AdaBoost with decision stumps for round_limit
    rounds, then logs the final-round train/test error of each run.
    """
    # training parameters
    round_limit = 50
    result_path = 'results/spamActive_random_final_1.acc'
    model_name = 'spam_active'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)
    # load pre-computed candidate thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    roc = []
    auc = 0.0

    k_folds = Preprocess.prepare_k_folds(training_data, 5)
    tr_data_pool, te_data = Preprocess.get_i_fold(k_folds, 1)
    data_set = DataSet.DataSet(tr_data_pool)
    data_rates = (5, 10, 15, 20, 30, 50)  # % of the pool to sample

    # fix: loop variable renamed from 'c', which shadowed the constants
    # module 'c' used by sibling scripts in this file
    for rate in data_rates:
        tr_data = data_set.random_pick(rate, False)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        # prepare boosting distribution and threshold cheat sheet
        d = util.init_distribution(len(tr_data[0]))
        thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)

        testing_predict = np.zeros((1, te_n)).tolist()[0]
        training_predict = np.zeros((1, tr_n)).tolist()[0]
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False
        tol = 1e-5
        te_auc = 2.

        # fix: renamed from 'round', which shadowed the builtin
        round_num = 0
        while round_num < round_limit:  # and not converged:
            round_num += 1
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            # round_te_auc.append(c_te_auc)
            print('Data {}% Round: {} Feature: {} Threshold: {:.3f} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {}'.format(rate, round_num, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, 0))
            # converged = abs(c_te_auc - te_auc) / te_auc <= tol
            # te_auc = c_te_auc

        # record the error after the final boosting round of this run
        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])
        # break  # for testing

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print('Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    # NOTE(review): the '...Acc' keys actually hold error rates; key
    # names kept for compatibility with existing result files
    result = {}
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err
    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
def main():
    """Build a depth-limited decision tree on spambase and log accuracy."""
    # training parameters
    k = 10  # fold
    layer_thresh = 2
    T = 50
    result_path = 'results/spamDT_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # load pre-computed candidate thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for fold in range(1):  # only fold 0 is evaluated
        start = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, fold)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        tree = dt.DecisionTree()
        tree.build(tr_data[0], tr_data[1], threshes, layer_thresh)

        # score the tree on both splits
        training_errs.append(tree.test(tr_data[0], tr_data[1], util.acc))
        testing_errs.append(tree.test(te_data[0], te_data[1], util.acc))
        print('Round {} finishes, time used: {}'.format(fold, time.time() - start))

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Fold'] = k
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err
    result['ROC'] = roc
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
k = 10 # fold result_path = 'results/spamNBBern_1.acc' model_name = 'spam_' + str(k) + 'fold_' + 'SS' # laod and preprocess training data training_data = loader.load_dataset('data/spambase.data') # start training training_accs = [] training_cms = [] testing_accs = [] testing_cms = [] roc = [] auc = 0.0 k_folds = Preprocess.prepare_k_folds(training_data, k) means = loader.load_spam_mean('data/spam_mean') for i in range(k): tr_data, te_data = Preprocess.get_i_fold(k_folds, i) model = m.NBBernoulli(means) model.build(tr_data[0], tr_data[1]) training_test_res = model.test(tr_data[0], tr_data[1], util.compute_acc_confusion_matrix) training_accs.append(training_test_res[0]) training_cms.append(training_test_res[1]) testing_test_res = model.test(te_data[0], te_data[1], util.compute_acc_confusion_matrix) testing_accs.append(testing_test_res[0])
def main():
    """AdaBoost with decision stumps on the crx (or vote) UCI dataset.

    Runs k-fold cross validation; each fold boosts for round_limit
    rounds, and the final-round train/test error of every fold is
    aggregated and written to the result file.
    """
    # training parameters
    target = 'crx'
    # target = 'vote'
    k = 10  # fold
    round_limit = 150

    if target == 'crx':
        result_path = 'results/crxBoosting_final_1.acc'
        model_name = 'crx_' + str(k) + 'fold'
        threshes_path = 'data/crx.threshes'
        data_path = 'data/crx_parsed.data'
    else:
        result_path = 'results/voteBoosting_final.acc'
        model_name = 'vote_' + str(k) + 'fold'
        threshes_path = 'data/vote.threshes'
        data_path = 'data/vote_parsed.data'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # load pre-computed candidate thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(k):
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        # prepare boosting distribution and threshold cheat sheet
        d = util.init_distribution(len(tr_data[0]))
        thresh_cs = util.pre_compute_threshes_uci(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)

        testing_predict = np.zeros((1, te_n)).tolist()[0]
        training_predict = np.zeros((1, tr_n)).tolist()[0]
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False
        tol = 1e-5
        te_auc = 2.

        # fix: renamed from 'round', which shadowed the builtin
        round_num = 0
        while round_num < round_limit:  # and not converged:
            round_num += 1
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            # round_tr_err.append(c_tr_err)
            # round_te_err.append(c_te_err)
            # round_te_auc.append(c_te_auc)
            print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f}'.format(round_num, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err))
            # converged = abs(c_te_auc - te_auc) / te_auc <= tol
            # te_auc = c_te_auc

        # keep the errors from the last boosting round of this fold
        training_errs.append(c_tr_err)
        testing_errs.append(c_te_err)
        # NOTE(review): disabled first-fold bookkeeping below; the guard
        # almost certainly should test 'i == 0', not 'k == 0'
        # if k == 0:
        #     round_err_1st_boost = round_model_err
        #     tr_errs_1st_boost = round_tr_err
        #     te_errs_1st_boost = round_te_err
        #     te_auc_1st_boost = round_te_auc
        # break  # for testing

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    # NOTE(review): the '...Acc' keys actually hold error rates; key
    # names kept for compatibility with existing result files
    result = {}
    result['Fold'] = str(k)
    result['Trainingerrs'] = str(training_errs)
    result['MeanTrainingAcc'] = str(mean_training_err)
    result['Testingerrs'] = str(testing_errs)
    result['MeanTestingAcc'] = str(mean_testing_err)
    result['1stBoostTrainingError'] = str(tr_errs_1st_boost)
    result['1stBoostTestingError'] = str(te_errs_1st_boost)
    result['1stBoostModelError'] = str(round_err_1st_boost)
    result['1stBoostTestingAUC'] = str(te_auc_1st_boost)
    # result['ROC'] = str(roc)
    result['AUC'] = str(auc)

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
def main():
    """Build per-tag team features from match/player data and classify
    match outcomes with AdaBoost over a 9-fold split."""
    # centers = [i for i in range(3, 15)]
    # for c in centers:
    #     k_means(c)
    st = time.time()

    # load data
    print('{} Loading match data...'.format(time.time() - st))
    match_dict = util.load_pickle_file(perfect_match_path)
    print('Loading player dict...')
    player_dict = util.load_pickle(player_dict_path)
    print('Loading champion tags...')
    champ_tags = util.load_pickle(champ_tags_path)
    # champ_tags[0] is the collection of tag names, champ_tags[1] maps
    # champion id -> tags (as used below)
    champ_tags_list = list(champ_tags[0])
    champ_tags_dict = champ_tags[1]

    # get data from the dict
    print('Getting features from the dict...')
    # each player gets six per-tag feature vectors (one per champ tag)
    # ORDER: Tank, Marksman, Support, Fighter, Mage, Assassin
    n_p_feature = 44  # feature length after deleting col_to_del below
    player_feature_dict_pre = {}
    col_to_del = [1, 34, 35]  # feature columns dropped from raw player data
    for pid in player_dict:
        # per-player accumulator: match counts per tag + six zero vectors
        player_feature_dict_pre[pid] = {c.MATCH_COUNT : np.zeros((6,)), c.FEATURES : []}
        for i in range(6):
            player_feature_dict_pre[pid][c.FEATURES].append(np.zeros((n_p_feature,)))
        # accumulate each champion's features into every tag it carries
        for cid in player_dict[pid]:
            for t in champ_tags_dict[cid]:
                cur_f = np.delete(player_dict[pid][cid][c.FEATURES], col_to_del)
                player_feature_dict_pre[pid][c.FEATURES][champ_tags_list.index(t)] += cur_f
                player_feature_dict_pre[pid][c.MATCH_COUNT][champ_tags_list.index(t)] += player_dict[pid][cid][c.MATCH_COUNT]
        # average each tag vector by its match count (guard against 0)
        for i, f in enumerate(player_feature_dict_pre[pid][c.FEATURES]):
            cur_m_count = player_feature_dict_pre[pid][c.MATCH_COUNT][i]
            f /= (cur_m_count if cur_m_count > 0 else 1)

    # player_features_id = np.array([np.append(player_dict[pid][c.FEATURES], pid) for pid in player_dict])  # last column is pid
    # player_features = player_features_id[:, 0 : -1]
    # n = len(player_features)
    #
    # 2D embedding of the digits dataset
    # print("Computing embedding")
    # player_features = manifold.SpectralEmbedding(n_components=2).fit_transform(player_features)
    # print("Done.")

    # construct new features as a team play style (currently a simple
    # aggregation of all the players' play style)
    print('{} Constructing new dataset...'.format(time.time() - st))
    n_feature = n_p_feature * len(champ_tags_list)
    features = []
    label = []
    flip = False  # flag for flip win/lose every match
    for mid, m in match_dict.items():
        win_f = np.zeros((n_feature,))
        loss_f = np.zeros((n_feature,))
        team_f = [win_f, loss_f]
        for t_ind, team in enumerate(m):
            ct_count = np.zeros((6,))  # counts for each champion tag
            for ind, pid in enumerate(team[c.TEAM_INFO_PLAYERS]):
                champ_id = team[c.TEAM_INFO_CHAMPIONS][ind]
                # NOTE: rebinds the outer 'champ_tags' name from here on
                champ_tags = champ_tags_dict[champ_id]
                for ct in champ_tags:
                    # add this player's per-tag vector into the team's
                    # slice for that tag
                    ct_ind = champ_tags_list.index(ct)
                    ct_count[ct_ind] += 1
                    start_col = 0 + ct_ind * n_p_feature
                    end_col = (ct_ind + 1) * n_p_feature
                    cur_pf = player_feature_dict_pre[pid][c.FEATURES][ct_ind]
                    # print("ct: {}, ct_ind: {}, start_col: {}, end_col: {}".format(ct, ct_ind, start_col, end_col))
                    team_f[t_ind][start_col:end_col] += cur_pf
            # normalise each tag slice by how many players carried the tag;
            # for a missing tag, average all five players' vectors instead
            for ctc_ind, ctc in enumerate(ct_count):
                start_col = 0 + ctc_ind * n_p_feature
                end_col = (ctc_ind + 1) * n_p_feature
                if ctc > 1:
                    team_f[t_ind][start_col:end_col] /= ctc
                elif ctc == 0:
                    for pid in team[c.TEAM_INFO_PLAYERS]:
                        team_f[t_ind][start_col:end_col] += player_feature_dict_pre[pid][c.FEATURES][ctc_ind]
                    team_f[t_ind][start_col:end_col] /= 5
        # randomise side order so the label is not constant
        # NOTE(review): 'flip' is toggled but never read — the random
        # draw decides the ordering instead
        if np.random.random_sample() >= 0.5:
            features.append(np.append(loss_f, win_f))
            # features.append(loss_f - win_f)
            label.append(-1)
        else:
            features.append(np.append(win_f, loss_f))
            # features.append(win_f - loss_f)
            label.append(1)
        flip = not flip  # flip the flag

    features = np.array(features)
    label = np.array(label)
    # features = normalize(features)

    # prepare training and testing set
    print('{} Start training...'.format(time.time() - st))
    k = 9
    k_folds = Preprocess.prepare_k_folds([features, label], k)
    for i in range(k):
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        # train with some algorithm
        # clf1 = LogisticRegression(random_state=123)  # 0.57
        cc = 0.01
        kernel = 'rbf'
        tol = 0.01
        # clf1 = svm.SVC(C=cc, kernel=kernel, tol=tol)  # rbf: 0.5,
        # clf1 = KNeighborsClassifier(n_neighbors=4)  # 3: 0.55, 4: 0.53
        clf1 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200)
        clf1.fit(tr_data[0], tr_data[1])
        tr_pred1 = clf1.predict(tr_data[0])
        te_pred1 = clf1.predict(te_data[0])

        # NN
        # net = buildNetwork(, 3, 1)

        tr_acc = (tr_pred1 == tr_data[1]).sum() / tr_n
        te_acc = (te_pred1 == te_data[1]).sum() / te_n
        print('Training acc: {}, Testing acc: {}'.format(tr_acc, te_acc))
def main():
    """AdaBoost with optimal decision stumps on the spambase data.

    Boosts for up to round_limit rounds on fold 0 of a k-fold split,
    tracking per-round train/test error and test AUC, and writes the
    first fold's learning curves and ROC to the result file.
    """
    # training parameters
    k = 10  # fold
    round_limit = 100
    result_path = 'results/spamODSBoosting_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)
    # load pre-computed candidate thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    te_roc_1st_boost = None
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):  # only fold 0 is evaluated
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        # prepare boosting distribution and threshold cheat sheet
        d = util.init_distribution(len(tr_data[0]))
        thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)

        testing_predict = np.zeros((1, te_n)).tolist()[0]
        training_predict = np.zeros((1, tr_n)).tolist()[0]
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        # NOTE(review): 'converged' is still computed each round but the
        # loop condition's convergence test is commented out
        converged = False
        tol = 1e-5
        te_auc = 2.

        # fix: renamed from 'round', which shadowed the builtin
        round_num = 0
        while round_num < round_limit:  # and not converged:
            round_num += 1
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # calculate the AUC for testing results
            c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            round_te_auc.append(c_te_auc)
            print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(round_num, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, c_te_auc))
            converged = abs(c_te_auc - te_auc) / te_auc <= tol
            te_auc = c_te_auc

        # keep the errors from the last boosting round of this fold
        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])
        if i == 0:
            # stash the first fold's full learning curves and ROC
            round_err_1st_boost = round_model_err
            tr_errs_1st_boost = round_tr_err
            te_errs_1st_boost = round_te_err
            te_auc_1st_boost = round_te_auc
            _, te_roc_1st_boost = util.get_auc_from_predict(testing_predict, te_data[1], True)
        # break  # for testing

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    # NOTE(review): the '...Acc' keys actually hold error rates; key
    # names kept for compatibility with existing result files
    result = {}
    result['Fold'] = k
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err
    result['1stBoostTrainingError'] = tr_errs_1st_boost
    result['1stBoostTestingError'] = te_errs_1st_boost
    result['1stBoostModelError'] = round_err_1st_boost
    result['1stBoostTestingAUC'] = te_auc_1st_boost
    result['1stBoostTestingROC'] = te_roc_1st_boost
    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)