def accuracy(train, test, turns=4):
    """Run several rounds of split/train/test and report mean F1 scores.

    Each round re-splits the data, retrains the entity model, runs the
    test step, and scores the produced result file with computeF1Score.

    Args:
        train: trainer object exposing ``train_ents()``.
        test: tester object exposing ``test()``.
        turns: number of evaluation rounds to average over. Defaults to 4,
            matching the previously hard-coded value, so existing callers
            are unaffected.

    Returns:
        Tuple ``(mean_entity_f1, mean_emotion_f1)`` averaged over all rounds.
    """
    entity_score_sum = 0
    emotion_score_sum = 0
    for i in range(turns):
        # Re-split so each round evaluates on a fresh train/test partition.
        data_split('../coreEntityEmotion_baseline/data', 'coreEntityEmotion_train.txt')
        # Train the entity model on the new split.
        train.train_ents()
        # Produce the result file that is scored below.
        test.test()
        # Score predictions against the ground-truth file.
        entity_score, emotion_score = computeF1Score(
            '../coreEntityEmotion_baseline/data/coreEntityEmotion_train.txt',
            '../coreEntityEmotion_baseline/data/2_coreEntityEmotion_train_result.txt'
        )
        print('turn:', i + 1, 'entityScore:', entity_score, 'emotionScore:', emotion_score)
        # Accumulate per-round F1 for averaging.
        entity_score_sum += entity_score
        emotion_score_sum += emotion_score
    # Report and return the per-round averages.
    print('平均entityScore:', entity_score_sum / turns, '平均emotionScore:', emotion_score_sum / turns)
    return entity_score_sum / turns, emotion_score_sum / turns
def accuracy(train, test, feature_ents_func):
    """Run four rounds of split/train/test and report mean F1 scores.

    Args:
        train: trainer object exposing ``train_ents()``.
        test: tester object exposing ``test()``.
        feature_ents_func: feature extractor kept for interface
            compatibility; currently unused — the seeding logic that fed it
            (``set_train_data_entity`` over the true-entity list) existed
            only as commented-out dead code and has been removed.

    Returns:
        Tuple ``(mean_entity_f1, mean_emotion_f1)`` averaged over all rounds.
    """
    turn = 4
    entityScoreSum = 0
    emotionScoreSum = 0
    for i in range(turn):
        # Re-split so each round evaluates on a fresh train/test partition.
        data_split('../coreEntityEmotion_baseline/data', 'coreEntityEmotion_train.txt')
        # Train the entity model on the new split.
        train.train_ents()
        # Produce the result file that is scored below.
        test.test()
        # Score predictions against the ground-truth file.
        entityScore, emotionScore = computeF1Score(
            '../coreEntityEmotion_baseline/data/coreEntityEmotion_train.txt',
            '../coreEntityEmotion_baseline/data/2_coreEntityEmotion_train_result.txt'
        )
        print('turn:', i + 1, 'entityScore:', entityScore, 'emotionScore:', emotionScore)
        # Accumulate per-round F1 for averaging.
        entityScoreSum += entityScore
        emotionScoreSum += emotionScore
    # Report and return the per-round averages.
    print('平均entityScore:', entityScoreSum / turn, '平均emotionScore:', emotionScoreSum / turn)
    return entityScoreSum / turn, emotionScoreSum / turn
# smoothen step values due to more fluctuation valid_acc_steps = sp.savgol_filter(valid_acc_steps, 3, 1) train_acc_steps = sp.savgol_filter(train_acc_steps, 3, 1) plot_data_steps(valid_acc_steps, train_acc_steps, max_loc_steps) ###### if __name__ == "__main__": # seed of 6 was found to be most optimal seed = 6 split_size = 0.2 # convert all files, normalize and split data. # convert_to_numpy() # commented out since files do not need be constantly converted # normalize_data() # commented out since files do not need be constantly normalized data_split(seed, split_size) # load data train_data = np.load('./data/train_data.npy') train_label = np.load('./data/train_label.npy') val_data = np.load('./data/val_data.npy') val_label = np.load('./data/val_label.npy') # set hyperparameters lr = 0.005175 steps = 100 epochs = 500 bs = 32 # train if necessary train()
from sprint2.test import Test
from data_split import data_split
from f1_score import computeF1Score
from sprint2.train import Train

if __name__ == '__main__':
    from time import time

    start = time()  # wall-clock start; presumably for timing the run — confirm
    test = Test()
    turn = 5  # number of evaluation rounds to average over
    entityScoreSum = 0
    emotionScoreSum = 0
    for i in range(turn):
        # Split the data (fresh partition each round).
        data_split('../coreEntityEmotion_baseline/data', 'coreEntityEmotion_train.txt')
        # Training — disabled here; the test step reuses existing models.
        # trainer = Train()
        # trainer.trainCoreEntity()
        # trainer.trainEmotion()
        # Test: write predictions for the held-out split to the result file.
        test.testCoreEntity('../coreEntityEmotion_baseline/data/2_coreEntityEmotion_train.txt',
                            '../coreEntityEmotion_baseline/data/2_coreEntityEmotion_train_result.txt')
        # Compute F1 of predictions vs. ground truth.
        entityScore, emotionScore = computeF1Score('../coreEntityEmotion_baseline/data/coreEntityEmotion_train.txt',
                                                   '../coreEntityEmotion_baseline/data/2_coreEntityEmotion_train_result.txt')
        print('turn:', i + 1, 'entityScore:', entityScore, 'emotionScore:', emotionScore)
        # Accumulate per-round F1 for the average below.
        entityScoreSum += entityScore
        emotionScoreSum += emotionScore
    # Print averages — NOTE(review): the chunk is truncated here; the
    # averaging statements continue beyond this view.
def main_loop(config_list, final_table, count):
    """Walk-forward train/predict loop over the time series.

    For each day ``x`` the data is re-split into train/test, the configured
    classifier is fitted and queried for a one-step prediction, and running
    accuracy / F-measure are accumulated into ``online_measures``.

    Relies on module-level state: ``X``, ``y``, ``tune_size``, ``timeseries``,
    ``config``, ``ds``, ``ml_nb``/``ml_rf``/``ml_svc``/``ml_ann``,
    ``documentation``, ``dataset_input``, ``df`` and ``start_time``.

    Args:
        config_list: dict of run configuration (chosen method, window mode,
            optimization flags).
        final_table: DataFrame exported to ``<count>_final_table.xlsx``.
        count: run index used to name the output Excel files.

    Returns:
        Tuple ``(online_measures, series_frame, pos_neg_total)``.
    """
    print(
        " chosen_method: %s \n expanding_windows: %d \n optimize_method: %d \n randomizedsearch: %d \n"
        % (config_list['chosen_method'], config_list['expanding_window_ml'],
           config_list['optimize_method'], config_list['randomized_search']))

    online_measures = pd.DataFrame({
        'day': [],
        'accuracy': [],
        'f-measure': []
    })
    opt_modulo_params_list = []
    tp_list = []
    fp_list = []
    tn_list = []
    fn_list = []
    y_pred_list = []
    opt_modulo_params = 0
    # Keep the confusion counts defined even if the loop body never runs,
    # so the pos_neg_total frame below cannot raise NameError.
    tp = fp = tn = fn = 0

    # The first tune_size + 10 days are reserved as warm-up history.
    # (Original note "+10 beschreiben!" asked for this offset to be
    # documented — the exact rationale for +10 is not visible here.)
    for x in range(tune_size + 10, len(X) - 1):
        # Test/train split in every iteration x.
        X_train, X_test, y_train, y_test = ds.data_split(
            X, y, x, timeseries, config_list)

        # Actual training and prediction with the configured classifier.
        if config_list['chosen_method'] == config.method[0]:
            y_pred, opt_modulo_params = ml_nb.classifier(
                X_train, y_train, X_test, x, tune_size, config_list,
                opt_modulo_params)
        elif config_list['chosen_method'] == config.method[1]:
            y_pred, opt_modulo_params = ml_rf.classifier(
                X_train, y_train, X_test, x, tune_size, config_list,
                opt_modulo_params)
        elif config_list['chosen_method'] == config.method[2]:
            y_pred, opt_modulo_params = ml_svc.classifier(
                X_train, y_train, X_test, x, tune_size, config_list,
                opt_modulo_params)
        elif config_list['chosen_method'] == config.method[3]:
            y_pred, opt_modulo_params = ml_ann.classifier(
                X_train, y_train, X_test, x, tune_size, config_list,
                opt_modulo_params)
        else:
            print("Select a classifier in the config file!")

        # Snapshot the tuned hyper-parameters every 200 days.
        if opt_modulo_params != 0 and x % 200 == 0:
            opt_modulo_params_w_day = opt_modulo_params.copy()
            opt_modulo_params_w_day['day'] = x
            opt_modulo_params_w_day['chosen_method'] = config_list[
                'chosen_method']
            opt_modulo_params_list.append(opt_modulo_params_w_day)

        # BUGFIX: the original cast the undefined name `test_predictions`,
        # so the NameError was swallowed by a bare `except` and the int
        # conversions never ran. Cast the actual prediction/label instead,
        # and only tolerate genuine conversion failures.
        try:
            y_pred = int(y_pred)
            y_test = int(y_test)
        except (TypeError, ValueError):
            pass  # leave non-scalar values untouched

        y_pred_list.append(y_pred)
        # Confusion-matrix bookkeeping for the +1 / -1 labels.
        tp_list.append(y_pred == +1 and y_test == +1)
        fp_list.append(y_pred == +1 and y_test == -1)
        tn_list.append(y_pred == -1 and y_test == -1)
        fn_list.append(y_pred == -1 and y_test == +1)

        pos_prec = 0
        neg_prec = 0
        pos_recall = 0
        neg_recall = 0
        accuracy = 0
        f_measures = 0
        tp = tp_list.count(True)
        fp = fp_list.count(True)
        tn = tn_list.count(True)
        fn = fn_list.count(True)
        try:
            # Accuracy.
            accuracy = (tp + tn) / (tp + fp + tn + fn)
            # Positive precision and recall.
            pos_prec = tp / (tp + fp)
            pos_recall = tp / (tp + fn)
            # F-measure.
            f_measures = (2 * pos_prec * pos_recall) / (pos_prec + pos_recall)
            # Negative precision and recall.
            neg_prec = tn / (tn + fn)
            neg_recall = tn / (tn + fp)
        except ZeroDivisionError:  # was a bare except hiding real errors
            print("Division by zero")

        current_measures = pd.DataFrame({
            'day': [x],
            'accuracy': [accuracy],
            'f-measure': [f_measures]
        })
        print(current_measures)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        online_measures = pd.concat([online_measures, current_measures])

    # Documentation / result export.
    if config.documentation == 1:
        documentation.save_doc(config_list, online_measures, dataset_input,
                               df, count, opt_modulo_params_list)
    series_frame = documentation.concat_results(online_measures, config_list,
                                                dataset_input, start_time)
    pos_neg_total = pd.DataFrame({
        'tp': [tp],
        'fp': [fp],
        'tn': [tn],
        'fn': [fn]
    })
    if config_list['optimize_method'] == 1:
        print(opt_modulo_params_list)
        opt_modulo_params_list = pd.DataFrame(opt_modulo_params_list)
        opt_modulo_params_list.to_excel('%d_opt_modulo_params_list.xlsx' % count)
    final_table.to_excel('%d_final_table.xlsx' % count)
    print("Mainloop was executed")
    return online_measures, series_frame, pos_neg_total
# ? 因为完全依赖于距离的计算,对于维度大的数据,有维度灾难的问题。 百万级别的维度会不能处理。 # ============================================================================= from data_split import data_split from data_split import calc_ac from knnmy import KNNCLF from scaling import scaling # ============================================================================= from sklearn import datasets #test KNNCLF iris = datasets.load_iris() # seed 固定, 便于测试随机数 # seed = 123 时 scaling 效果差, 456 效果好 Xtr, Ytr, Xt, Yt = data_split(iris.data, iris.target.reshape(-1,1), seed=123) sc = scaling() sc.fit(Xtr) Xtr1 = sc.transform(Xtr) Xtr1 = Xtr knnmy = KNNCLF() knnmy.fit(Xtr1,Ytr) Xt1 = sc.transform(Xt) Xt1 = Xt y_pred = knnmy.predict2all(Xt1) ac1 = calc_ac(y_pred, Yt) #test sklearn