def SVM_validation(X_train, y_train, X_val, y_val, sample_amount, data_ratio,
                   C_parameter, gamma_parameter):
    """Average SVM validation metrics over repeated resampled fits.

    For each of 4 rounds a fresh training sample is drawn with
    ``choose_data_seperately`` and passed, together with the validation
    set, to ``SVM_base_fuction``.  The per-round metrics are averaged.

    Args:
        X_train, y_train: Pool the training samples are drawn from.
        X_val, y_val: Validation data handed unchanged to the base function.
        sample_amount: Total number of samples requested per round
            (the original author's note gives 40000 as an example).
        data_ratio: Requested label-1 : label-2 size ratio
            (the original author's note gives 1.2 as an example).
        C_parameter: Base-2 exponent; ``2**C_parameter`` is passed on.
        gamma_parameter: Base-2 exponent; ``2**gamma_parameter`` is passed on.

    Returns:
        Tuple ``(tpr, fpr, BER, f1_score)`` of arithmetic means over the
        rounds.
    """
    # Split the sample budget so label_1 / label_2 is roughly data_ratio.
    label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
    label_2_amount = int(sample_amount - label_1_amount)

    def _mean(values):
        return sum(values) / len(values)

    rounds = 4
    recall_runs, fpr_runs, ber_runs, f1_runs = [], [], [], []
    for _ in range(rounds):
        features, labels = choose_data_seperately(
            X_train, y_train, label_1_amount, label_2_amount)
        # The fifth return value (predictions) is not needed here.
        recall, f1_value, ber_value, fpr_value, _y_pred = SVM_base_fuction(
            features, labels, X_val, y_val,
            2**C_parameter, 2**gamma_parameter)
        recall_runs.append(recall)
        fpr_runs.append(fpr_value)
        ber_runs.append(ber_value)
        f1_runs.append(f1_value)

    return _mean(recall_runs), _mean(fpr_runs), _mean(ber_runs), _mean(f1_runs)
def kNN_validation(X_train, y_train, X_val, y_val, sample_amount, data_ratio,
                   k_value):
    """Average kNN validation metrics over repeated resampled fits.

    For each of 10 rounds a fresh training sample is drawn with
    ``choose_data_seperately`` and passed, together with the validation
    set, to ``kNN_base_function``.  The per-round metrics are averaged.

    Args:
        X_train, y_train: Pool the training samples are drawn from.
        X_val, y_val: Validation data handed unchanged to the base function.
        sample_amount: Total number of samples requested per round
            (the original author's note gives 40000 as an example).
        data_ratio: Requested label-1 : label-2 size ratio
            (the original author's note gives 1.2 as an example).
        k_value: Forwarded unchanged to ``kNN_base_function``.

    Returns:
        Tuple ``(tpr, fpr, BER, f1_score)`` of arithmetic means over the
        rounds.
    """
    # Split the sample budget so label_1 / label_2 is roughly data_ratio.
    label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
    label_2_amount = int(sample_amount - label_1_amount)

    def _mean(values):
        return sum(values) / len(values)

    rounds = 10
    recall_runs, fpr_runs, ber_runs, f1_runs = [], [], [], []
    for _ in range(rounds):
        features, labels = choose_data_seperately(
            X_train, y_train, label_1_amount, label_2_amount)
        recall, f1_value, ber_value, fpr_value = kNN_base_function(
            features, labels, X_val, y_val, k_value)
        recall_runs.append(recall)
        fpr_runs.append(fpr_value)
        ber_runs.append(ber_value)
        f1_runs.append(f1_value)

    return _mean(recall_runs), _mean(fpr_runs), _mean(ber_runs), _mean(f1_runs)
def parameter_adjust(X_train, y_train, sample_amount, data_ratio):
    """Grid-search the SVM ``gamma`` and ``C`` exponents.

    Every combination of the gamma exponents (-15 .. 3, step 2) and the
    C exponents (-5 .. 15, step 2) is evaluated once: a fresh sample is
    drawn with ``choose_data_seperately`` and scored with
    ``SVM_cross_validation`` using ``2**C_exp`` and ``2**gamma_exp``.
    The total wall-clock time is printed at the end.

    Args:
        X_train, y_train: Pool the samples are drawn from.
        sample_amount: Total number of samples requested per evaluation.
        data_ratio: Requested label-1 : label-2 size ratio.

    Returns:
        Whatever ``CalParList.return_result()`` yields after all
        combinations were appended (presumably a tabular result with the
        metrics plus "gamma_exp", "C_exp" and "time" -- confirm against
        ``CalParList``).
    """
    start = time.time()

    # Split the sample budget so label_1 / label_2 is roughly data_ratio.
    label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
    label_2_amount = int(sample_amount - label_1_amount)

    Result_List = CalParList(3, "gamma_exp", "C_exp", "time")

    gamma_exponents = [-15, -13, -11, -9, -7, -5, -3, -1, 1, 3]
    C_exponents = [-5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15]
    for gamma_exp in gamma_exponents:
        for C_exp in C_exponents:
            # A new random sample is drawn for every parameter pair.
            sample_feature, sample_label = choose_data_seperately(
                X_train, y_train, label_1_amount, label_2_amount)
            tpr, fpr, BER, f1_score, time_var = SVM_cross_validation(
                sample_feature, sample_label, 2**C_exp, 2**gamma_exp)
            Result_List.list_append(tpr, f1_score, BER, fpr,
                                    gamma_exp, C_exp, time_var)

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return Result_List.return_result()
def kNN_data_ratio_adjust(training_feature, training_label, k_value):
    """Measure kNN cross-validation metrics for shrinking label-1 sample sizes.

    ``label_2_amount`` is fixed at 10000 while ``label_1_amount`` starts
    at 40000 and decreases by 5000 while it is above 10000, then by 2500,
    until it is no longer above 2000.  For each size, 5 independent
    samples are drawn with ``choose_data_seperately`` and scored with
    ``kNN_cross_validation``; the averaged metrics and the average time
    are recorded.  The total wall-clock time is printed at the end.

    Args:
        training_feature, training_label: Pool the samples are drawn from.
        k_value: Forwarded unchanged to ``kNN_cross_validation``.

    Returns:
        Whatever ``CalParList.return_result()`` yields after one entry per
        tested size was appended (presumably a tabular result -- confirm
        against ``CalParList``).
    """
    start = time.time()
    label_1_amount = 40000
    label_2_amount = 10000
    iter_amount = 5

    Result_List = CalParList(4, "label_1_amount", "label_2_amount", "ratio",
                             "time")

    while label_1_amount > 2000:
        Cal_Result_List = CalList()
        time_list_temp = []
        for _ in range(iter_amount):
            sample_feature, sample_label = choose_data_seperately(
                training_feature, training_label,
                label_1_amount, label_2_amount)
            Precall, FPR, BER, f1_score, time_var = kNN_cross_validation(
                sample_feature, sample_label, k_value)
            Cal_Result_List.list_append(Precall, f1_score, BER, FPR)
            time_list_temp.append(time_var)

        Precall, FPR, BER, f1_score = Cal_Result_List.list_average_cal()
        time_ave = sum(time_list_temp) / len(time_list_temp)
        Result_List.list_append(Precall, f1_score, BER, FPR,
                                label_1_amount, label_2_amount,
                                label_1_amount / label_2_amount, time_ave)

        # Coarse steps for large samples, finer steps once at or below 10000.
        if label_1_amount > 10000:
            label_1_amount -= 5000
        else:
            label_1_amount -= 2500

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return Result_List.return_result()
def parameter_adjust(X_train, y_train, sample_amount, data_ratio):
    """Grid-search the SVM ``gamma`` and ``C`` exponents.

    Every combination of the gamma exponents (-15 .. 3, step 2) and the
    C exponents (-5 .. 15, step 2) is evaluated once: a fresh sample is
    drawn with ``choose_data_seperately`` and scored with
    ``SVM_cross_validation`` using ``2**C_exp`` and ``2**gamma_exp``.
    The total wall-clock time is printed at the end.

    Args:
        X_train, y_train: Pool the samples are drawn from.
        sample_amount: Total number of samples requested per evaluation.
        data_ratio: Requested label-1 : label-2 size ratio.

    Returns:
        ``pandas.DataFrame`` with one row per (gamma_exp, C_exp) pair and
        the columns "gamma_exp", "C_exp", "f1_score", "TPR", "FPR", "BER",
        "time".
    """
    start = time.time()

    # Split the sample budget so label_1 / label_2 is roughly data_ratio.
    label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
    label_2_amount = int(sample_amount - label_1_amount)

    columns = ["gamma_exp", "C_exp", "f1_score", "TPR", "FPR", "BER", "time"]
    records = {name: [] for name in columns}

    gamma_exponents = [-15, -13, -11, -9, -7, -5, -3, -1, 1, 3]
    C_exponents = [-5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15]
    for gamma_exp in gamma_exponents:
        for C_exp in C_exponents:
            # A new random sample is drawn for every parameter pair.
            sample_feature, sample_label = choose_data_seperately(
                X_train, y_train, label_1_amount, label_2_amount)
            tpr, fpr, BER, f1_score, time_var = SVM_cross_validation(
                sample_feature, sample_label, 2**C_exp, 2**gamma_exp)
            records["gamma_exp"].append(gamma_exp)
            records["C_exp"].append(C_exp)
            records["f1_score"].append(f1_score)
            records["TPR"].append(tpr)
            records["FPR"].append(fpr)
            records["BER"].append(BER)
            records["time"].append(time_var)

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return pd.DataFrame(data=records, columns=columns)
def SVC_data_ratio_adjust(X_train, y_train, sample_amount):
    """Measure SVM cross-validation metrics across label-1 : label-2 ratios.

    ``data_ratio`` starts at 4 and is lowered after every step: halved
    while above 2, reduced by 0.25 when below 0.8, otherwise reduced by
    0.1.  The loop stops once the ratio is no longer above 0.2.  For each
    ratio, 5 independent samples of ``sample_amount`` rows are drawn with
    ``choose_data_seperately`` and scored with ``SVM_cross_validation``
    called with the arguments ``1`` and ``'auto'``; averaged metrics and
    average time are recorded.  Total wall-clock time is printed at the
    end.

    Args:
        X_train, y_train: Pool the samples are drawn from.
        sample_amount: Total number of samples requested per draw.

    Returns:
        Whatever ``CalParList.return_result()`` yields after one entry per
        tested ratio was appended (presumably a tabular result -- confirm
        against ``CalParList``).
    """
    start = time.time()
    iter_amount = 5

    Result_List = CalParList(4, "label_1_amount", "label_2_amount", "ratio",
                             "time")

    data_ratio = 4
    while data_ratio > 0.2:
        # Split the sample budget so label_1 / label_2 is roughly data_ratio.
        label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
        label_2_amount = int(sample_amount - label_1_amount)

        Cal_Result_List = CalList()
        time_list_temp = []
        for _ in range(iter_amount):
            sample_feature, sample_label = choose_data_seperately(
                X_train, y_train, label_1_amount, label_2_amount)
            tpr, fpr, BER, f1_score, time_var = SVM_cross_validation(
                sample_feature, sample_label, 1, 'auto')
            time_list_temp.append(time_var)
            Cal_Result_List.list_append(tpr, f1_score, BER, fpr)

        Precall, FPR, BER, f1_score = Cal_Result_List.list_average_cal()
        time_ave = sum(time_list_temp) / len(time_list_temp)
        Result_List.list_append(Precall, f1_score, BER, FPR,
                                label_1_amount, label_2_amount,
                                label_1_amount / label_2_amount, time_ave)

        # NOTE: the schedule relies on repeated float subtraction, so the
        # values visited near the 0.8 threshold are subject to rounding.
        if data_ratio > 2:
            data_ratio = data_ratio / 2
        elif data_ratio < 0.8:
            data_ratio = data_ratio - 0.25
        else:
            data_ratio = data_ratio - 0.1

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return Result_List.return_result()
def kNN_k_parameter_adjust_with_specific_data_ratio(X_train, y_train,
                                                    data_ratio):
    """Average the kNN k-sweep metrics over 5 resampled training sets.

    Each round draws ``int(20000 * data_ratio)`` label-1 and 20000 label-2
    samples with ``choose_data_seperately`` and runs
    ``kNN_k_parameter_adjust`` on them.  The "TPR", "FPR", "f1_score" and
    "BER" columns of the per-round results are combined into a running
    element-wise mean.

    Fix over the previous version: the returned frame is now built from
    the averaged values; before, the averages were computed and then
    discarded in favour of the last round's raw result.

    Args:
        X_train, y_train: Pool the samples are drawn from.
        data_ratio: Requested label-1 : label-2 size ratio.

    Returns:
        ``pandas.DataFrame`` with the columns "k_value", "f1_score", "TPR",
        "FPR", "BER"; "k_value" is taken from the last round's result.
        Assumes ``kNN_k_parameter_adjust`` returns an object whose columns
        support element-wise arithmetic (e.g. a DataFrame) -- confirm
        against that function.
    """
    metric_names = ("TPR", "FPR", "f1_score", "BER")
    iter_count = 5

    # Sample sizes do not change between rounds, so compute them once.
    label_1_amount = int(20000 * data_ratio)
    label_2_amount = 20000

    running_mean = {}
    result = None
    for count in range(iter_count):
        sample_feature, sample_label = choose_data_seperately(
            X_train, y_train, label_1_amount, label_2_amount)
        result = kNN_k_parameter_adjust(sample_feature, sample_label)
        for name in metric_names:
            if count == 0:
                running_mean[name] = result[name]
            else:
                # Incremental mean: fold the new round into the average
                # of the previous `count` rounds.
                running_mean[name] = (
                    count * running_mean[name] + result[name]) / (count + 1)

    result_total = {
        "k_value": result['k_value'],
        "TPR": running_mean["TPR"],
        "FPR": running_mean["FPR"],
        "f1_score": running_mean["f1_score"],
        "BER": running_mean["BER"],
    }
    columns = ["k_value", "f1_score", "TPR", "FPR", "BER"]
    return pd.DataFrame(data=result_total, columns=columns)
def kNN_validation(X_train, y_train, X_val, y_val, sample_amount, data_ratio,
                   k_value):
    """Average kNN validation metrics over repeated resampled fits.

    For each of 10 rounds a fresh training sample is drawn with
    ``choose_data_seperately`` and passed, together with the validation
    set, to ``kNN_base_function``.  The per-round metrics are collected in
    a ``CalList`` and averaged by its ``list_average_cal`` method.

    Args:
        X_train, y_train: Pool the training samples are drawn from.
        X_val, y_val: Validation data handed unchanged to the base function.
        sample_amount: Total number of samples requested per round
            (the original author's note gives 40000 as an example).
        data_ratio: Requested label-1 : label-2 size ratio
            (the original author's note gives 1.2 as an example).
        k_value: Forwarded unchanged to ``kNN_base_function``.

    Returns:
        Tuple ``(Precall, FPR, BER, f1_score)`` as produced by
        ``CalList.list_average_cal()``.
    """
    # Split the sample budget so label_1 / label_2 is roughly data_ratio.
    label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
    label_2_amount = int(sample_amount - label_1_amount)

    collected = CalList()
    rounds = 10
    for _ in range(rounds):
        features, labels = choose_data_seperately(
            X_train, y_train, label_1_amount, label_2_amount)
        recall, f1_value, ber_value, fpr_value = kNN_base_function(
            features, labels, X_val, y_val, k_value)
        collected.list_append(recall, f1_value, ber_value, fpr_value)

    mean_recall, mean_fpr, mean_ber, mean_f1 = collected.list_average_cal()
    return mean_recall, mean_fpr, mean_ber, mean_f1
def SVM_validation(X_train, y_train, X_val, y_val, sample_amount, data_ratio,
                   C_parameter, gamma_parameter):
    """Average SVM validation metrics over repeated resampled fits.

    For each of 4 rounds a fresh training sample is drawn with
    ``choose_data_seperately`` and passed, together with the validation
    set, to ``SVM_base_fuction``.  The per-round metrics are collected in
    a ``CalList`` and averaged by its ``list_average_cal`` method.

    Args:
        X_train, y_train: Pool the training samples are drawn from.
        X_val, y_val: Validation data handed unchanged to the base function.
        sample_amount: Total number of samples requested per round
            (the original author's note gives 40000 as an example).
        data_ratio: Requested label-1 : label-2 size ratio
            (the original author's note gives 1.2 as an example).
        C_parameter: Base-2 exponent; ``2**C_parameter`` is passed on.
        gamma_parameter: Base-2 exponent; ``2**gamma_parameter`` is passed on.

    Returns:
        Tuple ``(Precall, FPR, BER, f1_score)`` as produced by
        ``CalList.list_average_cal()``.
    """
    # Split the sample budget so label_1 / label_2 is roughly data_ratio.
    label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
    label_2_amount = int(sample_amount - label_1_amount)

    collected = CalList()
    rounds = 4
    for _ in range(rounds):
        features, labels = choose_data_seperately(
            X_train, y_train, label_1_amount, label_2_amount)
        # The fifth return value (predictions) is not needed here.
        recall, f1_value, ber_value, fpr_value, _y_pred = SVM_base_fuction(
            features, labels, X_val, y_val,
            2**C_parameter, 2**gamma_parameter)
        collected.list_append(recall, f1_value, ber_value, fpr_value)

    mean_recall, mean_fpr, mean_ber, mean_f1 = collected.list_average_cal()
    return mean_recall, mean_fpr, mean_ber, mean_f1
def kNN_data_ratio_adjust(training_feature, training_label, k_value):
    """Measure kNN cross-validation metrics for shrinking label-1 sample sizes.

    ``label_2_amount`` is fixed at 10000 while ``label_1_amount`` starts
    at 40000 and decreases by 5000 while it is above 10000, then by 2500,
    until it is no longer above 2000.  For each size, 5 independent
    samples are drawn with ``choose_data_seperately`` and scored with
    ``kNN_cross_validation``; the metrics and the time are averaged over
    those draws.  The total wall-clock time is printed at the end.

    Args:
        training_feature, training_label: Pool the samples are drawn from.
        k_value: Forwarded unchanged to ``kNN_cross_validation``.

    Returns:
        ``pandas.DataFrame`` with one row per tested size and the columns
        "label_1_amount", "label_2_amount", "label 1: label 2 ratio",
        "f1_score", "TPR", "FPR", "BER", "time".
    """
    start = time.time()
    label_1_amount = 40000
    label_2_amount = 10000
    iter_amount = 5

    columns = [
        "label_1_amount", "label_2_amount", "label 1: label 2 ratio",
        "f1_score", "TPR", "FPR", "BER", "time"
    ]
    records = {name: [] for name in columns}

    def _mean(values):
        return sum(values) / len(values)

    while label_1_amount > 2000:
        tpr_runs, fpr_runs, ber_runs, f1_runs, time_runs = [], [], [], [], []
        for _ in range(iter_amount):
            sample_feature, sample_label = choose_data_seperately(
                training_feature, training_label,
                label_1_amount, label_2_amount)
            Precall, FPR, BER, f1_score, time_var = kNN_cross_validation(
                sample_feature, sample_label, k_value)
            time_runs.append(time_var)
            tpr_runs.append(Precall)
            fpr_runs.append(FPR)
            ber_runs.append(BER)
            f1_runs.append(f1_score)

        records["label_1_amount"].append(label_1_amount)
        records["label_2_amount"].append(label_2_amount)
        records["label 1: label 2 ratio"].append(
            label_1_amount / label_2_amount)
        records["TPR"].append(_mean(tpr_runs))
        records["FPR"].append(_mean(fpr_runs))
        records["BER"].append(_mean(ber_runs))
        records["f1_score"].append(_mean(f1_runs))
        records["time"].append(_mean(time_runs))

        # Coarse steps for large samples, finer steps once at or below 10000.
        if label_1_amount > 10000:
            label_1_amount -= 5000
        else:
            label_1_amount -= 2500

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return pd.DataFrame(data=records, columns=columns)
def SVC_data_ratio_adjust(X_train, y_train, sample_amount):
    """Measure SVM cross-validation metrics across label-1 : label-2 ratios.

    ``data_ratio`` starts at 4 and is lowered after every step: halved
    while above 2, reduced by 0.25 when below 0.8, otherwise reduced by
    0.1.  The loop stops once the ratio is no longer above 0.2.  For each
    ratio, 5 independent samples of ``sample_amount`` rows are drawn with
    ``choose_data_seperately`` and scored with ``SVM_cross_validation``
    called with the arguments ``1`` and ``'auto'``; the metrics and the
    time are averaged over those draws.  Total wall-clock time is printed
    at the end.

    Args:
        X_train, y_train: Pool the samples are drawn from.
        sample_amount: Total number of samples requested per draw.

    Returns:
        ``pandas.DataFrame`` with one row per tested ratio and the columns
        "label_1_amount", "label_2_amount", "label 1: label 2 ratio",
        "f1_score", "TPR", "FPR", "BER", "time".
    """
    start = time.time()
    iter_amount = 5

    columns = [
        "label_1_amount", "label_2_amount", "label 1: label 2 ratio",
        "f1_score", "TPR", "FPR", "BER", "time"
    ]
    records = {name: [] for name in columns}

    def _mean(values):
        return sum(values) / len(values)

    data_ratio = 4
    while data_ratio > 0.2:
        # Split the sample budget so label_1 / label_2 is roughly data_ratio.
        label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
        label_2_amount = int(sample_amount - label_1_amount)

        tpr_runs, fpr_runs, ber_runs, f1_runs, time_runs = [], [], [], [], []
        for _ in range(iter_amount):
            sample_feature, sample_label = choose_data_seperately(
                X_train, y_train, label_1_amount, label_2_amount)
            tpr, fpr, BER, f1_score, time_var = SVM_cross_validation(
                sample_feature, sample_label, 1, 'auto')
            time_runs.append(time_var)
            tpr_runs.append(tpr)
            fpr_runs.append(fpr)
            ber_runs.append(BER)
            f1_runs.append(f1_score)

        records["label_1_amount"].append(label_1_amount)
        records["label_2_amount"].append(label_2_amount)
        records["label 1: label 2 ratio"].append(
            label_1_amount / label_2_amount)
        records["TPR"].append(_mean(tpr_runs))
        records["FPR"].append(_mean(fpr_runs))
        records["BER"].append(_mean(ber_runs))
        records["f1_score"].append(_mean(f1_runs))
        records["time"].append(_mean(time_runs))

        # NOTE: the schedule relies on repeated float subtraction, so the
        # values visited near the 0.8 threshold are subject to rounding.
        if data_ratio > 2:
            data_ratio = data_ratio / 2
        elif data_ratio < 0.8:
            data_ratio = data_ratio - 0.25
        else:
            data_ratio = data_ratio - 0.1

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return pd.DataFrame(data=records, columns=columns)