def mix_data(normal_file, attack_file, label_dict, output_dir):
    """Load normal and attack data, label them, and balance the two classes
    by truncating both to the size of the smaller class."""

    def open_file(input_file):
        X = []
        with open(input_file, 'r') as in_hdl:
            for line in in_hdl:
                line_arr = line.strip().split(',')
                X.append(line_arr)
        return X

    X_normal = open_file(normal_file)
    y_normal = [label_dict['normal']] * len(X_normal)
    X_attack = open_file(attack_file)
    y_attack = [label_dict['attack']] * len(X_attack)

    # balance the classes: keep the same number of normal and attack samples
    min_val = min(len(y_normal), len(y_attack))
    X = []
    y = []
    X_normal = X_normal[:min_val]
    y_normal = y_normal[:min_val]
    X_attack = X_attack[:min_val]
    y_attack = y_attack[:min_val]
    X.extend(X_normal)
    y.extend(y_normal)
    X.extend(X_attack)
    y.extend(y_attack)

    return X, y
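
# NOTE: a minimal sketch of the module-level open_file() that plot_data() and
# main() below rely on (distinct from the nested open_file in mix_data). The
# signature (input_f, has_y_flg) and the numpy return types are inferred from
# the call sites in this file; treat this as an assumption, not the real helper.
def open_file_sketch(input_f, has_y_flg=True):
    data = np.loadtxt(input_f, delimiter=',')  # each row: features[, label]
    if has_y_flg:
        X = data[:, :-1]              # all columns but the last are features
        y = data[:, -1].astype(int)   # the last column holds the label
        return X, y
    return data, None                 # no label column in the file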

def plot_data(gan_loss_file, gan_decision_file, name='gan'):
    X, _ = open_file(gan_loss_file, has_y_flg=False)
    show_figures(X[:, 0], X[:, 1], name)

    X, _ = open_file(input_f=gan_decision_file, has_y_flg=False)
    show_figures_2(X, name)
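
# NOTE: a hedged sketch of what show_figures() is assumed to do; the actual
# plotting helper is defined elsewhere in this repo. It assumes the loss file
# holds two columns per row (e.g. discriminator and generator loss), matching
# the X[:, 0] / X[:, 1] usage in plot_data() above.
def show_figures_sketch(d_loss, g_loss, name='gan'):
    import matplotlib.pyplot as plt  # assumption: matplotlib is available
    plt.figure()
    plt.plot(d_loss, label='discriminator loss')
    plt.plot(g_loss, label='generator loss')
    plt.title(name)
    plt.legend()
    plt.show()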

def main(normal_f='',
         attack_f='',
         gan_type='dcgan',
         epochs=10,
         label_dict={'normal': '0', 'attack': '1'},
         output_dir='./log',
         select_train_size=0.01,
         show_flg=False,
         random_state=1,
         tp_tn_train_flg=True,
         time_str=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())):
    """
    :param normal_f: input file of normal data
    :param attack_f: input file of attack data
    :param output_dir: directory for all intermediate and result files
    :param gan_type: 'naive_gan' or 'dcgan'
    :param tp_tn_train_flg: if True, train the normal and attack GANs only on
        the samples from the case2 train set that the SVM predicts correctly
        (TP and TN).
    :return:
    """
    # step 1. obtain the raw mixed normal and attack data, balanced so that
    # attack data size == normal data size.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    orig_mix_data = mix_data(normal_f, attack_f, label_dict, output_dir)
    X, y = orig_mix_data
    print(
        f"original data shape: {np.asarray(X, dtype=float).shape, np.asarray(y, dtype=int).shape} and y_label:{Counter(y)}"
    )
    if show_flg:
        pca_show(X, y, name='pca n_components=2 on all data')

    # step 1.2 feature reduction
    orig_mix_data = dimension_reduction(X, y, n_components=3)
    print()

    # step 1.3 split train, val and test
    print('case1...')
    # 70% train, 10% val and 20% test
    train_70, val_10, test_20 = split_train_val_test_data(
        orig_mix_data, train_val_test_ratio=[0.7, 0.1, 0.2])
    case1_data = {
        'train_set': train_70,
        'val_set': val_10,
        'test_set': test_20
    }
    norm_X_train_70, min_val, max_val, range_val = normalizate_data(
        case1_data['train_set'][0])
    norm_X_val_10 = (case1_data['val_set'][0] - min_val) / range_val
    print(
        f'after normalization, val_set range is {np.max(norm_X_val_10, axis=0) - np.min(norm_X_val_10, axis=0)}'
    )
    norm_X_test_20 = (case1_data['test_set'][0] - min_val) / range_val
    print(
        f'after normalization, test_set range is {np.max(norm_X_test_20, axis=0) - np.min(norm_X_test_20, axis=0)}'
    )
    case1_norm_data = {
        'train_set': (norm_X_train_70, train_70[1]),
        'val_set': (norm_X_val_10, case1_data['val_set'][1]),
        'test_set': (norm_X_test_20, case1_data['test_set'][1])
    }
    print(
        f"case1 train_set:{case1_norm_data['train_set'][0].shape, case1_norm_data['train_set'][1].shape}, "
        f"val_set:{case1_norm_data['val_set'][0].shape, case1_norm_data['val_set'][1].shape}, "
        f"test_set:{case1_norm_data['test_set'][0].shape, case1_norm_data['test_set'][1].shape}"
    )
    print(f"case1 y_train:{Counter(case1_norm_data['train_set'][1])}, "
          f"y_val:{Counter(case1_norm_data['val_set'][1])}, "
          f"y_test:{Counter(case1_norm_data['test_set'][1])}\n")
    case1_train_f = save_numpy_data(
        case1_norm_data['train_set'],
        output_f=os.path.join(output_dir, 'case1_train_file.txt'))
    case1_val_f = save_numpy_data(
        case1_norm_data['val_set'],
        output_f=os.path.join(output_dir, 'case1_val_file.txt'))
    case1_test_f = save_numpy_data(
        case1_norm_data['test_set'],
        output_f=os.path.join(output_dir, 'case1_test_file.txt'))

    print('case2...')
    # 70% * select_train_size train, 10% val and 20% test
    X_train, y_train = train_70
    X_train, _, y_train, _ = train_test_split(X_train,
                                              y_train,
                                              train_size=select_train_size,
                                              random_state=random_state)
    case2_data = {
        'train_set': (X_train, y_train),
        'val_set': val_10,
        'test_set': test_20
    }
    # min, max and range may change on a different train set.
    norm_X_train_70_03, min_val, max_val, range_val = normalizate_data(
        np.asarray(case2_data['train_set'][0], dtype=float))
    norm_X_val_10 = (case2_data['val_set'][0] - min_val) / range_val
    print(
        f'after normalization, val_set range is {np.max(norm_X_val_10, axis=0) - np.min(norm_X_val_10, axis=0)}'
    )
    norm_X_test_20 = (case2_data['test_set'][0] - min_val) / range_val
    print(
        f'after normalization, test_set range is {np.max(norm_X_test_20, axis=0) - np.min(norm_X_test_20, axis=0)}'
    )
    case2_norm_data = {
        'train_set': (norm_X_train_70_03, y_train),
        'val_set': (norm_X_val_10, case2_data['val_set'][1]),
        'test_set': (norm_X_test_20, case2_data['test_set'][1])
    }
    print(
        f"case2 train_set:{case2_norm_data['train_set'][0].shape, case2_norm_data['train_set'][1].shape}, "
        f"val_set:{case2_norm_data['val_set'][0].shape, case2_norm_data['val_set'][1].shape}, "
        f"test_set:{case2_norm_data['test_set'][0].shape, case2_norm_data['test_set'][1].shape}"
    )
    print(f"case2 y_train:{Counter(case2_norm_data['train_set'][1])}, "
          f"y_val:{Counter(case2_norm_data['val_set'][1])}, "
          f"y_test:{Counter(case2_norm_data['test_set'][1])}\n")
    case2_train_f = save_numpy_data(
        case2_norm_data['train_set'],
        output_f=os.path.join(output_dir, 'case2_train_file.txt'))
    # case2_val and case2_test are the same as in case1 and case3, but
    # range_val differs because it is computed from the case2 train set.
    save_numpy_data(case2_norm_data['val_set'],
                    output_f=os.path.join(output_dir, 'case2_val_file.txt'))
    save_numpy_data(case2_norm_data['test_set'],
                    output_f=os.path.join(output_dir, 'case2_test_file.txt'))
    X_normal, y_normal, X_attack, y_attack = split_mix_data(
        case2_norm_data['train_set'], label_dict)
    save_numpy_data((X_normal, y_normal),
                    output_f=os.path.join(output_dir,
                                          'case2_normal_train_file.txt'))
    save_numpy_data((X_attack, y_attack),
                    output_f=os.path.join(output_dir,
                                          'case2_attack_train_file.txt'))

    # step 2. train and evaluate a traditional machine learning model (SVM)
    print('case1 train svm on 70% train-set, 30% test-set')
    # svm_evalution(case1_norm_data['train_set'], case1_norm_data['val_set'],
    #               case1_norm_data['test_set'], name='svm on 70% train set')
    print(
        f'case2 train svm on 70%*{select_train_size*100}% train-set, 30% test-set'
    )
    tp_normal, tn_attack = svm_evalution(
        case2_norm_data['train_set'],
        case2_norm_data['val_set'],
        case2_norm_data['test_set'],
        name=f'svm on 70%*{select_train_size*100}% train set')
    if tp_tn_train_flg:
        print(
            'normal and attack GANs will be trained on different train set sizes '
            'according to the confusion matrix of the SVM.')
        # for training the GANs, keep only the samples the SVM predicts correctly
        X_normal, y_normal = tp_normal  # correctly predicted normal samples (TP)
        X_attack, y_attack = tn_attack  # correctly predicted attack samples (TN)
    else:
        print('normal and attack GANs will be trained on equal train set sizes.')
    tp_normal_train_f = save_numpy_data(
        (X_normal, y_normal),
        output_f=os.path.join(output_dir, 'case2_tp_normal_train_file.txt'))
    tn_attack_train_f = save_numpy_data(
        (X_attack, y_attack),
        output_f=os.path.join(output_dir, 'case2_tn_attack_train_file.txt'))

    # step 3. build a separate GAN for the normal and the attack data,
    # trained on the case2 (70% * select_train_size) subset.
    num = 10000
    print(
        f'\nnormal_gan on train set:{np.asarray(X_normal, dtype=float).shape, np.asarray(y_normal, dtype=int).shape}'
    )
    # 3.1 normal_gan
    new_gen_normal_f, normal_gan_loss_file, normal_gan_decision_file = run_gan_main(
        input_f=tp_normal_train_f,
        name='normal',
        generated_num=num,
        output_dir=output_dir,
        epochs=epochs,
        show_flg=show_flg,
        gan_type=gan_type,
        time_str=time_str)
    print(
        f'\nattack_gan on train set:{np.asarray(X_attack, dtype=float).shape, np.asarray(y_attack, dtype=int).shape}'
    )
    # 3.2 attack_gan
    new_gen_attack_f, attack_gan_loss_file, attack_gan_decision_file = run_gan_main(
        input_f=tn_attack_train_f,
        name='attack',
        generated_num=num,
        output_dir=output_dir,
        epochs=epochs,
        show_flg=show_flg,
        gan_type=gan_type,
        time_str=time_str)

    # 3.3 merge the generated data (normal and attack)
    (_, _), generate_train_f = mix_normal_attack_and_label(
        new_gen_normal_f,
        new_gen_attack_f,
        label_dict=label_dict,
        start_feat_idx=[0, '-'],
        output_f=os.path.join(output_dir,
                              'case3_generated_%d_mix_data.csv' % num))

    # step 4. mix the original train data and the newly generated data
    new_train_f = mix_two_files(
        case2_train_f,
        generate_train_f,
        output_f=os.path.join(output_dir, 'case3_new_train_set.csv'))
    print(
        '\'new train set\' (includes the original train set and the generated data) is in \'%s\'.\n'
        % new_train_f)

    # step 5. train and test the ML model on the new data
    print('case3...')
    # the GAN uses a sigmoid output, so the generated data is already in
    # [0, 1] and does not need to be normalized again.
    X_new_train, y_new_train = open_file(new_train_f)
    # case3_data = {'train_set': (X_new_train, y_new_train),
    #               'val_set': (val_10[0], val_10[1]),
    #               'test_set': (test_20[0], test_20[1])}
    # # case3 uses the same range_val as case2 on the train set because both
    # # share the same original train set (70% * select_train_size).
    # norm_X_train_new, min_val, max_val, range_val = normalizate_data(case3_data['train_set'][0])
    # norm_X_val_10 = (case3_data['val_set'][0] - min_val) / range_val
    # print(f'after normalization, val_set range is {np.max(norm_X_val_10, axis=0) - np.min(norm_X_val_10, axis=0)}')
    # norm_X_test_20 = (case3_data['test_set'][0] - min_val) / range_val
    # print(f'after normalization, test_set range is {np.max(norm_X_test_20, axis=0) - np.min(norm_X_test_20, axis=0)}')
    # case3_norm_data = {'train_set': (norm_X_train_new, y_new_train),
    #                    'val_set': (norm_X_val_10, case3_data['val_set'][1]),
    #                    'test_set': (norm_X_test_20, case3_data['test_set'][1])}

    # np.random.shuffle(zip(X_new_train, y_new_train))  # does not work:
    # np.random.shuffle cannot shuffle a zip iterator in place.
    X_new_train, y_new_train = shuffle(X_new_train,
                                       y_new_train,
                                       random_state=random_state)
    case3_norm_data = {
        'train_set': (X_new_train, y_new_train),
        'val_set': case2_norm_data['val_set'],
        'test_set': case2_norm_data['test_set']
    }
    print(
        f"case3 train_set:{case3_norm_data['train_set'][0].shape, case3_norm_data['train_set'][1].shape}, "
        f"val_set:{case3_norm_data['val_set'][0].shape, case3_norm_data['val_set'][1].shape}, "
        f"test_set:{case3_norm_data['test_set'][0].shape, case3_norm_data['test_set'][1].shape}"
    )
    print(f"case3 y_train:{Counter(case3_norm_data['train_set'][1])}, "
          f"y_val:{Counter(case3_norm_data['val_set'][1])}, "
          f"y_test:{Counter(case3_norm_data['test_set'][1])}\n")
    print('case3 train svm on new train set and evaluate on 30% test set')
    svm_evalution(case3_norm_data['train_set'],
                  case3_norm_data['val_set'],
                  case3_norm_data['test_set'],
                  name='svm on new train set')

    return case1_train_f, case2_train_f, new_train_f, case1_val_f, case1_test_f
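
# NOTE: a minimal sketch of the min-max normalization that normalizate_data()
# is assumed to perform, inferred from how its four return values are used
# above ((X - min_val) / range_val); the real helper is defined elsewhere in
# this repo, so treat this as an assumption.
def normalizate_data_sketch(X):
    X = np.asarray(X, dtype=float)
    min_val = np.min(X, axis=0)
    max_val = np.max(X, axis=0)
    range_val = max_val - min_val
    range_val[range_val == 0] = 1.0  # avoid division by zero on constant features
    norm_X = (X - min_val) / range_val
    return norm_X, min_val, max_val, range_val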

def main(normal_f='',
         attack_f='',
         epochs=10,
         label_dict={'normal': '0', 'attack': '1'},
         output_dir='./log',
         select_train_size=0.03,
         show_flg=False):
    """
    :param normal_f: input file of normal data
    :param attack_f: input file of attack data
    :param output_dir: directory for all intermediate and result files
    :return:
    """
    # step 1. load data (attack data size == normal data size)
    # step 1.1. mix normal (label = 0) and attack (label = 1) data, then
    # normalize the mixed data to [0, 1]
    (X, y), output_f = mix_normal_attack_and_label(
        normal_f,
        attack_f,
        label_dict=label_dict,
        start_feat_idx=[0, '-'],
        output_f=os.path.join(output_dir, 'original_mix_data.csv'))
    if show_flg:
        pca_show(X, y, name='pca n_components=2 on all data')
    # X = np.asarray(X, dtype=float)
    # keep only the normalized data; normalizate_data also returns min, max and range
    X, _, _, _ = normalizate_data(np.asarray(X, dtype=float))
    X, y = dimension_reduction(X, y, n_components=5)
    if show_flg:
        pca_show(X, y, name='pca n_components=2 on dimension reduction data')
    X, _, _, _ = normalizate_data(np.asarray(X, dtype=float))
    y = np.asarray(y, dtype=int)
    _ = save_numpy_data(
        (X, y),
        output_f=os.path.join(output_dir, 'original_normalized_mix_data.txt'))
    # t_sne_show(X, y)

    # step 1.2. split train (70%) and test (30%) on the original mixed data
    X_train, X_test, y_train, y_test = train_test_split(
        np.asarray(X, dtype=float),
        np.asarray(y, dtype=int),
        test_size=0.3,
        random_state=1)
    train_set = (X_train, y_train)
    if show_flg:
        pca_show(X_train, y_train, name='pca n_components=2, 70% train_set')
    test_set = (X_test, y_test)
    if show_flg:
        pca_show(X_test, y_test, name='pca n_components=2, 30% test_set')
    original_train_f = save_numpy_data(
        train_set,
        output_f=os.path.join(output_dir, 'original_normalized_train_set.csv'))
    original_test_f = save_numpy_data(  # for evaluation
        test_set,
        output_f=os.path.join(output_dir, 'original_normalized_test_set.csv'))
    print('test svm on 70% train-set, 30% test-set')
    test_SVM(X_train, y_train, X_test, y_test, name='svm on 70% train set')

    # step 4. separate normal and attack data in the original train set
    _, X_selected, _, y_selected = train_test_split(
        X_train, y_train, test_size=select_train_size, random_state=1)
    original_select_train_f = save_numpy_data(
        (X_selected, y_selected),
        output_f=os.path.join(output_dir, 'original_selected_mix_data.csv'))
    print('test svm on %.2f%% train-set, 30%% test-set' %
          (select_train_size * 100))
    X_predict_normal, y_predict_normal, X_predict_attack, y_predict_attack = \
        test_SVM(X_selected, y_selected, X_test, y_test,
                 name='svm on %.2f%% train set' % (select_train_size * 100))
    normal_train_set_f = save_numpy_data(
        (X_predict_normal, y_predict_normal),
        output_f=os.path.join(output_dir, 'normal_data_for_GAN.csv'))
    attack_train_set_f = save_numpy_data(
        (X_predict_attack, y_predict_attack),
        output_f=os.path.join(output_dir, 'attack_data_for_GAN.csv'))

    # step 5. build a separate GAN for the normal and the attack data,
    # trained on the selected subset.
    num = 10000
    # 5.1 normal_gan
    new_gen_normal_f, normal_gan_loss_file, normal_gan_decision_file = run_gan_main(
        input_f=normal_train_set_f,
        name='normal',
        generated_num=num,
        output_dir=output_dir,
        epochs=epochs,
        show_flg=show_flg)
    # 5.2 attack_gan
    new_gen_attack_f, attack_gan_loss_file, attack_gan_decision_file = run_gan_main(
        input_f=attack_train_set_f,
        name='attack',
        generated_num=num,
        output_dir=output_dir,
        epochs=epochs,
        show_flg=show_flg)
    # 5.3 merge the generated data (normal and attack)
    (_, _), generate_train_f = mix_normal_attack_and_label(
        new_gen_normal_f,
        new_gen_attack_f,
        label_dict=label_dict,
        start_feat_idx=[0, '-'],
        output_f=os.path.join(output_dir, 'generated_%d_mix_data.csv' % num))

    # step 6. mix the original train data and the newly generated data
    new_train_set_f = mix_two_files(
        original_select_train_f,
        generate_train_f,
        output_f=os.path.join(output_dir, '0_new_train_set.csv'))
    print(
        '\'new train set\' (includes the original train set and the generated train set) is in \'%s\'.'
        % new_train_set_f)

    print('test svm on new train-set, 30% test-set')
    X_new_train, y_new_train = open_file(new_train_set_f)
    test_SVM(X_new_train,
             y_new_train,
             X_test,
             y_test,
             name='svm on new train set')

    return original_train_f, original_select_train_f, new_train_set_f, original_test_f
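
# NOTE: a hedged example of how this script might be invoked; the file paths
# below are hypothetical placeholders, not paths shipped with this repo.
if __name__ == '__main__':
    # only the second main() definition above is visible at module level,
    # since it shadows the first one of the same name.
    main(normal_f='data/normal_demo.csv',
         attack_f='data/attack_demo.csv',
         epochs=10,
         output_dir='./log',
         show_flg=False)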