def achieved_wgan_generated_data(input_file, sample_ratio=1, training_set_percent=0.9, critic=1,
                                 distance_threshold=0.05):
    """Train a WGAN (via train_gp) for the benign and the attack class, generate synthetic samples,
    and merge them with the original training set into a single ARFF/CSV file.
    Returns the list of merged output file paths."""
    X, Y = load_data_from_files_tmp(input_file, sample_ratio, preprocess=True)
    # print('X.shape:', X.shape, ' Y.shape:', np.asarray(Y).shape)
    print('X[0]:', X[0])
    X = normalize_data(X, axis=0, low=-1, high=1, eps=1e-5)
    print('Normalized X[0]:', X[0])
    print('X.shape:', X.shape, ' Y.shape:', np.asarray(Y).shape)
    print('label:', Counter(Y))
    features_selected = [str(i) for i in range(X.shape[1])]  # redefine feature names as '0', '1', ...

    show_flg = True
    save_flg = True

    in_size = X.shape[1]
    h_size = 12
    out_size = 1
    dtype = torch.float
    percent = training_set_percent
    mini_batch_size = 50
    epochs = 0  # invalid for my function, can't be str

    root_dir = './original_data_wgan_data_' + '%.2f' % percent + 'percent_' + time.strftime(
        "%Y%m%d-%H:%M:%S", time.localtime())
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)

    training_set, testing_set = split_data(X, Y, percent)
    print('training percent', percent, ': training_set (Benign and Attack)', training_set[0].shape,
          ', testing_set (Benign and Attack)', testing_set[0].shape)
    if save_flg:
        test_percent = '%.2f' % (1 - percent)
        save_tensor_data(training_set, output_file=os.path.join(root_dir, str(percent) + '_training_set.csv'))
        save_tensor_data(testing_set, output_file=os.path.join(root_dir, str(test_percent) + '_testing_set.csv'))

        training_file_lst = [
            os.path.join(root_dir, str(percent) + '_origin_training_set.arff'),
            os.path.join(root_dir, str(percent) + '_training_set.csv')
        ]
        merge_files(training_file_lst, header='header.txt', feature_lst=features_selected)  # add_arff_header

        testing_file_lst = [
            os.path.join(root_dir, str(test_percent) + '_origin_testing_set.arff'),
            os.path.join(root_dir, str(test_percent) + '_testing_set.csv')
        ]
        merge_files(testing_file_lst, header='header.txt', feature_lst=features_selected)  # add_arff_header

    nn_size_lst = [in_size, h_size, out_size]
    for i in range(2):
        data_flg = 'benign_data' if i == 0 else 'attack_data'
        wgan = WGAN((training_set[0], training_set[1]), nn_size_lst, mini_batch_size, epochs, show_flg, data_flg,
                    critic, distance_threshold)
        # wgan.train()
        wgan.train_gp()

        # sample 1000 latent vectors and generate synthetic flows of the chosen class
        noise = torch.randn((1000, in_size // 10))
        gen_data = wgan.G(noise)
        output_file = os.path.join(root_dir, str(percent) + '_gen_' + data_flg + '.csv')
        # print('data_flg ', data_flg)
        data_type = '1' if data_flg == 'benign_data' else '0'
        save_data(gen_data, data_type, output_file)

    # merge files: ARFF header + original training set + generated benign data + generated attack data
    all_file_lst = [
        os.path.join(root_dir, str(percent) + '_all_in_one_file.csv.arff'),  # all_in_one_arff_file
        os.path.join(root_dir, str(percent) + '_training_set.csv'),          # original_training_set_file
        os.path.join(root_dir, str(percent) + '_gen_benign_data.csv'),       # gen_benign_data_file
        os.path.join(root_dir, str(percent) + '_gen_attack_data.csv')        # gen_attack_data_file
    ]
    merge_files(all_file_lst, header='header.txt', feature_lst=features_selected)

    return all_file_lst
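
# A minimal, hedged usage sketch for achieved_wgan_generated_data(): it only shows how the
# function above would be called and what it returns (the list of merged output files).
# The input path below is a hypothetical placeholder, and the keyword values simply mirror
# the function's defaults; they are not tuned settings from this repo.
def _demo_achieved_wgan_generated_data():
    input_file = './original_data/flows_with_labels.csv'  # placeholder: a labelled flow-feature CSV
    all_file_lst = achieved_wgan_generated_data(input_file,
                                                sample_ratio=1,
                                                training_set_percent=0.9,
                                                critic=1,
                                                distance_threshold=0.05)
    # all_file_lst[0] is the merged ARFF/CSV combining the original training set with the
    # WGAN-generated benign and attack samples.
    print('merged output files:', all_file_lst)
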
def achieve_reduced_features_data(input_file, epochs=1):
    """Select the numeric flow features from the raw CSV, normalize and relabel them,
    train an AutoEncoder, and save the encoded (dimension-reduced) data to a new CSV.
    Returns the path of the reduced-feature output file."""
    start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    st = time.time()
    print('It starts at ', start_time)

    # ---- be careful with the ' ' in items: most column names in the raw CSV keep a leading space.
    # The full header additionally contains 'Flow ID', ' Source IP', ' Destination IP', ' Timestamp'
    # and ' Label'; selected_features below drops those identifier/label columns and keeps only the
    # numeric flow features.
    selected_features = " Source Port, Destination Port, Protocol, Flow Duration," \
                        " Total Fwd Packets, Total Backward Packets,Total Length of Fwd Packets, Total Length of Bwd Packets," \
                        " Fwd Packet Length Max, Fwd Packet Length Min, Fwd Packet Length Mean, Fwd Packet Length Std," \
                        "Bwd Packet Length Max, Bwd Packet Length Min, Bwd Packet Length Mean, Bwd Packet Length Std,Flow Bytes/s," \
                        " Flow Packets/s, Flow IAT Mean, Flow IAT Std, Flow IAT Max, Flow IAT Min,Fwd IAT Total, Fwd IAT Mean," \
                        " Fwd IAT Std, Fwd IAT Max, Fwd IAT Min,Bwd IAT Total, Bwd IAT Mean, Bwd IAT Std, Bwd IAT Max, Bwd IAT Min," \
                        "Fwd PSH Flags, Bwd PSH Flags, Fwd URG Flags, Bwd URG Flags, Fwd Header Length, Bwd Header Length," \
                        "Fwd Packets/s, Bwd Packets/s, Min Packet Length, Max Packet Length, Packet Length Mean, Packet Length Std," \
                        " Packet Length Variance,FIN Flag Count, SYN Flag Count, RST Flag Count, PSH Flag Count, ACK Flag Count," \
                        " URG Flag Count, CWE Flag Count, ECE Flag Count, Down/Up Ratio, Average Packet Size, Avg Fwd Segment Size," \
                        " Avg Bwd Segment Size, Fwd Header Length,Fwd Avg Bytes/Bulk, Fwd Avg Packets/Bulk, Fwd Avg Bulk Rate," \
                        " Bwd Avg Bytes/Bulk, Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets, Subflow Fwd Bytes," \
                        " Subflow Bwd Packets, Subflow Bwd Bytes,Init_Win_bytes_forward, Init_Win_bytes_backward, act_data_pkt_fwd," \
                        " min_seg_size_forward,Active Mean, Active Std, Active Max, Active Min,Idle Mean, Idle Std, Idle Max, Idle Min"

    # input_file = '../original_data_no_sample/Wednesday-workingHours.pcap_ISCX_demo.csv'
    output_file = '../original_data_no_sample/features_selected_Wednesday-workingHours.pcap_ISCX.csv'
    invalid_file = '../original_data_no_sample/invalid_data_Wednesday-workingHours.pcap_ISCX.csv'
    selected_features_list = selected_features.split(',')
    _, _, output_file = select_features_from_file(input_file, selected_features_list, output_file, invalid_file)

    X, Y = load_data_from_file(output_file)
    new_X = normalize_data(X, axis=0, low=-1, high=1, eps=1e-5)
    new_Y = change_labels(Y, labels=[1, 0])  # 'BENIGN' = 1, others = 0
    output_file = '../original_data_no_sample/features_selected_Normalized_Wednesday-workingHours.pcap_ISCX.csv'
    save_data_in_autoencoder(new_X, new_Y, output_file)

    model = AutoEncoder(new_X, new_Y, epochs)
    # 1. train the autoencoder
    model.train()
    # torch.save(model.state_dict(), './sim_autoencoder.pth')

    # 2. encode the data and save the encoded (reduced) features
    reduced_output_file = '../original_data_no_sample/features_selected_Normalized_Reduced_data_Wednesday-workingHours.pcap_ISCX.csv'
    reduced_features_data = model.encoder(torch.Tensor(new_X))
    reduced_features_data = normalize_data(reduced_features_data.tolist(), axis=0, low=0, high=1, eps=1e-5)
    save_data_in_autoencoder(reduced_features_data, new_Y, reduced_output_file)

    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    print('It ends at ', end_time)
    print('All takes %.4f s' % (time.time() - st))

    return reduced_output_file
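
# A hedged usage sketch for achieve_reduced_features_data(): the input path reuses the demo CSV
# that is referenced (commented out) inside the function above, and epochs=1 is just an
# illustrative value. The returned path points at the normalized, autoencoder-reduced feature file.
def _demo_achieve_reduced_features_data():
    input_file = '../original_data_no_sample/Wednesday-workingHours.pcap_ISCX_demo.csv'
    reduced_file = achieve_reduced_features_data(input_file, epochs=1)
    print('autoencoder-reduced data saved to:', reduced_file)
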
        'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
        ' Fwd URG Flags', ' Fwd Header Length', ' Bwd Packets/s', ' Packet Length Mean', ' ACK Flag Count',
        ' Down/Up Ratio', ' Avg Fwd Segment Size', ' Fwd Header Length.1', 'Fwd Avg Bytes/Bulk',
        ' Fwd Avg Packets/Bulk', ' Bwd Avg Bytes/Bulk', 'Bwd Avg Bulk Rate', 'Subflow Fwd Packets',
        ' Subflow Fwd Bytes', 'Init_Win_bytes_forward', ' act_data_pkt_fwd', ' Active Std', ' Active Min',
        ' Idle Max'
    ]
    # X, Y = load_data(input_file, features_selected, output_file)
    # output_file = '../original_data_no_sample/Wednesday-workingHours.pcap_ISCX_feature_selected.csv'
    # X, Y = load_data_from_files(output_file, sample_ratio=0.05, preprocess=True)
    # print('X.shape:', X.shape, ' Y.shape:', np.asarray(Y).shape)
    print('X[0]:', X[0])
    X = normalize_data(X, axis=0, low=-1, high=1, eps=1e-5)
    print('Normalized X[0]:', X[0])
    print('X.shape:', X.shape, ' Y.shape:', np.asarray(Y).shape)
    print('label:', Counter(Y))

    show_flg = True
    save_flg = True

    in_size = 41
    h_size = 64
    out_size = 1
    dtype = torch.float
    percent = 0.05

    root_dir = './sample_wgan_data_' + str(percent) + 'percent_' + time.strftime("%Y%m%d-%H:%M:%S",
                                                                                 time.localtime())
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)