def main():
    """Run the feed-forward auto-encoder anomaly demo on the ECG discord data.

    Reads ``./data/ecg_discord_test.csv`` (no header row), min-max scales it,
    optionally trains a ``FeedForwardAutoEncoder`` on the first 23 rows (only
    when the module-level ``DO_TRAINING`` flag is truthy), reloads the model
    from ``./models``, prints a normal/abnormal verdict together with the
    reported distance for each of those rows, and finally plots the distances
    against ``ae.threshold``.
    """
    data_dir_path = './data'
    model_dir_path = './models'

    # ecg data in which each row is a temporal sequence data of continuous values
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in both old and new pandas releases.
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = FeedForwardAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path,
               estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])

    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        # str(dist) is kept on purpose: format()/f-strings render numpy
        # scalars differently from str().
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal')
              + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def main():
    """Run the LSTM auto-encoder anomaly demo on ``ground_anomaly.csv``.

    Reads ``./data/ground_anomaly.csv``, discards its first data row and the
    ``TIMESTAMP``, ``RECORD``, ``AmbTemp_C_Avg``, ``InvPAC_kW_Avg`` and
    ``PwrMtrP_kW_Avg`` columns, min-max scales what is left, trains an
    ``LstmAutoEncoder`` on the first 10000 rows, reloads the saved model and
    scores the same rows.  The index of every row flagged as anomalous is
    printed, followed by the number of anomalies and the list of their
    indices; the distances are then plotted against ``ae.threshold``.
    """
    data_dir_path = './data'
    model_dir_path = './models'

    ecg_data = pd.read_csv(data_dir_path + '/ground_anomaly.csv')
    ecg_data = ecg_data[1:]  # skip the first data row
    ecg_data = ecg_data.drop(
        ['TIMESTAMP', 'RECORD', 'AmbTemp_C_Avg', 'InvPAC_kW_Avg', 'PwrMtrP_kW_Avg'],
        axis=1)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in both old and new pandas releases.
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)

    ae = LstmAutoEncoder()

    print(ecg_data.shape)
    row_count = ecg_data.shape[0]  # shape[0] is the number of rows
    print(row_count)

    # fit the data and save model into model_dir_path
    ae.fit(ecg_np_data[:10000, :], model_dir_path=model_dir_path,
           estimated_negative_sample_ratio=0.95)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:10000, :])

    reconstruction_error = []
    abnormal_number = 0
    idx_list = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        if is_anomaly:
            abnormal_number += 1
            print(idx)
            idx_list.append(idx)
            print('# ' + str(idx) + ' is abnormal.')
        # every distance is collected (not only the anomalous ones) so the
        # plot below covers all scored rows
        reconstruction_error.append(dist)

    print(abnormal_number)
    print(idx_list)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def main():
    """Run the LSTM auto-encoder anomaly demo on the ADFA-LD tf-idf data.

    Loads ``train_normal.csv`` and ``test_attack.csv`` from the hard-coded
    data directory (first line of each file skipped, no header), drops the
    last column of each, stacks them (training rows first) and min-max scales
    the result.  An ``LstmAutoEncoder`` is trained on rows ``0:832`` --
    presumably the ``train_normal`` rows; TODO confirm against the CSV
    length -- then reloaded and used to score every row with an explicit
    threshold of 1.75.  Each verdict is printed and the distances are plotted
    against ``ae.threshold``.
    """
    data_dir_path = '/Users/Shariful/Documents/DataCamp/ADFA-LD(tf-idf)'
    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/adfa_demo/models'

    # NOTE: a third file, test_normal.csv, used to be loaded the same way
    # but that load is disabled.
    ecg_data2 = pd.read_csv(data_dir_path + '/train_normal.csv', skiprows=1,
                            index_col=None, header=None)
    ecg_data3 = pd.read_csv(data_dir_path + '/test_attack.csv', skiprows=1,
                            index_col=None, header=None)

    # keep every column except the last one
    ecg_data2 = ecg_data2.iloc[:, 0:-1]
    ecg_data3 = ecg_data3.iloc[:, 0:-1]

    ecg_data = pd.concat([ecg_data2, ecg_data3], ignore_index=True)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in both old and new pandas releases.
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    ae.fit(ecg_np_data[0:832, :], model_dir_path=model_dir_path,
           estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data, threshold=1.75)

    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        # str(dist) is kept on purpose: format()/f-strings render numpy
        # scalars differently from str().
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal')
              + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def main():
    """Plot previously saved ADFA-LD 5-gram test scores and their ROC curve.

    The model fitting and scoring stages are disabled (kept below as
    commented-out code).  The active path only:

    1. reads the 5-gram training CSV, the test index CSV and the test CSV
       (the training frame and the test matrix are consumed solely by the
       disabled stages),
    2. builds the label vector: ones for the first 60 entries and zeros for
       the remaining rows of the test index file,
    3. loads the scores from ``lstm_128_units.csv`` in ``score_dir_path``,
    4. plots them against a fixed threshold of 150 and draws the ROC curve.
    """
    # ================ read training dataset ====================
    data_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/train'
    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/models_5_gram'
    score_dir_path = '/Users/Shariful/Documents/GitHubRepo/deeplearning/syscall_anomaly/scores_on_testset'

    adfa_data = pd.read_csv(data_dir_path + '/5_gram.csv',
                            index_col=0, usecols=[0, 1, 2, 3, 4, 5])

    # ---- disabled: fit the LSTM model and save it into model_dir_path ----
    # adfa_np_data = adfa_data.values
    # ae = LstmAutoEncoder()
    # ae.fit(adfa_np_data, model_dir_path=model_dir_path, batch_size=100,
    #        epochs=20, estimated_negative_sample_ratio=None)
    #
    # ---- disabled: load back the model saved in model_dir_path ----
    # ae.load_model(model_dir_path)

    # ================ read test dataset ========================
    test_idx_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_test_idx.csv'
    df_test_idx = pd.read_csv(test_idx_path, header=None, skiprows=1)

    test_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_test.csv'
    df_test = pd.read_csv(test_path, header=None, skiprows=1)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in both old and new pandas releases.
    df_test_np = df_test.values

    # The first 60 entries of the index file are labelled 1, the rest 0.
    test_labels = np.hstack((np.ones(60, dtype=int),
                             np.zeros(df_test_idx.shape[0] - 60, dtype=int)))

    # ---- disabled: predict scores on the testing set ----
    # anomaly_information = ae.anomaly(df_test_np, threshold=150)
    # idx_out = 0
    # max_scores = np.zeros((df_test_idx.shape[0]))
    # for idx_in, (is_anomaly, dist) in enumerate(anomaly_information):
    #     # finding the maximum score out of all subsequences' scores
    #     if idx_in <= df_test_idx.loc[idx_out][:][1]:
    #         if max_scores[idx_out] < dist:
    #             max_scores[idx_out] = dist
    #     else:
    #         idx_out += 1
    #         max_scores[idx_out] = dist
    # visualize_reconstruction_error(max_scores, ae.threshold)

    # ======== load and plot the computed scores on testing set =========
    # Same file as before, but built from score_dir_path instead of repeating
    # the directory as a second hard-coded literal.
    max_scores = pd.read_csv(score_dir_path + '/lstm_128_units.csv', header=None)
    visualize_reconstruction_error(max_scores, 150)

    # draw the roc curve
    plot_ROC(test_labels, max_scores)
def AutoEncoder_test(X_data, Y_data, sub_output_dir, num, model_name, ae, error_list):
    """Fit ``ae`` on ``X_data``, score the same data and write all reports.

    Output locations (all created through ``create_directory``):

    * ``<sub_output_dir><model_name>/`` -- model files, ``dist.csv`` and the
      ``anomaly/``, ``png/``, ``metrics/`` and ``confusion/`` sub-directories
    * ``<sub_output_dir>1_png/``, ``2_metrics/``, ``3_confusion/`` -- shared
      directories that receive a second copy of the plots and reports

    Args:
        X_data: samples handed to ``ae.fit`` and ``ae.anomaly``; must
            support ``len()``.
        Y_data: labels indexable by position.  Entries equal to 0 are counted
            to derive ``estimated_negative_sample_ratio``; predictions use
            the same encoding (1 for an anomaly, 0 otherwise).
        sub_output_dir: output root; concatenated as-is, so it is expected to
            end with a path separator.
        num: run identifier used in output file names and plot titles.
        model_name: name of the model; used as directory and file name part.
        ae: auto-encoder object providing ``fit``, ``anomaly`` and
            ``threshold``.
        error_list: forwarded unchanged to the visualisation helpers.

    Raises:
        ZeroDivisionError: if ``X_data`` is empty.
    """
    model_dir_path = sub_output_dir + model_name + '/'
    anomaly_dir = model_dir_path + 'anomaly/'
    png_dir_1 = sub_output_dir + '1_png/'
    png_dir_2 = model_dir_path + 'png/'
    metrics_dir_1 = sub_output_dir + '2_metrics/'
    metrics_dir_2 = model_dir_path + 'metrics/'
    confusion_dir_1 = sub_output_dir + '3_confusion/'
    confusion_dir_2 = model_dir_path + 'confusion/'

    for directory in (model_dir_path, anomaly_dir,
                      png_dir_1, png_dir_2,
                      metrics_dir_1, metrics_dir_2,
                      confusion_dir_1, confusion_dir_2):
        create_directory(directory)

    # Share of samples labelled 0 among the first len(X_data) labels.
    x_size = len(X_data)
    y_size = sum(1 for i in range(x_size) if Y_data[i] == 0)
    estimated_negative_sample_ratio = y_size / x_size

    # fit the data and save model into model_dir_path
    history = ae.fit(X_data, model_dir_path=model_dir_path,
                     estimated_negative_sample_ratio=estimated_negative_sample_ratio)

    # Evaluation runs on the full training data; a 50 % hold-out split via
    # train_test_split used to sit behind a permanently disabled `if 0:`.
    Xtest = X_data
    Ytest = Y_data

    adjusted_threshold = ae.threshold
    anomaly_information = ae.anomaly(Xtest, adjusted_threshold)

    reconstruction_error = []
    Ypred = []
    file_name_info = anomaly_dir + str(num) + '_anomaly.txt'

    # Both files are opened in append mode; the context manager guarantees
    # they are closed even if scoring or writing raises.
    with open(file_name_info, mode='at') as f1, \
            open(model_dir_path + 'dist.csv', mode='at') as f2:
        for idx, (is_anomaly, dist) in enumerate(anomaly_information):
            temp_str = ('# ' + str(idx) + ' is '
                        + ('abnormal' if is_anomaly else 'normal')
                        + ' (dist: ' + str(dist) + ')')
            f1.write(temp_str + '\n')

            true_label = Y_data[idx]
            Ypred.append(1 if is_anomaly else 0)
            reconstruction_error.append(dist)

            # dist.csv row: sample index, true label, distance
            f2.write(str(idx) + ',' + str(true_label) + ',' + str(dist) + '\n')

    png_name_info_1 = png_dir_1 + str(num) + '_' + model_name + '_anomaly.png'
    png_name_info_2 = png_dir_2 + str(num) + '_' + model_name + '_anomaly.png'
    png_title = str(num) + '_' + model_name + '_' + str(len(X_data))

    visualize_reconstruction_error(reconstruction_error, ae.threshold, Y_data,
                                   png_name_info_1, png_name_info_2, png_title,
                                   WINDOW_SIZE, error_list)
    plot_training_history_file(history, model_dir_path, num)
    visualize_anomaly_errors(Ytest, reconstruction_error, adjusted_threshold,
                             error_list, png_title, model_dir_path, num)
    report_evaluation_metrics_file(Ytest, Ypred, metrics_dir_1, metrics_dir_2,
                                   num, model_name)
    plot_confusion_matrix_file(Ytest, Ypred, confusion_dir_1, confusion_dir_2,
                               num, model_name)
def main():
    """Train an LSTM auto-encoder on the Canali data and score the test set.

    Steps:

    1. read ``train_set.csv`` (no header) from the hard-coded data directory
       and fit an ``LstmAutoEncoder`` on it (batch size 1000, 20 epochs),
       saving the model into ``model_dir_path``;
    2. reload the saved model;
    3. read ``test_set.csv`` and ``test_set_index_range_label.csv``; the last
       column of the index file is used as the label vector;
    4. score every test row with a threshold of 150 and keep, per entry of
       the index file, the maximum distance among the rows whose position is
       not greater than the value in column 1 of that entry;
    5. plot the scores against ``ae.threshold``, draw the ROC curve and save
       the scores to ``lstm_128_units.csv`` in ``score_dir_path``.
    """
    # ================ read training dataset ====================
    data_dir_path = (r'/Users/Shariful/Documents/SysCallDataset/PreparedData'
                     r'/Canali_dataset/sliding_window_5')
    model_dir_path = (r'/Users/Shariful/Documents/GitHubRepo/deeplearning/'
                      r'syscall_anomaly/Canali/trained_models')
    score_dir_path = (r'/Users/Shariful/Documents/GitHubRepo/deeplearning/'
                      r'syscall_anomaly/Canali/scores')

    canali_data = pd.read_csv(data_dir_path + '/train_set.csv', header=None)

    # ================ fit the LSTM model =======================
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in both old and new pandas releases.  No scaling is applied here.
    canali_np_data = canali_data.values

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    ae.fit(canali_np_data, model_dir_path=model_dir_path, batch_size=1000,
           epochs=20, estimated_negative_sample_ratio=None)

    # ================ load the saved model =====================
    ae.load_model(model_dir_path)

    # ================ read test dataset ========================
    test_idx_path = data_dir_path + '/test_set_index_range_label.csv'
    df_test_idx = pd.read_csv(test_idx_path, header=None)

    test_path = data_dir_path + '/test_set.csv'
    df_test = pd.read_csv(test_path, header=None)
    df_test_np = df_test.values

    test_labels = np.array(df_test_idx.iloc[:, -1])

    # ================ predict scores on testing set ============
    anomaly_information = ae.anomaly(df_test_np, threshold=150)

    # Column 1 of the index file is the last row position belonging to each
    # entry.  It is read once here instead of building a row Series with
    # .loc on every loop iteration.
    range_ends = df_test_idx[1].values

    idx_out = 0
    max_scores = np.zeros((df_test_idx.shape[0]))
    for idx_in, (_is_anomaly, dist) in enumerate(anomaly_information):
        # finding the maximum score out of all subsequences' scores
        if idx_in <= range_ends[idx_out]:
            if max_scores[idx_out] < dist:
                max_scores[idx_out] = dist
        else:
            # first row of the next entry: start its maximum from this score
            idx_out += 1
            max_scores[idx_out] = dist

    visualize_reconstruction_error(max_scores, ae.threshold)

    # draw the roc curve
    plot_ROC(test_labels, max_scores)

    # save the computed scores
    np.savetxt(score_dir_path + '/lstm_128_units.csv', max_scores, delimiter=",")