def shuffle(fn):
    print(fn)
    df = pd.read_csv(fn, dtype=get_dtype4normalized(), engine='c')  # 1 min
    df = df.sample(frac=1, random_state=getSeed(), replace=False)  # 20 sec
    fn_o = fn.replace('fold', 'shuffled_fold')
    tick = time.time()
    df.to_csv(fn_o, index=False, chunksize=10**4)  # 7 min
    print("Wrote in {:.2f}".format(time.time() - tick))
def worker(fn, cnt):
    reader = pd.read_csv(fn, usecols=get_cols4eval(), engine='c',
                         dtype=get_dtype4normalized(), chunksize=10**6)  # 1.5 min
    df = pd.concat([df for df in reader], sort=False)
    print(ntpath.basename(fn), df.Label.value_counts())

    g = df.groupby(['Label'], sort=False)  # 0.00 sec
    new_df = pd.DataFrame(
        g.apply(lambda x: x.sample(cnt, random_state=getSeed(),
                                   replace=True).reset_index(drop=True)))  # 33 sec

    outfile = fn.replace(foldname_prefix, '{}bal_fold_'.format(K))
    new_df = new_df.sample(frac=1, random_state=getSeed(), replace=False)  # shuffling, 1 min
    tick = time.time()
    new_df.to_csv(outfile, chunksize=10**5, index=False)
    print("Written in {:.2f}".format(time.time() - tick))  # 3.5 min for SFS
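# Self-contained sketch of the balancing step in worker() above: per-class
# resampling to a fixed count via groupby + sample. The toy frame and cnt=3
# are illustrative assumptions, not values taken from this repo.
import pandas as pd

toy = pd.DataFrame({'Label': ['A'] * 5 + ['B'] * 2, 'val': range(7)})
balanced = (toy.groupby('Label', sort=False)
               .apply(lambda x: x.sample(3, random_state=0, replace=True))
               .reset_index(drop=True))
print(balanced['Label'].value_counts())  # both A and B appear exactly 3 times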
def caller(dataroot):
    K = 5
    # NOTE: the `dataroot` argument is overridden by a hardcoded path here
    # dataroot = '/data/juma/data/ids18/CSVs/WS_l_old'
    dataroot = '/data/juma/data/ids18/CSVs_r_1.0/SR_10/RPS_SI_10_l'
    print(ntpath.basename(dataroot))

    # loading flow ids, takes 100 sec
    tick = time.time()
    df_list = []
    for i, fn in enumerate(tqdm(glob(join(dataroot, '*Meter.csv')))):
        df = pd.read_csv(fn, usecols=['Flow ID', 'Label'],
                         dtype={'Flow ID': str, 'Label': str})  # 20 min, for RPS_10
        # 1. drop duplicates, 2. filter out benign records
        df = df.drop_duplicates(subset=['Flow ID', 'Label'])
        df = df[df['Label'] != 'Benign']
        df_list.append(df)
    df = pd.concat(df_list, sort=False)
    print("Flow ids are read in {:.2f} sec".format(time.time() - tick))  # 6 min, 2 min for RPS_10

    tick = time.time()
    flowids, flowlabels = get_flowids_and_labels(df)
    tock = time.time()
    print('obtained UNIQUE flowid and labels in {:.2f} sec'.format(tock - tick))  # 1100 sec for RPS

    skf = StratifiedKFold(n_splits=K, random_state=getSeed(), shuffle=True)
    tick = time.time()
    flowids_per_fold = []
    for fold_index, (train_index, test_index) in enumerate(
            skf.split(np.zeros(len(flowlabels)), flowlabels)):
        if fold_index >= 3:  # resume: folds 0-2 were produced in an earlier run
            tock = time.time()
            print("-------------{}---------------Kfold split took: {:.2f} sec".format(
                fold_index, tock - tick))
            tick = time.time()
            test_flowids = [flowids[i] for i in test_index]
            unique, counts = np.unique(flowlabels[test_index], return_counts=True)
            # print("Testing fold ", fold_index, get_overlap(test_flowids, flowids[train_index]))
            make_fold_i(dataroot, test_flowids, fold_index)
            print("Fold #{} is done in {:.2f}".format(fold_index, time.time() - tick))
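# Minimal sketch of the flow-level stratified split in caller() above:
# StratifiedKFold sees one entry per UNIQUE flow id rather than per record,
# so all records of a flow land in the same fold. Toy ids and labels only.
import numpy as np
from sklearn.model_selection import StratifiedKFold

toy_flowids = np.array(['f0', 'f1', 'f2', 'f3', 'f4', 'f5'])
toy_labels = np.array(['Bot', 'Bot', 'Bot', 'DoS', 'DoS', 'DoS'])
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
for fold, (_, test_idx) in enumerate(skf.split(np.zeros(len(toy_labels)), toy_labels)):
    print(fold, toy_flowids[test_idx], toy_labels[test_idx])  # class ratios preserved per fold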
def split_normal(dataroot):
    tick = time.time()
    for csv_file in glob(join(dataroot, '*Meter.csv')):
        df = chunk_read(csv_file)
        df = df.sort_values(['Flow ID'])  # cannot shuffle due to approx split
        df = df[df['Label'] == 'Benign']

        flowids = np.sort(df['Flow ID'].unique())
        np.random.seed(getSeed())
        np.random.shuffle(flowids)

        n = len(flowids) // K
        for i in range(NUM_OF_FOLDS):
            fn = join(dataroot, foldname_regex.format(i))
            fids_fold = flowids[i*n:(i+1)*n]
            df_p = df.loc[df['Flow ID'].isin(fids_fold)].copy()
            normalize_n_write_normal(df_p, fn)
    print("Normal split is done in {:.2f}".format(time.time() - tick))
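# Toy illustration of the even flow split used in split_normal() above and
# in split_n_write_mal() below: shuffle the unique flow ids, then hand each
# fold a contiguous slice of n = len(flowids)//K ids. The ids and K=3 are
# made-up example values, not repo data.
import numpy as np

ids = np.array(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6'])
np.random.seed(0)
np.random.shuffle(ids)
K = 3
n = len(ids) // K  # n = 2; with 7 ids, the leftover id is never assigned
print([ids[i*n:(i+1)*n].tolist() for i in range(K)])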
def split_n_write_mal(csv_files, label, dataroot):
    df_ls = []
    for csv_filename in csv_files:
        fn = join(dataroot, csv_filename)
        df_i = chunk_read(fn)
        df_ls.append(df_i)
    df = pd.concat(df_ls, sort=False)

    df = df[df['Label'] == label]
    if label not in df.Label.unique():  # no records for this category
        return
    assert len(df.Label.unique()) == 1, "There should be only one label {}".format(df.Label.unique())
    print(label, df.Label.value_counts()[label])

    flowids = np.sort(df['Flow ID'].unique())
    np.random.seed(getSeed())
    np.random.shuffle(flowids)  # FLOW shuffle reduces bias in the data split, while FLOW-RECORD shuffle reduces bias in the model

    num_flows = len(flowids)
    if num_flows < K:
        print("Category {1} has less than K flows: {0}".format(num_flows, label))
        return

    n = num_flows // K
    folds_df = []
    for i in range(NUM_OF_FOLDS):
        fn = join(dataroot, foldname_regex.format(i))
        fold_fids = flowids[i*n:(i+1)*n]
        fold_df = df.loc[df['Flow ID'].isin(fold_fids)].copy()
        fold_df = normalize_df(fold_df)
        folds_df.append(fold_df)
        # append to the fold file, writing the header only when the file is new or empty
        if not os.path.exists(fn) or os.path.getsize(fn) == 0:
            fold_df.to_csv(fn, index=False)
        else:
            fold_df.to_csv(fn, mode='a', header=False, index=False)
    return folds_df
def make_fold(dataroot):
    fraction = 1
    file_ending = '*Meter.csv'
    K = 5
    outputdir = join(dataroot, 'folds_fraction_{}'.format(fraction))
    ensure_dir(outputdir)

    df = read_data(dataroot, file_ending, fraction=fraction)
    df = normalize_df(df, join(outputdir, 'data_stats.pickle'), train_data=True)
    flowids, flowlabels, grouped = group_data(df, K, outputdir)

    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=getSeed())
    for fold_index, (train_index, test_index) in enumerate(skf.split(flowids, flowlabels)):
        print("Fold - ", fold_index)
        test_flowids = flowids[test_index]
        fold_df = get_flow_records(test_flowids, df, grouped)
        fold_df.to_csv(join(outputdir, 'fold_{}.csv'.format(fold_index)),
                       index=False, encoding='utf-8-sig')
def fold_worker(fn, b, m, d):
    # Assumptions:
    #  1. benign records are at the beginning of each fold
    #  2. the number of benign records is `b`
    #  3. the number of duplicates made in over/under-sampling is `d`
    #  4. the number of duplicated malicious records is `m`
    # Then keeping `m+d` benign records ensures we have as many records as
    # in the balanced case, i.e. we remove the last `b-(m+d)` benign
    # records, split evenly across the folds.
    K = 5  # num of folds
    num_to_exclude = (b - (m + d)) // K

    tick = time.time()
    df = pd.read_csv(fn, engine='c', dtype=get_dtype4normalized())  # 4~5 min
    print("Read fold in {:.2f} min".format((time.time() - tick) / 60))

    N = df.shape[0]
    r_df = df[:N - num_to_exclude]
    sh_df = r_df.sample(frac=1, random_state=getSeed(), replace=False)

    outfile = fn.replace(in_fold, out_fold)
    assert fn != outfile, "outfile is same as input file {}".format(ntpath.basename(fn))
    sh_df.to_csv(outfile, chunksize=10**5, index=False)
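# Worked example of the exclusion arithmetic in fold_worker() (toy numbers,
# not taken from the dataset): with b=1000 benign records, m=100 duplicated
# malicious records and d=50 over/under-sampling duplicates, each of the
# K=5 folds drops its trailing (1000 - (100 + 50)) // 5 = 170 benign rows.
b, m, d, K = 1000, 100, 50, 5
assert (b - (m + d)) // K == 170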
import utils
from os import path
import numpy as np
from scipy import stats, sparse
from tqdm import tqdm
from sklearn.cluster import AgglomerativeClustering
import statsmodels.api as sm
import matplotlib.pyplot as plt

## Set a random seed to make it reproducible!
np.random.seed(utils.getSeed())
utils.set_mpl_params()

# load up data:
x, y = utils.load_feature_and_label_matrices(type='morgan')
## select a subset of columns of 'y' to use as a test matrix:
# this is the same each time thanks to setting the random seed.
col_indices = np.random.choice(243, 10, replace=False)
x_, y_ = utils.get_subset(x, y, indices=col_indices)

# This will be used for clustering:
distance_matrix = utils.fast_dice(x_)

# choose a random target:
idx = np.random.choice(y_.shape[1])
all_positive_indices = (y_[:, idx] == 1).nonzero()[0]
pos_test_counts = {index: 0 for index in all_positive_indices}
import os
from sklearn import metrics
from os.path import join
import numpy as np
import time
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn

from dataset_loader import FlowRecordDataset
from utils import get_cols4ml, encode_label
from utils import getSeed
import ntpath

LARGEST_BATCH_SIZE = 4096
torch.manual_seed(getSeed())


class Softmax(nn.Module):
    def __init__(self, input_dim, num_classes):
        print("Initializing softmax")
        super(Softmax, self).__init__()
        self.classifier = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        # returns raw logits; the loss (e.g. nn.CrossEntropyLoss) applies softmax
        output = self.classifier(x)
        return output


class CNN2(nn.Module):
    # reference architecture: https://github.com/vinayakumarr/Network-Intrusion-Detection/blob/master/UNSW-NB15/CNN/multiclass/cnn2.py
import pandas as pd
from utils import getSeed
from glob import glob
from os.path import join
import ntpath

SEED = getSeed()


def read_flowIDs(dataroot, file_ending='*Meter.csv', fraction=1):
    filenames = [i for i in glob(join(dataroot, file_ending))]
    df_list = []
    for f in filenames:
        print("reading ", ntpath.basename(f))
        df = pd.read_csv(f, usecols=['Flow ID', 'Label'],
                         dtype={'Flow ID': str, 'Label': str})
        df_list.append(df.sample(frac=fraction, random_state=SEED))
    combined_csv = pd.concat(df_list, sort=False)
    return combined_csv


def group_data(df, K):
    # remove classes with fewer than K items
    print("Grouping to remove classes smaller than K")
    labels = [
        label
        for (flowid, label) in df.groupby(['Flow ID', 'Label']).groups.keys()
    ]
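    # Illustration of the groupby-keys trick above (toy data, not from the
    # repo): grouping on ['Flow ID', 'Label'] yields one (flowid, label)
    # tuple per unique pair, so e.g. Flow IDs ['a', 'a', 'b'] with Labels
    # ['Bot', 'Bot', 'DoS'] give keys [('a', 'Bot'), ('b', 'DoS')],
    # i.e. exactly one label per flow.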