def shuffle(fn):
    print(fn)
    df = pd.read_csv(fn, dtype=get_dtype4normalized(), engine='c') # 1min

    df = df.sample(frac=1, random_state = getSeed(), replace=False)# 20 sec

    fn_o = fn.replace('fold','shuffled_fold')
    tick = time.time()
    df.to_csv(fn_o,index=False, chunksize=10**4) # 7min
    print("Wrote in {:.2f}".format(time.time()-tick))
def worker(fn, cnt):
    reader = pd.read_csv(fn,
                         usecols=get_cols4eval(),
                         engine='c',
                         dtype=get_dtype4normalized(),
                         chunksize=10**6)  # 1.5 min

    df = pd.concat([df for df in reader], sort=False)
    print(ntpath.basename(fn), df.Label.value_counts())
    g = df.groupby(['Label'], sort=False)  # 0.00 sec
    new_df = pd.DataFrame(
        g.apply(lambda x: x.sample(cnt, random_state=getSeed(), replace=True).
                reset_index(drop=True)))  # 33 sec
    outfile = fn.replace(foldname_prefix, '{}bal_fold_'.format(K))
    new_df = new_df.sample(frac=1, random_state=getSeed(),
                           replace=False)  # shuffling, 1min
    tick = time.time()
    new_df.to_csv(outfile, chunksize=10**5, index=False)
    print("Written in {:.2f} ".format(time.time() - tick))  # 3.5 mins for SFS
def caller(dataroot):
    K = 5
    #dataroot = '/data/juma/data/ids18/CSVs/WS_l_old'
    dataroot = '/data/juma/data/ids18/CSVs_r_1.0/SR_10/RPS_SI_10_l'  # hardcoded path overrides the dataroot argument
    print(ntpath.basename(dataroot))

    # loading flow ids, takes 100 sec
    tick = time.time()
    df_list = []
    for i, fn in enumerate(tqdm(glob(join(dataroot, '*Meter.csv')))):
        df = pd.read_csv(fn,
                         usecols=['Flow ID', 'Label'],
                         dtype={
                             'Flow ID': str,
                             'Label': str
                         })  #20min, for RPS_10:
        # 1. drop duplicates  2. filter out Benign records
        df = df.drop_duplicates(subset=['Flow ID', 'Label'])
        df = df[df['Label'] != 'Benign']
        df_list.append(df)
    df = pd.concat(df_list, sort=False)
    print("Flow ids are read in {:.2f} sec".format(
        time.time() - tick))  # data is read in 6 min, 2 min for RPS_10

    tick = time.time()
    flowids, flowlabels = get_flowids_and_labels(df)
    tock = time.time()
    print('obtained UNIQUE flowid and labels in {:.2f} sec'.format(
        tock - tick))  # 1100sec for RPS
    skf = StratifiedKFold(n_splits=K, random_state=getSeed(), shuffle=True)
    tick = time.time()
    flowids_per_fold = []
    for fold_index, (train_index, test_index) in enumerate(
            skf.split(np.zeros(len(flowlabels)), flowlabels)):
        if fold_index >= 3:  # only folds 3 and 4 of the 5-fold split are materialized here
            tock = time.time()
            print("-------------{}---------------Kfold split took: {:.2f} sec".
                  format(fold_index, tock - tick))
            tick = time.time()
            test_flowids = [flowids[i] for i in test_index]
            unique, counts = np.unique(flowlabels[test_index],
                                       return_counts=True)
            #print("Testing fold ", fold_index, get_overlap(test_flowids,flowids[train_index]))
            make_fold_i(dataroot, test_flowids, fold_index)
            print("Fold #{} is done in {:.2f}".format(fold_index,
                                                      time.time() - tick))
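# Assumed behaviour of get_flowids_and_labels() (the real helper lives in the
# project's utils and may differ): collapse to unique (Flow ID, Label) pairs and
# return labels as an ndarray, since caller() indexes flowlabels with test_index.
import numpy as np

def get_flowids_and_labels(df):
    uniq = df.drop_duplicates(subset=['Flow ID', 'Label'])
    flowids = uniq['Flow ID'].tolist()
    flowlabels = np.asarray(uniq['Label'].tolist())
    return flowids, flowlabels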
def split_normal(dataroot):
    tick = time.time()
    for csv_file in glob(join(dataroot, '*Meter.csv')):
        df = chunk_read(csv_file)
        df = df.sort_values(['Flow ID'])  # cannot shuffle due to approx split
        df = df[df['Label'] == 'Benign']

        flowids = np.sort(df['Flow ID'].unique())
        np.random.seed(getSeed())
        np.random.shuffle(flowids)

        n = len(flowids) // K
        for i in range(NUM_OF_FOLDS):
            fn = join(dataroot, foldname_regex.format(i))
            fids_fold = flowids[i*n:(i+1)*n]
            df_p = df.loc[df['Flow ID'].isin(fids_fold)].copy()
            normalize_n_write_normal(df_p, fn)
    print("Normal split is done in {:.2f}".format(time.time()-tick))
def split_n_write_mal(csv_files, label, dataroot):
    df_ls = []
    for csv_filename in csv_files:
        fn = join(dataroot, csv_filename)
        df_i = chunk_read(fn)
        df_ls.append(df_i)
    df = pd.concat(df_ls, sort=False)
    if label not in df.Label.unique():
        return
    df = df[df['Label'] == label]

    assert len(df.Label.unique()) == 1, "There should be only one label {}".format(df.Label.unique())
    print(label, df.Label.value_counts()[label])

    flowids = np.sort(df['Flow ID'].unique())
    np.random.seed(getSeed())
    np.random.shuffle(flowids) # FLOW shuffle reduces bias in data split while FLOWRECORD shuffle reduces bias in model
    num_flows = len(flowids)
    if num_flows<K:
        print("Category {1} has less than K flows: {0} ".format(num_flows,label))
        return

    n = num_flows // K
    folds_df = []
    for i in range(NUM_OF_FOLDS):
        fn = join(dataroot, foldname_regex.format(i))
        fold_fids = flowids[i*n:(i+1)*n]

        fold_df = df.loc[df['Flow ID'].isin(fold_fids)].copy()
        fold_df = normalize_df(fold_df)
        folds_df.append(fold_df)

        # append this label's records to its fold file; write the header only when
        # the file does not exist yet or is still empty
        fsize = os.path.getsize(fn) if os.path.isfile(fn) else 0
        if fsize == 0:
            fold_df.to_csv(fn, index=False)
        else:
            fold_df.to_csv(fn, mode='a', header=False, index=False)
    return folds_df
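# Hedged sketch of a driver for split_n_write_mal(): which CSV files contain which
# attack label is dataset specific, so both arguments below are assumptions.
def split_all_malicious(dataroot, csv_files, attack_labels):
    for label in attack_labels:
        split_n_write_mal(csv_files, label, dataroot)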
def make_fold(dataroot):
    fraction = 1
    file_ending = '*Meter.csv'
    K = 5

    outputdir = join(dataroot, 'folds_fraction_{}'.format(fraction))
    ensure_dir(outputdir)

    df = read_data(dataroot, file_ending, fraction=fraction)
    df = normalize_df(df,
                      join(outputdir, 'data_stats.pickle'),
                      train_data=True)
    flowids, flowlabels, grouped = group_data(df, K, outputdir)

    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=getSeed())
    for fold_index, (train_index,
                     test_index) in enumerate(skf.split(flowids, flowlabels)):
        print("Fold - ", fold_index)
        test_flowids = flowids[test_index]
        fold_df = get_flow_records(test_flowids, df, grouped)
        fold_df.to_csv(join(outputdir, 'fold_{}.csv'.format(fold_index)),
                       index=False,
                       encoding='utf-8-sig')
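# Assumed shape of get_flow_records() used by make_fold(); the real helper may
# differ. 'grouped' is taken to be a df.groupby('Flow ID') object built in
# group_data(), so a fold is the concatenation of the held-out flow ids' records.
import pandas as pd

def get_flow_records(test_flowids, df, grouped):
    parts = [grouped.get_group(fid) for fid in test_flowids]
    return pd.concat(parts, sort=False)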
def fold_worker(fn, b,m,d):
    #Assumptions:
    # 1. Benign records are in the beginning of each fold
    # 2. num of benign records is `b`
    # 3. num of duplicates made in over/under sampling is `d`
    # 4. num of malicious duplicated records  `m`

    # then, using `m+d` benign records ensures we have equal #records against balancing case
    # which means we should remove last `b-(m+d)` benign records from fold
    K = 5  # num of folds
    num_to_exclude = (b - (m + d)) // K

    tick = time.time()    
    df = pd.read_csv(fn, engine='c', dtype=get_dtype4normalized()) # 4~5 min
    print("Read fold in {:.2f} min".format((time.time()-tick)/60))
    N = df.shape[0]
    r_df = df[:N-num_to_exclude]
    
    sh_df = r_df.sample(frac=1,random_state=getSeed(), replace=False)

    outfile = fn.replace(in_fold, out_fold)
    assert fn!=outfile, "outfile is same as input file {}".format(ntpath.basename(fn))
    sh_df.to_csv(outfile,chunksize=10**5, index=False)
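# Made-up numbers illustrating the bookkeeping in fold_worker() (not dataset values):
b, m, d = 1_000_000, 200_000, 50_000   # benign records, malicious duplicates, sampling duplicates
K = 5
num_to_exclude = (b - (m + d)) // K    # 150_000 benign records dropped from the fold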
import utils
from os import path

import numpy as np
from scipy import stats, sparse

from tqdm import tqdm
from sklearn.cluster import AgglomerativeClustering
import statsmodels.api as sm
import matplotlib.pyplot as plt

##Set a random seed to make it reproducible!
np.random.seed(utils.getSeed())
utils.set_mpl_params()

#load up data:
x, y = utils.load_feature_and_label_matrices(type='morgan')
##select a subset of columns of 'y' to use as a test matrix:
#this is the same each time thanks to the fixed random seed set above.
col_indices = np.random.choice(243, 10, replace=False)
x_, y_ = utils.get_subset(x, y, indices=col_indices)


#This will be used for clustering:
distance_matrix = utils.fast_dice(x_)


#choose a random target:
idx = np.random.choice(y_.shape[1])
all_positive_indices = (y_[:,idx]==1).nonzero()[0]
pos_test_counts = {index: 0 for index in all_positive_indices}
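# One plausible continuation (an assumption, not the original script): cluster the
# fingerprints on the precomputed Dice distance matrix and hold out whole clusters,
# so train and test molecules are structurally dissimilar.
clusterer = AgglomerativeClustering(n_clusters=10, linkage='complete',
                                    metric='precomputed')  # use affinity= on sklearn < 1.2
cluster_ids = clusterer.fit_predict(distance_matrix)

test_clusters = np.random.choice(np.unique(cluster_ids), 2, replace=False)
test_mask = np.isin(cluster_ids, test_clusters)
train_indices, test_indices = (~test_mask).nonzero()[0], test_mask.nonzero()[0]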
import os
from sklearn import metrics
from os.path import join
import numpy as np
import time
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn

from dataset_loader import FlowRecordDataset
from utils import get_cols4ml, encode_label
from utils import getSeed
import ntpath

LARGEST_BATCH_SIZE = 4096
torch.manual_seed(getSeed())


class Softmax(nn.Module):
    def __init__(self, input_dim, num_classes):
        print("Initializing softmax")
        super(Softmax, self).__init__()
        self.classifier = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        output = self.classifier(x)
        return output
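# Minimal usage sketch of the Softmax classifier above; input_dim, num_classes and
# the optimizer settings are placeholders, and the real training loop lives elsewhere.
model = Softmax(input_dim=78, num_classes=14)
criterion = nn.CrossEntropyLoss()   # expects raw logits, so no softmax layer is needed
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

x = torch.randn(LARGEST_BATCH_SIZE, 78)
y = torch.randint(0, 14, (LARGEST_BATCH_SIZE,))
loss = criterion(model(x), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()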


class CNN2(nn.Module):
    # reference architecture:  https://github.com/vinayakumarr/Network-Intrusion-Detection/blob/master/UNSW-NB15/CNN/multiclass/cnn2.py
import pandas as pd
from utils import getSeed
from glob import glob
from os.path import join
import ntpath
SEED = getSeed()


def read_flowIDs(dataroot, file_ending='*Meter.csv', fraction=1):
    filenames = glob(join(dataroot, file_ending))
    df_list = []
    for f in filenames:
        print("reading ", ntpath.basename(f))
        df = pd.read_csv(f,
                         usecols=['Flow ID', 'Label'],
                         dtype={
                             'Flow ID': str,
                             'Label': str
                         })
        df_list.append(df.sample(frac=fraction, random_state=SEED))
    combined_csv = pd.concat(df_list, sort=False)
    return combined_csv


def group_data(df, K):
    # remove classes with fewer than K items
    print("Grouping to remove classes smaller than K")
    labels = [
        label
        for (flowid, label) in df.groupby(['Flow ID', 'Label']).groups.keys()
    ]