Example no. 1
def load_folds(dataroot, fold_prefix, K):
    df_list = [
        pd.read_csv(join(dataroot, fold_prefix.format(i)),
                    usecols=get_cols4ml(),
                    dtype=get_dtype4normalized()) for i in range(K)
    ]

    fold_data = [(df.drop(columns=['Label']).values, encode_label(df.Label.values))
                 for df in df_list]
    return fold_data
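
A minimal usage sketch for load_folds, assuming the same project helpers (pd, join, get_cols4ml, get_dtype4normalized, encode_label) are importable; the cross-validation loop below is hypothetical and only shows how the per-fold (features, labels) tuples can be recombined into train/validation splits.

import numpy as np

# hypothetical K-fold iteration over the tuples returned by load_folds()
def iterate_folds(fold_data):
    for k, (x_val, y_val) in enumerate(fold_data):
        # all remaining folds become the training split
        x_train = np.concatenate([x for i, (x, _) in enumerate(fold_data) if i != k], axis=0)
        y_train = np.concatenate([y for i, (_, y) in enumerate(fold_data) if i != k], axis=0)
        yield k, (x_train, y_train), (x_val, y_val)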
Example no. 2
    def __init__(self, csv_file):
        self.csv_file = csv_file
        self.num_records = self.get_num_records(csv_file)

        df = pd.read_csv(csv_file,
                         engine='c',
                         usecols=get_cols4ml(),
                         dtype=get_dtype4normalized())
        self.x = torch.FloatTensor(df.drop(columns=['Label']).values)
        self.y = torch.LongTensor(self.encode_label(df.Label.values))
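
The snippet above shows only the constructor of what looks like a map-style PyTorch dataset. A minimal sketch of the remaining pieces, under the assumption that the class follows the standard Dataset protocol (the class name and batch size below are illustrative, not taken from the source):

import torch
from torch.utils.data import Dataset, DataLoader

class FlowTensorDataset(Dataset):  # hypothetical name; mirrors the __init__ above
    def __init__(self, x, y):
        self.x = x  # FloatTensor of feature rows
        self.y = y  # LongTensor of encoded labels

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

# typical consumption: loader = DataLoader(dataset, batch_size=1024, shuffle=True)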
Example no. 3
    def __init__(self, csv_file, chunksize=10**4):
        self.csv_file = csv_file
        self.chunksize = chunksize
        self.seen_so_far = 0  # number of flow records seen so far
        self.seen_chunks = 0
        self.iterableReader = pd.read_csv(csv_file,
                                          engine='c',
                                          usecols=get_cols4ml(),
                                          dtype=get_dtype4normalized(),
                                          chunksize=chunksize)

        label_to_id, id_to_label, _ = get_ids18_mappers()
        self.label_to_id = label_to_id
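
The constructor above only prepares the chunked reader and the label mapper. A hedged sketch of how such a reader is usually consumed, converting each pandas chunk into tensors and updating the bookkeeping counters (the function below is illustrative; the project's actual method is not shown):

import torch

# hypothetical consumption of the chunked reader prepared in __init__
def iter_chunks(dataset):
    for chunk in dataset.iterableReader:
        y = chunk.Label.map(dataset.label_to_id).values  # encode labels via the mapper
        x = chunk.drop(columns=['Label']).values
        dataset.seen_so_far += chunk.shape[0]
        dataset.seen_chunks += 1
        yield torch.FloatTensor(x), torch.LongTensor(y)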
Example no. 4
    def load_val_dataset(self, val_filename, num_records):
        print("=========loading validation dataset========")
        #skip_ratio = 0.95
        #skip_idx = np.random.choice(num_records, int(num_records*skip_ratio))  # skip 95% of rows
        #skip_idx = skip_idx[skip_idx!=0]
        #df = pd.read_csv(val_filename, usecols=get_cols4ml(), skiprows=skip_idx )
        #print(df.Label.value_counts())
        df = pd.read_csv(val_filename, usecols=get_cols4ml(), nrows=400000)
        print(df.Label.value_counts())

        y = encode_label(df.Label.values)
        y = torch.LongTensor(y)
        x = torch.FloatTensor(df.drop(columns=['Label']).values)

        dataset = utils.TensorDataset(x, y)
        loader = utils.DataLoader(dataset, batch_size=1024 * 4)
        return loader
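
The loader returned above is typically consumed by an evaluation loop; a small sketch under the assumption of a classifier that returns class logits (the model and the accuracy metric are placeholders, not part of the source):

import torch

# hypothetical validation pass over the DataLoader returned by load_val_dataset()
def validate(model, loader, device='cpu'):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in loader:
            logits = model(x.to(device))
            pred = logits.argmax(dim=1)
            correct += (pred == y.to(device)).sum().item()
            total += y.shape[0]
    return correct / max(total, 1)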
Example no. 5
def evaluate(traindir, testdir, classifier_name):
    pred_any_list = []
    pred_majority_list = []
    pred_all_list = []
    y_test_perflowid_list = []

    pre_fingerprint = join(traindir, 'c_{}'.format(classifier_name))
    balancing_technique = get_balancing_technique()
    label_to_id, id_to_label = get_ddos19_mappers()

    filenames = [
        'LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv', 'SYN.csv', 'UDP.csv',
        'UDP-Lag.csv', 'records.csv'
    ]

    total_prediction_time = 0
    total_records = 0

    for fn in filenames:
        print("---------------------------")
        print("Reading {}".format(fn))
        tick = time.time()
        test_df = pd.read_csv(
            join(testdir, fn),
            usecols=get_cols4ml())  # reads in ~2 min and needs ~14 GB of memory
        tock = time.time()
        input_dim = test_df.shape[1] - 2  # 'Flow ID' and 'Label' are dropped
        num_class = len(label_to_id.keys())
        print("Read {} records in {:.2f} min".format(test_df.shape[0],
                                                     (tock - tick) / 60.))
        if test_df.shape[0] < 1:
            continue
        test_df = test_df.sort_values(
            by=['Flow ID', 'Label'])  # makes grouping faster and enables per-flow-ID prediction
        dummy_num_records = test_df.shape[0]
        class_weight = None
        classifier_args, config = get_args(classifier_name, dummy_num_records,
                                           num_class, input_dim, class_weight,
                                           balancing_technique)
        # directories for results
        train_fingerprint = join(
            traindir, 'c_{}'.format(classifier_name +
                                    config))  # fingerprint already there
        logdir = join(train_fingerprint, 'log')  #already there
        runs_dir = join(logdir, 'runs')
        test_df = normalize_df(test_df, join(runs_dir, 'data_stats.pickle'))

        fingerprint = join(testdir,
                           'c_{}'.format(classifier_name +
                                         config))  # fingerprint already there
        #create classifier
        loader = ClassifierLoader()
        classifier_args['runs_dir'] = runs_dir
        clf = loader.load(classifier_args)

        # predict part
        print("Grouping data \r")
        tick = time.time()
        test_flowids, y_test_perflowid_str, grouped, group_sizes = group_data(
            test_df)
        test_df = test_df.drop(columns=['Flow ID', 'Label'])
        tock = time.time()
        print("Done. In {:.0f}min".format((tock - tick) / 60.))

        y_test_perflowid = encode_label(y_test_perflowid_str, label_to_id)

        pred_any, pred_majority, pred_all, prediction_time = predict_per_flow(
            classifier_name, clf, grouped, test_df, y_test_perflowid,
            group_sizes)  # takes 2-3 min

        total_prediction_time += prediction_time
        total_records += test_df.shape[0]

        pred_any_list += pred_any
        pred_majority_list += pred_majority
        pred_all_list += pred_all

        y_test_perflowid_list += y_test_perflowid

    pd.DataFrame({
        'Records': [total_records],
        'Time': [total_prediction_time]
    }).to_csv(join(testdir, 'timing.csv'), index=False)
    pred_list_tuples = (pred_any_list, pred_majority_list, pred_all_list)
    result_logger_ddos19(fingerprint, y_test_perflowid_list, pred_list_tuples,
                         id_to_label)
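
predict_per_flow and group_data are not shown here. Based on the names, the record-level predictions appear to be aggregated per Flow ID; a plausible sketch of a majority-vote aggregation using the group_sizes returned by group_data (this is an illustration of the voting idea, not the repository's implementation):

import numpy as np
from collections import Counter

# hypothetical helper: split flat record-level predictions into per-flow groups
# and take a majority vote inside each group
def majority_vote_per_flow(record_preds, group_sizes):
    boundaries = np.cumsum(group_sizes)[:-1]
    per_flow = np.split(np.asarray(record_preds), boundaries)
    return [Counter(group).most_common(1)[0][0] for group in per_flow]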
Example no. 6
def classify(dataroot, classifier_name):
    K = 5
    balance = get_balancing_technique()
    train_data = []
    # a single fold holds ~29M records, so 4 folds hold ~120M records;
    # if 20M records need ~5% of RAM, then 120M (≈ 6 × 20M) need ~30%
    print("Reading the data...")
    tick = time.time()
    label_to_id, id_to_label, _ = get_ids18_mappers()
    num_train_records = 0
    print("Reading 4 folds")

    if balance == 'with_loss' or balance == 'no' or balance == 'with_loss_sub':
        regex = 'r_fold_{}.csv'
    elif balance == 'explicit':
        regex = 'bal_fold_{}.csv'

    for fold_index in tqdm(range(K)):
        if fold_index == 0:  # skip fold 0 so that 4 of the 5 folds are used for training
            continue
        # 10**6 rows read in ~9 min; a full fold is ~29 * 10**6 rows
        reader = pd.read_csv(join(dataroot, regex.format(fold_index)),
                             chunksize=10**6,
                             usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())
        # remove the extra header row
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
            num_train_records += df.shape[0]
            print(df.memory_usage(deep=True).sum() * (799902) / (1024 * 1024 * 1024))
    tock = time.time()
    print("read data in {:.2f} sec".format(tock - tick))  # ~24 min in practice

    classifier_args, config = get_args(classifier_name, num_class='dummy', class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))

    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)

    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)
    classifier_args['runs_dir'] = logdir

    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)
    fn = classifier_args['runs_dir'] + '.pkl'
    pickle.dump(clf, open(fn, 'wb'))
    print("Done training {} flow records in {:.2f} sec".format(y_train.shape[0], time.time() - tick))