def train(dataroot, classifier_name):
    """Train a classifier on the full DDoS19 training set and save it.

    Reads the whole dataset into memory, normalizes it (persisting the
    normalization stats for later evaluation), encodes labels, and hands
    off to ``train_and_save_classifier``.

    Args:
        dataroot: directory containing the raw data and where the
            classifier fingerprint directory is created.
        classifier_name: key used by ``get_args`` to build classifier config.
    """
    print("Reading the data...")
    df = read_ddos_data(dataroot)  # takes 10GB RAM, loads in 68 seconds
    print("read data of shape ", df.shape)
    label_to_id, id_to_label = get_ddos19_mappers()
    balancing_technique = get_balancing_technique()
    # Label and Flow ID columns are dropped from X, hence the -2.
    input_dim = df.shape[1] - 2
    num_class = len(label_to_id)
    WS_flow_count = 13684951  # 13.7 mln records on PCAP-01-12
    class_weight = None
    classifier_args, config = get_args(classifier_name, WS_flow_count,
                                       num_class, input_dim, class_weight,
                                       balancing_technique)
    # Fingerprint directory uniquely identifies this classifier + config.
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    runs_dir = join(logdir, 'runs')
    ensure_dir(runs_dir)
    # train_data=True makes normalize_df compute and persist the stats
    # that evaluation later reloads from the same pickle.
    df = normalize_df(df, join(runs_dir, 'data_stats.pickle'), train_data=True)
    X_train, y_train = df_to_array(df)
    y_train = encode_label(y_train, label_to_id)
    classifier_args['runs_dir'] = runs_dir
    train_and_save_classifier(X_train, y_train, classifier_args)
def train(dataroot, classifier_name='cnn'):
    """Run K-fold training: for each holdout fold, fit on the other folds.

    Loads all K folds into memory, then for every ``test_index`` trains a
    fresh classifier on the remaining K-1 folds and pickles it under a
    per-holdout log directory.

    Args:
        dataroot: directory containing the pre-split fold CSV files.
        classifier_name: key used by ``get_args`` to build classifier config.
    """
    balance = get_balancing_technique()
    K = 10
    # Balanced folds use the '<K>bal_fold_{}.csv' naming scheme, otherwise
    # the random-sampled '<K>r_fold_{}.csv' scheme.
    fold_prefix = '{}bal_fold_{{}}.csv'.format(K) if balance == 'explicit' \
        else '{}r_fold_{{}}.csv'.format(K)
    class_weight = get_class_weights(dataroot)
    classifier_args, config = get_args(classifier_name, class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    print(fingerprint)
    folds_data = load_folds(dataroot, fold_prefix, K)
    for test_index in range(K):
        print('-----------{}----------'.format(test_index))
        # Concatenate every fold except the holdout into one training set.
        X_train = np.concatenate(
            [fold[0] for i, fold in enumerate(folds_data) if i != test_index],
            axis=0)
        y_train = np.concatenate(
            [fold[1] for i, fold in enumerate(folds_data) if i != test_index],
            axis=0)
        logdir = join(fingerprint, 'log', '{}'.format(test_index))
        ensure_dir(logdir)
        classifier_args['runs_dir'] = logdir
        clf = get_classifier(classifier_args)
        clf.fit(X_train, y_train)
        modelname = join(classifier_args['runs_dir'], 'model.pkl')
        # Use a context manager so the file handle is always closed
        # (the original open() without close leaked the handle).
        with open(modelname, 'wb') as f:
            pickle.dump(clf, f)
def train(dataroot, classifier_name='cnn'):
    """K-fold training driver that streams fold CSVs into the classifier.

    For each holdout fold, the first remaining fold becomes the validation
    set and the rest become training CSVs; the classifier reads the files
    itself via ``clf.fit``.
    """
    balance = get_balancing_technique()
    K = 10
    # Naming scheme differs between explicitly balanced and random folds.
    if balance == 'explicit':
        fold_prefix = '{}bal_fold_{}.csv'
    else:
        fold_prefix = '{}r_fold_{}.csv'
    class_weight = get_class_weights(dataroot)
    classifier_args, config = get_args(classifier_name, class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    print(fingerprint)
    num_epochs = 40
    for test_index in range(K):
        print('-----------{}----------'.format(test_index))
        # Everything except the holdout fold is the development set;
        # its first fold is used for validation, the rest for training.
        dev_indices = [fold for fold in range(K) if fold != test_index]
        val_index, train_indices = dev_indices[0], dev_indices[1:]
        val_csv = join(dataroot, fold_prefix.format(K, val_index))
        list_of_train_csvs = []
        for train_fold in train_indices:
            list_of_train_csvs.append(
                join(dataroot, fold_prefix.format(K, train_fold)))
        logdir = join(fingerprint, 'log', '{}'.format(test_index))
        ensure_dir(logdir)
        classifier_args['runs_dir'] = logdir
        clf = get_classifier(classifier_args)
        clf.fit(list_of_train_csvs, val_csv, num_epochs)
def classify(dataroot, classifier_name):
    """Train a classifier on folds 1..K-1 (fold 0 is held out for testing).

    Reads the fold CSVs in 10**6-row chunks to bound memory, concatenates
    them into one training array, fits the classifier, and pickles it.

    Args:
        dataroot: directory containing the fold CSV files.
        classifier_name: key used by ``get_args`` to build classifier config.

    Raises:
        ValueError: if the configured balancing technique is unknown
            (the original code raised a confusing NameError later instead).
    """
    K = 5
    balance = get_balancing_technique()
    train_data = []
    # single fold ~29M records; 4 folds ~120M records.
    # If 20M records require 5% RAM then 120M records require ~30% memory.
    print("Reading the data...")
    tick = time.time()
    label_to_id, id_to_label, _ = get_ids18_mappers()
    num_train_records = 0
    print("Reading 4 folds ")
    if balance in ('with_loss', 'no', 'with_loss_sub'):
        regex = 'r_fold_{}.csv'
    elif balance == 'explicit':
        regex = 'bal_fold_{}.csv'
    else:
        # Fail fast with a clear message rather than a NameError on `regex`.
        raise ValueError('Unknown balancing technique: {}'.format(balance))
    for fold_index in tqdm(range(K)):
        if fold_index == 0:
            continue  # fold 0 is the test holdout
        # 10**6 rows read in ~9min, total is ~29*10**6 rows.
        reader = pd.read_csv(join(dataroot, regex.format(fold_index)),
                             chunksize=10**6,
                             usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
            num_train_records += df.shape[0]
            # Rough projection of total memory use in GB for all records.
            print(df.memory_usage(deep=True).sum() * (799902) / (1024 * 1024 * 1024))
    tock = time.time()
    print("read data in {:.2f}".format(tock - tick))  # ~24min
    classifier_args, config = get_args(classifier_name, num_class='dummy',
                                       class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)
    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)
    classifier_args['runs_dir'] = logdir
    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)
    fn = classifier_args['runs_dir'] + '.pkl'
    # Context manager so the pickle file handle is always closed.
    with open(fn, 'wb') as f:
        pickle.dump(clf, f)
    print("Done training {} flow records in {:.2f} sec".format(
        y_train.shape[0], time.time() - tick))
def evaluator(dataroot, classifier_name):
    """Evaluate a trained classifier per-flow on the held-out fold_0.csv.

    Groups test records by flow, predicts with three aggregation policies
    (any / majority / all), prints balanced accuracies and logs the
    confusion matrices via ``result_logger_ids18``.
    """
    print('evaluating ', ntpath.basename(dataroot))
    test_csv_file = join(dataroot, 'fold_0.csv')
    # Count records via `wc -l` (cheap for large CSVs).
    # NOTE(review): test_records is computed but never used below.
    result_test = subprocess.run(['wc', '-l', test_csv_file],
                                 stdout=subprocess.PIPE)
    test_records = int(result_test.stdout.split()[0])
    # load Classifier
    # NOTE(review): class weights are loaded even though they should not
    # matter for evaluation — get_args appears to require the argument.
    class_weight = get_class_weights(dataroot)
    num_class = 14  # presumably the number of IDS18 classes — TODO confirm
    classifier_args, config = get_args(classifier_name, num_class,
                                       class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    print('clf fingerprint', ntpath.basename(fingerprint))
    classifier_args['runs_dir'] = join(fingerprint, 'log')
    clf = ClassifierLoader().load(classifier_args)
    # classifier loaded
    # load data; Timestamp is needed to derive the Day column below.
    col_names = get_cols4eval()
    col_names.append('Timestamp')
    df = pd.read_csv(test_csv_file, usecols=col_names,
                     dtype=get_dtype4normalized())
    print("Record distribution:")
    print(df.Label.value_counts())
    # First two characters of the timestamp — presumably the day-of-month;
    # TODO confirm against the timestamp format.
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)  # type string
    # group data; sorting replaces the ordering task in per_flow_eval.
    df = df.sort_values(by=['Flow ID', 'Label'])
    flowids, flowlabels, grouped = group_data(df)
    y = encode_label(flowlabels)
    print("data is grouped and labels are encoded")
    pred_any, pred_maj, pred_all, _ = evaluate_per_flow(clf, y, grouped, df)
    # One confusion matrix per flow-level aggregation policy.
    any_cm = confusion_matrix(y, pred_any)
    maj_cm = confusion_matrix(y, pred_maj)
    all_cm = confusion_matrix(y, pred_all)
    any_acc = metrics.balanced_accuracy_score(y, pred_any)
    maj_acc = metrics.balanced_accuracy_score(y, pred_maj)
    all_acc = metrics.balanced_accuracy_score(y, pred_all)
    print(any_acc, maj_acc, all_acc)
    result_logger_ids18(fingerprint, np.unique(y), (any_cm, maj_cm, all_cm),
                        'test')
def evaluator(dataroot, classifier_name):
    """Time per-flow prediction of the holdout-0 model on a small sample.

    Loads the classifier trained for the first holdout, reads 4096 test
    rows, and prints the timing dictionary returned by ``predict``.
    Only the first holdout is processed (loop exits after one iteration).
    """
    K = 10
    print("\nevaling ", dataroot)
    # Number of distinct labels in the first fold (kept for reference).
    gt_num_class = pd.read_csv(
        join(dataroot, '{}fold_0.csv'.format(K)),
        usecols=['Label'])['Label'].nunique()
    # load Classifier
    classifier_args, config = get_args(classifier_name, class_weight=None)
    print("Balancing technique: ", classifier_args['balance'])
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    logdir = join(fingerprint, 'log')
    gt_classes = None
    for test_index in range(K):
        print("************* Testing holdout ", test_index, '************')
        runs_dir = join(logdir, '{}'.format(test_index))
        print(runs_dir)
        print("with model: ", runs_dir)
        classifier_args['runs_dir'] = runs_dir
        clf = ClassifierLoader().load(classifier_args)
        # classifier loaded; now load a small slice of the holdout data.
        col_names = get_cols4eval()
        col_names.append('Timestamp')
        test_csv_file = join(dataroot, '{}fold_{}.csv'.format(K, test_index))
        df = pd.read_csv(test_csv_file, usecols=col_names,
                         dtype=get_dtype4normalized(), nrows=4096)
        # Derive Day from the leading characters of the timestamp.
        df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)
        df = df.sort_values(by=['Flow ID', 'Day', 'Label'])
        # Done
        timings = predict(df, clf)
        for timing_key, timing_val in timings.items():
            print(timing_key, timing_val)
        break  # only the first holdout is timed
def classify(dataroot, classifier_name):
    """Load all K folds, rebuild the classifier config, and time inference.

    Despite the name, this function does not train: it reads the fold CSVs,
    derives the classifier arguments that were used at training time, loads
    the classifier, and calls ``time_inference`` on the concatenated data.
    """
    K = 5
    fraction = 1
    # total_records = 6907705  # in fold fraction after removing small classes < K
    folds_df = []
    fold_root = join(dataroot, 'folds_fraction_{}'.format(fraction))
    print("Reading the data...")
    ds_list = []
    for fold_index in range(K):
        df = pd.read_csv(join(fold_root, 'fold_{}.csv'.format(fold_index)))
        folds_df.append(df)
        ds_list.append(df.Label)
    total_df = pd.concat(folds_df)
    total_label_df = pd.concat(ds_list)
    labels = total_label_df.sort_values().unique()
    total_records = total_label_df.shape[0]
    # labels, labels_d = get_labels(total_label_df.unique())
    label_to_id, id_to_label, _ = get_ids18_mappers()
    class_weight = get_class_weights(
        encode_label(total_label_df.values, label_to_id))
    balance = get_balancing_technique()
    # Label and Flow ID columns are dropped from X, hence the -2.
    input_dim = folds_df[0].shape[1] - 2
    gt_num_class = len(label_to_id)
    num_class = len(labels)
    # Every known class must appear in the data, otherwise the confusion
    # matrices downstream would be misaligned.
    assert gt_num_class == num_class, \
        'all classess should be observed gt_classes!=observed_classes {}!={}'.format(
            gt_num_class, num_class)
    classifier_args, config = get_args(classifier_name, total_records,
                                       gt_num_class, input_dim, class_weight,
                                       balance)
    pre_fingerprint = join(
        dataroot, 'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))
    fingerprint = pre_fingerprint + '_mem_constrained' + config
    logdir = join(pre_fingerprint + config, 'log')
    runs_dir = get_runs_dir(logdir)
    classifier_args['runs_dir'] = runs_dir
    clf = get_classifier(classifier_args)
    time_inference(classifier_name, clf, total_df, dataroot)
def evaluator(dataroot, classifier_name):
    """Benchmark raw per-record inference speed for the holdout-0 model.

    Loads the classifier for the first holdout, reads 10000 test rows,
    and prints the inference time over ten repeated runs. Only the first
    holdout is processed (loop exits after one iteration).
    """
    K = 10
    print("\nevaling ", dataroot)
    # Distinct label count of fold 0 (kept for reference).
    gt_num_class = pd.read_csv(
        join(dataroot, '{}fold_0.csv'.format(K)),
        usecols=['Label'])['Label'].nunique()
    # load Classifier
    classifier_args, config = get_args(classifier_name, class_weight=None)
    print("Balancing technique: ", classifier_args['balance'])
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    logdir = join(fingerprint, 'log')
    gt_classes = None
    for test_index in range(K):
        print("************* Testing holdout ", test_index, '************')
        runs_dir = join(logdir, '{}'.format(test_index))
        print(runs_dir)
        print("with model: ", runs_dir)
        classifier_args['runs_dir'] = runs_dir
        clf = ClassifierLoader().load(classifier_args)
        # classifier loaded; read a bounded sample of the holdout fold.
        col_names = get_cols4eval()
        col_names.append('Timestamp')
        test_csv_file = join(dataroot, '{}fold_{}.csv'.format(K, test_index))
        df = pd.read_csv(test_csv_file, usecols=col_names,
                         dtype=get_dtype4normalized(), nrows=10000)
        # Features only: drop identifier, label, and timestamp columns.
        x = df.drop(columns=['Flow ID', 'Label', 'Timestamp']).values
        # Done — repeat inference to get stable timing numbers.
        for _ in range(10):
            print(inference(clf, x))
        break  # only the first holdout is benchmarked
def classify(dataroot, classifier_name='cnn'):
    """Train a CNN-style classifier that streams train/val CSVs itself.

    Chooses the balanced or random CSV pair based on the balancing
    technique, counts records with `wc -l`, and calls ``clf.fit`` with the
    file paths, epoch count, and record counts.

    Args:
        dataroot: directory containing the train/validation CSV files.
        classifier_name: key used by ``get_args`` to build classifier config.
    """
    balance = get_balancing_technique()
    print('balancing technique ', balance)
    if balance == 'explicit':
        train_csv = join(dataroot, 'bal_train.csv')
        # no need to use bal_fold because it is shuffled
        val_csv = join(dataroot, 'bal_fold_1.csv')
    else:
        train_csv = join(dataroot, 'r_train.csv')
        val_csv = join(dataroot, 'r_fold_1.csv')
    result_val = subprocess.run(['wc', '-l', val_csv], stdout=subprocess.PIPE)
    result_train = subprocess.run(['wc', '-l', train_csv],
                                  stdout=subprocess.PIPE)
    # Subtract 1 for each file's header row.
    train_records = int(result_train.stdout.split()[0]) - 1
    val_records = int(result_val.stdout.split()[0]) - 1
    print("Number of train and val records ({},{})".format(
        train_records, val_records))
    num_epochs = 40
    label_to_id, id_to_label, _ = get_ids18_mappers()
    # Original code computed class_weight twice; a single call suffices.
    class_weight = get_class_weights(dataroot)
    if balance == 'with_loss_inverse':
        class_weight = 1. / class_weight
    num_class = len(label_to_id)  # we assume all the categories are observed
    classifier_args, config = get_args(classifier_name, num_class,
                                       class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)
    classifier_args['runs_dir'] = logdir
    clf = get_classifier(classifier_args)
    clf.fit(train_csv, val_csv, num_epochs, train_records, val_records)
def classify(dataroot, classifier_name):
    """Evaluate a memory-constrained K-fold classifier per flow.

    For each fold, loads the trained classifier, groups the fold's records
    by flow, predicts with three aggregation policies (any / majority /
    all), logs per-fold confusion matrices, and finally logs the summed
    matrices. For forests, also accumulates and reports feature importance.

    Args:
        dataroot: directory containing fold CSVs and classifier run dirs.
        classifier_name: classifier key; 'forest' enables feature-importance
            reporting.
    """
    K = 5
    fraction = 1
    label_to_id, id_to_label, _ = get_ids18_mappers()
    class_weight = None
    balance = get_balancing_technique()
    # 78 features: Label and Flow ID columns are removed from X.
    input_dim = 78
    gt_num_class = len(label_to_id)
    classifier_args, config = get_args(classifier_name, gt_num_class,
                                       input_dim, class_weight, balance)
    pre_fingerprint = join(
        dataroot, 'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))
    fingerprint = pre_fingerprint + '_mem_constrained' + config
    logdir = join(pre_fingerprint + config, 'log')
    # `np.float` was removed in NumPy >= 1.24; the builtin `float` is the
    # documented equivalent (same float64 dtype).
    cm_any = np.zeros((gt_num_class, gt_num_class), dtype=float)
    cm_majority = np.zeros((gt_num_class, gt_num_class), dtype=float)
    cm_all = np.zeros((gt_num_class, gt_num_class), dtype=float)
    kfold_feature_importance = np.zeros(input_dim, dtype=float)
    for fold_index in range(K):
        print('###################################')
        print("Fold ", fold_index)
        test_df = pd.read_csv(join(dataroot,
                                   'fold_{}.csv'.format(fold_index)))
        runs_dir = join(logdir, 'fold_{}'.format(fold_index))
        # For mem-constrained experiment II we need the same classifier
        # CSVs_r_1 for all memories; the dirname is extracted here in case
        # the substitution below is re-enabled.
        start = runs_dir.find('CSVs_r_')
        end = runs_dir.find('SR_10')
        CSV_dirname = runs_dir[start:end - 1]
        classifier_args['runs_dir'] = runs_dir
        # ----------------
        loader = ClassifierLoader()
        clf = loader.load(classifier_args)
        print("Loaded Classifier!")
        if classifier_name == 'forest':
            kfold_feature_importance += clf.feature_importances_
        flowids_test, y_flowid_test, grouped = group_data(test_df)
        y_flowid_test = encode_label(y_flowid_test, label_to_id)
        pred_any, pred_majority, pred_all, duration = predict_fold(
            classifier_name, clf, test_df, y_flowid_test, grouped, dataroot)
        assert pred_any.shape == pred_majority.shape, \
            "any and majority shapes should be same {},{}".format(
                pred_any.shape, pred_majority.shape)
        acc_pred_any = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_any)
        acc_pred_majority = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_majority)
        acc_pred_all = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_all)
        print(
            "Fold Local Balanced accuracy(any,majority,all): ({:.2f},{:.2f},{:.2f})"
            .format(acc_pred_any, acc_pred_majority, acc_pred_all))
        any_cm_i = confusion_matrix(y_flowid_test, pred_any)
        majority_cm_i = confusion_matrix(y_flowid_test, pred_majority)
        all_cm_i = confusion_matrix(y_flowid_test, pred_all)
        result_logger_ids18(fingerprint, y_flowid_test,
                            (any_cm_i, majority_cm_i, all_cm_i), id_to_label,
                            str(fold_index) + '_')
        # Accumulate per-fold matrices for the averaged report below.
        cm_any += any_cm_i
        cm_majority += majority_cm_i
        cm_all += all_cm_i
    if classifier_name == 'forest':
        print_feature_importance(
            kfold_feature_importance,
            join(dataroot, 'folds_fraction_{}'.format(fraction),
                 'feature_selection.csv'))
    print(dataroot, classifier_name)
    # NOTE(review): y_flowid_test here is from the LAST fold only — verify
    # this is what result_logger_ids18 expects for the averaged matrices.
    result_logger_ids18(fingerprint, y_flowid_test,
                        (cm_any, cm_majority, cm_all), id_to_label, 'avg_')
def evaluate(traindir, testdir, classifier_name):
    """Evaluate a DDoS19-trained classifier on a set of attack-type CSVs.

    For each test file: reads and normalizes the records with the training
    statistics, loads the classifier, groups records per flow, predicts
    with three aggregation policies (any / majority / all), and accumulates
    predictions and timing. Finally writes a timing CSV and logs the
    aggregate results.
    """
    pred_any_list = []
    pred_majority_list = []
    pred_all_list = []
    y_test_perflowid_list = []
    pre_fingerprint = join(traindir, 'c_{}'.format(classifier_name))
    balancing_technique = get_balancing_technique()
    label_to_id, id_to_label = get_ddos19_mappers()
    # One CSV per attack type, plus the combined records file.
    filenames = [
        'LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv', 'SYN.csv', 'UDP.csv',
        'UDP-Lag.csv', 'records.csv'
    ]
    total_prediction_time = 0
    total_records = 0
    for fn in filenames:
        print("---------------------------")
        print("Reading {}".format(fn))
        tick = time.time()
        # read in ~2min, requires ~14GB memory
        test_df = pd.read_csv(join(testdir, fn), usecols=get_cols4ml())
        tock = time.time()
        input_dim = test_df.shape[1] - 2  # Flow ID and Label are dropped
        num_class = len(label_to_id.keys())
        print("Read {} records in {:.2f} min".format(test_df.shape[0],
                                                     (tock - tick) / 60.))
        if test_df.shape[0] < 1:
            continue  # skip empty files
        # Sorting makes grouping faster and allows per-flow-id prediction.
        test_df = test_df.sort_values(by=['Flow ID', 'Label'])
        # Record count is only needed to rebuild the training-time config.
        dummy_num_records = test_df.shape[0]
        class_weight = None
        classifier_args, config = get_args(classifier_name,
                                           dummy_num_records, num_class,
                                           input_dim, class_weight,
                                           balancing_technique)
        # Directories for results; the train fingerprint already exists.
        train_fingerprint = join(traindir,
                                 'c_{}'.format(classifier_name + config))
        logdir = join(train_fingerprint, 'log')  # already there
        runs_dir = join(logdir, 'runs')
        # Normalize with the statistics persisted at training time.
        test_df = normalize_df(test_df, join(runs_dir, 'data_stats.pickle'))
        fingerprint = join(testdir, 'c_{}'.format(classifier_name + config))
        # create classifier
        loader = ClassifierLoader()
        classifier_args['runs_dir'] = runs_dir
        clf = loader.load(classifier_args)
        # predict part
        print("Grouping data \r")
        tick = time.time()
        test_flowids, y_test_perflowid_str, grouped, group_sizes = group_data(
            test_df)
        test_df = test_df.drop(columns=['Flow ID', 'Label'])
        tock = time.time()
        print("Done. In {:.0f}min".format((tock - tick) / 60.))
        y_test_perflowid = encode_label(y_test_perflowid_str, label_to_id)
        # takes 2-3 min
        pred_any, pred_majority, pred_all, prediction_time = predict_per_flow(
            classifier_name, clf, grouped, test_df, y_test_perflowid,
            group_sizes)
        total_prediction_time += prediction_time
        total_records += test_df.shape[0]
        pred_any_list += pred_any
        pred_majority_list += pred_majority
        pred_all_list += pred_all
        y_test_perflowid_list += y_test_perflowid
    # Persist total prediction timing for the whole test set.
    pd.DataFrame({
        'Records': [total_records],
        'Time': [total_prediction_time]
    }).to_csv(join(testdir, 'timing.csv'), index=False)
    pred_list_tuples = (pred_any_list, pred_majority_list, pred_all_list)
    # NOTE(review): `fingerprint` is the value from the LAST processed file;
    # verify that is the intended destination for the aggregate log.
    result_logger_ddos19(fingerprint, y_test_perflowid_list,
                         pred_list_tuples, id_to_label)
def evaluator(dataroot, classifier_name):
    """Full per-record and per-flow evaluation of a classifier on fold_0.

    Loads the classifier (optionally redirecting to the CSVs_r_1.0 run dir
    for the memory-constrained experiment), optionally dumps tree/forest
    attributes, then evaluates per-record and per-flow (any / majority /
    all policies) and logs confusion matrices.
    """
    print(ntpath.basename(dataroot))
    test_csv_file = join(dataroot, 'fold_0.csv')
    # load Classifier
    classifier_args, config = get_args(classifier_name, num_class='dummy',
                                       class_weight=None)
    print("Balancing technique: ", classifier_args['balance'])
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    # String-equality toggle: always-true branch enables the memory
    # constraint experiment; edit the left-hand string to disable it.
    if 'mem_const_exp' == 'mem_const_exp':
        # for mem constraint exp: always load the CSVs_r_1.0 model
        start = logdir.find('CSVs_r')
        end = logdir.find('_m_')
        CSV_dirname = logdir[start:end]
        logdir = logdir.replace(CSV_dirname, 'CSVs_r_1.0')
        # end
    print(logdir)
    classifier_args['runs_dir'] = logdir
    loader = ClassifierLoader()
    clf = loader.load(classifier_args)
    # String-equality toggle: currently FALSE ('noprint...' != 'print...'),
    # so this tree-plotting branch is disabled.
    if 'noprint_clf_attr' == 'print_clf_attr' and 'tree' in classifier_name:
        print("maximum depth of the tree ", clf.tree_.max_depth)
        import matplotlib.pyplot as plt
        from sklearn.tree import plot_tree
        plt.figure()
        plot_tree(clf, filled=True)
        plt.savefig(join(logdir, 'tree_plot.png'), dpi=1000)
        return
    # String-equality toggle: currently FALSE, forest-attribute dump off.
    if 'norf_attr' == 'rf_attr' and 'forest' in classifier_name:
        depth = [est.tree_.max_depth for est in clf.estimators_]
        print(depth)
        depth = np.array(depth)
        print("forest depth", depth.mean(), depth.max(), depth.min())
        print("maximum depth of the tree ", clf.base_estimator_.max_depth)
        return
        # NOTE(review): everything below this return is unreachable.
        import matplotlib.pyplot as plt
        from sklearn.tree import plot_tree
        plt.figure()
        plot_tree(clf, filled=True)
        plt.savefig(join(logdir, 'tree_plot.png'), dpi=1000)
        return
    print("Classifier Loaded!")
    # classifier loaded
    # load data; Timestamp is needed to derive the Day column.
    col_names = get_cols4eval()
    col_names.append('Timestamp')
    df = pd.read_csv(test_csv_file, usecols=col_names,
                     dtype=get_dtype4normalized())  # ,skiprows=skip_idx
    # First two characters of the timestamp — presumably day-of-month;
    # TODO confirm against the timestamp format.
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)  # type string
    df = df.sort_values(by=['Flow ID', 'Day', 'Label'])
    print(df.Label.value_counts())
    # Done
    # Per-record predictions are reused by the per-flow evaluation below.
    pred_per_record = predict_per_record(df, clf)
    per_record_evaluation(df, pred_per_record)
    tick = time.time()
    flowids, flowlabels_str, grouped = group_data(df)
    print("Grouped in {:.2f} min".format((time.time() - tick) / 60))
    y = encode_label(flowlabels_str)
    print("data is grouped and labels are encoded")
    pred_any, pred_maj, pred_all, y = evaluate_per_flow(
        clf, y, grouped, df, pred_per_record)
    gt_classes = np.unique(y)
    pred_classes = np.unique(pred_any)
    nunique_gt = len(gt_classes)
    nunique_pred = len(pred_classes)
    # Sanity check: the model must not predict a class absent from the data.
    assert nunique_gt >= nunique_pred, \
        "should not predict non existing class(es), but \n{} < \n{}".format(
            gt_classes, pred_classes)
    # One confusion matrix per flow-level aggregation policy.
    any_cm = confusion_matrix(y, pred_any)
    majority_cm = confusion_matrix(y, pred_maj)
    all_cm = confusion_matrix(y, pred_all)
    any_acc = metrics.balanced_accuracy_score(y, pred_any)
    maj_acc = metrics.balanced_accuracy_score(y, pred_maj)
    all_acc = metrics.balanced_accuracy_score(y, pred_all)
    print(any_acc, maj_acc, all_acc)
    result_logger_ids18(fingerprint, gt_classes,
                        (any_cm, majority_cm, all_cm), 'test')