def train(dataroot, classifier_name):
    print("Reading the data...")
    df = read_ddos_data(dataroot)  # takes ~10 GB RAM, loads in ~68 seconds
    print("read data of shape ", df.shape)
    label_to_id, id_to_label = get_ddos19_mappers()
    balancing_technique = get_balancing_technique()

    input_dim = df.shape[1] - 2  # Label and Flow ID columns are removed from X
    num_class = len(label_to_id.keys())
    WS_flow_count = 13684951  # ~13.7 mln records on PCAP-01-12
    num_iters = WS_flow_count * 10
    class_weight = None
    classifier_args, config = get_args(classifier_name, WS_flow_count, num_class,
                                       input_dim, class_weight, balancing_technique)

    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    runs_dir = join(logdir, 'runs')
    ensure_dir(runs_dir)

    df = normalize_df(df, join(runs_dir, 'data_stats.pickle'), train_data=True)
    X_train, y_train = df_to_array(df)
    y_train = encode_label(y_train, label_to_id)

    classifier_args['runs_dir'] = runs_dir
    train_and_save_classifier(X_train, y_train, classifier_args)
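# Hedged sketch, not the repo's actual helper: encode_label is assumed to map an
# array of string labels to integer class ids using the label_to_id dict returned by
# get_ddos19_mappers(). The *_sketch suffix marks it as an illustration only.
import numpy as np

def encode_label_sketch(y_str, label_to_id):
    # Unseen labels raise a KeyError rather than being silently dropped.
    return np.array([label_to_id[label] for label in y_str], dtype=np.int64)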
def train(task, size, data, shards, checkpoint):
    name = '{}/{}-{}/'.format(task, size, shards)
    model = getattr(models, task)[size]
    task = getattr(tasks, task)

    df = utils.get_df(data, shards)
    df = utils.normalize_df(df)
    df = df.sample(frac=1)  # shuffle rows
    dataset = utils.Data(df, task)

    callbacks = get_callbacks('logs/{}'.format(name))
    model.compile(optimizer='adam', loss=task['outputs'], metrics=task.get('metrics'))
    model.summary()
    model.fit(
        dataset,
        callbacks=callbacks,
        workers=2,
        max_queue_size=10,
        use_multiprocessing=True,
    )

    if checkpoint:
        model.save('checkpoints/{}'.format(name))
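# Hedged sketch, assuming a tf.keras setup: get_callbacks is not shown above, so this
# only illustrates a plausible return value (TensorBoard logging plus a best-model
# checkpoint). The file name 'best.h5' and the monitored metric are assumptions.
import os
import tensorflow as tf

def get_callbacks_sketch(log_dir):
    return [
        tf.keras.callbacks.TensorBoard(log_dir=log_dir),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(log_dir, 'best.h5'),
            monitor='loss',
            save_best_only=True),
    ]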
def make_fold_i(dataroot, flowids, fold_index):
    # Filter data based on flowids.
    # chunksize 10^4: 100 min; chunksize 10^5: 12 min; chunksize 10^6: 3 min
    chunksize = 10**6
    first_write = True  # write the header only on the first successful write (covers the case where the first CSV yields no flows)
    for i, fn in enumerate(glob(join(dataroot, '*Meter.csv'))):
        tick_start = time.time()
        reader = pd.read_csv(fn, engine='c', dtype=get_dtype(), chunksize=chunksize)
        df_per_file = pd.concat(
            [df_chunk[df_chunk['Label'] != 'Benign'] for df_chunk in tqdm(reader)],
            sort=False)
        print("CSV file per day is read and concatenated in {:.2f} sec".format(time.time() - tick_start))

        tick_record = time.time()
        df_per_file = get_flow_records_by_gid(flowids, df_per_file)
        if df_per_file is None:  # no flow from this CSV for the given fold
            continue
        print("Flow records are obtained in {:.2f} sec".format(time.time() - tick_record))

        tick = time.time()
        print("df_per_file.shape = ", df_per_file.shape)
        df_per_file_norm = normalize_df(df_per_file)
        print("Normalization time {:.2f} sec".format(time.time() - tick))

        if first_write:
            df_per_file_norm.to_csv(join(dataroot, 'fold_{}.csv'.format(fold_index)))
            df_per_file.to_csv(join(dataroot, 'nonnormalized_fold_{}.csv'.format(fold_index)))
            first_write = False
        else:
            df_per_file_norm.to_csv(join(dataroot, 'fold_{}.csv'.format(fold_index)),
                                    mode='a', header=False)
            df_per_file.to_csv(join(dataroot, 'nonnormalized_fold_{}.csv'.format(fold_index)),
                               mode='a', header=False)
        print("Time spent per CSV file: {:.2f} sec".format(time.time() - tick_start))
    print("Done for fold ", fold_index)
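# Hedged sketch of what get_flow_records_by_gid is assumed to do: keep only the rows
# whose 'Flow ID' belongs to the current fold, returning None when nothing matches
# (which is how the caller above detects "no flow from this CSV"). Illustration only.
def get_flow_records_by_gid_sketch(flowids, df):
    subset = df[df['Flow ID'].isin(flowids)]
    return None if subset.empty else subset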
def split_n_write_mal(csv_files, label, dataroot):
    df_ls = []
    for csv_filename in csv_files:
        fn = join(dataroot, csv_filename)
        df_i = chunk_read(fn)
        df_ls.append(df_i)
    df = pd.concat(df_ls, sort=False)
    df = df[df['Label'] == label]
    if label not in df.Label.unique():  # no records for this attack category
        return
    assert len(df.Label.unique()) == 1, \
        "There should be only one label {}".format(df.Label.unique())
    print(label, df.Label.value_counts()[label])

    flowids = np.sort(df['Flow ID'].unique())
    np.random.seed(getSeed())
    # FLOW shuffle reduces bias in the data split, while FLOW-RECORD shuffle reduces bias in the model.
    np.random.shuffle(flowids)

    num_flows = len(flowids)
    if num_flows < K:
        print("Category {1} has fewer than K flows: {0}".format(num_flows, label))
        return

    n = num_flows // K
    folds_df = []
    for i in range(NUM_OF_FOLDS):
        fn = join(dataroot, foldname_regex.format(i))
        fold_fids = flowids[i * n:(i + 1) * n]
        fold_df = df.loc[df['Flow ID'].isin(fold_fids)].copy()
        fold_df = normalize_df(fold_df)
        folds_df.append(fold_df)
        # Write the fold: create the file with a header on the first write, append afterwards.
        if not os.path.exists(fn) or os.path.getsize(fn) == 0:
            fold_df.to_csv(fn, index=False)
        else:
            fold_df.to_csv(fn, mode='a', header=False, index=False)
    return folds_df
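# Hedged sketch of chunk_read (defined elsewhere in the repo): read a large CSV in
# chunks and concatenate, so memory peaks stay bounded during parsing. The chunk size
# and parser options here are assumptions.
import pandas as pd

def chunk_read_sketch(fn, chunksize=10**6):
    reader = pd.read_csv(fn, engine='c', chunksize=chunksize)
    return pd.concat(list(reader), sort=False)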
def make_fold(dataroot):
    fraction = 1
    file_ending = '*Meter.csv'
    K = 5
    outputdir = join(dataroot, 'folds_fraction_{}'.format(fraction))
    ensure_dir(outputdir)

    df = read_data(dataroot, file_ending, fraction=fraction)
    df = normalize_df(df, join(outputdir, 'data_stats.pickle'), train_data=True)
    flowids, flowlabels, grouped = group_data(df, K, outputdir)

    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=getSeed())
    for fold_index, (train_index, test_index) in enumerate(skf.split(flowids, flowlabels)):
        print("Fold - ", fold_index)
        test_flowids = flowids[test_index]
        fold_df = get_flow_records(test_flowids, df, grouped)
        fold_df.to_csv(join(outputdir, 'fold_{}.csv'.format(fold_index)),
                       index=False, encoding='utf-8-sig')
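# Hedged sketch of the assumed contract of get_flow_records: given the test fold's
# flow ids and the 'grouped' object produced by group_data (taken here to be a pandas
# groupby on 'Flow ID'), collect that fold's records. Illustration only.
import pandas as pd

def get_flow_records_sketch(test_flowids, df, grouped):
    return pd.concat([grouped.get_group(fid) for fid in test_flowids], sort=False)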
outputdir = join(dataroot)
# ensure_dir(outputdir)
nrows = None
print('nrows = {} '.format(nrows), end=':\n ')

tick = time.time()
df = read_data(dataroot, nrows=nrows)  # ~20 min
print("Data is read in {:.2f} sec".format(time.time() - tick))

tick = time.time()
df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)  # day-of-month prefix, kept as string
print('new column created in {:.2f} sec'.format(time.time() - tick))

tick = time.time()
df = normalize_df(df, join(outputdir, 'data_stats.pickle'), train_data=False)
print("Done normalizing in {:.2f} sec".format(time.time() - tick))

tick = time.time()
flowids, flowlabels = get_flowids_and_labels(df)
tock = time.time()
print('obtained flow ids and labels in {:.2f} sec'.format(tock - tick))

unique, counts = np.unique(flowlabels, return_counts=True)
print(np.asarray((unique, counts)).T)

skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)  # shuffle=True so random_state actually takes effect
tick = time.time()
for fold_index, (train_index, test_index) in enumerate(skf.split(flowids, flowlabels)):
def evaluate(traindir, testdir, classifier_name):
    pred_any_list = []
    pred_majority_list = []
    pred_all_list = []
    y_test_perflowid_list = []

    pre_fingerprint = join(traindir, 'c_{}'.format(classifier_name))
    balancing_technique = get_balancing_technique()
    label_to_id, id_to_label = get_ddos19_mappers()
    filenames = ['LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv', 'SYN.csv',
                 'UDP.csv', 'UDP-Lag.csv', 'records.csv']

    total_prediction_time = 0
    total_records = 0
    for fn in filenames:
        print("---------------------------")
        print("Reading {}".format(fn))
        tick = time.time()
        test_df = pd.read_csv(join(testdir, fn), usecols=get_cols4ml())  # reads in ~2 min, requires ~14 GB memory
        tock = time.time()
        input_dim = test_df.shape[1] - 2  # Flow ID and Label are dropped
        num_class = len(label_to_id.keys())
        print("Read {} records in {:.2f} min".format(test_df.shape[0], (tock - tick) / 60.))
        if test_df.shape[0] < 1:
            continue
        test_df = test_df.sort_values(by=['Flow ID', 'Label'])  # makes grouping faster; allows prediction per flow id

        dummy_num_records = test_df.shape[0]
        class_weight = None
        classifier_args, config = get_args(classifier_name, dummy_num_records, num_class,
                                           input_dim, class_weight, balancing_technique)

        # Directories for results (fingerprints already exist from training).
        train_fingerprint = join(traindir, 'c_{}'.format(classifier_name + config))
        logdir = join(train_fingerprint, 'log')
        runs_dir = join(logdir, 'runs')

        test_df = normalize_df(test_df, join(runs_dir, 'data_stats.pickle'))
        fingerprint = join(testdir, 'c_{}'.format(classifier_name + config))

        # Create the classifier.
        loader = ClassifierLoader()
        classifier_args['runs_dir'] = runs_dir
        clf = loader.load(classifier_args)

        # Prediction part.
        print("Grouping data \r")
        tick = time.time()
        test_flowids, y_test_perflowid_str, grouped, group_sizes = group_data(test_df)
        test_df = test_df.drop(columns=['Flow ID', 'Label'])
        tock = time.time()
        print("Done. In {:.0f} min".format((tock - tick) / 60.))

        y_test_perflowid = encode_label(y_test_perflowid_str, label_to_id)
        pred_any, pred_majority, pred_all, prediction_time = predict_per_flow(
            classifier_name, clf, grouped, test_df, y_test_perflowid, group_sizes)  # takes 2-3 min
        total_prediction_time += prediction_time
        total_records += test_df.shape[0]

        pred_any_list += pred_any
        pred_majority_list += pred_majority
        pred_all_list += pred_all
        y_test_perflowid_list += y_test_perflowid

    pd.DataFrame({'Records': [total_records], 'Time': [total_prediction_time]}).to_csv(
        join(testdir, 'timing.csv'), index=False)
    pred_list_tuples = (pred_any_list, pred_majority_list, pred_all_list)
    result_logger_ddos19(fingerprint, y_test_perflowid_list, pred_list_tuples, id_to_label)
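# Hedged sketch of the flow-level aggregation implied by predict_per_flow's three
# outputs (pred_any / pred_majority / pred_all). The fallback to the majority vote
# when the "any"/"all" condition fails is an assumption, not the repo's exact rule.
import numpy as np

def flow_vote_sketch(record_preds, true_label):
    # record_preds: per-record predicted class ids for a single flow.
    record_preds = np.asarray(record_preds)
    values, counts = np.unique(record_preds, return_counts=True)
    pred_majority = values[np.argmax(counts)]
    pred_any = true_label if (record_preds == true_label).any() else pred_majority
    pred_all = true_label if (record_preds == true_label).all() else pred_majority
    return pred_any, pred_majority, pred_all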
def normalize_n_write_normal(df, fn):
    # Appends without a header; assumes the header (if any) is written elsewhere.
    normalize_df(df).to_csv(fn, index=False, chunksize=10**4, mode='a', header=False)
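# Illustrative usage only: the benign-traffic slice and the output path below are
# assumptions about how normalize_n_write_normal is called, not code from the repo.
def write_benign_example(df, dataroot):
    benign_df = df[df['Label'] == 'Benign']
    normalize_n_write_normal(benign_df, join(dataroot, 'fold_benign.csv'))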
def _main():
    np.random.seed(rs)
    logger.info("Running script for Approach 1")
    tr_df = pd.read_csv(os.path.join("data", "cs-training.csv"), index_col=0)
    te_df = pd.read_csv(os.path.join("data", "cs-test.csv"), index_col=0)
    tr_df, te_df = _preprocess_data(tr_df, te_df)

    # Add features
    tr_df, te_df = feats.add_features_based_on_NumOCLL(tr_df, te_df)
    tr_df, te_df = feats.add_features_based_on_NumRELL(tr_df, te_df)
    tr_df, te_df = feats.add_features_based_on_RUoUL(tr_df, te_df)

    # Prepare the dataset for training
    excluded_cols = ["age", "MonthlyIncome", "MonthlyIncome_Imputed", "SeriousDlqin2yrs"]
    train_df = tr_df[tr_df.columns.difference(excluded_cols)]
    cols = train_df.columns.values.tolist()
    X, _ = utils.normalize_df(train_df)
    X = X.to_numpy()  # .as_matrix() was removed in pandas 1.0
    y = tr_df["SeriousDlqin2yrs"].values

    # Split
    sss = StratifiedShuffleSplit(n_splits=3, random_state=rs, test_size=0.3)
    for train_index, test_index in sss.split(X, y):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]
        logger.info("X {}, train {}, valid {}".format(X.shape, X_train.shape, X_valid.shape))

    # Train
    logger.info("Features used for training : {}".format(cols))
    base_estimators = [
        ExtraTreesClassifier(n_estimators=400, n_jobs=-1, random_state=rs),
        LogisticRegressionCV(random_state=rs),
        RandomForestClassifier(bootstrap=True, criterion="gini", max_depth=None,
                               max_features=5, n_estimators=150, n_jobs=-1,
                               random_state=rs),
        # SVC(C=0.01, gamma=0.01, kernel="rbf", probability=True, random_state=rs)
    ]

    # Each classifier is trained on 5 stratified splits and the one (amongst the 5)
    # with the best AUC score is selected.
    best_auc = 0.0
    common_top_n_features = []
    for est in base_estimators:
        fitted_est = utils.train_estimator(est, X_train, y_train, 5)
        top_n_features = []
        top_n_features_df = utils.log_important_features(est, cols)
        if top_n_features_df.shape[0] > 0:
            top_n_features = top_n_features_df.head(15).feature.values.tolist()
        common_top_n_features.extend(top_n_features)
        common_top_n_features = list(set(common_top_n_features))
        logger.info("{} common_top_n_features : {}".format(
            len(common_top_n_features), common_top_n_features))
        preds = fitted_est.predict(X_valid)
        score = roc_auc_score(y_valid, preds)
        logger.info("AUC : {:.5f}".format(score))
        if score > best_auc:
            best_auc = score
            best_est = fitted_est
    logger.info("Best estimator : {}".format(best_est))

    # Re-fit the best estimator using the common top N features
    refit = False  # TODO read from config
    if refit:
        logger.info("Re-fitting best estimator {} using top N features ...".format(
            best_est.__class__.__name__))
        X, _ = utils.normalize_df(train_df[common_top_n_features])
        X = X.to_numpy()
        y = tr_df["SeriousDlqin2yrs"].values
        sss = StratifiedShuffleSplit(n_splits=3, random_state=rs, test_size=0.3)
        for train_index, test_index in sss.split(X, y):
            X_train, X_valid = X[train_index], X[test_index]
            y_train, y_valid = y[train_index], y[test_index]
        fitted_best_est = utils.train_estimator(best_est, X_train, y_train, 5)
        preds = fitted_best_est.predict(X_valid)
        score = roc_auc_score(y_valid, preds)
        logger.info("AUC : {:.5f}".format(score))
        if score > best_auc:
            best_auc = score
            best_est = fitted_best_est  # was `fitted_est`, which would keep a non-refitted model

    # Get the predictions
    logger.info("Get the predictions using {} ...".format(best_est))
    te_df_, _ = utils.normalize_df(te_df[cols])
    identifiers = te_df_.index.tolist()
    if refit:
        p = [x[1] for x in best_est.predict_proba(te_df_[common_top_n_features])]
    else:
        p = [x[1] for x in best_est.predict_proba(te_df_)]
    _prepare_submission_file(identifiers, p)
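# Hedged sketch of what utils.normalize_df is assumed to do in this script: the
# MonthlyIncome imputation code below unpacks (dataframe, scaler) and mentions a
# MinMaxScaler, so a minimal version could look like this. Illustration only.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def normalize_df_sketch(df):
    scaler = MinMaxScaler()
    scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    return scaled, scaler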
def _predict_monthly_income(tr_df, te_df):
    logger.info("Preparing dataset to train model to predict MonthlyIncome")
    mask = np.logical_not(tr_df.MonthlyIncome.isnull())
    tr_tr = tr_df[mask].copy()                          # train's training data (has MonthlyIncome)
    tr_te = tr_df[tr_df.MonthlyIncome.isnull()].copy()  # train's test data (MonthlyIncome missing)
    mask = np.logical_not(te_df.MonthlyIncome.isnull())
    te_tr = te_df[mask].copy()
    te_te = te_df[te_df.MonthlyIncome.isnull()].copy()
    logger.info("tr_tr, tr_te : {},{}".format(tr_tr.shape, tr_te.shape))
    logger.info("te_tr, te_te : {},{}".format(te_tr.shape, te_te.shape))

    # Prepare the dataset: normalize (.copy() above avoids SettingWithCopy issues on the slices)
    tr_tr, scaler_1 = utils.normalize_df(tr_tr)
    tr_te.drop(["MonthlyIncome"], axis=1, inplace=True)  # temporarily
    tr_te, _ = utils.normalize_df(tr_te)
    tr_te["MonthlyIncome"] = None  # add it back in
    te_tr, scaler_2 = utils.normalize_df(te_tr)
    te_te.drop(["MonthlyIncome"], axis=1, inplace=True)  # temporarily
    te_te, _ = utils.normalize_df(te_te)
    te_te["MonthlyIncome"] = None  # add it back in

    # Prepare the dataset: split
    cols = ["RUoUL", "age", "NumLate3059", "NumLate6089", "NumLate90",
            "DebtRatio", "NumOCLL", "NumRELL", "NumDependents"]
    X_train, Y_train = tr_tr[cols], tr_tr[["MonthlyIncome"]]
    Y_train = Y_train.MonthlyIncome.ravel()
    X_test, Y_test = tr_te[cols], tr_te[["MonthlyIncome"]]
    Y_test = Y_test.MonthlyIncome.ravel()
    logger.info("X_train : {}, X_test : {}, Y_train : {}, Y_test : {}".format(
        X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))

    # Train the model
    pickle_file = "monthly_income_predictor.pkl"
    if os.path.exists(pickle_file):
        est = joblib.load(pickle_file)
    else:
        logger.info("Training model to predict MonthlyIncome")
        est = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=329521)
        scorer_name = "neg_median_absolute_error"
        scores = utils.get_cv_scores(est, X_train, Y_train, scorer_name)
        est.fit(X_train, Y_train)
        joblib.dump(est, pickle_file)

    # Predict the MonthlyIncome
    est_name = est.__class__.__name__
    logger.info("{} rows in tr_te missing MonthlyIncome".format(len(X_test)))
    logger.info("{} rows in te_te missing MonthlyIncome".format(len(te_te)))
    logger.info("Using {} to predict MonthlyIncome".format(est_name))
    predictions_1 = est.predict(X_test)
    predictions_2 = est.predict(te_te[cols])

    # Set the MonthlyIncome for the rows where it was missing (X_test & te_te)
    X_train["MonthlyIncome"] = Y_train  # add it back to X_train
    X_test["MonthlyIncome"] = predictions_1
    te_te["MonthlyIncome"] = predictions_2

    # Un-scale the MonthlyIncome - we used a MinMaxScaler earlier.
    # First, un-scale MonthlyIncome values in X_train & X_test
    # (X_train was built from tr_tr, X_test from tr_te).
    logger.info("Un-scaling the MonthlyIncome values in X_train & X_test")
    X_train["MonthlyIncome"] = \
        utils.unscale_column_values(X_train["MonthlyIncome"], 8, scaler_1)
    X_test["MonthlyIncome"] = \
        utils.unscale_column_values(predictions_1, 8, scaler_1)
    # Next, un-scale MonthlyIncome values in te_tr & te_te
    logger.info("Un-scaling the MonthlyIncome values in te_tr & te_te")
    te_tr["MonthlyIncome"] = \
        utils.unscale_column_values(te_tr["MonthlyIncome"], 8, scaler_2)
    te_te["MonthlyIncome"] = \
        utils.unscale_column_values(predictions_2, 8, scaler_2)

    # Concatenate the DataFrames now that the missing MonthlyIncome is filled in,
    tmp_df_1 = pd.concat([X_train, X_test])
    tmp_df_2 = pd.concat([te_tr, te_te])

    # then merge them with the existing train/test datasets.
    df0 = pd.read_csv(os.path.join("data", "cs-training.csv"), index_col=0)
    df0 = df0[["SeriousDlqin2yrs", "NumberOfDependents"]]
    df0.rename(columns={"NumberOfDependents": "NumDependents"}, inplace=True)
    tr_df = pd.merge(tr_df, df0, left_index=True, right_index=True)
    df1 = pd.merge(tr_df, tmp_df_1, left_index=True, right_index=True, suffixes=("", "_y"))

    # Repeat for test.
    df0 = pd.read_csv(os.path.join("data", "cs-test.csv"), index_col=0)
    df0 = df0[["SeriousDlqin2yrs", "NumberOfDependents"]]
    df0.rename(columns={"NumberOfDependents": "NumDependents"}, inplace=True)
    te_df = pd.merge(te_df, df0, left_index=True, right_index=True)
    df2 = pd.merge(te_df, tmp_df_2, left_index=True, right_index=True, suffixes=("", "_y"))

    # Next, retain only the columns we need, in the order we need.
    cols = ["SeriousDlqin2yrs", "RUoUL", "age", "NumLate3059", "DebtRatio",
            "MonthlyIncome", "MonthlyIncome_y", "NumOCLL", "NumLate90",
            "NumRELL", "NumLate6089", "NumDependents"]
    df1, df2 = df1[cols], df2[cols]
    df1.rename(columns={"MonthlyIncome_y": "MonthlyIncome_Imputed"}, inplace=True)
    df2.rename(columns={"MonthlyIncome_y": "MonthlyIncome_Imputed"}, inplace=True)
    tr_df, te_df = df1, df2

    tr_df.to_csv(os.path.join("data", "tr_with_income.csv"))
    te_df.to_csv(os.path.join("data", "te_with_income.csv"))
    logger.info("Done predicting the missing MonthlyIncome ...")

    tr_df["MonthlyIncome"] = tr_df["MonthlyIncome_Imputed"]
    te_df["MonthlyIncome"] = te_df["MonthlyIncome_Imputed"]
    return tr_df, te_df