Code Example #1
def train(dataroot, classifier_name):
    print("Reading the data...")
    df = read_ddos_data(dataroot)  # takes ~10 GB RAM, loads in ~68 seconds
    print("read data of shape ", df.shape)
    label_to_id, id_to_label = get_ddos19_mappers()

    balancing_technique = get_balancing_technique()
    input_dim = df.shape[1] - 2  # Label and Flow ID columns are removed from X
    num_class = len(label_to_id.keys())
    WS_flow_count = 13684951  # ~13.7 million records in PCAP-01-12
    num_iters = WS_flow_count * 10
    class_weight = None
    classifier_args, config = get_args(classifier_name, WS_flow_count, num_class,
                                       input_dim, class_weight, balancing_technique)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    runs_dir = join(logdir, 'runs')
    ensure_dir(runs_dir)

    df = normalize_df(df, join(runs_dir, 'data_stats.pickle'), train_data=True)

    X_train, y_train = df_to_array(df)
    y_train = encode_label(y_train, label_to_id)
    classifier_args['runs_dir'] = runs_dir
    train_and_save_classifier(X_train, y_train, classifier_args)
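
A minimal invocation sketch for the function above; the data-root path and classifier name are placeholder values, not ones taken from the original project.

# Hypothetical usage; the path and classifier name below are placeholders.
if __name__ == '__main__':
    train('/data/ddos2019/PCAP-01-12', 'cnn')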
Code Example #2
def train(task, size, data, shards, checkpoint):
    name = '{}/{}-{}/'.format(task, size, shards)
    model = getattr(models, task)[size]
    task = getattr(tasks, task)

    df = utils.get_df(data, shards)
    df = utils.normalize_df(df)
    df = df.sample(frac=1)
    dataset = utils.Data(df, task)
    callbacks = get_callbacks('logs/{}'.format(name))

    model.compile(optimizer='adam',
                  loss=task['outputs'],
                  metrics=task.get('metrics'))
    model.summary()

    model.fit(
        dataset,
        callbacks=callbacks,
        workers=2,
        max_queue_size=10,
        use_multiprocessing=True,
    )

    if checkpoint:
        model.save('checkpoints/{}'.format(name))
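
A hedged usage sketch for the Keras-style training entry point above; every argument value below is an illustrative placeholder, and the models/tasks registries are assumed to contain matching entries.

# Illustrative call; all argument values are placeholders.
train(task='classification', size='small', data='data/', shards=4, checkpoint=True)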
Code Example #3
def make_fold_i(dataroot, flowids, fold_index):
    # filter data based on flowids
    # chunksize 10^4: 100 min
    # chunksize 10^5: 12 min
    # chunksize 10^6: 3 min
    chunksize = 10**6

    for i, fn in enumerate(glob(join(dataroot, '*Meter.csv'))):
        tick_start = time.time()
        TextFileReaderObject = pd.read_csv(fn,
                                           engine='c',
                                           dtype=get_dtype(),
                                           chunksize=chunksize)
        df_per_file = pd.concat(
            [df_chunk[df_chunk['Label'] != 'Benign']
             for df_chunk in tqdm(TextFileReaderObject)],
            sort=False)
        print("CSV file for the day is read and concatenated in {:.2f} sec".format(
            time.time() - tick_start))
        tick_record = time.time()
        df_per_file = get_flow_records_by_gid(flowids, df_per_file)
        if df_per_file is None:  # no flow from this CSV for the given fold
            continue
        print("Flow records are obtained in {:.2f} sec".format(time.time() -
                                                               tick_record))

        tick = time.time()
        print("df_per_file.shape = ", df_per_file.shape)
        df_per_file_norm = normalize_df(df_per_file)
        print("normalization time {:.2f} sec".format(time.time() - tick))
        print(df_per_file.shape)
        if i == 0:
            # first file: create the fold CSVs with headers
            df_per_file_norm.to_csv(
                join(dataroot, 'fold_{}.csv'.format(fold_index)))
            df_per_file.to_csv(
                join(dataroot, 'nonnormalized_fold_{}.csv'.format(fold_index)))
        else:
            # subsequent files: append without headers
            df_per_file_norm.to_csv(join(dataroot,
                                         'fold_{}.csv'.format(fold_index)),
                                    mode='a',
                                    header=False)
            df_per_file.to_csv(join(
                dataroot, 'nonnormalized_fold_{}.csv'.format(fold_index)),
                               mode='a',
                               header=False)
        print("Time spent per CSV file: {:.2f} sec".format(time.time() -
                                                           tick_start))
    print("Done for fold ", fold_index)
Code Example #4
def split_n_write_mal(csv_files, label, dataroot):
    df_ls = []
    for csv_filename in csv_files:
        fn = join(dataroot, csv_filename)
        df_i = chunk_read(fn)
        df_ls.append(df_i)
    df = pd.concat(df_ls, sort=False)
    df = df[df['Label'] == label]

    if label not in df.Label.unique():  # nothing left after filtering
        return
    assert len(df.Label.unique()) == 1, "There should be only one label {}".format(df.Label.unique())
    print(label, df.Label.value_counts()[label])

    flowids = np.sort(df['Flow ID'].unique())
    np.random.seed(getSeed())
    np.random.shuffle(flowids)  # shuffling FLOWs reduces bias in the data split, while shuffling FLOW RECORDS reduces bias in the model
    num_flows = len(flowids)
    if num_flows < K:
        print("Category {} has fewer than K flows: {}".format(label, num_flows))
        return

    n = num_flows // K
    folds_df = []
    for i in range(NUM_OF_FOLDS):
        fn = join(dataroot, foldname_regex.format(i))
        fold_fids = flowids[i * n:(i + 1) * n]

        fold_df = df.loc[df['Flow ID'].isin(fold_fids)].copy()
        fold_df = normalize_df(fold_df)
        folds_df.append(fold_df)

        # write the fold to disk: create with header if the file is empty,
        # otherwise append without header
        fsize = os.path.getsize(fn)
        if fsize == 0:
            fold_df.to_csv(fn, index=False)
        else:
            fold_df.to_csv(fn, mode='a', header=False, index=False)
    return folds_df
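
A hedged usage sketch, assuming K, NUM_OF_FOLDS, and foldname_regex are module-level settings as the code above implies; the CSV list, label, and path are placeholders.

# Illustrative call; the CSV names, label, and path are placeholders.
split_n_write_mal(['LDAP.csv', 'MSSQL.csv'], label='LDAP', dataroot='/data/cicddos2019/')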
Code Example #5
def make_fold(dataroot):
    fraction = 1
    file_ending = '*Meter.csv'
    K = 5

    outputdir = join(dataroot, 'folds_fraction_{}'.format(fraction))
    ensure_dir(outputdir)

    df = read_data(dataroot, file_ending, fraction=fraction)
    df = normalize_df(df,
                      join(outputdir, 'data_stats.pickle'),
                      train_data=True)
    flowids, flowlabels, grouped = group_data(df, K, outputdir)

    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=getSeed())
    for fold_index, (train_index,
                     test_index) in enumerate(skf.split(flowids, flowlabels)):
        print("Fold - ", fold_index)
        test_flowids = flowids[test_index]
        fold_df = get_flow_records(test_flowids, df, grouped)
        fold_df.to_csv(join(outputdir, 'fold_{}.csv'.format(fold_index)),
                       index=False,
                       encoding='utf-8-sig')
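
A hedged usage sketch; the directory is a placeholder, and read_data, group_data, and get_flow_records are assumed to be the project's own helpers. Note that the stratified split above operates on flow IDs rather than individual flow records, so all records of a flow land in the same fold.

# Illustrative call; the path is a placeholder.
make_fold('/data/cicddos2019/')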
Code Example #6
    outputdir = join(dataroot)
    #ensure_dir(outputdir)

    nrows = None
    print('nrows = {} '.format(nrows), end=':\n ')
    tick = time.time()
    df = read_data(dataroot, nrows=nrows)  #20min
    print("Data is read in {:.2f} sec".format(time.time() - tick))

    tick = time.time()
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)  # type string
    print('new column created in {:.2f} sec'.format(time.time() - tick))

    tick = time.time()
    df = normalize_df(df,
                      join(outputdir, 'data_stats.pickle'),
                      train_data=False)
    print("Done normalizing in {:.2f} sec".format(time.time() - tick))

    tick = time.time()
    flowids, flowlabels = get_flowids_and_labels(df)
    tock = time.time()
    print('obtained flow IDs and labels in {:.2f} sec'.format(tock - tick))

    unique, counts = np.unique(flowlabels, return_counts=True)
    print(np.asarray((unique, counts)).T)

    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)  # shuffle=True so random_state takes effect (as in Code Example #5)
    tick = time.time()
    for fold_index, (train_index,
                     test_index) in enumerate(skf.split(flowids, flowlabels)):
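
The excerpt above is cut off inside the fold loop; judging from Code Example #5 in this listing, the body presumably extracts the test-fold flow records and writes them to per-fold CSVs. A hedged sketch of such a body, with assumed helper signatures:

# Hypothetical loop body, modeled on Code Example #5; helper signatures may differ.
for fold_index, (train_index, test_index) in enumerate(skf.split(flowids, flowlabels)):
    test_flowids = flowids[test_index]
    fold_df = get_flow_records(test_flowids, df)  # assumed helper
    fold_df.to_csv(join(outputdir, 'fold_{}.csv'.format(fold_index)), index=False)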
Code Example #7
def evaluate(traindir, testdir, classifier_name):
    pred_any_list = []
    pred_majority_list = []
    pred_all_list = []
    y_test_perflowid_list = []

    pre_fingerprint = join(traindir, 'c_{}'.format(classifier_name))
    balancing_technique = get_balancing_technique()
    label_to_id, id_to_label = get_ddos19_mappers()

    filenames = [
        'LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv', 'SYN.csv', 'UDP.csv',
        'UDP-Lag.csv', 'records.csv'
    ]

    total_prediction_time = 0
    total_records = 0

    for fn in filenames:
        print("---------------------------")
        print("Reading {}".format(fn))
        tick = time.time()
        test_df = pd.read_csv(
            join(testdir, fn),
            usecols=get_cols4ml())  # reads in ~2 min, requires ~14 GB of memory
        tock = time.time()
        input_dim = test_df.shape[1] - 2  # Flow ID and Label columns are dropped
        num_class = len(label_to_id.keys())
        print("Read {} records in {:.2f} min".format(test_df.shape[0],
                                                     (tock - tick) / 60.))
        if test_df.shape[0] < 1:
            continue
        test_df = test_df.sort_values(
            by=['Flow ID',
                'Label'])  # makes grouping faster; allows prediction per Flow ID
        dummy_num_records = test_df.shape[0]
        class_weight = None
        classifier_args, config = get_args(classifier_name, dummy_num_records,
                                           num_class, input_dim, class_weight,
                                           balancing_technique)
        # directories for results
        train_fingerprint = join(
            traindir, 'c_{}'.format(classifier_name +
                                    config))  # fingerprint already there
        logdir = join(train_fingerprint, 'log')  #already there
        runs_dir = join(logdir, 'runs')
        test_df = normalize_df(test_df, join(runs_dir, 'data_stats.pickle'))

        fingerprint = join(testdir,
                           'c_{}'.format(classifier_name +
                                         config))  # fingerprint already there
        #create classifier
        loader = ClassifierLoader()
        classifier_args['runs_dir'] = runs_dir
        clf = loader.load(classifier_args)

        # predict part
        print("Grouping data \r")
        tick = time.time()
        test_flowids, y_test_perflowid_str, grouped, group_sizes = group_data(
            test_df)
        test_df = test_df.drop(columns=['Flow ID', 'Label'])
        tock = time.time()
        print("Done. In {:.0f}min".format((tock - tick) / 60.))

        y_test_perflowid = encode_label(y_test_perflowid_str, label_to_id)

        pred_any, pred_majority, pred_all, prediction_time = predict_per_flow(
            classifier_name, clf, grouped, test_df, y_test_perflowid,
            group_sizes)  # takes 2-3 min

        total_prediction_time += prediction_time
        total_records += test_df.shape[0]

        pred_any_list += pred_any
        pred_majority_list += pred_majority
        pred_all_list += pred_all

        y_test_perflowid_list += y_test_perflowid

    pd.DataFrame({
        'Records': [total_records],
        'Time': [total_prediction_time]
    }).to_csv(join(testdir, 'timing.csv'), index=False)
    pred_list_tuples = (pred_any_list, pred_majority_list, pred_all_list)
    result_logger_ddos19(fingerprint, y_test_perflowid_list, pred_list_tuples,
                         id_to_label)
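
A hedged usage sketch; the train/test directories and the classifier name are placeholders.

# Illustrative call; paths and classifier name are placeholders.
evaluate('/data/ddos2019/train/', '/data/ddos2019/test/', classifier_name='cnn')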
Code Example #8
def normalize_n_write_normal(df, fn):
    # normalize the benign ("normal") records and append them to fn without a header
    normalize_df(df).to_csv(fn, index=False, chunksize=10**4, mode='a', header=False)
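
A hedged usage sketch; the input CSV and output path are placeholders, and the append mode assumes the target fold file already exists.

# Illustrative call; file names are placeholders.
benign_df = pd.read_csv('benign_records.csv')
normalize_n_write_normal(benign_df, 'fold_0.csv')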
Code Example #9
def _main():
    np.random.seed(rs)
    logger.info("Running script for Approach 1")
    tr_df = pd.read_csv(os.path.join("data", "cs-training.csv"), index_col=0)
    te_df = pd.read_csv(os.path.join("data", "cs-test.csv"), index_col=0)
    tr_df, te_df = _preprocess_data(tr_df, te_df)

    # Add features
    tr_df, te_df = feats.add_features_based_on_NumOCLL(tr_df, te_df)
    tr_df, te_df = feats.add_features_based_on_NumRELL(tr_df, te_df)
    tr_df, te_df = feats.add_features_based_on_RUoUL(tr_df, te_df)

    # Preparing dataset for training
    excluded_cols = [
        "age", "MonthlyIncome", "MonthlyIncome_Imputed", "SeriousDlqin2yrs"
    ]
    train_df = tr_df[tr_df.columns.difference(excluded_cols)]
    cols = train_df.columns.values.tolist()
    X, _ = utils.normalize_df(train_df)
    X = X.to_numpy()  # .as_matrix() was removed in recent pandas versions
    y = tr_df["SeriousDlqin2yrs"].values

    # Split
    sss = StratifiedShuffleSplit(n_splits=3, random_state=rs, test_size=0.3)
    for train_index, test_index in sss.split(X, y):
        X_train, X_valid, y_train, y_valid = X[train_index], X[test_index], y[
            train_index], y[test_index]

    logger.info("X {}, train {}, valid {}" \
                .format(X.shape, X_train.shape, X_valid.shape))

    # Train
    logger.info("Features used for training : {}".format(cols))
    base_estimators = [
        ExtraTreesClassifier(n_estimators=400, n_jobs=-1, random_state=rs),
        LogisticRegressionCV(random_state=rs),
        RandomForestClassifier(bootstrap=True,
                               criterion="gini",
                               max_depth=None,
                               max_features=5,
                               n_estimators=150,
                               n_jobs=-1,
                               random_state=rs),
        # SVC(C=0.01, gamma=0.01, kernel="rbf", probability=True,
        #     random_state=rs)
    ]

    # Each classifier is trained on 5 stratified splits
    # and the one (amongst the 5) with best AUC score is selected
    best_auc = 0.0
    common_top_n_features = []
    for est in base_estimators:
        fitted_est = utils.train_estimator(est, X_train, y_train, 5)
        top_n_features = []
        top_n_features_df = utils.log_important_features(est, cols)
        if top_n_features_df.shape[0] > 0:
            top_n_features = top_n_features_df.head(15).feature.values.tolist()
        common_top_n_features.extend(top_n_features)
        common_top_n_features = list(set(common_top_n_features))
        logger.info("{} common_top_n_features : {}" \
                    .format(len(common_top_n_features), common_top_n_features))
        preds = fitted_est.predict(X_valid)
        score = roc_auc_score(y_valid, preds)
        logger.info("AUC : {:.5f}".format(score))
        if score > best_auc:
            best_auc = score
            best_est = fitted_est

    logger.info("Best estimator : {}".format(best_est))

    # Re-fitting the best estimator using the common top N features
    refit = False  # TODO read from config
    if refit:
        logger.info("Re-fitting best estimator {} using top N features ..." \
                    .format(best_est.__class__.__name__))
        X, _ = utils.normalize_df(train_df[common_top_n_features])
        X = X.to_numpy()  # .as_matrix() was removed in recent pandas versions
        y = tr_df["SeriousDlqin2yrs"].values
        sss = StratifiedShuffleSplit(n_splits=3,
                                     random_state=rs,
                                     test_size=0.3)
        for train_index, test_index in sss.split(X, y):
            X_train, X_valid, y_train, y_valid = X[train_index], X[test_index], \
                                                 y[train_index], y[test_index]
        fitted_best_est = utils.train_estimator(best_est, X_train, y_train, 5)
        preds = fitted_best_est.predict(X_valid)
        score = roc_auc_score(y_valid, preds)
        logger.info("AUC : {:.5f}".format(score))
        if score > best_auc:
            best_auc = score
            best_est = fitted_best_est  # keep the re-fitted estimator (fitted_est is stale here)

    # Getting the predictions
    logger.info("Get the predictions using {} ...".format(best_est))
    te_df_, _ = utils.normalize_df(te_df[cols])
    identifiers = te_df_.index.tolist()
    if refit:
        p = [
            x[1] for x in best_est.predict_proba(te_df_[common_top_n_features])
        ]
    else:
        p = [x[1] for x in best_est.predict_proba(te_df_)]
    _prepare_submission_file(identifiers, p)
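
The comment above says each base estimator is trained on 5 stratified splits and the fit with the best AUC is selected; below is a minimal sketch of what utils.train_estimator might look like under that reading. This is a hypothetical re-implementation, not the project's actual helper.

from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def train_estimator_sketch(est, X, y, n_splits):
    # Hypothetical stand-in for utils.train_estimator: fit on each stratified
    # split and keep the fit with the best validation AUC.
    best_auc, best_fitted = 0.0, None
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for tr_idx, va_idx in skf.split(X, y):
        candidate = clone(est).fit(X[tr_idx], y[tr_idx])
        auc = roc_auc_score(y[va_idx], candidate.predict_proba(X[va_idx])[:, 1])
        if auc > best_auc:
            best_auc, best_fitted = auc, candidate
    return best_fitted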
Code Example #10
def _predict_monthly_income(tr_df, te_df):
    logger.info("Preparing dataset to train model to predict MonthlyIncome")
    mask = np.logical_not(tr_df.MonthlyIncome.isnull())
    tr_tr = tr_df[mask]  # Train's training data (has MonthlyIncome)
    tr_te = tr_df[tr_df.MonthlyIncome.isnull()]  # Train's test data
    mask = np.logical_not(te_df.MonthlyIncome.isnull())
    te_tr = te_df[mask]
    te_te = te_df[te_df.MonthlyIncome.isnull()]
    logger.info("tr_tr, tr_te : {},{}".format(tr_tr.shape, tr_te.shape))
    logger.info("te_tr, te_te : {},{}".format(te_tr.shape, te_te.shape))

    # Prepare the dataset : Normalizing the dataset
    tr_tr, scaler_1 = utils.normalize_df(tr_tr)
    tr_te.drop(["MonthlyIncome"], axis=1, inplace=True)  # Temporarily
    tr_te, _ = utils.normalize_df(tr_te)
    tr_te["MonthlyIncome"] = None  # add it back in
    te_tr, scaler_2 = utils.normalize_df(te_tr)
    te_te.drop(["MonthlyIncome"], axis=1, inplace=True)  # Temporarily
    te_te, _ = utils.normalize_df(te_te)
    te_te["MonthlyIncome"] = None  # add it back in

    # Prepare the dataset : split
    cols = [
        "RUoUL", "age", "NumLate3059", "NumLate6089", "NumLate90", "DebtRatio",
        "NumOCLL", "NumRELL", "NumDependents"
    ]
    X_train, Y_train = tr_tr[cols], tr_tr[["MonthlyIncome"]]
    Y_train = Y_train.MonthlyIncome.ravel()
    X_test, Y_test = tr_te[cols], tr_te[["MonthlyIncome"]]
    Y_test = Y_test.MonthlyIncome.ravel()
    logger.info("X_train : {}, X_test : {}, Y_train : {}, Y_test : {}" \
                .format(X_train.shape, X_test.shape, Y_train.shape,
                        Y_test.shape))

    # Train the model
    pickle_file = "monthly_income_predictor.pkl"
    if os.path.exists(pickle_file):
        est = joblib.load(pickle_file)
    else:
        logger.info("Training model to predict MonthlyIncome")
        est = RandomForestRegressor(n_estimators=200,
                                    n_jobs=-1,
                                    random_state=329521)
        scorer_name = "neg_median_absolute_error"
        scores = utils.get_cv_scores(est, X_train, Y_train, scorer_name)
        est.fit(X_train, Y_train)
        joblib.dump(est, pickle_file)

    # Predict the MonthlyIncome
    est_name = est.__class__.__name__
    logger.info("{} rows in tr_te missing MonthlyIncome".format(len(X_test)))
    logger.info("{} rows in te_te missing MonthlyIncome".format(len(te_te)))
    logger.info("Using {} to predict MonthlyIncome".format(est_name))
    predictions_1 = est.predict(X_test)
    predictions_2 = est.predict(te_te[cols])

    # Set the MonthlyIncome for the rows where it was missing (X_test & te_te)
    X_train["MonthlyIncome"] = Y_train  # Adding it back to X_train
    X_test["MonthlyIncome"] = predictions_1
    te_te["MonthlyIncome"] = predictions_2

    # Un-scale the MonthlyIncome - we used a MinMaxScaler earlier
    # First, un-scale the MonthlyIncome values in X_train & X_test
    # (X_train was constructed from tr_tr, X_test from tr_te)
    logger.info("Un-scaling the MonthlyIncome values in X_train & X_test")
    X_train["MonthlyIncome"] = \
        utils.unscale_column_values(X_train["MonthlyIncome"], 8, scaler_1)
    X_test["MonthlyIncome"] = \
        utils.unscale_column_values(predictions_1, 8, scaler_1)
    # Next, un-scale  MonthlyIncome values in te_te
    logger.info("Un-scaling the MonthlyIncome values in te_tr & te_te")
    te_tr["MonthlyIncome"] = \
        utils.unscale_column_values(te_tr["MonthlyIncome"], 8, scaler_2)
    te_te["MonthlyIncome"] = \
        utils.unscale_column_values(predictions_2, 8, scaler_2)

    # Concatenate the DataFrames now that the missing MonthlyIncome values are filled in
    tmp_df_1 = pd.concat([X_train, X_test])
    tmp_df_2 = pd.concat([te_tr, te_te])

    # Next, merge the tmp_df with the existing train/test datasets
    df0 = pd.read_csv(os.path.join("data", "cs-training.csv"), index_col=0)
    df0 = df0[["SeriousDlqin2yrs", "NumberOfDependents"]]
    df0.rename(columns={"NumberOfDependents": "NumDependents"}, inplace=True)
    tr_df = pd.merge(tr_df, df0, left_index=True, right_index=True)
    df1 = pd.merge(tr_df,
                   tmp_df_1,
                   left_index=True,
                   right_index=True,
                   suffixes=("", "_y"))
    # Repeat, for test,
    df0 = pd.read_csv(os.path.join("data", "cs-test.csv"), index_col=0)
    df0 = df0[["SeriousDlqin2yrs", "NumberOfDependents"]]
    df0.rename(columns={"NumberOfDependents": "NumDependents"}, inplace=True)
    te_df = pd.merge(te_df, df0, left_index=True, right_index=True)
    df2 = pd.merge(te_df,
                   tmp_df_2,
                   left_index=True,
                   right_index=True,
                   suffixes=("", "_y"))

    # Next, retain the columns we need, in the order we need them
    cols = [
        "SeriousDlqin2yrs", "RUoUL", "age", "NumLate3059", "DebtRatio",
        "MonthlyIncome", "MonthlyIncome_y", "NumOCLL", "NumLate90", "NumRELL",
        "NumLate6089", "NumDependents"
    ]
    df1, df2 = df1[cols], df2[cols]
    df1.rename(columns={"MonthlyIncome_y": "MonthlyIncome_Imputed"},
               inplace=True)
    df2.rename(columns={"MonthlyIncome_y": "MonthlyIncome_Imputed"},
               inplace=True)

    tr_df, te_df = df1, df2
    tr_df.to_csv(os.path.join("data", "tr_with_income.csv"))
    te_df.to_csv(os.path.join("data", "te_with_income.csv"))

    logger.info("Done predicting the missing MonthlyIncome ...")
    tr_df["MonthlyIncome"] = tr_df["MonthlyIncome_Imputed"]
    te_df["MonthlyIncome"] = te_df["MonthlyIncome_Imputed"]
    return tr_df, te_df
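
A hedged usage note: in Code Example #9 this step is reached through _preprocess_data, which presumably renames the raw Kaggle columns to the short names used above (RUoUL, NumOCLL, NumRELL, ...). A minimal sketch of the call site under that assumption:

# Hypothetical call site; tr_df / te_df must already use the short column names.
tr_df, te_df = _predict_monthly_income(tr_df, te_df)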