import numpy as np
import pandas as pd
import pandas.testing

import clean  # the project's cleaning module (assumed import path)


def test_get_clean_df_dropna_rows():
    data = [[0], [np.nan], [-np.inf], [np.inf]]
    columns = ["dummy"]
    df_inp = pd.DataFrame(data=data, columns=columns)
    df_out = clean.get_clean_df(df_inp)
    pandas.testing.assert_frame_equal(
        df_out,
        pd.DataFrame(
            data=[[0]],
            columns=columns,
        ),
    )


def test_get_clean_df_replace_custom_smaller_than_minus_1_values_with_0():
    data = [
        [-2, -1, -2],
        [-1, -1, -2],
        [-1, -2, -2],
        [-2, -2, -2],
    ]
    columns = ["Flow IAT Min", "Fwd IAT Min", "dummy"]
    df_inp = pd.DataFrame(data=data, columns=columns)
    df_out = clean.get_clean_df(df_inp)
    pandas.testing.assert_frame_equal(
        df_out,
        pd.DataFrame(
            # only affect the custom columns
            data=[[0, -1, -2], [-1, -1, -2], [-1, 0, -2], [0, 0, -2]],
            columns=columns,
        ),
    )


def test_get_clean_df_remove_custom_minus_1_values():
    data = [
        [0, 0, -1],
        [0, -1, 0],
        [-1, 0, 0],
        [-1, -1, 0],
    ]
    columns = ["Init Fwd Win Byts", "Init Bwd Win Byts", "dummy"]
    df_inp = pd.DataFrame(data=data, columns=columns)
    df_out = clean.get_clean_df(df_inp)
    pandas.testing.assert_frame_equal(
        df_out,
        pd.DataFrame(
            # only affect the custom columns
            data=[[0, 0, -1]],
            columns=columns,
        ),
    )

Example #4

from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd

import clean  # the project's cleaning module (assumed import path)
import load  # the project module providing DTYPE (assumed import path)

# FEATURES_2018_TO_2017, FEATURES_2017_TO_2018 and LABELS_2017_TO_2018 are
# project-level mappings between the 2017 and 2018 feature names and labels;
# they are assumed to be in scope here.

dataset_name = "cse-cic-ids2017"

DTYPE_MAP = {"int64": np.dtype(int), "object": np.dtype(str)}
DATA_TYPES = {
    FEATURES_2018_TO_2017[feature]: DTYPE_MAP[dtype]
    for feature, dtype in load.DTYPE.items()
}

dataset_name_clean = Path(dataset_name + "-clean")
dataset_name_clean.mkdir(parents=False, exist_ok=True)
for dataset_file in sorted(glob(f"{dataset_name}/*.csv")):
    file_name = dataset_file.split("/")[-1]
    print("#" * 80)
    print("New datafile:", dataset_file)
    print("#" * 80)
    df = pd.read_csv(
        dataset_file,
        usecols=DATA_TYPES.keys(),
        skipinitialspace=True,
        encoding="latin1",
    )
    # Replace 2017 column names with the corresponding 2018 names
    df.rename(columns=FEATURES_2017_TO_2018, inplace=True)
    df = clean.get_clean_df(df, verbose=0)
    # Remove non-ascii characters from the Label column
    df["Label"] = df["Label"].str.encode("ascii", "ignore").str.decode("ascii")
    # Replace labels with their 2018 counterparts (labels with no 2018
    # equivalent are mapped to NaN and dropped below)
    df.replace({"Label": LABELS_2017_TO_2018}, inplace=True)
    df.dropna(axis=0, inplace=True)
    df.to_csv(f"{dataset_name_clean}/{file_name}", index=False)


def test_get_clean_df_remove_timestamp():
    data = [["2018"]]
    columns = ["Timestamp"]
    df_inp = pd.DataFrame(data=data, columns=columns)
    df_out = clean.get_clean_df(df_inp)
    assert list(df_out.columns) == []


def test_get_clean_df_convert_all_columns_to_integers():
    data = [["1", "1.1"]]
    columns = ["dummy1", "dummy2"]
    df_inp = pd.DataFrame(data=data, columns=columns)
    df_out = clean.get_clean_df(df_inp)
    assert list(df_out.dtypes) == [np.dtype("int"), np.dtype("int")]


def test_get_clean_df_remove_categorical_variables():
    data = [[0, 0, 0, 0, 0]]
    columns = ["Protocol", "Src IP", "Src Port", "Dst Port", "Dst IP"]
    df_inp = pd.DataFrame(data=data, columns=columns)
    df_out = clean.get_clean_df(df_inp)
    assert list(df_out.columns) == []
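
Taken together, the tests above pin down the cleaning behavior. The sketch below is a minimal get_clean_df consistent with those tests; the column lists are reconstructed from the tests rather than taken from the project source, and the actual clean.get_clean_df may differ in details:

import numpy as np
import pandas as pd

# Column groups reconstructed from the tests above (an assumption, not the
# project's actual configuration).
REPLACE_BELOW_MINUS_1 = ["Flow IAT Min", "Fwd IAT Min"]
DROP_ROWS_WITH_MINUS_1 = ["Init Fwd Win Byts", "Init Bwd Win Byts"]
CATEGORICAL = ["Timestamp", "Protocol", "Src IP", "Src Port", "Dst Port", "Dst IP"]


def get_clean_df(df, verbose=0):
    # `verbose` is kept for signature compatibility; reporting is omitted here.
    df = df.drop(columns=[c for c in CATEGORICAL if c in df.columns])
    # Replace +/-inf with NaN so that dropna() removes those rows as well.
    df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
    # Replace values smaller than -1 with 0 in the selected IAT columns.
    for column in (c for c in REPLACE_BELOW_MINUS_1 if c in df.columns):
        df.loc[df[column] < -1, column] = 0
    # Remove rows containing -1 in the init-window columns.
    for column in (c for c in DROP_ROWS_WITH_MINUS_1 if c in df.columns):
        df = df[df[column] != -1]
    # Convert every feature column (everything except 'Label') to integers.
    df = df.copy()  # avoid chained-assignment warnings after the filtering
    numeric = [c for c in df.columns if c != "Label"]
    df[numeric] = df[numeric].astype(float).astype(int)
    return df.reset_index(drop=True)
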
# Summarize one of the smaller data sets.
# 
# The following observations can be made:
# 
# 1. 'Flow Byts/s' and 'Flow Pkts/s' columns contain non-numeric values
# 2. 'Init Fwd Win Byts' and 'Init Bwd Win Byts' contain the negative value '-1'
# 3. 'Flow IAT Min' and 'Fwd IAT Min' contain negative values of large magnitude
# 
# The rows with those values in the respective columns are removed (cases 1 and 2; note that case 2 results in a significant decrease in the number of non-Benign flows for a couple of data sets, e.g. 'DoS attacks-Hulk', 'DDOS attack-HOIC' and 'DDOS attack-LOIC-UDP') or the values are replaced (case 3) in the `get_clean_df` function.

# In[15]:


df = pd.read_csv(f'{dataset_name}/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv')
df = clean.get_clean_df(df, verbose=2)
df['target'] = df.pop('Label')
feature_list = clean.get_feature_list(df, tolerance=0.0001, sample_fraction=0.5)
print(feature_list)
del df


# A 16 GB machine is unable to keep copies of the largest data set `Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv` in memory. Therefore some of the low-variance and duplicate features found in the smaller data sets are removed upfront from the largest data set to reduce its size. Moreover, the largest data file contains `extra_features` not present in the other data files, and these are removed as well. Additionally, due to the large number of samples in the largest data set (almost 8 million), a 5% sample (instead of the 50% used for all other data files) is used in the process of feature selection.
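
# A minimal sketch of that upfront reduction (the concrete `extra_features`
# and low-variance/duplicate lists are assumptions here; the actual lists are
# derived from the smaller data files):

extra_features = ["Flow ID", "Src IP", "Src Port", "Dst IP"]  # assumed
low_variance_or_duplicate = []  # assumed; found using the smaller data files

df = pd.read_csv(f"{dataset_name}/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv")
df.drop(columns=extra_features + low_variance_or_duplicate, inplace=True, errors="ignore")
df = clean.get_clean_df(df, verbose=0)
df["target"] = df.pop("Label")
# A 5% sample instead of the 50% used for the smaller data files.
feature_list = clean.get_feature_list(df, tolerance=0.0001, sample_fraction=0.05)
del df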

# Many people (e.g. Frank Harrell, https://twitter.com/f2harrell/status/1137012097391312897?lang=en: `Feature selection doesn't work in general because it can't find the right variables and distorts statistical properties. One summary of the evils of stepwise`) claim that no feature selection should be performed at all. In this case, however, reducing the number of features is necessary due to limited computing resources.
# 
# In principle, feature selection should happen on an isolated subset of the data, so that the test data is not involved in any model choices. That approach is not followed strictly here: the feature selection is performed on the full data set. This is acceptable, since a separate test set, https://www.unb.ca/cic/datasets/ids-2017.html, is used for the final estimation of the model performance.
# 
# The features are selected in `get_feature_list` using an additive process: features are added one by one in order of importance, and a feature is kept only if adding it increases the performance metric (the macro average of recall across all target classes) by more than a threshold; see the sketch below.
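#
# A minimal sketch of that additive (greedy forward) selection, assuming a
# random-forest importance ranking and a held-out validation split; the
# project's actual `clean.get_feature_list` may differ in details:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split


def get_feature_list(df, tolerance=0.0001, sample_fraction=0.5):
    sample = df.sample(frac=sample_fraction, random_state=0)
    X, y = sample.drop(columns=["target"]), sample["target"]
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=0
    )
    # Rank the features once by importance.
    importances = (
        RandomForestClassifier(n_estimators=50, random_state=0)
        .fit(X_train, y_train)
        .feature_importances_
    )
    ordered = [f for _, f in sorted(zip(importances, X.columns), reverse=True)]
    selected, best_score = [], 0.0
    for feature in ordered:
        candidate = selected + [feature]
        model = RandomForestClassifier(n_estimators=50, random_state=0)
        model.fit(X_train[candidate], y_train)
        score = recall_score(y_val, model.predict(X_val[candidate]), average="macro")
        # Keep the feature only if the metric improves by more than `tolerance`.
        if score - best_score > tolerance:
            selected, best_score = candidate, score
    return selected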

# In[16]: