### Drop redundant columns, useless columns and unused targets in one pass
### K: the *_number columns are numerical copies of other existing columns.
### K: category and subcategory are alternative labels that are not used here.
### K: saddr and daddr may specialize the model to a single network.
redundant_columns = ['state_number', 'proto_number', 'flgs_number']
other_targets = ['category', 'subcategory']
misc_columns = ['saddr', 'daddr']
for _message, _group in (
        ('Removing redundant columns:', redundant_columns),
        ('Removing useless targets:', other_targets),
        ('Removing misc columns:', misc_columns)):
    print(_message, _group)
columns_to_remove = [*redundant_columns, *other_targets, *misc_columns]
### K: `columns=` already names the axis, so no separate `axis` is passed.
df.drop(columns=columns_to_remove, inplace=True)

###############################################################################
### Remove NaN columns (with a lot of NaN values)
### K: remove_nan_columns is defined elsewhere. It receives the frame and
### K: 1 / 2 (presumably the NaN-ratio threshold -- TODO confirm against the
### K: helper) and its result is unpacked into the new frame and a log.
df, log = remove_nan_columns(df, 1 / 2, verbose=False)
### K: the returned log is printed here by the caller.
print(log)

###############################################################################
### Encode categorical features
### K: one OrdinalEncoder instance is re-fitted on each column in turn, so
### K: after this section it only keeps the fit of the last column ('state').
print('Encoding categorical features (ordinal encoding).')
my_encoder = OrdinalEncoder()
### K: (column, cast to str before encoding?) -- same order as before.
for _column, _as_text in (('flgs', False), ('proto', False), ('sport', True),
                          ('dport', True), ('state', False)):
    _series = df[_column].astype(str) if _as_text else df[_column]
    ### K: the encoder expects a 2-D input, hence the (-1, 1) reshape.
    df[_column] = my_encoder.fit_transform(_series.values.reshape(-1, 1))
### K: list the columns that are still of object dtype after encoding.
print('Objects:', list(df.select_dtypes(['object']).columns))
# Example #2
0
# Optional first command-line argument selects `state`; the default is 0.
state = 0
try:
    state = int(sys.argv[1])
except (IndexError, ValueError):
    # No argument was given, or it is not an integer: keep the default.
    # Narrowed from a bare `except`, which would also have swallowed
    # KeyboardInterrupt, SystemExit and unrelated programming errors.
    pass
print("STATE = ", state)
STATES = [0, 10, 100, 1000, 10000]

# pandas display limits: no cap on printed rows, at most 5 printed columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = 5

df = load_dataset()
print("Data Loaded")
# NOTE(review): the return values of both helpers are discarded, so these
# calls only matter if the helpers modify `df` in place -- confirm against
# their definitions.
remove_columns_with_one_value(df, verbose=False)
remove_nan_columns(df, 0.6, verbose=False)

# Build the final DataFrame.
# The first column holds the row numbers (per the original comment), so it
# is dropped by its label.
df = df.drop(columns=df.columns[0])

# Drop the columns considered unrelated.
df = df.drop(columns=['ts', 'te', 'sa', 'da'])


# Resample the frame: frac=1 together with replace=True draws as many rows
# as the frame has, with replacement, using a fixed seed.
# NOTE(review): if a plain shuffle was intended, replace=True is not what is
# wanted (rows can repeat while others are left out) -- confirm the intent.
df = df.sample(frac=1, replace=True, random_state=0)
#################################
## Encoding the data           ##
#################################

# Split the column labels by dtype: object ('O') columns on one side,
# every other dtype on the other.
object_mask = df.dtypes == 'O'
cat_cols = df.columns[object_mask]
num_cols = df.columns[~object_mask]