class AnnMLPBinary:
    """Binary (normal vs. attack) MLP classifier for the KDD Cup 1999 dataset.

    The whole experiment runs from ``__init__``: prepare the dataset,
    cross-validate with stratified k-fold, retrain on the full training set
    against the unseen test set, then plot per-epoch loss / accuracy /
    detection-rate / false-alarm-rate curves.
    """

    def __init__(self):
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        self.g = tf.Graph()
        # NOTE(review): this session is created but never used below — Keras
        # manages its own session; log_device_placement only adds device logs.
        self.tf_sess = tf.Session(
            config=tf.ConfigProto(log_device_placement=True), graph=self.g)
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()  # Redirect stdout to a log file unless debugging
        print(__doc__)
        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.folder = 'viz'  # Output folder for saved figures

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None

        # Target <-> label maps. Both int and str keys so raw and
        # stringified predictions resolve with the same dict.
        self.label_map_int_2_string = {
            0: 'good', 1: 'bad', '0': 'good', '1': 'bad'
        }
        self.label_map_string_2_int = {
            'normal': 0, 'dos': 1, 'u2r': 1, 'r2l': 1, 'probe': 1
        }

        # K-fold validation
        self.splits = 5
        self.kfold = StratifiedKFold(n_splits=self.splits,
                                     shuffle=True,
                                     random_state=self.random_state)

        # Network parameters
        self.epochs = 20
        self.batch_size = 100
        self.verbose = 0

        # Scores — one per-epoch history list appended per fold; averaged
        # across folds (axis=0) when plotted below.
        self.metric_loss = []
        self.metric_acc = []
        self.metric_dr = []
        self.metric_far = []
        self.metric_val_loss = []
        self.metric_val_acc = []
        self.metric_val_dr = []
        self.metric_val_far = []

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.n_features = self.X.shape[1]
            self.train_test_split()

        with timer('\nTraining & validating model with kfold'):
            # NOTE(review): as_default() returns a context manager; called
            # bare like this it has no effect — likely meant `with ...:`.
            self.g.as_default()  # Reset graph for tensorboard display
            K.clear_session()

            # Train model on K-1 and validate using remaining fold
            for train, val in self.kfold.split(self.X_train, self.y_train):
                #self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_cv')
                self.model = self.get_model()
                self.history = self.model.fit(
                    self.X_train.iloc[train],
                    self.y_train.iloc[train],
                    validation_data=(self.X_train.iloc[val],
                                     self.y_train.iloc[val]),
                    epochs=self.epochs,
                    batch_size=self.batch_size,
                    verbose=self.verbose)
                    #callbacks=[self.tensorboard])

                self.metric_loss.append(self.history.history['loss'])
                self.metric_acc.append(self.history.history['acc'])
                self.metric_dr.append(self.history.history['dr'])
                self.metric_far.append(self.history.history['far'])
                self.metric_val_loss.append(self.history.history['val_loss'])
                self.metric_val_acc.append(self.history.history['val_acc'])
                self.metric_val_dr.append(self.history.history['val_dr'])
                self.metric_val_far.append(self.history.history['val_far'])

            # NOTE(review): means over all folds AND all epochs combined.
            print('\nTraining mean loss', np.mean(self.metric_loss))
            print('Training mean acc', np.mean(self.metric_acc))
            print('Training mean dr', np.mean(self.metric_dr))
            print('Training mean far', np.mean(self.metric_far))
            print('\nValidation mean loss', np.mean(self.metric_val_loss))
            print('Validation mean acc', np.mean(self.metric_val_acc))
            print('Validation mean dr', np.mean(self.metric_val_dr))
            print('Validation mean far', np.mean(self.metric_val_far))

        with timer('\nTesting model on unseen test set'):
            self.g.as_default()  # Reset graph for tensorboard display
            K.clear_session()
            self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_test')
            self.model = self.get_model()
            # Train model on complete train set and validate with unseen test set
            self.history = self.model.fit(self.X_train, self.y_train,
                                          validation_data=(self.X_test,
                                                           self.y_test),
                                          epochs=self.epochs,
                                          batch_size=self.batch_size,
                                          verbose=self.verbose,
                                          callbacks=[self.tensorboard])

        with timer('\nVisualising results'):
            # Plot model
            plot_model(self.model, to_file='viz/annMLPBinary - model plot.png')

            # Get single class prediction (rather than multi class probability summing to 1)
            y_pred = self.model.predict_classes(self.X_test)

            # NOTE(review): these are epoch means of the *training-phase*
            # metrics from the final fit, not the 'val_*' (test) series.
            print('Test loss', np.mean(self.history.history['loss']))
            print('Test acc', np.mean(self.history.history['acc']))
            print('Test dr', np.mean(self.history.history['dr']))
            print('Test far', np.mean(self.history.history['far']))

            # Remap to string class targets
            self.y_pred = self.map_target_to_label(y_pred)
            self.y_pred = self.y_pred.ravel()
            self.y_test = self.map_target_to_label(self.y_test)

            self.visualize.confusion_matrix(self.y_test, self.y_pred,
                                            self.__class__.__name__)

            epochs = range(1, len(self.history.history['loss']) + 1)

            # Plot loss
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs, np.mean(self.metric_loss, axis=0), 'g',
                    label='Training')
            ax.plot(epochs, np.mean(self.metric_val_loss, axis=0), 'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['loss'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__, 'Loss')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Loss', fontsize=14)
            plt.legend(loc=1, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot accuracy
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs, np.mean(self.metric_acc, axis=0), 'g',
                    label='Training')
            ax.plot(epochs, np.mean(self.metric_val_acc, axis=0), 'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['acc'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__, 'Accuracy')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Accuracy', fontsize=14)
            plt.legend(loc=4, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot detection rate
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs, np.mean(self.metric_dr, axis=0), 'g',
                    label='Training')
            ax.plot(epochs, np.mean(self.metric_val_dr, axis=0), 'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['dr'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__,
                                          'Detection Rate')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Detection Rate', fontsize=14)
            plt.legend(loc=4, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot false alarm rate
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs, np.mean(self.metric_far, axis=0), 'g',
                    label='Training')
            ax.plot(epochs, np.mean(self.metric_val_far, axis=0), 'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['far'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__,
                                          'False Alarm Rate')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('False Alarm Rate', fontsize=14)
            plt.legend(loc=1, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

        self.log_file()  # Restore stdout and close the log file
        print('Finished')

    @staticmethod
    def dr(y_true, y_pred):
        """Detection rate (recall on the positive/attack class): TP / (TP + FN)."""
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        tp = K.sum(y_pos * y_pred_pos)
        fn = K.sum(y_pos * y_pred_neg)
        return tp / (tp + fn + K.epsilon())  # epsilon guards divide-by-zero

    @staticmethod
    def far(y_true, y_pred):
        """False alarm rate (fall-out): FP / (TN + FP)."""
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        y_neg = 1 - y_pos
        tn = K.sum(y_neg * y_pred_neg)
        fp = K.sum(y_neg * y_pred_pos)
        return fp / (tn + fp + K.epsilon())  # epsilon guards divide-by-zero

    def get_model(self):
        """Build and compile the MLP.

        Four hidden relu layers of 25 units, each followed by dropout 0.08,
        with a single sigmoid output for binary classification.
        """
        model = models.Sequential()
        model.add(
            layers.Dense(25, activation='relu',
                         input_shape=(self.n_features, )))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(1, activation='sigmoid'))
        model.compile(optimizer=optimizers.RMSprop(lr=0.0023),
                      loss='binary_crossentropy',
                      metrics=['accuracy', self.dr, self.far])
        return model

    def log_file(self):
        """Toggle stdout redirection to a timestamped log file.

        No-op when a debugger is attached (sys.gettrace). The first call
        redirects stdout to the file; the second restores stdout and closes.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Load the pre-built 2D tensor dataset (type 2) into self.X."""
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_2')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        """Derive the binary 0/1 target from the attack_category column."""
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        """Drop the label column from the feature matrix (in place)."""
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """70/30 train/test split with the fixed random state."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def map_target_to_label(self, t):
        """Vectorised lookup of int/str targets to 'good'/'bad' labels."""
        return np.vectorize(self.label_map_int_2_string.get)(t)

    def fname(self, title):
        """Return the figure save path for a plot title."""
        return '{}/{}.png'.format(self.folder, title)
class Modelling:
    """Train and score several project models on binary and multi-class targets.

    Iterates the cartesian product of models x ('Binary', 'Multi'); each model
    opts in via its ``binary_enabled`` / ``multi_enabled`` flags. Predictions
    are remapped to string labels and scored with a confusion matrix plus
    accuracy / classification report.
    """

    def __init__(self):
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        # NOTE(review): log-file state is disabled; log_file() below would
        # raise AttributeError if re-enabled without these attributes.
        # self.logfile = None
        # self.gettrace = getattr(sys, 'gettrace', None)
        # self.original_stdout = sys.stdout
        # self.timestr = time.strftime("%Y%m%d-%H%M%S")
        # self.log_file()
        print(__doc__)
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.full = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.random_state = 20
        # Target -> label maps (both int and str keys accepted)
        self.label_multi = {
            0: 'normal', '0': 'normal', 1: 'dos', '1': 'dos', 2: 'u2r',
            '2': 'u2r', 3: 'r2l', '3': 'r2l', 4: 'probe', '4': 'probe'
        }
        self.label_binary = {0: 'good', '0': 'good', 1: 'bad', '1': 'bad'}

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nSetting X and y'):
            self.set_X()
            self.n_features = self.X.shape[1]

        models = (RandomForestClf(), AnnSLPBinary(self.n_features),
                  AnnMLPBinary(self.n_features), AnnMLPMulti(self.n_features))
        classification_type = ('Binary', 'Multi')
        for m, ctype in itertools.product(models, classification_type):
            # Only score model/target combinations the model has enabled
            score = False
            if ctype == 'Binary' and m.binary_enabled:
                self.set_y_binary()
                score = True
            elif ctype == 'Multi' and m.multi_enabled:
                self.set_y_multi()
                score = True
            if not score:
                continue
            with timer('\nTraining and scoring {} - {} target'.format(
                    m.__class__.__name__, ctype)):
                m.base['model'] = m.get_model()
                #self.train_test_split()
                m.score(self.X, self.y, ctype)

                # Normalise predictions/targets to int Series, then remap
                # to string labels for reporting.
                m.y_test[ctype] = pd.Series(m.y_test[ctype])
                m.y_pred[ctype] = pd.Series(m.y_pred[ctype])
                m.y_test[ctype] = m.y_test[ctype].astype(int)
                m.y_pred[ctype] = m.y_pred[ctype].astype(int)
                if ctype == 'Binary':
                    m.y_test[ctype] = self.series_map_ac_binary_to_label(
                        m.y_test[ctype])
                    m.y_pred[ctype] = self.series_map_ac_binary_to_label(
                        m.y_pred[ctype])
                else:
                    m.y_test[ctype] = self.series_map_ac_multi_to_label(
                        m.y_test[ctype])
                    m.y_pred[ctype] = self.series_map_ac_multi_to_label(
                        m.y_pred[ctype])

                title = '{} - {} - {} '.format('CM', m.__class__.__name__,
                                               ctype)
                self.visualize.confusion_matrix(m.y_test[ctype],
                                                m.y_pred[ctype], title)
                self.scores(m.y_test[ctype], m.y_pred[ctype])

        # Append the scores to a scores array. I could then do an np.mean(scores) to get the mean(average) from all the kfolds
        # save the epoch number and gfold number if possible as well, to get a per/epoch score
        # self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle stdout redirection to a timestamped log file.

        No-op when a debugger is attached. NOTE(review): relies on attributes
        that are commented out in __init__ — currently unused/broken.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Load the pre-built 2D tensor dataset (type 1) into self.full."""
        self.full = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')

    def set_X(self):
        """Set X to every column except the attack_category label."""
        self.X = self.full.loc[:, self.full.columns != 'attack_category']

    def set_y_binary(self):
        """Set y to the binary 0/1 target derived from attack_category."""
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_binary()
        self.y = self.y.values.ravel()

    def set_y_multi(self):
        """Set y to the 5-class target derived from attack_category."""
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_multi()
        self.y = self.y.values.ravel()

    def train_test_split(self):
        """70/30 train/test split with the fixed random state (unused here)."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def df_map_ac_label_to_binary(self):
        """Map attack_category strings to 0 (normal) / 1 (any attack)."""
        conditions = [(self.y['attack_category'] == 'normal'),
                      (self.y['attack_category'] == 'dos') |
                      (self.y['attack_category'] == 'u2r') |
                      (self.y['attack_category'] == 'r2l') |
                      (self.y['attack_category'] == 'probe')]
        self.y['attack_category'] = np.select(conditions, [0, 1])

    def df_map_ac_label_to_multi(self):
        """Map attack_category strings to class ids '0'..'4'."""
        conditions = [(self.y['attack_category'] == 'normal'),
                      (self.y['attack_category'] == 'dos'),
                      (self.y['attack_category'] == 'u2r'),
                      (self.y['attack_category'] == 'r2l'),
                      (self.y['attack_category'] == 'probe')]
        self.y['attack_category'] = np.select(
            conditions, ['0', '1', '2', '3',
                         '4'])  # string for get_dummies encoding

    def series_map_ac_multi_to_label(self, s):
        """Map a Series of multi-class ids to their string labels."""
        return s.map(self.label_multi)

    def series_map_ac_binary_to_label(self, s):
        """Map a Series of binary ids to 'good'/'bad' labels."""
        return s.map(self.label_binary)

    def scores(self, y_test, y_pred):
        """Print accuracy and the full classification report.

        NOTE(review): the second line is labelled 'F1' but actually prints
        the whole classification_report.
        """
        print('Accuracy {}'.format(accuracy_score(y_test, y_pred)))
        print('F1 {}'.format(classification_report(y_test, y_pred,
                                                   digits=10)))
class Linearity:
    """Explore linear separability of the KDD Cup 1999 classes.

    Samples the dataset, draws scatter plots and convex hulls for selected
    feature pairs, then fits Perceptron / LinearSVC / RBF-SVC classifiers on
    2-feature slices and plots their decision boundaries.
    """

    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()  # Redirect stdout to a log file unless debugging
        print(__doc__)
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.sample = None
        self.full = None
        self.ac_count = {}  # attack_category -> row count, set from the data
        # Numeric columns to standard-scale
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
            'urgent', 'hot', 'num_failed_logins', 'logged_in',
            'num_compromised', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files',
            'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate'
        ]
        # Per-class sampling fractions: all rows vs. a reduced sample that
        # keeps relatively more of the rare classes for the classifier plots.
        self.full_weights = {
            'normal': 1, 'dos': 1, 'probe': 1, 'u2r': 1, 'r2l': 1
        }
        self.minimal_weights = {
            'normal': 0.01, 'dos': 0.01, 'probe': 0.2, 'u2r': 0.5, 'r2l': 0.5
        }

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()
            self.ds.shape()

        with timer('\nEncode and Scale dataset'):
            # Encode categoricals
            le = preprocessing.LabelEncoder()
            self.full['protocol_type'] = le.fit_transform(
                self.full['protocol_type'])
            self.full['service'] = le.fit_transform(self.full['service'])
            self.full['flag'] = le.fit_transform(self.full['flag'])
            # Scale
            sc = StandardScaler()
            self.full[self.scale_cols] = sc.fit_transform(
                self.full[self.scale_cols])

        with timer('\nPlotting scatter graphs'):
            self.sample_dataset(self.full_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.scatter()

        with timer('\nPlotting scatter graphs with convex hull'):
            self.sample_dataset(self.full_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.convex_hull()

        with timer('\nPlotting linear separability with classifiers'):
            self.sample_dataset(self.minimal_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.classifiers()

        self.log_file()  # Restore stdout and close the log file
        print('Finished')

    def log_file(self):
        """Toggle stdout redirection to a timestamped log file.

        No-op when a debugger is attached (sys.gettrace). First call
        redirects stdout; second restores it and closes the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def scatter(self):
        """Scatter-plot a fixed set of feature pairs, coloured by target."""
        self.visualize.scatter(self.X, cola='src_bytes', colb='dst_bytes',
                               hue='target')
        self.visualize.scatter(self.X, cola='count', colb='diff_srv_rate',
                               hue='target')
        self.visualize.scatter(self.X, cola='duration', colb='src_bytes',
                               hue='target')
        self.visualize.scatter(self.X, cola='dst_host_srv_count',
                               colb='dst_bytes', hue='target')
        self.visualize.scatter(self.X, cola='serror_rate',
                               colb='rerror_rate', hue='target')
        # NOTE(review): dst_host_srv_count vs dst_bytes is plotted twice.
        self.visualize.scatter(self.X, cola='dst_host_srv_count',
                               colb='dst_bytes', hue='target')
        self.visualize.scatter(self.X, cola='srv_diff_host_rate',
                               colb='srv_count', hue='target')

    def convex_hull(self):
        """Plot the same feature pairs with a convex hull per target class."""
        buckets = self.y.unique()
        self.visualize.convex_hull(self.X, buckets, cola='src_bytes',
                                   colb='dst_bytes', target='target')
        self.visualize.convex_hull(self.X, buckets, cola='count',
                                   colb='diff_srv_rate', target='target')
        self.visualize.convex_hull(self.X, buckets, cola='duration',
                                   colb='src_bytes', target='target')
        self.visualize.convex_hull(self.X, buckets,
                                   cola='dst_host_srv_count',
                                   colb='dst_bytes', target='target')
        self.visualize.convex_hull(self.X, buckets, cola='serror_rate',
                                   colb='rerror_rate', target='target')
        # NOTE(review): dst_host_srv_count vs dst_bytes is plotted twice.
        self.visualize.convex_hull(self.X, buckets,
                                   cola='dst_host_srv_count',
                                   colb='dst_bytes', target='target')
        self.visualize.convex_hull(self.X, buckets,
                                   cola='srv_diff_host_rate',
                                   colb='srv_count', target='target')

    def load_data(self):
        """Load processed features and targets, concatenated column-wise."""
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def set_attack_category_count(self):
        """Cache per-attack_category row counts for weighted sampling."""
        ac = self.full['attack_category'].value_counts()
        for key, value in ac.items():
            self.ac_count[key] = value

    def set_X_y(self, target):
        """Point X at the current sample and y at its target column.

        NOTE(review): X is the whole sample, so it still contains the label
        columns — downstream code selects specific feature pairs from it.
        """
        print('Setting X, with y as {}'.format(target))
        self.X = self.sample
        self.y = self.sample[target]

    def sample_dataset(self, weights):
        """Build self.sample by drawing weights[class] * count rows per class."""
        print('Sampling dataset with weights {}'.format(weights))
        self.sample = pd.DataFrame()
        for key, value in self.ac_count.items():
            samples = int(value * weights[key])
            df = self.full[self.full.attack_category == key].sample(
                samples, random_state=self.random_state)
            # NOTE(review): DataFrame.append is removed in pandas >= 2.0;
            # pd.concat would be the modern equivalent.
            self.sample = self.sample.append(df)

    def classifiers(self):
        """Fit three classifiers on 2-feature slices and plot boundaries.

        For each (classifier, feature pair) the model is fit and evaluated on
        the SAME data (training fit, no held-out set) — this visualises
        separability, not generalisation.
        """
        le = preprocessing.LabelEncoder()
        self.y = le.fit_transform(self.y)
        _y = self.y
        models = (Perceptron(max_iter=100, tol=1e-3,
                             random_state=self.random_state),
                  LinearSVC(max_iter=500, random_state=self.random_state,
                            tol=1e-5),
                  SVC(kernel='rbf', gamma=5, C=10.0,
                      random_state=self.random_state))
        titles = ('Perceptron', 'LinearSVC (linear kernel)',
                  'SVC with RBF kernel')
        columns = [('srv_diff_host_rate', 'srv_count'),
                   ('dst_host_srv_count', 'count'),
                   ('dst_host_srv_count', 'dst_bytes')]
        for clf, title in zip(models, titles):
            for cola, colb in columns:
                _x = self.X.loc[:, [cola, colb]]
                clf.fit(_x, _y)
                _y_pred = clf.predict(_x)
                self.visualize.boundary(_x, _y, clf, title, cola, colb)
                self.visualize.confusion_matrix(
                    _y, _y_pred, title + ' - ' + cola + ' vs ' + colb)
class FeatureSelection:
    """Compare feature-selection strategies on the KDD Cup 1999 dataset.

    Runs each project selector against both the multi-class and binary
    targets, then scores the selected feature subset with cross-validated
    XGBoost.
    """

    def __init__(self):
        self.logfile = False  # NOTE(review): other classes initialise this to None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()  # Redirect stdout to a log file unless debugging
        print(__doc__)
        self.filehandler = Filehandler()
        self.visualize = Visualize()
        self.ds = KDDCup1999()
        self.X = None
        self.y = None
        self.full = None
        self.random_state = 20
        self.num_features = 15
        # Numeric columns to min-max scale
        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land',
                           'wrong_fragment', 'urgent', 'hot',
                           'num_failed_logins', 'logged_in',
                           'num_compromised', 'root_shell', 'su_attempted',
                           'num_root', 'num_file_creations', 'num_shells',
                           'num_access_files', 'is_guest_login', 'count',
                           'srv_count', 'serror_rate', 'rerror_rate',
                           'diff_srv_rate', 'srv_diff_host_rate',
                           'dst_host_count', 'dst_host_srv_count',
                           'dst_host_diff_srv_rate',
                           'dst_host_same_src_port_rate',
                           'dst_host_srv_diff_host_rate']

        with timer('\nLoading dataset'):
            self.load_data()
            self.encode_scale()
            self.set_X()

        with timer('\nFeature selection'):
            for selector in (Original(), UnivariateSelector(),
                             RecursiveSelector(), PCASelector(),
                             #KernelPCASelector(),
                             ExtraTreesSelector(), RandomForestSelector()):
                # Evaluate each selector on both label encodings
                for label in ('attack_category', 'target'):
                    self.set_y(label)
                    with timer('\nFitting selector ' +
                               selector.__class__.__name__):
                        selector.fit_model(self.X, self.y)
                        x = selector.get_top_features(self.X, label)
                    with timer('\nXGBoost scoring of features selected by ' +
                               selector.__class__.__name__):
                        self.score_with_xgboost(x, self.y, selector.title)

        self.log_file()  # Restore stdout and close the log file
        print('Finished')

    def log_file(self):
        """Toggle stdout redirection to a timestamped log file.

        No-op when a debugger is attached (sys.gettrace). First call
        redirects stdout; second restores it and closes the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Load processed features and targets, concatenated column-wise."""
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        print(self.ds.dataset.columns)
        self.ds.row_count_by_target('attack_category')

    def encode_scale(self):
        """Label-encode the categorical columns and min-max scale numerics."""
        # Encode categoricals
        le = preprocessing.LabelEncoder()
        self.full['protocol_type'] = le.fit_transform(
            self.full['protocol_type'])
        self.full['service'] = le.fit_transform(self.full['service'])
        self.full['flag'] = le.fit_transform(self.full['flag'])
        # Scale
        sc = MinMaxScaler()
        self.full[self.scale_cols] = sc.fit_transform(
            self.full[self.scale_cols])

    def set_X(self):
        """Set X to all columns except the last two (the label columns)."""
        self.X = self.full.iloc[:, :-2]

    def set_y(self, label):
        """Set y to the named label column of the full frame."""
        self.y = self.full[label]

    def score_with_xgboost(self, x, y, title):
        """Cross-validate XGBoost on the selected features and plot a CM.

        NOTE(review): StratifiedKFold is given random_state without
        shuffle=True — it has no effect (and raises in newer sklearn).
        """
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        kfold = StratifiedKFold(n_splits=10, random_state=self.random_state)
        results = cross_val_score(clf, x, y, cv=kfold)
        print("XGBoost Accuracy: %.2f%% (+/- %.2f%%)" %
              (results.mean() * 100, results.std() * 100))
        y_pred = cross_val_predict(clf, x, y, cv=10)
        self.visualize.confusion_matrix(y, y_pred, title)
class Sampling:
    """Compare over-sampling strategies (imblearn) on KDD Cup 1999.

    For each sampler, re-samples on attack_category, scales with a quantile
    transform, then scores with cross-validated XGBoost on both the
    multi-class and binary targets.
    """

    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()  # Redirect stdout to a log file unless debugging
        print(__doc__)
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None

        # RF Feature selected plus sparse cols
        self.cols = [
            'count', 'diff_srv_rate', 'src_bytes', 'dst_host_srv_count',
            'flag', 'dst_bytes', 'serror_rate', 'dst_host_diff_srv_rate',
            'service', 'dst_host_count', 'dst_host_srv_diff_host_rate',
            'logged_in', 'protocol_type', 'dst_host_same_src_port_rate',
            'hot', 'srv_count', 'wrong_fragment', 'num_compromised',
            'rerror_rate', 'srv_diff_host_rate', 'urgent',
            'num_failed_logins', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files',
            'is_guest_login'
        ]

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nScaling'):
            # Sampling options
            for sampler in (Original(), RandomOverSampler(),
                            SMOTE(random_state=self.random_state),
                            ADASYN(random_state=self.random_state),
                            BorderlineSMOTE(random_state=self.random_state,
                                            kind='borderline-1')):
                # Rebuild X each iteration; 'target' rides along inside X so
                # it is resampled in lock-step with the features.
                self.X = self.full.loc[:, self.cols]
                self.X['target'] = self.full['target']
                print('X shape with selected features and binary - ',
                      self.X.shape)
                self.X = pd.get_dummies(
                    data=self.X, columns=['protocol_type', 'service', 'flag'])
                print('X shape after encoding categoricals - ', self.X.shape)

                # Re-sample based on attack_category labels
                res_x = pd.DataFrame()
                res_x, res_y_attack_category, title = self.sample(
                    sampler, self.X, self.full['attack_category'])
                res_y_target = res_x[
                    'target']  # Grab target as y from resampled x set
                res_x.drop(columns=['target'], inplace=True)
                print('X shape after sampling and removing target - ',
                      res_x.shape)
                print('y shape with attack_category after resample - ',
                      res_y_attack_category.shape)
                print(res_y_attack_category.value_counts())
                res_y_attack_category.value_counts().plot(
                    kind='bar',
                    title=title + ' - Resampled Count (attack_category)')
                plt.show()
                print('y shape with target after resample - ',
                      res_y_target.shape)

                # Scale after resampling
                qt = QuantileTransformer(output_distribution='normal')
                res_x = qt.fit_transform(res_x)
                print('X shape after scaling - ', res_x.shape)

                # Score on attack_category multi-class
                self.model_and_score(res_x, res_y_attack_category, title,
                                     'attack_category')
                # Score on binary target
                self.model_and_score(res_x, res_y_target, title, 'target')

        self.log_file()  # Restore stdout and close the log file
        print('Finished')

    def log_file(self):
        """Toggle stdout redirection to a timestamped log file.

        No-op when a debugger is attached (sys.gettrace). First call
        redirects stdout; second restores it and closes the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Load processed features and targets, concatenated column-wise."""
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_y(self, label):
        """Set y to the named label column of the full frame."""
        self.y = self.full[label]

    def sample(self, sampler, X, y):
        """Resample (X, y) with the given sampler.

        Returns (res_x, res_y, title); ndarray results are converted back to
        pandas objects preserving X's column names.
        """
        title = sampler.__class__.__name__
        res_x, res_y = sampler.fit_resample(X, y)
        if isinstance(res_x, np.ndarray):
            res_x = pd.DataFrame(res_x, columns=X.columns)
        if isinstance(res_y, np.ndarray):
            res_y = pd.Series(res_y)
        print('Shape after sampling with {} - x {}, y {}'.format(
            title, res_x.shape, res_y.shape))
        return res_x, res_y, title

    def model_and_score(self, X, y, title, label):
        """Cross-validate XGBoost on (X, y) and plot a confusion matrix.

        NOTE(review): StratifiedKFold is given random_state without
        shuffle=True — it has no effect (and raises in newer sklearn).
        """
        clf = XGBClassifier(n_estimators=50, random_state=self.random_state)
        kfold = StratifiedKFold(n_splits=5, random_state=self.random_state)
        results = cross_val_score(clf, X, y, cv=kfold)
        y_pred = cross_val_predict(clf, X, y, cv=5)
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}'.format(
            title, label, results.mean() * 100, results.std() * 100))
        self.visualize.confusion_matrix(
            y, y_pred,
            '{} - {} - Label {}'.format(title, clf.__class__.__name__, label))
class Scaling:
    """Compare scaling strategies on the KDD Cup 1999 dataset.

    Plots the feature distributions before and after each scaler, then
    scores each scaled dataset with cross-validated XGBoost on both the
    multi-class and binary targets.
    """

    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()  # Redirect stdout to a log file unless debugging
        print(__doc__)
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None
        self.ac_count = {}  # attack_category -> row count
        self.scores = OrderedDict()  # NOTE(review): populated nowhere below
        # Numeric columns the scalers operate on
        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land',
                           'wrong_fragment', 'urgent', 'hot',
                           'num_failed_logins', 'logged_in',
                           'num_compromised', 'root_shell', 'su_attempted',
                           'num_root', 'num_file_creations', 'num_shells',
                           'num_access_files', 'is_guest_login', 'count',
                           'srv_count', 'serror_rate', 'rerror_rate',
                           'diff_srv_rate', 'srv_diff_host_rate',
                           'dst_host_count', 'dst_host_srv_count',
                           'dst_host_diff_srv_rate',
                           'dst_host_same_src_port_rate',
                           'dst_host_srv_diff_host_rate']

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()

        with timer('\nEncoding categoricals'):
            le = preprocessing.LabelEncoder()
            self.full['protocol_type'] = le.fit_transform(
                self.full['protocol_type'])
            self.full['service'] = le.fit_transform(self.full['service'])
            self.full['flag'] = le.fit_transform(self.full['flag'])

        with timer('\nSetting X'):
            self.set_X()
            self.ds.shape()

        with timer('\nDistribution Before Scaling'):
            self.dist_before_scaling()

        with timer('\nScaling'):
            for scaler in (StandardScaler(), Normalizer(),
                           MinMaxScaler(feature_range=(0, 1)),
                           Binarizer(threshold=0.0),
                           RobustScaler(quantile_range=(25, 75)),
                           PowerTransformer(method='yeo-johnson'),
                           QuantileTransformer(
                               output_distribution='normal')):
                title, res_x = self.scale(scaler)
                # Score the same scaled features against both targets
                label = 'attack_category'
                self.set_y(label)
                self.model_and_score(scaler, res_x, title, label)
                label = 'target'
                self.set_y(label)
                self.model_and_score(scaler, res_x, title, label)

        self.log_file()  # Restore stdout and close the log file
        print('Finished')

    def log_file(self):
        """Toggle stdout redirection to a timestamped log file.

        No-op when a debugger is attached (sys.gettrace). First call
        redirects stdout; second restores it and closes the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Load processed features and targets, concatenated column-wise."""
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_attack_category_count(self):
        """Cache per-attack_category row counts."""
        ac = self.full['attack_category'].value_counts()
        for key, value in ac.items():
            self.ac_count[key] = value

    def set_X(self):
        """Set X to the numeric columns to be scaled."""
        self.X = self.full.loc[:, self.scale_cols]

    def set_y(self, label):
        """Set y to the named label column of the full frame."""
        self.y = self.full[label]

    def dist_before_scaling(self):
        """KDE-plot the raw feature distributions as a baseline."""
        self.visualize.kdeplot('Distribution Before Scaling', self.X,
                               self.scale_cols)

    def scale(self, scaler):
        """Fit-transform X with the scaler and plot the new distributions.

        Warnings are suppressed because several scalers warn on the highly
        skewed / constant columns in this dataset.
        """
        x = self.X[self.scale_cols]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res_x = scaler.fit_transform(x)
        res_x = pd.DataFrame(res_x, columns=self.scale_cols)
        title = 'Distribution After ' + scaler.__class__.__name__
        self.visualize.kdeplot(title, res_x, self.scale_cols)
        return title, res_x

    def model_and_score(self, scaler, res_x, title, label):
        """Cross-validate XGBoost on the scaled data and plot a CM.

        NOTE(review): StratifiedKFold is given random_state without
        shuffle=True — it has no effect (and raises in newer sklearn).
        """
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        kfold = StratifiedKFold(n_splits=10, random_state=self.random_state)
        results = cross_val_score(clf, res_x, self.y, cv=kfold)
        y_pred = cross_val_predict(clf, res_x, self.y, cv=10)
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}'.format(
            title, label, results.mean() * 100, results.std() * 100))
        self.visualize.confusion_matrix(
            self.y, y_pred,
            '{} - {} - Label {}'.format(title, clf.__class__.__name__, label))