Example #1
class AnnMLPBinary:
    def __init__(self):
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        self.g = tf.Graph()
        self.tf_sess = tf.Session(
            config=tf.ConfigProto(log_device_placement=True), graph=self.g)

        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.folder = 'viz'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.label_map_int_2_string = {
            0: 'good',
            1: 'bad',
            '0': 'good',
            '1': 'bad'
        }
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 1,
            'r2l': 1,
            'probe': 1
        }

        # K-fold validation
        self.splits = 5
        self.kfold = StratifiedKFold(n_splits=self.splits,
                                     shuffle=True,
                                     random_state=self.random_state)

        # Network parameters
        self.epochs = 20
        self.batch_size = 100
        self.verbose = 0

        # Scores
        self.metric_loss = []
        self.metric_acc = []
        self.metric_dr = []
        self.metric_far = []

        self.metric_val_loss = []
        self.metric_val_acc = []
        self.metric_val_dr = []
        self.metric_val_far = []

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.n_features = self.X.shape[1]
            self.train_test_split()

        with timer('\nTraining & validating model with kfold'):
            # as_default() returns a context manager; without a "with" block it
            # is a no-op, so the graph reset here comes from K.clear_session()
            self.g.as_default()
            K.clear_session()

            # Train model on K-1 folds and validate on the remaining fold
            for train, val in self.kfold.split(self.X_train, self.y_train):
                #self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_cv')
                self.model = self.get_model()

                self.history = self.model.fit(
                    self.X_train.iloc[train],
                    self.y_train.iloc[train],
                    validation_data=(self.X_train.iloc[val],
                                     self.y_train.iloc[val]),
                    epochs=self.epochs,
                    batch_size=self.batch_size,
                    verbose=self.verbose)
                #callbacks=[self.tensorboard])

                self.metric_loss.append(self.history.history['loss'])
                self.metric_acc.append(self.history.history['acc'])
                self.metric_dr.append(self.history.history['dr'])
                self.metric_far.append(self.history.history['far'])
                self.metric_val_loss.append(self.history.history['val_loss'])
                self.metric_val_acc.append(self.history.history['val_acc'])
                self.metric_val_dr.append(self.history.history['val_dr'])
                self.metric_val_far.append(self.history.history['val_far'])

            print('\nTraining mean loss', np.mean(self.metric_loss))
            print('Training mean acc', np.mean(self.metric_acc))
            print('Training mean dr', np.mean(self.metric_dr))
            print('Training mean far', np.mean(self.metric_far))
            print('\nValidation mean loss', np.mean(self.metric_val_loss))
            print('Validation mean acc', np.mean(self.metric_val_acc))
            print('Validation mean dr', np.mean(self.metric_val_dr))
            print('Validation mean far', np.mean(self.metric_val_far))

        with timer('\nTesting model on unseen test set'):
            # as_default() returns a context manager; without a "with" block it
            # is a no-op, so the graph reset here comes from K.clear_session()
            self.g.as_default()
            K.clear_session()

            self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_test')
            self.model = self.get_model()

            # Train model on complete train set and validate with unseen test set
            self.history = self.model.fit(self.X_train,
                                          self.y_train,
                                          validation_data=(self.X_test,
                                                           self.y_test),
                                          epochs=self.epochs,
                                          batch_size=self.batch_size,
                                          verbose=self.verbose,
                                          callbacks=[self.tensorboard])

        with timer('\nVisualising results'):
            # Plot model
            plot_model(self.model, to_file='viz/annMLPBinary - model plot.png')

            # Get hard 0/1 class predictions (rather than sigmoid probabilities)
            y_pred = self.model.predict_classes(self.X_test)

            # validation_data in the fit above is the unseen test set, so the
            # val_* history keys hold the test-set metrics
            print('Test loss', np.mean(self.history.history['val_loss']))
            print('Test acc', np.mean(self.history.history['val_acc']))
            print('Test dr', np.mean(self.history.history['val_dr']))
            print('Test far', np.mean(self.history.history['val_far']))

            # Remap to string class targets
            self.y_pred = self.map_target_to_label(y_pred)
            self.y_pred = self.y_pred.ravel()
            self.y_test = self.map_target_to_label(self.y_test)

            self.visualize.confusion_matrix(self.y_test, self.y_pred,
                                            self.__class__.__name__)

            epochs = range(1, len(self.history.history['loss']) + 1)

            # Plot loss
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_loss, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_loss, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['val_loss'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__, 'Loss')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Loss', fontsize=14)
            plt.legend(loc=1, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot accuracy
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_acc, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_acc, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['val_acc'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__, 'Accuracy')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Accuracy', fontsize=14)
            plt.legend(loc=4, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot detection rate
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_dr, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_dr, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['val_dr'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__,
                                          'Detection Rate')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Detection Rate', fontsize=14)
            plt.legend(loc=4, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot false alarm rate
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_far, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_far, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['val_far'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__,
                                          'False Alarm Rate')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('False Alarm Rate', fontsize=14)
            plt.legend(loc=1, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

        self.log_file()
        print('Finished')

    @staticmethod
    def dr(y_true, y_pred):
        # Detection rate (recall on the attack class): TP / (TP + FN)
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        tp = K.sum(y_pos * y_pred_pos)
        fn = K.sum(y_pos * y_pred_neg)
        return tp / (tp + fn + K.epsilon())

    @staticmethod
    def far(y_true, y_pred):
        # False alarm rate (fall-out): FP / (FP + TN)
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        y_neg = 1 - y_pos
        tn = K.sum(y_neg * y_pred_neg)
        fp = K.sum(y_neg * y_pred_pos)
        return fp / (tn + fp + K.epsilon())

    def get_model(self):
        model = models.Sequential()
        model.add(
            layers.Dense(25,
                         activation='relu',
                         input_shape=(self.n_features, )))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(1, activation='sigmoid'))
        model.compile(optimizer=optimizers.RMSprop(lr=0.0023),
                      loss='binary_crossentropy',
                      metrics=['accuracy', self.dr, self.far])
        return model

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_2')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def map_target_to_label(self, t):
        return np.vectorize(self.label_map_int_2_string.get)(t)

    def fname(self, title):
        return '{}/{}.png'.format(self.folder, title)
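
A quick way to sanity-check the custom dr and far metrics defined in this class is to reproduce them with plain NumPy on a toy batch. This is a minimal sketch, not part of the original listing; the toy labels and probabilities are made up and assume the same 0 = good / 1 = bad coding used above.

import numpy as np

y_true = np.array([1, 1, 1, 1, 0, 0, 0, 0], dtype=float)
y_prob = np.array([0.9, 0.2, 0.8, 0.6, 0.1, 0.7, 0.3, 0.2], dtype=float)
y_hat = np.round(np.clip(y_prob, 0, 1))  # mirrors K.round(K.clip(...)) above

tp = np.sum(y_true * y_hat)              # true positives
fn = np.sum(y_true * (1 - y_hat))        # false negatives
fp = np.sum((1 - y_true) * y_hat)        # false positives
tn = np.sum((1 - y_true) * (1 - y_hat))  # true negatives

print('dr  =', tp / (tp + fn))           # detection rate (recall): 0.75
print('far =', fp / (tn + fp))           # false alarm rate (fall-out): 0.25
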
Example #2
class Modelling:
    def __init__(self):
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity

        # self.logfile = None
        # self.gettrace = getattr(sys, 'gettrace', None)
        # self.original_stdout = sys.stdout
        # self.timestr = time.strftime("%Y%m%d-%H%M%S")
        # self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.full = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.random_state = 20
        self.label_multi = {
            0: 'normal',
            '0': 'normal',
            1: 'dos',
            '1': 'dos',
            2: 'u2r',
            '2': 'u2r',
            3: 'r2l',
            '3': 'r2l',
            4: 'probe',
            '4': 'probe'
        }
        self.label_binary = {0: 'good', '0': 'good', 1: 'bad', '1': 'bad'}

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nSetting X and y'):
            self.set_X()
            self.n_features = self.X.shape[1]

        models = (RandomForestClf(), AnnSLPBinary(self.n_features),
                  AnnMLPBinary(self.n_features), AnnMLPMulti(self.n_features))
        classification_type = ('Binary', 'Multi')

        for m, ctype in itertools.product(models, classification_type):
            score = False
            if ctype == 'Binary' and m.binary_enabled:
                self.set_y_binary()
                score = True
            elif ctype == 'Multi' and m.multi_enabled:
                self.set_y_multi()
                score = True

            if not score:
                continue

            with timer('\nTraining and scoring {} - {} target'.format(
                    m.__class__.__name__, ctype)):
                m.base['model'] = m.get_model()
                #self.train_test_split()
                m.score(self.X, self.y, ctype)

            m.y_test[ctype] = pd.Series(m.y_test[ctype])
            m.y_pred[ctype] = pd.Series(m.y_pred[ctype])
            m.y_test[ctype] = m.y_test[ctype].astype(int)
            m.y_pred[ctype] = m.y_pred[ctype].astype(int)

            if ctype == 'Binary':
                m.y_test[ctype] = self.series_map_ac_binary_to_label(
                    m.y_test[ctype])
                m.y_pred[ctype] = self.series_map_ac_binary_to_label(
                    m.y_pred[ctype])
            else:
                m.y_test[ctype] = self.series_map_ac_multi_to_label(
                    m.y_test[ctype])
                m.y_pred[ctype] = self.series_map_ac_multi_to_label(
                    m.y_pred[ctype])

            title = '{} - {} - {} '.format('CM', m.__class__.__name__, ctype)
            self.visualize.confusion_matrix(m.y_test[ctype], m.y_pred[ctype],
                                            title)
            self.scores(m.y_test[ctype], m.y_pred[ctype])

        # TODO: append each fold's scores to a scores array so that
        # np.mean(scores) gives the average over all k-folds; record the epoch
        # and fold numbers as well to allow a per-epoch score.

        # self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.full = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')

    def set_X(self):
        self.X = self.full.loc[:, self.full.columns != 'attack_category']

    def set_y_binary(self):
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_binary()
        self.y = self.y.values.ravel()

    def set_y_multi(self):
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_multi()
        self.y = self.y.values.ravel()

    def train_test_split(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def df_map_ac_label_to_binary(self):
        conditions = [(self.y['attack_category'] == 'normal'),
                      (self.y['attack_category'] == 'dos') |
                      (self.y['attack_category'] == 'u2r') |
                      (self.y['attack_category'] == 'r2l') |
                      (self.y['attack_category'] == 'probe')]
        self.y['attack_category'] = np.select(conditions, [0, 1])

    def df_map_ac_label_to_multi(self):
        conditions = [(self.y['attack_category'] == 'normal'),
                      (self.y['attack_category'] == 'dos'),
                      (self.y['attack_category'] == 'u2r'),
                      (self.y['attack_category'] == 'r2l'),
                      (self.y['attack_category'] == 'probe')]
        self.y['attack_category'] = np.select(
            conditions,
            ['0', '1', '2', '3', '4'])  # string for get_dummies encoding

    def series_map_ac_multi_to_label(self, s):
        return s.map(self.label_multi)

    def series_map_ac_binary_to_label(self, s):
        return s.map(self.label_binary)

    def scores(self, y_test, y_pred):
        print('Accuracy {}'.format(accuracy_score(y_test, y_pred)))
        print('Classification report\n{}'.format(
            classification_report(y_test, y_pred, digits=10)))
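
Both df_map_ac_label_to_binary and df_map_ac_label_to_multi lean on np.select. The sketch below is illustration only, on a made-up toy frame rather than the KDD data, and shows the two encodings side by side.

import numpy as np
import pandas as pd

y = pd.DataFrame({'attack_category': ['normal', 'dos', 'u2r', 'r2l', 'probe', 'dos']})

# Binary: normal -> 0, any attack category -> 1
binary = np.select([y['attack_category'] == 'normal',
                    y['attack_category'].isin(['dos', 'u2r', 'r2l', 'probe'])],
                   [0, 1])

# Multi: one code per attack category (string codes, as in the listing)
multi = np.select([y['attack_category'] == c
                   for c in ('normal', 'dos', 'u2r', 'r2l', 'probe')],
                  ['0', '1', '2', '3', '4'])

print(binary)  # [0 1 1 1 1 1]
print(multi)   # ['0' '1' '2' '3' '4' '1']
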
Example #3
class Linearity:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20

        self.X = None
        self.y = None
        self.sample = None
        self.full = None
        self.ac_count = {}
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
            'urgent', 'hot', 'num_failed_logins', 'logged_in',
            'num_compromised', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files',
            'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate'
        ]
        self.full_weights = {
            'normal': 1,
            'dos': 1,
            'probe': 1,
            'u2r': 1,
            'r2l': 1
        }
        self.minimal_weights = {
            'normal': 0.01,
            'dos': 0.01,
            'probe': 0.2,
            'u2r': 0.5,
            'r2l': 0.5
        }

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()
            self.ds.shape()
        with timer('\nEncode and Scale dataset'):
            # Encode categoricals
            le = preprocessing.LabelEncoder()
            self.full['protocol_type'] = le.fit_transform(
                self.full['protocol_type'])
            self.full['service'] = le.fit_transform(self.full['service'])
            self.full['flag'] = le.fit_transform(self.full['flag'])

            # Scale
            sc = StandardScaler()
            self.full[self.scale_cols] = sc.fit_transform(
                self.full[self.scale_cols])
        with timer('\nPlotting scatter graphs'):
            self.sample_dataset(self.full_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.scatter()
        with timer('\nPlotting scatter graphs with convex hull'):
            self.sample_dataset(self.full_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.convex_hull()
        with timer('\nPlotting linear separability with classifiers'):
            self.sample_dataset(self.minimal_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.classifiers()

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def scatter(self):
        self.visualize.scatter(self.X,
                               cola='src_bytes',
                               colb='dst_bytes',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='count',
                               colb='diff_srv_rate',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='duration',
                               colb='src_bytes',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='dst_host_srv_count',
                               colb='dst_bytes',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='serror_rate',
                               colb='rerror_rate',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='dst_host_srv_count',
                               colb='dst_bytes',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='srv_diff_host_rate',
                               colb='srv_count',
                               hue='target')

    def convex_hull(self):
        buckets = self.y.unique()
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='src_bytes',
                                   colb='dst_bytes',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='count',
                                   colb='diff_srv_rate',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='duration',
                                   colb='src_bytes',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='dst_host_srv_count',
                                   colb='dst_bytes',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='serror_rate',
                                   colb='rerror_rate',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='dst_host_srv_count',
                                   colb='dst_bytes',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='srv_diff_host_rate',
                                   colb='srv_count',
                                   target='target')

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def set_attack_category_count(self):
        ac = self.full['attack_category'].value_counts()
        for key, value in ac.items():
            self.ac_count[key] = value

    def set_X_y(self, target):
        print('Setting X, with y as {}'.format(target))
        self.X = self.sample
        self.y = self.sample[target]

    def sample_dataset(self, weights):
        print('Sampling dataset with weights {}'.format(weights))
        self.sample = pd.DataFrame()
        for key, value in self.ac_count.items():
            samples = int(value * weights[key])
            df = self.full[self.full.attack_category == key].sample(
                samples, random_state=self.random_state)
            self.sample = self.sample.append(df)

    def classifiers(self):
        le = preprocessing.LabelEncoder()
        self.y = le.fit_transform(self.y)
        _y = self.y

        models = (Perceptron(max_iter=100,
                             tol=1e-3,
                             random_state=self.random_state),
                  LinearSVC(max_iter=500,
                            random_state=self.random_state,
                            tol=1e-5),
                  SVC(kernel='rbf',
                      gamma=5,
                      C=10.0,
                      random_state=self.random_state))

        titles = ('Perceptron', 'LinearSVC (linear kernel)',
                  'SVC with RBF kernel')
        columns = [('srv_diff_host_rate', 'srv_count'),
                   ('dst_host_srv_count', 'count'),
                   ('dst_host_srv_count', 'dst_bytes')]
        for clf, title in zip(models, titles):
            for cola, colb in columns:
                _x = self.X.loc[:, [cola, colb]]
                clf.fit(_x, _y)
                _y_pred = clf.predict(_x)
                self.visualize.boundary(_x, _y, clf, title, cola, colb)
                self.visualize.confusion_matrix(
                    _y, _y_pred, title + ' - ' + cola + ' vs ' + colb)
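
The visualize.boundary helper is project-specific, so the sketch below is an assumption about what such a plot typically wraps: fit a two-feature classifier and shade its predictions over a mesh grid, here with stock scikit-learn and matplotlib on synthetic data.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import Perceptron

# Synthetic two-feature, two-class data stands in for the sampled KDD columns
X, y = make_classification(n_samples=200, n_features=2, n_redundant=0,
                           n_informative=2, random_state=20)
clf = Perceptron(max_iter=100, tol=1e-3, random_state=20).fit(X, y)

# Evaluate the classifier over a grid covering the feature space
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200),
                     np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200))
zz = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.contourf(xx, yy, zz, alpha=0.3)       # shaded decision regions
plt.scatter(X[:, 0], X[:, 1], c=y, s=15)  # the points themselves
plt.title('Perceptron decision boundary (sketch)')
plt.show()
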
Example #4
class FeatureSelection:
    def __init__(self):
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.visualize = Visualize()
        self.ds = KDDCup1999()
        self.X = None
        self.y = None
        self.full = None
        self.random_state = 20
        self.num_features = 15
        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                           'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login',
                           'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                           'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
                           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate']

        with timer('\nLoading dataset'):
            self.load_data()
            self.encode_scale()
            self.set_X()
        with timer('\nFeature selection'):
            for selector in (Original(),
                             UnivariateSelector(),
                             RecursiveSelector(),
                             PCASelector(),
                             #KernelPCASelector(),
                             ExtraTreesSelector(),
                             RandomForestSelector()):
                for label in ('attack_category', 'target'):
                    self.set_y(label)
                    with timer('\nFitting selector ' + selector.__class__.__name__):
                        selector.fit_model(self.X, self.y)
                        x = selector.get_top_features(self.X, label)
                    with timer('\nXGBoost scoring of features selected by ' + selector.__class__.__name__):
                        self.score_with_xgboost(x, self.y, selector.title)

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        print(self.ds.dataset.columns)
        self.ds.row_count_by_target('attack_category')

    def encode_scale(self):
        # Encode categoricals
        le = preprocessing.LabelEncoder()
        self.full['protocol_type'] = le.fit_transform(self.full['protocol_type'])
        self.full['service'] = le.fit_transform(self.full['service'])
        self.full['flag'] = le.fit_transform(self.full['flag'])

        # Scale
        sc = MinMaxScaler()
        self.full[self.scale_cols] = sc.fit_transform(self.full[self.scale_cols])

    def set_X(self):
        self.X = self.full.iloc[:, :-2]

    def set_y(self, label):
        self.y = self.full[label]

    def score_with_xgboost(self, x, y, title):
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=self.random_state)  # shuffle so random_state takes effect
        results = cross_val_score(clf, x, y, cv=kfold)
        print("XGBoost Accuracy: %.2f%% (+/- %.2f%%)" % (results.mean() * 100, results.std() * 100))
        y_pred = cross_val_predict(clf, x, y, cv=10)
        self.visualize.confusion_matrix(y, y_pred, title)
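
The selector classes iterated above (UnivariateSelector, RecursiveSelector, PCASelector, ...) are project-specific. As a hedged illustration of the same select-then-score-with-XGBoost pattern using only stock parts, the sketch below picks the top 15 features with SelectKBest (an assumption, not necessarily what UnivariateSelector wraps) on synthetic data and scores them the way score_with_xgboost does.

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

# Synthetic stand-in for the encoded/scaled KDD features
X, y = make_classification(n_samples=500, n_features=30, n_informative=10,
                           random_state=20)

# Keep the 15 highest-scoring features by ANOVA F-value
X_top = SelectKBest(f_classif, k=15).fit_transform(X, y)

clf = XGBClassifier(n_estimators=100, random_state=20)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=20)
results = cross_val_score(clf, X_top, y, cv=kfold)
print('XGBoost Accuracy: %.2f%% (+/- %.2f%%)' % (results.mean() * 100,
                                                 results.std() * 100))
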
Example #5
class Sampling:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None

        # RF Feature selected plus sparse cols
        self.cols = [
            'count', 'diff_srv_rate', 'src_bytes', 'dst_host_srv_count',
            'flag', 'dst_bytes', 'serror_rate', 'dst_host_diff_srv_rate',
            'service', 'dst_host_count', 'dst_host_srv_diff_host_rate',
            'logged_in', 'protocol_type', 'dst_host_same_src_port_rate', 'hot',
            'srv_count', 'wrong_fragment', 'num_compromised', 'rerror_rate',
            'srv_diff_host_rate', 'urgent', 'num_failed_logins', 'root_shell',
            'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
            'num_access_files', 'is_guest_login'
        ]

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nSampling and scoring'):
            # Sampling options
            for sampler in (Original(), RandomOverSampler(),
                            SMOTE(random_state=self.random_state),
                            ADASYN(random_state=self.random_state),
                            BorderlineSMOTE(random_state=self.random_state,
                                            kind='borderline-1')):

                self.X = self.full.loc[:, self.cols]
                self.X['target'] = self.full['target']
                print('X shape with selected features and binary target - ',
                      self.X.shape)

                self.X = pd.get_dummies(
                    data=self.X, columns=['protocol_type', 'service', 'flag'])
                print('X shape after encoding categoricals - ', self.X.shape)

                # Re-sample based on attack_category labels
                res_x, res_y_attack_category, title = self.sample(
                    sampler, self.X, self.full['attack_category'])

                res_y_target = res_x[
                    'target']  # Grab target as y from resampled x set
                res_x.drop(columns=['target'], inplace=True)
                print('X shape after sampling and removing target - ',
                      res_x.shape)
                print('y shape with attack_category after resample - ',
                      res_y_attack_category.shape)
                print(res_y_attack_category.value_counts())
                res_y_attack_category.value_counts().plot(
                    kind='bar',
                    title=title + ' - Resampled Count (attack_category)')
                plt.show()
                print('y shape with target after resample - ',
                      res_y_target.shape)

                # Scale after resampling
                qt = QuantileTransformer(output_distribution='normal')
                res_x = qt.fit_transform(res_x)
                print('X shape after scaling - ', res_x.shape)

                # Score on attack_category multi-class
                self.model_and_score(res_x, res_y_attack_category, title,
                                     'attack_category')

                # Score on binary target
                self.model_and_score(res_x, res_y_target, title, 'target')

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_y(self, label):
        self.y = self.full[label]

    def sample(self, sampler, X, y):
        title = sampler.__class__.__name__
        res_x, res_y = sampler.fit_resample(X, y)
        if isinstance(res_x, np.ndarray):
            res_x = pd.DataFrame(res_x, columns=X.columns)

        if isinstance(res_y, np.ndarray):
            res_y = pd.Series(res_y)

        print('Shape after sampling with {} - x {},  y {}'.format(
            title, res_x.shape, res_y.shape))
        return res_x, res_y, title

    def model_and_score(self, X, y, title, label):
        clf = XGBClassifier(n_estimators=50, random_state=self.random_state)
        kfold = StratifiedKFold(n_splits=5,
                                shuffle=True,
                                random_state=self.random_state)
        results = cross_val_score(clf, X, y, cv=kfold)
        y_pred = cross_val_predict(clf, X, y, cv=5)
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}%)'.format(
            title, label,
            results.mean() * 100,
            results.std() * 100))
        self.visualize.confusion_matrix(
            y, y_pred,
            '{} - {} - Label {}'.format(title, clf.__class__.__name__, label))
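
The resampling step itself is standard imbalanced-learn usage. A minimal sketch on synthetic, deliberately imbalanced toy data (not the KDD set) shows the fit_resample call and the class-count change that the bar plots above visualise.

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# 90/10 imbalanced toy problem
X, y = make_classification(n_samples=1000, n_features=10, weights=[0.9, 0.1],
                           random_state=20)
print('Before:', Counter(y))

res_x, res_y = SMOTE(random_state=20).fit_resample(X, y)
print('After: ', Counter(res_y))  # minority class synthesised up to parity
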
Example #6
class Scaling:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None
        self.ac_count = {}
        self.scores = OrderedDict()
        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                           'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login',
                           'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                           'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
                           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate']

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()
        with timer('\nEncoding categoricals'):
            le = preprocessing.LabelEncoder()
            self.full['protocol_type'] = le.fit_transform(self.full['protocol_type'])
            self.full['service'] = le.fit_transform(self.full['service'])
            self.full['flag'] = le.fit_transform(self.full['flag'])
        with timer('\nSetting X'):
            self.set_X()
            self.ds.shape()
        with timer('\nDistribution Before Scaling'):
            self.dist_before_scaling()
        with timer('\nScaling'):
            for scaler in (StandardScaler(),
                           Normalizer(),
                           MinMaxScaler(feature_range=(0, 1)),
                           Binarizer(threshold=0.0),
                           RobustScaler(quantile_range=(25, 75)),
                           PowerTransformer(method='yeo-johnson'),
                           QuantileTransformer(output_distribution='normal')):
                title, res_x = self.scale(scaler)

                label = 'attack_category'
                self.set_y(label)
                self.model_and_score(scaler, res_x, title, label)

                label = 'target'
                self.set_y(label)
                self.model_and_score(scaler, res_x, title, label)

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_attack_category_count(self):
        ac = self.full['attack_category'].value_counts()
        for key, value in ac.items():
            self.ac_count[key] = value

    def set_X(self):
        self.X = self.full.loc[:, self.scale_cols]

    def set_y(self, label):
        self.y = self.full[label]

    def dist_before_scaling(self):
        self.visualize.kdeplot('Distribution Before Scaling', self.X, self.scale_cols)

    def scale(self, scaler):
        x = self.X[self.scale_cols]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res_x = scaler.fit_transform(x)

        res_x = pd.DataFrame(res_x, columns=self.scale_cols)
        title = 'Distribution After ' + scaler.__class__.__name__
        self.visualize.kdeplot(title, res_x, self.scale_cols)
        return title, res_x

    def model_and_score(self, scaler, res_x, title, label):
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=self.random_state)
        results = cross_val_score(clf, res_x, self.y, cv=kfold)
        y_pred = cross_val_predict(clf, res_x, self.y, cv=10)
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}%)'.format(title, label, results.mean() * 100,
                                                                         results.std() * 100))
        self.visualize.confusion_matrix(self.y, y_pred, '{} - {} - Label {}'.format(title, clf.__class__.__name__,
                                                                                    label))
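
A compact way to see why the scalers compared above produce such different kdeplots is to run a few of them on a single skewed toy column. This is a sketch on made-up data; n_quantiles is reduced only so QuantileTransformer fits the tiny sample.

import numpy as np
from sklearn.preprocessing import (StandardScaler, MinMaxScaler,
                                   QuantileTransformer)

x = np.array([[1.0], [2.0], [2.0], [3.0], [50.0]])  # heavy right tail

for scaler in (StandardScaler(),
               MinMaxScaler(feature_range=(0, 1)),
               QuantileTransformer(output_distribution='normal',
                                   n_quantiles=5)):
    print(scaler.__class__.__name__,
          scaler.fit_transform(x).ravel().round(2))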