Beispiel #1
0
 def initalizing(self,
                 ppreds,
                 pprobs,
                 spreds,
                 flows,
                 classes,
                 y_true,
                 initial_threshold=0.9):
     """Store the prediction inputs and reset the per-class high thresholds.

     All arguments are stored on the instance unchanged. Judging by their
     names they are packet predictions/probabilities, session predictions,
     flow index groups, class labels and ground truth -- TODO confirm
     against the caller.

     Args:
         ppreds: Stored as ``self.ppreds``.
         pprobs: Stored as ``self.pprobs``.
         spreds: Stored as ``self.spreds``.
         flows: Stored as ``self.flows``.
         classes: Iterable of classes; one threshold is created per entry.
         y_true: Stored as ``self.y_true``.
         initial_threshold: Starting value given to every class threshold.

     Returns:
         The marker string ``'<Initializing Threshold class>'``.
     """
     self.ppreds = ppreds
     self.pprobs = pprobs
     self.spreds = spreds
     self.flows = flows
     self.classes = classes
     self.y_true = y_true
     self.initial_threshold = initial_threshold

     # Every class starts from the same threshold. A previous randomly
     # jittered initialisation was always overwritten by this assignment,
     # so it has been removed (this also stops consuming global RNG state).
     self.h_threshold = [initial_threshold for _ in self.classes]

     # Number of search steps derived from the distance (in hundredths)
     # between the initial threshold and 1.0, plus one.
     self.step = int(100 - (self.initial_threshold * 100) + 1)

     # Snapshot of the starting thresholds.
     self.init_th = self.h_threshold.copy()
     self.isInit = True
     fprint(self.log, 'Initializing compilte')
     return '<Initializing Threshold class>'
Beispiel #2
0
    def sessionization(self):
        """Build the session dataset from the packet dataset.

        ``cksess.get_flows`` groups ``self.dataset`` into flows; the row
        addressed by the last index of every flow is kept and stored as
        ``self.session``. ``self.isSess`` is set to ``True`` afterwards.

        Returns:
            The marker string ``'<Function: sessionization>'``.
        """
        fprint(self.log, 'Convert packet dataset to session dataset')
        started = timeit.default_timer()

        # Collect the final row index of each flow.
        last_rows = []
        for flow in cksess.get_flows(self.dataset):
            last_rows.append(flow[-1])

        self.session = self.dataset[last_rows]
        self.isSess = True

        elapsed = timeit.default_timer() - started
        fprint(self.log, '---> Done ({:.4f} seconds)'.format(elapsed))

        return '<Function: sessionization>'
Beispiel #3
0
    def read_csv(self, path, encoding=ckc.ISCX_DATASET_ENCODING):
        """Load a CSV file and keep its column names and raw values.

        Args:
            path: Location passed to ``pandas.read_csv``.
            encoding: Text encoding used to decode the file.

        Returns:
            The marker string ``'<Function: read csv>'``. The column names
            are stored in ``self.header`` and the values array in
            ``self.dataset``.
        """
        fprint(self.log, 'Read dataset: {}'.format(path))
        started = timeit.default_timer()

        frame = pd.read_csv(filepath_or_buffer=path, encoding=encoding)
        self.header = frame.columns.tolist()
        self.dataset = frame.values

        elapsed = timeit.default_timer() - started
        fprint(self.log, '---> Done ({:.4f} seconds)'.format(elapsed))

        return '<Function: read csv>'
Beispiel #4
0
 def save(self, path):
     """Write the session dataset to ``path`` as CSV.

     Nothing is written unless ``self.isSess`` is truthy.

     Args:
         path: Destination handed to ``DataFrame.to_csv``.

     Returns:
         ``'<Function: Save session>'`` after writing, or
         ``'ERROR: Not sessionization'`` when no session dataset exists.
     """
     if not self.isSess:
         return 'ERROR: Not sessionization'

     fprint(self.log, 'Writing session dataset at {}'.format(path))
     started = timeit.default_timer()

     session_frame = pd.DataFrame(data=self.session)
     session_frame.to_csv(path,
                          index=False,
                          header=self.header,
                          encoding=ckc.ISCX_DATASET_ENCODING)

     elapsed = timeit.default_timer() - started
     fprint(self.log, '---> Done ({:.4f} seconds)'.format(elapsed))
     return '<Function: Save session>'
Beispiel #5
0
 def initalizing(self, ppreds, pprobs, spreds, sprobs, flows, classes,
                 y_true, delta):
     """Store the prediction inputs and seed the per-class low thresholds.

     Every argument is kept on the instance unchanged. The low threshold
     of class ``c`` starts at ``delta[c][0]``; the step count comes from
     ``self.getStep()``, which is called after all inputs are stored.

     Returns:
         The marker string ``'<Initializing Threshold class>'``.
     """
     self.ppreds, self.pprobs = ppreds, pprobs
     self.spreds, self.sprobs = spreds, sprobs
     self.flows = flows
     self.classes = classes
     self.delta = delta
     self.y_true = y_true

     # First candidate of each class's delta list is the starting point.
     self.l_threshold = [
         delta[class_idx][0] for class_idx, _ in enumerate(self.classes)
     ]
     self.step = self.getStep()

     # Snapshot of the starting thresholds.
     self.init_th = self.l_threshold.copy()
     self.isInit = True
     fprint(self.log, 'Initializing compilte')
     return '<Initializing Threshold class>'
Beispiel #6
0
    def approximate(self):
        """Coarsely tune each class's low threshold in steps of 0.01.

        For every class, the flows are classified ``self.step`` times; after
        each pass the macro F1 score is recorded and that class's low
        threshold is lowered by 0.01. The class threshold is finally set to
        ``init_th - best_step / 100`` where ``best_step`` is the index of
        the highest recorded score.

        A flow is labelled with the first packet prediction whose
        probability reaches ``self.h_threshold``; otherwise with the session
        prediction when its probability reaches that class's low threshold;
        otherwise with the packet prediction of highest probability.

        Returns:
            The marker string ``'<Appoximate function>'``.

        Raises:
            AssertionError: If the instance has not been initialised.
        """
        assert self.isInit, 'Class Threshold is not initialized'

        fprint(self.log, 'Processing finding approximate threshold')

        finished_steps = 0
        # Total step count divided by 100, so that
        # finished_steps / steps_per_percent reads as a percentage.
        steps_per_percent = self.step * len(self.classes) / 100

        started = timeit.default_timer()

        for class_idx, _ in enumerate(self.classes):
            scores = []
            for _ in range(self.step):
                decided = []
                for flow_idx, flow_preds, flow_probs in zip(
                        range(len(self.flows)), self.ppreds, self.pprobs):
                    for pred, prob in zip(flow_preds, flow_probs):
                        if self.h_threshold <= prob:
                            decided.append(pred)
                            break
                    else:
                        # No packet was confident enough: fall back to the
                        # session prediction or the most probable packet.
                        session_pred = self.spreds[flow_idx]
                        if (self.l_threshold[session_pred] <=
                                self.sprobs[flow_idx]):
                            decided.append(session_pred)
                        else:
                            decided.append(flow_preds[np.argmax(flow_probs)])

                scores.append(
                    f1_score(y_true=self.y_true,
                             y_pred=decided,
                             labels=self.classes,
                             average='macro'))

                self.l_threshold[class_idx] = round(
                    self.l_threshold[class_idx] - 0.01, 2)
                finished_steps += 1
                now = timeit.default_timer()
                if self.verbose:
                    print('Processing {:.3f}% ({:.4f} seconds)'.format(
                        finished_steps / steps_per_percent, now - started),
                          end='\r')

            best_step = np.argmax(scores)
            self.l_threshold[class_idx] = round(
                self.init_th[class_idx] - (best_step / 100), 2)
            fprint(
                self.log, 'Max F1: {} --> Now threshold: [{}]{}'.format(
                    best_step, self.h_threshold, self.l_threshold))

        fprint(self.log,
               'Found approximate threshold: {}'.format(self.l_threshold))
        return '<Appoximate function>'
Beispiel #7
0
    def modelling(self):
        """Fit the label encoder and scaler, then train the classifier.

        The label encoder is fitted on the labels (last column) of both the
        training and the test dataset in a single call. The scaler and the
        classifier are fitted on the training features (columns ``1:-1``)
        only.

        Returns:
            The marker string ``'<Function: modelling>'``.
        """
        # Local import so this method does not rely on a module-level alias.
        import numpy as np

        fprint(self.log, 'Training label encoder and scaler')
        ts = timeit.default_timer()

        train_labels = self.train_dataset[:, -1]
        test_labels = self.test_dataset[:, -1]
        # Fit once on the union of both splits. Calling fit() twice keeps
        # only the classes of the second call (assuming ``self.le`` is a
        # scikit-learn LabelEncoder -- TODO confirm), which would make
        # transform() fail for training labels absent from the test split.
        self.le.fit(np.concatenate([train_labels, test_labels]))

        train_features = self.train_dataset[:, 1:-1]
        self.scaler.fit(train_features)
        te = timeit.default_timer()
        fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(te - ts))

        fprint(self.log, 'Training model')
        ts = timeit.default_timer()

        self.sclf.fit(X=self.scaler.transform(train_features),
                      y=self.le.transform(train_labels))

        gc.collect()
        te = timeit.default_timer()
        fprint(self.log, '---> Done ({:.4f} seconds)'.format(te - ts))

        return '<Function: modelling>'
Beispiel #8
0
    def predict(self):
        """Predict labels for the training and test datasets.

        Features are columns ``1:-1`` of each dataset, scaled with
        ``self.scaler`` before being passed to ``self.sclf``. Results are
        stored in ``self.spreds_train`` and ``self.spreds_test``; the
        elapsed time of each prediction is logged.

        Returns:
            The marker string ``'<Function: predict>'``.
        """
        # Overall start time; not reported by this method.
        pred_ts = timeit.default_timer()

        fprint(self.log, 'Predict session training dataset')
        started = timeit.default_timer()
        train_features = self.scaler.transform(self.train_dataset[:, 1:-1])
        self.spreds_train = self.sclf.predict(train_features)
        elapsed = timeit.default_timer() - started
        fprint(
            self.log,
            'Session training dataset predict time: {} seconds'.format(
                elapsed))

        fprint(self.log, 'Predict session test dataset')
        started = timeit.default_timer()
        test_features = self.scaler.transform(self.test_dataset[:, 1:-1])
        self.spreds_test = self.sclf.predict(test_features)
        elapsed = timeit.default_timer() - started
        fprint(self.log,
               'Session test dataset predict time: {} seconds'.format(elapsed))

        return '<Function: predict>'
Beispiel #9
0
    def read_csv(self, path, encoding=ckc.ISCX_DATASET_ENCODING):
        """Load a CSV, shuffle it flow by flow and split it into train/test.

        Steps, in order:
          1. Read the file and drop rows whose last column equals any entry
             of ``self.skip_datas``.
          2. Group rows into flows and derive ``self.train_size`` from the
             flow count and ``self.split_ratio``.
          3. Shuffle with ``cksess.shuffle_flow`` (seeded by ``self.seed``)
             and regroup the flows.
          4. Build the session rows (last index of each flow) and the
             train/test session, packet and flow attributes.

        Args:
            path: Location passed to ``pandas.read_csv``.
            encoding: Text encoding used to decode the file.

        Returns:
            The marker string ``'<Function: read & shuffling csv>'``.
        """
        fprint(self.log, 'Reading dataset: {}'.format(path))
        started = timeit.default_timer()
        raw = pd.read_csv(filepath_or_buffer=path, encoding=encoding)
        self.dataset = raw.values

        fprint(self.log, 'Skip data: {}'.format(self.skip_datas))
        for label in self.skip_datas:
            keep_mask = self.dataset[:, -1] != label
            self.dataset = self.dataset[keep_mask]

        self.flows = cksess.get_flows(dataset=self.dataset)
        self.train_size = int(len(self.flows) * self.split_ratio)
        elapsed = timeit.default_timer() - started
        fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(elapsed))

        fprint(self.log, 'Shuffling dataset by flows')
        started = timeit.default_timer()
        self.dataset, _unused = cksess.shuffle_flow(dataset=self.dataset,
                                                    flows=self.flows,
                                                    random_state=self.seed)
        # Row positions changed, so the flow grouping is recomputed.
        self.flows = cksess.get_flows(dataset=self.dataset)
        elapsed = timeit.default_timer() - started
        fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(elapsed))

        fprint(self.log, 'Creating training & test dataset')
        started = timeit.default_timer()
        split = self.train_size

        last_rows = [flow[-1] for flow in self.flows]
        self.session = self.dataset[last_rows]
        self.train_session = self.session[:split]
        self.test_session = self.session[split:]

        train_rows = cksess.flatten(self.flows[:split])
        self.train_dataset = self.dataset[train_rows]
        test_rows = cksess.flatten(self.flows[split:])
        self.test_dataset = self.dataset[test_rows]

        self.train_flows = cksess.get_flows(dataset=self.train_dataset)
        self.test_flows = cksess.get_flows(dataset=self.test_dataset)
        gc.collect()
        elapsed = timeit.default_timer() - started
        fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(elapsed))

        return '<Function: read & shuffling csv>'
Beispiel #10
0
    def read_csv(self, path, encoding=ckc.ISCX_DATASET_ENCODING):
        """Load a CSV, shuffle its rows and split it into train/test.

        Rows whose last column equals any entry of ``self.skip_datas`` are
        dropped first. ``self.train_size`` is the remaining row count times
        ``self.split_ratio`` (truncated). Rows are shuffled in place with
        ``np.random.shuffle`` and then sliced into ``self.train_dataset``
        and ``self.test_dataset``.

        Args:
            path: Location passed to ``pandas.read_csv``.
            encoding: Text encoding used to decode the file.

        Returns:
            The marker string ``'<Function: read & shuffling csv>'``.
        """
        fprint(self.log, 'Reading dataset: {}'.format(path))
        started = timeit.default_timer()
        rows = pd.read_csv(filepath_or_buffer=path, encoding=encoding).values

        fprint(self.log, 'Skip data: {}'.format(self.skip_datas))
        for label in self.skip_datas:
            keep_mask = rows[:, -1] != label
            rows = rows[keep_mask]

        self.train_size = int(len(rows) * self.split_ratio)
        elapsed = timeit.default_timer() - started
        fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(elapsed))

        fprint(self.log, 'Shuffling dataset by flows')
        started = timeit.default_timer()
        np.random.shuffle(rows)
        elapsed = timeit.default_timer() - started
        fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(elapsed))

        fprint(self.log, 'Creating training & test dataset')
        started = timeit.default_timer()
        split = self.train_size
        self.train_dataset = rows[:split]
        self.test_dataset = rows[split:]
        gc.collect()
        elapsed = timeit.default_timer() - started
        fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(elapsed))

        return '<Function: read & shuffling csv>'
Beispiel #11
0
    def predict(self):
        """Run the session and packet classifiers on both splits.

        Session features are columns ``1:-1`` of the session arrays (scaled
        with ``self.sscaler``); packet features are columns ``2:-1`` of the
        packet datasets (scaled with ``self.pscaler``).

        Attributes written:
            spreds_train, spreds_test: Session predictions.
            sprobs_train_all, sprobs_test_all: Session class probabilities.
            sprobs_train, sprobs_test: Row-wise maximum of the above.
            ppreds_train, ppreds_test: Packet predictions regrouped into one
                entry per flow (indexed by ``train_flows`` / ``test_flows``).
            pprobs_train_all, pprobs_test_all: Packet class probabilities,
                regrouped per flow.
            pprobs_train, pprobs_test: Row-wise maximum packet probability,
                regrouped per flow.
            pkt_train_ptime_mean, pkt_test_ptime_mean: Timed packet
                prediction duration divided by the number of packets.

        Returns:
            The marker string ``'<Function: predict>'``.
        """
        pred_ts = timeit.default_timer()

        # ---- Session classifier -------------------------------------
        fprint(self.log, 'Predict session training dataset')
        ts = timeit.default_timer()
        train_session_x = self.sscaler.transform(self.train_session[:, 1:-1])
        self.spreds_train = self.sclf.predict(train_session_x)
        te = timeit.default_timer()
        fprint(
            self.log,
            'Session training dataset predict time: {} seconds'.format(te -
                                                                       ts))

        fprint(self.log, 'Predict session test dataset')
        ts = timeit.default_timer()
        # Test predictions must come from the test sessions; this used to
        # read the training sessions by mistake.
        test_session_x = self.sscaler.transform(self.test_session[:, 1:-1])
        self.spreds_test = self.sclf.predict(test_session_x)
        te = timeit.default_timer()
        fprint(self.log,
               'Session test dataset predict time: {} seconds'.format(te - ts))

        # Probabilities reuse the already scaled features.
        self.sprobs_train_all = self.sclf.predict_proba(train_session_x)
        self.sprobs_train = np.max(self.sprobs_train_all, axis=1)
        self.sprobs_test_all = self.sclf.predict_proba(test_session_x)
        self.sprobs_test = np.max(self.sprobs_test_all, axis=1)
        del train_session_x, test_session_x

        # ---- Packet classifier, training split ----------------------
        fprint(self.log, 'Predict packet training dataset')
        ts = timeit.default_timer()
        train_packet_x = self.pscaler.transform(self.train_dataset[:, 2:-1])
        self.ppreds_train = self.pclf.predict(train_packet_x)
        te = timeit.default_timer()
        # Mean time per packet; taken before regrouping so that len() is
        # still the packet count.
        self.pkt_train_ptime_mean = (te - ts) / len(self.ppreds_train)
        self.pprobs_train_all = self.pclf.predict_proba(train_packet_x)
        del train_packet_x
        self.pprobs_train = np.max(self.pprobs_train_all, axis=1)

        # Regroup the flat per-packet arrays into one entry per flow.
        self.ppreds_train = [
            self.ppreds_train[flow] for flow in self.train_flows
        ]
        self.pprobs_train_all = [
            self.pprobs_train_all[flow] for flow in self.train_flows
        ]
        self.pprobs_train = [
            self.pprobs_train[flow] for flow in self.train_flows
        ]
        fprint(
            self.log,
            'Packet training dataset predict time: {} seconds'.format(te - ts))

        # ---- Packet classifier, test split --------------------------
        fprint(self.log, 'Predict packet test dataset')
        ts = timeit.default_timer()
        test_packet_x = self.pscaler.transform(self.test_dataset[:, 2:-1])
        self.ppreds_test = self.pclf.predict(test_packet_x)
        te = timeit.default_timer()
        self.pkt_test_ptime_mean = (te - ts) / len(self.ppreds_test)
        self.pprobs_test_all = self.pclf.predict_proba(test_packet_x)
        del test_packet_x
        self.pprobs_test = np.max(self.pprobs_test_all, axis=1)

        self.ppreds_test = [self.ppreds_test[flow] for flow in self.test_flows]
        self.pprobs_test_all = [
            self.pprobs_test_all[flow] for flow in self.test_flows
        ]
        self.pprobs_test = [self.pprobs_test[flow] for flow in self.test_flows]
        fprint(self.log,
               'Packet test dataset predict time: {} seconds'.format(te - ts))

        pred_te = timeit.default_timer()
        fprint(
            self.log,
            'Processing of predict part is finished ({} seconds)'.format(
                pred_te - pred_ts))

        return '<Function: predict>'
Beispiel #12
0
    def gradient(self, times=1, limit=-1):
        """Greedily advance the per-class low thresholds along ``self.delta``.

        Each iteration scores the current thresholds (macro F1 over all
        flows) and then scores, for every class in turn, the effect of
        moving only that class to the next entry of ``self.delta[class]``.
        The best single move is accepted as long as it does not lower the
        score; ties between equally good moves are broken at random. The
        search ends when the best move lowers the score, when no class can
        move any further, or when more than ``limit`` iterations have run.

        When the search is cut off by ``limit``, ``self.l_threshold`` is
        reset to the first thresholds recorded since the score last
        strictly improved.

        Args:
            times: Only echoed in the start-up log line.
            limit: Maximum number of iterations; a negative value means no
                limit.

        Returns:
            The marker string ``'<Gradient function>'``.

        Raises:
            AssertionError: If the instance has not been initialised.
        """
        assert self.isInit, 'Class Threshold is not initialized'

        def macro_f1():
            """Classify every flow with the current thresholds; score it."""
            classified = []
            for flow_idx, fpreds, fprobs in zip(range(len(self.flows)),
                                                self.ppreds, self.pprobs):
                for pred, prob in zip(fpreds, fprobs):
                    # First packet confident enough decides the flow.
                    if self.h_threshold <= prob:
                        classified.append(pred)
                        break
                else:
                    session_pred = self.spreds[flow_idx]
                    if (self.l_threshold[session_pred] <=
                            self.sprobs[flow_idx]):
                        classified.append(session_pred)
                    else:
                        # Fall back to the most probable packet prediction.
                        classified.append(fpreds[np.argmax(fprobs)])
            return f1_score(y_true=self.y_true,
                            y_pred=np.array(classified),
                            labels=self.classes,
                            average='macro')

        summary_fmt = ('[{:3d}][Base: {:.6f}] [Max: {:.6f}] [diff: {:.6f}] '
                       '[class: {:2d}] [delta: {:.3f}] ({:.4f} sec)')
        class_count = len(self.classes)

        # Start every class at the last delta entry that is still >= its
        # current threshold.
        # NOTE(review): last_idx is not reset between classes, so a class
        # without any matching entry inherits the previous class's index --
        # confirm whether that is intended.
        start_point = []
        last_idx = None
        for c in range(class_count):
            for i, k in enumerate(self.delta[c]):
                if k >= self.l_threshold[c]:
                    last_idx = i
            start_point.append(last_idx)

        search_count = 0
        solstice = False
        fprint(
            self.log, 'Start threshold: [{}]{}'.format(self.h_threshold,
                                                       self.l_threshold))
        if limit < 0:
            fprint(self.log, 'Find infinity')
        else:
            fprint(self.log, 'Find limit: {}'.format(limit))
        fprint(self.log, 'Print threshold every {} times'.format(times))
        d_position = list(start_point)
        max_point = [len(self.delta[c]) - 1 for c in range(class_count)]

        # Details of the most recently selected move (None until one exists).
        best_class = None
        best_f1 = None
        diff = None

        ts = timeit.default_timer()
        prev_base = [self.l_threshold.copy()]
        while not solstice:
            search_count += 1

            # Score of the current position.
            base_f1 = macro_f1()

            # Score of moving each class, one at a time, to its next delta.
            f1_scores = []
            for class_idx in range(class_count):
                if d_position[class_idx] + 1 > max_point[class_idx]:
                    continue
                self.l_threshold[class_idx] = round(
                    self.delta[class_idx][d_position[class_idx] + 1], 3)
                f1_scores.append([class_idx, macro_f1()])
                # Undo the trial move.
                self.l_threshold[class_idx] = round(
                    self.delta[class_idx][d_position[class_idx]], 3)

            te = timeit.default_timer()

            if not f1_scores:
                # Every class already sits on its last delta entry, so
                # there is nothing left to try.
                solstice = True
                best_class = None
                prev_base = [self.l_threshold.copy()]
                fprint(
                    self.log,
                    'Total process count: {} ({:.4f} seconds)'.format(
                        search_count, te - ts))
            else:
                top_score = max(score for _, score in f1_scores)
                ties = [entry for entry in f1_scores if entry[1] == top_score]
                pick = np.random.choice(np.arange(len(ties)), 1)[0]
                best_class, best_f1 = ties[pick]
                # Computed up front so the final summary can always use it.
                diff = best_f1 - base_f1

                if best_f1 < base_f1:
                    solstice = True
                    prev_base = [self.l_threshold.copy()]
                    fprint(
                        self.log,
                        'Total process count: {} ({:.4f} seconds)'.format(
                            search_count, te - ts))
                else:
                    d_position[best_class] += 1
                    # Update the threshold of the class that actually won,
                    # rounded like the trial value that was scored.
                    self.l_threshold[best_class] = round(
                        self.delta[best_class][d_position[best_class]], 3)

                    if best_f1 > base_f1:
                        # Strict improvement: forget the old plateau.
                        prev_base = []

                    prev_base.append(self.l_threshold.copy())

                    if self.verbose:
                        print(summary_fmt.format(
                            search_count, base_f1, best_f1, diff, best_class,
                            self.delta[best_class][d_position[best_class]],
                            te - ts),
                              end='\r')

            if limit > -1 and search_count > limit:
                fprint(
                    self.log,
                    'Process count is over than {} --> Stop process ({:.4f} seconds)'
                    .format(limit, te - ts))
                break

        print('')
        if solstice:
            if best_class is not None:
                print(summary_fmt.format(
                    search_count, base_f1, best_f1, diff, best_class,
                    self.delta[best_class][d_position[best_class]], te - ts))
            fprint(
                self.log,
                'Found threshold: [{}]{}'.format(self.h_threshold,
                                                 self.l_threshold))
        else:
            last_th = self.l_threshold.copy()
            self.l_threshold = prev_base[0]
            fprint(self.log,
                   'Last threhsold will be set the same f1-score threhsold')
            fprint(self.log, '{} ---> {}'.format(last_th, self.l_threshold))

        return '<Gradient function>'
Beispiel #13
0
from cklib.ckstd import fprint
from cklib import ckstd
from cklib import DataFrame
import joblib
import gc

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

seed = 22
dataset_path = './bin/iscx2017session.csv'
clf_init = ['rf', 'dt', 'et', 'adt', 'arf', 'gbt']

if __name__ == "__main__":
    for clf in clf_init:
        fprint(None, '{} classifier using'.format(clf))
        dataframe = DataFrame.Session_Dataset(clf=clf, random_state=seed)
        dataframe.skip_data('Heartbleed', 'Infiltration',
                            u'Web Attack \x96 XSS',
                            u'Web Attack \x96 Sql Injection')
        dataframe.read_csv(path=dataset_path)
        dataframe.modelling()
        dataframe.predict()

        label_encoder = dataframe.getLabelEncoder()
        train_pred = dataframe.getTrainPredict()
        test_pred = dataframe.getTestPredict()
        train_true = label_encoder.transform(dataframe.getTrainLabel())
        test_true = label_encoder.transform(dataframe.getTestLabel())

        train_report = classification_report(
Beispiel #14
0
    def gradient(self, delta_step=0.01, times=1, limit=10):
        """Hill-climb the per-class low thresholds with a fixed step size.

        Each iteration scores the current thresholds (macro F1 over all
        flows) and then scores every single-class move of ``-delta_step``
        and ``+delta_step``. A move is scored as 0 without evaluation when
        it leaves the range [0.5, 1] or lands on thresholds already
        accepted earlier. The best move (first one on ties) is applied
        unless it scores below the current position, in which case the
        search is finished. The loop also stops once more than ``limit``
        iterations have run.

        Args:
            delta_step: Amount added to or subtracted from one threshold
                per trial move.
            times: The current thresholds are logged every ``times``
                iterations.
            limit: Maximum number of iterations.

        Returns:
            The marker string ``'<Gradient function>'``.

        Raises:
            AssertionError: If the instance has not been initialised.
        """
        # Truthiness check: the previous ``!= None`` comparison also let an
        # ``isInit`` of False through.
        assert self.isInit, 'Class Threshold is not initialized'

        lower_bound, upper_bound = 0.5, 1

        def macro_f1():
            """Classify every flow with the current thresholds; score it."""
            classified = []
            for flow_idx, fpreds, fprobs in zip(range(len(self.flows)),
                                                self.ppreds, self.pprobs):
                for pred, prob in zip(fpreds, fprobs):
                    # First packet confident enough decides the flow.
                    if self.h_threshold <= prob:
                        classified.append(pred)
                        break
                else:
                    session_pred = self.spreds[flow_idx]
                    if (self.l_threshold[session_pred] <=
                            self.sprobs[flow_idx]):
                        classified.append(session_pred)
                    else:
                        # Fall back to the most probable packet prediction.
                        classified.append(fpreds[np.argmax(fprobs)])
            return f1_score(y_true=self.y_true,
                            y_pred=np.array(classified),
                            labels=self.classes,
                            average='macro')

        search_count = 0
        solstice = False
        fprint(
            self.log, 'Start threshold: [{}]{}'.format(self.h_threshold,
                                                       self.l_threshold))
        fprint(self.log, 'Print threshold every {} times'.format(times))

        ts = timeit.default_timer()
        # Threshold vectors accepted so far; never revisited.
        prev_base = [self.l_threshold.copy()]
        while not solstice:
            search_count += 1

            # Score of the current position.
            base_f1 = macro_f1()

            # Score of every single-class move: [class index, shift, F1].
            f1_scores = []
            for ci in range(len(self.classes)):
                for shift in (-delta_step, delta_step):
                    self.l_threshold[ci] = round(self.l_threshold[ci] + shift,
                                                 3)
                    moved = self.l_threshold[ci]

                    if (moved > upper_bound or moved < lower_bound
                            or self.l_threshold in prev_base):
                        # Out of range or already visited: never preferred.
                        score = 0.
                    else:
                        score = macro_f1()
                    f1_scores.append([ci, shift, score])

                    # Undo the trial move.
                    self.l_threshold[ci] = round(self.l_threshold[ci] - shift,
                                                 3)

            # First entry with the highest F1.
            best_class, best_shift, best_f1 = max(f1_scores,
                                                  key=lambda entry: entry[2])
            te = timeit.default_timer()

            if best_f1 < base_f1:
                solstice = True
                fprint(
                    self.log,
                    'Total process count: {} ({:.4f} seconds)'.format(
                        search_count, te - ts))
            else:
                diff = best_f1 - base_f1

                # Apply the move, clamped to the allowed range.
                moved = self.l_threshold[best_class] + best_shift
                if moved > 1.:
                    moved = 1.
                if moved < 0.5:
                    moved = 0.5
                self.l_threshold[best_class] = round(moved, 3)

                prev_base.append(self.l_threshold.copy())

                if self.verbose:
                    print('[Base: {}] [Max: {}] [difference: {}] [delta: {}]'.
                          format(base_f1, best_f1, diff, best_shift))

            if search_count % times == 0:
                fprint(
                    self.log, '{} --> {} ({:.4f} seconds)'.format(
                        search_count, self.l_threshold, te - ts))

            if search_count > limit:
                fprint(
                    self.log,
                    'Process count is over than {} --> Stop process ({:.4f} seconds)'
                    .format(limit, te - ts))
                break

        print('')
        if solstice:
            fprint(
                self.log,
                'Found threshold: [{}]{}'.format(self.h_threshold,
                                                 self.l_threshold))
        else:
            fprint(
                self.log, 'Process stoped threshold: [{}]{}'.format(
                    self.h_threshold, self.l_threshold))

        return '<Gradient function>'