def initializing(self, ppreds, pprobs, spreds, flows, classes, y_true,
                 initial_threshold=0.9):
    self.ppreds = ppreds
    self.pprobs = pprobs
    self.spreds = spreds
    self.flows = flows
    self.classes = classes
    self.y_true = y_true
    self.initial_threshold = initial_threshold
    # Every class starts from the same high threshold.
    self.h_threshold = [initial_threshold for c in self.classes]
    # Number of 0.01 scan steps between initial_threshold and 1.0 (inclusive).
    self.step = int(100 - (self.initial_threshold * 100) + 1)
    self.init_th = self.h_threshold.copy()
    self.isInit = True
    fprint(self.log, 'Initializing complete')
    return '<Initializing Threshold class>'
def sessionization(self):
    fprint(self.log, 'Convert packet dataset to session dataset')
    ts = timeit.default_timer()
    flows = cksess.get_flows(self.dataset)
    # A session row is represented by the last packet of each flow.
    self.session = self.dataset[[flow[-1] for flow in flows]]
    self.isSess = True
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)'.format(te - ts))
    return '<Function: sessionization>'
def read_csv(self, path, encoding=ckc.ISCX_DATASET_ENCODING):
    fprint(self.log, 'Read dataset: {}'.format(path))
    ts = timeit.default_timer()
    self.dataset = pd.read_csv(filepath_or_buffer=path, encoding=encoding)
    self.header = self.dataset.columns.tolist()
    self.dataset = self.dataset.values
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)'.format(te - ts))
    return '<Function: read csv>'
def save(self, path):
    if self.isSess:
        fprint(self.log, 'Writing session dataset at {}'.format(path))
        ts = timeit.default_timer()
        pd.DataFrame(data=self.session).to_csv(
            path,
            index=False,
            header=self.header,
            encoding=ckc.ISCX_DATASET_ENCODING)
        te = timeit.default_timer()
        fprint(self.log, '---> Done ({:.4f} seconds)'.format(te - ts))
    else:
        return 'ERROR: dataset has not been sessionized'
    return '<Function: Save session>'
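# Hedged usage sketch (not from the source): how the three methods above might
# be chained. `SessionDataset` is a hypothetical class name -- the actual class
# definition is not shown in this file -- and the paths are illustrative.
def build_session_csv(src_path, dst_path):
    ds = SessionDataset()               # hypothetical constructor
    ds.read_csv(path=src_path)          # load the packet-level CSV
    ds.sessionization()                 # keep the last packet of each flow
    return ds.save(path=dst_path)       # write the session-level CSV

# build_session_csv('./bin/iscx2017.csv', './bin/iscx2017session.csv')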
def initializing(self, ppreds, pprobs, spreds, sprobs, flows, classes,
                 y_true, delta):
    self.ppreds = ppreds
    self.pprobs = pprobs
    self.spreds = spreds
    self.sprobs = sprobs
    self.flows = flows
    self.classes = classes
    self.delta = delta
    self.y_true = y_true
    # Start each class at its smallest candidate threshold.
    self.l_threshold = [delta[c][0] for c in range(len(self.classes))]
    self.step = self.getStep()
    self.init_th = self.l_threshold.copy()
    self.isInit = True
    fprint(self.log, 'Initializing complete')
    return '<Initializing Threshold class>'
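# Hedged example (assumption, not from the source): `delta` is taken here to
# be a per-class ascending grid of candidate low thresholds, so initializing()
# starts each class at its smallest candidate.
import numpy as np

delta = [np.round(np.arange(0.5, 1.001, 0.01), 3).tolist() for _ in range(2)]
l_threshold = [delta[c][0] for c in range(len(delta))]
print(l_threshold)  # [0.5, 0.5]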
def approximate(self):
    assert self.isInit, 'Class Threshold is not initialized'
    fprint(self.log, 'Processing finding approximate threshold')
    percentage = 0
    timer_dividing = self.step * len(self.classes) / 100
    ts = timeit.default_timer()
    for ci, cs in enumerate(self.classes):
        f1_scores = []
        for h_step in range(self.step):
            classified = []
            for flow_idx, fpreds, fprobs in zip(range(len(self.flows)),
                                                self.ppreds, self.pprobs):
                found = False
                for pkt_idx, pred, prob in zip(range(len(fpreds)), fpreds,
                                               fprobs):
                    if self.h_threshold <= prob:
                        classified.append(pred)
                        found = True
                        break
                if not found:
                    if (self.l_threshold[self.spreds[flow_idx]] <=
                            self.sprobs[flow_idx]):
                        classified.append(self.spreds[flow_idx])
                    else:
                        max_prob_idx = np.argmax(fprobs)
                        classified.append(fpreds[max_prob_idx])
            f1_scores.append(
                f1_score(y_true=self.y_true,
                         y_pred=classified,
                         labels=self.classes,
                         average='macro'))
            self.l_threshold[ci] -= 0.01
            self.l_threshold[ci] = round(self.l_threshold[ci], 2)
            percentage += 1
            te = timeit.default_timer()
            if self.verbose:
                print('Processing {:.3f}% ({:.4f} seconds)'.format(
                    percentage / timer_dividing, te - ts),
                      end='\r')
        # Rewind this class to the 0.01 step that maximized macro F1.
        self.l_threshold[ci] = round(
            self.init_th[ci] - (np.argmax(f1_scores) / 100), 2)
        fprint(
            self.log, 'Best step: {} --> Now threshold: [{}]{}'.format(
                np.argmax(f1_scores), self.h_threshold, self.l_threshold))
    fprint(self.log,
           'Found approximate threshold: {}'.format(self.l_threshold))
    return '<Approximate function>'
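# Hedged helper sketch (illustration only): the flow-classification rule that
# approximate() and gradient() both inline. Given per-packet predictions and
# confidences for one flow plus the session-level fallback, it returns the
# label the cascade would assign. All names here are local to this sketch.
import numpy as np

def classify_flow(fpreds, fprobs, spred, sprob, h_threshold, l_threshold):
    # 1) The first packet whose confidence reaches the high threshold wins.
    for pred, prob in zip(fpreds, fprobs):
        if h_threshold <= prob:
            return pred
    # 2) Otherwise fall back to the session prediction if it clears the
    #    per-class low threshold.
    if l_threshold[spred] <= sprob:
        return spred
    # 3) Last resort: the most confident packet-level prediction.
    return fpreds[int(np.argmax(fprobs))]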
def modelling(self):
    fprint(self.log, 'Training label encoder and scaler')
    ts = timeit.default_timer()
    # Fit the encoder on the union of train and test labels; two separate
    # fit() calls would keep only the second label vocabulary.
    self.le.fit(
        np.concatenate([self.train_dataset[:, -1], self.test_dataset[:, -1]]))
    self.scaler.fit(self.train_dataset[:, 1:-1])
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(te - ts))
    fprint(self.log, 'Training model')
    ts = timeit.default_timer()
    self.sclf.fit(X=self.scaler.transform(self.train_dataset[:, 1:-1]),
                  y=self.le.transform(self.train_dataset[:, -1]))
    gc.collect()
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)'.format(te - ts))
    return '<Function: modelling>'
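# Hedged example (illustration only): why the encoder is fit on the union of
# train and test labels above. Calling fit() twice keeps only the second
# vocabulary, so labels seen only in training would raise at transform() time.
import numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(np.concatenate([np.array(['BENIGN', 'DoS']),
                       np.array(['BENIGN', 'Bot'])]))
print(le.classes_)  # ['BENIGN' 'Bot' 'DoS'] -- every label survives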
def predict(self):
    fprint(self.log, 'Predict session training dataset')
    ts = timeit.default_timer()
    self.spreds_train = self.sclf.predict(
        self.scaler.transform(self.train_dataset[:, 1:-1]))
    te = timeit.default_timer()
    fprint(
        self.log,
        'Session training dataset predict time: {} seconds'.format(te - ts))
    fprint(self.log, 'Predict session test dataset')
    ts = timeit.default_timer()
    self.spreds_test = self.sclf.predict(
        self.scaler.transform(self.test_dataset[:, 1:-1]))
    te = timeit.default_timer()
    fprint(self.log,
           'Session test dataset predict time: {} seconds'.format(te - ts))
    return '<Function: predict>'
def read_csv(self, path, encoding=ckc.ISCX_DATASET_ENCODING):
    fprint(self.log, 'Reading dataset: {}'.format(path))
    ts = timeit.default_timer()
    self.dataset = pd.read_csv(filepath_or_buffer=path,
                               encoding=encoding).values
    fprint(self.log, 'Skip data: {}'.format(self.skip_datas))
    for word in self.skip_datas:
        self.dataset = self.dataset[self.dataset[:, -1] != word]
    self.flows = cksess.get_flows(dataset=self.dataset)
    self.train_size = int(len(self.flows) * self.split_ratio)
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(te - ts))
    fprint(self.log, 'Shuffling dataset by flows')
    ts = timeit.default_timer()
    self.dataset, _ = cksess.shuffle_flow(dataset=self.dataset,
                                          flows=self.flows,
                                          random_state=self.seed)
    self.flows = cksess.get_flows(dataset=self.dataset)
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(te - ts))
    fprint(self.log, 'Creating training & test dataset')
    ts = timeit.default_timer()
    # Session rows: the last packet of each flow.
    self.session = self.dataset[[flow[-1] for flow in self.flows]]
    self.train_session = self.session[:self.train_size]
    self.test_session = self.session[self.train_size:]
    self.train_dataset = self.dataset[cksess.flatten(
        self.flows[:self.train_size])]
    self.test_dataset = self.dataset[cksess.flatten(
        self.flows[self.train_size:])]
    self.train_flows = cksess.get_flows(dataset=self.train_dataset)
    self.test_flows = cksess.get_flows(dataset=self.test_dataset)
    gc.collect()
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(te - ts))
    return '<Function: read & shuffling csv>'
def read_csv(self, path, encoding=ckc.ISCX_DATASET_ENCODING):
    fprint(self.log, 'Reading dataset: {}'.format(path))
    ts = timeit.default_timer()
    dataset = pd.read_csv(filepath_or_buffer=path, encoding=encoding).values
    fprint(self.log, 'Skip data: {}'.format(self.skip_datas))
    for word in self.skip_datas:
        dataset = dataset[dataset[:, -1] != word]
    self.train_size = int(len(dataset) * self.split_ratio)
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(te - ts))
    # Row-wise shuffle; unlike the variant above, this does not preserve
    # flow grouping.
    fprint(self.log, 'Shuffling dataset')
    ts = timeit.default_timer()
    np.random.shuffle(dataset)
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(te - ts))
    fprint(self.log, 'Creating training & test dataset')
    ts = timeit.default_timer()
    self.train_dataset = dataset[:self.train_size]
    self.test_dataset = dataset[self.train_size:]
    gc.collect()
    te = timeit.default_timer()
    fprint(self.log, '---> Done ({:.4f} seconds)\n'.format(te - ts))
    return '<Function: read & shuffling csv>'
def predict(self):
    pred_ts = timeit.default_timer()
    fprint(self.log, 'Predict session training dataset')
    ts = timeit.default_timer()
    self.spreds_train = self.sclf.predict(
        self.sscaler.transform(self.train_session[:, 1:-1]))
    te = timeit.default_timer()
    fprint(
        self.log,
        'Session training dataset predict time: {} seconds'.format(te - ts))
    fprint(self.log, 'Predict session test dataset')
    ts = timeit.default_timer()
    # Bug fix: predict on the test sessions (the original reused the
    # training sessions here).
    self.spreds_test = self.sclf.predict(
        self.sscaler.transform(self.test_session[:, 1:-1]))
    te = timeit.default_timer()
    fprint(self.log,
           'Session test dataset predict time: {} seconds'.format(te - ts))
    self.sprobs_train_all = self.sclf.predict_proba(
        self.sscaler.transform(self.train_session[:, 1:-1]))
    self.sprobs_train = np.max(self.sprobs_train_all, axis=1)
    self.sprobs_test_all = self.sclf.predict_proba(
        self.sscaler.transform(self.test_session[:, 1:-1]))
    self.sprobs_test = np.max(self.sprobs_test_all, axis=1)
    fprint(self.log, 'Predict packet training dataset')
    ts = timeit.default_timer()
    self.ppreds_train = self.pclf.predict(
        self.pscaler.transform(self.train_dataset[:, 2:-1]))
    te = timeit.default_timer()
    self.pkt_train_ptime_mean = (te - ts) / len(self.ppreds_train)
    # Regroup packet-level outputs per flow.
    self.ppreds_train = [
        self.ppreds_train[flow] for flow in self.train_flows
    ]
    self.pprobs_train_all = self.pclf.predict_proba(
        self.pscaler.transform(self.train_dataset[:, 2:-1]))
    self.pprobs_train = np.max(self.pprobs_train_all, axis=1)
    self.pprobs_train_all = [
        self.pprobs_train_all[flow] for flow in self.train_flows
    ]
    self.pprobs_train = [
        self.pprobs_train[flow] for flow in self.train_flows
    ]
    fprint(
        self.log,
        'Packet training dataset predict time: {} seconds'.format(te - ts))
    fprint(self.log, 'Predict packet test dataset')
    ts = timeit.default_timer()
    self.ppreds_test = self.pclf.predict(
        self.pscaler.transform(self.test_dataset[:, 2:-1]))
    te = timeit.default_timer()
    packet_test_pred_time = te - ts
    self.pkt_test_ptime_mean = packet_test_pred_time / len(self.ppreds_test)
    self.ppreds_test = [self.ppreds_test[flow] for flow in self.test_flows]
    self.pprobs_test_all = self.pclf.predict_proba(
        self.pscaler.transform(self.test_dataset[:, 2:-1]))
    self.pprobs_test = np.max(self.pprobs_test_all, axis=1)
    self.pprobs_test_all = [
        self.pprobs_test_all[flow] for flow in self.test_flows
    ]
    self.pprobs_test = [self.pprobs_test[flow] for flow in self.test_flows]
    fprint(self.log,
           'Packet test dataset predict time: {} seconds'.format(te - ts))
    pred_te = timeit.default_timer()
    fprint(
        self.log,
        'Processing of predict part is finished ({} seconds)'.format(
            pred_te - pred_ts))
    return '<Function: predict>'
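# Hedged example (illustration only): the `[x[flow] for flow in flows]`
# pattern used above. Each flow is a list of row indices, so fancy-indexing a
# per-packet array with it regroups packet-level outputs per flow.
import numpy as np

preds = np.array([0, 1, 1, 2, 0])
flows = [[0, 1], [2, 3, 4]]
per_flow = [preds[flow] for flow in flows]
print(per_flow)  # [array([0, 1]), array([1, 2, 0])]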
def gradient(self, times=1, limit=-1):
    assert self.isInit, 'Class Threshold is not initialized'
    # Locate each class's starting position on its delta grid.
    start_point = []
    last_idx = None
    for c in range(len(self.classes)):
        for i, k in enumerate(self.delta[c]):
            if k >= self.l_threshold[c]:
                last_idx = i
        start_point.append(last_idx)
    search_count = 0
    solstice = False
    prev_base = []
    fprint(
        self.log,
        'Start threshold: [{}]{}'.format(self.h_threshold, self.l_threshold))
    if limit < 0:
        fprint(self.log, 'Find infinity')
    else:
        fprint(self.log, 'Find limit: {}'.format(limit))
    fprint(self.log, 'Print threshold every {} times'.format(times))
    d_position = [start_point[i] for i in range(len(self.classes))]
    max_point = [len(self.delta[c]) - 1 for c in range(len(self.classes))]
    ts = timeit.default_timer()
    prev_base.append(self.l_threshold.copy())
    while not solstice:
        search_count += 1
        f1_scores = []
        ''' Base point F1-score '''
        classified = []
        for flow_idx, fpreds, fprobs in zip(range(len(self.flows)),
                                            self.ppreds, self.pprobs):
            found = False
            for pkt_idx, pred, prob in zip(range(len(fpreds)), fpreds,
                                           fprobs):
                if self.h_threshold <= prob:
                    classified.append(pred)
                    found = True
                    break
            if not found:
                if (self.l_threshold[self.spreds[flow_idx]] <=
                        self.sprobs[flow_idx]):
                    classified.append(self.spreds[flow_idx])
                else:
                    max_prob_idx = np.argmax(fprobs)
                    classified.append(fpreds[max_prob_idx])
        classified = np.array(classified)
        base_f1 = f1_score(y_true=self.y_true,
                           y_pred=classified,
                           labels=self.classes,
                           average='macro')
        ''' Surrounding points F1-score '''
        for class_idx, c in enumerate(self.classes):
            if d_position[class_idx] + 1 > max_point[class_idx]:
                continue
            # Probe one delta step upward for this class.
            self.l_threshold[class_idx] = self.delta[class_idx][
                d_position[class_idx] + 1]
            self.l_threshold[class_idx] = round(self.l_threshold[class_idx],
                                                3)
            classified = []
            for flow_idx, fpreds, fprobs in zip(range(len(self.flows)),
                                                self.ppreds, self.pprobs):
                found = False
                for pkt_idx, pred, prob in zip(range(len(fpreds)), fpreds,
                                               fprobs):
                    if self.h_threshold <= prob:
                        classified.append(pred)
                        found = True
                        break
                if not found:
                    if (self.l_threshold[self.spreds[flow_idx]] <=
                            self.sprobs[flow_idx]):
                        classified.append(self.spreds[flow_idx])
                    else:
                        max_prob_idx = np.argmax(fprobs)
                        classified.append(fpreds[max_prob_idx])
            classified = np.array(classified)
            f1_scores.append([
                class_idx,
                f1_score(y_true=self.y_true,
                         y_pred=classified,
                         labels=self.classes,
                         average='macro')
            ])
            # Restore this class's threshold before probing the next one.
            self.l_threshold[class_idx] = self.delta[class_idx][
                d_position[class_idx]]
            self.l_threshold[class_idx] = round(self.l_threshold[class_idx],
                                                3)
        tmp_f1 = np.max(f1_scores, axis=0)[-1]
        f1_scores = np.array(f1_scores, dtype=object)  # np.object is removed
        tmp_f1 = f1_scores[f1_scores[:, 1] == tmp_f1]
        # Break ties between equally good moves at random.
        max_f1 = np.squeeze(tmp_f1[np.random.choice(np.arange(len(tmp_f1)),
                                                    1)])
        te = timeit.default_timer()
        if max_f1[1] < base_f1:
            solstice = True
            prev_base = []
            prev_base.append(self.l_threshold.copy())
            fprint(
                self.log, 'Total process count: {} ({:.4f} seconds)'.format(
                    search_count, te - ts))
        else:
            diff = max_f1[1] - base_f1
            d_position[max_f1[0]] += 1
            # Bug fix: move the threshold of the winning class (the original
            # indexed with the stale loop variable `class_idx`).
            self.l_threshold[max_f1[0]] = self.delta[max_f1[0]][d_position[
                max_f1[0]]]
            if max_f1[1] > base_f1:
                prev_base = []
                prev_base.append(self.l_threshold.copy())
            if self.verbose:
                print(
                    '[{:3d}][Base: {:.6f}] [Max: {:.6f}] [diff: {:.6f}] [class: {:2d}] [delta: {:.3f}] ({:.4f} sec)'
                    .format(search_count, base_f1, max_f1[1], diff,
                            max_f1[0],
                            self.delta[max_f1[0]][d_position[max_f1[0]]],
                            te - ts),
                    end='\r')
            if (search_count > limit) and (limit > -1):
                fprint(
                    self.log,
                    'Process count exceeded {} --> Stop process ({:.4f} seconds)'
                    .format(limit, te - ts))
                break
    print('')
    if solstice:
        print(
            '[{:3d}][Base: {:.6f}] [Max: {:.6f}] [diff: {:.6f}] [class: {:2d}] [delta: {:.3f}] ({:.4f} sec)'
            .format(search_count, base_f1, max_f1[1], diff, max_f1[0],
                    self.delta[max_f1[0]][d_position[max_f1[0]]], te - ts))
        fprint(
            self.log,
            'Found threshold: [{}]{}'.format(self.h_threshold,
                                             self.l_threshold))
    else:
        last_th = self.l_threshold.copy()
        self.l_threshold = prev_base[0]
        fprint(self.log,
               'Threshold is reset to the last F1-improving base')
        fprint(self.log, '{} ---> {}'.format(last_th, self.l_threshold))
    return '<Gradient function>'
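# Hedged usage sketch (not from the source): how the threshold search above
# might be driven. `Threshold` is a hypothetical class name, and the inputs
# (packet/session predictions, probabilities, flows, labels, delta grid) are
# assumed to come from the pipeline's prediction step.
def tune_thresholds(th, ppreds, pprobs, spreds, sprobs, flows, classes,
                    y_true, delta):
    th.initializing(ppreds, pprobs, spreds, sprobs, flows, classes, y_true,
                    delta)
    th.approximate()          # coarse per-class scan in 0.01 steps
    th.gradient(limit=-1)     # greedy hill climb over each delta grid
    return th.l_threshold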
from cklib.ckstd import fprint
from cklib import ckstd
from cklib import DataFrame
import joblib
import gc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

seed = 22
dataset_path = './bin/iscx2017session.csv'
clf_init = ['rf', 'dt', 'et', 'adt', 'arf', 'gbt']

if __name__ == "__main__":
    for clf in clf_init:
        fprint(None, 'Using {} classifier'.format(clf))
        dataframe = DataFrame.Session_Dataset(clf=clf, random_state=seed)
        dataframe.skip_data('Heartbleed', 'Infiltration',
                            u'Web Attack \x96 XSS',
                            u'Web Attack \x96 Sql Injection')
        dataframe.read_csv(path=dataset_path)
        dataframe.modelling()
        dataframe.predict()
        label_encoder = dataframe.getLabelEncoder()
        train_pred = dataframe.getTrainPredict()
        test_pred = dataframe.getTestPredict()
        train_true = label_encoder.transform(dataframe.getTrainLabel())
        test_true = label_encoder.transform(dataframe.getTestLabel())
        # The source is truncated mid-call here; completed with the obvious
        # arguments from the variables defined just above.
        train_report = classification_report(y_true=train_true,
                                             y_pred=train_pred)
def gradient(self, delta_step=0.01, times=1, limit=10):
    assert self.isInit, 'Class Threshold is not initialized'
    search_count = 0
    solstice = False
    delta = delta_step
    prev_base = []
    fprint(
        self.log,
        'Start threshold: [{}]{}'.format(self.h_threshold, self.l_threshold))
    fprint(self.log, 'Print threshold every {} times'.format(times))
    ts = timeit.default_timer()
    prev_base.append(self.l_threshold.copy())
    while not solstice:
        search_count += 1
        f1_scores = []
        ''' Base point F1-score '''
        classified = []
        for flow_idx, fpreds, fprobs in zip(range(len(self.flows)),
                                            self.ppreds, self.pprobs):
            found = False
            for pkt_idx, pred, prob in zip(range(len(fpreds)), fpreds,
                                           fprobs):
                if self.h_threshold <= prob:
                    classified.append(pred)
                    found = True
                    break
            if not found:
                if (self.l_threshold[self.spreds[flow_idx]] <=
                        self.sprobs[flow_idx]):
                    classified.append(self.spreds[flow_idx])
                else:
                    max_prob_idx = np.argmax(fprobs)
                    classified.append(fpreds[max_prob_idx])
        classified = np.array(classified)
        base_f1 = f1_score(y_true=self.y_true,
                           y_pred=classified,
                           labels=self.classes,
                           average='macro')
        ''' Surrounding points F1-score '''
        for ci, cs in enumerate(self.classes):
            for i in [-delta, delta]:
                self.l_threshold[ci] += i
                self.l_threshold[ci] = round(self.l_threshold[ci], 3)
                # Skip moves that leave [0.5, 1.0] or revisit a previous base.
                if self.l_threshold[ci] > 1:
                    self.l_threshold[ci] -= i
                    self.l_threshold[ci] = round(self.l_threshold[ci], 3)
                    f1_scores.append([ci, i, 0.])
                    continue
                if self.l_threshold[ci] < 0.5:
                    self.l_threshold[ci] -= i
                    self.l_threshold[ci] = round(self.l_threshold[ci], 3)
                    f1_scores.append([ci, i, 0.])
                    continue
                if self.l_threshold in prev_base:
                    self.l_threshold[ci] -= i
                    self.l_threshold[ci] = round(self.l_threshold[ci], 3)
                    f1_scores.append([ci, i, 0.])
                    continue
                classified = []
                for flow_idx, fpreds, fprobs in zip(range(len(self.flows)),
                                                    self.ppreds,
                                                    self.pprobs):
                    found = False
                    for pkt_idx, pred, prob in zip(range(len(fpreds)),
                                                   fpreds, fprobs):
                        if self.h_threshold <= prob:
                            classified.append(pred)
                            found = True
                            break
                    if not found:
                        if (self.l_threshold[self.spreds[flow_idx]] <=
                                self.sprobs[flow_idx]):
                            classified.append(self.spreds[flow_idx])
                        else:
                            max_prob_idx = np.argmax(fprobs)
                            classified.append(fpreds[max_prob_idx])
                classified = np.array(classified)
                f1_scores.append([
                    ci, i,
                    f1_score(y_true=self.y_true,
                             y_pred=classified,
                             labels=self.classes,
                             average='macro')
                ])
                # Restore the probed threshold before the next move.
                self.l_threshold[ci] -= i
                self.l_threshold[ci] = round(self.l_threshold[ci], 3)
        chg_th_idx = np.argmax(f1_scores, axis=0)[-1]
        max_f1 = f1_scores[chg_th_idx]
        te = timeit.default_timer()
        if max_f1[2] < base_f1:
            solstice = True
            fprint(
                self.log, 'Total process count: {} ({:.4f} seconds)'.format(
                    search_count, te - ts))
        else:
            diff = max_f1[2] - base_f1
            self.l_threshold[max_f1[0]] += max_f1[1]
            # Clamp the updated threshold to [0.5, 1.0].
            if self.l_threshold[max_f1[0]] > 1.:
                self.l_threshold[max_f1[0]] = 1.
            if self.l_threshold[max_f1[0]] < 0.5:
                self.l_threshold[max_f1[0]] = 0.5
            self.l_threshold[max_f1[0]] = round(self.l_threshold[max_f1[0]],
                                                3)
            prev_base.append(self.l_threshold.copy())
            if self.verbose:
                print('[Base: {}] [Max: {}] [difference: {}] [delta: {}]'.
                      format(base_f1, max_f1[2], diff, max_f1[1]))
            if search_count % times == 0:
                fprint(
                    self.log, '{} --> {} ({:.4f} seconds)'.format(
                        search_count, self.l_threshold, te - ts))
            if search_count > limit:
                fprint(
                    self.log,
                    'Process count exceeded {} --> Stop process ({:.4f} seconds)'
                    .format(limit, te - ts))
                break
    print('')
    if solstice:
        fprint(
            self.log,
            'Found threshold: [{}]{}'.format(self.h_threshold,
                                             self.l_threshold))
    else:
        fprint(
            self.log, 'Process stopped threshold: [{}]{}'.format(
                self.h_threshold, self.l_threshold))
    return '<Gradient function>'