def get_correlation(in_dir='', datasets='', feature='SIZE', header=True, out_dir='', out_file='.dat'):
    corr_results = {}
    for i, dataset in enumerate(datasets):
        in_file = os.path.join(in_dir, dataset, feature, f"header_{header}", 'Xy.dat')
        lg.debug(in_file)
        data = load(in_file)
        X_train, y_train, X_val, y_val, X_test, y_test = split_train_val_test(
            data['X'], data['y'], shuffle=True, random_state=RANDOM_STATE)
        # normalization
        ss, X_train, y_train, X_val, y_val, X_test, y_test = normalize(
            X_train, y_train, X_val, y_val, X_test, y_test)

        # 2. get correlation
        dim = X_test.shape[1]
        if feature == 'IAT':
            # iat_dim + header_dim = dim, where header_dim = 8 + ttl_dim (i.e., size_dim)
            # => iat_dim + 8 + size_dim = iat_dim + 8 + (iat_dim + 1) = dim
            # => iat_dim = (dim - 9) // 2
            start_idx = (dim - 8 - 1) // 2
        elif feature == 'SIZE':
            # size_dim + header_dim = dim
            # size_dim + (8 + size_dim) = dim
            # => size_dim = (dim - 8) // 2
            start_idx = (dim - 8) // 2
        else:
            # feature + header_feature (8 tcp flags + TTL): only works for 'IAT' and 'SIZE'
            msg = f'Error: {feature}'
            raise NotImplementedError(msg)

        corrs = []
        lg.debug(f'header_feature_start_idx: {start_idx}')
        for j in range(9):  # header features: 8 TCP flags + first TTL
            _corr = _get_each_correlation(X_test[:, start_idx + j], y_test)
            corrs.append(_corr)
        corr_results[(in_file, dataset, feature, X_test.shape)] = corrs

        _out_file = os.path.join(out_dir, dataset, 'correlation.dat')
        check_path(_out_file)
        dump(corrs, _out_file)
        print(_out_file)

    # save all results
    check_path(out_file)
    dump(corr_results, out_file)
    return out_file

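# A small, self-contained check of the start_idx arithmetic used in get_correlation above.
# The helper name and dimensions below are illustrative only and not part of the original code.
def _header_start_idx_example(dim, feature='SIZE'):
    # SIZE: size_dim + (8 flags + size_dim TTLs) = dim  =>  size_dim = (dim - 8) // 2
    # IAT:  iat_dim + 8 + (iat_dim + 1) = dim           =>  iat_dim = (dim - 9) // 2
    return (dim - 8) // 2 if feature == 'SIZE' else (dim - 8 - 1) // 2

# e.g., with dim = 28 for 'SIZE': 10 packet sizes, then the 8 TCP flags + first TTL start at index 10;
# with dim = 29 for 'IAT': 10 IATs, and the header block again starts at index 10.
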
def _generate_pcap(self):
    # step 1: obtain pcap and label
    if self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.5' or self.dataset_name == 'UNB(PC1)':
        self.IP = '192.168.10.5'
        self.orig_flows = os.path.join(
            self.out_dir, f'orig_unb(pc1)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.8' or self.dataset_name == 'UNB(PC2)':
        self.IP = '192.168.10.8'
        self.orig_flows = os.path.join(
            self.out_dir, f'orig_unb(pc2)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.9' or self.dataset_name == 'UNB(PC3)':
        self.IP = '192.168.10.9'
        self.orig_flows = os.path.join(
            self.out_dir, f'orig_unb(pc3)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.14' or self.dataset_name == 'UNB(PC4)':
        self.IP = '192.168.10.14'
        self.orig_flows = os.path.join(
            self.out_dir, f'orig_unb(pc4)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.15' or self.dataset_name == 'UNB(PC5)':
        self.IP = '192.168.10.15'
        self.orig_flows = os.path.join(
            self.out_dir, f'orig_unb(pc5)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'DEMO_IDS/DS-srcIP_192.168.10.5':
        self.IP = '192.168.10.5'
        self.orig_flows = os.path.join(
            self.out_dir, f'orig_demo_{self.direction}_flows-{self.IP}.dat')
    else:
        raise ValueError('dataset does not exist.')

    remove_file(self.Xy_file, self.overwrite)
    if not os.path.exists(self.orig_flows):
        lg.warning(f'{self.orig_flows} does not exist.')
        check_path(self.orig_flows)
        meta = self.get_unb_flows(in_dir='../Datasets', direction=self.direction)
        dump(meta, out_file=self.orig_flows)
        lg.debug('in_dir (pcaps): ' + meta['in_dir'] + ', direction: ' + meta['direction'])
        lg.debug('normal_pcap: ' + str(len(meta['normal_pcap'])) +
                 ', normal_flows: ' + str(len(meta['normal_flows'])))
        lg.debug('abnormal_pcap: ' + str(len(meta['abnormal_pcap'])) +
                 ', abnormal_flows: ' + str(len(meta['abnormal_flows'])))
    else:
        pass

def gather(in_dir='src', out_dir=''):
    """ Collect all individual results together.

    Parameters
    ----------
    in_dir:
        search results from the given directory
    out_dir:
        save the gathered results to the given directory

    Returns
    -------
    out_file:
        the short csv for a quick overview
    """
    res = []
    for dataset, feature, header, model, tuning in list(
            itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING)):
        f = os.path.join(in_dir, dataset, feature, f'header_{header}', model,
                         f'tuning_{tuning}', 'res.csv')
        try:
            line = [str(v) for v in
                    pd.read_csv(f, sep=',', header=None).values.flatten().tolist()][1:]
            lg.debug(f, line)
            if len(str(line[0])) == 0:
                lg.error(f'Error: {line}. [{header}, {tuning}, {feature}, {dataset}, {model}]')
        except Exception as e:
            lg.error(f'Error: {e}. [{header}, {tuning}, {feature}, {dataset}, {model}]')
            line = ['', '0_0|0_0|0_0', '']  # [score, shape, params]
        res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}'] + line)

    # Save all results to gather.csv
    out_file = os.path.join(out_dir, 'gather.csv')
    check_path(out_file)
    with open(out_file, 'w') as f:
        for vs in res:
            f.write(','.join(vs) + '\n')

    # Only save the needed data for a quick overview
    short_file = os.path.join(os.path.split(out_file)[0], 'short.csv')
    with open(short_file, 'w') as f:
        for vs in res:
            if vs[5] == '' or vs[7] == '':
                lg.warning(f'Warning: {vs}.')
            tmp = vs[6].split('|')
            shape = '|'.join(v.split('_')[0] for v in tmp)
            dim = tmp[0].split('_')[1]
            f.write(','.join(vs[:6] + [shape, dim]) + '\n')

    return out_file

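# Minimal sketch of how the shape cell written by gather() is parsed for short.csv.
# The placeholder '0_0|0_0|0_0' encodes "rows_dim" for the train|val|test splits; the sample
# value below is made up for illustration.
def _parse_shape_cell_example(cell='5000_30|1000_30|2000_30'):
    parts = cell.split('|')
    shape = '|'.join(v.split('_')[0] for v in parts)  # rows per split, e.g. '5000|1000|2000'
    dim = parts[0].split('_')[1]                      # feature dimension, e.g. '30'
    return shape, dim
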
def get_scam2019_flows(in_dir='../Datasets/UCHI/IOT_2019', out_dir='',
                       dataset_name='scam_192.168.143.42', direction='src'):
    IP = '192.168.143.42'
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    file_name = 'fridge_cam_sound_ghome_2daysactiv-scam_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name, file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    # ~1000 normal flows; augmenting will generate > 1000 subflows
    normal_flows = _pcap2flows(normal_pcap, verbose=10)
    max_interval = np.quantile([_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows, step=10, max_interval=max_interval)
    lg.debug(f'normal_flows: {len(normal_flows)}')

    abnormal_pcap = os.path.join(out_dir, f'pc_{IP}_abnormal.pcap')
    check_path(abnormal_pcap)
    # file_name = 'samsung_camera-2daysactiv-src_192.168.143.42-anomaly.pca'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-scam_abnormal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name, file_name=file_name)
    filter_ip(pcap_file, abnormal_pcap, ips=[IP], direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)
    abnormal_flows = augment_flows(abnormal_flows, step=1, max_interval=max_interval)
    lg.debug(f'after augmenting abnormal_flows: {len(abnormal_flows)}')

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta

def report(in_file='gather.dat', delimiter=','):
    res = load(in_file)
    out_file = os.path.join(os.path.dirname(in_file), 'report.csv')
    check_path(out_file)
    with open(out_file, 'w') as f:
        for header in HEADER:
            for tuning in TUNING:
                for feature in FEATURES:
                    for dataset in DATASETS:
                        for model in MODELS:
                            data = get_one_res(res, f'header_{header}', f'tuning_{tuning}',
                                               feature, dataset, model)
                            line = delimiter.join(data) + '\n'
                            lg.debug(line)
                            f.write(line)
    lg.info(f'report: {out_file}')
    return out_file

def get_ctu_flows(self, in_dir='../Datasets', direction='src'):
    """
    https://www.stratosphereips.org/datasets-iot
    Malware on IoT Dataset
    """
    self.normal_pcap = os.path.join(self.out_dir, 'pc_192.168.1.196.pcap')
    check_path(self.normal_pcap)
    # filter pcap
    # file_name = '2019-01-09-22-46-52-src_192.168.1.196_CTU_IoT_CoinMiner_anomaly.pcap'
    file_name = 'CTU-IoT-Malware-Capture-41-1_2019-01-09-22-46-52-192.168.1.196.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name='CTU/IOT_2017', file_name=file_name)
    filter_ip(pcap_file, self.normal_pcap, ips=['192.168.1.196'], direction=direction)
    normal_flows = _pcap2flows(self.normal_pcap, verbose=10)  # normal flows

    self.abnormal_pcap = os.path.join(self.out_dir, 'pc_192.168.1.195_abnormal.pcap')
    check_path(self.abnormal_pcap)
    # file_name = '2018-12-21-15-50-14-src_192.168.1.195-CTU_IoT_Mirai_normal.pcap'
    file_name = 'CTU-IoT-Malware-Capture-34-1_2018-12-21-15-50-14-192.168.1.195.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name='CTU/IOT_2017', file_name=file_name)
    filter_ip(pcap_file, self.abnormal_pcap, ips=['192.168.1.195'], direction=direction)
    abnormal_flows = _pcap2flows(self.abnormal_pcap, verbose=10)  # abnormal flows

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcap': self.normal_pcap,
        'abnormal_pcap': self.abnormal_pcap,
        'direction': direction,
        'in_dir': in_dir
    }
    return meta

def get_smtv2019_flows(in_dir='../Datasets/UCHI/IOT_2019', out_dir='',
                       dataset_name='smtv_10.42.0.1', direction='src'):
    IP = '10.42.0.1'
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    file_name = 'pc_10.42.0.1_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name, file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    normal_flows = _pcap2flows(normal_pcap, verbose=10)  # normal flows
    max_interval = np.quantile([_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows, step=10, max_interval=max_interval)

    abnormal_pcap = os.path.join(out_dir, 'pc_10.42.0.119_abnormal.pcap')
    check_path(abnormal_pcap)
    file_name = 'pc_10.42.0.119_anomaly.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name, file_name=file_name)
    filter_ip(pcap_file, abnormal_pcap, ips=['10.42.0.119'], direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)  # abnormal flows
    abnormal_flows = augment_flows(abnormal_flows, step=10, max_interval=max_interval)

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta

def get_bstch2019_flows(in_dir='../Datasets/UCHI/IOT_2019', out_dir='',
                        dataset_name='scam_192.168.143.48', direction='src'):
    IP = '192.168.143.48'
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    # file_name = 'bose_soundtouch-2daysactiv-src_192.168.143.48-normal.pcap'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-bstch_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name, file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    normal_flows = _pcap2flows(normal_pcap, verbose=10)  # normal flows
    max_interval = np.quantile([_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows, step=10, max_interval=max_interval)

    abnormal_pcap = os.path.join(out_dir, f'pc_{IP}_abnormal.pcap')
    check_path(abnormal_pcap)
    # file_name = 'bose_soundtouch-2daysactiv-src_192.168.143.48-anomaly.pcap'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-bstch_abnormal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name, file_name=file_name)
    filter_ip(pcap_file, abnormal_pcap, ips=[IP], direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)  # abnormal flows
    # abnormal_flows = augment_flows(abnormal_flows, starts=50, max_len=max_len)
    abnormal_flows = augment_flows(abnormal_flows, step=10, max_interval=max_interval)

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta

def _generate_pcap(self):
    # preprocess the pcap and label based on the original pcap and label
    if self.dataset_name == 'MAWI/WIDE_2019/pc_202.171.168.50' or self.dataset_name == 'MAWI':
        # http://mawi.wide.ad.jp/mawi/samplepoint-F/2019/201912071400.html
        self.IP = '202.171.168.50'
        self.orig_flows = os.path.join(self.out_dir, f'mawi_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.orig_flows, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_mawi_flows(in_dir='../Datasets', direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug('in_dir (pcaps): ' + meta['in_dir'] + ', direction: ' + meta['direction'])
            lg.debug('normal_pcap: ' + str(len(meta['normal_pcap'])) +
                     ', normal_flows: ' + str(len(meta['normal_flows'])))
            lg.debug('abnormal_pcap: ' + str(len(meta['abnormal_pcap'])) +
                     ', abnormal_flows: ' + str(len(meta['abnormal_flows'])))
    else:
        raise ValueError('dataset does not exist.')

def generate(self):
    if os.path.exists(self.Xy_file):
        self.X, self.y = load(self.Xy_file)
    else:
        q_interval = 0.9
        # pcap to flows
        flows = self.pcap2flows(self.pcap_file)
        # flows to subflows
        labels = [1] * len(flows)
        durations = [_get_flow_duration(pkts) for fid, pkts in flows]
        interval = _get_split_interval(durations, q_interval=q_interval)
        subflows, labels = self.flow2subflows(flows, interval=interval, labels=labels)
        # get dimension
        normal_flows = subflows
        num_pkts = [len(pkts) for fid, pkts in normal_flows]  # only on normal flows
        # use the same q_interval to get the dimension
        dim = int(np.floor(np.quantile(num_pkts, q_interval)))
        lg.info(f'dim={dim}')
        # flows to features
        features, fids = self.flow2features(subflows, name=self.feature_name)
        # fix the feature size
        features = self.fix_feature(features, dim=dim)
        self.X = features
        self.y = np.asarray([0] * len(features))
        # save data to disk
        check_path(os.path.dirname(self.Xy_file))
        dump((self.X, self.y), out_file=self.Xy_file)
    return self.X, self.y

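# Small illustration of how generate() picks the feature dimension: the q-th quantile of the
# per-flow packet counts, floored to an int. The packet counts below are made up; fix_feature
# is then assumed to pad shorter flows and truncate longer ones to this dimension.
def _dim_from_quantile_example(num_pkts=(3, 5, 6, 6, 7, 9, 12, 30), q_interval=0.9):
    return int(np.floor(np.quantile(num_pkts, q_interval)))
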
def get_mawi_flows(self, in_dir='../Datasets', direction='src'):
    self.normal_pcap = os.path.join(self.out_dir, 'pc_202.171.168.50.pcap')
    check_path(self.normal_pcap)
    file_name = 'samplepoint-F_201912071400-src_dst_202.171.168.50.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name='MAWI/WIDE_2019', file_name=file_name)
    filter_ip(pcap_file, self.normal_pcap, ips=['202.171.168.50'], direction=direction)
    normal_flows = _pcap2flows(self.normal_pcap, verbose=10)  # normal flows

    self.abnormal_pcap = os.path.join(self.out_dir, 'pc_203.113.113.16_abnormal.pcap')
    check_path(self.abnormal_pcap)
    # file_name = 'samplepoint-F_201912071400-src_dst_202.4.27.109.pcap'  # ~5000 flows
    file_name = 'samplepoint-F_201912071400-src_203.113.113.16.pcap'  # ~1500 flows
    pcap_file = get_file_path(in_dir=in_dir, dataset_name='MAWI/WIDE_2019', file_name=file_name)
    filter_ip(pcap_file, self.abnormal_pcap, ips=['203.113.113.16'], direction=direction)
    abnormal_flows = _pcap2flows(self.abnormal_pcap, verbose=10)  # abnormal flows

    meta = {'normal_flows': normal_flows, 'abnormal_flows': abnormal_flows,
            'normal_pcap': self.normal_pcap, 'abnormal_pcap': self.abnormal_pcap,
            'direction': direction, 'in_dir': in_dir}
    return meta

def extract_subpcap(pcap_file, out_file, start_time, end_time, verbose=20, keep_original=True):
    """ Extract part of a pcap with editcap, e.g.,
        editcap -A "2017-07-04 09:02:00" -B "2017-07-04 09:05:00" input.pcap output.pcap

    Parameters
    ----------
    pcap_file: the input pcap
    out_file: the output pcap; if None, a name is derived from pcap_file and the time range
    start_time: e.g., "2017-07-04 09:02:00"
    end_time: e.g., "2017-07-04 09:05:00"
    verbose: print the editcap command if > 10
    keep_original: bool
        keep the original pcap or not, True (default)

    Returns
    -------
    out_file: the extracted pcap
    """
    if out_file is None:
        out_file = pcap_file + f'-start={start_time}-end={end_time}.pcap'
        out_file = out_file.replace(' ', '_')
    if os.path.exists(out_file):
        return out_file
    check_path(out_file)

    cmd = f"editcap -A \"{start_time}\" -B \"{end_time}\" \"{pcap_file}\" \"{out_file}\""
    if verbose > 10:
        print(f'{cmd}')
    result = ''
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True).stdout.decode('utf-8')
        if not keep_original:
            os.remove(pcap_file)
    except Exception as e:
        print(f'{e}, {result}')

    return out_file

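# Example usage of extract_subpcap, as a sketch only: it assumes editcap (shipped with Wireshark)
# is on PATH and that the input pcap below exists; the path and time window are illustrative.
def _extract_subpcap_example():
    return extract_subpcap('Friday-WorkingHours.pcap', out_file=None,
                           start_time='2017-07-04 09:02:00',
                           end_time='2017-07-04 09:05:00', verbose=20)
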
def _generate_pcap(self):
    # preprocess the pcap and label based on the original pcap and label
    if self.dataset_name == 'CTU/IOT_2017/pc_192.168.1.196' or self.dataset_name == 'CTU':
        self.IP = '192.168.1.196'
        self.orig_flows = os.path.join(
            self.out_dir, f'ctu_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.orig_flows, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_ctu_flows(in_dir='../Datasets', direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug('in_dir (pcaps): ' + meta['in_dir'] + ', direction: ' + meta['direction'])
            lg.debug('normal_pcap: ' + str(len(meta['normal_pcap'])) +
                     ', normal_flows: ' + str(len(meta['normal_flows'])))
            lg.debug('abnormal_pcap: ' + str(len(meta['abnormal_pcap'])) +
                     ', abnormal_flows: ' + str(len(meta['abnormal_flows'])))
    else:
        raise ValueError('dataset does not exist.')

def gather(in_dir='examples/representation/out/src', out_dir=''):
    res = []
    for dataset, feature, header, model, tuning in list(
            itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING)):
        f = os.path.join(in_dir, dataset, feature, f'header_{header}', model,
                         f'tuning_{tuning}', 'res.csv')
        try:
            line = [str(v) for v in
                    pd.read_csv(f, sep=',', header=None).values.flatten().tolist()][1:]
            print(f, line)
        except Exception as e:
            print(f'Error: {e}')
            line = ['', '0_0|0_0|0_0', '']  # [score, shape, params]
        res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}'] + line)

    out_file = os.path.join(out_dir, 'gather.csv')
    check_path(out_file)
    with open(out_file, 'w') as f:
        for vs in res:
            f.write(','.join(vs) + '\n')

    short_file = os.path.join(os.path.split(out_file)[0], 'short.csv')
    # data = pd.read_csv(out_file, error_bad_lines=False, header=None)  # will miss some results.
    # data.iloc[:, 0:7].to_csv(short_file)
    with open(short_file, 'w') as f:
        for vs in res:
            tmp = vs[6].split('|')
            shape = '|'.join(v.split('_')[0] for v in tmp)
            dim = tmp[0].split('_')[1]
            f.write(','.join(vs[:6] + [shape, dim]) + '\n')

    return out_file

def main():
    """ Get results from the short csv and plot the results. """
    # raw_file = 'examples/representation/out/src/~res.csv'
    # in_file = 'examples/representation/report/res.csv'
    # check_path(in_file)
    # copyfile(raw_file, in_file)
    in_file = 'examples/representation/out/src/results/2021-09-28/short.csv'
    data = parse_csv(in_file)
    data = format_name(data, data_orig2name)
    out_dir = 'examples/representation/report/out'

    TUNING = [True, False]
    MODELS = ['OCSVM', 'IF', 'AE', 'KDE', 'GMM', 'PCA']

    ########################################################################################################
    # Get the results on subsets of the datasets and all algorithms, e.g.,
    # 1. datasets: [UNB(PC1), UNB(PC4), CTU, MAWI, TV&RT, SFrig, and BSTch]
    #    algorithms: [OCSVM, IF, AE, KDE, GMM, PCA]
    DATASETS1 = ['UNB(PC1)', 'UNB(PC4)', 'CTU', 'MAWI', 'TV&RT', 'SFrig', 'BSTch']
    DATASETS2 = ['UNB(PC2)', 'UNB(PC3)', 'UNB(PC5)', 'SCam', 'GHom']
    DATASETS3 = ['UNB(PC1)', 'UNB(PC2)', 'UNB(PC3)', 'UNB(PC4)', 'UNB(PC5)', 'CTU', 'MAWI',
                 'TV&RT', 'SFrig', 'BSTch', 'SCam', 'GHom']
    DATASETS_LST = [DATASETS1, DATASETS2, DATASETS3]
    FIGS = ['size_effect', 'header_effect', 'fft_effect']

    for tuning in TUNING:
        tuning_type = 'best' if tuning else 'default'
        for DATASETS in DATASETS_LST:
            # 1. size_effect
            fig_type = FIGS[0]
            out_file = f'{out_dir}/{fig_type}-{tuning_type}-{len(DATASETS)}.pdf'
            check_path(out_file)
            results = pkt_size_diff(data, tuning, DATASETS, MODELS)
            size_effect_plot(results, MODELS, DATASETS, fig_type, out_file)

            # 2. header_effect
            fig_type = FIGS[1]
            out_file = f'{out_dir}/{fig_type}-{tuning_type}-{len(DATASETS)}.pdf'
            check_path(out_file)
            results = pkt_header_diff(data, tuning, DATASETS, MODELS)
            header_effect_plot(results, MODELS, DATASETS, fig_type, out_file)

            # 3. fft_effect
            fig_type = FIGS[2]
            out_file = f'{out_dir}/{fig_type}-{tuning_type}-{len(DATASETS)}.pdf'
            check_path(out_file)
            results = fft_diff(data, tuning, DATASETS, MODELS)
            fft_effect_plot(results, MODELS, DATASETS, fig_type, out_file)

    lg.info('\n')

def plot_correlation_multi(corr_results, out_file='', title=None, show=True):
    """ Plot the correlation results.

    Parameters
    ----------
    corr_results: dict, keyed by (path, dataset, feature set, X_test shape)
    out_file: where to save the figure
    title
    show

    Returns
    -------
    """
    # only show the top 4 figures
    new_corr_results = {}
    for i, (dataset, name) in enumerate(data_orig2name.items()):
        for j, (key, corrs) in enumerate(corr_results.items()):
            _key_path, _dataset, _feat_set, X_test_shape = key
            if dataset in key:
                new_corr_results[(_key_path, _dataset, name, _feat_set, X_test_shape)] = corrs

    t = 0
    cols = 2
    fontsize = 20
    # http://jose-coto.com/styling-with-seaborn
    # colors = ["m", "#4374B3"]
    # palette = sns.color_palette('RdPu', 1)  # a list
    palette = [sns.color_palette('YlOrRd', 7)[4]]  # YlOrRd
    fig, axes = plt.subplots(2, cols, figsize=(18, 8))  # (width, height)
    # print(new_corr_results)
    for i, (key, corrs) in enumerate(new_corr_results.items()):
        print(f"i: {i}, {key}, corrs: {corrs}")
        # hue = feat_set
        key_path, dataset, short_name, feat_set, X_test_shape = key
        HEADER = ['FIN', 'SYN', 'RST', 'PSH', 'ACK', 'URG', 'ECE', 'CWR', '1st-TTL']
        data = sorted(range(len(corrs)), key=lambda i: abs(corrs[i]), reverse=True)[:6]  # top 6 values
        data = [[f'({HEADER[_i]}, y)', feat_set, corrs[_i]] for _i in data]
        # print(f"i: {i}, {key}, corrs: {data}")
        # for the same dataset, it has the same error bar
        new_yerrs = [1 / (np.sqrt(X_test_shape[0]))] * 6
        # print(f'i: {i}, {new_yerrs}')
        df = pd.DataFrame(data, columns=['Xi_y', 'feat_set', 'corr_rho'])
        if i % cols == 0 and i > 0:
            t += 1
        g = sns.barplot(x="Xi_y", y="corr_rho", ax=axes[t, i % cols], hue='feat_set',
                        data=df, palette=palette)
        g.set(xlabel=None)
        g.set(ylim=(-1, 1))
        if i % cols == 0:
            # g.set_ylabel(r'$\rho$', fontsize=fontsize + 4)
            g.set_ylabel(r'Correlation', fontsize=fontsize + 4)
            g.set_yticks([-1, -0.5, 0, 0.5, 1])
            g.set_yticklabels(g.get_yticks(), fontsize=fontsize + 6)  # set the values on the y axis
        else:
            g.set(ylabel=None)
            g.set_yticklabels(['' for v_tmp in g.get_yticks()])
            g.set_ylabel('')
        # g.set_title(dataset_name)
        g.get_legend().set_visible(False)
        g.set_xticklabels(g.get_xticklabels(), fontsize=fontsize + 4, rotation=30, ha="center")

        ys = []
        xs = []
        width = 0
        for i_p, p in enumerate(g.patches):
            height = p.get_height()
            width = p.get_width()
            ys.append(height)
            xs.append(p.get_x())
            if i_p == 0:
                pre = p.get_x() + p.get_width()
            if i_p > 0:
                cur = p.get_x()
                g.axvline(color='black', linestyle='--', x=pre + (cur - pre) / 2,
                          ymin=0, ymax=1, alpha=0.3)
                pre = cur + p.get_width()
            # https://stackoverflow.com/questions/34888058/changing-width-of-bars-in-bar-chart-created-using-seaborn-factorplot
            p.set_width(width / 3)  # set the bar width
            # recenter the bar
            p.set_x(p.get_x() + width / 3)

        g.set_title(short_name, fontsize=fontsize + 8)
        # add error bars (xs is a list, so convert it before the element-wise shift)
        g.errorbar(x=np.asarray(xs) + width / 2, y=ys, yerr=new_yerrs, fmt='none', c='b', capsize=3)

        # # get the legend and modify it
        # handles, labels = g.get_legend_handles_labels()
        # fig.legend(handles, ['IAT+SIZE'], title=None, loc='lower center', ncol=1,
        #            prop={'size': fontsize - 2})  # loc='lower right', loc=(0.74, 0.13)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)

    check_path(out_file)
    print(out_file)
    plt.savefig(out_file)  # should be called before plt.show()
    if show:
        plt.show()
    plt.close(fig)
    plt.close("all")

def get_unb_flows(self, in_dir='../Datasets', direction='src'):
    # preprocess the pcap and label based on the original pcap and label
    self.pcap_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.pcap')
    self.label_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.csv')
    remove_file(self.pcap_file, self.overwrite)
    remove_file(self.label_file, self.overwrite)
    check_path(self.pcap_file)
    check_path(self.label_file)

    if not os.path.exists(self.pcap_file) or not os.path.exists(self.label_file):
        # 1. original pcap
        friday_pcap_orig = get_file_path(ipt_dir=in_dir,
                                         dataset_name='UNB/CICIDS_2017/',
                                         data_cat='pcaps/Friday',
                                         file_name='Friday-WorkingHours.pcap')
        # filter pcap
        filter_ip(friday_pcap_orig, out_file=self.pcap_file, ips=[self.IP],
                  direction=self.direction, keep_original=True)

        # 2. merge original labels
        friday_label = get_file_path(ipt_dir=self.out_dir,
                                     dataset_name='UNB/CICIDS_2017/',
                                     data_cat='labels/Friday',
                                     file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
        friday_label_orig1 = get_file_path(ipt_dir=in_dir,
                                           dataset_name='UNB/CICIDS_2017/',
                                           data_cat='labels/Friday',
                                           file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
        friday_label_orig2 = get_file_path(ipt_dir=in_dir,
                                           dataset_name='UNB/CICIDS_2017/',
                                           data_cat='labels/Friday',
                                           file_name='Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
        friday_label_orig3 = get_file_path(ipt_dir=in_dir,
                                           dataset_name='UNB/CICIDS_2017/',
                                           data_cat='labels/Friday',
                                           file_name='Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
        friday_label_tmp = friday_label + '-all.csv'
        check_path(friday_label_tmp)
        merge_labels([friday_label_orig1, friday_label_orig2, friday_label_orig3],
                     mrg_label_path=friday_label_tmp)
        filter_csv_ip(friday_label_tmp, out_file=self.label_file, ips=[self.IP],
                      direction=self.direction)

    ##############################################################################################
    # step 2.1 extract flows
    flows = _pcap2flows(self.pcap_file, verbose=10)  # normal and abnormal flows
    # step 2.2 split normal flows and abnormal flows
    labels = pd.read_csv(self.label_file).values
    normal_flows, abnormal_flows = split_normal_abnormal(flows, labels)
    # augment abnormal flows
    max_interval = np.quantile([_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    abnormal_flows = augment_flows(abnormal_flows, step=1, max_interval=max_interval)
    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcap': self.pcap_file,
        'abnormal_pcap': self.label_file,
        'direction': direction,
        'in_dir': in_dir
    }
    return meta

def _generate_features(self, normal_flows, abnormal_flows):
    # step 3: flows to features
    # only on normal flows
    normal_flow_lengths = [len(pkts) for fid, pkts in normal_flows]
    qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
    normal_lengths_stat = np.quantile(normal_flow_lengths, q=qs)
    lg.debug(f'normal_lengths_stat: {normal_lengths_stat}, where q = {qs}')
    self.dim = int(np.floor(np.quantile(normal_flow_lengths, self.q_flow_dur)))
    lg.info(f'dim(SIZE) = {self.dim}')

    self.X = []
    self.y = []

    if self.header:
        header_features, header_fids = _get_header(normal_flows)
        header_dim = int(np.quantile([len(v) for v in header_features], q=self.q_flow_dur))
        lg.info(f'header_dim: {header_dim}')
    else:
        header_dim = None

    if 'SAMP' in self.feature_name:
        normal_features, normal_fids = self.flow2features(normal_flows,
                                                          name=self.feature_name,
                                                          dim=self.dim,
                                                          header=self.header,
                                                          header_dim=header_dim)
        abnormal_features, abnormal_fids = self.flow2features(abnormal_flows,
                                                              name=self.feature_name,
                                                              dim=self.dim,
                                                              header=self.header,
                                                              header_dim=header_dim)
        for q in normal_features.keys():
            X_ = list(normal_features[q][0])  # (features, fid, sampling_rate_)
            y_ = [0] * len(normal_features[q][0])
            X_.extend(list(abnormal_features[q][0]))
            y_.extend([1] * len(abnormal_features[q][0]))
            self.X.append(np.asarray(X_))
            self.y.append(np.asarray(y_))

        # save data to disk
        check_path(self.Xy_file)
        meta = {'X': self.X, 'y': self.y,
                'normal_flow_lengths': (normal_flow_lengths, normal_lengths_stat),
                'dim': self.dim, 'q_flow_dur': self.q_flow_dur}
        dump(meta, out_file=self.Xy_file)
        # save feature data as csv
        csv_file = os.path.splitext(self.Xy_file)[0] + '.csv'
        # np.savetxt(csv_file, np.concatenate([self.X, self.y[..., np.newaxis]], axis=1), delimiter=',')
    else:
        for flows, label in zip([normal_flows, abnormal_flows], [0, 1]):
            features, fids = self.flow2features(flows, name=self.feature_name, dim=self.dim,
                                                header=self.header, header_dim=header_dim)
            self.X.extend(features)
            self.y.extend([label] * len(features))

        # save data to disk
        check_path(self.Xy_file)
        self.X = np.asarray(self.X)
        self.y = np.asarray(self.y)
        meta = {'X': self.X, 'y': self.y,
                'normal_flow_lengths': (normal_flow_lengths, normal_lengths_stat),
                'dim': self.dim, 'q_flow_dur': self.q_flow_dur}
        dump(meta, out_file=self.Xy_file)
        # save feature data as csv
        csv_file = os.path.splitext(self.Xy_file)[0] + '.csv'
        np.savetxt(csv_file, np.concatenate([self.X, self.y[..., np.newaxis]], axis=1),
                   delimiter=',')

    return meta

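# Hedged sketch of what fixing a per-flow feature vector to `dim` amounts to here (zero-padding
# short vectors and truncating long ones); the project's flow2features/fix_feature helpers are
# assumed to behave equivalently, and the packet sizes below are illustrative only.
def _fix_len_example(v, dim):
    v = np.asarray(v, dtype=float)
    return np.pad(v, (0, dim - len(v))) if len(v) < dim else v[:dim]

# e.g., _fix_len_example([60, 1500, 40], 5)             -> [60., 1500., 40., 0., 0.]
#       _fix_len_example([60, 1500, 40, 52, 52, 44], 5) -> the first 5 packet sizes
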
def _main():
    """ Main function

    Returns
    -------
    """
    res = []
    out_file = f'{OUT_DIR}/src/{RESULT_DIR}/res.dat'
    is_parallel = False
    if is_parallel:  # with parallel
        def set_args(dataset, feature, header, model, tuning):
            args = parser()
            args.dataset = dataset
            args.feature = feature
            args.header = header
            args.model = model
            args.tuning = tuning
            lg.debug(args)
            return args

        # if backend='loky', the time taken is less than that of serial; with
        # backend='multiprocessing', the time cost is very close to the serial one.
        _res = []
        with Parallel(n_jobs=20, backend='loky') as parallel:
            _res = parallel(delayed(_representation.main_no_tuning_vs_tuning)  # delayed
                            (set_args(dataset, feature, header, model, tuning))  # params
                            for dataset, feature, header, model, tuning in
                            list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING)))  # for

        # reorganize the results
        res = []
        for history, (dataset, feature, header, model, tuning) in zip(
                _res, list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))):
            res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}',
                        history['best']])
    else:  # without parallel
        for dataset, feature, header, model, tuning in list(
                itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING)):
            try:
                lg.info(f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}')
                args = parser()
                args.dataset = dataset
                args.feature = feature
                args.header = header
                args.model = model
                args.tuning = tuning
                args.overwrite = OVERWRITE
                history = _representation.main_no_tuning_vs_tuning(args)
                res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}',
                            history['best']])
                # avoid losing any result, so save it immediately
                _out_file = f'{args.out_dir}/{args.direction}/{RESULT_DIR}/~res.csv'
                check_path(_out_file)
                save2txt(res, _out_file, delimiter=',')
            except Exception as e:
                lg.error(f'Error: {e}. [{dataset}, {feature}, {header}, {model}, {tuning}]')

    # save the final results: '.dat' and '.csv'
    check_path(out_file)
    dump(res, out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, OVERWRITE)
    save2txt(res, out_file, delimiter=',')
    lg.info(f'final result: {out_file}')

def main(args=None, test=False):
    """ Get the result according to the given parameters.

    Parameters
    ----------
    args
    test: boolean
        if we evaluate the built model on the val set or the test set

    Returns
    -------
    history: dict
        Return the best result on 'SAMP'-related features; otherwise, return the single result.
    """
    try:
        lg.debug(args)
        out_dir = os.path.join(args.out_dir, args.direction, args.dataset, args.feature,
                               f'header_{args.header}', args.model, f'tuning_{args.tuning}')
        ###############################################################################################
        """ 1.1 Parse data and extract features
        """
        lg.info(f'\n--- 1.1 Parse data')
        data = Data(dataset_name=args.dataset, direction=args.direction,
                    feature_name=args.feature, header=args.header,
                    overwrite=args.overwrite, random_state=RANDOM_STATE)
        data.generate()

        if 'SAMP' in args.feature:
            best = {'score': 0, 'model': None}
            for i, (X, y) in enumerate(zip(data.X, data.y)):
                lg.debug(f'SAMP_{i}')
                try:
                    res_, data_ = _single_main(args, X, y, test=test)
                except Exception as e:
                    lg.error(f'Error: {e}. SAMP_{i}')
                    continue
                # keep the best result on the SAMP data
                if res_['score'] > best['score']:
                    best['score'] = res_['score']
                    best['model'] = copy.deepcopy(res_)
                    best['data'] = copy.deepcopy(data_)
            history = best
        else:
            X, y = data.X, data.y
            res_, data_ = _single_main(args, X, y, test=test)
            history = {'score': res_['score'], 'model': res_, 'data': data_}
    except Exception as e:
        traceback.print_exc()
        history = {'score': 0, 'model': {}, 'data': (None, None, None, None, None, None)}

    ###################################################################################################
    """ 3. Dump the result to disk
    """
    lg.info(f'\n--- 3. Save the result')
    out_file = os.path.join(out_dir, f'res.dat')
    check_path(out_file)
    dump(history, out_file=out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, overwrite=OVERWRITE)
    save2txt(history, out_file)
    lg.info(f'res_file: {out_file}')

    return history

def main_no_tuning_vs_tuning(args=None):
    """ Get results with the default and the best (tuned) parameters according to the args.

    Parameters
    ----------
    args: given parameters

    Returns
    -------
    history: dict
        store all the results in a dictionary
    """
    # 1. Get the dimension of the dataset. Some algorithms (e.g., AE) need the dimension.
    data = Data(dataset_name=args.dataset, direction=args.direction, feature_name=args.feature,
                header=args.header, overwrite=args.overwrite, random_state=RANDOM_STATE)
    data.generate()
    if 'SAMP' in args.feature:
        X = data.X[0]
    else:
        X = data.X

    # 2. Get the results with the given model
    if args.model == 'OCSVM':
        if args.tuning:
            qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
        else:
            qs = [0.3]
        history = {}  # store the best result, model parameters, and the best model (dict)
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: q = {qs}')
        for q in qs:
            args.model_params = {'q': q}
            # get results on the validation set
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['q'] = q
                best['model'] = copy.deepcopy(history_)
            history[q] = history_
        # get the final result on the test set
        args.model_params = {'q': best['q']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'GMM':
        if args.tuning:
            n_components_arr = [1, 2, 5, 10, 15, 20, 25, 30, 35, 40]
        else:
            n_components_arr = ['quickshift']
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_components_arr = {n_components_arr}')
        for n_components in n_components_arr:
            args.model_params = {'n_components': n_components}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_components'] = n_components
                best['model'] = copy.deepcopy(history_)
            history[n_components] = history_
        # get the final result on the test set
        args.model_params = {'n_components': best['n_components']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'IF':
        if args.tuning:
            n_estimators_arr = [int(v) for v in list(np.linspace(30, 300, num=10, endpoint=True))]
        else:
            n_estimators_arr = [100]
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_estimators_arr = {n_estimators_arr}')
        for n_estimators in n_estimators_arr:
            args.model_params = {'n_estimators': n_estimators}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_estimators'] = n_estimators
                best['model'] = copy.deepcopy(history_)
            history[n_estimators] = history_
        # get the final result on the test set
        args.model_params = {'n_estimators': best['n_estimators']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'PCA':
        if args.tuning:
            n_components_arr = [int(v) for v in
                                list(np.linspace(1, min(X.shape), num=10, endpoint=False))]
        else:
            n_components_arr = ['mle']
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_components_arr = {n_components_arr}')
        for n_components in n_components_arr:
            args.model_params = {'n_components': n_components}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_components'] = n_components
                best['model'] = copy.deepcopy(history_)
            history[n_components] = history_
        # get the final result on the test set
        args.model_params = {'n_components': best['n_components']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'KDE':
        if args.tuning:
            qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
        else:
            qs = [0.3]
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: q = {qs}')
        for q in qs:
            args.model_params = {'q': q}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['q'] = q
                best['model'] = copy.deepcopy(history_)
            history[q] = history_
        # get the final result on the test set
        args.model_params = {'q': best['q']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'AE':
        if args.tuning:
            feat_dim = X.shape[1]

            def get_AE_parameters(d, num=10):
                latent_sizes = []
                for i in range(num):
                    v = np.ceil(1 + i * (d - 2) / 9).astype(int)
                    if v not in latent_sizes:
                        latent_sizes.append(v)
                hidden_sizes = [min((d - 1), np.ceil(2 * v).astype(int)) for v in latent_sizes]
                hidden_neurons = []
                for i, (hid, lat) in enumerate(zip(hidden_sizes, latent_sizes)):
                    v = [d, hid, lat, hid, d]
                    hidden_neurons.append(v)
                return hidden_neurons

            hidden_neurons_arr = get_AE_parameters(feat_dim, num=10)
        else:
            feat_dim = X.shape[1]
            latent_dim = np.ceil(feat_dim / 2).astype(int)
            hid = min((feat_dim - 1), np.ceil(2 * latent_dim).astype(int))
            hidden_neurons = [feat_dim, hid, latent_dim, hid, feat_dim]
            hidden_neurons_arr = [hidden_neurons]

        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: hidden_neurons = {hidden_neurons_arr}')
        for hidden_neurons in hidden_neurons_arr:
            args.model_params = {'hidden_neurons': hidden_neurons}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['hidden_neurons'] = hidden_neurons
                best['model'] = copy.deepcopy(history_)
            history[tuple(hidden_neurons)] = history_
        # get the final result on the test set
        args.model_params = {'hidden_neurons': best['hidden_neurons']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    else:
        msg = f'{args.model}'
        raise NotImplementedError(msg)

    # lg.info(f'\n*** best: ' + str(history['best']))
    out_file = os.path.join(args.out_dir, args.direction, args.dataset, args.feature,
                            f'header_{args.header}', args.model, f'tuning_{args.tuning}', 'res.dat')
    check_path(out_file)
    dump(history, out_file)

    return history

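# Worked example (illustrative only) of the AE architecture grid that get_AE_parameters builds
# above, for a hypothetical feature dimension d = 20 and num = 10 candidates:
#   latent sizes: ceil(1 + i * (d - 2) / 9) for i = 0..9 -> 1, 3, 5, 7, 9, 11, 13, 15, 17, 19
#   hidden sizes: min(d - 1, ceil(2 * latent))           -> 2, 6, 10, 14, 18, 19, 19, 19, 19, 19
# so each candidate is [d, hidden, latent, hidden, d], e.g. [20, 10, 5, 10, 20] or [20, 19, 19, 19, 20].
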
def main():
    res = []
    res_file = 'res2'
    is_parallel = False
    if is_parallel:
        def set_args(dataset, feature, header, model, tuning):
            args = parser()
            args.dataset = dataset
            args.feature = feature
            args.header = header
            args.model = model
            args.tuning = tuning
            print(args)
            return args

        # if backend='loky', the time taken is less than that of serial; with
        # backend='multiprocessing', the time cost is very close to the serial one.
        _res = []
        with Parallel(n_jobs=20, backend='loky') as parallel:
            _res = parallel(delayed(_representation.main_no_tuning_vs_tuning)  # delayed
                            (set_args(dataset, feature, header, model, tuning))  # params
                            for dataset, feature, header, model, tuning in
                            list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING)))  # for

        # reorganize the results
        res = []
        for history, (dataset, feature, header, model, tuning) in zip(
                _res, list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))):
            res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history])
        out_file = f'examples/representation/out/src/{DATE}/{res_file}.dat'
    else:  # without parallel
        for dataset in DATASETS:
            for feature in FEATURES:
                for header in HEADER:
                    for model in MODELS:
                        for tuning in TUNING:
                            try:
                                print(f'*** {dataset}-{feature}-header_{header}, '
                                      f'{model}-tuning_{tuning}')
                                args = parser()
                                args.dataset = dataset
                                args.feature = feature
                                args.header = header
                                args.model = model
                                args.tuning = tuning
                                history = _representation.main_no_tuning_vs_tuning(args)
                                res_ = [dataset, feature, f'header_{header}', model,
                                        f'tuning_{tuning}', history]
                                res.append(res_)
                                # avoid losing any result, so save it immediately
                                out_file = f'{args.out_dir}/{args.direction}/~{res_file}.dat'
                                dump(res, out_file)
                                save2txt(res, os.path.splitext(out_file)[0] + '.csv',
                                         delimiter=',')
                            except Exception as e:
                                lg.error(e)
        out_file = f'{args.out_dir}/{args.direction}/{DATE}/{res_file}.dat'

    check_path(out_file)
    dump(res, out_file)
    save2txt(res, os.path.splitext(out_file)[0] + '.csv', delimiter=',')
    lg.info(f'final result: {out_file}')

def get_iot2021_flows(in_dir='../Datasets/UCHI/IOT_2021/data-clean/refrigerator',
                      dataset_name='', out_dir='', direction='src'):
    """ Hard-coded in_dir and pcap paths.

    Note:
        1) the refrigerator IP changes over time (dynamic IP), so here we filter by MAC address.
        2) please don't merge all pcaps first and then obtain flows.

    Parameters
    ----------
    in_dir
    direction

    Returns
    -------
    """
    ip2device = {'192.168.143.152': 'refrigerator', }
    device2ip = {'refrigerator': '192.168.143.43',
                 'nestcam': '192.168.143.104',
                 'alexa': '192.168.143.74'}
    device2mac = {'refrigerator': '70:2c:1f:39:25:6e',
                  'nestcam': '18:b4:30:8a:9f:b2',
                  'alexa': '4c:ef:c0:0b:91:b3'}

    # get normal flows
    normal_pcaps = list(glob.iglob(in_dir + '/no_interaction/**/*.' + 'pcap', recursive=True))
    normal_pcaps.append(in_dir + '/idle_1629935923.pcap')
    normal_pcaps.append(in_dir + '/idle_1630275254.pcap')
    normal_pcaps = sorted(normal_pcaps)
    normal_flows = []
    for f in normal_pcaps:
        filter_f = f'{out_dir}/~tmp.pcap'
        check_path(filter_f)
        keep_mac_address(f, kept_ips=[device2mac['refrigerator']], out_file=filter_f,
                         direction=direction)
        flows = _pcap2flows(filter_f, verbose=10)  # normal flows
        normal_flows.extend(flows)
    lg.debug(f'total normal pcaps: {len(normal_pcaps)} and total flows: {len(normal_flows)}')

    # get abnormal flows
    abnormal_pcaps = list(glob.iglob(in_dir + '/open_close_fridge/**/*.' + 'pcap', recursive=True)) + \
                     list(glob.iglob(in_dir + '/put_back_item/**/*.' + 'pcap', recursive=True)) + \
                     list(glob.iglob(in_dir + '/screen_interaction/**/*.' + 'pcap', recursive=True)) + \
                     list(glob.iglob(in_dir + '/take_out_item/**/*.' + 'pcap', recursive=True))
    abnormal_pcaps = sorted(abnormal_pcaps)
    abnormal_flows = []
    for f in abnormal_pcaps:
        filter_f = f'{out_dir}/~tmp.pcap'
        check_path(filter_f)
        keep_mac_address(f, kept_ips=[device2mac['refrigerator']], out_file=filter_f,
                         direction=direction)
        flows = _pcap2flows(filter_f, verbose=10)  # abnormal flows
        abnormal_flows.extend(flows)
    lg.debug(f'total abnormal pcaps: {len(abnormal_pcaps)} and total flows: {len(abnormal_flows)}')

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': normal_pcaps,
        'abnormal_pcaps': abnormal_pcaps,
        'device2mac': device2mac,
        'filter_mac': device2mac['refrigerator'],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta

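# A minimal sketch of MAC-based pcap filtering with scapy, since the refrigerator IP is dynamic;
# the project's keep_mac_address helper is assumed to behave similarly for direction='src'.
# scapy is an extra dependency here, and the file names and MAC below are illustrative only.
def _filter_by_src_mac_example(in_pcap, out_pcap, mac='70:2c:1f:39:25:6e'):
    from scapy.all import rdpcap, wrpcap
    from scapy.layers.l2 import Ether

    pkts = rdpcap(in_pcap)
    # keep only packets whose Ethernet source address matches the device MAC
    kept = [p for p in pkts if Ether in p and p[Ether].src.lower() == mac.lower()]
    wrpcap(out_pcap, kept)
    return out_pcap
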
def _generate_pcap(self):
    regenerate = False
    # step 1: obtain pcap and label
    if self.dataset_name == 'UCHI(SFRIG_2021)':
        # the IP for the new data changes over time, so use the MAC address instead
        self.IP = 'mac_70:2c:1f:39:25:6e'
        self.orig_flows = os.path.join(
            self.out_dir, f'iot2021-orig_sfrig_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            # hard coding (is not a good idea)
            meta = get_iot2021_flows(in_dir='../Datasets/UCHI/IOT_2021/data-clean/refrigerator',
                                     dataset_name=self.dataset_name,
                                     out_dir=self.out_dir,
                                     direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
        else:
            pass
    elif self.dataset_name == 'UCHI/IOT_2019/ghome_192.168.143.20' or self.dataset_name == 'UCHI(GHOME_2019)':
        self.IP = '192.168.143.20'
        self.orig_flows = os.path.join(
            self.out_dir, f'ghome2019-orig_sfrig_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_ghome2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                                       dataset_name='ghome_192.168.143.20',
                                       out_dir=self.out_dir,
                                       direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
        else:
            pass
    elif self.dataset_name == 'UCHI/IOT_2019/scam_192.168.143.42' or self.dataset_name == 'UCHI(SCAM_2019)':
        self.IP = '192.168.143.42'
        self.orig_flows = os.path.join(
            self.out_dir, f'scam2019-orig_scam_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_scam2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                                      dataset_name='scam_192.168.143.42',
                                      out_dir=self.out_dir,
                                      direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
        else:
            pass
    elif self.dataset_name == 'UCHI/IOT_2019/bstch_192.168.143.48' or self.dataset_name == 'UCHI(BSTCH_2019)':
        self.IP = '192.168.143.48'
        self.orig_flows = os.path.join(
            self.out_dir, f'bstch2019-orig_bstch_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_bstch2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                                       dataset_name='bstch_192.168.143.48',
                                       out_dir=self.out_dir,
                                       direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
        else:
            pass
    elif self.dataset_name == 'UCHI/IOT_2019/smtv_10.42.0.1' or self.dataset_name == 'UCHI(SMTV_2019)':
        self.IP = '10.42.0.1'
        self.orig_flows = os.path.join(
            self.out_dir, f'smtv2019-orig_smtv_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_smtv2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                                      dataset_name='smtv_10.42.0.1',
                                      out_dir=self.out_dir,
                                      direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
        else:
            pass
    else:
        raise ValueError('dataset does not exist.')