def get_scam2019_flows(in_dir='../Datasets/UCHI/IOT_2019', out_dir='',
                       dataset_name='scam_192.168.143.42', direction='src'):
    IP = '192.168.143.42'

    # normal pcap: filter the device's packets, then extract and augment flows
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    file_name = 'fridge_cam_sound_ghome_2daysactiv-scam_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    normal_flows = _pcap2flows(normal_pcap, verbose=10)  # ~1000 normal flows; augmenting generates > 1000 subflows
    max_interval = np.quantile(
        [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows, step=10, max_interval=max_interval)
    lg.debug(f'normal_flows: {len(normal_flows)}')

    # abnormal pcap: same pipeline, but augment with a smaller step
    abnormal_pcap = os.path.join(out_dir, f'pc_{IP}_abnormal.pcap')
    check_path(abnormal_pcap)
    # file_name = 'samsung_camera-2daysactiv-src_192.168.143.42-anomaly.pca'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-scam_abnormal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, abnormal_pcap, ips=[IP], direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)
    abnormal_flows = augment_flows(abnormal_flows, step=1, max_interval=max_interval)
    lg.debug(f'after augmenting abnormal_flows: {len(abnormal_flows)}')

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
def get_smtv2019_flows(in_dir='../Datasets/UCHI/IOT_2019', out_dir='',
                       dataset_name='smtv_10.42.0.1', direction='src'):
    IP = '10.42.0.1'

    # normal pcap
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    file_name = 'pc_10.42.0.1_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    normal_flows = _pcap2flows(normal_pcap, verbose=10)  # normal flows
    max_interval = np.quantile(
        [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows, step=10, max_interval=max_interval)

    # abnormal pcap (the anomalous traffic belongs to a different IP: 10.42.0.119)
    abnormal_pcap = os.path.join(out_dir, 'pc_10.42.0.119_abnormal.pcap')
    check_path(abnormal_pcap)
    file_name = 'pc_10.42.0.119_anomaly.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, abnormal_pcap, ips=['10.42.0.119'], direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)  # abnormal flows
    abnormal_flows = augment_flows(abnormal_flows, step=10, max_interval=max_interval)

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
def get_bstch2019_flows(in_dir='../Datasets/UCHI/IOT_2019', out_dir='',
                        dataset_name='scam_192.168.143.48', direction='src'):
    IP = '192.168.143.48'

    # normal pcap
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    # file_name = 'bose_soundtouch-2daysactiv-src_192.168.143.48-normal.pcap'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-bstch_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    normal_flows = _pcap2flows(normal_pcap, verbose=10)  # normal flows
    max_interval = np.quantile(
        [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows, step=10, max_interval=max_interval)

    # abnormal pcap
    abnormal_pcap = os.path.join(out_dir, f'pc_{IP}_abnormal.pcap')
    check_path(abnormal_pcap)
    # file_name = 'bose_soundtouch-2daysactiv-src_192.168.143.48-anomaly.pcap'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-bstch_abnormal.pcap'
    pcap_file = get_file_path(in_dir=in_dir, dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, abnormal_pcap, ips=[IP], direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)  # abnormal flows
    # abnormal_flows = augment_flows(abnormal_flows, starts=50, max_len=max_len)
    abnormal_flows = augment_flows(abnormal_flows, step=10, max_interval=max_interval)

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
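# A minimal usage sketch (an assumption, not part of the original module): the
# three get_*2019_flows helpers above share one pattern -- filter the device's
# packets, extract flows, augment them -- and all return the same meta dict.
# The output directory and the .dat filenames below are hypothetical.
def _example_extract_2019_flows(out_dir='../out'):
    for get_flows in [get_scam2019_flows, get_smtv2019_flows, get_bstch2019_flows]:
        meta = get_flows(out_dir=out_dir)
        lg.debug(f"{get_flows.__name__}: {len(meta['normal_flows'])} normal, "
                 f"{len(meta['abnormal_flows'])} abnormal flows")
        dump(meta, out_file=os.path.join(out_dir, f'{get_flows.__name__}.dat'))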
def generate(self):
    if os.path.exists(self.Xy_file):
        self.X, self.y = load(self.Xy_file)
    else:
        q_interval = 0.9
        # pcap to flows
        flows = self.pcap2flows(self.pcap_file)
        # flows to subflows
        labels = [1] * len(flows)
        durations = [_get_flow_duration(pkts) for fid, pkts in flows]
        interval = _get_split_interval(durations, q_interval=q_interval)
        subflows, labels = self.flow2subflows(flows, interval=interval, labels=labels)
        # get dimension
        normal_flows = subflows
        num_pkts = [len(pkts) for fid, pkts in normal_flows]  # only on normal flows
        dim = int(np.floor(np.quantile(num_pkts, q_interval)))  # use the same q_interval to get the dimension
        lg.info(f'dim={dim}')
        # flows to features
        features, fids = self.flow2features(subflows, name=self.feature_name)
        # fix the feature size
        features = self.fix_feature(features, dim=dim)
        self.X = features
        self.y = np.asarray([0] * len(features))
        # save data to disk
        check_path(os.path.dirname(self.Xy_file))
        dump((self.X, self.y), out_file=self.Xy_file)
    return self.X, self.y
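# For illustration only: a standalone sketch of the "fix the feature size"
# step used by generate() above. The class's actual fix_feature method is not
# shown in this file, so this is an assumed implementation of the usual
# approach: truncate long feature vectors and zero-pad short ones to `dim`.
def _fix_feature_sketch(features, dim):
    """Truncate or zero-pad each feature vector to exactly `dim` values."""
    fixed = []
    for v in features:
        v = list(v)[:dim]           # truncate vectors longer than dim
        v += [0] * (dim - len(v))   # zero-pad vectors shorter than dim
        fixed.append(v)
    return np.asarray(fixed, dtype=float)

# With q_interval = 0.9, dim is the 90th-percentile packet count, e.g.
# int(np.floor(np.quantile([3, 5, 8, 10, 40], q=0.9))) == 28.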
def _get_SAMP(flows, name='SAMP_NUM', dim=None, header=False, header_dim=None):
    qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
    flow_durations = [_get_flow_duration(pkts) for fid, pkts in flows]

    features_mp = {}
    fids_mp = {}
    for q in qs:
        # the sampling rate is the q-th quantile of the flow durations
        sampling_rate_ = np.quantile(flow_durations, q=q)
        if name in ['SAMP_NUM', 'FFT_SAMP_NUM']:
            features, fids = _get_SAMP_NUM(flows, sampling_rate_)
        elif name in ['SAMP_SIZE', 'FFT_SAMP_SIZE']:
            features, fids = _get_SAMP_SIZE(flows, sampling_rate_)
        else:
            raise ValueError(f'unknown feature name: {name}')

        new_dim = dim
        if 'FFT' in name:
            features = _get_FFT_data(features, fft_bin=new_dim, fft_part='real')
            if header:
                header_features, header_fids = _get_header(flows)
                header_features = _get_FFT_data(
                    header_features, fft_bin=header_dim, fft_part='real')  # 8 is the number of TCP flags
                features = np.concatenate(
                    [features, header_features], axis=1)  # concatenate features and header
        else:
            features = _fix_data(features, new_dim)
            if header:
                header_features, header_fids = _get_header(flows)
                header_features = _fix_data(header_features, header_dim)
                features = np.concatenate(
                    [features, header_features], axis=1)  # concatenate features and header

        features_mp[q] = (features, fids, sampling_rate_)
        fids_mp[q] = fids
    return features_mp, fids_mp
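# A simplified sketch (an assumption, not the actual _get_SAMP_NUM) of the
# sampling idea behind SAMP_NUM: split each flow into consecutive windows of
# length `sampling_rate` seconds and count the packets arriving in each
# window. Packets are assumed to be scapy packets with a `.time` attribute,
# matching how flows are handled elsewhere in this file.
def _samp_num_sketch(flows, sampling_rate):
    features, fids = [], []
    for fid, pkts in flows:
        times = [float(pkt.time) for pkt in pkts]
        start = min(times)
        n_windows = int((max(times) - start) // sampling_rate) + 1
        counts = [0] * n_windows
        for t in times:
            counts[int((t - start) // sampling_rate)] += 1  # window index of this packet
        features.append(counts)
        fids.append(fid)
    return features, fids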
def _generate_flows(self):
    self.subflows_file = os.path.join(self.out_dir, 'normal_abnormal_subflows.dat')
    remove_file(self.subflows_file, self.overwrite)
    if os.path.exists(self.subflows_file):
        return load(self.subflows_file)

    # step 2: extract flows from pcap
    ##############################################################################################
    meta = load(self.orig_flows)
    normal_flows, abnormal_flows = meta['normal_flows'], meta['abnormal_flows']
    lg.debug(f'original normal flows: {len(normal_flows)} and abnormal flows: {len(abnormal_flows)}')
    qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
    len_stat = np.quantile([len(pkts) for f, pkts in normal_flows], q=qs)
    lg.debug(f'flows: {len(normal_flows)}, length statistic: {len_stat}, when q = {qs}')
    meta = {
        'flows': normal_flows,
        'len_stat': (len_stat, qs),
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows
    }
    dump(meta, out_file=os.path.join(self.out_dir, 'normal_abnormal_flows.dat'))

    # step 2.2: get the durations of normal flows only
    self.flows_durations = [_get_flow_duration(pkts) for (fids, pkts) in normal_flows]
    normal_durations_stat = np.quantile(self.flows_durations, q=qs)
    lg.debug(f'normal_durations_stat: {normal_durations_stat}')
    self.subflow_interval = np.quantile(
        self.flows_durations, q=self.q_flow_dur)  # e.g., the median of flow durations when q_flow_dur = 0.5
    lg.debug(f'---subflow_interval: {self.subflow_interval}, q_flow_dur: {self.q_flow_dur}')

    # step 2.3: get subflows
    normal_flows, _ = _flows2subflows(normal_flows, interval=self.subflow_interval,
                                      labels=['0'] * len(normal_flows))
    abnormal_flows, _ = _flows2subflows(abnormal_flows, interval=self.subflow_interval,
                                        labels=['1'] * len(abnormal_flows))
    lg.debug(f'normal_flows: {len(normal_flows)}, and abnormal_flows: {len(abnormal_flows)} '
             f'with interval: {self.subflow_interval} and q: {self.q_flow_dur}')
    meta = {
        'normal_flows_durations': self.flows_durations,
        'normal_durations_stat': (normal_durations_stat, qs),
        'subflow_interval': self.subflow_interval,
        'q_flow_dur': self.q_flow_dur,
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows
    }
    dump(meta, out_file=self.subflows_file)

    # only return subflows
    return meta
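# A simplified sketch (an assumption, not the actual _flows2subflows) of the
# splitting step above: start a new subflow whenever the gap between two
# consecutive packets exceeds `interval`. The real implementation may differ,
# e.g., it may also split on elapsed time since the subflow started.
def _flow2subflows_sketch(fid, pkts, interval):
    subflows = []
    current = [pkts[0]]
    for prev, cur in zip(pkts, pkts[1:]):
        if float(cur.time) - float(prev.time) > interval:
            subflows.append((fid, current))  # close the current subflow
            current = []
        current.append(cur)
    subflows.append((fid, current))
    return subflows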
def get_unb_flows(self, in_dir='../Datasets', direction='src'):
    # preprocess the original pcap and its labels
    self.pcap_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.pcap')
    self.label_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.csv')
    remove_file(self.pcap_file, self.overwrite)
    remove_file(self.label_file, self.overwrite)
    check_path(self.pcap_file)
    check_path(self.label_file)

    if not os.path.exists(self.pcap_file) or not os.path.exists(self.label_file):
        # 1. filter the original pcap
        friday_pcap_orig = get_file_path(
            ipt_dir=in_dir,
            dataset_name='UNB/CICIDS_2017/',
            data_cat='pcaps/Friday',
            file_name='Friday-WorkingHours.pcap')
        filter_ip(friday_pcap_orig, out_file=self.pcap_file, ips=[self.IP],
                  direction=self.direction, keep_original=True)

        # 2. merge the original labels
        friday_label = get_file_path(
            ipt_dir=self.out_dir,
            dataset_name='UNB/CICIDS_2017/',
            data_cat='labels/Friday',
            file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
        friday_label_orig1 = get_file_path(
            ipt_dir=in_dir,
            dataset_name='UNB/CICIDS_2017/',
            data_cat='labels/Friday',
            file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
        friday_label_orig2 = get_file_path(
            ipt_dir=in_dir,
            dataset_name='UNB/CICIDS_2017/',
            data_cat='labels/Friday',
            file_name='Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
        friday_label_orig3 = get_file_path(
            ipt_dir=in_dir,
            dataset_name='UNB/CICIDS_2017/',
            data_cat='labels/Friday',
            file_name='Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
        friday_label_tmp = friday_label + '-all.csv'
        check_path(friday_label_tmp)
        merge_labels(
            [friday_label_orig1, friday_label_orig2, friday_label_orig3],
            mrg_label_path=friday_label_tmp)
        filter_csv_ip(friday_label_tmp, out_file=self.label_file, ips=[self.IP],
                      direction=self.direction)

    ##############################################################################################
    # step 2.1: extract flows
    flows = _pcap2flows(self.pcap_file, verbose=10)  # normal and abnormal flows

    # step 2.2: split normal and abnormal flows according to the labels
    labels = pd.read_csv(self.label_file).values
    normal_flows, abnormal_flows = split_normal_abnormal(flows, labels)

    # augment abnormal flows
    max_interval = np.quantile(
        [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    abnormal_flows = augment_flows(abnormal_flows, step=1, max_interval=max_interval)

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcap': self.pcap_file,
        'abnormal_pcap': self.label_file,  # the filtered label csv
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
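# Usage sketch (hypothetical: the enclosing class name and its constructor
# arguments are assumptions, as they are not shown in this file):
#
#   unb = UNB(out_dir='../out', IP='192.168.10.5', direction='src', overwrite=False)
#   meta = unb.get_unb_flows(in_dir='../Datasets', direction='src')
#   dump(meta, out_file=os.path.join(unb.out_dir, 'orig_flows.dat'))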