Example #1
def get_correlation(in_dir='',
                    datasets='',
                    feature='SIZE',
                    header=True,
                    out_dir='',
                    out_file='.dat'):
    corr_results = {}
    for i, dataset in enumerate(datasets):
        in_file = os.path.join(in_dir, dataset, feature, f"header_{header}",
                               'Xy.dat')
        lg.debug(in_file)
        data = load(in_file)
        X_train, y_train, X_val, y_val, X_test, y_test = split_train_val_test(
            data['X'], data['y'], shuffle=True, random_state=RANDOM_STATE)
        # normalization
        ss, X_train, y_train, X_val, y_val, X_test, y_test = normalize(
            X_train, y_train, X_val, y_val, X_test, y_test)
        # 2 get correlation
        dim = X_test.shape[1]
        if feature == 'IAT':
            # iat_dim + header_dim = dim, where header_dim = 8 + ttl_dim and
            # ttl_dim = size_dim = iat_dim + 1
            # => iat_dim + 8 + (iat_dim + 1) = dim  =>  iat_dim = (dim - 9) // 2
            start_idx = (dim - 8 - 1) // 2
        elif feature == 'SIZE':
            # size_dim + header_dim = dim, where header_dim = 8 + size_dim
            # => size_dim = (dim - 8) // 2
            start_idx = (dim - 8) // 2  # header features: 8 TCP flags + TTL
        else:
            msg = f'Error: {feature}'
            raise NotImplementedError(msg)
        corrs = []
        lg.debug(f'header_feature_start_idx: {start_idx}')
        # header features: 8 TCP flags + the first TTL
        for j in range(9):
            _corr = _get_each_correlation(X_test[:, start_idx + j], y_test)
            corrs.append(_corr)
        corr_results[(in_file, dataset, feature, X_test.shape)] = corrs

        _out_file = os.path.join(out_dir, dataset, 'correlation.dat')
        check_path(_out_file)
        dump(corrs, _out_file)
        print(_out_file)
    # save all results
    check_path(out_file)
    dump(corr_results, out_file)

    return out_file
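The helper `_get_each_correlation` is called above but not shown. A minimal sketch, assuming a plain Pearson correlation between one feature column and the labels (the original may use a different estimator), could be:

import numpy as np

def _get_each_correlation(x, y):
    # Hypothetical helper (not the original implementation): Pearson correlation
    # between one header-feature column x and the label vector y.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.std() == 0 or y.std() == 0:
        return 0.0  # correlation is undefined for a constant column
    return float(np.corrcoef(x, y)[0, 1])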
Example #2
    def _generate_pcap(self):

        # step 1: obtain pcap and label
        if self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.5' or self.dataset_name == 'UNB(PC1)':
            self.IP = '192.168.10.5'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc1)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.8' or self.dataset_name == 'UNB(PC2)':
            self.IP = '192.168.10.8'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc2)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.9' or self.dataset_name == 'UNB(PC3)':
            self.IP = '192.168.10.9'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc3)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.14' or self.dataset_name == 'UNB(PC4)':
            self.IP = '192.168.10.14'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc4)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.15' or self.dataset_name == 'UNB(PC5)':
            self.IP = '192.168.10.15'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc5)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'DEMO_IDS/DS-srcIP_192.168.10.5':
            self.IP = '192.168.10.5'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_demo_{self.direction}_flows-{self.IP}.dat')
        else:
            raise ValueError('dataset does not exist.')

        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_unb_flows(in_dir='../Datasets',
                                      direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug(f"in_dir (pcaps): {meta['in_dir']}, direction: {meta['direction']}")
            lg.debug(f"normal_pcap: {len(meta['normal_pcap'])}, "
                     f"normal_flows: {len(meta['normal_flows'])}")
            lg.debug(f"abnormal_pcap: {len(meta['abnormal_pcap'])}, "
                     f"abnormal_flows: {len(meta['abnormal_flows'])}")
        else:
            pass
Example #3
def gather(in_dir='src', out_dir=''):
	""" collect all individual results together

	Parameters
	----------
	in_dir:
		search results from the given directory
	out_dir:
		save the gathered results to the given directory
	Returns
	-------
	out_file:
		the short csv for a quick overview
	"""
	res = []
	for dataset, feature, header, model, tuning in list(itertools.product(DATASETS,
	                                                                      FEATURES, HEADER, MODELS, TUNING)):
		f = os.path.join(in_dir, dataset, feature, f'header_{header}', model, f'tuning_{tuning}', 'res.csv')
		try:
			line = [str(v) for v in pd.read_csv(f, sep=',', header=None).values.flatten().tolist()][1:]
		lg.debug(f'{f}: {line}')
			if len(str(line[0])) == 0:
				lg.error(f'Error: {line}. [{header}, {tuning}, {feature}, {dataset}, {model}]')
		except Exception as e:
			lg.error(f'Error: {e}. [{header}, {tuning}, {feature}, {dataset}, {model}]')
			line = ['', '0_0|0_0|0_0', '']  # [score, shape, params]
		res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}'] + line)

	# Save all results to gather.csv
	out_file = os.path.join(out_dir, 'gather.csv')
	check_path(out_file)
	with open(out_file, 'w') as f:
		for vs in res:
			f.write(','.join(vs) + '\n')

	# Only save needed data for quick overview
	short_file = os.path.join(os.path.split(out_file)[0], 'short.csv')
	with open(short_file, 'w') as f:
		for vs in res:
			if vs[5] == '' or vs[7] == '':
				lg.warning(f'Warning: {vs}.')
			tmp = vs[6].split('|')
			shape = '|'.join(v.split('_')[0] for v in tmp)
			dim = tmp[0].split('_')[1]
			f.write(','.join(vs[:6] + [shape, dim]) + '\n')

	return out_file
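The column indices used for short.csv (vs[5], vs[6], vs[7]) assume that each res.csv row stores [score, shape, params] after the five metadata fields, as in the error fallback above. An illustrative row (values are made up) and the derived short.csv columns:

# Illustrative only: the 'rows_dim|rows_dim|rows_dim' shape string is an assumption
# based on the '0_0|0_0|0_0' fallback used above.
vs = ['UNB(PC1)', 'SIZE', 'header_False', 'OCSVM', 'tuning_True',
      '0.85', '8000_38|2000_38|2000_38', "{'q': 0.3}"]
tmp = vs[6].split('|')                          # ['8000_38', '2000_38', '2000_38']
shape = '|'.join(v.split('_')[0] for v in tmp)  # '8000|2000|2000' (set sizes)
dim = tmp[0].split('_')[1]                      # '38' (feature dimension)
print(','.join(vs[:6] + [shape, dim]))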
Example #4
def get_scam2019_flows(in_dir=f'../Datasets/UCHI/IOT_2019',
                       out_dir='',
                       dataset_name='scam_192.168.143.42',
                       direction='src'):
    IP = '192.168.143.42'
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    file_name = 'fridge_cam_sound_ghome_2daysactiv-scam_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir,
                              dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    normal_flows = _pcap2flows(
        normal_pcap,
        verbose=10)  # ~1000 normal flows, it will generate > 1000 subflows
    max_interval = np.quantile(
        [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows,
                                 step=10,
                                 max_interval=max_interval)
    lg.debug(f'normal_flows: {len(normal_flows)}')

    abnormal_pcap = os.path.join(out_dir, f'pc_{IP}_abnormal.pcap')
    check_path(abnormal_pcap)
    # file_name = 'samsung_camera-2daysactiv-src_192.168.143.42-anomaly.pca'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-scam_abnormal.pcap'
    pcap_file = get_file_path(ipt_dir=in_dir,
                              dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, abnormal_pcap, ips=[IP], direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)
    abnormal_flows = augment_flows(abnormal_flows,
                                   step=1,
                                   max_interval=max_interval)
    lg.debug(f'after augmenting abnormal_flows: {len(abnormal_flows)}')
    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
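For reference, Example #24 below calls this helper roughly as follows; the out_dir value here is a placeholder, not a path from the original code.

meta = get_scam2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                          out_dir='out/UCHI/IOT_2019/scam',  # placeholder
                          dataset_name='scam_192.168.143.42',
                          direction='src')
print(len(meta['normal_flows']), len(meta['abnormal_flows']))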
Example #5
def report(in_file='gather.dat', delimiter=','):
    res = load(in_file)
    out_file = os.path.join(os.path.dirname(in_file), 'report.csv')
    check_path(out_file)
    with open(out_file, 'w') as f:
        for header in HEADER:
            for tuning in TUNING:
                for feature in FEATURES:
                    for dataset in DATASETS:
                        for model in MODELS:
                            data = get_one_res(res, f'header_{header}',
                                               f'tuning_{tuning}', feature,
                                               dataset, model)
                            line = f'{delimiter}'.join(data) + '\n'
                            lg.debug(line)
                            f.write(line)

    lg.info(f'report: {out_file}')
    return out_file
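The module-level constants HEADER, TUNING, FEATURES, DATASETS, and MODELS are referenced throughout these examples but never defined in them. The values below are only a guess pieced together from Examples #15 and #21 and may not match the original code:

HEADER = [False, True]
TUNING = [False, True]
MODELS = ['OCSVM', 'IF', 'AE', 'KDE', 'GMM', 'PCA']
DATASETS = ['UNB(PC1)', 'UNB(PC4)', 'CTU', 'MAWI', 'SFrig', 'BSTch']  # subset, inferred
FEATURES = ['IAT', 'SIZE']  # the original also uses SAMP- and FFT-based variants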
Example #6
    def get_ctu_flows(self, in_dir='../Datasets', direction='src'):
        """
		https://www.stratosphereips.org/datasets-iot
		Malware on IoT Dataset
		"""
        self.normal_pcap = os.path.join(self.out_dir, f'pc_192.168.1.196.pcap')
        check_path(self.normal_pcap)
        # filter pcap
        # file_name = '2019-01-09-22-46-52-src_192.168.1.196_CTU_IoT_CoinMiner_anomaly.pcap'
        file_name = 'CTU-IoT-Malware-Capture-41-1_2019-01-09-22-46-52-192.168.1.196.pcap'
        pcap_file = get_file_path(in_dir=in_dir,
                                  dataset_name='CTU/IOT_2017',
                                  file_name=file_name)
        filter_ip(pcap_file,
                  self.normal_pcap,
                  ips=['192.168.1.196'],
                  direction=direction)
        normal_flows = _pcap2flows(self.normal_pcap,
                                   verbose=10)  # normal  flows

        self.abnormal_pcap = os.path.join(self.out_dir,
                                          f'pc_192.168.1.195_abnormal.pcap')
        check_path(self.abnormal_pcap)
        # file_name = '2018-12-21-15-50-14-src_192.168.1.195-CTU_IoT_Mirai_normal.pcap'
        file_name = 'CTU-IoT-Malware-Capture-34-1_2018-12-21-15-50-14-192.168.1.195.pcap'
        pcap_file = get_file_path(ipt_dir=in_dir,
                                  dataset_name='CTU/IOT_2017',
                                  file_name=file_name)
        filter_ip(pcap_file,
                  self.abnormal_pcap,
                  ips=['192.168.1.195'],
                  direction=direction)
        abnormal_flows = _pcap2flows(self.abnormal_pcap,
                                     verbose=10)  # abnormal flows
        meta = {
            'normal_flows': normal_flows,
            'abnormal_flows': abnormal_flows,
            'normal_pcap': self.normal_pcap,
            'abnormal_pcap': self.abnormal_pcap,
            'direction': direction,
            'in_dir': in_dir
        }
        return meta
Example #7
def get_smtv2019_flows(in_dir=f'../Datasets/UCHI/IOT_2019',
                       out_dir='',
                       dataset_name='smtv_10.42.0.1',
                       direction='src'):
    IP = '10.42.0.1'
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    file_name = 'pc_10.42.0.1_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir,
                              dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    normal_flows = _pcap2flows(normal_pcap, verbose=10)  # normal  flows
    max_interval = np.quantile(
        [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows,
                                 step=10,
                                 max_interval=max_interval)

    abnormal_pcap = os.path.join(out_dir, f'pc_10.42.0.119_abnormal.pcap')
    check_path(abnormal_pcap)
    file_name = 'pc_10.42.0.119_anomaly.pcap'
    pcap_file = get_file_path(in_dir=in_dir,
                              dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file,
              abnormal_pcap,
              ips=['10.42.0.119'],
              direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)  # abnormal flows
    abnormal_flows = augment_flows(abnormal_flows,
                                   step=10,
                                   max_interval=max_interval)
    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
Example #8
def get_bstch2019_flows(in_dir=f'../Datasets/UCHI/IOT_2019',
                        out_dir='',
                        dataset_name='scam_192.168.143.48',
                        direction='src'):
    IP = '192.168.143.48'
    normal_pcap = os.path.join(out_dir, f'pc_{IP}.pcap')
    check_path(normal_pcap)
    # file_name = 'bose_soundtouch-2daysactiv-src_192.168.143.48-normal.pcap'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-bstch_normal.pcap'
    pcap_file = get_file_path(in_dir=in_dir,
                              dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, normal_pcap, ips=[IP], direction=direction)
    normal_flows = _pcap2flows(normal_pcap, verbose=10)  # normal  flows
    max_interval = np.quantile(
        [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    normal_flows = augment_flows(normal_flows,
                                 step=10,
                                 max_interval=max_interval)

    abnormal_pcap = os.path.join(out_dir, f'pc_{IP}_abnormal.pcap')
    check_path(abnormal_pcap)
    # file_name = 'bose_soundtouch-2daysactiv-src_192.168.143.48-anomaly.pcap'
    file_name = 'fridge_cam_sound_ghome_2daysactiv-bstch_abnormal.pcap'
    pcap_file = get_file_path(ipt_dir=in_dir,
                              dataset_name=dataset_name,
                              file_name=file_name)
    filter_ip(pcap_file, abnormal_pcap, ips=[IP], direction=direction)
    abnormal_flows = _pcap2flows(abnormal_pcap, verbose=10)  # abnormal  flows
    # abnormal_flows = augment_flows(abnormal_flows, starts=50, max_len=max_len)
    abnormal_flows = augment_flows(abnormal_flows,
                                   step=10,
                                   max_interval=max_interval)
    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': [normal_pcap],
        'abnormal_pcaps': [abnormal_pcap],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
Example #9
	def _generate_pcap(self):
		# preprocessed the pcap and label on original pcap and label
		if self.dataset_name == 'MAWI/WIDE_2019/pc_202.171.168.50' or self.dataset_name == 'MAWI':
			# "http://mawi.wide.ad.jp/mawi/samplepoint-F/2019/201912071400.html"
			self.IP = '202.171.168.50'
			self.orig_flows = os.path.join(self.out_dir, f'mawi_{self.direction}_flows-{self.IP}.dat')
			remove_file(self.orig_flows, self.overwrite)
			if not os.path.exists(self.orig_flows):
				lg.warning(f'{self.orig_flows} does not exist.')
				check_path(self.orig_flows)
				meta = self.get_mawi_flows(in_dir='../Datasets', direction=self.direction)
				dump(meta, out_file=self.orig_flows)
				lg.debug(f"in_dir (pcaps): {meta['in_dir']}, direction: {meta['direction']}")
				lg.debug(f"normal_pcap: {len(meta['normal_pcap'])}, "
				         f"normal_flows: {len(meta['normal_flows'])}")
				lg.debug(f"abnormal_pcap: {len(meta['abnormal_pcap'])}, "
				         f"abnormal_flows: {len(meta['abnormal_flows'])}")

		else:
			raise ValueError('dataset does not exist.')
Example #10
    def generate(self):
        if os.path.exists(self.Xy_file):
            self.X, self.y = load(self.Xy_file)
        else:
            q_interval = 0.9
            # pcap to flows
            flows = self.pcap2flows(self.pcap_file)

            # flows to subflow
            labels = [1] * len(flows)
            durations = [_get_flow_duration(pkts) for fid, pkts in flows]
            interval = _get_split_interval(durations, q_interval=q_interval)
            subflows, labels = self.flow2subflows(flows,
                                                  interval=interval,
                                                  labels=labels)

            # get dimension
            normal_flows = subflows
            num_pkts = [len(pkts)
                        for fid, pkts in normal_flows]  # only on normal flows
            dim = int(np.floor(np.quantile(
                num_pkts,
                q_interval)))  # use the same q_interval to get the dimension
            lg.info(f'dim={dim}')

            # flows to features
            features, fids = self.flow2features(subflows,
                                                name=self.feature_name)

            # fix the feature size
            features = self.fix_feature(features, dim=dim)

            self.X = features
            self.y = np.asarray([0] * len(features))

            # save data to disk
            check_path(os.path.dirname(self.Xy_file))
            dump((self.X, self.y), out_file=self.Xy_file)

        return self.X, self.y
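`fix_feature` is used above but not shown. A minimal sketch of its likely behavior, assuming it simply truncates or zero-pads each feature vector to the target dimension (hypothetical, not the original implementation):

import numpy as np

def fix_feature(features, dim):
    # Hypothetical implementation: truncate or zero-pad each feature vector
    # so that every row has exactly `dim` values.
    fixed = []
    for v in features:
        v = list(v)[:dim]
        v += [0] * (dim - len(v))
        fixed.append(v)
    return np.asarray(fixed, dtype=float)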
Example #11
	def get_mawi_flows(self, in_dir='../Datasets', direction='src'):

		self.normal_pcap = os.path.join(self.out_dir, f'pc_202.171.168.50.pcap')
		check_path(self.normal_pcap)
		file_name = 'samplepoint-F_201912071400-src_dst_202.171.168.50.pcap'
		pcap_file = get_file_path(in_dir=in_dir, dataset_name='MAWI/WIDE_2019',
		                          file_name=file_name)
		filter_ip(pcap_file, self.normal_pcap, ips=['202.171.168.50'], direction=direction)
		normal_flows = _pcap2flows(self.normal_pcap, verbose=10)  # normal  flows

		self.abnormal_pcap = os.path.join(self.out_dir, f'pc_203.113.113.16_abnormal.pcap')
		check_path(self.abnormal_pcap)
		# file_name = 'samplepoint-F_201912071400-src_dst_202.4.27.109.pcap'    # ~5000
		file_name = 'samplepoint-F_201912071400-src_203.113.113.16.pcap'  # ~1500
		pcap_file = get_file_path(ipt_dir=in_dir, dataset_name='MAWI/WIDE_2019',
		                          file_name=file_name)
		filter_ip(pcap_file, self.abnormal_pcap, ips=['203.113.113.16'], direction=direction)
		abnormal_flows = _pcap2flows(self.abnormal_pcap, verbose=10)  # abnormal flows
		meta = {'normal_flows': normal_flows, 'abnormal_flows': abnormal_flows,
		        'normal_pcap': self.normal_pcap, 'abnormal_pcap': self.abnormal_pcap,
		        'direction': direction, 'in_dir': in_dir}
		return meta
Example #12
def extract_subpcap(pcap_file, out_file, start_time, end_time, verbose=20, keep_original=True):
	""" extract a part of pcap using editcap
	' editcap -A "2017-07-04 09:02:00" -B "2017-07-04 09:05:00" input.pcap output.pcap'

	Parameters
	----------
	pcap_file:
	out_file
	start_time
	end_time
	verbose
	keep_original: bool
		keep the original pcap or not, True (default)

	Returns
	-------

	"""
	if out_file is None:
		out_file = pcap_file + f'-start={start_time}-end={end_time}.pcap'
		out_file = out_file.replace(' ', '_')

	if os.path.exists(out_file): return out_file

	check_path(out_file)
	cmd = f"editcap -A \"{start_time}\" -B \"{end_time}\" \"{pcap_file}\" \"{out_file}\""
	# print(cmd)
	if verbose > 10:
		print(f'{cmd}')
	result = ''
	try:
		result = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True).stdout.decode('utf-8')
		if not keep_original:
			os.remove(pcap_file)
	except Exception as e:
		print(f'{e}, {result}')

	return out_file
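A usage sketch matching the editcap time format quoted in the docstring; the paths are placeholders and editcap (shipped with Wireshark) must be installed:

sub_pcap = extract_subpcap('input.pcap',            # placeholder path
                           out_file='output.pcap',  # placeholder path
                           start_time='2017-07-04 09:02:00',
                           end_time='2017-07-04 09:05:00',
                           verbose=20,
                           keep_original=True)
print(sub_pcap)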
Example #13
    def _generate_pcap(self):
        # preprocessed the pcap and label on original pcap and label
        if self.dataset_name == 'CTU/IOT_2017/pc_192.168.1.196' or self.dataset_name == 'CTU':
            self.IP = '192.168.1.196'
            self.orig_flows = os.path.join(
                self.out_dir, f'ctu_{self.direction}_flows-{self.IP}.dat')
            remove_file(self.orig_flows, self.overwrite)
            if not os.path.exists(self.orig_flows):
                lg.warning(f'{self.orig_flows} does not exist.')
                check_path(self.orig_flows)
                meta = self.get_ctu_flows(in_dir='../Datasets',
                                          direction=self.direction)
                dump(meta, out_file=self.orig_flows)
                lg.debug(f"in_dir (pcaps): {meta['in_dir']}, "
                         f"direction: {meta['direction']}")
                lg.debug(f"normal_pcap: {len(meta['normal_pcap'])}, "
                         f"normal_flows: {len(meta['normal_flows'])}")
                lg.debug(f"abnormal_pcap: {len(meta['abnormal_pcap'])}, "
                         f"abnormal_flows: {len(meta['abnormal_flows'])}")

        else:
            raise ValueError('dataset does not exist.')
Example #14
def gather(in_dir='examples/representation/out/src', out_dir=''):
    res = []
    for dataset, feature, header, model, tuning in list(
            itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING)):
        f = os.path.join(in_dir, dataset, feature, f'header_{header}', model,
                         f'tuning_{tuning}', 'res.csv')
        try:
            line = [
                str(v) for v in pd.read_csv(
                    f, sep=',', header=None).values.flatten().tolist()
            ][1:]
            print(f, line)
        except Exception as e:
            print(f'Error: {e}')
            line = ['', '0_0|0_0|0_0', '']  # [score, shape, params]
        res.append(
            [dataset, feature, f'header_{header}', model, f'tuning_{tuning}'] +
            line)

    out_file = os.path.join(out_dir, 'gather.csv')
    check_path(out_file)

    with open(out_file, 'w') as f:
        for vs in res:
            f.write(','.join(vs) + '\n')

    short_file = os.path.join(os.path.split(out_file)[0], 'short.csv')
    # data = pd.read_csv(out_file, error_bad_lines=False, header=None)  # will miss some results.
    # data.iloc[:, 0:7].to_csv(short_file)
    with open(short_file, 'w') as f:
        for vs in res:
            tmp = vs[6].split('|')
            shape = '|'.join(v.split('_')[0] for v in tmp)
            dim = tmp[0].split('_')[1]
            f.write(','.join(vs[:6] + [shape, dim]) + '\n')

    return out_file
Example #15
def main():
    """Get results from xlsx and plot the results.

	Parameters
	----------
	root_dir

	Returns
	-------

	"""
    # raw_file = 'examples/representation/out/src/~res.csv'
    # in_file = 'examples/representation/report/res.csv'
    # check_path(in_file)
    # copyfile(raw_file, in_file)
    in_file = 'examples/representation/out/src/results/2021-09-28/short.csv'
    data = parse_csv(in_file)
    data = format_name(data, data_orig2name)  #
    out_dir = 'examples/representation/report/out'
    TUNING = [True, False]
    MODELS = ['OCSVM', 'IF', 'AE', 'KDE', 'GMM', 'PCA']

    ########################################################################################################
    ### Get the results on part of datasets and all algorithms
    # 1. datasets: [UNB(PC1), UNB(PC4), CTU, MAWI, TV&RT, SFrig, and BSTch]
    # algorithms: [OCSVM, IF, AE, KDE, GMM, PCA]
    DATASETS1 = [
        'UNB(PC1)', 'UNB(PC4)', 'CTU', 'MAWI', 'TV&RT', 'SFrig', 'BSTch'
    ]
    DATASETS2 = ['UNB(PC2)', 'UNB(PC3)', 'UNB(PC5)', 'SCam', 'GHom']
    DATASETS3 = [
        'UNB(PC1)', 'UNB(PC2)', 'UNB(PC3)', 'UNB(PC4)', 'UNB(PC5)', 'CTU',
        'MAWI', 'TV&RT', 'SFrig', 'BSTch', 'SCam', 'GHom'
    ]
    DATASETS_LST = [DATASETS1, DATASETS2, DATASETS3]
    FIGS = ['size_effect', 'header_effect', 'fft_effect']
    for tuning in TUNING:
        tuning_type = 'best' if tuning else 'default'
        for DATASETS in DATASETS_LST:
            # 1. size_effect
            fig_type = FIGS[0]
            out_file = f'{out_dir}/{fig_type}-{tuning_type}-{len(DATASETS)}.pdf'
            check_path(out_file)
            results = pkt_size_diff(data, tuning, DATASETS, MODELS)
            size_effect_plot(results, MODELS, DATASETS, fig_type, out_file)

            # 2. header_effect
            fig_type = FIGS[1]
            out_file = f'{out_dir}/{fig_type}-{tuning_type}-{len(DATASETS)}.pdf'
            check_path(out_file)
            results = pkt_header_diff(data, tuning, DATASETS, MODELS)
            header_effect_plot(results, MODELS, DATASETS, fig_type, out_file)

            # 3. fft_effect
            fig_type = FIGS[2]
            out_file = f'{out_dir}/{fig_type}-{tuning_type}-{len(DATASETS)}.pdf'
            check_path(out_file)
            results = fft_diff(data, tuning, DATASETS, MODELS)
            fft_effect_plot(results, MODELS, DATASETS, fig_type, out_file)

            lg.info('\n')
Example #16
def plot_correlation_multi(corr_results, out_file='', title=None, show=True):
    """ plot the data

	Parameters
	----------
	corr_results
	out_dir
	title
	show

	Returns
	-------

	"""
    # # only show the top 4 figures
    new_corr_results = {}
    for i, (dataset, name) in enumerate(data_orig2name.items()):
        for j, (key, corrs) in enumerate(corr_results.items()):
            _key_path, _dataset, _feat_set, X_test_shape = key
            if dataset in key:
                new_corr_results[(_key_path, _dataset, name, _feat_set,
                                  X_test_shape)] = corrs
    t = 0
    cols = 2
    fontsize = 20
    ## http://jose-coto.com/styling-with-seaborn
    # colors = ["m", "#4374B3"]
    # palette = sns.color_palette('RdPu', 1)  # a list
    palette = [sns.color_palette('YlOrRd', 7)[4]]  # YlOrRd
    fig, axes = plt.subplots(2, cols, figsize=(18, 8))  # (width, height)
    # print(new_corr_results)
    for i, (key, corrs) in enumerate(new_corr_results.items()):
        print(f"i: {i}, {key}, corrs: {corrs}")  # hue = feat_set
        key_path, dataset, short_name, feat_set, X_test_shape = key
        HEADER = [
            'FIN', 'SYN', 'RST', 'PSH', 'ACK', 'URG', 'ECE', 'CWR', '1st-TTL'
        ]

        data = sorted(range(len(corrs)),
                      key=lambda i: abs(corrs[i]),
                      reverse=True)[:6]  # top 6 values
        data = [[f'({HEADER[_i]}, y)', feat_set, corrs[_i]] for _i in data]
        # print(f"i: {i}, {key}, corrs: {data}")

        new_yerrs = [1 / np.sqrt(X_test_shape[0])] * 6  # the same error bar for every bar of one dataset
        # # print(f'i: {i}, {new_yerrs}')

        df = pd.DataFrame(data, columns=[f'Xi_y', 'feat_set', 'corr_rho'])
        if i % cols == 0 and i > 0:
            t += 1
        g = sns.barplot(x=f"Xi_y",
                        y="corr_rho",
                        ax=axes[t, i % cols],
                        hue='feat_set',
                        data=df,
                        palette=palette)  # palette=palette,
        g.set(xlabel=None)
        g.set(ylim=(-1, 1))
        if i % cols == 0:
            # g.set_ylabel(r'$\rho$', fontsize=fontsize + 4)
            g.set_ylabel(r'Correlation', fontsize=fontsize + 4)
            # print(g.get_yticks())
            g.set_yticks([-1, -0.5, 0, 0.5, 1])
            g.set_yticklabels(g.get_yticks(), fontsize=fontsize + 6)  # show the y-axis tick values with a larger font
        # print(g.get_yticks())
        else:
            g.set(ylabel=None)
            g.set_yticklabels(['' for v_tmp in g.get_yticks()])
            g.set_ylabel('')

        # g.set_title(dataset_name)
        g.get_legend().set_visible(False)
        g.set_xticklabels(g.get_xticklabels(),
                          fontsize=fontsize + 4,
                          rotation=30,
                          ha="center")

        ys = []
        xs = []
        width = 0
        for i_p, p in enumerate(g.patches):
            height = p.get_height()
            width = p.get_width()
            ys.append(height)
            xs.append(p.get_x())
            if i_p == 0:
                pre = p.get_x() + p.get_width()
            if i_p > 0:
                cur = p.get_x()
                g.axvline(color='black',
                          linestyle='--',
                          x=pre + (cur - pre) / 2,
                          ymin=0,
                          ymax=1,
                          alpha=0.3)
                pre = cur + p.get_width()
            ## https://stackoverflow.com/questions/34888058/changing-width-of-bars-in-bar-chart-created-using-seaborn-factorplot
            p.set_width(width / 3)  # set the bar width
            # we recenter the bar
            p.set_x(p.get_x() + width / 3)
        g.set_title(short_name, fontsize=fontsize + 8)

        # add error bars
        g.errorbar(x=np.asarray(xs) + width / 2,
                   y=ys,
                   yerr=new_yerrs,
                   fmt='none',
                   c='b',
                   capsize=3)

    # # get the legend and modify it
    # handles, labels = g.get_legend_handles_labels()
    # fig.legend(handles, ['IAT+SIZE'], title=None, loc='lower center', ncol=1,
    #  prop={'size': fontsize-2})  # loc='lower right',  loc = (0.74, 0.13)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)

    check_path(out_file)
    print(out_file)
    plt.savefig(out_file)  # should use before plt.show()
    if show: plt.show()
    plt.close(fig)
    plt.close("all")
Example #17
    def get_unb_flows(self, in_dir='../Datasets', direction='src'):

        # preprocessed the pcap and label on original pcap and label
        self.pcap_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.pcap')
        self.label_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.csv')
        remove_file(self.pcap_file, self.overwrite)
        remove_file(self.label_file, self.overwrite)
        check_path(self.pcap_file)
        check_path(self.label_file)

        if not os.path.exists(self.pcap_file) or not os.path.exists(
                self.label_file):
            # 1. original pcap
            friday_pcap_orig = get_file_path(
                ipt_dir=in_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='pcaps/Friday',
                file_name='Friday-WorkingHours.pcap')
            # filter pcap
            filter_ip(friday_pcap_orig,
                      out_file=self.pcap_file,
                      ips=[self.IP],
                      direction=self.direction,
                      keep_original=True)

            # 2. merge original labels
            friday_label = get_file_path(
                ipt_dir=self.out_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='labels/Friday',
                file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
            friday_label_orig1 = get_file_path(
                ipt_dir=in_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='labels/Friday',
                file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
            friday_label_orig2 = get_file_path(
                ipt_dir=in_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='labels/Friday',
                file_name='Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
            friday_label_orig3 = get_file_path(
                ipt_dir=in_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='labels/Friday',
                file_name='Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv'
            )
            friday_label_tmp = friday_label + '-all.csv'
            check_path(friday_label_tmp)
            merge_labels(
                [friday_label_orig1, friday_label_orig2, friday_label_orig3],
                mrg_label_path=friday_label_tmp)
            filter_csv_ip(friday_label_tmp,
                          out_file=self.label_file,
                          ips=[self.IP],
                          direction=self.direction)

        ##############################################################################################
        # step 2.1 extract flows
        flows = _pcap2flows(self.pcap_file,
                            verbose=10)  # normal and abnormal flows
        # step 2.2 split normal flow and abnormal flow
        labels = pd.read_csv(self.label_file).values  #
        normal_flows, abnormal_flows = split_normal_abnormal(flows, labels)
        # augment abnormal flows
        max_interval = np.quantile(
            [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
        abnormal_flows = augment_flows(abnormal_flows,
                                       step=1,
                                       max_interval=max_interval)
        meta = {
            'normal_flows': normal_flows,
            'abnormal_flows': abnormal_flows,
            'normal_pcap': self.pcap_file,
            'abnormal_pcap': self.label_file,
            'direction': direction,
            'in_dir': in_dir
        }

        return meta
Example #18
    def _generate_features(self, normal_flows, abnormal_flows):
        # step 3: flows to features.
        # only on normal flows
        normal_flow_lengths = [len(pkts) for fid, pkts in normal_flows]
        qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
        normal_lengths_stat = np.quantile(normal_flow_lengths, q=qs)
        lg.debug(f'normal_lengths_stat: {normal_lengths_stat}, where q = {qs}')
        self.dim = int(
            np.floor(np.quantile(normal_flow_lengths, self.q_flow_dur)))
        lg.info(f'dim(SIZE) = {self.dim}')

        self.X = []
        self.y = []

        if self.header:
            header_features, header_fids = _get_header(normal_flows)
            header_dim = int(
                np.quantile([len(v) for v in header_features],
                            q=self.q_flow_dur))
            lg.info(f'header_dim: {header_dim}')
        else:
            header_dim = None

        if 'SAMP' in self.feature_name:
            normal_features, normal_fids = self.flow2features(
                normal_flows,
                name=self.feature_name,
                dim=self.dim,
                header=self.header,
                header_dim=header_dim)
            abnormal_features, abnormal_fids = self.flow2features(
                abnormal_flows,
                name=self.feature_name,
                dim=self.dim,
                header=self.header,
                header_dim=header_dim)

            for q in normal_features.keys():
                X_ = list(
                    normal_features[q][0])  # (features, fid, sampling_rate_)
                y_ = [0] * len(normal_features[q][0])
                X_.extend(list(abnormal_features[q][0]))
                y_.extend([1] * len(abnormal_features[q][0]))
                self.X.append(np.asarray(X_))
                self.y.append(np.asarray(y_))

            # save data to disk
            check_path(self.Xy_file)
            meta = {
                'X': self.X,
                'y': self.y,
                'normal_flow_lengths':
                (normal_flow_lengths, normal_lengths_stat),
                'dim': self.dim,
                'q_flow_dur': self.q_flow_dur
            }
            dump(meta, out_file=self.Xy_file)
            # save feature data as csv
            csv_file = os.path.splitext(self.Xy_file)[0] + '.csv'
        # np.savetxt(csv_file, np.concatenate([self.X, self.y[..., np.newaxis]], axis=1), delimiter=',')
        else:
            for flows, label in zip([normal_flows, abnormal_flows], [0, 1]):
                features, fids = self.flow2features(flows,
                                                    name=self.feature_name,
                                                    dim=self.dim,
                                                    header=self.header,
                                                    header_dim=header_dim)
                self.X.extend(features)
                self.y.extend([label] * len(features))

            # save data to disk
            check_path(self.Xy_file)
            self.X = np.asarray(self.X)
            self.y = np.asarray(self.y)
            meta = {
                'X': self.X,
                'y': self.y,
                'normal_flow_lengths':
                (normal_flow_lengths, normal_lengths_stat),
                'dim': self.dim,
                'q_flow_dur': self.q_flow_dur
            }
            dump(meta, out_file=self.Xy_file)
            # save feature data as csv
            csv_file = os.path.splitext(self.Xy_file)[0] + '.csv'
            np.savetxt(csv_file,
                       np.concatenate([self.X, self.y[..., np.newaxis]],
                                      axis=1),
                       delimiter=',')
        return meta
Example #19
def _main():
	""" Main function

	Returns
	-------

	"""
	res = []
	out_file = f'{OUT_DIR}/src/{RESULT_DIR}/res.dat'
	is_parallel = False
	if is_parallel:  # with parallel
		def set_args(dataset, feature, header, model, tuning):
			args = parser()
			args.dataset = dataset
			args.feature = feature
			args.header = header
			args.model = model
			args.tuning = tuning
			lg.debug(args)
			return args

		# With backend='loky', the elapsed time is lower than the serial run; with
		# backend='multiprocessing', the time cost is very close to the serial run.
		_res = []
		with Parallel(n_jobs=20, backend='loky') as parallel:
			_res = parallel(delayed(_representation.main_no_tuning_vs_tuning)  # delayed
			                (set_args(dataset, feature, header, model, tuning))  # params
			                for dataset, feature, header, model, tuning in
			                list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))  # for
			                )  # parallel
		# reorganize results
		res = []
		for history, (dataset, feature, header, model, tuning) in zip(_res, list(
				itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))):
			res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history['best']])
	else:  # without parallel
		for dataset, feature, header, model, tuning in list(itertools.product(DATASETS,
		                                                                      FEATURES, HEADER, MODELS, TUNING)):
			try:
				lg.info(f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}')
				args = parser()
				args.dataset = dataset
				args.feature = feature
				args.header = header
				args.model = model
				args.tuning = tuning
				args.overwrite = OVERWRITE
				history = _representation.main_no_tuning_vs_tuning(args)
				res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history['best']])
				# avoid losing any result, so save it immediately.
				_out_file = f'{args.out_dir}/{args.direction}/{RESULT_DIR}/~res.csv'
				check_path(_out_file)
				save2txt(res, _out_file, delimiter=',')
			except Exception as e:
				lg.error(f'Error: {e}. [{dataset}, {feature}, {header}, {model}, {tuning}]')

	# save the final results: '.dat' and '.csv'
	check_path(out_file)
	dump(res, out_file)
	out_file = os.path.splitext(out_file)[0] + '.csv'
	remove_file(out_file, OVERWRITE)
	save2txt(res, out_file, delimiter=',')
	lg.info(f'final result: {out_file}')
Example #20
def main(args=None, test=False):
    """ Get the result according to the given parameters

	Parameters
	----------
	args
	test: boolean
		if we evaluate the built model on val set or test set
	Returns
	-------
	history: dict
		Return the best result on 'SAMP' related feature. Otherwise, return the result
	"""
    try:
        lg.debug(args)
        out_dir = os.path.join(args.out_dir, args.direction, args.dataset,
                               args.feature, f'header_{args.header}',
                               args.model, f'tuning_{args.tuning}')

        ###############################################################################################################
        """ 1.1 Parse data and extract features
			
		"""
        lg.info(f'\n--- 1.1 Parse data')
        data = Data(dataset_name=args.dataset,
                    direction=args.direction,
                    feature_name=args.feature,
                    header=args.header,
                    overwrite=args.overwrite,
                    random_state=RANDOM_STATE)
        data.generate()

        if 'SAMP' in args.feature:
            best = {'score': 0, 'model': None}
            for i, (X, y) in enumerate(zip(data.X, data.y)):
                lg.debug(f'SAMP_{i}')
                try:
                    res_, data_ = _single_main(args, X, y, test=test)
                except Exception as e:
                    lg.error(f'Error: {e}. SAMP_{i}')
                    continue
                # get the best results on SAMP data
                if res_['score'] > best['score']:
                    best['score'] = res_['score']
                    best['model'] = copy.deepcopy(res_)
                    best['data'] = copy.deepcopy(data_)
            history = best
        else:
            X, y = data.X, data.y
            res_, data_ = _single_main(args, X, y, test=test)
            history = {'score': res_['score'], 'model': res_, 'data': data_}

    except Exception as e:
        traceback.print_exc()
        history = {
            'score': 0,
            'model': {},
            'data': (None, None, None, None, None, None)
        }

    ###############################################################################################################
    """ 3. Dump the result to disk

	"""
    lg.info(f'\n--- 3. Save the result')
    out_file = os.path.join(out_dir, f'res.dat')
    check_path(out_file)
    dump(history, out_file=out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, overwrite=OVERWRITE)
    save2txt(history, out_file)
    lg.info(f'res_file: {out_file}')

    return history
Example #21
def main_no_tuning_vs_tuning(args=None):
    """ get results with default and best parameters according to the args.

	Parameters
	----------
	args: given parameters

	Returns
	-------
	history: dict
		store all the results in a dictionary
	"""
    # 1. Get dimension of the dataset. For some algorithms, they need the dimensions (e.g., AE)
    data = Data(dataset_name=args.dataset,
                direction=args.direction,
                feature_name=args.feature,
                header=args.header,
                overwrite=args.overwrite,
                random_state=RANDOM_STATE)
    data.generate()
    if 'SAMP' in args.feature:
        X = data.X[0]
    else:
        X = data.X

    # 2. Get the results with the given model
    if args.model == 'OCSVM':
        if args.tuning:
            qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
        else:
            qs = [0.3]
        history = {
        }  # store the best result, model parameters, and the best model (dict)
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: q = {qs}')
        for q in qs:
            args.model_params = {'q': q}
            # get results on the validation set
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['q'] = q
                best['model'] = copy.deepcopy(history_)
            history[q] = history_

        # get the final result on the test set.
        args.model_params = {'q': best['q']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'GMM':
        if args.tuning:
            n_components_arr = [1, 2, 5, 10, 15, 20, 25, 30, 35, 40]
        else:
            n_components_arr = ['quickshift']
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_components_arr = {n_components_arr}')
        for n_components in n_components_arr:
            args.model_params = {'n_components': n_components}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_components'] = n_components
                best['model'] = copy.deepcopy(history_)
            history[n_components] = history_

        # get the final result on the test set.
        args.model_params = {'n_components': best['n_components']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best

    elif args.model == 'IF':
        if args.tuning:
            n_estimators_arr = [
                int(v)
                for v in list(np.linspace(30, 300, num=10, endpoint=True))
            ]
        else:
            n_estimators_arr = [100]
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_estimators_arr = {n_estimators_arr}')
        for n_estimators in n_estimators_arr:
            args.model_params = {'n_estimators': n_estimators}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_estimators'] = n_estimators
                best['model'] = copy.deepcopy(history_)
            history[n_estimators] = history_

        # get the final result on the test set.
        args.model_params = {'n_estimators': best['n_estimators']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best

    elif args.model == 'PCA':
        if args.tuning:
            n_components_arr = [
                int(v) for v in list(
                    np.linspace(1, min(X.shape), num=10, endpoint=False))
            ]
        else:
            n_components_arr = ['mle']
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_components_arr = {n_components_arr}')
        for n_components in n_components_arr:
            args.model_params = {'n_components': n_components}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_components'] = n_components
                best['model'] = copy.deepcopy(history_)
            history[n_components] = history_

        # get the final result on the test set.
        args.model_params = {'n_components': best['n_components']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'KDE':
        if args.tuning:
            qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
        else:
            qs = [0.3]
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: q = {qs}')
        for q in qs:
            args.model_params = {'q': q}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['q'] = q
                best['model'] = copy.deepcopy(history_)
            history[q] = history_
        # get the final result on the test set.
        args.model_params = {'q': best['q']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best

    elif args.model == 'AE':
        if args.tuning:
            feat_dim = X.shape[1]

            def get_AE_parameters(d, num=10):
                latent_sizes = []
                for i in range(num):
                    v = np.ceil(1 + i * (d - 2) / 9).astype(int)
                    if v not in latent_sizes:
                        latent_sizes.append(v)

                hidden_sizes = [
                    min((d - 1),
                        np.ceil(2 * v).astype(int)) for v in latent_sizes
                ]

                hidden_neurons = []
                for i, (hid, lat) in enumerate(zip(hidden_sizes,
                                                   latent_sizes)):
                    v = [d, hid, lat, hid, d]
                    hidden_neurons.append(v)
                return hidden_neurons

            hidden_neurons_arr = get_AE_parameters(feat_dim, num=10)
        else:
            feat_dim = X.shape[1]
            latent_dim = np.ceil(feat_dim / 2).astype(int)
            hid = min((feat_dim - 1), np.ceil(2 * latent_dim).astype(int))
            hidden_neurons = [feat_dim, hid, latent_dim, hid, feat_dim]
            hidden_neurons_arr = [hidden_neurons]

        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: hidden_neurons = {hidden_neurons_arr}')
        for hidden_neurons in hidden_neurons_arr:
            args.model_params = {'hidden_neurons': hidden_neurons}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['hidden_neurons'] = hidden_neurons
                best['model'] = copy.deepcopy(history_)
            history[tuple(hidden_neurons)] = history_
        # get the final result on the test set.
        args.model_params = {'hidden_neurons': best['hidden_neurons']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best

    else:
        msg = f'{args.model}'
        raise NotImplementedError(msg)
    # lg.info(f'\n*** best: ' + str(history['best']))
    out_file = os.path.join(args.out_dir, args.direction, args.dataset,
                            args.feature, f'header_{args.header}', args.model,
                            f'tuning_{args.tuning}', 'res.dat')
    check_path(out_file)
    dump(history, out_file)

    return history
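Every branch above repeats the same tune-then-test pattern; a compact sketch of that shared loop (a refactoring suggestion, not code from the original module; `main` is the function from Example #20 and `param_name`/`values` stand in for the per-model settings):

import copy

def _tune(args, param_name, values):
    # Evaluate every candidate on the validation set, then re-run the best one on the test set.
    history = {}
    best = {'score': 0, 'model': None}
    for v in values:
        args.model_params = {param_name: v}
        history_ = main(args, test=False)  # validation-set result
        if history_['score'] > best['score']:
            best['score'] = history_['score']
            best[param_name] = v
            best['model'] = copy.deepcopy(history_)
        history[tuple(v) if isinstance(v, list) else v] = history_
    args.model_params = {param_name: best[param_name]}
    best['model'] = main(args, test=True)  # final test-set result
    best['score'] = best['model']['score']
    history['best'] = best
    return history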
Example #22
def main():
    res = []
    res_file = 'res2'
    is_parallel = False
    if is_parallel:

        def set_args(dataset, feature, header, model, tuning):
            args = parser()
            args.dataset = dataset
            args.feature = feature
            args.header = header
            args.model = model
            args.tuning = tuning
            print(args)
            return args

        # With backend='loky', the elapsed time is lower than the serial run; with
        # backend='multiprocessing', the time cost is very close to the serial run.
        _res = []
        with Parallel(n_jobs=20, backend='loky') as parallel:
            _res = parallel(
                delayed(_representation.main_no_tuning_vs_tuning)  # delayed
                (set_args(dataset, feature, header, model, tuning))  # params
                for dataset, feature, header, model, tuning in list(
                    itertools.product(DATASETS, FEATURES, HEADER, MODELS,
                                      TUNING))  # for
            )  # parallel
        # reorganize results
        res = []
        for history, (dataset, feature, header, model, tuning) in zip(
                _res,
                list(
                    itertools.product(DATASETS, FEATURES, HEADER, MODELS,
                                      TUNING))):
            res.append([
                dataset, feature, f'header_{header}', model,
                f'tuning_{tuning}', history
            ])
        out_file = f'examples/representation/out/src/{DATE}/{res_file}.dat'
    else:  # without parallel
        for dataset in DATASETS:
            for feature in FEATURES:
                for header in HEADER:
                    for model in MODELS:
                        for tuning in TUNING:
                            try:
                                print(
                                    f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}'
                                )
                                args = parser()
                                args.dataset = dataset
                                args.feature = feature
                                args.header = header
                                args.model = model
                                args.tuning = tuning
                                history = _representation.main_no_tuning_vs_tuning(
                                    args)
                                res_ = [
                                    dataset, feature, f'header_{header}',
                                    model, f'tuning_{tuning}', history
                                ]
                                res.append(res_)
                                # avoid losing any result, so save it immediately
                                out_file = f'{args.out_dir}/{args.direction}/~{res_file}.dat'
                                dump(res, out_file)
                                save2txt(res,
                                         os.path.splitext(out_file)[0] +
                                         '.csv',
                                         delimiter=',')
                            except Exception as e:
                                lg.error(e)

        out_file = f'{args.out_dir}/{args.direction}/{DATE}/{res_file}.dat'

    check_path(out_file)
    dump(res, out_file)
    save2txt(res, os.path.splitext(out_file)[0] + '.csv', delimiter=',')
    lg.info(f'final result: {out_file}')
Example #23
def get_iot2021_flows(
        in_dir=f'../Datasets/UCHI/IOT_2021/data-clean/refrigerator',
        dataset_name='',
        out_dir='',
        direction='src'):
    """ Hard coding in_dir and pcap paths

	Note:
		1) refrigerator IP changes over time (dynamic ip), so here we filter with mac address.
		2) please don't merge all pcaps first and then obtain flows.
	Parameters
	----------
	in_dir
	direction

	Returns
	-------

	"""
    ip2device = {
        '192.168.143.152': 'refrigerator',
    }
    device2ip = {
        'refrigerator': '192.168.143.43',
        'nestcam': '192.168.143.104',
        'alexa': '192.168.143.74'
    }
    # #
    device2mac = {
        'refrigerator': '70:2c:1f:39:25:6e',
        'nestcam': '18:b4:30:8a:9f:b2',
        'alexa': '4c:ef:c0:0b:91:b3'
    }
    normal_pcaps = list(
        glob.iglob(in_dir + '/no_interaction/**/*.' + 'pcap', recursive=True))
    normal_pcaps.append(in_dir + '/idle_1629935923.pcap')
    normal_pcaps.append(in_dir + '/idle_1630275254.pcap')
    normal_pcaps = sorted(normal_pcaps)
    normal_flows = []
    for f in normal_pcaps:
        filter_f = f'{out_dir}/~tmp.pcap'
        check_path(filter_f)
        keep_mac_address(f,
                         kept_ips=[device2mac['refrigerator']],
                         out_file=filter_f,
                         direction=direction)
        flows = _pcap2flows(filter_f, verbose=10)  # normal  flows
        normal_flows.extend(flows)
    lg.debug(
        f'total normal pcaps: {len(normal_pcaps)} and total flows: {len(normal_flows)}'
    )

    # get abnormal flows
    abnormal_pcaps = list(glob.iglob(in_dir + '/open_close_fridge/**/*.' + 'pcap', recursive=True)) + \
                     list(glob.iglob(in_dir + '/put_back_item/**/*.' + 'pcap', recursive=True)) + \
                     list(glob.iglob(in_dir + '/screen_interaction/**/*.' + 'pcap', recursive=True)) + \
                     list(glob.iglob(in_dir + '/take_out_item/**/*.' + 'pcap', recursive=True))
    abnormal_pcaps = sorted(abnormal_pcaps)

    abnormal_flows = []
    for f in abnormal_pcaps:
        filter_f = f'{out_dir}/~tmp.pcap'
        check_path(filter_f)
        keep_mac_address(f,
                         kept_ips=[device2mac['refrigerator']],
                         out_file=filter_f,
                         direction=direction)
        flows = _pcap2flows(filter_f, verbose=10)  # abnormal flows
        abnormal_flows.extend(flows)
    lg.debug(
        f'total abnormal pcaps: {len(abnormal_pcaps)} and total flows: {len(abnormal_flows)}'
    )

    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcaps': normal_pcaps,
        'abnormal_pcaps': abnormal_pcaps,
        'device2mac': device2mac,
        'filter_mac': device2mac['refrigerator'],
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
Example #24
 def _generate_pcap(self):
     regenerate = False
     # step 1: obtain pcap and label
     if self.dataset_name == 'UCHI(SFRIG_2021)':
         self.IP = 'mac_70:2c:1f:39:25:6e'  # IP for the new data changes over time, so here use mac address instead
         self.orig_flows = os.path.join(
             self.out_dir,
             f'iot2021-orig_sfrig_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             # hard coding (is not a good idea)
             meta = get_iot2021_flows(
                 in_dir=f'../Datasets/UCHI/IOT_2021/data-clean/refrigerator',
                 dataset_name=self.dataset_name,
                 out_dir=self.out_dir,
                 direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     elif self.dataset_name == 'UCHI/IOT_2019/ghome_192.168.143.20' or self.dataset_name == 'UCHI(GHOME_2019)':
         self.IP = '192.168.143.20'
         self.orig_flows = os.path.join(
             self.out_dir,
             f'ghome2019-orig_sfrig_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             meta = get_ghome2019_flows(
                 in_dir=f'../Datasets/UCHI/IOT_2019/',
                 dataset_name='ghome_192.168.143.20',
                 out_dir=self.out_dir,
                 direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     elif self.dataset_name == 'UCHI/IOT_2019/scam_192.168.143.42' or self.dataset_name == 'UCHI(SCAM_2019)':
         self.IP = '192.168.143.42'
         self.orig_flows = os.path.join(
             self.out_dir,
             f'scam2019-orig_scam_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             meta = get_scam2019_flows(in_dir=f'../Datasets/UCHI/IOT_2019/',
                                       dataset_name='scam_192.168.143.42',
                                       out_dir=self.out_dir,
                                       direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     elif self.dataset_name == 'UCHI/IOT_2019/bstch_192.168.143.48' or self.dataset_name == 'UCHI(BSTCH_2019)':
         self.IP = '192.168.143.48'
         self.orig_flows = os.path.join(
             self.out_dir,
             f'bstch2019-orig_bstch_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             meta = get_bstch2019_flows(
                 in_dir=f'../Datasets/UCHI/IOT_2019/',
                 dataset_name='bstch_192.168.143.48',
                 out_dir=self.out_dir,
                 direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     elif self.dataset_name == 'UCHI/IOT_2019/smtv_10.42.0.1' or self.dataset_name == 'UCHI(SMTV_2019)':
         self.IP = '10.42.0.1'
         self.orig_flows = os.path.join(
             self.out_dir,
             f'smtv2019-orig_smtv_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             meta = get_smtv2019_flows(in_dir=f'../Datasets/UCHI/IOT_2019/',
                                       dataset_name='smtv_10.42.0.1',
                                       out_dir=self.out_dir,
                                       direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     else:
         raise ValueError('dataset does not exist.')