Example #1
    def _generate_pcap(self):

        # step 1: obtain the pcap and label files for the chosen host
        name2tag = {
            'UNB/CICIDS_2017/pc_192.168.10.5': ('192.168.10.5', 'unb(pc1)'),
            'UNB(PC1)': ('192.168.10.5', 'unb(pc1)'),
            'UNB/CICIDS_2017/pc_192.168.10.8': ('192.168.10.8', 'unb(pc2)'),
            'UNB(PC2)': ('192.168.10.8', 'unb(pc2)'),
            'UNB/CICIDS_2017/pc_192.168.10.9': ('192.168.10.9', 'unb(pc3)'),
            'UNB(PC3)': ('192.168.10.9', 'unb(pc3)'),
            'UNB/CICIDS_2017/pc_192.168.10.14': ('192.168.10.14', 'unb(pc4)'),
            'UNB(PC4)': ('192.168.10.14', 'unb(pc4)'),
            'UNB/CICIDS_2017/pc_192.168.10.15': ('192.168.10.15', 'unb(pc5)'),
            'UNB(PC5)': ('192.168.10.15', 'unb(pc5)'),
            'DEMO_IDS/DS-srcIP_192.168.10.5': ('192.168.10.5', 'demo'),
        }
        if self.dataset_name not in name2tag:
            raise ValueError(f'dataset {self.dataset_name} does not exist.')
        self.IP, tag = name2tag[self.dataset_name]
        self.orig_flows = os.path.join(
            self.out_dir, f'orig_{tag}_{self.direction}_flows-{self.IP}.dat')

        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_unb_flows(in_dir='../Datasets',
                                      direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug(f"in_dir (pcaps): {meta['in_dir']}, "
                     f"direction: {meta['direction']}")
            lg.debug(f"normal_pcap: {len(meta['normal_pcap'])}, "
                     f"normal_flows: {len(meta['normal_flows'])}")
            lg.debug(f"abnormal_pcap: {len(meta['abnormal_pcap'])}, "
                     f"abnormal_flows: {len(meta['abnormal_flows'])}")
Example #2
    def generate(self):
        remove_file(self.Xy_file, self.overwrite)
        if os.path.exists(self.Xy_file):
            Xy_meta = load(self.Xy_file)
        else:
            if self.dataset_name in ['CTU']:
                self._generate_pcap()  # step 1: obtain the pcap and label
                flows_meta = self._generate_flows()  # step 2: normal/abnormal flows
                # step 3: Xy (fixed-size feature data)
                Xy_meta = self._generate_features(flows_meta['normal_flows'],
                                                  flows_meta['abnormal_flows'])
            else:
                raise NotImplementedError(self.dataset_name)
        self.X, self.y = Xy_meta['X'], Xy_meta['y']

        return Xy_meta
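
A minimal usage sketch for generate(), assuming a Data class whose constructor takes the arguments shown in Example #7; the concrete values below (including the 'IAT' feature name) are illustrative only:

    # illustrative values; only the constructor argument names come from Example #7
    data = Data(dataset_name='CTU',
                direction='src',
                feature_name='IAT',
                header=False,
                overwrite=False,
                random_state=42)
    Xy_meta = data.generate()  # loads Xy_file if cached, otherwise rebuilds it
    print(len(data.X), len(data.y))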
Example #3
    def _generate_pcap(self):
        # preprocess the original pcap and label files
        if self.dataset_name == 'MAWI/WIDE_2019/pc_202.171.168.50' or self.dataset_name == 'MAWI':
            # "http://mawi.wide.ad.jp/mawi/samplepoint-F/2019/201912071400.html"
            self.IP = '202.171.168.50'
            self.orig_flows = os.path.join(
                self.out_dir, f'mawi_{self.direction}_flows-{self.IP}.dat')
            remove_file(self.orig_flows, self.overwrite)
            if not os.path.exists(self.orig_flows):
                lg.warning(f'{self.orig_flows} does not exist.')
                check_path(self.orig_flows)
                meta = self.get_mawi_flows(in_dir='../Datasets',
                                           direction=self.direction)
                dump(meta, out_file=self.orig_flows)
                lg.debug(f"in_dir (pcaps): {meta['in_dir']}, "
                         f"direction: {meta['direction']}")
                lg.debug(f"normal_pcap: {len(meta['normal_pcap'])}, "
                         f"normal_flows: {len(meta['normal_flows'])}")
                lg.debug(f"abnormal_pcap: {len(meta['abnormal_pcap'])}, "
                         f"abnormal_flows: {len(meta['abnormal_flows'])}")
        else:
            raise ValueError(f'dataset {self.dataset_name} does not exist.')
Example #4
    def generate(self):
        remove_file(self.Xy_file, self.overwrite)

        if os.path.exists(self.Xy_file):
            Xy_meta = load(self.Xy_file)
        else:
            if self.dataset_name in [
                    'UCHI(SFRIG_2021)', 'UCHI(SMTV_2019)', 'UCHI(GHOME_2019)',
                    'UCHI(SCAM_2019)', 'UCHI(BSTCH_2019)'
            ]:
                self._generate_pcap()  # step 1: obtain the pcap and label
                flows_meta = self._generate_flows()  # step 2: normal/abnormal flows
                # step 3: Xy (fixed-size feature data)
                Xy_meta = self._generate_features(flows_meta['normal_flows'],
                                                  flows_meta['abnormal_flows'])
            else:
                raise NotImplementedError(self.dataset_name)
        self.X, self.y = Xy_meta['X'], Xy_meta['y']
        self.Xy_meta = Xy_meta
        return self.Xy_meta
Example #5
    def _generate_pcap(self):
        # preprocess the original pcap and label files
        if self.dataset_name == 'CTU/IOT_2017/pc_192.168.1.196' or self.dataset_name == 'CTU':
            self.IP = '192.168.1.196'
            self.orig_flows = os.path.join(
                self.out_dir, f'ctu_{self.direction}_flows-{self.IP}.dat')
            remove_file(self.orig_flows, self.overwrite)
            if not os.path.exists(self.orig_flows):
                lg.warning(f'{self.orig_flows} does not exist.')
                check_path(self.orig_flows)
                meta = self.get_ctu_flows(in_dir='../Datasets',
                                          direction=self.direction)
                dump(meta, out_file=self.orig_flows)
                lg.debug(f"in_dir (pcaps): {meta['in_dir']}, "
                         f"direction: {meta['direction']}")
                lg.debug(f"normal_pcap: {len(meta['normal_pcap'])}, "
                         f"normal_flows: {len(meta['normal_flows'])}")
                lg.debug(f"abnormal_pcap: {len(meta['abnormal_pcap'])}, "
                         f"abnormal_flows: {len(meta['abnormal_flows'])}")
        else:
            raise ValueError(f'dataset {self.dataset_name} does not exist.')
Example #6
    def _generate_flows(self):
        self.subflows_file = os.path.join(self.out_dir,
                                          'normal_abnormal_subflows.dat')
        remove_file(self.subflows_file, self.overwrite)
        if os.path.exists(self.subflows_file):
            return load(self.subflows_file)

        # step 2: extract flows from the pcap
        ##############################################################################################
        meta = load(self.orig_flows)
        normal_flows, abnormal_flows = meta['normal_flows'], meta['abnormal_flows']
        lg.debug(f'original normal flows: {len(normal_flows)} and '
                 f'abnormal flows: {len(abnormal_flows)}')
        qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
        len_stat = np.quantile([len(pkts) for f, pkts in normal_flows], q=qs)
        lg.debug(f'flows: {len(normal_flows)}, length statistic: {len_stat}, '
                 f'when q = {qs}')
        meta = {
            'flows': normal_flows,
            'len_stat': (len_stat, qs),
            'normal_flows': normal_flows,
            'abnormal_flows': abnormal_flows
        }
        dump(meta,
             out_file=os.path.join(self.out_dir, 'normal_abnormal_flows.dat'))

        # step 2.2: compute durations of the normal flows only
        self.flows_durations = [
            _get_flow_duration(pkts) for (fids, pkts) in normal_flows
        ]
        normal_durations_stat = np.quantile(self.flows_durations, q=qs)
        lg.debug(f'normal_durations_stat: {normal_durations_stat}')
        # the q_flow_dur quantile of the normal flow durations
        # (the median when q_flow_dur = 0.5)
        self.subflow_interval = np.quantile(self.flows_durations,
                                            q=self.q_flow_dur)
        lg.debug(f'---subflow_interval: {self.subflow_interval}, '
                 f'q_flow_dur: {self.q_flow_dur}')
        # step 2.3: split the flows into subflows using the interval
        normal_flows, _ = _flows2subflows(normal_flows,
                                          interval=self.subflow_interval,
                                          labels=['0'] * len(normal_flows))
        abnormal_flows, _ = _flows2subflows(abnormal_flows,
                                            interval=self.subflow_interval,
                                            labels=['1'] * len(abnormal_flows))
        lg.debug(f'normal_flows: {len(normal_flows)}, and abnormal_flows: '
                 f'{len(abnormal_flows)} with interval: {self.subflow_interval} '
                 f'and q: {self.q_flow_dur}')
        meta = {
            'normal_flows_durations': self.flows_durations,
            'normal_durations_stat': (normal_durations_stat, qs),
            'subflow_interval': self.subflow_interval,
            'q_flow_dur': self.q_flow_dur,
            'normal_flows': normal_flows,
            'abnormal_flows': abnormal_flows
        }
        dump(meta, out_file=self.subflows_file)

        # only return the subflows
        return meta
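
For intuition, a self-contained sketch of the interval computation above, using synthetic flows where each flow is a (fid, packet_times) pair; the duration helper is inlined here, and the real _get_flow_duration may differ:

    import numpy as np

    # synthetic flows: (fid, sorted packet arrival times in seconds)
    flows = [('f1', [0.0, 0.2, 1.1]),
             ('f2', [0.0, 3.0]),
             ('f3', [0.0, 0.5, 0.9, 2.4])]
    durations = [pkts[-1] - pkts[0] for _, pkts in flows]  # [1.1, 3.0, 2.4]
    q_flow_dur = 0.5
    subflow_interval = np.quantile(durations, q=q_flow_dur)
    print(subflow_interval)  # 2.4, the median of the durations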
Example #7
def main(args=None, test=False):
    """ Get the result according to the given parameters

    Parameters
    ----------
    args
    test: boolean
        whether to evaluate the built model on the validation set or the test set

    Returns
    -------
    history: dict
        the best result for 'SAMP'-related features; otherwise, the single result
    """
    lg.debug(args)
    # compute out_dir before the try block so the final dump below
    # does not hit a NameError when an exception is raised early
    out_dir = os.path.join(args.out_dir, args.direction, args.dataset,
                           args.feature, f'header_{args.header}',
                           args.model, f'tuning_{args.tuning}')
    try:
        ###############################################################################################################
        """ 1.1 Parse data and extract features

        """
        lg.info('\n--- 1.1 Parse data')
        data = Data(dataset_name=args.dataset,
                    direction=args.direction,
                    feature_name=args.feature,
                    header=args.header,
                    overwrite=args.overwrite,
                    random_state=RANDOM_STATE)
        data.generate()

        if 'SAMP' in args.feature:
            # SAMP features come as a list of (X, y) candidates; keep the best one
            best = {'score': 0, 'model': None, 'data': None}
            for i, (X, y) in enumerate(zip(data.X, data.y)):
                lg.debug(f'SAMP_{i}')
                try:
                    res_, data_ = _single_main(args, X, y, test=test)
                except Exception as e:
                    lg.error(f'Error: {e}. SAMP_{i}')
                    continue
                if res_['score'] > best['score']:
                    best['score'] = res_['score']
                    best['model'] = copy.deepcopy(res_)
                    best['data'] = copy.deepcopy(data_)
            history = best
        else:
            X, y = data.X, data.y
            res_, data_ = _single_main(args, X, y, test=test)
            history = {'score': res_['score'], 'model': res_, 'data': data_}

    except Exception as e:
        traceback.print_exc()
        history = {
            'score': 0,
            'model': {},
            'data': (None, None, None, None, None, None)
        }

    ###############################################################################################################
    """ 3. Dump the result to disk

    """
    lg.info('\n--- 3. Save the result')
    out_file = os.path.join(out_dir, 'res.dat')
    check_path(out_file)
    dump(history, out_file=out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, overwrite=OVERWRITE)
    save2txt(history, out_file)
    lg.info(f'res_file: {out_file}')

    return history
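
A hedged sketch of driving main directly with an argparse.Namespace instead of parsed CLI arguments. The attribute names mirror what main reads; the concrete values, including the 'OCSVM' model name, are purely illustrative:

    from argparse import Namespace

    # illustrative values only; attribute names match what main() accesses
    args = Namespace(out_dir='out', direction='src', dataset='CTU',
                     feature='IAT', header=False, model='OCSVM',
                     tuning=False, overwrite=False)
    history = main(args, test=True)  # evaluate on the test set
    print(history['score'])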
Example #8
    def _generate_pcap(self):
        regenerate = False
        # step 1: obtain the pcap and label files for the chosen UCHI device.
        # The IP of the SFRIG_2021 device changes over time, so it is
        # identified by its MAC address instead.
        # Note: the input directories are hard-coded, which is not ideal.
        specs = {
            'UCHI(SFRIG_2021)':
                ('mac_70:2c:1f:39:25:6e', 'iot2021-orig_sfrig',
                 get_iot2021_flows,
                 '../Datasets/UCHI/IOT_2021/data-clean/refrigerator', None),
            'UCHI(GHOME_2019)':
                ('192.168.143.20', 'ghome2019-orig_sfrig', get_ghome2019_flows,
                 '../Datasets/UCHI/IOT_2019/', 'ghome_192.168.143.20'),
            'UCHI(SCAM_2019)':
                ('192.168.143.42', 'scam2019-orig_scam', get_scam2019_flows,
                 '../Datasets/UCHI/IOT_2019/', 'scam_192.168.143.42'),
            'UCHI(BSTCH_2019)':
                ('192.168.143.48', 'bstch2019-orig_bstch', get_bstch2019_flows,
                 '../Datasets/UCHI/IOT_2019/', 'bstch_192.168.143.48'),
            'UCHI(SMTV_2019)':
                ('10.42.0.1', 'smtv2019-orig_smtv', get_smtv2019_flows,
                 '../Datasets/UCHI/IOT_2019/', 'smtv_10.42.0.1'),
        }
        # long-form dataset names are aliases for the short ones
        aliases = {
            'UCHI/IOT_2019/ghome_192.168.143.20': 'UCHI(GHOME_2019)',
            'UCHI/IOT_2019/scam_192.168.143.42': 'UCHI(SCAM_2019)',
            'UCHI/IOT_2019/bstch_192.168.143.48': 'UCHI(BSTCH_2019)',
            'UCHI/IOT_2019/smtv_10.42.0.1': 'UCHI(SMTV_2019)',
        }
        name = aliases.get(self.dataset_name, self.dataset_name)
        if name not in specs:
            raise ValueError(f'dataset {self.dataset_name} does not exist.')
        self.IP, tag, get_flows, in_dir, sub_name = specs[name]
        self.orig_flows = os.path.join(
            self.out_dir, f'{tag}_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_flows(in_dir=in_dir,
                             dataset_name=sub_name or self.dataset_name,
                             out_dir=self.out_dir,
                             direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
Example #9
def _main():
    """ Run the full experiment grid (datasets x features x headers x models x tuning).

    Returns
    -------
    None; the results are dumped to disk as '.dat' and '.csv'.
    """
    res = []
    out_file = f'{OUT_DIR}/src/{RESULT_DIR}/res.dat'
    is_parallel = False
    if is_parallel:  # with parallel
        def set_args(dataset, feature, header, model, tuning):
            args = parser()
            args.dataset = dataset
            args.feature = feature
            args.header = header
            args.model = model
            args.tuning = tuning
            lg.debug(args)
            return args

        # With backend='loky', the run takes less time than the serial loop;
        # with backend='multiprocessing', the time cost is close to serial.
        with Parallel(n_jobs=20, backend='loky') as parallel:
            _res = parallel(delayed(_representation.main_no_tuning_vs_tuning)  # delayed
                            (set_args(dataset, feature, header, model, tuning))  # params
                            for dataset, feature, header, model, tuning in
                            list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))  # for
                            )  # parallel
        # reorganize the results
        res = []
        for history, (dataset, feature, header, model, tuning) in zip(_res, list(
                itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))):
            res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history['best']])
    else:  # without parallel
        for dataset, feature, header, model, tuning in itertools.product(
                DATASETS, FEATURES, HEADER, MODELS, TUNING):
            try:
                lg.info(f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}')
                args = parser()
                args.dataset = dataset
                args.feature = feature
                args.header = header
                args.model = model
                args.tuning = tuning
                args.overwrite = OVERWRITE
                history = _representation.main_no_tuning_vs_tuning(args)
                res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history['best']])
                # save intermediate results immediately so nothing is lost on failure
                _out_file = f'{args.out_dir}/{args.direction}/{RESULT_DIR}/~res.csv'
                check_path(_out_file)
                save2txt(res, _out_file, delimiter=',')
            except Exception as e:
                lg.error(f'Error: {e}. [{dataset}, {feature}, {header}, {model}, {tuning}]')

    # save the final results as '.dat' and '.csv'
    check_path(out_file)
    dump(res, out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, OVERWRITE)
    save2txt(res, out_file, delimiter=',')
    lg.info(f'final result: {out_file}')
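
The experiment grid above is a plain Cartesian product; a toy illustration with stand-in values (these are not the module's real DATASETS, FEATURES, or HEADER constants):

    import itertools

    DATASETS, FEATURES, HEADER = ['CTU'], ['IAT', 'SAMP'], [False, True]
    for dataset, feature, header in itertools.product(DATASETS, FEATURES, HEADER):
        print(dataset, feature, f'header_{header}')
    # CTU IAT header_False
    # CTU IAT header_True
    # CTU SAMP header_False
    # CTU SAMP header_True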
Example #10
    def get_unb_flows(self, in_dir='../Datasets', direction='src'):

        # preprocess the original pcap and label files
        self.pcap_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.pcap')
        self.label_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.csv')
        remove_file(self.pcap_file, self.overwrite)
        remove_file(self.label_file, self.overwrite)
        check_path(self.pcap_file)
        check_path(self.label_file)

        if not os.path.exists(self.pcap_file) or not os.path.exists(
                self.label_file):
            # 1. original pcap
            friday_pcap_orig = get_file_path(
                ipt_dir=in_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='pcaps/Friday',
                file_name='Friday-WorkingHours.pcap')
            # keep only packets matching self.IP in the given direction
            filter_ip(friday_pcap_orig,
                      out_file=self.pcap_file,
                      ips=[self.IP],
                      direction=self.direction,
                      keep_original=True)

            # 2. merge the original labels
            # path under out_dir; only used to derive the merged-label location
            friday_label = get_file_path(
                ipt_dir=self.out_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='labels/Friday',
                file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
            friday_label_orig1 = get_file_path(
                ipt_dir=in_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='labels/Friday',
                file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
            friday_label_orig2 = get_file_path(
                ipt_dir=in_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='labels/Friday',
                file_name='Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
            friday_label_orig3 = get_file_path(
                ipt_dir=in_dir,
                dataset_name='UNB/CICIDS_2017/',
                data_cat='labels/Friday',
                file_name='Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv'
            )
            friday_label_tmp = friday_label + '-all.csv'
            check_path(friday_label_tmp)
            merge_labels(
                [friday_label_orig1, friday_label_orig2, friday_label_orig3],
                mrg_label_path=friday_label_tmp)
            filter_csv_ip(friday_label_tmp,
                          out_file=self.label_file,
                          ips=[self.IP],
                          direction=self.direction)

        ##############################################################################################
        # step 2.1: extract flows (normal and abnormal) from the pcap
        flows = _pcap2flows(self.pcap_file, verbose=10)
        # step 2.2: split the flows into normal and abnormal sets
        labels = pd.read_csv(self.label_file).values
        normal_flows, abnormal_flows = split_normal_abnormal(flows, labels)
        # augment the abnormal flows, capped by the 0.9 quantile
        # of the normal flow durations
        max_interval = np.quantile(
            [_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
        abnormal_flows = augment_flows(abnormal_flows,
                                       step=1,
                                       max_interval=max_interval)
        meta = {
            'normal_flows': normal_flows,
            'abnormal_flows': abnormal_flows,
            'normal_pcap': self.pcap_file,
            # note: the label file is stored under the 'abnormal_pcap' key
            'abnormal_pcap': self.label_file,
            'direction': direction,
            'in_dir': in_dir
        }

        return meta
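
For orientation, a rough sketch of what an IP filter like filter_ip might do, written here with scapy. This is an assumption, not the repo's implementation: the real helper may shell out to tshark or similar, and the keep_original option is not modeled:

    from scapy.all import IP, PcapReader, PcapWriter

    def filter_ip_sketch(pcap_in, out_file, ips, direction='src'):
        # keep only packets whose src (or dst) address is in `ips`
        writer = PcapWriter(out_file, append=False)
        for pkt in PcapReader(pcap_in):
            if IP not in pkt:
                continue
            addr = pkt[IP].src if direction == 'src' else pkt[IP].dst
            if addr in ips:
                writer.write(pkt)
        writer.close()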