def test_extract_only(self):
    """extract file-labeled & flag-labeled features to path

    Runs ``learn extract`` with three inputs at once: the demo pcap labeled
    by its CSV file, a pcap flagged as normal and a pcap flagged as abnormal.
    The written feature file is then loaded back and the label vector is
    checked segment by segment.

    The assertions assume that rows appear in the order the inputs are
    given on the command line (file-labeled, then normal, then abnormal).
    """
    feature_path = TEMPDIR_PATH / 'netml-test-features-a'

    self.try_execute(
        'learn',
        'extract',
        '-p', DATA_PATH / 'demo.pcap',
        '-l', DATA_PATH / 'demo.csv',
        '--pcap-normal', DATA_PATH / 'srcIP_10.42.0.1' / 'srcIP_10.42.0.1_normal.pcap',
        '--pcap-abnormal', DATA_PATH / 'srcIP_10.42.0.1' / 'srcIP_10.42.0.119_anomaly.pcap',
        '-f', feature_path,
    )

    (features, labels) = load_data(feature_path)

    # expected number of rows contributed by each input
    count_file_labeled = 88
    count_normal = 4979
    count_abnormal = 310
    count_total = count_file_labeled + count_normal + count_abnormal

    # segment boundaries within the label vector
    normal_start = count_file_labeled
    normal_stop = count_file_labeled + count_normal

    self.assertEqual(len(features), count_total)
    self.assertEqual(len(labels), count_total)

    # 0 is normal; 1 is abnormal

    # the file-labeled segment holds a mix of both classes
    self.assertTrue(labels[:normal_start].any())
    self.assertFalse(labels[:normal_start].all())

    # the whole flag-labeled normal segment must be 0; the stop index is
    # offset by the file-labeled rows, otherwise the tail of this segment
    # would go unchecked
    self.assertFalse(labels[normal_start:normal_stop].any())

    # everything after that is the flag-labeled abnormal segment
    self.assertTrue(labels[normal_stop:].all())
def main(data_file=DATA_FILE):
    """Train and evaluate one detection model on the data stored in ``data_file``.

    The data is loaded, split into train and test portions, and a model
    built by ``generate_model`` is wrapped in ``MODEL`` for training and
    testing. The model and the wrapper's history are dumped next to the
    input file, and the timings and score are printed.

    ``data_file`` must expose a ``.parent`` attribute supporting ``/``,
    because the result path is derived from it.
    """
    # Read features (X) and labels (y) from disk.
    X, y = load_data(data_file)

    # Hold out 33% of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.33,
        random_state=RANDOM_STATE,
    )
    shape_summary = (
        f'X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}, y_train.shape: {y_train.shape}, '
        f'y_test.shape: {y_test.shape}'
    )
    print(shape_summary)

    # Per the original note, the supported names are:
    # 'OCSVM', 'KDE', 'IF', 'AE', 'GMM', 'PCA'
    model_name = 'OCSVM'
    print(f'model_name: {model_name}')

    # Build the detection model and wrap it for training/scoring.
    model = generate_model(model_name)
    ndm = MODEL(
        model,
        score_metric='auc',
        verbose=10,
        random_state=RANDOM_STATE,
    )

    # Fit on the training portion only (labels are not passed to train).
    ndm.train(X_train)

    # Score the fitted model against the held-out portion.
    ndm.test(X_test, y_test)

    # Persist the model together with the wrapper's history.
    results_name = f'{ndm.model_name}-results.dat'
    out_file = data_file.parent / results_name
    dump_data((model, ndm.history), out_file=out_file)

    print(ndm.train.tot_time, ndm.test.tot_time, ndm.score)
def __call__(self, args, parser):
    """Classify the flows of a pcap with a stored model and print the results.

    The pcap is taken from ``args.pcap`` or, when that is ``None``, from
    standard input (``parser.error`` is invoked if standard input is a
    terminal). The model is loaded from ``args.model`` and applied to the
    extracted features; one line is printed per reported flow.

    Flows classified as normal are printed only when ``args.report_all``
    is truthy or, if it is ``None``, when ``args.verbosity`` exceeds 1.
    """
    source = args.pcap

    if source is None:
        # no pcap argument: only acceptable when something is piped in
        if sys.stdin.isatty():
            parser.error("the following arguments are required "
                         "when standard input is not specified: -p/--pcap")

        source = sys.stdin.buffer

    pcap = self.extract(source)

    # TODO: catch model file issues such as EOFError
    (model, _history) = load_data(args.model)

    predictions = model.predict(pcap.features)

    for (flow, classification) in zip(pcap.flows, predictions):
        (flow_key, flow_packets) = flow

        if classification == CLASS_NORMAL:
            # an explicit report_all setting wins; otherwise fall back on verbosity
            suppressed = (
                args.verbosity <= 1
                if args.report_all is None
                else not args.report_all
            )
            if suppressed:
                continue

            class_tag = 'NORMAL'
        elif classification == CLASS_ABNORMAL:
            class_tag = 'ANOMALY' | colors.red | colors.bold
        else:
            class_tag = '[unclassified]'

        # the fifth element of the flow key is compared against 6 and 17
        # (the IP protocol numbers of TCP and UDP)
        protocol = flow_key[4]
        if protocol == 6:
            packet_type = 'TCP'
        elif protocol == 17:
            packet_type = 'UDP'
        else:
            packet_type = '[protocol-other]'

        # timestamps of the first and last packets of the flow
        first_seen = datetime.datetime.fromtimestamp(flow_packets[0].time)
        last_seen = datetime.datetime.fromtimestamp(flow_packets[-1].time)

        # only the first packet's date is shown
        packet_date = first_seen.date()
        packet_time0 = first_seen.time()
        packet_time1 = last_seen.time()

        print(
            f'[{packet_date}] [{packet_time0} – {packet_time1}]',
            f'{flow_key[0]}:{flow_key[2]} → {flow_key[1]}:{flow_key[3]} [{packet_type}]',
            class_tag,
        )
def test_extract_train(self):
    """extract features and train model

    Runs ``learn`` on the demo pcap without any labels, then loads the
    output and checks that it holds a model exposing ``predict`` along
    with a falsy training history.
    """
    model_path = TEMPDIR_PATH / 'netml-test-output-a'

    command = [
        'learn',
        '-p', DATA_PATH / 'demo.pcap',
        '-o', model_path,
    ]
    self.try_execute(*command)

    model, train_history = load_data(model_path)

    self.assertTrue(hasattr(model, 'predict'))

    # no labels were supplied, so no history is expected
    self.assertFalse(train_history)
def test_extract_only(self):
    """extract features to path

    Runs ``learn extract`` on the demo pcap without labels and checks that
    the written file loads back as a non-empty feature set with ``None``
    in place of labels.
    """
    feature_path = TEMPDIR_PATH / 'netml-test-features-a'

    command = [
        'learn',
        'extract',
        '-p', DATA_PATH / 'demo.pcap',
        '-f', feature_path,
    ]
    self.try_execute(*command)

    features, labels = load_data(feature_path)

    self.assertGreater(len(features), 0)

    # no label source was given
    self.assertIsNone(labels)
def test_extract_train(self):
    """extract features & labels and train & test model

    Runs ``learn`` on the demo pcap with its CSV label file, then checks
    that the output holds a model exposing ``predict`` and a training
    history containing a positive ``score``.
    """
    model_path = TEMPDIR_PATH / 'netml-test-output-a'

    command = [
        'learn',
        '-p', DATA_PATH / 'demo.pcap',
        '-l', DATA_PATH / 'demo.csv',
        '-o', model_path,
    ]
    self.try_execute(*command)

    model, train_history = load_data(model_path)

    self.assertTrue(hasattr(model, 'predict'))

    # labels were supplied, so a scored history is expected
    self.assertIsNot(train_history, None)
    self.assertIn('score', train_history)
    self.assertGreater(train_history['score'], 0)
def test_extract_multiple(self):
    """extract features from multiple unlabeled pcap files

    Passes two ``-p`` inputs to ``learn extract`` and checks the combined
    number of feature rows; labels are expected to be ``None``.
    """
    feature_path = TEMPDIR_PATH / 'netml-test-features-a'
    capture_dir = DATA_PATH / 'srcIP_10.42.0.1'

    command = [
        'learn',
        'extract',
        '-f', feature_path,

        # (filenames reflect what's in them but here we specify no labels for testing)
        '-p', capture_dir / 'srcIP_10.42.0.119_anomaly.pcap',
        '-p', capture_dir / 'srcIP_10.42.0.1_normal.pcap',
    ]
    self.try_execute(*command)

    features, labels = load_data(feature_path)

    self.assertEqual(len(features), 5289)
    self.assertIsNone(labels)
def test_extract_train(self):
    """extract flag-labeled features and train & test model

    Runs ``learn`` with one pcap flagged as normal and one flagged as
    abnormal, then checks that the output holds a model exposing
    ``predict`` and a training history containing a positive ``score``.
    """
    model_path = TEMPDIR_PATH / 'netml-test-output-a'
    capture_dir = DATA_PATH / 'srcIP_10.42.0.1'

    command = [
        'learn',
        '--pcap-normal', capture_dir / 'srcIP_10.42.0.1_normal.pcap',
        '--pcap-abnormal', capture_dir / 'srcIP_10.42.0.119_anomaly.pcap',
        '-o', model_path,
    ]
    self.try_execute(*command)

    model, train_history = load_data(model_path)

    self.assertTrue(hasattr(model, 'predict'))

    # the flags provide labels, so a scored history is expected
    self.assertIsNot(train_history, None)
    self.assertIn('score', train_history)
    self.assertGreater(train_history['score'], 0)
def test_extract_multiple(self):
    """extract features & labels from multiple pcap & label files

    Passes the demo pcap and its CSV label file twice each to
    ``learn extract`` and checks that features and labels both contain
    176 rows.
    """
    feature_path = TEMPDIR_PATH / 'netml-test-features-a'

    command = [
        'learn',
        'extract',
        '-f', feature_path,

        # the same capture, given twice
        '-p', DATA_PATH / 'demo.pcap',
        '-p', DATA_PATH / 'demo.pcap',

        # and its label file, given twice
        '-l', DATA_PATH / 'demo.csv',
        '-l', DATA_PATH / 'demo.csv',
    ]
    self.try_execute(*command)

    features, labels = load_data(feature_path)

    self.assertEqual(len(features), 176)
    self.assertEqual(len(labels), 176)
def test_extract_only(self):
    """extract flag-labeled features to path

    Runs ``learn extract`` with one pcap flagged as normal and one flagged
    as abnormal, then checks the row counts and that the label vector is
    all 0 for the first 4979 rows and all 1 afterwards.
    """
    feature_path = TEMPDIR_PATH / 'netml-test-features-a'
    capture_dir = DATA_PATH / 'srcIP_10.42.0.1'

    command = [
        'learn',
        'extract',
        '--pcap-normal', capture_dir / 'srcIP_10.42.0.1_normal.pcap',
        '--pcap-abnormal', capture_dir / 'srcIP_10.42.0.119_anomaly.pcap',
        '-f', feature_path,
    ]
    self.try_execute(*command)

    features, labels = load_data(feature_path)

    self.assertEqual(len(features), 5289)
    self.assertEqual(len(labels), 5289)

    # index at which the labels switch from normal to abnormal
    boundary = 4979

    # 0 is normal
    self.assertFalse(labels[:boundary].any())

    # 1 is abnormal
    self.assertTrue(labels[boundary:].all())
def perform_train(self, feature_file, output_file):
    """Train a model on stored features, test it when labels exist, and save it.

    Features (and optional labels) are loaded from ``feature_file``. With
    labels, the data is split into train and test portions and the model
    is also tested; without labels, all features are used for training and
    the test step is skipped. The model and the wrapper's history are
    dumped to ``output_file``.

    Raises ``ValueError`` when the configured parameter overrides cannot
    be bound to the model class's signature.
    """
    (features, labels) = load_data(feature_file)

    # train (and test split)
    #
    # TODO: might make sense to split testing into a separate "action," such that
    # TODO: it's done by default, but also can be applied modularly (on its own)
    #
    if labels is None:
        # nothing to test against: train on everything
        (features_train, features_test, labels_test) = (features, None, None)
    else:
        # the training labels are returned by the split but not used
        (
            features_train,
            features_test,
            _labels_train,
            labels_test,
        ) = sklearn.model_selection.train_test_split(
            features,
            labels,
            test_size=self.args.test_size,
            random_state=self.args.random_state,
        )

    shape_rows = [
        ('', 'features', 'labels'),
        ('train', features_train.shape, 'n/a'),
    ]
    if labels is None:
        shape_rows.append(('test', 'n/a', 'n/a'))
    else:
        shape_rows.append(('test', features_test.shape, labels_test.shape))

    self.vtable(2, shape_rows, title='data shapes')

    self.vprint(2, f'model name: {self.args.algorithm}')

    # param may be:
    # * unspecified (None)
    # * just a dict for this model
    # * dict with sub-dicts for many models
    params = self.args.param or {}

    per_model = params.get(self.args.algorithm)
    if isinstance(per_model, dict):
        params = per_model

    self.vprint(2, f'param override: {params}')

    model_class = self.load_algorithmic_model(self.args.algorithm)

    # check the overrides against the constructor signature up front,
    # so that a mismatch is reported with the offending params
    signature = inspect.signature(model_class)
    try:
        signature.bind(**params)
    except TypeError as exc:
        raise ValueError(
            f"model-training params failed to bind for {self.args.algorithm}:\n" +
            textwrap.indent(yaml.dump(params).strip(), ' ')
        ) from exc

    # create detection model
    model = model_class(random_state=self.args.random_state, **params)
    model.name = self.args.algorithm  # FIXME

    ndm = netml.ndm.model.MODEL(
        model,
        score_metric='auc',
        verbose=self.args.verbosity,
        random_state=self.args.random_state,
    )

    # train the model from the train set
    ndm.train(features_train)
    time_train = ndm.train.tot_time

    # evaluate the model (only possible with both test features and labels)
    if features_test is None or labels_test is None:
        time_test = None
    else:
        ndm.test(features_test, labels_test)
        time_test = ndm.test.tot_time

    # dump data to disk
    dump_data((model, ndm.history), out_file=output_file)

    performance_row = (
        time_train,
        'n/a' if time_test is None else time_test,
        # the wrapper may have no score when the test step was skipped
        getattr(ndm, 'score', 'n/a'),
    )
    self.vtable(1, [
        ('train time (m)', 'test time (m)', 'model score (auc)'),
        performance_row,
    ], title='training performance')