def test_train():
    """Run HostFootprint's training path end to end on the sample CSV."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Work on a throwaway copy so the checked-in fixtures stay pristine.
        data_dir = os.path.join(tmpdir, 'test_data')
        shutil.copytree('./tests/test_data', data_dir)
        csv_path = os.path.join(data_dir, 'combined.csv')
        # HostFootprint.main() parses its CLI options from sys.argv.
        sys.argv = hf_args(tmpdir, 'train', csv_path)
        HostFootprint().main()
def test_predict_num_roles():
    """Train then predict on datasets with different numbers of distinct
    roles, checking the number of predictions produced for each."""
    # Map each fixture CSV to the prediction count it should yield.
    expected_counts = {
        'combined_three_roles.csv': 6,
        'combined_two_roles.csv': 4,
    }
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = os.path.join(tmpdir, 'test_data')
        shutil.copytree('./tests/test_data', data_dir)
        for csv_name, expected_len in expected_counts.items():
            csv_path = os.path.join(data_dir, csv_name)
            # First train a model on this dataset.
            sys.argv = hf_args(tmpdir, 'train', csv_path)
            HostFootprint().main()
            # Then run prediction with a fresh instance.
            sys.argv = hf_args(tmpdir, 'predict', csv_path)
            predictor = HostFootprint()
            predictor.main()
            predictions = json.loads(predictor.predict())
            assert isinstance(predictions, dict)
            assert len(predictions) == expected_len
def test_serialize_scaler():
    """Round-trip a fitted StandardScaler through (de)serialization and
    confirm the learned means survive."""
    instance = HostFootprint()
    fitted = StandardScaler()
    fitted.fit([[i, i] for i in range(99)])
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'scaler.mod')
        instance.serialize_scaler(fitted, path)
        restored = instance.deserialize_scaler(path)
        # Two input columns -> two per-feature means.
        assert len(fitted.mean_) == 2
        assert fitted.mean_.tolist() == restored.mean_.tolist()
def test_list_model():
    """Listing the features of a known model file returns them in order."""
    instance = HostFootprint()
    instance.model_path = './tests/test_data/list_test.json'
    instance.list = 'features'
    assert instance.list_model() == ["foo", "bar", "baz"]
def test_serialize_label_encoder():
    """Round-trip a fitted LabelEncoder through (de)serialization and
    confirm classes and inverse transforms are preserved."""
    instance = HostFootprint()
    roles = ['printer', 'workstation', 'server']
    encoder = preprocessing.LabelEncoder()
    encoder.fit(roles)
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'le.json')
        instance.serialize_label_encoder(encoder, path)
        restored = instance.deserialize_label_encoder(path)
        assert encoder.classes_.tolist() == restored.classes_.tolist()
        # The restored encoder must invert the original's encodings.
        decoded = restored.inverse_transform(encoder.transform(roles))
        assert decoded.tolist() == roles
def test_train_bad_data_too_few_columns():
    """Training on a malformed CSV with too few fields must raise."""
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = os.path.join(tmpdir, 'test_data')
        shutil.copytree('./tests/test_data', data_dir)
        bad_csv = os.path.join(data_dir, 'bad_data_too_few_columns.csv')
        sys.argv = hf_args(tmpdir, 'train', bad_csv)
        instance = HostFootprint()
        with pytest.raises(Exception):
            instance.main()
def test_get_individual_predictions():
    """Per-host predictions list roles sorted by confidence, and fall back
    to 'Unknown' as top_role when no probability is high enough."""
    encoder = preprocessing.LabelEncoder()
    encoder.fit(['asomething', 'bsomething'])
    filenames = ['firstfile']
    macs = np.array(['mac1'])
    src_ips = np.array(["['1.1.1.1']"])
    instance = HostFootprint()

    # High-confidence probabilities: the best-scoring role wins.
    confident = instance.get_individual_predictions(
        [[0.6, 0.7]], encoder, filenames, macs, src_ips, None)
    assert confident == {
        'firstfile': [{
            'top_role': 'bsomething',
            'role_list': [('bsomething', 0.7), ('asomething', 0.6)],
            'source_ip': '1.1.1.1',
            'source_mac': 'mac1',
        }]
    }

    # Low-confidence probabilities: top_role degrades to 'Unknown',
    # but the full role list is still reported in descending order.
    uncertain = instance.get_individual_predictions(
        [[0.2, 0.1]], encoder, filenames, macs, src_ips, None)
    assert uncertain == {
        'firstfile': [{
            'top_role': 'Unknown',
            'role_list': [('asomething', 0.2), ('bsomething', 0.1)],
            'source_ip': '1.1.1.1',
            'source_mac': 'mac1',
        }]
    }
def test_serialize_model():
    """Round-trip a hand-populated MLPClassifier through model
    (de)serialization and verify the restored attributes match.

    NOTE(review): the original body constructed a GridSearchCV whose
    return value was discarded — it was never fitted, assigned, or used,
    so that dead call (and the `parameters` dict that only fed it) has
    been removed; the assertions below are unaffected.
    """
    instance = HostFootprint()
    model = MLPClassifier()

    # Minimal LabelBinarizer state the serializer is expected to persist.
    label_binarizer = LabelBinarizer()
    label_binarizer.neg_label = 0
    label_binarizer.pos_label = 1
    label_binarizer.sparse_output = False
    label_binarizer.y_type_ = "binary"
    label_binarizer.sparse_input_ = False
    label_binarizer.classes_ = np.array([0])

    # Populate the "fitted" attributes by hand instead of training.
    model.coefs_ = np.array([[1], [2]])
    model.loss_ = 42
    model.intercepts_ = np.array([[3], [4]])
    model.classes_ = np.array([[5], [6]])
    model.n_iter_ = 42
    model.n_layers_ = 2
    model.n_outputs_ = 1
    model.out_activation_ = "logistic"
    model._label_binarizer = label_binarizer
    model.features = ['test_1', 'test_2', 'test_3']

    with tempfile.TemporaryDirectory() as tmpdir:
        model_file = os.path.join(tmpdir, 'host_footprint.json')
        instance.serialize_model(model, model_file)
        new_model = instance.deserialize_model(model_file)
        assert model.features == new_model.features
        print(f"model params: {model.get_params()}")
        print(f"new_model params: {new_model.get_params()}")
        assert len(model.get_params()['hidden_layer_sizes']) == len(
            new_model.get_params()['hidden_layer_sizes'])
        assert model._label_binarizer.y_type_ == new_model._label_binarizer.y_type_
        assert len(model.coefs_) == len(new_model.coefs_)
        assert len(model.intercepts_) == len(new_model.intercepts_)
def run_algorithm_stage(self, in_path):
    """Build the algorithm-stage argument list and run HostFootprint on
    *in_path*, returning whatever main() produces."""
    args = self.add_opt_args(self.stage_args['algorithm'])
    args += ['-O', self.operation, '-v', self.log_level, in_path]
    return HostFootprint(raw_args=args).main()