def test_split_default_args(mocker):
    """Check the shapes returned by ``split.split`` for a 20-row dataset.

    ``split.parse_args`` is patched to return ``mock_args`` built with only
    ``dataset_path`` and ``dataset_name``; every other argument is whatever
    ``mock_args`` supplies (presumably the defaults -- confirm in ``mock_args``).
    A 20x2 CSV with columns ``value`` and ``target`` is expected to come back as
    10 training and 10 test rows with a single feature column.
    """
    directory = tests_utils.get_directory(
        "dataset", pathlib.Path(__file__).stem, inspect.currentframe().f_code.co_name
    )
    # Clean up in ``finally`` so a failing assertion does not leave the scratch
    # directory behind; ``mkdir(exist_ok=False)`` below would trip over leftovers.
    try:
        dataset_name = "ids2018full"
        mocker.patch(
            "nd00333.dataset.split.split.parse_args",
            return_value=mock_args(
                dataset_path=directory,
                dataset_name=dataset_name,
            ),
        )

        directory_full = pathlib.Path(directory, dataset_name)
        directory_full.mkdir(parents=False, exist_ok=False)
        filename = pathlib.Path(directory_full, "data.csv")
        data = list(zip(range(20), range(20)))  # 20x2 list of tuples
        columns = ["value", "target"]
        pd.DataFrame(data=data, columns=columns).to_csv(filename, index=False)

        x_train, x_test, y_train, y_test = split.split(split.parse_args())

        assert x_train.shape == (10, 1)
        assert x_test.shape == (10, 1)
        assert y_train.shape == (10,)
        assert y_test.shape == (10,)
    finally:
        shutil.rmtree(directory)
def test_datastore_upload_files_overwrite_false_succeed(mocker):
    """Check ``register.datastore_upload_files`` with ``upload_files`` stubbed out.

    ``register.parse_args`` is patched to return ``mock_args`` pointing at the
    scratch directory (parent as ``dataset_path``, leaf name as
    ``dataset_name``) with ``dataset_version="2"``; ``register.upload_files`` is
    patched to return an empty string. The call is expected to return a
    one-element ``datastore_path`` and a ``target_path`` of the form
    ``<dataset_name>_<dataset_version>``.
    """
    directory = tests_utils.get_directory(
        "dataset", pathlib.Path(__file__).stem, inspect.currentframe().f_code.co_name
    )
    # Clean up in ``finally`` so a failing assertion does not leave files behind.
    try:
        mocker.patch(
            "nd00333.dataset.register.register.parse_args",
            return_value=mock_args(
                dataset_path=directory.parent,
                dataset_name=directory.name,
                dataset_version="2",
            ),
        )
        mocker.patch(
            "nd00333.dataset.register.register.upload_files",
            return_value="",
        )

        filename = pathlib.Path(directory, "data.csv")
        data = [["Benign"]]
        columns = ["Label"]
        pd.DataFrame(data=data, columns=columns).to_csv(filename, index=False)

        # Parse once and reuse: the patched parse_args returns the same object
        # on every call, so a second call adds nothing.
        args = register.parse_args()
        datastore_path, target_path = register.datastore_upload_files(args)

        assert len(datastore_path) == 1
        assert target_path == f"{args.dataset_name}_{args.dataset_version}"
    finally:
        shutil.rmtree(directory)
def test_get_df_from_csv_default():
    """Check that ``load.get_df_from_csv`` reads a one-column string CSV.

    A CSV with a single ``Label`` column holding ``"Benign"`` is written to the
    scratch directory; the loaded frame is expected to have that one column
    with ``object`` dtype.
    """
    directory = tests_utils.get_directory(
        "dataset", pathlib.Path(__file__).stem, inspect.currentframe().f_code.co_name
    )
    # Clean up in ``finally`` so a failing assertion does not leave files behind.
    try:
        filename = pathlib.Path(directory, "data.csv")
        data = [["Benign"]]
        columns = ["Label"]
        pd.DataFrame(data=data, columns=columns).to_csv(filename, index=False)

        df = load.get_df_from_csv(filename)

        assert list(df.columns) == ["Label"]
        assert list(df.dtypes) == ["object"]
    finally:
        shutil.rmtree(directory)
def test_get_df_from_csv_dtype_int():
    """Check that ``load.get_df_from_csv`` yields an integer ``Label`` column.

    The CSV is produced from a NumPy structured array whose only field,
    ``Label``, is declared with ``np.dtype(int)``; the loaded frame is expected
    to have that one column with a dtype that compares equal to ``"int"``.
    """
    directory = tests_utils.get_directory(
        "dataset", pathlib.Path(__file__).stem, inspect.currentframe().f_code.co_name
    )
    # Clean up in ``finally`` so a failing assertion does not leave files behind.
    try:
        filename = pathlib.Path(directory, "data.csv")
        data = ["123", "456"]
        dtype = [("Label", np.dtype(int))]
        records = np.array(data, dtype=dtype)
        pd.DataFrame.from_records(records).to_csv(filename, index=False)

        df = load.get_df_from_csv(filename)

        assert list(df.columns) == ["Label"]
        assert list(df.dtypes) == ["int"]
    finally:
        shutil.rmtree(directory)
def test_get_df_from_csv_usecols():
    """Check that ``usecols`` restricts the columns ``load.get_df_from_csv`` returns.

    The CSV has an integer ``dummy1`` column and an object ``Label`` column;
    loading with ``usecols=["Label"]`` is expected to return only ``Label``,
    with ``object`` dtype.
    """
    directory = tests_utils.get_directory(
        "dataset", pathlib.Path(__file__).stem, inspect.currentframe().f_code.co_name
    )
    # Clean up in ``finally`` so a failing assertion does not leave files behind.
    try:
        filename = pathlib.Path(directory, "data.csv")
        data = [("1", "Benign"), ("2", "Malicious")]
        dtype = [("dummy1", np.dtype(int)), ("Label", np.dtype(object))]
        records = np.array(data, dtype=dtype)
        usecols = ["Label"]
        pd.DataFrame.from_records(records).to_csv(filename, index=False)

        df = load.get_df_from_csv(filename, usecols=usecols)

        assert list(df.columns) == ["Label"]
        assert list(df.dtypes) == ["object"]
    finally:
        shutil.rmtree(directory)
def test_get_df_from_directory_default():
    """Check that ``load.get_df_from_directory`` combines every CSV in a directory.

    Three files, ``0.csv``, ``1.csv`` and ``2.csv``, each hold a single
    ``Label`` value equal to their number. The loaded frame is expected to
    equal a three-row ``Label`` frame ``[[0], [1], [2]]`` with a default index.
    """
    directory = tests_utils.get_directory(
        "dataset", pathlib.Path(__file__).stem, inspect.currentframe().f_code.co_name
    )
    # Clean up in ``finally`` so a failing assertion does not leave files behind.
    try:
        columns = ["Label"]
        # ``index`` rather than ``iter`` to avoid shadowing the builtin.
        for index in [0, 1, 2]:
            pd.DataFrame(data=[index], columns=columns).to_csv(
                pathlib.Path(directory, f"{index}.csv"), index=False
            )

        df = load.get_df_from_directory(directory)

        pd.testing.assert_frame_equal(
            df,
            pd.DataFrame(
                data=[[0], [1], [2]],
                columns=columns,
            ),
        )
    finally:
        shutil.rmtree(directory)
def test_split_nondefault_args(mocker):
    """Check the shapes returned by ``split.split`` with explicit arguments.

    ``split.parse_args`` is patched to return ``mock_args`` with custom
    train/test dataset names, ``sample_fraction=0.5``, ``test_size=0.3`` and
    ``target_label="label"``. For a 20x2 CSV with columns ``value`` and
    ``label`` the test expects 7 training and 3 test rows with a single feature
    column, which is consistent with sampling half the rows and then holding
    out 30% of them.
    """
    directory = tests_utils.get_directory(
        "dataset", pathlib.Path(__file__).stem, inspect.currentframe().f_code.co_name
    )
    # Clean up in ``finally`` so a failing assertion does not leave the scratch
    # directory behind; ``mkdir(exist_ok=False)`` below would trip over leftovers.
    try:
        dataset_name = "full"
        mocker.patch(
            "nd00333.dataset.split.split.parse_args",
            return_value=mock_args(
                dataset_path=directory,
                dataset_name=dataset_name,
                dataset_name_train="training",
                dataset_name_test="testing",
                sample_fraction=0.5,
                test_size=0.3,
                target_label="label",
            ),
        )

        directory_full = pathlib.Path(directory, dataset_name)
        directory_full.mkdir(parents=False, exist_ok=False)
        filename = pathlib.Path(directory_full, "data.csv")
        data = list(zip(range(20), range(20)))  # 20x2 list of tuples
        columns = ["value", "label"]
        pd.DataFrame(data=data, columns=columns).to_csv(filename, index=False)

        x_train, x_test, y_train, y_test = split.split(split.parse_args())

        assert x_train.shape == (7, 1)
        assert x_test.shape == (3, 1)
        assert y_train.shape == (7,)
        assert y_test.shape == (3,)
    finally:
        shutil.rmtree(directory)