def test_split_default_args(mocker):
    """Check ``split.split`` output shapes when only path and name are set.

    A 20-row CSV with ``value`` and ``target`` columns is written to
    ``<directory>/ids2018full/data.csv`` and ``split.parse_args`` is patched
    to return ``mock_args`` built from just the dataset path and name.  The
    test expects 10 training and 10 test rows with a single feature column.
    """
    directory = tests_utils.get_directory(
        "dataset",
        pathlib.Path(__file__).stem,
        inspect.currentframe().f_code.co_name,
    )
    # Clean up in ``finally`` so a failing assertion does not leave the tree
    # on disk; a leftover tree could make the strict ``mkdir`` below fail on
    # the next run.
    try:
        dataset_name = "ids2018full"
        mocker.patch(
            "nd00333.dataset.split.split.parse_args",
            return_value=mock_args(
                dataset_path=directory,
                dataset_name=dataset_name,
            ),
        )
        directory_full = pathlib.Path(directory, dataset_name)
        directory_full.mkdir(parents=False, exist_ok=False)
        filename = pathlib.Path(directory_full, "data.csv")
        rows = list(zip(range(20), range(20)))  # 20x2 list of tuples
        # NOTE(review): "target" is presumably the default target label of
        # mock_args -- confirm against its definition.
        pd.DataFrame(data=rows, columns=["value", "target"]).to_csv(
            filename, index=False
        )
        x_train, x_test, y_train, y_test = split.split(split.parse_args())
        assert x_train.shape == (10, 1)
        assert x_test.shape == (10, 1)
        assert y_train.shape == (10, )
        assert y_test.shape == (10, )
    finally:
        shutil.rmtree(directory)
# Example #2
0
def test_datastore_upload_files_overwrite_false_succeed(mocker):
    """Check the values returned by ``register.datastore_upload_files``.

    ``register.parse_args`` is patched to return ``mock_args`` pointing at a
    directory holding a single one-row ``data.csv``, and
    ``register.upload_files`` is patched to return an empty string.  The test
    expects one datastore path and a target path of the form
    ``<dataset_name>_<dataset_version>``.

    NOTE(review): nothing visible here sets an "overwrite" flag; it is
    presumably a default of ``mock_args`` -- confirm against its definition.
    """
    directory = tests_utils.get_directory(
        "dataset", pathlib.Path(__file__).stem, inspect.currentframe().f_code.co_name
    )
    # Clean up in ``finally`` so a failing assertion does not leave the
    # directory on disk.
    try:
        mocker.patch(
            "nd00333.dataset.register.register.parse_args",
            return_value=mock_args(
                dataset_path=directory.parent,
                dataset_name=directory.name,
                dataset_version="2",
            ),
        )
        mocker.patch(
            "nd00333.dataset.register.register.upload_files",
            return_value="",
        )
        filename = pathlib.Path(directory, "data.csv")
        pd.DataFrame(data=[["Benign"]], columns=["Label"]).to_csv(
            filename, index=False
        )
        # The patched parse_args returns the same object on every call, so
        # the parsed arguments are fetched once and reused.
        args = register.parse_args()
        datastore_path, target_path = register.datastore_upload_files(args)
        assert len(datastore_path) == 1
        assert target_path == f"{args.dataset_name}_{args.dataset_version}"
    finally:
        shutil.rmtree(directory)
def test_get_df_from_csv_default():
    """Check ``load.get_df_from_csv`` on a one-column string CSV.

    A CSV with a single ``Label`` column holding ``"Benign"`` is written and
    read back; the test expects one ``Label`` column of ``object`` dtype.
    """
    directory = tests_utils.get_directory(
        "dataset",
        pathlib.Path(__file__).stem,
        inspect.currentframe().f_code.co_name,
    )
    # Clean up in ``finally`` so a failing assertion does not leave the
    # directory on disk.
    try:
        filename = pathlib.Path(directory, "data.csv")
        pd.DataFrame(data=[["Benign"]], columns=["Label"]).to_csv(
            filename, index=False
        )
        df = load.get_df_from_csv(filename)
        assert list(df.columns) == ["Label"]
        assert list(df.dtypes) == ["object"]
    finally:
        shutil.rmtree(directory)
def test_get_df_from_csv_dtype_int():
    """Check ``load.get_df_from_csv`` on a one-column integer CSV.

    Two values are stored in a NumPy record array with an integer ``Label``
    field, written to CSV and read back; the test expects one ``Label``
    column whose dtype compares equal to ``"int"``.
    """
    directory = tests_utils.get_directory(
        "dataset",
        pathlib.Path(__file__).stem,
        inspect.currentframe().f_code.co_name,
    )
    # Clean up in ``finally`` so a failing assertion does not leave the
    # directory on disk.
    try:
        filename = pathlib.Path(directory, "data.csv")
        data = ["123", "456"]
        dtype = [("Label", np.dtype(int))]
        records = np.array(data, dtype=dtype)
        pd.DataFrame.from_records(records).to_csv(filename, index=False)
        df = load.get_df_from_csv(filename)
        assert list(df.columns) == ["Label"]
        # NOTE(review): "int" is presumably resolved to the platform default
        # integer dtype for this comparison -- confirm it matches the dtype
        # read from the CSV on every supported platform.
        assert list(df.dtypes) == ["int"]
    finally:
        shutil.rmtree(directory)
def test_get_df_from_csv_usecols():
    """Check that ``usecols`` limits the columns ``get_df_from_csv`` returns.

    A CSV with an integer ``dummy1`` column and a string ``Label`` column is
    written, then read back with ``usecols=["Label"]``; the test expects only
    the ``Label`` column, of ``object`` dtype.
    """
    directory = tests_utils.get_directory(
        "dataset",
        pathlib.Path(__file__).stem,
        inspect.currentframe().f_code.co_name,
    )
    # Clean up in ``finally`` so a failing assertion does not leave the
    # directory on disk.
    try:
        filename = pathlib.Path(directory, "data.csv")
        data = [("1", "Benign"), ("2", "Malicious")]
        dtype = [("dummy1", np.dtype(int)), ("Label", np.dtype(object))]
        records = np.array(data, dtype=dtype)
        pd.DataFrame.from_records(records).to_csv(filename, index=False)
        df = load.get_df_from_csv(filename, usecols=["Label"])
        assert list(df.columns) == ["Label"]
        assert list(df.dtypes) == ["object"]
    finally:
        shutil.rmtree(directory)
def test_get_df_from_directory_default():
    """Check ``load.get_df_from_directory`` on three one-row CSV files.

    Files ``0.csv``, ``1.csv`` and ``2.csv`` are written, each with a single
    ``Label`` value equal to its number.  The test expects the loaded frame
    to equal a three-row ``Label`` frame holding 0, 1, 2.
    """
    directory = tests_utils.get_directory(
        "dataset",
        pathlib.Path(__file__).stem,
        inspect.currentframe().f_code.co_name,
    )
    # Clean up in ``finally`` so a failing assertion does not leave the
    # directory on disk.
    try:
        columns = ["Label"]
        # ``index`` rather than ``iter`` so the builtin is not shadowed.
        for index in [0, 1, 2]:
            pd.DataFrame(data=[index], columns=columns).to_csv(
                pathlib.Path(directory, str(index) + ".csv"),
                index=False,
            )
        df = load.get_df_from_directory(directory)
        pd.testing.assert_frame_equal(
            df,
            pd.DataFrame(
                data=[[0], [1], [2]],
                columns=columns,
            ),
        )
    finally:
        shutil.rmtree(directory)
def test_split_nondefault_args(mocker):
    """Check ``split.split`` output shapes with every argument overridden.

    A 20-row CSV with ``value`` and ``label`` columns is written to
    ``<directory>/full/data.csv`` and ``split.parse_args`` is patched to
    return ``mock_args`` with custom dataset names, ``sample_fraction=0.5``,
    ``test_size=0.3`` and ``target_label="label"``.  The test expects 7
    training and 3 test rows with a single feature column.
    """
    directory = tests_utils.get_directory(
        "dataset",
        pathlib.Path(__file__).stem,
        inspect.currentframe().f_code.co_name,
    )
    # Clean up in ``finally`` so a failing assertion does not leave the tree
    # on disk; a leftover tree could make the strict ``mkdir`` below fail on
    # the next run.
    try:
        dataset_name = "full"
        target_label = "label"
        mocker.patch(
            "nd00333.dataset.split.split.parse_args",
            return_value=mock_args(
                dataset_path=directory,
                dataset_name=dataset_name,
                dataset_name_train="training",
                dataset_name_test="testing",
                sample_fraction=0.5,
                test_size=0.3,
                target_label=target_label,
            ),
        )
        directory_full = pathlib.Path(directory, dataset_name)
        directory_full.mkdir(parents=False, exist_ok=False)
        filename = pathlib.Path(directory_full, "data.csv")
        rows = list(zip(range(20), range(20)))  # 20x2 list of tuples
        pd.DataFrame(data=rows, columns=["value", target_label]).to_csv(
            filename, index=False
        )
        x_train, x_test, y_train, y_test = split.split(split.parse_args())
        # Presumably 20 rows * 0.5 sample fraction = 10 rows, split 70/30.
        assert x_train.shape == (7, 1)
        assert x_test.shape == (3, 1)
        assert y_train.shape == (7, )
        assert y_test.shape == (3, )
    finally:
        shutil.rmtree(directory)