Beispiel #1
0
def test_resolve_patterns_in_dataset_repository_sorted_files():
    """Check that files resolved from a dataset repository come back sorted.

    The sibling names are deliberately listed out of order: with an
    already-sorted input the final assertion would also pass for a resolver
    that simply preserves the listing order, and the test would prove nothing.
    """
    unsorted_names = ["3.txt", "0.txt", "2.txt"]
    # Guard the fixture itself so the test can never silently degrade again.
    assert unsorted_names != sorted(unsorted_names)
    siblings = [{"rfilename": name} for name in unsorted_names]
    datasets_infos = DatasetInfo(id="test_unsorted_files", siblings=siblings, sha="foobar")
    resolved_data_files = resolve_patterns_in_dataset_repository(datasets_infos, ["*"])
    # Compare by base name only: each resolved entry is reduced with os.path.basename.
    resolved_names = [os.path.basename(data_file) for data_file in resolved_data_files]
    assert resolved_names == sorted(unsorted_names)
def hub_dataset_info(complex_data_dir):
    """Build a ``DatasetInfo`` listing every regular file under ``complex_data_dir``.

    Each file becomes one sibling entry whose ``"rfilename"`` is the file's
    path relative to ``complex_data_dir``, rendered with forward slashes.
    """
    sibling_entries = []
    for entry in Path(complex_data_dir).rglob("*"):
        if not entry.is_file():
            # Directories (and anything else that is not a file) are not siblings.
            continue
        relative_name = entry.relative_to(complex_data_dir).as_posix()
        sibling_entries.append({"rfilename": relative_name})
    return DatasetInfo(siblings=sibling_entries, sha="foobarfoobar", id="foo")
Beispiel #3
0
def test_fail_resolve_data_files_in_dataset_repository(complex_data_dir):
    """Resolving the pattern ``"blablabla"`` is expected to raise ``FileNotFoundError``.

    The repository description is built from every regular file found under
    ``complex_data_dir``, stored as POSIX-style relative paths.
    """
    root = Path(complex_data_dir)
    sibling_entries = []
    for entry in root.rglob("*"):
        if entry.is_file():
            sibling_entries.append({"rfilename": entry.relative_to(complex_data_dir).as_posix()})
    repo_info = DatasetInfo(siblings=sibling_entries)

    with pytest.raises(FileNotFoundError):
        _resolve_data_files_in_dataset_repository(repo_info, "blablabla")
Beispiel #4
0
def test_resolve_data_files_in_dataset_repository_with_extensions(complex_data_dir, pattern, size, extensions):
    """Resolve ``pattern`` restricted to ``extensions`` and check the match count.

    When ``size`` is positive, exactly ``size`` files must be resolved.
    Otherwise the resolution is expected to raise ``FileNotFoundError``.
    """
    sibling_entries = []
    for entry in Path(complex_data_dir).rglob("*"):
        if entry.is_file():
            sibling_entries.append({"rfilename": entry.relative_to(complex_data_dir).as_posix()})
    repo_info = DatasetInfo(siblings=sibling_entries)

    if not size > 0:
        # Nothing is expected to match: the resolver must fail loudly.
        with pytest.raises(FileNotFoundError):
            _resolve_data_files_in_dataset_repository(
                repo_info, pattern, allowed_extensions=extensions
            )
        return

    matches = _resolve_data_files_in_dataset_repository(
        repo_info, pattern, allowed_extensions=extensions
    )
    assert len(matches) == size
Beispiel #5
0
def test_resolve_data_files_in_dataset_repository(complex_data_dir, pattern, size):
    """Resolve ``pattern`` against a repository mirroring ``complex_data_dir``.

    The resolved entries are compared with a recursive glob of the same
    pattern on the local directory, skipping ``.dummy`` and ``README.md``.
    They must also be ``PurePath`` instances that point at existing files.
    """
    root = Path(complex_data_dir)

    sibling_entries = []
    for entry in root.rglob("*"):
        if entry.is_file():
            sibling_entries.append({"rfilename": entry.relative_to(complex_data_dir).as_posix()})
    repo_info = DatasetInfo(siblings=sibling_entries)

    resolved = _resolve_data_files_in_dataset_repository(repo_info, pattern)

    # Build the reference listing straight from the local directory tree.
    ignored_names = {".dummy", "README.md"}
    expected = []
    for candidate in root.rglob(pattern):
        if candidate.name in ignored_names or not candidate.is_file():
            continue
        expected.append(candidate.relative_to(complex_data_dir))

    assert len(resolved) == size
    assert sorted(resolved) == sorted(expected)
    for resolved_path in resolved:
        assert isinstance(resolved_path, PurePath)
    for resolved_path in resolved:
        assert (root / resolved_path).is_file()