def dataset_name():
    """Exercises `DatasetName` construction, str/repr, and arg validation."""
    # NOTE(review): the name lacks the `test_` prefix, so pytest will not
    # collect this function — confirm whether renaming to
    # `test_dataset_name` is intended.
    # Bare name, no namespace.
    no_ns = naming.DatasetName('ds1')
    assert no_ns.name == 'ds1'
    assert no_ns.namespace is None
    assert str(no_ns) == 'ds1'
    assert repr(no_ns) == "DatasetName('ds1')"
    # Positional 'namespace:name' string.
    with_ns = naming.DatasetName('namespace123:ds1')
    assert with_ns.name == 'ds1'
    assert with_ns.namespace == 'namespace123'
    assert str(with_ns) == 'namespace123:ds1'
    assert repr(with_ns) == "DatasetName('namespace123:ds1')"
    # Explicit keyword arguments, equivalent to the string form above.
    kw_ns = naming.DatasetName(name='ds1', namespace='namespace123')
    assert kw_ns.name == 'ds1'
    assert kw_ns.namespace == 'namespace123'
    assert str(kw_ns) == 'namespace123:ds1'
    assert repr(kw_ns) == "DatasetName('namespace123:ds1')"
    # Keyword name only, namespace left unset.
    kw_only = naming.DatasetName(name='ds1')
    assert kw_only.name == 'ds1'
    assert kw_only.namespace is None
    assert str(kw_only) == 'ds1'
    assert repr(kw_only) == "DatasetName('ds1')"
    # Positional and keyword forms cannot be combined.
    with pytest.raises(ValueError, match='Mixing args and kwargs'):
        naming.DatasetName('namespace123', name='abc')
def _get_builder_names_single_namespace(
    ns_name: str,
    data_dir: epath.Path,
) -> List[str]:
    """Returns the `namespace:name` strings for datasets found in `data_dir`."""
    # `data_dir` might contain non-dataset folders, but inspecting each
    # candidate individually would cost too much, so filtering on the folder
    # name alone is an acceptable trade-off.
    names = []
    for builder_dir in _maybe_iterdir(data_dir):
        if not _is_valid_dataset_name(builder_dir.name):
            continue
        full_name = naming.DatasetName(namespace=ns_name, name=builder_dir.name)
        names.append(str(full_name))
    return names
def test_parse_builder_name_kwargs_with_kwargs():
    """`parse_builder_name_kwargs` forwards kwargs and rejects bad input."""
    parse = naming.parse_builder_name_kwargs
    # Extra kwargs are passed through untouched.
    expected = (naming.DatasetName('ds1'), {'data_dir': '/abc'})
    assert parse('ds1', data_dir='/abc') == expected
    # Version supplied both inside the name string and as a kwarg.
    with pytest.raises(TypeError, match='got multiple values for keyword arg'):
        parse('ds1:1.0.0', version='1.0.0')
    # Malformed name string (namespace after config/version parts).
    with pytest.raises(ValueError, match='Parsing builder name string .* failed'):
        parse('ds/config:ns:1.0.0')
def test_register_builder(dummy_register):  # pylint: disable=redefined-outer-name
    """Builders resolve per-namespace and validate `data_dir`/namespace."""
    kaggle_builder = dummy_register.builder(naming.DatasetName('kaggle:ds0'))
    assert 'kaggle' in kaggle_builder.data_path.parts
    # The same dataset name can be loaded from a different namespace.
    mlds_builder = dummy_register.builder(naming.DatasetName('mlds:ds0'))
    assert 'mlds' in mlds_builder.data_path.parts
    # Builder kwargs are forwarded; `data_dir` is accepted only when None.
    mlds_builder = dummy_register.builder(
        naming.DatasetName('mlds:ds0'),
        data_dir=None,
        version='1.0.0',
    )
    assert 'mlds' in mlds_builder.data_path.parts
    # A non-None `data_dir` is rejected.
    with pytest.raises(ValueError, match='`data_dir` cannot be set for'):
        dummy_register.builder(
            naming.DatasetName('mlds:ds0'), data_dir='/path/to/data_dir')
    # Unknown namespace.
    with pytest.raises(
        registered.DatasetNotFoundError, match='Namespace .* not found.'):
        dummy_register.builder(naming.DatasetName('non-existing-namespace:ds0'))
    # Known namespace but unknown dataset.
    with pytest.raises(registered.DatasetNotFoundError):
        dummy_register.builder(naming.DatasetName('other:ds0'))
def test_builder_cls(dummy_register):  # pylint: disable=redefined-outer-name
    """`builder_cls` installs the dataset code in the cache and reuses it."""
    # The dataset will be installed under the cache directory.
    installed_path = (
        cache.cache_path() / 'modules/tfds_community/kaggle/dummy_dataset')
    assert not installed_path.exists()  # Nothing installed yet.
    builder_cls = dummy_register.builder_cls(
        naming.DatasetName('kaggle:dummy_dataset'))
    assert builder_cls.name == 'dummy_dataset'
    # The install folder name is a deterministic hash of the dataset code.
    clshash = 'e58f413affd65c267bae7acbd27fd5ac673d3e3ae13c316ffc2a461d00c8ab56'
    assert installed_path / f'{clshash}/dummy_dataset.py' == builder_cls.code_path
    assert 'kaggle' in builder_cls.code_path.parts
    assert issubclass(builder_cls, dataset_builder.DatasetBuilder)
    assert not builder_cls.url_infos  # No checksums installed with the package
    # Exactly one (deterministically named) install exists in the cache.
    assert list(sorted(installed_path.iterdir())) == [installed_path / clshash]
    # A second lookup must hit the cache: any new download attempt raises.
    with mock.patch.object(
        register_package,
        '_download_and_cache',
        side_effect=ValueError('Dataset should have been cached already')):
        builder_cls2 = dummy_register.builder_cls(
            naming.DatasetName('kaggle:dummy_dataset'))
        assert builder_cls is builder_cls2
    # Datasets from different namespaces can share the same name.
    mlds_cls = dummy_register.builder_cls(
        naming.DatasetName('mlds:dummy_dataset'))
    assert 'mlds' in mlds_cls.code_path.parts
    assert issubclass(mlds_cls, dataset_builder.DatasetBuilder)
    # Checksums have been correctly installed for this package.
    assert 'http://dummy.org/data.txt' in mlds_cls.url_infos
    with pytest.raises(registered.DatasetNotFoundError):
        dummy_register.builder(naming.DatasetName('other:ds0'))
def test_dataset_package():
    """Exports/imports operation should be identity."""
    source = dataset_sources.DatasetSource.from_json(
        'github://<owner>/<name>/tree/<branch>/my_ds/ds.py',)
    pkg = register_package.DatasetPackage(
        name=naming.DatasetName('ns:ds'),
        source=source,
    )
    # Round-trip through json is the identity.
    assert register_package.DatasetPackage.from_json(pkg.to_json()) == pkg
    # Same round-trip property for the installed-package wrapper.
    # NOTE(review): 'instalation_date' spelling matches the field name
    # declared on `_InstalledPackage` — do not "fix" it here alone.
    installed = register_package._InstalledPackage(
        package=pkg,
        instalation_date=datetime.datetime.now(),
        hash='asdajhdadsadsad',
    )
    assert register_package._InstalledPackage.from_json(
        installed.to_json()) == installed
def test_naming_sorted():
    """`DatasetName` instances sort by (namespace, name)."""
    shuffled = [
        naming.DatasetName('zzz:aaa'),
        naming.DatasetName('aaa:zzz'),
        naming.DatasetName('aaa:aaa'),
    ]
    expected = [
        naming.DatasetName('aaa:aaa'),
        naming.DatasetName('aaa:zzz'),
        naming.DatasetName('zzz:aaa'),
    ]
    assert sorted(shuffled) == expected
def list_ds_packages_for_namespace(
    namespace: str,
    path: epath.Path,
) -> List[DatasetPackage]:
    """Returns the dataset names found in a specific directory.

    Directories that contain code should have the following structure:

    ```
    <path>/
        <dataset0>/
            <dataset0>.py
        <dataset1>/
            <dataset1>.py
        ...
    ```

    Additional files or folders which are not detected as datasets will be
    ignored (e.g. `__init__.py`).

    Args:
      namespace: Namespace of the datasets
      path: The directory path containing the datasets.

    Returns:
      ds_packages: The dataset packages found in the directory (sorted for
        determinism).

    Raises:
      FileNotFoundError: If the path cannot be reached.
    """
    if not path.exists():
        # Should be fault-tolerant in the future
        raise FileNotFoundError(f'Could not find datasets at {path}')
    all_packages = []
    # Sort the directory entries: the docstring promises a deterministic
    # result, but `iterdir()` order is filesystem-dependent.
    for ds_path in sorted(path.iterdir()):
        source = get_dataset_source(ds_path)
        if source:
            pkg = DatasetPackage(
                name=naming.DatasetName(namespace=namespace, name=ds_path.name),
                source=source,
            )
            all_packages.append(pkg)
    return all_packages
assert repr(name) == "DatasetName('namespace123:ds1')" name = naming.DatasetName(name='ds1') assert name.name == 'ds1' assert name.namespace is None assert str(name) == 'ds1' assert repr(name) == "DatasetName('ds1')" with pytest.raises(ValueError, match='Mixing args and kwargs'): name = naming.DatasetName('namespace123', name='abc') @pytest.mark.parametrize( ['name', 'result'], [ ('ds1', (naming.DatasetName('ds1'), {})), ('ds1:1.0.0', (naming.DatasetName('ds1'), { 'version': '1.0.0' })), ('ns1:ds1', (naming.DatasetName('ns1:ds1'), {})), ('hugging_face:abc', (naming.DatasetName(namespace='hugging_face', name='abc'), {})), ('ns_1-b:ds1', (naming.DatasetName(namespace='ns_1-b', name='ds1'), {})), ( 'ns1:ds1:1.0.0', (naming.DatasetName('ns1:ds1'), { 'version': '1.0.0' }), ), ('ns1:ds1/conf:1.0.0', (naming.DatasetName('ns1:ds1'), {
assert repr(name) == "DatasetName('namespace123:ds1')" name = naming.DatasetName(name='ds1') assert name.name == 'ds1' assert name.namespace is None assert str(name) == 'ds1' assert repr(name) == "DatasetName('ds1')" with pytest.raises(ValueError, match='Mixing args and kwargs'): name = naming.DatasetName('namespace123', name='abc') @pytest.mark.parametrize( ['name', 'result'], [ ('ds1', (naming.DatasetName('ds1'), {})), ('ds1:1.0.0', (naming.DatasetName('ds1'), { 'version': '1.0.0' })), ('ns1:ds1', (naming.DatasetName('ns1:ds1'), {})), ( 'ns1:ds1:1.0.0', (naming.DatasetName('ns1:ds1'), { 'version': '1.0.0' }), ), ('ns1:ds1/conf:1.0.0', (naming.DatasetName('ns1:ds1'), { 'version': '1.0.0', 'config': 'conf', })), ],
assert repr(name) == "DatasetName('namespace123:ds1')" name = naming.DatasetName(name='ds1') assert name.name == 'ds1' assert name.namespace is None assert str(name) == 'ds1' assert repr(name) == "DatasetName('ds1')" with pytest.raises(ValueError, match='Mixing args and kwargs'): name = naming.DatasetName('namespace123', name='abc') @pytest.mark.parametrize( ['name', 'result'], [ ('ds1', (naming.DatasetName('ds1'), {})), ('ds1:1.0.0', (naming.DatasetName('ds1'), { 'version': '1.0.0' })), ('ns1:ds1', (naming.DatasetName('ns1:ds1'), {})), ( 'ns1:ds1:1.0.0', (naming.DatasetName('ns1:ds1'), { 'version': '1.0.0' }), ), ('ns1:ds1/conf:1.0.0', (naming.DatasetName('ns1:ds1'), { 'version': '1.0.0', 'config': 'conf', })), ('grand-vision:katr/128x128:1.0.0',
def from_json(cls, data: utils.Json) -> 'DatasetPackage':
    """Factory which creates the cls from json."""
    # Rebuild both components from their json representations, then
    # assemble the package.
    name = naming.DatasetName(namespace_name=data['name'])
    source = dataset_sources_lib.DatasetSource.from_json(data['source'])
    return cls(name=name, source=source)