Esempio n. 1
0
def dataset_name():
  """Checks `naming.DatasetName` construction and its string forms.

  Every case builds a `DatasetName` from positional or keyword arguments and
  verifies the resulting `name`, `namespace`, `str()` and `repr()`. The last
  check verifies that mixing a positional value with a keyword is rejected.
  """
  # Each case: (args, kwargs, name, namespace, str() value, repr() value).
  cases = [
      (('ds1',), {}, 'ds1', None, 'ds1', "DatasetName('ds1')"),
      (
          ('namespace123:ds1',),
          {},
          'ds1',
          'namespace123',
          'namespace123:ds1',
          "DatasetName('namespace123:ds1')",
      ),
      (
          (),
          {'name': 'ds1', 'namespace': 'namespace123'},
          'ds1',
          'namespace123',
          'namespace123:ds1',
          "DatasetName('namespace123:ds1')",
      ),
      ((), {'name': 'ds1'}, 'ds1', None, 'ds1', "DatasetName('ds1')"),
  ]
  for args, kwargs, exp_name, exp_namespace, exp_str, exp_repr in cases:
    ds_name = naming.DatasetName(*args, **kwargs)
    assert ds_name.name == exp_name
    if exp_namespace is None:
      # A name without namespace must expose `None`, not an empty string.
      assert ds_name.namespace is None
    else:
      assert ds_name.namespace == exp_namespace
    assert str(ds_name) == exp_str
    assert repr(ds_name) == exp_repr

  # A positional value combined with a keyword is ambiguous and must fail.
  with pytest.raises(ValueError, match='Mixing args and kwargs'):
    naming.DatasetName('namespace123', name='abc')
Esempio n. 2
0
 def _get_builder_names_single_namespace(
     ns_name: str,
     data_dir: epath.Path,
 ) -> List[str]:
     """Lists the namespaced builder names found directly under `data_dir`.

     Args:
       ns_name: Namespace attached to every returned dataset name.
       data_dir: Directory whose entries are inspected.

     Returns:
       The `str()` form of a `naming.DatasetName` for each entry yielded by
       `_maybe_iterdir(data_dir)` whose name passes `_is_valid_dataset_name`.
     """
     # Note: `data_dir` might contain non-dataset folders, but checking each
     # individual dataset would cause a significant performance drop, so
     # this is an acceptable trade-off.
     builder_names = []
     for builder_dir in _maybe_iterdir(data_dir):
         folder_name = builder_dir.name
         if not _is_valid_dataset_name(folder_name):
             continue
         ds_name = naming.DatasetName(namespace=ns_name, name=folder_name)
         builder_names.append(str(ds_name))
     return builder_names
Esempio n. 3
0
def test_parse_builder_name_kwargs_with_kwargs():
  """Checks `naming.parse_builder_name_kwargs` when extra kwargs are given.

  Covers three situations: kwargs forwarded next to the parsed name, a kwarg
  that duplicates a value already present in the name string, and a name
  string that cannot be parsed.
  """
  # Extra kwargs are returned alongside the parsed dataset name.
  parsed = naming.parse_builder_name_kwargs('ds1', data_dir='/abc')
  expected = (naming.DatasetName('ds1'), {'data_dir': '/abc'})
  assert parsed == expected

  # Version defined twice: once in the name string, once as a kwarg.
  duplicated_kwarg = pytest.raises(
      TypeError, match='got multiple values for keyword arg')
  with duplicated_kwarg:
    naming.parse_builder_name_kwargs('ds1:1.0.0', version='1.0.0')

  # Name string which cannot be parsed.
  invalid_name = pytest.raises(
      ValueError, match='Parsing builder name string .* failed')
  with invalid_name:
    naming.parse_builder_name_kwargs('ds/config:ns:1.0.0')
Esempio n. 4
0
def test_register_builder(dummy_register):  # pylint: disable=redefined-outer-name
  """Checks `dummy_register.builder` for known and unknown namespaces.

  Args:
    dummy_register: Register under test (supplied by the caller; the pylint
      pragma suggests a fixture of the same name defined elsewhere).
  """
  kaggle_name = naming.DatasetName('kaggle:ds0')
  kaggle_builder = dummy_register.builder(kaggle_name)
  assert 'kaggle' in kaggle_builder.data_path.parts

  # Same dataset name can be loaded from different namespace
  mlds_name = naming.DatasetName('mlds:ds0')
  mlds_builder = dummy_register.builder(mlds_name)
  assert 'mlds' in mlds_builder.data_path.parts

  # Extra builder kwargs are accepted; data_dir can be passed only if None.
  builder_kwargs = {'data_dir': None, 'version': '1.0.0'}
  versioned_builder = dummy_register.builder(
      naming.DatasetName('mlds:ds0'), **builder_kwargs)
  assert 'mlds' in versioned_builder.data_path.parts

  # An explicit data_dir is rejected.
  explicit_data_dir = pytest.raises(
      ValueError, match='`data_dir` cannot be set for')
  with explicit_data_dir:
    dummy_register.builder(
        naming.DatasetName('mlds:ds0'), data_dir='/path/to/data_dir')

  # Unknown namespace: a dedicated error message is expected.
  unknown_namespace = pytest.raises(
      registered.DatasetNotFoundError, match='Namespace .* not found.')
  with unknown_namespace:
    dummy_register.builder(naming.DatasetName('non-existing-namespace:ds0'))

  # Another dataset which cannot be resolved by this register.
  with pytest.raises(registered.DatasetNotFoundError):
    dummy_register.builder(naming.DatasetName('other:ds0'))
Esempio n. 5
0
def test_builder_cls(dummy_register):  # pylint: disable=redefined-outer-name
  """Checks `dummy_register.builder_cls` installation and cache re-use.

  The statements are order-dependent: the install directory must not exist
  before the first `builder_cls` call and must hold exactly one entry after.

  Args:
    dummy_register: Register under test (supplied by the caller; the pylint
      pragma suggests a fixture of the same name defined elsewhere).
  """
  # The dataset will be installed in the cache
  install_dir = (
      cache.cache_path() / 'modules/tfds_community/kaggle/dummy_dataset')
  assert not install_dir.exists()

  kaggle_cls = dummy_register.builder_cls(
      naming.DatasetName('kaggle:dummy_dataset'))
  assert kaggle_cls.name == 'dummy_dataset'

  clshash = 'e58f413affd65c267bae7acbd27fd5ac673d3e3ae13c316ffc2a461d00c8ab56'
  expected_code_path = install_dir / f'{clshash}/dummy_dataset.py'
  assert expected_code_path == kaggle_cls.code_path
  assert 'kaggle' in kaggle_cls.code_path.parts
  assert issubclass(kaggle_cls, dataset_builder.DatasetBuilder)
  assert not kaggle_cls.url_infos  # No checksums installed with the package

  # Dataset installed in the cache
  # Filename should be deterministic
  installed_entries = sorted(install_dir.iterdir())
  assert installed_entries == [install_dir / clshash]

  # Reusing the dataset should re-use the cache: any new download attempt
  # raises through the patched `_download_and_cache`.
  forbid_download = mock.patch.object(
      register_package,
      '_download_and_cache',
      side_effect=ValueError('Dataset should have been cached already'),
  )
  with forbid_download:
    cached_cls = dummy_register.builder_cls(
        naming.DatasetName('kaggle:dummy_dataset'))
  assert kaggle_cls is cached_cls

  # Datasets from different namespace can have the same name
  mlds_cls = dummy_register.builder_cls(
      naming.DatasetName('mlds:dummy_dataset'))
  assert 'mlds' in mlds_cls.code_path.parts
  assert issubclass(mlds_cls, dataset_builder.DatasetBuilder)
  # Checksums have been correctly installed
  assert 'http://dummy.org/data.txt' in mlds_cls.url_infos

  with pytest.raises(registered.DatasetNotFoundError):
    dummy_register.builder(naming.DatasetName('other:ds0'))
Esempio n. 6
0
def test_dataset_package():
  """Exports/imports operation should be identity.

  Round-trips both a `DatasetPackage` and an `_InstalledPackage` through
  `to_json()` / `from_json()` and expects an object equal to the original.
  """
  package_name = naming.DatasetName('ns:ds')
  package_source = dataset_sources.DatasetSource.from_json(
      'github://<owner>/<name>/tree/<branch>/my_ds/ds.py')
  pkg = register_package.DatasetPackage(
      name=package_name,
      source=package_source,
  )
  restored_pkg = register_package.DatasetPackage.from_json(pkg.to_json())
  assert restored_pkg == pkg

  # `instalation_date` (sic) is the keyword name expected by the callee.
  installed_pkg = register_package._InstalledPackage(
      package=pkg,
      instalation_date=datetime.datetime.now(),
      hash='asdajhdadsadsad',
  )
  restored_installed_pkg = register_package._InstalledPackage.from_json(
      installed_pkg.to_json())
  assert restored_installed_pkg == installed_pkg
Esempio n. 7
0
def test_naming_sorted():
  """Checks that `naming.DatasetName` instances sort into the expected order."""
  shuffled = [
      naming.DatasetName(raw_name)
      for raw_name in ('zzz:aaa', 'aaa:zzz', 'aaa:aaa')
  ]
  ordered = sorted(shuffled)
  expected = [
      naming.DatasetName(raw_name)
      for raw_name in ('aaa:aaa', 'aaa:zzz', 'zzz:aaa')
  ]
  assert ordered == expected
Esempio n. 8
0
def list_ds_packages_for_namespace(
    namespace: str,
    path: epath.Path,
) -> List[DatasetPackage]:
    """Returns the dataset packages found in a specific directory.

    Directories that contain code should have the following structure:

    ```
    <path>/
        <dataset0>/
            <dataset0>.py
        <dataset1>/
            <dataset1>.py
        ...
    ```

    Additional files or folders which are not detected as datasets will be
    ignored (e.g. `__init__.py`): every entry for which `get_dataset_source`
    returns a falsy value is skipped.

    Args:
      namespace: Namespace of the datasets
      path: The directory path containing the datasets.

    Returns:
      ds_packages: The dataset packages found in the directory (sorted for
        determinism).

    Raises:
      FileNotFoundError: If the path cannot be reached.
    """
    if not path.exists():
        # Should be fault-tolerant in the future
        raise FileNotFoundError(f'Could not find datasets at {path}')

    all_packages = []
    # `iterdir()` yields entries in arbitrary order, so sort them explicitly
    # to honor the documented deterministic output.
    for ds_path in sorted(path.iterdir()):
        source = get_dataset_source(ds_path)
        if not source:
            # Not detected as a dataset: ignore the entry.
            continue
        ds_name = naming.DatasetName(namespace=namespace, name=ds_path.name)
        all_packages.append(DatasetPackage(name=ds_name, source=source))

    return all_packages
Esempio n. 9
0
  assert repr(name) == "DatasetName('namespace123:ds1')"

  name = naming.DatasetName(name='ds1')
  assert name.name == 'ds1'
  assert name.namespace is None
  assert str(name) == 'ds1'
  assert repr(name) == "DatasetName('ds1')"

  with pytest.raises(ValueError, match='Mixing args and kwargs'):
    name = naming.DatasetName('namespace123', name='abc')


@pytest.mark.parametrize(
    ['name', 'result'],
    [
        ('ds1', (naming.DatasetName('ds1'), {})),
        ('ds1:1.0.0', (naming.DatasetName('ds1'), {
            'version': '1.0.0'
        })),
        ('ns1:ds1', (naming.DatasetName('ns1:ds1'), {})),
        ('hugging_face:abc',
         (naming.DatasetName(namespace='hugging_face', name='abc'), {})),
        ('ns_1-b:ds1',
         (naming.DatasetName(namespace='ns_1-b', name='ds1'), {})),
        (
            'ns1:ds1:1.0.0',
            (naming.DatasetName('ns1:ds1'), {
                'version': '1.0.0'
            }),
        ),
        ('ns1:ds1/conf:1.0.0', (naming.DatasetName('ns1:ds1'), {
Esempio n. 10
0
    assert repr(name) == "DatasetName('namespace123:ds1')"

    name = naming.DatasetName(name='ds1')
    assert name.name == 'ds1'
    assert name.namespace is None
    assert str(name) == 'ds1'
    assert repr(name) == "DatasetName('ds1')"

    with pytest.raises(ValueError, match='Mixing args and kwargs'):
        name = naming.DatasetName('namespace123', name='abc')


@pytest.mark.parametrize(
    ['name', 'result'],
    [
        ('ds1', (naming.DatasetName('ds1'), {})),
        ('ds1:1.0.0', (naming.DatasetName('ds1'), {
            'version': '1.0.0'
        })),
        ('ns1:ds1', (naming.DatasetName('ns1:ds1'), {})),
        (
            'ns1:ds1:1.0.0',
            (naming.DatasetName('ns1:ds1'), {
                'version': '1.0.0'
            }),
        ),
        ('ns1:ds1/conf:1.0.0', (naming.DatasetName('ns1:ds1'), {
            'version': '1.0.0',
            'config': 'conf',
        })),
    ],
Esempio n. 11
0
  assert repr(name) == "DatasetName('namespace123:ds1')"

  name = naming.DatasetName(name='ds1')
  assert name.name == 'ds1'
  assert name.namespace is None
  assert str(name) == 'ds1'
  assert repr(name) == "DatasetName('ds1')"

  with pytest.raises(ValueError, match='Mixing args and kwargs'):
    name = naming.DatasetName('namespace123', name='abc')


@pytest.mark.parametrize(
    ['name', 'result'],
    [
        ('ds1', (naming.DatasetName('ds1'), {})),
        ('ds1:1.0.0', (naming.DatasetName('ds1'), {
            'version': '1.0.0'
        })),
        ('ns1:ds1', (naming.DatasetName('ns1:ds1'), {})),
        (
            'ns1:ds1:1.0.0',
            (naming.DatasetName('ns1:ds1'), {
                'version': '1.0.0'
            }),
        ),
        ('ns1:ds1/conf:1.0.0', (naming.DatasetName('ns1:ds1'), {
            'version': '1.0.0',
            'config': 'conf',
        })),
        ('grand-vision:katr/128x128:1.0.0',
Esempio n. 12
0
 def from_json(cls, data: utils.Json) -> 'DatasetPackage':
     """Factory which creates the cls from json.

     Args:
       data: Json-like mapping; its `'name'` and `'source'` entries are read.

     Returns:
       A `cls` instance built from the parsed name and source.
     """
     package_name = naming.DatasetName(namespace_name=data['name'])
     package_source = dataset_sources_lib.DatasetSource.from_json(
         data['source'])
     return cls(name=package_name, source=package_source)