Example #1
def test_read_only_builder(code_builder: dataset_builder.DatasetBuilder):
  """Builder can be created from the files only."""

  # Reconstruct the dataset
  builder = read_only_builder.builder_from_directory(code_builder.data_dir)
  assert builder.name == code_builder.name
  assert builder.data_dir == code_builder.data_dir
  assert builder.info.version == code_builder.info.version
  assert builder.info.full_name == code_builder.info.full_name
  assert repr(builder.info) == repr(code_builder.info)
  assert builder.VERSION == code_builder.info.version
  assert builder.__module__ == type(code_builder).__module__
  assert read_only_builder.ReadOnlyBuilder.VERSION is None

  if code_builder.builder_config:
    assert builder.builder_config
    code_config = code_builder.builder_config
    file_config = builder.builder_config
    # Config attributes should be restored too
    assert code_config.name == file_config.name
    assert code_config.description == file_config.description
    assert code_config.version == file_config.version

  # Test that the dataset can be read
  ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
  origin_ds = dataset_utils.as_numpy(
      code_builder.as_dataset(split='train').take(5))
  assert [ex['id'] for ex in ds] == [ex['id'] for ex in origin_ds]

  builder.download_and_prepare()  # Should be a no-op
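
For reference, the same restore-and-read pattern outside of a test, as a minimal sketch using the public `tfds.builder_from_directory` and `tfds.as_numpy` helpers (the data_dir path is a hypothetical placeholder):

import tensorflow_datasets as tfds

# Hypothetical path to an already prepared dataset directory
# (i.e. the folder containing dataset_info.json and the record files).
data_dir = '/path/to/data_dir/my_dataset/1.0.0'

# Restore the builder purely from the files on disk; no generation code needed.
builder = tfds.builder_from_directory(data_dir)
print(builder.info.version, list(builder.info.splits))

# Read a few examples, exactly as with a code-defined builder.
for example in tfds.as_numpy(builder.as_dataset(split='train').take(5)):
  print(example)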
Example #2
def test_write_metadata(
    tmp_path: pathlib.Path,
    file_format,
):
  tmp_path = utils.as_path(tmp_path)

  src_builder = testing.DummyDataset(
      data_dir=tmp_path / 'origin',
      file_format=file_format,
  )
  src_builder.download_and_prepare()

  dst_dir = tmp_path / 'copy'
  dst_dir.mkdir()

  # Copy all the tfrecord files, but not the dataset info
  for f in src_builder.data_path.iterdir():
    if naming.FilenameInfo.is_valid(f.name):
      f.copy(dst_dir / f.name)

  metadata_path = dst_dir / 'dataset_info.json'

  if file_format is None:
    split_infos = list(src_builder.info.splits.values())
  else:
    split_infos = None  # Auto-compute split infos

  assert not metadata_path.exists()
  write_metadata_utils.write_metadata(
      data_dir=dst_dir,
      features=src_builder.info.features,
      split_infos=split_infos,
      description='my test description.')
  assert metadata_path.exists()

  # Once the metadata is written, the builder can be restored from the directory
  builder = read_only_builder.builder_from_directory(dst_dir)
  assert builder.name == 'dummy_dataset'
  assert builder.version == '1.0.0'
  assert set(builder.info.splits) == {'train'}
  assert builder.info.splits['train'].num_examples == 3
  assert builder.info.description == 'my test description.'

  # Values are the same
  src_ds = src_builder.as_dataset(split='train')
  ds = builder.as_dataset(split='train')
  assert list(src_ds.as_numpy_iterator()) == list(ds.as_numpy_iterator())
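
The `split_infos` branch above exercises the two modes accepted by `write_metadata` (see Example #7): a pre-computed list of `tfds.core.SplitInfo`, or `None` for auto-computation. A hedged sketch of the pre-computed variant, assuming `tfds.folder_dataset.compute_split_info` accepts the `data_dir`/`out_dir` keyword arguments referenced in that docstring; all paths are placeholders:

import tensorflow_datasets as tfds

data_dir = '/path/to/copied/tfrecords'   # directory holding only the record files
split_dir = '/path/to/split_info_cache'  # optional output dir for the computed infos

# Pre-compute the number of examples/shards per split from the record files.
split_infos = tfds.folder_dataset.compute_split_info(
    data_dir=data_dir,
    out_dir=split_dir,
)

# Either the returned list or the `out_dir` path can then be forwarded.
tfds.folder_dataset.write_metadata(
    data_dir=data_dir,
    features=tfds.features.FeaturesDict({'id': tfds.features.Text()}),
    split_infos=split_infos,
)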
Example #3
def test_read_only_builder(code_builder: dataset_builder.DatasetBuilder):
    """Builder can be created from the files only."""

    # Reconstruct the dataset
    builder = read_only_builder.builder_from_directory(code_builder.data_dir)
    assert builder.name == code_builder.name
    assert builder.data_dir == code_builder.data_dir
    assert builder.info.version == code_builder.info.version
    assert builder.info.full_name == code_builder.info.full_name
    assert repr(builder.info) == repr(code_builder.info)
    assert builder.VERSION is None

    # Test that the dataset can be read
    ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
    origin_ds = dataset_utils.as_numpy(
        code_builder.as_dataset(split='train').take(5))
    assert [ex['label'] for ex in ds] == [ex['label'] for ex in origin_ds]

    builder.download_and_prepare()  # Should be a no-op
Example #4
def test_read_only_builder(
    builder_cls: dataset_builder.DatasetBuilder,
    tmp_path: pathlib.Path,
):
  # Generate the dataset
  origin_builder = builder_cls(data_dir=tmp_path)
  origin_builder.download_and_prepare()

  # Reconstruct the dataset
  builder = read_only_builder.builder_from_directory(origin_builder.data_dir)
  assert builder.name == origin_builder.name
  assert builder.data_dir == origin_builder.data_dir
  assert builder.info.version == origin_builder.info.version
  assert builder.info.full_name == origin_builder.info.full_name
  assert repr(builder.info) == repr(origin_builder.info)
  assert builder.VERSION is None

  # Test that the dataset can be read
  ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
  origin_ds = dataset_utils.as_numpy(
      origin_builder.as_dataset(split='train').take(5))
  assert [ex['label'] for ex in ds] == [ex['label'] for ex in origin_ds]

  builder.download_and_prepare()  # Should be a no-op
Example #5
def builder(
    name: str,
    *,
    data_dir: Optional[str] = None,
    **builder_init_kwargs: Any
) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`).
      This can be either `'dataset_name'` or
      `'dataset_name/config_name'` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `'foo_bar/a=True,b=3'` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `'foo_bar/zoo/a=True,b=3'` to
      use the `'zoo'` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    data_dir: Path to the dataset(s). See `tfds.load` for more information.
    **builder_init_kwargs: `dict` of keyword arguments passed to the
      `DatasetBuilder`. These will override keyword arguments passed in `name`,
      if any.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  builder_name, builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)

  # Try loading the code (if it exists)
  try:
    cls = builder_cls(builder_name)
  except DatasetNotFoundError as e:
    if e.is_abstract:
      raise  # Abstract classes can't be instantiated, either from code or from files.
    cls = None  # Class not found
    not_found_error = e  # Save the exception to eventually reraise

  version_explicitly_given = "version" in builder_kwargs

  # Try loading from files first:
  # * If code not present.
  # * If version explicitly given (backward/forward compatibility).
  # Note: If `builder_init_kwargs` are set (e.g. version='experimental_latest',
  # custom config,...), read from generation code.
  if (not cls or version_explicitly_given) and not builder_init_kwargs:
    builder_dir = find_builder_dir(name, data_dir=data_dir)
    if builder_dir is not None:  # A generated dataset was found on disk
      return read_only_builder.builder_from_directory(builder_dir)

  # If loading from files was skipped (e.g. files not found), load from the
  # source code.
  if cls:
    with py_utils.try_reraise(prefix=f"Failed to construct dataset {name}: "):
      return cls(data_dir=data_dir, **builder_kwargs, **builder_init_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError
  raise not_found_error
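
A brief usage sketch of the name syntax described in the docstring above; `my_dataset`, `my_config`, and the paths are hypothetical placeholders:

import tensorflow_datasets as tfds

# Plain registered name: resolved from code if available, otherwise from files.
builder = tfds.builder('my_dataset', data_dir='/path/to/data_dir')

# Pinning a version selects the load-from-files path when the generated data
# already exists on disk (backward/forward compatibility).
builder = tfds.builder('my_dataset/my_config:1.0.0', data_dir='/path/to/data_dir')

# Keyword arguments can be packed into the name string, as in the docstring
# example; explicit **builder_init_kwargs override values parsed from the name.
builder = tfds.builder('foo_bar/a=True,b=3')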
Example #6
def test_not_exists(tmp_path: pathlib.Path):
  with pytest.raises(
      FileNotFoundError, match='Could not load `ReadOnlyBuilder`'
  ):
    read_only_builder.builder_from_directory(tmp_path)
Example #7
def write_metadata(
    *,
    data_dir: epath.PathLike,
    features: features_lib.feature.FeatureConnectorArg,
    split_infos: Union[None, epath.PathLike, List[split_lib.SplitInfo]] = None,
    version: Union[None, str, utils.Version] = None,
    check_data: bool = True,
    **ds_info_kwargs,
) -> None:
    """Add metadata required to load with TFDS.

  See documentation for usage:
  https://www.tensorflow.org/datasets/external_tfrecord

  Args:
    data_dir: Dataset path on which to save the metadata
    features: dict of `tfds.features.FeatureConnector` matching the proto specs.
    split_infos: Can be either:  * A path to the pre-computed split info values
      ( the `out_dir` kwarg of `tfds.folder_dataset.compute_split_info`) * A
      list of `tfds.core.SplitInfo` (returned value of
      `tfds.folder_dataset.compute_split_info`) * `None` to auto-compute the
      split info.
    version: Optional dataset version (auto-infer by default, or fallback to
      1.0.0)
    check_data: If True, perform additional check to validate the data in
      data_dir is valid
    **ds_info_kwargs: Additional metadata forwarded to `tfds.core.DatasetInfo` (
      description, homepage,...). Will appear in the doc.
  """
    features = features_lib.features_dict.to_feature(features)
    data_dir = epath.Path(data_dir)
    # Extract the tf-record filenames
    tfrecord_files = [
        f for f in data_dir.iterdir() if naming.FilenameInfo.is_valid(f.name)
    ]
    if not tfrecord_files:
        raise ValueError(
            f'Could not find tf-record (or compatible format) in {data_dir}. '
            'Make sure to follow the pattern: '
            '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`')

    file_infos = [naming.FilenameInfo.from_str(f.name) for f in tfrecord_files]

    # Use a set with tuple unpacking to ensure all names are consistent
    snake_name, = {f.dataset_name for f in file_infos}
    camel_name = naming.snake_to_camelcase(snake_name)
    filetype_suffix, = {f.filetype_suffix for f in file_infos}
    file_format = file_adapters.file_format_from_suffix(filetype_suffix)

    cls = types.new_class(
        camel_name,
        bases=(_WriteBuilder, ),
        kwds=dict(skip_registration=True),
        exec_body=None,
    )

    if version is None:  # Automatically detect the version
        if utils.Version.is_valid(data_dir.name):
            version = data_dir.name
        else:
            version = '1.0.0'
    cls.VERSION = utils.Version(version)

    # Create a dummy builder (use a non-existent folder to make sure
    # dataset_info.json is not restored)
    builder = cls(file_format=file_format, data_dir='/tmp/non-existent-dir/')

    # Create the metadata
    ds_info = dataset_info.DatasetInfo(
        builder=builder,
        features=features,
        **ds_info_kwargs,
    )
    ds_info.set_file_format(file_format)

    # Add the split infos
    split_dict = _load_splits(
        data_dir=data_dir,
        split_infos=split_infos,
        file_infos=file_infos,
        filetype_suffix=filetype_suffix,
        builder=builder,
    )
    ds_info.set_splits(split_dict)

    # Save all metadata (dataset_info.json, features.json,...)
    ds_info.write_to_directory(data_dir)

    # Make sure that the data can be loaded (feature connector match the actual
    # specs)
    if check_data:
        utils.print_notebook(
            'Metadata written. Testing by reading first example. '
            'Set check_data=False to skip.')
        builder = read_only_builder.builder_from_directory(data_dir)
        split_name = next(iter(builder.info.splits))
        _, = builder.as_dataset(
            split=f'{split_name}[:1]')  # Load the first example
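
A condensed end-to-end sketch of the workflow from the linked external_tfrecord guide; the directory layout and feature spec are hypothetical:

import tensorflow_datasets as tfds

# Hypothetical folder containing files such as
# `my_dataset-train.tfrecord-00000-of-00001` (see the naming pattern above).
data_dir = '/path/to/my_dataset/1.0.0'

features = tfds.features.FeaturesDict({
    'image': tfds.features.Image(shape=(None, None, 3)),
    'label': tfds.features.ClassLabel(names=['no', 'yes']),
})

# Writes dataset_info.json, features.json,... next to the record files and,
# since check_data defaults to True, reads back the first example.
tfds.folder_dataset.write_metadata(
    data_dir=data_dir,
    features=features,
    split_infos=None,  # auto-compute examples/shards per split
    description='My externally generated dataset.',
)

# The directory can now be loaded like any prepared TFDS dataset.
builder = tfds.builder_from_directory(data_dir)
ds = builder.as_dataset(split='train')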
Example #8
def test_builder_from_directory_dir_not_exists(tmp_path: pathlib.Path):
  with pytest.raises(FileNotFoundError, match='Could not load dataset info'):
    read_only_builder.builder_from_directory(tmp_path)