Example 1
def main(_):
    dataset_name = FLAGS.dataset
    dataset_type = FLAGS.type
    root_dir = FLAGS.tfds_dir
    if not root_dir:
        root_dir = py_utils.tfds_dir()

    data = dict(
        dataset_name=dataset_name,
        dataset_type=dataset_type,
        dataset_cls=naming.snake_to_camelcase(dataset_name),
        TODO='TODO({})'.format(dataset_name),
    )

    create_dataset_file(root_dir, data)
    add_the_init(root_dir, data)
    create_dataset_test_file(root_dir, data)
    create_fake_data(root_dir, data)
    create_checksum_file(root_dir, data)

    print(
        'Dataset generated in {}\n'
        'You can start by searching for TODO({}).\n'
        'Please check '
        '`https://github.com/tensorflow/datasets/blob/master/docs/add_dataset.md` '
        'for details.'.format(root_dir, dataset_name))
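
For reference, a minimal sketch of the naming helpers this script relies on; it assumes `tensorflow_datasets.core.naming` is importable and only illustrates the expected conversions (also exercised by the test in Example 3 below):

from tensorflow_datasets.core import naming

# `snake_to_camelcase` turns the dataset directory name into the builder
# class name used in the generated files, e.g. `my_dataset` -> `MyDataset`.
print(naming.snake_to_camelcase('my_dataset'))       # MyDataset
# The reverse conversion is a no-op on names that are already snake_case.
print(naming.camelcase_to_snakecase('my_dataset'))   # my_dataset
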
Example 2
    def __post_init__(self):
        self.cls_name = naming.snake_to_camelcase(self.name)
        self.tfds_api = ('tensorflow_datasets.public_api'
                         if self.in_tfds else 'tensorflow_datasets')
        self.todo = f'TODO({self.name})'

        if self.in_tfds:
            # `/path/to/tensorflow_datasets/image/my_dataset`
            # ->`tensorflow_datasets.image.my_dataset`
            import_parts = itertools.dropwhile(
                lambda p: p != 'tensorflow_datasets', self.path.parts)
            ds_import = '.'.join(import_parts)
        else:
            # For external datasets, it's difficult to correctly infer the full
            # import path (`from my_module.path.datasets.my_dataset import MyDataset`).
            # We could try to auto-infer the absolute import path from `setup.py`;
            # instead, use a relative import for now: `from . import my_dataset`.
            ds_import = '.'
        self.ds_import = ds_import
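
A small, self-contained illustration of the `itertools.dropwhile` logic above, using a hypothetical path; it only shows how the dotted import path is derived from `self.path.parts`:

import itertools
import pathlib

# Hypothetical dataset path inside the tensorflow_datasets tree.
path = pathlib.Path('/path/to/tensorflow_datasets/image/my_dataset')
# Drop every path component before the `tensorflow_datasets` package root,
# then join the remaining parts with dots to form the import path.
import_parts = itertools.dropwhile(
    lambda p: p != 'tensorflow_datasets', path.parts)
print('.'.join(import_parts))  # tensorflow_datasets.image.my_dataset
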
Example 3
def test_snake_to_camelcase(self, camel, snake):
    self.assertEqual(naming.snake_to_camelcase(snake), camel)
    # camelcase_to_snakecase is a no-op if the name is already snake_case.
    self.assertEqual(naming.camelcase_to_snakecase(snake), snake)
Example 4
def test_snake_to_camelcase(self, camel, snake):
    self.assertEqual(naming.snake_to_camelcase(snake), camel)
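
The `(camel, snake)` pairs in these tests presumably come from a parameterized test case; below is a minimal self-contained sketch using `absl.testing.parameterized`, with illustrative name pairs rather than the real test data:

from absl.testing import parameterized

from tensorflow_datasets.core import naming


class NamingTest(parameterized.TestCase):

    # Illustrative (camel, snake) pairs; the real tests use their own lists.
    @parameterized.parameters(
        ('MyDataset', 'my_dataset'),
        ('Dataset123', 'dataset123'),
    )
    def test_snake_to_camelcase(self, camel, snake):
        self.assertEqual(naming.snake_to_camelcase(snake), camel)
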
Example 5
def write_metadata(
    *,
    data_dir: epath.PathLike,
    features: features_lib.feature.FeatureConnectorArg,
    split_infos: Union[None, epath.PathLike, List[split_lib.SplitInfo]] = None,
    version: Union[None, str, utils.Version] = None,
    check_data: bool = True,
    **ds_info_kwargs,
) -> None:
    """Add metadata required to load with TFDS.

  See documentation for usage:
  https://www.tensorflow.org/datasets/external_tfrecord

  Args:
    data_dir: Dataset path on which to save the metadata
    features: dict of `tfds.features.FeatureConnector` matching the proto specs.
    split_infos: Can be either:  * A path to the pre-computed split info values
      ( the `out_dir` kwarg of `tfds.folder_dataset.compute_split_info`) * A
      list of `tfds.core.SplitInfo` (returned value of
      `tfds.folder_dataset.compute_split_info`) * `None` to auto-compute the
      split info.
    version: Optional dataset version (auto-infer by default, or fallback to
      1.0.0)
    check_data: If True, perform additional check to validate the data in
      data_dir is valid
    **ds_info_kwargs: Additional metadata forwarded to `tfds.core.DatasetInfo` (
      description, homepage,...). Will appear in the doc.
  """
    features = features_lib.features_dict.to_feature(features)
    data_dir = epath.Path(data_dir)
    # Extract the tf-record filenames
    tfrecord_files = [
        f for f in data_dir.iterdir() if naming.FilenameInfo.is_valid(f.name)
    ]
    if not tfrecord_files:
        raise ValueError(
            f'Could not find tf-record (or compatible format) in {data_dir}. '
            'Make sure to follow the pattern: '
            '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`')

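    # Parse each shard filename into its structured parts (dataset name, split,
    # filetype suffix, shard index).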
    file_infos = [naming.FilenameInfo.from_str(f.name) for f in tfrecord_files]

    # Use a set with tuple-unpacking syntax to ensure all names are consistent.
    snake_name, = {f.dataset_name for f in file_infos}
    camel_name = naming.snake_to_camelcase(snake_name)
    filetype_suffix, = {f.filetype_suffix for f in file_infos}
    file_format = file_adapters.file_format_from_suffix(filetype_suffix)

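    # Dynamically create a builder class named after the dataset (e.g.
    # `MyDataset`), without registering it in the global builder registry.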
    cls = types.new_class(
        camel_name,
        bases=(_WriteBuilder, ),
        kwds=dict(skip_registration=True),
        exec_body=None,
    )

    if version is None:  # Automatically detect the version
        if utils.Version.is_valid(data_dir.name):
            version = data_dir.name
        else:
            version = '1.0.0'
    cls.VERSION = utils.Version(version)

    # Create a dummy builder (use a non-existent folder to make sure
    # dataset_info.json is not restored).
    builder = cls(file_format=file_format, data_dir='/tmp/non-existent-dir/')

    # Create the metadata
    ds_info = dataset_info.DatasetInfo(
        builder=builder,
        features=features,
        **ds_info_kwargs,
    )
    ds_info.set_file_format(file_format)

    # Add the split infos
    split_dict = _load_splits(
        data_dir=data_dir,
        split_infos=split_infos,
        file_infos=file_infos,
        filetype_suffix=filetype_suffix,
        builder=builder,
    )
    ds_info.set_splits(split_dict)

    # Save all metadata (dataset_info.json, features.json,...)
    ds_info.write_to_directory(data_dir)

    # Make sure that the data can be loaded (the feature connectors match the
    # actual specs).
    if check_data:
        utils.print_notebook(
            'Metadata written. Testing by reading first example. '
            'Set check_data=False to skip.')
        builder = read_only_builder.builder_from_directory(data_dir)
        split_name = next(iter(builder.info.splits))
        _, = builder.as_dataset(
            split=f'{split_name}[:1]')  # Load the first example
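
A hedged end-to-end usage sketch of this function (exposed as `tfds.folder_dataset.write_metadata`); the directory layout, feature spec, and paths below are hypothetical:

import tensorflow_datasets as tfds

# Hypothetical folder containing shards named e.g.
# `my_dataset-train.tfrecord-00000-of-00001`.
data_dir = '/path/to/my_dataset/1.0.0'

tfds.folder_dataset.write_metadata(
    data_dir=data_dir,
    features=tfds.features.FeaturesDict({
        'image': tfds.features.Image(shape=(None, None, 3)),
        'label': tfds.features.ClassLabel(names=['no', 'yes']),
    }),
    split_infos=None,  # Auto-compute the split info from the shard files.
    description='Externally generated tf-records, now loadable with TFDS.',
)

# Once the metadata is written, the folder loads like any other TFDS dataset.
builder = tfds.builder_from_directory(data_dir)
ds = builder.as_dataset(split='train')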