Code example #1
 def test_bool(self):
     sd = splits.SplitDict("ds_name")
     self.assertFalse(sd)  # Empty split is False
     sd.add(tfds.core.SplitInfo(name="train", num_shards=10))
     self.assertTrue(sd)  # Non-empty split is True
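
Examples #1, #4, and #10 target the legacy tfds API, where `SplitDict` is keyed by a dataset name and mutated with `add()`, and `SplitInfo` is sized with `num_shards`. Later releases (see examples #3 and #6) build the dict from a list of `SplitInfo` objects carrying `shard_lengths`. A minimal sketch of the same truthiness check against the newer API, assuming the tfds version used by examples #3 and #6:

# Minimal sketch, assuming a tfds version where SplitDict takes the
# SplitInfo list up front (as in examples #3 and #6 below).
import tensorflow_datasets as tfds
from tensorflow_datasets.core import splits

sd = splits.SplitDict([], dataset_name="ds_name")
assert not sd  # an empty SplitDict is falsy

train = tfds.core.SplitInfo(name="train", shard_lengths=[5], num_bytes=0)
sd = splits.SplitDict([train], dataset_name="ds_name")
assert sd  # a non-empty SplitDict is truthy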
Code example #2
File: dataset_info.py Project: vmtang11/datasets
    def __init__(self,
                 *,
                 builder,
                 description=None,
                 features=None,
                 supervised_keys=None,
                 homepage=None,
                 citation=None,
                 metadata=None,
                 redistribution_info=None):
        """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder`, dataset builder for this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict
        of the `tf.data.Dataset()` object from the `builder.as_dataset()`
        method.
      supervised_keys: `tuple` of `(input_key, target_key)`, Specifies the
        input feature and the label for supervised learning, if applicable for
        the dataset. The keys correspond to the feature names to select in
        `info.features`. When calling `tfds.core.DatasetBuilder.as_dataset()`
        with `as_supervised=True`, the `tf.data.Dataset` object will yield
        the (input, target) defined here.
      homepage: `str`, optional, the homepage for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
      metadata: `tfds.core.Metadata`, additional object which will be
        stored/restored with the dataset. This allows for storing additional
        information with the dataset.
      redistribution_info: `dict`, optional, information needed for
        redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
        The content of the `license` subfield will automatically be written to a
        LICENSE file stored with the dataset.
    """
        self._builder = builder

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=builder.name,
            description=utils.dedent(description),
            version=str(builder._version),  # pylint: disable=protected-access
            citation=utils.dedent(citation),
            redistribution_info=dataset_info_pb2.RedistributionInfo(
                license=utils.dedent(redistribution_info.pop("license")),
                **redistribution_info) if redistribution_info else None)

        if homepage:
            self._info_proto.location.urls[:] = [homepage]

        if features:
            if not isinstance(features, top_level_feature.TopLevelFeature):
                raise ValueError(
                    "DatasetInfo.features only supports FeaturesDict or Sequence at "
                    "the top-level. Got {}".format(features))
            features._set_top_level()  # pylint: disable=protected-access
        self._features = features
        self._splits = splits_lib.SplitDict(self._builder.name)
        if supervised_keys is not None:
            assert isinstance(supervised_keys, tuple)
            assert len(supervised_keys) == 2
            self._info_proto.supervised_keys.input = supervised_keys[0]
            self._info_proto.supervised_keys.output = supervised_keys[1]

        if metadata and not isinstance(metadata, Metadata):
            raise ValueError(
                "Metadata should be a `tfds.core.Metadata` instance. Received "
                "{}".format(metadata))
        self._metadata = metadata

        # Is this object initialized with both the static and the dynamic data?
        self._fully_initialized = False
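
Note how this constructor pops the `license` key out of `redistribution_info` and dedents it before forwarding the remaining keys to `dataset_info_pb2.RedistributionInfo`. A hedged sketch of the call shape this implies; `my_builder` and the license text are placeholders, not values from the source:

# Hypothetical call shape implied by the constructor above; `my_builder`
# stands in for a real DatasetBuilder instance.
info = DatasetInfo(
    builder=my_builder,
    description="A toy dataset.",
    redistribution_info={"license": "CC BY 4.0"},  # `license` is popped here
)
# Per the docstring, the popped license text is also written to a LICENSE
# file stored with the dataset.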
Code example #3
 def test_bool(self):
   sd = splits.SplitDict([], dataset_name="ds_name")
   self.assertFalse(sd)  # Empty split is False
   si = [tfds.core.SplitInfo(name="train", shard_lengths=[5], num_bytes=0)]
   sd = splits.SplitDict(si, dataset_name="ds_name")
   self.assertTrue(sd)  # Non-empty split is True
Code example #4
 def split_dict(self):
     sd = splits.SplitDict("ds_name")
     sd.add(tfds.core.SplitInfo(name="train", num_shards=10))
     sd.add(tfds.core.SplitInfo(name="test", num_shards=1))
     return sd
Code example #5
File: splits_test.py Project: GorVad/MISIS-NN-ML
 def test_empty_split(self):
     sd = splits.SplitDict([], dataset_name="ds_name")
     with self.assertRaisesWithPredicateMatch(KeyError,
                                              "`splits` is empty"):
         _ = sd["train"]
Code example #6
 def test_num_shards(self):
   si = tfds.core.SplitInfo(name="train", shard_lengths=[1, 2, 3], num_bytes=0)
   sd = splits.SplitDict([si], dataset_name="ds_name")
   self.assertEqual(sd["train"].num_shards, 3)
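In this newer API the shard count is derived rather than stored: `num_shards` is the length of `shard_lengths`, as the test above shows. A minimal sketch; the `num_examples` line is an assumption about the same API rather than something demonstrated here:

# Minimal sketch; num_shards follows example #6 above, while num_examples
# as sum(shard_lengths) is a hedged assumption about the same API.
import tensorflow_datasets as tfds

si = tfds.core.SplitInfo(name="train", shard_lengths=[1, 2, 3], num_bytes=0)
assert si.num_shards == 3    # len(shard_lengths)
assert si.num_examples == 6  # sum(shard_lengths), hedged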
Code example #7
  def _download_and_prepare(
      self,
      dl_manager: download.DownloadManager,
      download_config: download.DownloadConfig,
  ) -> None:
    """Generate all splits and returns the computed split infos."""
    split_builder = split_builder_lib.SplitBuilder(
        split_dict=self.info.splits,
        features=self.info.features,
        max_examples_per_split=download_config.max_examples_per_split,
        beam_options=download_config.beam_options,
        beam_runner=download_config.beam_runner,
        file_format=self._file_format,
    )
    # Wrap the generation inside a context manager.
    # If `beam` is used during generation (when a pipeline gets created),
    # the context manager is equivalent to `with beam.Pipeline()`.
    # Otherwise, this is a no-op.
    # By auto-detecting Beam, the user only has to change `_generate_examples`
    # to go from non-beam to beam dataset:
    # https://www.tensorflow.org/datasets/beam_datasets#instructions
    with split_builder.maybe_beam_pipeline():
      # If the signature has a `pipeline` kwarg, create the pipeline now and
      # forward it to `self._split_generators`.
      # We add this magic because the pipeline kwarg is only used by c4 and
      # we do not want to make the API more verbose for a single advanced case.
      signature = inspect.signature(self._split_generators)
      if "pipeline" in signature.parameters.keys():
        optional_pipeline_kwargs = dict(pipeline=split_builder.beam_pipeline)
      else:
        optional_pipeline_kwargs = {}
      split_generators = self._split_generators(  # pylint: disable=unexpected-keyword-arg
          dl_manager, **optional_pipeline_kwargs
      )
      # TODO(tfds): Could be removed once all datasets are migrated.
      # https://github.com/tensorflow/datasets/issues/2537
      # Legacy mode (eventually convert list[SplitGeneratorLegacy] -> dict)
      split_generators = split_builder.normalize_legacy_split_generators(
          split_generators=split_generators,
          generator_fn=self._generate_examples,
          is_beam=isinstance(self, BeamBasedBuilder),
      )

      # Ensure `all` isn't used as a key.
      _check_split_names(split_generators.keys())

      # Writers fail if the number of examples yielded is `0`, so we return here.
      if download_config.max_examples_per_split == 0:
        return

      # Start generating data for all splits
      path_suffix = file_adapters.ADAPTER_FOR_FORMAT[
          self._file_format].FILE_SUFFIX

      split_info_futures = [
          split_builder.submit_split_generation(  # pylint: disable=g-complex-comprehension
              split_name=split_name,
              generator=generator,
              path=self.data_path / f"{self.name}-{split_name}.{path_suffix}",
          )
          for split_name, generator
          in utils.tqdm(
              split_generators.items(),
              desc="Generating splits...",
              unit=" splits",
              leave=False,
          )
      ]
    # Finalize the splits (after the Apache Beam pipeline has completed, if one was used)
    split_infos = [future.result() for future in split_info_futures]

    # Update the info object with the splits.
    split_dict = splits_lib.SplitDict(split_infos, dataset_name=self.name)
    self.info.set_splits(split_dict)
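
The `pipeline` forwarding above relies purely on signature inspection: the Beam pipeline is only passed to `_split_generators` when that method declares the keyword. A stripped-down sketch of the dispatch; the function below is a hypothetical stand-in, not tfds code:

import inspect

def _split_generators(dl_manager, pipeline=None):  # hypothetical generator
    ...

# Mirrors the check in _download_and_prepare: forward the pipeline only if
# the signature asks for it.
signature = inspect.signature(_split_generators)
if "pipeline" in signature.parameters:
    optional_pipeline_kwargs = dict(pipeline="beam_pipeline_placeholder")
else:
    optional_pipeline_kwargs = {}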
Code example #8
    def __init__(
            self,
            *,
            builder: Union[DatasetIdentity, Any],
            description: Optional[str] = None,
            features: Optional[feature_lib.FeatureConnector] = None,
            supervised_keys: Optional[SupervisedKeysType] = None,
            disable_shuffling: bool = False,
            homepage: Optional[str] = None,
            citation: Optional[str] = None,
            metadata: Optional[Metadata] = None,
            license: Optional[str] = None,  # pylint: disable=redefined-builtin
            redistribution_info: Optional[Dict[str, str]] = None,
            split_dict: Optional[splits_lib.SplitDict] = None):
        # pyformat: disable
        """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder` or `DatasetIdentity`. The dataset builder or
        identity will be used to populate this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict of
        the `tf.data.Dataset()` object from the `builder.as_dataset()` method.
      supervised_keys: Specifies the input structure for supervised learning, if
        applicable for the dataset, used with "as_supervised". The keys
        correspond to the feature names to select in `info.features`. When
        calling `tfds.core.DatasetBuilder.as_dataset()` with
        `as_supervised=True`, the `tf.data.Dataset` object will yield the
        structure defined by the keys passed here, instead of that defined by
        the `features` argument. Typically this is an `(input_key, target_key)`
        tuple, and the dataset yields `(input, target)` tuples of tensors.

        To yield a more complex structure, pass a tuple of `tf.nest` compatible
        structures of feature keys. The resulting `Dataset` will yield
        structures with each key replaced by the corresponding tensor. For
        example, passing a triple of keys would return a dataset
        that yields `(feature, target, sample_weights)` triples for Keras.
        Using `supervised_keys=({'a':'a','b':'b'}, 'c')` would create a dataset
        yielding a tuple with a dictionary of features in the `features`
        position.

        Note that selecting features in nested `tfds.features.FeaturesDict`
        objects is not supported.
      disable_shuffling: `bool`, whether to disable shuffling of the examples,
        keeping them in generation order.
      homepage: `str`, optional, the homepage for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
      metadata: `tfds.core.Metadata`, additional object which will be
        stored/restored with the dataset. This allows for storing additional
        information with the dataset.
      license: license of the dataset.
      redistribution_info: information needed for redistribution, as specified
        in `dataset_info_pb2.RedistributionInfo`. The content of the `license`
        subfield will automatically be written to a LICENSE file stored with the
        dataset.
      split_dict: information about the splits in this dataset.
    """
        # pyformat: enable
        self._builder_or_identity = builder
        if isinstance(builder, DatasetIdentity):
            self._identity = builder
        else:
            self._identity = DatasetIdentity.from_builder(builder)

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=self._identity.name,
            description=utils.dedent(description),
            version=str(self._identity.version),
            release_notes=self._identity.release_notes,
            disable_shuffling=disable_shuffling,
            config_name=self._identity.config_name,
            config_description=self._identity.config_description,
            citation=utils.dedent(citation),
            module_name=self._identity.module_name,
            redistribution_info=dataset_info_pb2.RedistributionInfo(
                license=utils.dedent(license
                                     or redistribution_info.pop("license")),
                **redistribution_info) if redistribution_info else None)

        if homepage:
            self._info_proto.location.urls[:] = [homepage]

        if features:
            if not isinstance(features, top_level_feature.TopLevelFeature):
                raise ValueError(
                    "DatasetInfo.features only supports FeaturesDict or Sequence at "
                    "the top-level. Got {}".format(features))
        self._features = features
        self._splits = splits_lib.SplitDict([])
        if split_dict:
            self.set_splits(split_dict)
        if supervised_keys is not None:
            self._info_proto.supervised_keys.CopyFrom(
                _supervised_keys_to_proto(supervised_keys))

        if metadata and not isinstance(metadata, Metadata):
            raise ValueError(
                "Metadata should be a `tfds.core.Metadata` instance. Received "
                "{}".format(metadata))
        self._metadata = metadata

        # Is this object initialized with both the static and the dynamic data?
        self._fully_initialized = False
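
The `supervised_keys` docstring above allows arbitrary `tf.nest`-compatible structures, not just pairs. A minimal sketch of the accepted shapes, reusing the docstring's own `a`/`b`/`c` feature names as placeholders:

# Shapes described by the docstring; the feature names are placeholders.
supervised_keys = ("a", "c")                   # classic (input, target) pair
supervised_keys = ("a", "b", "c")              # triple, e.g. (feature, target,
                                               # sample_weights) for Keras
supervised_keys = ({"a": "a", "b": "b"}, "c")  # dict of inputs, single target
# With as_supervised=True, as_dataset() yields the chosen structure with each
# key replaced by the corresponding tensor.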
Code example #9
def _split_dict(splits: Dict[str, Split]) -> splits_lib.SplitDict:
    return splits_lib.SplitDict([split.info for split in splits.values()])
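
A hedged usage sketch for this helper; the module's `Split` type is not shown here, so `FakeSplit` below is a hypothetical stand-in exposing only the `.info` attribute the comprehension reads:

import collections
import tensorflow_datasets as tfds

# FakeSplit is a hypothetical stand-in for the module's Split type.
FakeSplit = collections.namedtuple("FakeSplit", ["info"])

splits = {
    "train": FakeSplit(tfds.core.SplitInfo(
        name="train", shard_lengths=[5], num_bytes=0)),
}
sd = _split_dict(splits)  # SplitDict built from each split's SplitInfo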
Code example #10
File: splits_test.py Project: jackzhangsir/datasets
 def split_dict(self):
     sd = splits.SplitDict()
     sd.add(SplitInfo(name="train", num_shards=10))
     sd.add(SplitInfo(name="test", num_shards=1))
     return sd
Code example #11
 def test_empty_split(self):
   sd = splits.SplitDict([])
   with self.assertRaisesWithPredicateMatch(KeyError, '`splits` is empty'):
     _ = sd['train']