def _parse_dataset_info_proto(
    self, config_name: str, config: Mapping[str, Any]
) -> dataset_info_pb2.DatasetInfo:
  """Parses a `DatasetInfo` proto from the given JSON-like mapping.

  Args:
    config_name: Name of the dataset config; used as both the `name` and the
      `module_name` of the resulting proto.
    config: Mapping parsed from a dataset-info JSON file. Expected keys:
      `splits`, `version`, `description`, `citation`, `license`.

  Returns:
    The corresponding `dataset_info_pb2.DatasetInfo` proto.

  Raises:
    ValueError: if `config['version']` is neither a dict nor a str.
  """
  # Each split is materialized as a single shard whose length is the split's
  # total number of examples.
  splits = [
      dataset_info_pb2.SplitInfo(
          name=name,
          num_shards=1,
          shard_lengths=[details['num_examples']],
          num_bytes=details['num_bytes'])
      for name, details in config['splits'].items()
  ]
  raw_version = config['version']
  if isinstance(raw_version, dict):
    version = raw_version['version_str']
  elif isinstance(raw_version, str):
    version = raw_version
  else:
    # Previously an unexpected type left `version` unbound and the return
    # statement below raised a confusing `UnboundLocalError`; fail loudly
    # with an explicit error instead.
    raise ValueError(
        'Unsupported version type: {}'.format(type(raw_version).__name__))
  return dataset_info_pb2.DatasetInfo(
      name=config_name,
      module_name=config_name,
      description=config['description'],
      version=version,
      citation=config['citation'],
      redistribution_info=dataset_info_pb2.RedistributionInfo(
          license=config['license']),
      splits=splits,
      features=_get_huggingface_features(config),
  )
def __init__(self,
             builder,
             description=None,
             features=None,
             supervised_keys=None,
             urls=None,
             citation=None,
             metadata=None,
             redistribution_info=None):
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder`, dataset builder for this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: `tuple`, Specifies the input feature and the label for
      supervised learning, if applicable for the dataset.
    urls: `list(str)`, optional, the homepage(s) for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    redistribution_info: `dict`, optional, information needed for
      redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
      The content of the `license` subfield will automatically be written to
      a LICENSE file stored with the dataset.
  """
  self._builder = builder

  # Build the optional redistribution sub-proto up front.
  if redistribution_info:
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        **redistribution_info)
  else:
    redistribution_proto = None

  self._info_proto = dataset_info_pb2.DatasetInfo(
      name=builder.name,
      description=description,
      version=str(builder._version),  # pylint: disable=protected-access
      citation=citation,
      redistribution_info=redistribution_proto)
  if urls:
    self._info_proto.location.urls[:] = urls

  self._features = features
  self._splits = splits_lib.SplitDict()

  if supervised_keys is not None:
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    input_key, output_key = supervised_keys
    self._info_proto.supervised_keys.input = input_key
    self._info_proto.supervised_keys.output = output_key

  if metadata and not isinstance(metadata, Metadata):
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Is this object initialized with both the static and the dynamic data?
  self._fully_initialized = False
def __init__(self,
             builder,
             description=None,
             features=None,
             supervised_keys=None,
             homepage=None,
             citation=None,
             metadata=None,
             redistribution_info=None):
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder`, dataset builder for this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: `tuple` of `(input_key, target_key)`, Specifies the
      input feature and the label for supervised learning, if applicable for
      the dataset. The keys correspond to the feature names to select in
      `info.features`. When calling `tfds.core.DatasetBuilder.as_dataset()`
      with `as_supervised=True`, the `tf.data.Dataset` object will yield the
      (input, target) defined here.
    homepage: `str`, optional, the homepage for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    redistribution_info: `dict`, optional, information needed for
      redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
      Must contain a `license` key; its content will automatically be
      written to a LICENSE file stored with the dataset.

  Raises:
    ValueError: if `features` is not a `TopLevelFeature`, or if `metadata`
      is set but is not a `tfds.core.Metadata` instance.
  """
  self._builder = builder

  if redistribution_info:
    # Work on a copy: the original code popped `license` directly from the
    # caller's dict, mutating the argument as a side effect.
    redistribution_info = dict(redistribution_info)
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        license=utils.dedent(redistribution_info.pop("license")),
        **redistribution_info)
  else:
    redistribution_proto = None

  self._info_proto = dataset_info_pb2.DatasetInfo(
      name=builder.name,
      description=utils.dedent(description),
      version=str(builder._version),  # pylint: disable=protected-access
      citation=utils.dedent(citation),
      redistribution_info=redistribution_proto)

  if homepage:
    self._info_proto.location.urls[:] = [homepage]

  if features:
    if not isinstance(features, top_level_feature.TopLevelFeature):
      raise ValueError(
          "DatasetInfo.features only supports FeaturesDict or Sequence at "
          "the top-level. Got {}".format(features))
    features._set_top_level()  # pylint: disable=protected-access
  self._features = features
  self._splits = splits_lib.SplitDict(self._builder.name)

  if supervised_keys is not None:
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    self._info_proto.supervised_keys.input = supervised_keys[0]
    self._info_proto.supervised_keys.output = supervised_keys[1]

  if metadata and not isinstance(metadata, Metadata):
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Is this object initialized with both the static and the dynamic data?
  self._fully_initialized = False
def __init__(
    self,
    *,
    builder: Union[DatasetIdentity, Any],
    description: Optional[str] = None,
    features: Optional[feature_lib.FeatureConnector] = None,
    supervised_keys: Optional[SupervisedKeysType] = None,
    disable_shuffling: bool = False,
    homepage: Optional[str] = None,
    citation: Optional[str] = None,
    metadata: Optional[Metadata] = None,
    license: Optional[str] = None,  # pylint: disable=redefined-builtin
    redistribution_info: Optional[Dict[str, str]] = None,
    split_dict: Optional[splits_lib.SplitDict] = None):
  # pyformat: disable
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder` or `DatasetIdentity`. The dataset builder or
      identity will be used to populate this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: Specifies the input structure for supervised learning,
      if applicable for the dataset, used with "as_supervised". The keys
      correspond to the feature names to select in `info.features`. When
      calling `tfds.core.DatasetBuilder.as_dataset()` with
      `as_supervised=True`, the `tf.data.Dataset` object will yield the
      structure defined by the keys passed here, instead of that defined by
      the `features` argument. Typically this is a `(input_key, target_key)`
      tuple, and the dataset yields a tuple of tensors `(input, target)`
      tensors. To yield a more complex structure, pass a tuple of `tf.nest`
      compatible structures of feature keys. The resulting `Dataset` will
      yield structures with each key replaced by the corresponding tensor.
      For example, passing a triple of keys would return a dataset that
      yields `(feature, target, sample_weights)` triples for keras. Using
      `supervised_keys=({'a':'a','b':'b'}, 'c')` would create a dataset
      yielding a tuple with a dictionary of features in the `features`
      position. Note that selecting features in nested
      `tfds.features.FeaturesDict` objects is not supported.
    disable_shuffling: `bool`, specify whether to shuffle the examples.
    homepage: `str`, optional, the homepage for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    license: license of the dataset. Takes precedence over a `license` key
      in `redistribution_info`.
    redistribution_info: information needed for redistribution, as specified
      in `dataset_info_pb2.RedistributionInfo`. The content of the `license`
      subfield will automatically be written to a LICENSE file stored with
      the dataset.
    split_dict: information about the splits in this dataset.
  """
  # pyformat: enable
  self._builder_or_identity = builder
  if isinstance(builder, DatasetIdentity):
    self._identity = builder
  else:
    self._identity = DatasetIdentity.from_builder(builder)

  if redistribution_info:
    # Work on a copy so the caller's dict is never mutated (the original
    # code popped `license` from the argument in place).
    redistribution_info = dict(redistribution_info)
    if license:
      # Remove any duplicate `license` entry: previously the short-circuit
      # in `license or redistribution_info.pop("license")` left the key in
      # the dict, so `**redistribution_info` below raised
      # `TypeError: got multiple values for keyword argument 'license'`.
      redistribution_info.pop("license", None)
      effective_license = license
    else:
      # As before, a missing `license` key raises `KeyError`.
      effective_license = redistribution_info.pop("license")
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        license=utils.dedent(effective_license),
        **redistribution_info)
  else:
    redistribution_proto = None

  self._info_proto = dataset_info_pb2.DatasetInfo(
      name=self._identity.name,
      description=utils.dedent(description),
      version=str(self._identity.version),
      release_notes=self._identity.release_notes,
      disable_shuffling=disable_shuffling,
      config_name=self._identity.config_name,
      config_description=self._identity.config_description,
      citation=utils.dedent(citation),
      module_name=self._identity.module_name,
      redistribution_info=redistribution_proto)

  if homepage:
    self._info_proto.location.urls[:] = [homepage]

  if features:
    if not isinstance(features, top_level_feature.TopLevelFeature):
      raise ValueError(
          "DatasetInfo.features only supports FeaturesDict or Sequence at "
          "the top-level. Got {}".format(features))
  self._features = features

  self._splits = splits_lib.SplitDict([])
  if split_dict:
    self.set_splits(split_dict)

  if supervised_keys is not None:
    self._info_proto.supervised_keys.CopyFrom(
        _supervised_keys_to_proto(supervised_keys))

  if metadata and not isinstance(metadata, Metadata):
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Is this object initialized with both the static and the dynamic data?
  self._fully_initialized = False