def test_builder_from_metadata(code_builder: dataset_builder.DatasetBuilder):
  """Checks that a builder rebuilt from a `DatasetInfo` proto mirrors it."""
  expected_features = features_dict.FeaturesDict({
      'a': tf.float32,
      'b': tf.string,
  })
  proto_fields = dict(
      name='abcd',
      description='efgh',
      config_name='en',
      config_description='something',
      version='0.1.0',
      release_notes={'0.1.0': 'release description'},
      citation='some citation',
      features=expected_features.to_proto(),
  )
  info_proto = dataset_info_pb2.DatasetInfo(**proto_fields)

  builder = read_only_builder.builder_from_metadata(
      code_builder.data_dir, info_proto=info_proto)

  # Dataset-level information is copied from the proto.
  assert builder.name == info_proto.name
  assert builder.info.description == info_proto.description
  assert builder.info.citation == info_proto.citation
  assert builder.info.version == info_proto.version

  # A builder config is created from the `config_*` proto fields.
  assert builder.builder_config
  assert builder.builder_config.name == info_proto.config_name
  assert builder.builder_config.version == info_proto.version
  assert (
      builder.builder_config.description == info_proto.config_description)
  assert builder.builder_config.release_notes == info_proto.release_notes

  # Features survive the proto round trip.
  assert str(builder.info.features) == str(expected_features)
def test_dataset_info_from_proto():
  """Checks `DatasetInfo.from_proto` restores splits, features and version."""
  builder = RandomShapedImageGenerator(data_dir=testing.make_tmp_dir())

  # Two splits with different shard layouts.
  train_split = dataset_info_pb2.SplitInfo(
      name="train", num_shards=2, shard_lengths=[4, 5])
  test_split = dataset_info_pb2.SplitInfo(
      name="test", num_shards=3, shard_lengths=[1, 2, 3])

  # A features dict holding a single text feature.
  text_feature = feature_pb2.Feature(
      python_class_name="tensorflow_datasets.core.features.text_feature.Text",
      text=feature_pb2.TextFeature())
  nested_features = feature_pb2.FeaturesDict(features={"text": text_feature})
  top_level_feature_proto = feature_pb2.Feature(
      python_class_name=
      "tensorflow_datasets.core.features.features_dict.FeaturesDict",
      features_dict=nested_features)

  proto = dataset_info_pb2.DatasetInfo(
      name="random_shaped_image_generator",
      version=str(builder.version),
      features=top_level_feature_proto,
      splits=[train_split, test_split])

  result = dataset_info.DatasetInfo.from_proto(builder=builder, proto=proto)

  assert result.splits["test"].shard_lengths == test_split.shard_lengths
  assert result.splits["train"].shard_lengths == train_split.shard_lengths
  assert set(result.features.keys()) == {"text"}
  assert result.version == builder.version
def _parse_dataset_info_proto(
    self, config_name: str,
    config: Mapping[str, Any]) -> dataset_info_pb2.DatasetInfo:
  """Parses a DatasetInfo proto from the given Json.

  Args:
    config_name: Name of the config; used for both the `name` and the
      `module_name` fields of the returned proto.
    config: Parsed JSON mapping. The keys read here are `splits` (a mapping of
      split name to a mapping with `num_examples` and `num_bytes`), `version`
      (either a `str` or a mapping with a `version_str` entry), `description`,
      `citation` and `license`. The whole mapping is also handed to
      `_get_huggingface_features` to build the features.

  Returns:
    The `DatasetInfo` proto built from `config`.

  Raises:
    ValueError: If `config['version']` is neither a `dict` nor a `str`.
  """
  # Each split is recorded as a single shard holding all of its examples.
  splits = [
      dataset_info_pb2.SplitInfo(
          name=name,
          num_shards=1,
          shard_lengths=[details['num_examples']],
          num_bytes=details['num_bytes'])
      for name, details in config['splits'].items()
  ]

  raw_version = config['version']
  if isinstance(raw_version, dict):
    version = raw_version['version_str']
  elif isinstance(raw_version, str):
    version = raw_version
  else:
    # Fail with an explicit message instead of an UnboundLocalError on
    # `version` further down.
    raise ValueError(
        f'Unsupported version for config {config_name!r}: expected a str or '
        f'a dict with a `version_str` entry, got {raw_version!r}.')

  return dataset_info_pb2.DatasetInfo(
      name=config_name,
      module_name=config_name,
      description=config['description'],
      version=version,
      citation=config['citation'],
      redistribution_info=dataset_info_pb2.RedistributionInfo(
          license=config['license']),
      splits=splits,
      features=_get_huggingface_features(config),
  )
def read_from_json(json_filename):
  """Loads the JSON file at `json_filename` into a `DatasetInfo` proto.

  Args:
    json_filename: Path of the JSON file, opened through `tf.io.gfile.GFile`.

  Returns:
    A `dataset_info_pb2.DatasetInfo` populated from the file content.
  """
  with tf.io.gfile.GFile(json_filename) as json_file:
    serialized_info = json_file.read()
  # Deserialize the JSON text into a freshly created proto.
  return json_format.Parse(serialized_info, dataset_info_pb2.DatasetInfo())
def __init__(self,
             builder,
             description=None,
             features=None,
             supervised_keys=None,
             urls=None,
             citation=None,
             metadata=None,
             redistribution_info=None):
  """Builds the static part of a `DatasetInfo`.

  Args:
    builder: `DatasetBuilder` this info belongs to; its name and version are
      copied into the proto.
    description: `str`, description of the dataset.
    features: `tfds.features.FeaturesDict` describing the feature dict of the
      `tf.data.Dataset()` returned by `builder.as_dataset()`.
    supervised_keys: `tuple` of two elements, the input feature and the label
      for supervised learning, if applicable for the dataset.
    urls: `list(str)`, optional, the homepage(s) of the dataset.
    citation: `str`, optional, the citation to use for the dataset.
    metadata: `tfds.core.Metadata`, additional object stored/restored with
      the dataset, allowing extra information to be kept alongside it.
    redistribution_info: `dict`, optional, information needed for
      redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
      The content of the `license` subfield will automatically be written to
      a LICENSE file stored with the dataset.

  Raises:
    ValueError: If `metadata` is truthy but not a `Metadata` instance.
  """
  self._builder = builder

  if redistribution_info:
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        **redistribution_info)
  else:
    redistribution_proto = None

  info_proto = dataset_info_pb2.DatasetInfo(
      name=builder.name,
      description=description,
      version=str(builder._version),  # pylint: disable=protected-access
      citation=citation,
      redistribution_info=redistribution_proto)
  self._info_proto = info_proto
  if urls:
    info_proto.location.urls[:] = urls

  self._features = features
  self._splits = splits_lib.SplitDict()

  if supervised_keys is not None:
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    keys_proto = info_proto.supervised_keys
    keys_proto.input = supervised_keys[0]
    keys_proto.output = supervised_keys[1]

  metadata_is_invalid = metadata and not isinstance(metadata, Metadata)
  if metadata_is_invalid:
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Only the static data is known at this point; the dynamic data is not
  # loaded by the constructor.
  self._fully_initialized = False
def __init__(self,
             builder,
             description=None,
             features=None,
             supervised_keys=None,
             splits=None,
             urls=None,
             download_checksums=None,
             size_in_bytes=0,
             citation=None):
  """Builds the static part of a `DatasetInfo`.

  Args:
    builder: `DatasetBuilder` this info belongs to; its name and version are
      copied into the proto.
    description: `str`, description of the dataset.
    features: `tfds.features.FeaturesDict` describing the feature dict of the
      `tf.data.Dataset()` returned by `builder.as_dataset()`.
    supervised_keys: `tuple` of two elements, the input feature and the label
      for supervised learning, if applicable for the dataset.
    splits: `tfds.core.SplitDict`, the available splits. A new empty
      `SplitDict` is used when this is falsy.
    urls: `list(str)`, optional, the homepage(s) of the dataset.
    download_checksums: `dict<str url, str sha256>`, URL to sha256 of file.
      If a url is not listed, its checksum is not checked.
    size_in_bytes: `int`, optional, approximate raw size in bytes of the data
      that will be downloaded from the internet.
    citation: `str`, optional, the citation to use for the dataset.
  """
  self._builder = builder

  info_proto = dataset_info_pb2.DatasetInfo(
      name=builder.name,
      description=description,
      version=str(builder._version),  # pylint: disable=protected-access
      size_in_bytes=int(size_in_bytes),
      citation=citation)
  self._info_proto = info_proto
  if urls:
    info_proto.location.urls[:] = urls
  checksums = download_checksums if download_checksums else {}
  info_proto.download_checksums.update(checksums)

  self._features = features
  if splits:
    self._splits = splits
  else:
    self._splits = splits_lib.SplitDict()

  if supervised_keys is not None:
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    keys_proto = info_proto.supervised_keys
    keys_proto.input = supervised_keys[0]
    keys_proto.output = supervised_keys[1]

  # Only the static data is known at this point; the dynamic data is not
  # loaded by the constructor.
  self._fully_initialized = False
def __init__(self,
             name=None,
             description=None,
             features=None,
             supervised_keys=None,
             splits=None,
             urls=None,
             size_in_bytes=0,
             citation=None):
  """Builds the static part of a `DatasetInfo`.

  Args:
    name: `str`, name of the dataset, usually set to `builder.name`.
    description: `str`, description of the dataset.
    features: `tfds.features.FeaturesDict` describing the feature dict of the
      `tf.data.Dataset()` returned by `builder.as_dataset()`.
    supervised_keys: `tuple` of two elements, the input feature and the label
      for supervised learning, if applicable for the dataset.
    splits: `SplitDict`, the available splits. A new empty `SplitDict` is
      used when this is falsy.
    urls: `list(str)`, optional, the homepage(s) of the dataset.
    size_in_bytes: `int`, optional, approximate raw size in bytes of the data
      that will be downloaded from the internet.
    citation: `str`, optional, the citation to use for the dataset.
  """
  info_proto = dataset_info_pb2.DatasetInfo(
      name=name,
      description=description,
      size_in_bytes=int(size_in_bytes),
      citation=citation)
  self._info_proto = info_proto
  if urls:
    info_proto.location.urls[:] = urls

  self._features = features
  if splits:
    self._splits = splits
  else:
    self._splits = splits_lib.SplitDict()

  if supervised_keys is not None:
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    keys_proto = info_proto.supervised_keys
    keys_proto.input = supervised_keys[0]
    keys_proto.output = supervised_keys[1]

  # Only the static data is known at this point; the dynamic data is not
  # loaded by the constructor.
  self._fully_initialized = False
def __init__(self,
             builder,
             description=None,
             features=None,
             supervised_keys=None,
             urls=None,
             citation=None):
  """Builds the static part of a `DatasetInfo`.

  Args:
    builder: `DatasetBuilder` this info belongs to; its name and version are
      copied into the proto.
    description: `str`, description of the dataset.
    features: `tfds.features.FeaturesDict` describing the feature dict of the
      `tf.data.Dataset()` returned by `builder.as_dataset()`.
    supervised_keys: `tuple` of two elements, the input feature and the label
      for supervised learning, if applicable for the dataset.
    urls: `list(str)`, optional, the homepage(s) of the dataset.
    citation: `str`, optional, the citation to use for the dataset.
  """
  self._builder = builder

  info_proto = dataset_info_pb2.DatasetInfo(
      name=builder.name,
      description=description,
      version=str(builder._version),  # pylint: disable=protected-access
      citation=citation)
  self._info_proto = info_proto
  if urls:
    info_proto.location.urls[:] = urls

  self._features = features
  self._splits = splits_lib.SplitDict()

  if supervised_keys is not None:
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    keys_proto = info_proto.supervised_keys
    keys_proto.input = supervised_keys[0]
    keys_proto.output = supervised_keys[1]

  # Only the static data is known at this point; the dynamic data is not
  # loaded by the constructor.
  self._fully_initialized = False
def read_from_directory(self, dataset_info_dir):
  """Reloads this `DatasetInfo` from the metadata file of a dataset directory.

  The proto held by this object is replaced by the one parsed from the
  metadata file, so all previous metadata is overwritten, and the splits are
  rebuilt from it. When the metadata file does not exist, the method returns
  without changing anything.

  Args:
    dataset_info_dir: `str`, the directory containing the metadata file. This
      should be the root directory of a specific dataset version.

  Raises:
    ValueError: If `dataset_info_dir` is empty or `None`.
  """
  if not dataset_info_dir:
    raise ValueError(
        "Calling read_from_directory with undefined dataset_info_dir.")

  info_path = self._dataset_info_filename(dataset_info_dir)
  if not tf.gfile.Exists(info_path):
    # Nothing was written yet: leave the object as it is.
    return

  with tf.gfile.Open(info_path, "r") as info_file:
    serialized_info = info_file.read()

  # Swap the current proto for the one stored on disk.
  restored_proto = json_format.Parse(serialized_info,
                                     dataset_info_pb2.DatasetInfo())
  self._info_proto = restored_proto

  # Rebuild the splits from the restored proto.
  self._splits = splits_lib.SplitDict.from_proto(restored_proto.splits)

  # Static and dynamic data are now both available.
  self._fully_initialized = True
def __init__(self,
             builder,
             description=None,
             features=None,
             supervised_keys=None,
             homepage=None,
             citation=None,
             metadata=None,
             redistribution_info=None):
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder`, dataset builder for this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: `tuple` of `(input_key, target_key)`, Specifies the
      input feature and the label for supervised learning, if applicable for
      the dataset. The keys correspond to the feature names to select in
      `info.features`. When calling `tfds.core.DatasetBuilder.as_dataset()`
      with `as_supervised=True`, the `tf.data.Dataset` object will yield
      the (input, target) defined here.
    homepage: `str`, optional, the homepage for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    redistribution_info: `dict`, optional, information needed for
      redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
      The content of the `license` subfield will automatically be written to
      a LICENSE file stored with the dataset. The dict is not modified.

  Raises:
    ValueError: If `features` is truthy but not a `TopLevelFeature`, or if
      `metadata` is truthy but not a `Metadata` instance.
  """
  self._builder = builder

  if redistribution_info:
    # Work on a copy: popping from the caller's dict would silently strip its
    # `license` entry (and break any reuse of that dict).
    redistribution_kwargs = dict(redistribution_info)
    # `license` is optional; a missing entry leaves the proto field unset
    # instead of raising a bare KeyError.
    license_text = redistribution_kwargs.pop("license", None)
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        license=utils.dedent(license_text), **redistribution_kwargs)
  else:
    redistribution_proto = None

  self._info_proto = dataset_info_pb2.DatasetInfo(
      name=builder.name,
      description=utils.dedent(description),
      version=str(builder._version),  # pylint: disable=protected-access
      citation=utils.dedent(citation),
      redistribution_info=redistribution_proto)
  if homepage:
    self._info_proto.location.urls[:] = [homepage]

  if features:
    if not isinstance(features, top_level_feature.TopLevelFeature):
      raise ValueError(
          "DatasetInfo.features only supports FeaturesDict or Sequence at "
          "the top-level. Got {}".format(features))
    features._set_top_level()  # pylint: disable=protected-access
  self._features = features
  self._splits = splits_lib.SplitDict(self._builder.name)

  if supervised_keys is not None:
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    self._info_proto.supervised_keys.input = supervised_keys[0]
    self._info_proto.supervised_keys.output = supervised_keys[1]

  if metadata and not isinstance(metadata, Metadata):
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Is this object initialized with both the static and the dynamic data?
  self._fully_initialized = False
def read_from_json(path: type_utils.PathLike) -> dataset_info_pb2.DatasetInfo:
  """Loads the JSON file at `path` into a `DatasetInfo` proto.

  Args:
    path: Location of the JSON file; converted with `utils.as_path` before
      being read as text.

  Returns:
    A `dataset_info_pb2.DatasetInfo` populated from the file content.
  """
  # Read the whole file, then deserialize it into a freshly created proto.
  return json_format.Parse(
      utils.as_path(path).read_text(), dataset_info_pb2.DatasetInfo())
def __init__(
    self,
    *,
    builder: Union[DatasetIdentity, Any],
    description: Optional[str] = None,
    features: Optional[feature_lib.FeatureConnector] = None,
    supervised_keys: Optional[SupervisedKeysType] = None,
    disable_shuffling: bool = False,
    homepage: Optional[str] = None,
    citation: Optional[str] = None,
    metadata: Optional[Metadata] = None,
    license: Optional[str] = None,  # pylint: disable=redefined-builtin
    redistribution_info: Optional[Dict[str, str]] = None,
    split_dict: Optional[splits_lib.SplitDict] = None):
  # pyformat: disable
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder` or `DatasetIdentity`. The dataset builder or
      identity will be used to populate this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: Specifies the input structure for supervised learning,
      if applicable for the dataset, used with "as_supervised". The keys
      correspond to the feature names to select in `info.features`. When
      calling `tfds.core.DatasetBuilder.as_dataset()` with
      `as_supervised=True`, the `tf.data.Dataset` object will yield the
      structure defined by the keys passed here, instead of that defined by
      the `features` argument. Typically this is a `(input_key, target_key)`
      tuple, and the dataset yields a tuple of tensors `(input, target)`
      tensors.

      To yield a more complex structure, pass a tuple of `tf.nest` compatible
      structures of feature keys. The resulting `Dataset` will yield
      structures with each key replaced by the corresponding tensor. For
      example, passing a triple of keys would return a dataset that yields
      `(feature, target, sample_weights)` triples for keras. Using
      `supervised_keys=({'a':'a','b':'b'}, 'c')` would create a dataset
      yielding a tuple with a dictionary of features in the `features`
      position.

      Note that selecting features in nested `tfds.features.FeaturesDict`
      objects is not supported.
    disable_shuffling: `bool`, specify whether to shuffle the examples.
      Stored as-is in the `disable_shuffling` field of the info proto.
    homepage: `str`, optional, the homepage for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    license: license of the dataset. When given, it takes precedence over a
      `license` entry of `redistribution_info`, and it is recorded even when
      `redistribution_info` is not provided.
    redistribution_info: information needed for redistribution, as specified
      in `dataset_info_pb2.RedistributionInfo`. The content of the `license`
      subfield will automatically be written to a LICENSE file stored with
      the dataset. The dict is not modified.
    split_dict: information about the splits in this dataset.

  Raises:
    ValueError: If `features` is truthy but not a `TopLevelFeature`, or if
      `metadata` is truthy but not a `Metadata` instance.
  """
  # pyformat: enable
  self._builder_or_identity = builder
  if isinstance(builder, DatasetIdentity):
    self._identity = builder
  else:
    self._identity = DatasetIdentity.from_builder(builder)

  if redistribution_info:
    # Work on a copy so that the caller's dict keeps its `license` entry.
    redistribution_kwargs = dict(redistribution_info)
    # Always remove `license` from the expanded kwargs: leaving it in while
    # also passing `license=` would raise a duplicate-keyword TypeError.
    license_from_dict = redistribution_kwargs.pop("license", None)
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        license=utils.dedent(license or license_from_dict),
        **redistribution_kwargs)
  elif license:
    # A license passed without `redistribution_info` must not be dropped.
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        license=utils.dedent(license))
  else:
    redistribution_proto = None

  self._info_proto = dataset_info_pb2.DatasetInfo(
      name=self._identity.name,
      description=utils.dedent(description),
      version=str(self._identity.version),
      release_notes=self._identity.release_notes,
      disable_shuffling=disable_shuffling,
      config_name=self._identity.config_name,
      config_description=self._identity.config_description,
      citation=utils.dedent(citation),
      module_name=self._identity.module_name,
      redistribution_info=redistribution_proto)

  if homepage:
    self._info_proto.location.urls[:] = [homepage]

  if features:
    if not isinstance(features, top_level_feature.TopLevelFeature):
      raise ValueError(
          "DatasetInfo.features only supports FeaturesDict or Sequence at "
          "the top-level. Got {}".format(features))
  self._features = features
  self._splits = splits_lib.SplitDict([])
  if split_dict:
    self.set_splits(split_dict)
  if supervised_keys is not None:
    self._info_proto.supervised_keys.CopyFrom(
        _supervised_keys_to_proto(supervised_keys))

  if metadata and not isinstance(metadata, Metadata):
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Is this object initialized with both the static and the dynamic data?
  self._fully_initialized = False
def read_from_directory(self, dataset_info_dir, from_packaged_data=False):
  """Update DatasetInfo from the JSON file in `dataset_info_dir`.

  The JSON file is parsed into a `DatasetInfo` proto, from which the splits,
  the schema, the download checksums and the size in bytes are copied onto
  this object. Feature metadata is reloaded from the same directory when
  features are defined. Unless `from_packaged_data` is set, the parsed proto
  then replaces the proto held by this object entirely.

  Args:
    dataset_info_dir: `str` The directory containing the metadata file. This
      should be the root directory of a specific dataset version.
    from_packaged_data: `bool`, If data is restored from packaged data,
      then only the information not defined in the code is updated.

  Returns:
    True if we were able to initialize using `dataset_info_dir`, else False
    (the metadata file does not exist).

  Raises:
    ValueError: If `dataset_info_dir` is empty or `None`.
    AssertionError: If the builder version differs from `self.version` after
      the update.
  """
  if not dataset_info_dir:
    raise ValueError(
        "Calling read_from_directory with undefined dataset_info_dir.")

  json_filename = self._dataset_info_filename(dataset_info_dir)

  # Load the metadata from disk
  if not tf.gfile.Exists(json_filename):
    return False

  with tf.gfile.Open(json_filename, "r") as f:
    dataset_info_json_str = f.read()

  # Parse it back into a proto.
  parsed_proto = json_format.Parse(dataset_info_json_str,
                                   dataset_info_pb2.DatasetInfo())

  # Update splits
  self.splits = splits_lib.SplitDict.from_proto(parsed_proto.splits)
  # Update schema
  self.as_proto.schema.CopyFrom(parsed_proto.schema)
  # Restore the feature metadata (vocabulary, labels names,...)
  if self.features:
    self.features.load_metadata(dataset_info_dir)
  # Restore download info
  self.download_checksums = parsed_proto.download_checksums
  self.size_in_bytes = parsed_proto.size_in_bytes

  # If we are restoring on-disk data, then we also restore all dataset info
  # information from the previously saved proto.
  # If we are loading from packaged data (only possible when we do not
  # restore previous data), then do not restore the info which are already
  # defined in the code. Otherwise, we would overwrite code info.
  if not from_packaged_data:
    # Update the full proto
    self._info_proto = parsed_proto

  # NOTE(review): this check is assumed to run for both values of
  # `from_packaged_data` (i.e. it is not nested under the `if` above) --
  # confirm against the original layout.
  if self._builder._version != self.version:  # pylint: disable=protected-access
    raise AssertionError(
        "The constructed DatasetInfo instance and the restored proto version "
        "do not match. Builder version: {}. Proto version: {}".format(
            self._builder._version, self.version))  # pylint: disable=protected-access

  # Mark as fully initialized.
  self._fully_initialized = True

  return True