def test_builder_code_not_found(code_builder: dataset_builder.DatasetBuilder):
    """Checks that the builder is restored from files when the code is missing."""
    ds_name = code_builder.name
    missing_data_dir = '/tmp/non-existing/tfds/dir'
    not_registered = registered.DatasetNotFoundError(ds_name)

    # Make `load.builder_cls` raise, emulating a dataset which is not registered.
    with mock.patch.object(load, 'builder_cls', side_effect=not_registered):
        # The files exist but the code does not: the builder comes from files.
        files_builder = load.builder(ds_name)
        assert isinstance(files_builder, read_only_builder.ReadOnlyBuilder)
        load.load(ds_name, split=[])  # Dataset found -> no error.

        if code_builder.builder_config:
            # Without the code, the default config is inferred from `.config/`.
            assert (
                files_builder.builder_config.name
                == code_builder.BUILDER_CONFIGS[0].name
            )

            # Passing the config explicitly (`ds/config`) should work too.
            explicit_name = f'{ds_name}/{code_builder.builder_config.name}'
            files_builder = load.builder(explicit_name)
            assert isinstance(files_builder, read_only_builder.ReadOnlyBuilder)

        # Neither code nor files are available: DatasetNotFoundError is raised.
        with pytest.raises(registered.DatasetNotFoundError):
            load.builder(ds_name, data_dir=missing_data_dir)

        with pytest.raises(registered.DatasetNotFoundError):
            load.load(ds_name, split=[], data_dir=missing_data_dir)
def builder_cls(
    self,
    name: naming.DatasetName,
) -> Type[dataset_builder.DatasetBuilder]:
    """Returns the builder class of a dataset from its namespace's registers.

    Args:
        name: The name and namespace of the dataset whose builder class is
            requested.

    Returns:
        The builder class provided by the first register able to load it.

    Raises:
        DatasetNotFoundError: If none of the registers could load the dataset.
            When there is exactly one register, whatever that register raises
            propagates unchanged.
    """
    candidates = self._get_registers(name)

    # Common case: a single register. Query it directly, without swallowing
    # anything, so that its own (more informative) exception reaches the caller.
    if len(candidates) == 1:
        return candidates[0].builder_cls(name)

    # Several registers: the first one which knows the dataset wins.
    for candidate in candidates:
        try:
            found_cls = candidate.builder_cls(name)
        except registered.DatasetNotFoundError:
            continue  # Not in this register, try the next one.
        else:
            return found_cls

    raise registered.DatasetNotFoundError(
        f'Namespace {name.namespace} found, '
        f'but could not load dataset {name.name}.'
        f'{self._get_list_builders_context(name)}')
def test_builder_code_not_found(code_builder: dataset_builder.DatasetBuilder):
    """Checks that the builder is restored from files when the code is missing."""
    ds_name = code_builder.name
    missing_data_dir = '/tmp/non-existing/tfds/dir'
    not_registered = registered.DatasetNotFoundError(ds_name)

    # Make `load.builder_cls` raise, emulating a dataset which is not registered.
    with mock.patch.object(load, 'builder_cls', side_effect=not_registered):
        # Without the code, the config has to be named explicitly:
        # tfds.builder('ds/config')
        config_name = (
            f'{ds_name}/{code_builder.builder_config.name}'
            if code_builder.builder_config
            else ds_name
        )

        # The files exist but the code does not: the builder comes from files.
        files_builder = load.builder(config_name)
        assert isinstance(files_builder, read_only_builder.ReadOnlyBuilder)
        load.load(config_name, split=[])  # Dataset found -> no error.

        # Neither code nor files are available: DatasetNotFoundError is raised.
        with pytest.raises(registered.DatasetNotFoundError):
            load.builder(config_name, data_dir=missing_data_dir)

        with pytest.raises(registered.DatasetNotFoundError):
            load.load(config_name, split=[], data_dir=missing_data_dir)
def builder_cls(
    self,
    name: utils.DatasetName,
) -> Type[dataset_builder.DatasetBuilder]:
    """Always raises: this register cannot provide builder classes.

    Args:
        name: Dataset name; its namespace is checked against
            `self.namespaces`.

    Raises:
        DatasetNotFoundError: If `name.namespace` is not in `self.namespaces`.
        NotImplementedError: If the namespace is known, since `builder_cls` is
            not supported for data_dir-based community datasets.
    """
    if name.namespace in self.namespaces:  # pylint: disable=unsupported-membership-test
        # Known namespace, but there is no builder class to hand out.
        raise NotImplementedError(
            'builder_cls does not support data_dir-based community datasets. Got: '
            f'{name}')

    raise registered.DatasetNotFoundError(
        f'Namespace {name.namespace} not found. Should be one of: '
        f'{sorted(self.namespaces)}')
def _download_or_reuse_cache(
    name: utils.DatasetName,
    package_index: _PackageIndex,
) -> _InstalledPackage:
    """Returns the installed package for a dataset, downloading it if needed.

    The dataset is resolved in this order:

    * Already installed locally (in the cache) -> the cached version is reused.
    * Not installed, but listed in the package index -> it is installed.
    * Not listed in the package index -> an error is raised.

    Args:
        name: Dataset name to load.
        package_index: Index of all community datasets. It is refreshed when
            it does not contain `name`.

    Returns:
        The installed dataset information.

    Raises:
        DatasetNotFoundError: If the dataset is neither cached nor present in
            the package index.
    """
    # TODO(tfds): To force a download even if file already present, we
    # should add a `ignore_cache=True` option in `tfds.load`. Or should always
    # try to download the file ?
    cached_package = _get_last_installed_version(name)
    if cached_package:
        return cached_package

    # Nothing cached: the package location has to come from the index. An
    # index which does not know the dataset is refreshed before giving up.
    if name not in package_index:
        package_index.refresh()

    package = package_index.get(name)
    if package:
        return _download_and_cache(package)

    # Still unknown after the refresh.
    raise registered.DatasetNotFoundError(
        f'Could not find dataset {name}: Dataset not found among the '
        f'{len(package_index)} datasets of the community index.'
    )
def builder(
    self,
    name: naming.DatasetName,
    **builder_kwargs: Any,
) -> dataset_builder.DatasetBuilder:
    """Returns the dataset builder from the single register of its namespace.

    Args:
        name: The name and namespace of the dataset to build.
        **builder_kwargs: Keyword arguments forwarded to the register's
            `builder` method.

    Returns:
        The builder returned by the only register of the namespace.

    Raises:
        ValueError: If the namespace has more than one register.
        DatasetNotFoundError: If the namespace has no register at all.
    """
    registers = self._get_registers(name)
    num_registers = len(registers)

    if num_registers > 1:
        raise ValueError(f'Namespace {name.namespace} has multiple registers! '
                         f'This should not happen! Registers: {registers}')

    if num_registers == 0:
        raise registered.DatasetNotFoundError(
            f'Namespace {name.namespace} found with {len(registers)} registers, '
            f'but could not load dataset {name.name}.')

    # Exactly one register: delegate, letting its exceptions propagate as-is.
    return registers[0].builder(name, **builder_kwargs)
def _get_registers(
    self, name: naming.DatasetName) -> List[register_base.BaseRegister]:
    """Returns the registers available for the namespace of a dataset.

    Args:
        name: Dataset name; only its `namespace` is used for the lookup.

    Returns:
        The entry of `self.registers_per_namespace` for `name.namespace`.

    Raises:
        DatasetNotFoundError: If the namespace is not found. The message lists
            the known namespaces and, when one is close enough, suggests it.
    """
    if self.has_namespace(name.namespace):
        return self.registers_per_namespace[name.namespace]

    # Unknown namespace: assemble a helpful error message piece by piece.
    message_parts = [
        f'\nNamespace {name.namespace} not found. ',
        (f'Note that the namespace should be one of: '
         f'{sorted(self.registers_per_namespace.keys())}.\n'),
    ]

    # Suggest at most one similarly spelled namespace (likely a typo).
    close_matches = difflib.get_close_matches(
        name.namespace, self.registers_per_namespace, n=1)
    if close_matches:
        message_parts.append(
            f'Did you mean: {name.namespace} -> {close_matches[0]} ?\n')

    raise registered.DatasetNotFoundError(''.join(message_parts))
def builder_from_files(
    name: str,
    **builder_kwargs: Any,
) -> dataset_builder.DatasetBuilder:
    """Loads a `tfds.core.DatasetBuilder` from files, auto-inferring location.

    This function is similar to `tfds.builder` (same signature), but creates
    the `tfds.core.DatasetBuilder` directly from files, without loading the
    original generation source code.

    It does not support:

    * namespaces (e.g. 'kaggle:dataset')
    * config objects (`dataset/config` valid, but not `config=MyConfig()`)
    * `version='experimental_latest'`

    Args:
        name: Dataset name.
        **builder_kwargs: `tfds.core.DatasetBuilder` kwargs.

    Returns:
        builder: The loaded dataset builder.

    Raises:
        DatasetNotFoundError: If the dataset cannot be loaded.
    """
    builder_dir = _find_builder_dir(name, **builder_kwargs)

    if builder_dir is None:
        # Nothing on disk: report every data dir which is expected to hold
        # the generated dataset.
        data_dirs = constants.list_data_dirs(
            given_data_dir=builder_kwargs.get('data_dir'))
        raise registered.DatasetNotFoundError(
            f'Could not find dataset files for: {name}. Make sure the dataset '
            f'has been generated in: {data_dirs}. If the dataset has configs, you '
            'might have to specify the config name.')

    # A generated dataset was found on disk.
    return builder_from_directory(builder_dir)