def _test_checksums(self):
  """Asserts every URL passed to `dl_manager.download` has a recorded checksum.

  Looks up url_infos first on the dataset class (dataset-as-folder layout),
  then falls back to the legacy `checksums/` directory. Raises
  FileNotFoundError if no checksums file exists at all; otherwise fails the
  test if any downloaded URL is missing from the registered checksums.
  """
  # If no call to `dl_manager.download`, then no need to check url presence.
  if not self._download_urls:
    return

  # Fixed message: the URL was fused with the following sentence
  # ("...generation_codeIf want...") and the word "you" was missing.
  err_msg = (
      "Did you forget to record checksums with `--register_checksums` ? See "
      "instructions at: "
      "https://www.tensorflow.org/datasets/add_dataset#run_the_generation_code\n"
      "If you want to opt-out of checksums validation, please add "
      "`SKIP_CHECKSUMS = True` to the `DatasetBuilderTestCase`.\n")

  url_infos = self.DATASET_CLASS.url_infos
  filepath = self.DATASET_CLASS._checksums_path  # pylint: disable=protected-access
  # Legacy checksums: Search in `checksums/` dir
  if url_infos is None:
    legacy_filepath = checksums._checksum_paths().get(self.builder.name)  # pylint: disable=protected-access
    if legacy_filepath and legacy_filepath.exists():
      filepath = legacy_filepath
      url_infos = checksums.load_url_infos(filepath)
  # Checksums not present neither in legacy nor package
  if url_infos is None:
    raise FileNotFoundError(
        f"Checksums file not found at: {filepath}\n"
        f"{err_msg}\n")

  missing_urls = self._download_urls - set(url_infos.keys())
  self.assertEmpty(
      missing_urls,
      f"Some urls checksums are missing at: {filepath}\n{err_msg}")
def _collect_path_to_url_infos(
) -> Dict[tfds.core.ReadWritePath, Dict[Url, checksums.UrlInfo]]:
  """Maps every known checksums file path to its parsed url_infos."""
  # Start with the legacy registrations from the `checksums/` directory.
  paths = list(checksums._checksum_paths().values())  # pylint: disable=protected-access
  # Add per-dataset (dataset-as-folder) checksum files that exist on disk.
  for builder_name in tfds.list_builders():
    candidate = tfds.builder_cls(builder_name)._checksums_path  # pylint: disable=protected-access
    if candidate.exists():
      paths.append(candidate)
  # Normalize to writable paths and load each file's url -> UrlInfo mapping.
  result = {}
  for raw_path in paths:
    write_path = tfds.core.utils.to_write_path(raw_path)
    result[write_path] = typing.cast(
        Dict[Url, checksums.UrlInfo], checksums.load_url_infos(write_path))
  return result
def test_checksums(tmp_path: pathlib.Path):
  """Round-trip test: saving then loading url_infos yields the same mapping."""
  checksums_file = tmp_path / 'checksums.tsv'
  expected = {
      url: checksums.UrlInfo(checksum='abcd', size=1234, filename=fname)
      for url, fname in [
          ('http://abc.org/data', 'a.zip'),
          ('http://edf.org/data', 'b.zip'),
      ]
  }
  checksums.save_url_infos(checksums_file, expected)
  assert checksums.load_url_infos(checksums_file) == expected
def download_checksums(self, checksums_url):
  """Fetches the checksum file at `checksums_url` and registers its entries."""
  # Reuse the regular download path for the checksum file itself, then merge
  # its url -> UrlInfo entries into the manager's registry.
  local_path = self.download(checksums_url)
  loaded = checksums.load_url_infos(local_path)
  self._url_infos.update(loaded)