def _validate_checksums( url: str, path: epath.Path, computed_url_info: Optional[checksums.UrlInfo], expected_url_info: Optional[checksums.UrlInfo], force_checksums_validation: bool, ) -> None: """Validate computed_url_info match expected_url_info.""" # If force-checksums validations, both expected and computed url_info # should exists if force_checksums_validation: # Checksum of the downloaded file unknown (for manually downloaded file) if not computed_url_info: computed_url_info = checksums.compute_url_info(path) # Checksums have not been registered if not expected_url_info: raise ValueError(f'Missing checksums url: {url}, yet ' '`force_checksums_validation=True`. ' 'Did you forget to register checksums?') if (expected_url_info and computed_url_info and expected_url_info != computed_url_info): msg = ( f'Artifact {url}, downloaded to {path}, has wrong checksum:\n' f'* Expected: {expected_url_info}\n' f'* Got: {computed_url_info}\n' 'To debug, see: ' 'https://www.tensorflow.org/datasets/overview#fixing_nonmatchingchecksumerror' ) raise NonMatchingChecksumError(msg)
def _compute_dir_hash(path: utils.ReadOnlyPath) -> str: """Computes the checksums of the given directory deterministically.""" all_files = sorted(path.iterdir()) if any(f.is_dir() for f in all_files): raise ValueError('Installed package should only contains files.') # Concatenate the filenames and files content to create the directory hash all_checksums = [f.name for f in all_files] all_checksums += [checksums.compute_url_info(f).checksum for f in all_files] return hashlib.sha256(''.join(all_checksums).encode()).hexdigest()
def test_compute_url_info():
  """Checks `compute_url_info` against a known 6-pixel PNG fixture."""
  filepath = utils.tfds_path() / 'testing/test_data/6pixels.png'
  expected = checksums.UrlInfo(
      checksum=(
          '04f38ebed34d3b027d2683193766155912fba647158c583c3bdb4597ad8af34c'
      ),
      size=utils.Size(102),
      filename='6pixels.png',
  )
  actual = checksums.compute_url_info(filepath, checksum_cls=hashlib.sha256)
  assert actual == expected
  assert actual.filename == expected.filename
def _sync_file_copy(
    self,
    filepath: str,
    destination_path: str,
) -> DownloadResult:
  """Copies a file into `destination_path` through the `tf.io.gfile` API.

  Args:
    filepath: Source file path (any filesystem `tf.io.gfile` supports).
    destination_path: Directory the file is copied into; the original base
      name is kept.

  Returns:
    A `DownloadResult` with the destination path and the checksum info
    computed from the copied file.
  """
  dst = os.path.join(destination_path, os.path.basename(filepath))
  tf.io.gfile.copy(filepath, dst)
  info = checksums_lib.compute_url_info(
      dst, checksum_cls=self._checksumer_cls)
  # Report the copy as one fully completed download in the progress bars.
  self._pbar_dl_size.update_total(info.size)
  self._pbar_dl_size.update(info.size)
  self._pbar_url.update(1)
  return DownloadResult(path=epath.Path(dst), url_info=info)