def _download_raw_dataset(metadata):
    """Fetch the raw dataset file described by ``metadata`` if it is not on disk.

    ``metadata`` is indexed with the keys ``"filename"`` (local path),
    ``"url"`` (download source) and ``"sha256"`` (expected digest).

    Nothing is done when ``metadata["filename"]`` already exists. Otherwise the
    file is downloaded through ``util.download_url`` and its digest, as reported
    by ``util.compute_sha256``, is compared with ``metadata["sha256"]``.

    Raises:
        ValueError: the computed digest differs from the one in ``metadata``.

    NOTE(review): a file that fails the check is left on disk, so a later call
    sees it as existing and returns without re-validating it.
    """
    destination = metadata["filename"]
    if not os.path.exists(destination):
        source_url = metadata["url"]
        print(f"Downloading raw dataset from {source_url}...")
        util.download_url(source_url, destination)

        actual_digest = util.compute_sha256(destination)
        expected_digest = metadata["sha256"]
        if actual_digest != expected_digest:
            raise ValueError("Downloaded data file SHA-256 does not match that listed in metadata document.")
def _download_raw_dataset(metadata):
    """Download the raw dataset named in ``metadata`` and check its SHA-256.

    Reads ``metadata['filename']``, ``metadata['url']`` and
    ``metadata['sha256']``. If the target file is already present the function
    returns immediately without downloading or hashing anything.

    Raises:
        ValueError: the digest returned by ``util.compute_sha256`` is not equal
            to ``metadata['sha256']``.

    NOTE(review): the file is not removed on a digest mismatch, so the next
    call takes the "already present" path and skips verification.
    """
    local_path = metadata['filename']
    already_present = os.path.exists(local_path)
    if already_present:
        return

    print('Downloading raw dataset...')
    util.download_url(metadata['url'], local_path)

    print('Computing SHA-256...')
    digest = util.compute_sha256(local_path)
    if digest == metadata['sha256']:
        return
    raise ValueError('Downloaded data file SHA-256 does not match that listed in metadata document.')
def load_or_generate_data(self):
    """Populate the train/test attributes from the processed HDF5 file.

    When ``PROCESSED_DATA_FILENAME`` does not exist yet, its directory is
    created and the file is fetched from ``PROCESSED_DATA_URL``. The file is
    then opened read-only and four datasets are sliced with ``[:]`` into
    ``self.x_train``, ``self.y_train_int``, ``self.x_test`` and
    ``self.y_test_int``. Finally ``self._subsample()`` is called.
    """
    cache_missing = not PROCESSED_DATA_FILENAME.exists()
    if cache_missing:
        PROCESSED_DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
        print('Downloading IAM lines...')
        util.download_url(PROCESSED_DATA_URL, PROCESSED_DATA_FILENAME)

    # (attribute on self, dataset key in the file); label arrays are stored
    # under 'y_*' but exposed as '*_int'.
    attribute_to_key = (
        ('x_train', 'x_train'),
        ('y_train_int', 'y_train'),
        ('x_test', 'x_test'),
        ('y_test_int', 'y_test'),
    )
    with h5py.File(PROCESSED_DATA_FILENAME, 'r') as archive:
        for attribute, key in attribute_to_key:
            # presumably [:] copies the dataset into memory so it stays usable
            # after the file is closed; confirm against h5py semantics
            setattr(self, attribute, archive[key][:])

    self._subsample()
def _download_raw_dataset(metadata: Dict, dl_dirname: Path) -> Path:
    """Ensure the raw dataset file exists under ``dl_dirname`` and return its path.

    Args:
        metadata: mapping indexed with ``"filename"`` (name of the file inside
            ``dl_dirname``), ``"url"`` (download source) and ``"sha256"``
            (expected digest of the downloaded file).
        dl_dirname: directory to download into; created, including parents,
            if it does not exist.

    Returns:
        Path of the dataset file, whether it was already present or was just
        downloaded.

    Raises:
        ValueError: the digest reported by ``util.compute_sha256`` for the
            downloaded file differs from ``metadata["sha256"]``.

    NOTE(review): a file that fails the digest check is left on disk, so a
    later call will treat it as already downloaded and skip verification.
    """
    dl_dirname.mkdir(parents=True, exist_ok=True)
    filename = dl_dirname / metadata["filename"]
    if filename.exists():
        # Return the path here too; a bare ``return`` handed callers None
        # whenever the file was already cached, contradicting ``-> Path``.
        return filename

    print(f"Downloading raw dataset from {metadata['url']} to {filename}...")
    util.download_url(metadata["url"], filename)

    print("Computing SHA-256...")
    sha256 = util.compute_sha256(filename)
    if sha256 != metadata["sha256"]:
        raise ValueError("Downloaded data file SHA-256 does not match that listed in metadata document.")
    return filename
def _download_pages(self):
    """Download every page listed in ``self.data_by_page_id`` into ``PAGES_DIRNAME``.

    ``self.data_by_page_id`` is iterated with ``.items()``; each value is
    indexed with ``"url"``. The local target for a page is
    ``PAGES_DIRNAME / <page id>``. The directory is created (with parents) if
    needed. When there are no pages the function returns after creating the
    directory instead of failing while unpacking an empty ``zip``.

    NOTE(review): ``util.download_url`` is handed a tuple of URLs and a list
    of paths; confirm that helper accepts sequences rather than a single
    URL/path pair.
    """
    PAGES_DIRNAME.mkdir(exist_ok=True, parents=True)

    pages = list(self.data_by_page_id.items())
    if not pages:
        # ``ids, urls = zip(*[])`` would raise ValueError; nothing to fetch.
        return

    # Keep the original container types: URLs as a tuple, targets as a list.
    urls = tuple(data["url"] for _, data in pages)
    filenames = [PAGES_DIRNAME / id_ for id_, _ in pages]
    util.download_url(urls, filenames)