def main():
    """Write an MD5 checksum sidecar for every netCDF file in the cwd.

    BUG FIX: the sidecar is now named ``<file>.nc.md5`` (full filename plus
    ``.md5``) instead of ``<stem>.md5``. The retrieval code in this module
    builds the remote checksum name with ``fullname.with_suffix(".nc.md5")``,
    so stem-only names were never found.
    """
    files = glob.glob('*.nc')
    for ncf in files:
        # Append ".md5" to the FULL filename, keeping the ".nc" suffix.
        outf = '{}.md5'.format(ncf)
        with open(outf, 'w') as f:
            f.write(file_md5_checksum(ncf))
def _get(
    fullname: Path,
    github_url: str,
    branch: str,
    suffix: str,
    cache_dir: Path,
) -> Path:
    """Fetch a remote file (and its ``.md5`` sidecar) into the local cache.

    If a cached copy already exists it is returned as-is; otherwise the file
    and its checksum are downloaded and the checksum is verified.

    Parameters
    ----------
    fullname : Path
        Path of the file relative to the repository root (suffix included).
    github_url : str
        Base URL of the GitHub repository hosting the data.
    branch : str
        The git branch to download from.
    suffix : str
        File suffix; the checksum sidecar is fetched as ``<suffix>.md5``.
    cache_dir : Path
        Root of the local cache; files land under ``cache_dir / branch``.

    Returns
    -------
    Path
        Path of the verified local copy.

    Raises
    ------
    FileNotFoundError
        If the remote ``.md5`` sidecar cannot be retrieved.
    OSError
        If the local checksum does not match the remote one, or the
        downloaded sidecar cannot be read.
    """
    cache_dir = cache_dir.absolute()
    local_file = cache_dir / branch / fullname
    md5name = fullname.with_suffix("{}.md5".format(suffix))
    md5file = cache_dir / branch / md5name

    if not local_file.is_file():
        # This will always leave this directory on disk.
        # We may want to add an option to remove it.
        local_file.parent.mkdir(parents=True, exist_ok=True)

        url = "/".join((github_url, "raw", branch, fullname.as_posix()))
        # Lazy %-args: formatting is deferred until the record is emitted.
        LOGGER.info("Fetching remote file: %s", fullname.as_posix())
        urlretrieve(url, local_file)
        try:
            url = "/".join((github_url, "raw", branch, md5name.as_posix()))
            LOGGER.info("Fetching remote file md5: %s", md5name.as_posix())
            urlretrieve(url, md5file)
        except HTTPError as e:
            msg = f"{md5name.as_posix()} not found. Aborting file retrieval."
            local_file.unlink()
            raise FileNotFoundError(msg) from e

        localmd5 = file_md5_checksum(local_file)
        try:
            with open(md5file) as f:
                remotemd5 = f.read()
            # Strip both sides: the sidecar typically ends with a newline.
            if localmd5.strip() != remotemd5.strip():
                # Remove the corrupt download so a later call re-fetches it.
                local_file.unlink()
                msg = """
                MD5 checksum does not match, try downloading dataset again.
                """
                raise OSError(msg)
        except OSError as e:
            # BUG FIX: this error was previously swallowed after logging, so
            # a checksum mismatch returned the path of a file that had just
            # been unlinked. Log for visibility, then propagate.
            LOGGER.error(e)
            raise
    return local_file
def main():
    """Write a ``<name>.nc.md5`` checksum sidecar next to each netCDF file.

    Scans the current working directory for ``*.nc`` files and, for each one,
    writes the digest produced by ``file_md5_checksum`` to a file named by
    appending ``.md5`` to the full netCDF filename.
    """
    for nc_path in glob.glob('*.nc'):
        sidecar = '{}.md5'.format(nc_path)
        digest = file_md5_checksum(nc_path)
        with open(sidecar, 'w') as sink:
            sink.write(digest)
def open_dataset(
    name,
    cache: bool = True,
    cache_dir: Path = _default_cache_dir,
    github_url: str = "https://github.com/Ouranosinc/xclim-testdata",
    branch: str = "main",
    **kws,
):
    """
    Open a dataset from the online repository (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name : str
        Name of the file containing the dataset. If no suffix is given,
        assumed to be netCDF ('.nc' is appended). The name may contain a
        path prefix relative to the repository root.
    cache : bool
        If True, then cache data locally for use on subsequent calls.
    cache_dir : Path
        The directory in which to search for and write cached data.
    github_url : str
        Github repository where the data is stored.
    branch : str
        The git branch to download from.
    kws : dict, optional
        Passed to xarray.open_dataset.

    Returns
    -------
    The opened dataset (as returned by ``_open_dataset``).

    Raises
    ------
    OSError
        If the downloaded file's MD5 checksum does not match the remote one.

    See Also
    --------
    xarray.open_dataset
    """
    name = Path(name)
    fullname = name.with_suffix(".nc")
    cache_dir = cache_dir.absolute()
    local_file = cache_dir / fullname
    md5name = fullname.with_suffix(".nc.md5")
    md5file = cache_dir / md5name

    if not local_file.is_file():
        # This will always leave this directory on disk.
        # We may want to add an option to remove it.
        local_file.parent.mkdir(parents=True, exist_ok=True)

        url = "/".join((github_url, "raw", branch, fullname.as_posix()))
        urlretrieve(url, local_file)
        url = "/".join((github_url, "raw", branch, md5name.as_posix()))
        urlretrieve(url, md5file)

        localmd5 = file_md5_checksum(local_file)
        with open(md5file) as f:
            remotemd5 = f.read()
        # BUG FIX: compare stripped digests. The remote .md5 sidecar usually
        # ends with a newline, which made the raw comparison fail even when
        # the checksums agreed (the sibling _get() already strips both sides).
        if localmd5.strip() != remotemd5.strip():
            local_file.unlink()
            msg = """
            MD5 checksum does not match, try downloading dataset again.
            """
            raise OSError(msg)

    ds = _open_dataset(local_file, **kws)
    if not cache:
        # Load into memory, then drop the on-disk copy when caching is off.
        ds = ds.load()
        local_file.unlink()
    return ds