def open_dataset(
    name="remo_output_3d",
    cache=True,
    cache_dir=_default_cache_dir,
    data_url=data.DKRZ_URL,
    folder="tutorial",
    **kws,
):
    """
    Open a dataset from the online repository (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name : str
        Name of the file containing the dataset. If no suffix is given,
        assumed to be netCDF ('.nc' is appended), e.g. 'air_temperature'.
    cache_dir : str, optional
        The directory in which to search for and write cached data.
    cache : bool, optional
        If True, then cache data locally for use on subsequent calls.
    data_url : str
        URL where the data is stored.
    folder : str
        Folder where the data is stored.
    kws : dict, optional
        Passed to xarray.open_dataset.

    See Also
    --------
    xarray.open_dataset
    """
    root, ext = _os.path.splitext(name)
    if not ext:
        ext = ".nc"
    fullname = root + ext

    longdir = _os.path.expanduser(cache_dir)
    localfile = _os.sep.join((longdir, fullname))
    md5name = fullname + ".md5"
    md5file = _os.sep.join((longdir, md5name))

    if not _os.path.exists(localfile):
        # This will always leave this directory on disk.
        # May want to add an option to remove it.
        if not _os.path.isdir(longdir):
            _os.mkdir(longdir)

        url = "/".join((data_url, folder, fullname))
        urlretrieve(url, localfile)
        url = "/".join((data_url, folder, md5name))
        urlretrieve(url, md5file)

        localmd5 = file_md5_checksum(localfile)
        with open(md5file) as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            _os.remove(localfile)
            msg = """
            MD5 checksum does not match, try downloading dataset again.
            """
            raise OSError(msg)

    ds = _open_dataset(localfile, **kws)

    if not cache:
        ds = ds.load()
        _os.remove(localfile)

    return ds
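

# Usage sketch for the variant above (illustrative, not part of the original
# source): it assumes the surrounding module provides _os, _default_cache_dir,
# data.DKRZ_URL, urlretrieve, file_md5_checksum and _open_dataset, and that the
# named file is hosted under data_url/folder. The helper name is hypothetical.
def _example_remo_usage():
    # First call downloads "remo_output_3d.nc" plus its ".md5" companion into
    # the cache directory and verifies the checksum; later calls reuse the
    # cached copy without touching the network.
    ds = open_dataset("remo_output_3d")
    # With cache=False the data are loaded into memory and the local file is
    # removed again after opening.
    ds_mem = open_dataset("remo_output_3d", cache=False)
    return ds, ds_mem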
def open_dataset(
    name: str,
    suffix: Optional[str] = None,
    dap_url: Optional[str] = None,
    github_url: str = "https://github.com/Ouranosinc/raven-testdata",
    branch: str = "master",
    cache: bool = True,
    cache_dir: Path = _default_cache_dir,
    **kwds,
) -> Dataset:
    """Open a dataset from the online GitHub-like repository.

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name: str
        Name of the file containing the dataset. If no suffix is given, assumed
        to be netCDF ('.nc' is appended).
    suffix: str, optional
        File suffix. If None, '.nc' is appended (netCDF assumed). Set to "" for
        no suffix.
    dap_url: str, optional
        URL to the OPeNDAP folder where the data is stored. If supplied,
        supersedes github_url.
    github_url: str
        URL to the GitHub repository where the data is stored.
    branch: str, optional
        For GitHub-hosted files, the branch to download from.
    cache_dir: Path
        The directory in which to search for and write cached data.
    cache: bool
        If True, then cache data locally for use on subsequent calls.
    kwds: dict, optional
        For NetCDF files, keywords passed to xarray.open_dataset.

    Returns
    -------
    Dataset

    See Also
    --------
    xarray.open_dataset
    """
    name = Path(name)
    if suffix is None:
        suffix = ".nc"
    fullname = name.with_suffix(suffix)

    # If an OPeNDAP URL is given, read the data remotely and skip the cache.
    if dap_url is not None:
        dap_file = urljoin(dap_url, str(name))
        try:
            ds = _open_dataset(dap_file, **kwds)
            return ds
        except OSError:
            msg = "OPeNDAP file not read. Verify that the service is available."
            LOGGER.error(msg)
            raise

    # Otherwise, fetch the file from GitHub (or reuse the cached copy).
    local_file = _get(
        fullname=fullname,
        github_url=github_url,
        branch=branch,
        suffix=suffix,
        cache_dir=cache_dir,
    )

    try:
        ds = _open_dataset(local_file, **kwds)
        if not cache:
            # Load into memory and remove the cached file.
            ds = ds.load()
            local_file.unlink()
        return ds
    except OSError:
        raise
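

# Usage sketch for the variant above (illustrative, not part of the original
# source): it assumes the surrounding module provides Optional, Path, Dataset,
# _default_cache_dir, urljoin, LOGGER, _get and _open_dataset. The file name
# and OPeNDAP URL below are placeholders; the helper name is hypothetical.
def _example_raven_usage():
    # Default path: fetch the file from the GitHub repository (branch "master")
    # into the local cache via the module's _get helper, then open it.
    ds = open_dataset("some_folder/some_file")
    # With dap_url set, the file is read remotely over OPeNDAP instead and
    # nothing is written to the local cache.
    ds_dap = open_dataset(
        "some_folder/some_file.nc",
        dap_url="https://example.org/thredds/dodsC/",
    )
    return ds, ds_dap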
def open_dataset(
    name,
    cache: bool = True,
    cache_dir: Path = _default_cache_dir,
    github_url: str = "https://github.com/Ouranosinc/xclim-testdata",
    branch: str = "main",
    **kws,
):
    """
    Open a dataset from the online repository (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name : str
        Name of the file containing the dataset. If no suffix is given,
        assumed to be netCDF ('.nc' is appended). The name may contain a
        folder relative to the repository root.
    cache_dir : Path
        The directory in which to search for and write cached data.
    cache : bool
        If True, then cache data locally for use on subsequent calls.
    github_url : str
        GitHub repository where the data is stored.
    branch : str
        The git branch to download from.
    kws : dict, optional
        Passed to xarray.open_dataset.

    See Also
    --------
    xarray.open_dataset
    """
    name = Path(name)
    fullname = name.with_suffix(".nc")
    cache_dir = cache_dir.absolute()
    local_file = cache_dir / fullname
    md5name = fullname.with_suffix(".nc.md5")
    md5file = cache_dir / md5name

    if not local_file.is_file():
        # This will always leave this directory on disk.
        # We may want to add an option to remove it.
        local_file.parent.mkdir(parents=True, exist_ok=True)

        # Download the dataset and its md5 companion from the raw GitHub URL.
        url = "/".join((github_url, "raw", branch, fullname.as_posix()))
        urlretrieve(url, local_file)
        url = "/".join((github_url, "raw", branch, md5name.as_posix()))
        urlretrieve(url, md5file)

        # Verify the download against the published checksum.
        localmd5 = file_md5_checksum(local_file)
        with open(md5file) as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            local_file.unlink()
            msg = """
            MD5 checksum does not match, try downloading dataset again.
            """
            raise OSError(msg)

    ds = _open_dataset(local_file, **kws)

    if not cache:
        # Load into memory and drop the cached copy.
        ds = ds.load()
        local_file.unlink()

    return ds
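

# Usage sketch for the variant above (illustrative, not part of the original
# source): it assumes the surrounding module provides Path, _default_cache_dir,
# urlretrieve, file_md5_checksum and _open_dataset, and that the placeholder
# file below exists on the "main" branch of Ouranosinc/xclim-testdata. The
# helper name is hypothetical.
def _example_xclim_usage():
    # Downloads <github_url>/raw/main/<name>.nc and its .md5 companion on the
    # first call, checks the checksum, and reuses the cached copy afterwards.
    # Extra keywords such as `chunks` are passed through to xarray.open_dataset.
    ds = open_dataset("some_folder/some_file", chunks={"time": 12})
    return ds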