Example #1
0
def open_dataset(
    name="remo_output_3d",
    cache=True,
    cache_dir=_default_cache_dir,
    data_url=data.DKRZ_URL,
    folder="tutorial",
    **kws,
):
    """
    Open a dataset from the online repository (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name : str
        Name of the file containing the dataset. If no suffix is given, assumed
        to be netCDF ('.nc' is appended)
        e.g. 'air_temperature'
    cache_dir : str, optional
        The directory in which to search for and write cached data.
    cache : bool, optional
        If True, then cache data locally for use on subsequent calls
    data_url : str
        url where the data is stored
    folder : str
        folder where the data is stored
    kws : dict, optional
        Passed to xarray.open_dataset

    Raises
    ------
    OSError
        If the downloaded file does not match its published MD5 checksum.

    See Also
    --------
    xarray.open_dataset

    """
    root, ext = _os.path.splitext(name)
    if not ext:
        ext = ".nc"
    fullname = root + ext
    longdir = _os.path.expanduser(cache_dir)
    localfile = _os.sep.join((longdir, fullname))
    md5name = fullname + ".md5"
    md5file = _os.sep.join((longdir, md5name))

    if not _os.path.exists(localfile):
        # This will always leave this directory on disk.
        # May want to add an option to remove it.
        # makedirs(exist_ok=True) avoids the isdir()/mkdir() race and also
        # creates missing intermediate directories, which plain mkdir cannot.
        _os.makedirs(longdir, exist_ok=True)

        # Download the data file and its checksum side by side.
        url = "/".join((data_url, folder, fullname))
        urlretrieve(url, localfile)
        url = "/".join((data_url, folder, md5name))
        urlretrieve(url, md5file)

        # Verify the download against the published checksum.
        localmd5 = file_md5_checksum(localfile)
        with open(md5file) as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            # Remove both the corrupt data file and its checksum so a retry
            # starts clean (previously the stale .md5 file was left behind).
            _os.remove(localfile)
            _os.remove(md5file)
            msg = """
            MD5 checksum does not match, try downloading dataset again.
            """
            raise OSError(msg)

    ds = _open_dataset(localfile, **kws)

    if not cache:
        # Caller opted out of caching: load the data into memory and
        # delete the local copy.
        ds = ds.load()
        _os.remove(localfile)

    return ds
Example #2
0
def open_dataset(
    name: str,
    suffix: Optional[str] = None,
    dap_url: Optional[str] = None,
    github_url: str = "https://github.com/Ouranosinc/raven-testdata",
    branch: str = "master",
    cache: bool = True,
    cache_dir: Path = _default_cache_dir,
    **kwds,
) -> Dataset:
    """Open a dataset from the online GitHub-like repository.

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name: str
      Name of the file containing the dataset. If no suffix is given, assumed to be netCDF ('.nc' is appended).
    suffix: str, optional
      If no suffix is given, assumed to be netCDF ('.nc' is appended). For no suffix, set "".
    dap_url: str, optional
      URL to OPeNDAP folder where the data is stored. If supplied, supersedes github_url.
    github_url: str
      URL to Github repository where the data is stored.
    branch: str, optional
      For GitHub-hosted files, the branch to download from.
    cache_dir: Path
      The directory in which to search for and write cached data.
    cache: bool
      If True, then cache data locally for use on subsequent calls.
    kwds: dict, optional
      For NetCDF files, keywords passed to xarray.open_dataset.

    Returns
    -------
    Dataset

    Raises
    ------
    OSError
      If the OPeNDAP service cannot be read, or the file cannot be opened.

    See Also
    --------
    xarray.open_dataset
    """
    name = Path(name)
    if suffix is None:
        suffix = ".nc"
    # NOTE(review): with_suffix *replaces* an existing suffix rather than
    # appending — e.g. "data.zip" becomes "data.nc". Confirm callers expect this.
    fullname = name.with_suffix(suffix)

    if dap_url is not None:
        # OPeNDAP access bypasses the local cache entirely and opens the
        # remote file directly.
        dap_file = urljoin(dap_url, str(name))
        try:
            return _open_dataset(dap_file, **kwds)
        except OSError:
            msg = "OPeNDAP file not read. Verify that service is available."
            LOGGER.error(msg)
            raise

    # Fetch (or reuse) a cached local copy from the GitHub repository.
    local_file = _get(
        fullname=fullname,
        github_url=github_url,
        branch=branch,
        suffix=suffix,
        cache_dir=cache_dir,
    )

    ds = _open_dataset(local_file, **kwds)
    if not cache:
        # Caller opted out of caching: load into memory and delete the file.
        ds = ds.load()
        local_file.unlink()
    return ds
Example #3
0
def open_dataset(
    name,
    cache: bool = True,
    cache_dir: Path = _default_cache_dir,
    github_url: str = "https://github.com/Ouranosinc/xclim-testdata",
    branch: str = "main",
    **kws,
):
    """
    Open a dataset from the online repository (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name : str
        Name of the file containing the dataset. If no suffix is given, assumed
        to be netCDF ('.nc' is appended). The name may contain a relative path,
        which is preserved both in the download URL and under `cache_dir`.
    cache_dir : Path
        The directory in which to search for and write cached data.
    cache : bool
        If True, then cache data locally for use on subsequent calls
    github_url : str
        Github repository where the data is stored
    branch : str
        The git branch to download from
    kws : dict, optional
        Passed to xarray.open_dataset

    Raises
    ------
    OSError
        If the downloaded file does not match its published MD5 checksum.

    See Also
    --------
    xarray.open_dataset

    """
    name = Path(name)
    # Append ".nc" only when no suffix is present. A bare with_suffix(".nc")
    # would silently *replace* a non-netCDF suffix (e.g. "data.zip" ->
    # "data.nc"), contradicting the documented behaviour above.
    fullname = name if name.suffix else name.with_suffix(".nc")
    cache_dir = cache_dir.absolute()
    local_file = cache_dir / fullname
    # The checksum file sits next to the data file as "<fullname>.md5".
    md5name = fullname.with_name(fullname.name + ".md5")
    md5file = cache_dir / md5name

    if not local_file.is_file():
        # This will always leave this directory on disk.
        # We may want to add an option to remove it.
        local_file.parent.mkdir(parents=True, exist_ok=True)

        # Download the data file and its checksum side by side.
        url = "/".join((github_url, "raw", branch, fullname.as_posix()))
        urlretrieve(url, local_file)
        url = "/".join((github_url, "raw", branch, md5name.as_posix()))
        urlretrieve(url, md5file)

        # Verify the download against the published checksum.
        localmd5 = file_md5_checksum(local_file)
        with open(md5file) as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            local_file.unlink()
            msg = """
            MD5 checksum does not match, try downloading dataset again.
            """
            raise OSError(msg)

    ds = _open_dataset(local_file, **kws)

    if not cache:
        # Caller opted out of caching: load into memory and delete the file.
        ds = ds.load()
        local_file.unlink()

    return ds