Esempio n. 1
0
def sound_subreg():
    """
    Return the "size" and "byte_size" columns of the data registry table,
    restricted to the rows whose "size" entry is not missing.
    """
    # the full registry is incomplete, so only rows with a parsable
    # (non-NaN) "size" are kept for testing
    registry = get_data_registry_table()
    has_size = registry["size"].notna()
    return registry.loc[has_size, ["size", "byte_size"]]
Esempio n. 2
0
def load_sample(fn, progressbar: bool = True, timeout=None, **kwargs):
    """
    Load sample data with yt. Simple wrapper around `yt.load` to include fetching
    data with pooch.

    The data registry table can be retrieved and visualized using
    `yt.sample_data.api.get_data_registry_table`.

    This function requires pandas and pooch to be installed.

    Parameters
    ----------
    fn : str
        The `filename` of the dataset to load, as defined in the data registry
        table. A path to a specific file inside the dataset may be appended,
        separated by `os.path.sep`.

    progressbar: bool
        display a progress bar (tqdm).

    timeout: float or int (optional)
        Maximal waiting time, in seconds, after which download is aborted.
        `None` means "no limit". This parameter is directly passed down to
        requests.get via pooch.HTTPDownloader

    Any additional keyword argument is passed down to `yt.load`.
    Note that in case of collision with predefined keyword arguments as set in
    the data registry, the ones passed to this function take priority.

    Returns
    -------
    Whatever `load` returns for the resolved dataset path.

    Raises
    ------
    KeyError
        If no registry entry matches the top-level name of `fn`.
    ValueError
        If the matching registry entry has no 'load_name', or if a specific
        file was requested but cannot be found after the download.
    """

    from yt.sample_data.api import (
        _download_sample_data_file,
        _get_test_data_dir_path,
        get_data_registry_table,
    )

    pooch_logger = pooch.utils.get_logger()

    topdir, _, specific_file = str(fn).partition(os.path.sep)

    registry_table = get_data_registry_table()
    # PR 3089
    # note: in the future the registry table should be reindexed
    # so that the following lookup can be replaced with
    #
    # specs = registry_table.loc[fn]
    #
    # however we don't want to do it right now because the "filename" column is
    # currently incomplete

    # A boolean mask is used rather than a formatted `query` string so that
    # names containing quotes or backticks cannot break (or alter) the lookup.
    matches = registry_table[registry_table["filename"] == topdir]
    try:
        specs = matches.iloc[0]
    except IndexError as err:
        raise KeyError(f"Could not find '{fn}' in the registry.") from err

    if not specs["load_name"]:
        raise ValueError(
            "Registry appears to be corrupted: could not find a 'load_name' entry for this dataset."
        )

    # user-supplied keyword arguments override the registry defaults
    kwargs = {**specs["load_kwargs"], **kwargs}

    try:
        data_dir = lookup_on_disk_data(fn)
    except FileNotFoundError:
        mylog.info("'%s' is not available locally. Looking up online.", fn)
    else:
        # if the data is already available locally, `load_sample`
        # only acts as a thin wrapper around `load`
        loadable_path = data_dir.joinpath(specs["load_name"], specific_file)
        mylog.info("Sample dataset found in '%s'", data_dir)
        if timeout is not None:
            mylog.info("Ignoring the `timeout` keyword argument received.")
        return load(loadable_path, **kwargs)

    try:
        save_dir = _get_test_data_dir_path()
    except FileNotFoundError:
        mylog.warning(
            "yt test data directory is not properly set up. "
            "Data will be saved to the current work directory instead.")
        save_dir = Path.cwd()

    mylog.info("Downloading from %s", specs["url"])

    # downloading via a pooch.Pooch instance behind the scenes
    filename = urlsplit(specs["url"]).path.split("/")[-1]

    # effectively silence the pooch's logger and create our own log instead;
    # the previous level is restored afterwards so that this call does not
    # mute pooch for the rest of the process
    previous_level = pooch_logger.level
    pooch_logger.setLevel(100)
    try:
        tmp_file = _download_sample_data_file(filename,
                                              progressbar=progressbar,
                                              timeout=timeout)
    finally:
        pooch_logger.setLevel(previous_level)

    dataset_root = Path(save_dir).joinpath(fn)

    # pooch has functionalities to unpack downloaded archive files,
    # but it needs to be told in advance that we are downloading a tarball.
    # Since that information is not necessarily trivial to guess from the filename,
    # we rely on the standard library to perform a conditional unpacking instead.
    if tarfile.is_tarfile(tmp_file):
        mylog.info("Untaring downloaded file to '%s'", save_dir)
        # NOTE(review): `extractall` trusts the member paths stored in the
        # archive; this presumes the registry only points at trusted tarballs.
        with tarfile.open(tmp_file) as fh:
            fh.extractall(save_dir)
        os.remove(tmp_file)
        loadable_path = dataset_root.joinpath(specs["load_name"],
                                              specific_file)
    else:
        # A plain (non-archive) download is itself the dataset: move it
        # *into* the data directory. The destination must be a file path,
        # `os.replace` cannot overwrite the directory `save_dir` with a file.
        dataset_root.parent.mkdir(parents=True, exist_ok=True)
        os.replace(tmp_file, dataset_root)
        loadable_path = dataset_root

    if specific_file and not loadable_path.exists():
        raise ValueError(f"Could not find file '{specific_file}'.")

    return load(loadable_path, **kwargs)
Esempio n. 3
0
def test_registry_integrity():
    """Check that the data registry table contains no missing (NaN) value."""
    reg = get_data_registry_table()
    # Iterating over a DataFrame yields its column labels, so the builtin
    # `any(reg.isna())` would only test the truthiness of the column names
    # and never look at the cells. Reduce over the boolean values instead.
    assert not reg.isna().values.any()
Esempio n. 4
0
def data_registry():
    """
    Generator yielding the data registry table exactly once.

    Presumably consumed as a pytest fixture (no decorator is visible here).
    """
    registry_table = get_data_registry_table()
    yield registry_table