Example #1
def consolidate_metadata(target):
    """
    Consolidate Zarr metadata

    Parameters
    ----------
    target : str
        Path or URL of the Zarr store.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
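
A minimal usage sketch for the function above, assuming a local store path (the path and array contents are hypothetical). Consolidation writes a single .zmetadata key so that readers can open the group with one metadata fetch:

import fsspec
import numpy as np
import zarr

store_path = "/tmp/example.zarr"                     # hypothetical local store
root = zarr.open(fsspec.get_mapper(store_path), mode="w")
root.create_dataset("x", data=np.arange(3))          # write a small array so there is metadata to consolidate
consolidate_metadata(store_path)                     # writes .zmetadata alongside the chunks
zg = zarr.open_consolidated(fsspec.get_mapper(store_path))  # readers now need a single metadata read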
Example #2
def fetch_zarr(zarr_url, storage_options=None):
    # Avoid a mutable default argument; default to anonymous access.
    if storage_options is None:
        storage_options = {'anon': True}
    zg = zarr.open_consolidated(fsspec.get_mapper(zarr_url, **storage_options),
                                mode='r')
    dimensions = {}
    variable_arrays = {}
    for k, a in zg.arrays():
        # An array whose own name appears in its _ARRAY_DIMENSIONS attribute is a
        # dimension coordinate; anything else is treated as a data variable.
        if k in a.attrs['_ARRAY_DIMENSIONS']:
            dimensions[k] = a.attrs['_ARRAY_DIMENSIONS']
        else:
            variable_arrays[k] = a.attrs['_ARRAY_DIMENSIONS']
    return zg, dimensions, variable_arrays
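
A hypothetical call to the helper above; the URL is a placeholder for a consolidated Zarr store that allows anonymous access:

zg, dims, data_vars = fetch_zarr("s3://some-public-bucket/dataset.zarr")  # placeholder URL
print(dims)             # e.g. {'time': ['time'], 'lat': ['lat'], 'lon': ['lon']}
print(list(data_vars))  # names of the non-coordinate arrays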
Example #3
def _set_file_path(path):
    """Find the speakers.json under the given path or the above it.
    Intended to band aid the different paths returned in restored and continued training."""
    path_restore = os.path.join(os.path.dirname(path), "speakers.json")
    path_continue = os.path.join(path, "speakers.json")
    fs = fsspec.get_mapper(path).fs
    if fs.exists(path_restore):
        return path_restore
    if fs.exists(path_continue):
        return path_continue
    raise FileNotFoundError(f" [!] `speakers.json` not found in {path}")
Example #4
def open_dsets(df):
    """Open datasets from cloud storage and return xarray dataset."""
    dsets = [
        xr.open_zarr(fsspec.get_mapper(ds_url),
                     consolidated=True).pipe(drop_all_bounds)
        for ds_url in df.zstore
    ]
    try:
        ds = xr.merge(dsets, join='exact')
        return ds
    except ValueError:
        return None
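
A rough usage sketch; it assumes a pandas DataFrame with a `zstore` column of Zarr URLs (as a CMIP6 catalog query might return) and the `drop_all_bounds` helper referenced in the excerpt. The URLs are placeholders:

import pandas as pd

df = pd.DataFrame({"zstore": [
    "gs://bucket/model-a.zarr",   # placeholder
    "gs://bucket/model-b.zarr",   # placeholder
]})
ds = open_dsets(df)
if ds is None:
    print("stores could not be merged with join='exact' (mismatched coordinates)")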
Example #5
def nc2zarr(source_url, cache_location):
    """convert netcdf data to zarr"""
    target_url = source_url + ".zarr"

    with dask.config.set(scheduler="single-threaded"):

        ds = (xr.open_dataset(fsspec.open(source_url).open()).pipe(
            preproc).pipe(postproc).load().chunk(chunks))

        mapper = fsspec.get_mapper(target_url)
        ds.to_zarr(mapper)

    return target_url
Example #6
def split_and_write(model, scenario, member, method):

    ds = get_scratch_ds(model, scenario, member, method)

    scen_mapper = fsspec.get_mapper(
        f'az://carbonplan-downscaling/cmip6/{method}/conus/4000m/monthly/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )
    clean_store(scen_mapper)

    print('writing scen')
    write_zarr(ds.sel(time=slice('2015-01', None)), scen_mapper)

    hist_mapper = fsspec.get_mapper(
        f'az://carbonplan-downscaling/cmip6/{method}/conus/4000m/monthly/{model}.historical.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )
    clean_store(hist_mapper)
    print('writing hist')
    write_zarr(ds.sel(time=slice(None, '2014-12')), hist_mapper)
Example #7
def test_binary_table():
    out = kerchunk.fits.process_file(btable, extension=1)
    m = fsspec.get_mapper("reference://", fo=out)
    z = zarr.open(m)
    arr = z["1"]
    with open(btable, "rb") as f:
        hdul = fits.open(f)
        attr2 = dict(arr.attrs)
        assert attr2.pop('_ARRAY_DIMENSIONS') == ['x']
        assert attr2 == dict(hdul[1].header)
        assert (arr['order'] == hdul[1].data['order']).all()
        assert (arr['mag'] == hdul[1].data['mag']).all()
        assert (arr['name'].astype("U") == hdul[1].data['name']).all()  # strings come out as bytes
Example #8
def test_open_asset_preprocess_error():
    path = os.path.join(
        here, './sample_data/cesm-le/b.e11.B1850C5CN.f09_g16.005.pop.h.SHF.040001-049912.nc'
    )
    print(path)
    path = f'file://{path}'
    mapper = fsspec.get_mapper(path)

    def preprocess(ds):
        return ds.set_coords('foo')

    with pytest.raises(RuntimeError):
        _open_asset(mapper, 'netcdf', cdf_kwargs={}, varname=['SHF'], preprocess=preprocess)
Example #9
def consolidate_metadata(writes: List[str], target: str) -> None:
    """
    Consolidate the metadata for the Zarr group at `target`.

    Parameters
    ----------
    writes : List[str]
        The URLs the combined stores were written to. This is only a
        parameter to introduce a dependency. The actual value isn't used.
    target : str
        The URL for the (combined) Zarr group.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
Example #10
def load_dataset(path: str,
                 unpack: bool = False,
                 consolidated: bool = False) -> Dataset:
    store = fsspec.get_mapper(path, check=False, create=False)
    ds = xr.open_zarr(store,
                      concat_characters=False,
                      consolidated=consolidated)
    if unpack:
        ds = unpack_variables(ds, dtype="float16")
    for v in ds:
        # Workaround for https://github.com/pydata/xarray/issues/4386
        if v.endswith("_mask"):
            ds[v] = ds[v].astype(bool)
    return ds
Example #11
def get_scratch_ds(model, scenario, member, method):

    print(f'loading {model}.{scenario}.{member}')

    mapper = fsspec.get_mapper(
        f'az://carbonplan-scratch/cmip6/{method}/conus/4000m/monthly/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
    )

    ds = xr.open_zarr(mapper, consolidated=True)[
        cp_vars
    ]  # .load(retries=task_retries).chunk(chunks)
    print(f'ds size: {ds.nbytes / 1e9}')
    return ds
Example #12
def test_fsspec_get_mapper():
    """Added for #788"""

    with tempzip(archive_data) as z:
        mapping = fsspec.get_mapper(f"zip::{z}")

        assert isinstance(mapping, collections.abc.Mapping)
        keys = sorted(list(mapping.keys()))
        assert keys == ["a", "b", "deeply/nested/path"]

        # mapping.getitems() will call FSMap.fs.cat()
        # which was not accurately implemented for zip.
        assert isinstance(mapping, fsspec.mapping.FSMap)
        items = dict(mapping.getitems(keys))
        assert items == {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"}
Example #13
def test_mapping_prefix(tmpdir):
    tmpdir = str(tmpdir)
    os.makedirs(os.path.join(tmpdir, "afolder"))
    open(os.path.join(tmpdir, "afile"), "w").write("test")
    open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")

    m = fsspec.get_mapper("file://" + tmpdir)
    assert "afile" in m
    assert m["afolder/anotherfile"] == b"test2"

    fs = fsspec.filesystem("file")
    m2 = fs.get_mapper(tmpdir)
    m3 = fs.get_mapper("file://" + tmpdir)

    assert m == m2 == m3
Example #14
def test_mapping_prefix(tmpdir):
    tmpdir = str(tmpdir)
    os.makedirs(os.path.join(tmpdir, 'afolder'))
    open(os.path.join(tmpdir, 'afile'), 'w').write('test')
    open(os.path.join(tmpdir, 'afolder', 'anotherfile'), 'w').write('test2')

    m = fsspec.get_mapper('file://' + tmpdir)
    assert 'afile' in m
    assert m['afolder/anotherfile'] == b'test2'

    fs = fsspec.filesystem('file')
    m2 = fs.get_mapper(tmpdir)
    m3 = fs.get_mapper('file://' + tmpdir)

    assert m == m2 == m3
Example #15
def map_tgt(tgt: str) -> fsspec.FSMap:
    """Uses fsspec to creating mapped object from target connection string

    Parameters
    ----------
    tgt : str
        Path to the target store

    Returns
    -------
    fsspec.FSMap
        fsspec mapped object
    """
    # Pass the connection string by keyword; passed positionally it would bind to `check`.
    tgt_map = fsspec.get_mapper(tgt, connection_string=connection_string)
    return tgt_map
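
A hedged usage sketch, assuming connection_string is an Azure Storage connection string available at module level (as the excerpt implies) and that the adlfs backend is installed; the container path is a placeholder:

import os
import xarray as xr

connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]  # assumed to be set
store = map_tgt("az://my-container/output.zarr")                   # placeholder target
ds = xr.Dataset({"x": ("t", [1, 2, 3])})
ds.to_zarr(store, mode="w")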
Example #16
async def refresh_dataset(request: Request, dataset_id: str):
    if dataset_id:
        if dataset_id in DATASETS_STORE:
            logger.info(f"Refreshing dataset: {dataset_id}.")
            ds = xr.open_zarr(
                fsspec.get_mapper(DATASETS_STORE[dataset_id].zarr_url),
                consolidated=True,
            )
            DATASETS_STORE[dataset_id].set_ds(ds)
            # Iterate over a copy so removing a route doesn't skip entries mid-iteration.
            for r in list(request.app.routes):
                if r.path == f"/{dataset_id}":
                    request.app.routes.remove(r)
            request.app.mount(f"/{dataset_id}", DATASETS_STORE[dataset_id].app)
            logger.info(f"Refresh completed: {dataset_id}.")
    return {"status": "success", "dataset_id": dataset_id}
Example #17
    def _open_dataset(self):
        import xarray as xr
        import fsspec
        assert fsspec.__version__ >= "0.3.6", "zarr plugin requires fsspec >= 0.3.6"
        from fsspec import filesystem, get_mapper
        from fsspec.utils import update_storage_options, infer_storage_options

        storage_options = infer_storage_options(self.urlpath)
        update_storage_options(storage_options, self.storage_options)
        self._fs = filesystem(storage_options['protocol'])
        if storage_options['protocol'] != 'file':
            self._mapper = get_mapper(self.urlpath)
            self._ds = xr.open_zarr(self._mapper, **self.kwargs)
        else:
            self._ds = xr.open_zarr(self.urlpath, **self.kwargs)
Example #18
def consolidate_metadata(target, writes: Optional[List[str]] = None) -> None:
    """
    Consolidate the metadata for the Zarr group at `target`.

    Parameters
    ----------
    target : str
        The URL for the (combined) Zarr group.
    writes : list of strings, optional
        The URLs the combined stores were written to. This is only a
        parameter to introduce a dependency in the pipeline execution graph.
        The actual value isn't used.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
Example #19
def _to_zarr(  # type: ignore[no-untyped-def]
    arr,
    url,
    component=None,
    storage_options=None,
    overwrite=False,
    compute=True,
    return_stored=False,
    attrs=None,
    **kwargs,
):
    """Extension of dask.array.core.to_zarr that can set attributes on the resulting Zarr array,
    in the same Dask operation.
    """

    # call Dask version with compute=False just to check preconditions
    da.to_zarr(
        arr,
        url,
        component=component,
        storage_options=storage_options,
        overwrite=overwrite,
        compute=False,
        return_stored=return_stored,
        **kwargs,
    )

    storage_options = storage_options or {}
    if isinstance(url, str):
        mapper = get_mapper(url, **storage_options)
    else:
        # assume the object passed is already a mapper
        mapper = url  # pragma: no cover
    chunks = [c[0] for c in arr.chunks]
    z = dask.delayed(_zarr_create_with_attrs)(
        shape=arr.shape,
        chunks=chunks,
        dtype=arr.dtype,
        store=mapper,
        path=component,
        overwrite=overwrite,
        attrs=attrs,
        **kwargs,
    )
    return arr.store(z,
                     lock=False,
                     compute=compute,
                     return_stored=return_stored)
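
A minimal usage sketch for _to_zarr, assuming _zarr_create_with_attrs behaves like zarr.create followed by attribute assignment (as the excerpt suggests); the output path and attributes are hypothetical:

import dask.array as da

arr = da.ones((100, 100), chunks=(50, 50))
_to_zarr(arr, "/tmp/ones.zarr", overwrite=True, attrs={"units": "dimensionless"})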
Example #20
def get_version(zstore, method='fsspec'):

    client = requests.session()
    baseurl = 'http://hdl.handle.net/api/handles/'
    query1 = '?type=IS_PART_OF'
    query2 = '?type=VERSION_NUMBER'

    # get the `netcdf_tracking_ids` from the zstore metadata
    if method == 'fsspec':
        mapper = fsspec.get_mapper(zstore)
    else:
        mapper = zstore
    group = zarr.open_consolidated(mapper)
    tracking_ids = group.attrs['tracking_id']

    # query the dataset handler to obtain `dataset_tracking_id` and `version`
    versions = []
    datasets = []
    for file_tracking_id in tracking_ids.split('\n')[0:1]:
        url = baseurl + file_tracking_id[4:] + query1
        r = client.get(url)
        r.raise_for_status()
        dataset_tracking_id = r.json()['values'][0]['data']['value']
        datasets += [dataset_tracking_id]
        if ';' in dataset_tracking_id:
            # multiple dataset_ids erroneously reported
            dtracks = dataset_tracking_id.split(';')
            vs = []
            for dtrack in dtracks:
                url2 = baseurl + dtrack[4:] + query2
                r = client.get(url2)
                r.raise_for_status()
                vs += [r.json()['values'][0]['data']['value']]
            v = sorted(vs)[-1]
        else:
            url2 = baseurl + dataset_tracking_id[4:] + query2
            r = client.get(url2)
            r.raise_for_status()
            v = r.json()['values'][0]['data']['value']
        versions += [v]

    version_id = list(set(versions))
    dataset_id = list(set(datasets))

    assert len(version_id) == 1

    return dataset_id[0], version_id[0]
Example #21
def test_no_dircache(s3):
    from s3fs.tests.test_s3fs import endpoint_uri
    import fsspec

    d = fsspec.get_mapper(
        "s3://" + root,
        anon=False,
        client_kwargs={"endpoint_url": endpoint_uri},
        use_listings_cache=False,
    )
    d.clear()
    assert list(d) == []
    d[1] = b"1"
    assert list(d) == ["1"]
    d.clear()
    assert list(d) == []
Example #22
def map_and_open_zarr_link(file_loc_str: str) -> xr.Dataset:
    """Takes a Zarr store path, opens it with fsspec, and returns an xarray dataset

    Parameters
    ----------
    file_loc_str : str
        Path to the target Zarr store

    Returns
    -------
    xr.Dataset
        output xarray dataset
    """
    mapped_key = fsspec.get_mapper(file_loc_str, anon=True)
    ds = xr.open_zarr(mapped_key, consolidated=True)
    return ds
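
A hypothetical call; the URL is a placeholder for a publicly readable, consolidated Zarr store on S3:

ds = map_and_open_zarr_link("s3://some-public-bucket/analysis-ready.zarr")  # placeholder URL
print(ds.dims)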
Example #23
def to_zarr(input_path: str, output_path: str, dictionary_path: str):
    import dask.dataframe as dd
    import fsspec
    import xarray as xr
    from dask.diagnostics import ProgressBar

    logger.info(f"Converting parquet at {input_path} to {output_path}")
    df = dd.read_parquet(input_path)

    trait_columns = df.columns[df.columns.to_series().str.match(r"^\d+")]
    # 41210_Z942 -> 41210 (UKB field id)
    trait_group_ids = [c.split("_")[0] for c in trait_columns]
    # 41210_Z942 -> Z942 (Data coding value as one-hot encoding in phenotype, e.g.)
    trait_code_ids = ["_".join(c.split("_")[1:]) for c in trait_columns]
    trait_values = df[trait_columns].astype("float").to_dask_array()
    trait_values.compute_chunk_sizes()

    trait_id_to_name = (
        pd.read_csv(
            dictionary_path,
            sep=",",
            usecols=["FieldID", "Field"],
            dtype={"FieldID": str, "Field": str},
        )
        .set_index("FieldID")["Field"]
        .to_dict()
    )
    trait_name = [trait_id_to_name.get(v) for v in trait_group_ids]

    ds = xr.Dataset(
        dict(
            id=("samples", np.asarray(df["userId"], dtype=int)),
            trait=(("samples", "traits"), trait_values),
            trait_id=("traits", np.asarray(trait_columns.values, dtype=str)),
            trait_group_id=("traits", np.array(trait_group_ids, dtype=int)),
            trait_code_id=("traits", np.array(trait_code_ids, dtype=str)),
            trait_name=("traits", np.array(trait_name, dtype=str)),
        )
    )
    # Keep chunks small in trait dimension for faster per-trait processing
    ds["trait"] = ds["trait"].chunk(dict(samples="auto", traits=100))
    ds = ds.rename_vars({v: f"sample_{v}" for v in ds})

    logger.info(f"Saving dataset to {output_path}:\n{ds}")
    with ProgressBar():
        ds.to_zarr(fsspec.get_mapper(output_path), consolidated=True, mode="w")
    logger.info("Done")
Example #24
def test_setitem_types():
    import array

    m = fsspec.get_mapper("memory://")
    m["a"] = array.array("i", [1])
    if sys.byteorder == "little":
        assert m["a"] == b"\x01\x00\x00\x00"
    else:
        assert m["a"] == b"\x00\x00\x00\x01"
    m["b"] = bytearray(b"123")
    assert m["b"] == b"123"
    m.setitems({"c": array.array("i", [1]), "d": bytearray(b"123")})
    if sys.byteorder == "little":
        assert m["c"] == b"\x01\x00\x00\x00"
    else:
        assert m["c"] == b"\x00\x00\x00\x01"
    assert m["d"] == b"123"
Example #25
def test_missing_nonasync(m):
    zarr = pytest.importorskip("zarr")
    zarray = {
        "chunks": [1],
        "compressor": None,
        "dtype": "<f8",
        "fill_value": "NaN",
        "filters": [],
        "order": "C",
        "shape": [10],
        "zarr_format": 2,
    }
    refs = {".zarray": json.dumps(zarray)}

    m = fsspec.get_mapper("reference://", fo=refs, remote_protocol="memory")

    a = zarr.open_array(m)
    assert str(a[0]) == "nan"
Example #26
def test_ops():
    MemoryFileSystem.store.clear()
    m = fsspec.get_mapper('memory://')
    assert not m
    assert list(m) == []

    with pytest.raises(KeyError):
        m['hi']

    assert m.pop('key', 0) == 0

    m['key0'] = b'data'
    assert list(m) == ['key0']
    assert m['key0'] == b'data'

    m.clear()

    assert list(m) == []
Example #27
def test_ops():
    MemoryFileSystem.store.clear()
    m = fsspec.get_mapper("memory://")
    assert not m
    assert list(m) == []

    with pytest.raises(KeyError):
        m["hi"]

    assert m.pop("key", 0) == 0

    m["key0"] = b"data"
    assert list(m) == ["key0"]
    assert m["key0"] == b"data"

    m.clear()

    assert list(m) == []
Example #28
def combine_and_write(sources, target, append_dim, first=True):
    # while debugging this, I had intermittent fsspec / hdf5 read errors related to
    # "trying to read from a closed file"
    # but they seem to have gone away for now
    double_open_files = [fsspec.open(url).open() for url in sources]
    ds = xr.open_mfdataset(double_open_files,
                           combine="nested",
                           concat_dim=concat_dim)
    # by definition, this should be a contiguous chunk
    ds = ds.chunk({append_dim: len(sources)})

    if first:
        kwargs = dict(mode="w")
    else:
        kwargs = dict(mode="a", append_dim=append_dim)

    mapper = fsspec.get_mapper(target)
    ds.to_zarr(mapper, **kwargs)
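
A rough sketch of how the function above might be driven; the source URLs are placeholders, and concat_dim is assumed to be defined at module level in the original recipe:

sources = [
    "s3://bucket/input-2020-01.nc",  # placeholder
    "s3://bucket/input-2020-02.nc",  # placeholder
]
target = "s3://bucket/output.zarr"   # placeholder
combine_and_write(sources[:1], target, append_dim="time", first=True)   # initialize the store
combine_and_write(sources[1:], target, append_dim="time", first=False)  # append along "time"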
Example #29
def get_obs_std(obs, train_period_start, train_period_end, variables, gcm_grid_spec, ds=None):
    # if std is not already saved, ds must be a valid dataset
    path = make_coarse_obs_path(
        obs=obs,
        train_period_start=train_period_start,
        train_period_end=train_period_end,
        variables=variables,
        gcm_grid_spec=gcm_grid_spec,
        chunking_approach='std',
    )
    store = fsspec.get_mapper(intermediate_cache_path + '/' + path)

    if '.zmetadata' not in store:
        std = ds.std(dim='time')
        std.to_zarr(store, mode="w", consolidated=True)
    else:
        std = xr.open_zarr(store).load()
    return std
Example #30
def get_gwas_sumstat_manifest(path: str) -> pd.DataFrame:
    store = fsspec.get_mapper(path)
    df = []
    for f in list(store):
        fn = f.split("/")[-1]
        parts = re.findall(r"ukb_chr(\d+)_(\d+)_(.*).parquet", fn)
        if not parts:
            continue
        parts = parts[0]
        df.append(
            dict(
                contig=parts[0],
                batch=int(parts[1]),
                trait_id=parts[2],
                trait_group_id=parts[2].split("_")[0],
                trait_code_id="_".join(parts[2].split("_")[1:]),
                file=f,
            ))
    return pd.DataFrame(df)