Example #1
 def read(self,
          path: str,
          s3_kwargs: Dict[str, Any] = None,
          s3_client_kwargs: Dict[str, Any] = None,
          max_cache_size: int = None,
          **kwargs) -> xr.Dataset:
     """
     Read dataset from some Zarr storage.
     :param path: File path or object storage URL.
     :param s3_kwargs: if *path* is an object storage URL, keyword-arguments passed to S3 file system,
         that is ``s3fs.S3FileSystem(**s3_kwargs, ...)``.
     :param s3_client_kwargs: if *path* is an object storage URL, keyword-arguments passed to S3 (boto3) client,
         that is ``s3fs.S3FileSystem(..., client_kwargs=s3_client_kwargs)``.
     :param max_cache_size: if this is a positive integer, the store will be wrapped in an in-memory cache,
         that is ``store = zarr.LRUStoreCache(store, max_size=max_cache_size)``.
     :param kwargs: Keyword-arguments passed to xarray Zarr adapter,
         that is ``xarray.open_zarr(..., **kwargs)``.
     :return: the dataset read from *path*.
     """
     path_or_store = path
     consolidated = False
     if isinstance(path, str):
         path_or_store, consolidated = get_path_or_s3_store(
             path_or_store,
             s3_kwargs=s3_kwargs,
             s3_client_kwargs=s3_client_kwargs,
             mode='r')
         if max_cache_size is not None and max_cache_size > 0:
             path_or_store = zarr.LRUStoreCache(path_or_store,
                                                max_size=max_cache_size)
     return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
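A minimal usage sketch for the reader above; the owning class name (ZarrDatasetIO) and the bucket URL are assumptions, not part of the example:

# Hypothetical reader class exposing the read() method shown above.
reader = ZarrDatasetIO()
ds = reader.read(
    'https://s3.eu-central-1.amazonaws.com/my-bucket/cube.zarr',   # example object storage URL
    s3_kwargs={'anon': True},                                      # forwarded to s3fs.S3FileSystem(**s3_kwargs, ...)
    s3_client_kwargs={'region_name': 'eu-central-1'},              # forwarded as client_kwargs to the boto3 client
    max_cache_size=2 ** 28,                                        # wrap the store in a 256 MiB zarr.LRUStoreCache
)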
Example #2
    def open(
        self,
        mode: str = "r",
        cached: bool = True,
        cache_size_bytes: int = int(1e9)) -> "ChunkedDataset":
        """Opens a zarr dataset from disk from the path supplied in the constructor.

        :param mode: Mode to open the dataset in; defaults to read-only (default: {"r"})
        :param cached: Whether to cache files read from disk using an LRU cache. (default: {True})
        :param cache_size_bytes: Size of cache in bytes (default: {1e9} (1GB))
        """
        if cached:
            self.root = zarr.open_group(store=zarr.LRUStoreCache(
                zarr.DirectoryStore(self.path), max_size=cache_size_bytes),
                                        mode=mode)
        else:
            self.root = zarr.open_group(self.path, mode=mode)
        self.frames = self.root[FRAME_ARRAY_KEY]
        self.agents = self.root[AGENT_ARRAY_KEY]
        self.scenes = self.root[SCENE_ARRAY_KEY]
        try:
            self.tl_faces = self.root[TL_FACE_ARRAY_KEY]
        except KeyError:
            # the real issue here is that frame doesn't have traffic_light_faces_index_interval
            warnings.warn(
                f"{TL_FACE_ARRAY_KEY} not found in {self.path}! "
                f"You won't be able to use this zarr into an Ego/AgentDataset",
                RuntimeWarning,
                stacklevel=2,
            )
            self.tl_faces = np.empty((0, ), dtype=TL_FACE_DTYPE)
        return self
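A usage sketch; per the docstring above, the dataset path is supplied to the ChunkedDataset constructor:

# Open read-only with a 1 GB LRU cache in front of the on-disk DirectoryStore.
dataset = ChunkedDataset("/data/sample.zarr").open(mode="r", cached=True, cache_size_bytes=int(1e9))
print(len(dataset.scenes), len(dataset.frames), len(dataset.agents))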
Example #3
def open_ml_dataset_from_object_storage(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True,
                                        client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            ds = xr.open_zarr(cached_store)
        return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                ds_id,
                obs_file_system,
                path,
                exception_type=ServiceConfigError)
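For reference, a dataset descriptor carrying the entries read above might look as follows; the values are illustrative, only the keys are taken from the example:

dataset_descriptor = {
    'Identifier': 'demo-cube',                # used in error messages
    'Path': 'my-bucket/cubes/demo.zarr',      # required; a missing 'Path' raises ServiceConfigError
    'Format': 'zarr',                         # presumably FORMAT_NAME_ZARR (default) or FORMAT_NAME_LEVELS
    'Endpoint': 'https://obs.example.com',    # optional, becomes endpoint_url
    'Region': 'eu-de',                        # optional, becomes region_name
}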
Example #4
    def _get_dataset_lazily(self, index: int, **zarr_kwargs) -> xr.Dataset:
        """
        Read the dataset for the level at given *index*.

        :param index: the level index
        :param zarr_kwargs: kwargs passed to xr.open_zarr()
        :return: the dataset for the level at *index*.
        """
        ext, level_path = self._level_paths[index]
        if ext == ".link":
            with self._obs_file_system.open(level_path, "r") as fp:
                level_path = fp.read()
                # if file_path is a relative path, resolve it against the levels directory
                if not os.path.isabs(level_path):
                    base_dir = os.path.dirname(self._dir_path)
                    level_path = os.path.join(base_dir, level_path)

        store = s3fs.S3Map(root=level_path,
                           s3=self._obs_file_system,
                           check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(
                tag=f"opened remote dataset {level_path} for level {index}"):
            return assert_cube(xr.open_zarr(cached_store, **zarr_kwargs),
                               name=level_path)
Example #5
    def read(self, path: str, **kwargs) -> xr.Dataset:
        path_or_store = path

        if isinstance(path, str):
            endpoint_url = None
            root = None

            if 'endpoint_url' in kwargs:
                endpoint_url = kwargs.pop('endpoint_url')
                root = path
            if path.startswith("http://") or path.startswith("https://"):
                import urllib3.util
                url = urllib3.util.parse_url(path_or_store)
                if url.port is not None:
                    endpoint_url = f'{url.scheme}://{url.host}:{url.port}'
                else:
                    endpoint_url = f'{url.scheme}://{url.host}'
                root = url.path
                if root.startswith('/'):
                    root = root[1:]

            if endpoint_url and root is not None:
                s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(endpoint_url=endpoint_url))
                path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
                if 'max_cache_size' in kwargs:
                    max_cache_size = kwargs.pop('max_cache_size')
                    if max_cache_size > 0:
                        path_or_store = zarr.LRUStoreCache(path_or_store, max_size=max_cache_size)

        return xr.open_zarr(path_or_store, **kwargs)
Example #6
    def open(self, mode: str = "r", cached: bool = True, cache_size_bytes: int = int(1e9)) -> "ChunkedDataset":
        """Opens a zarr dataset from disk from the path supplied in the constructor.

        Keyword Arguments:
            mode (str): Mode to open the dataset in; defaults to read-only (default: {"r"})
            cached (bool): Whether to cache files read from disk using an LRU cache. (default: {True})
            cache_size_bytes (int): Size of cache in bytes (default: {1e9} (1GB))

        Raises:
            Exception: When any of the expected arrays (frames, agents, scenes) is missing or the store couldn't be opened.
        """
        if cached:
            self.root = zarr.open_group(
                store=zarr.LRUStoreCache(zarr.DirectoryStore(self.path), max_size=cache_size_bytes), mode=mode
            )
        else:
            self.root = zarr.open_group(self.path, mode=mode)
        self.frames = self.root[FRAME_ARRAY_KEY]
        self.agents = self.root[AGENT_ARRAY_KEY]
        self.scenes = self.root[SCENE_ARRAY_KEY]
        try:
            self.tl_faces = self.root[TL_FACE_ARRAY_KEY]
        except KeyError:
            warnings.warn(
                f"{TL_FACE_ARRAY_KEY} not found in {self.path}! Traffic lights will be disabled",
                RuntimeWarning,
                stacklevel=2,
            )
            self.tl_faces = np.empty((0,), dtype=TL_FACE_DTYPE)
        return self
Example #7
def open_cube(cube_config: CubeConfig,
              observer: Callable = None,
              trace_store_calls: bool = False,
              max_cache_size: int = 2 ** 30,
              sentinel_hub: SentinelHub = None,
              **sh_kwargs) -> xr.Dataset:
    """
    Open a data cube from SentinelHub.

    This is a facade function that hides the details of opening a volatile data cube from SentinelHub.

    :param cube_config: The cube configuration.
    :param observer: An observer function or callable that is called on every request made to SentinelHub.
    :param trace_store_calls: Whether to trace and dump calls made into the Zarr store.
    :param max_cache_size: Cache size in bytes. Defaults to 1 GB. If zero or None, no caching takes place.
    :param sentinel_hub: Optional instance of SentinelHub, the object representing the SENTINEL Hub API.
    :param sh_kwargs: Optional keyword arguments passed to the SentinelHub constructor. Only valid if
         *sentinel_hub* is not given.
    :return: the data cube represented by an xarray Dataset object.
    """
    if sentinel_hub is None:
        sentinel_hub = SentinelHub(**sh_kwargs)
    elif sh_kwargs:
        raise ValueError(f'unexpected keyword-arguments: {", ".join(sh_kwargs.keys())}')
    cube_store = SentinelHubChunkStore(sentinel_hub, cube_config, observer=observer,
                                       trace_store_calls=trace_store_calls)
    if max_cache_size:
        cube_store = zarr.LRUStoreCache(cube_store, max_cache_size)
    return xr.open_zarr(cube_store)
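A usage sketch for open_cube(); the CubeConfig arguments below are illustrative assumptions and are not defined by the example itself:

cube_config = CubeConfig(dataset_name='S2L2A',                    # illustrative configuration values
                         band_names=['B04', 'B08'],
                         bbox=(10.0, 54.0, 10.5, 54.5),
                         spatial_res=0.0001,
                         time_range=('2021-06-01', '2021-06-30'))
cube = open_cube(cube_config, max_cache_size=2 ** 30)             # 1 GB zarr.LRUStoreCache in front of the chunk store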
Example #8
    def read(self, path: str, **kwargs) -> xr.Dataset:
        path_or_store = path
        consolidated = False

        if isinstance(path, str):
            region_name = None

            if 'endpoint_url' in kwargs:
                endpoint_url = kwargs.pop('endpoint_url')
                root = path
            else:
                endpoint_url, root = split_bucket_url(path)

            if 'region_name' in kwargs:
                region_name = kwargs.pop('region_name')

            if endpoint_url and root:
                s3 = s3fs.S3FileSystem(anon=True,
                                       client_kwargs=dict(
                                           endpoint_url=endpoint_url,
                                           region_name=region_name))
                consolidated = s3.exists(f'{root}/.zmetadata')
                path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
                if 'max_cache_size' in kwargs:
                    max_cache_size = kwargs.pop('max_cache_size')
                    if max_cache_size > 0:
                        path_or_store = zarr.LRUStoreCache(
                            path_or_store, max_size=max_cache_size)
            else:
                consolidated = os.path.exists(
                    os.path.join(path_or_store, '.zmetadata'))
        return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
Example #9
    def _get_dataset_lazily(self, index: int,
                            parameters: Dict[str, Any]) -> xr.Dataset:
        """
        Read the dataset for the level at given *index*.

        :param index: the level index
        :param parameters: keyword arguments passed to xr.open_zarr()
        :return: the dataset for the level at *index*.
        """
        ext, level_path = self._level_paths[index]
        if ext == ".link":
            with self._s3_file_system.open(level_path, "r") as fp:
                level_path = fp.read()
                # if file_path is a relative path, resolve it against the levels directory
                if not os.path.isabs(level_path):
                    base_dir = os.path.dirname(self._dir_path)
                    level_path = os.path.join(base_dir, level_path)
        store = s3fs.S3Map(root=level_path,
                           s3=self._s3_file_system,
                           check=False)
        max_size = self.get_chunk_cache_capacity(index)
        if max_size:
            store = zarr.LRUStoreCache(store, max_size=max_size)
        with measure_time(
                tag=f"opened remote dataset {level_path} for level {index}"):
            consolidated = self._s3_file_system.exists(
                f'{level_path}/.zmetadata')
            return assert_cube(xr.open_zarr(store,
                                            consolidated=consolidated,
                                            **parameters),
                               name=level_path)
Example #10
    def read(self, path: str, **kwargs) -> xr.Dataset:
        path_or_store = path
        consolidated = False
        mode = 'read'
        root = None

        if isinstance(path, str):
            client_kwargs = {}
            if 'client_kwargs' in kwargs:
                client_kwargs = kwargs.pop('client_kwargs')
            if 'endpoint_url' in kwargs:
                client_kwargs['endpoint_url'] = kwargs.pop('endpoint_url')
                root = path
            if 'region_name' in kwargs:
                client_kwargs['region_name'] = kwargs.pop('region_name')

            path_or_store, root, client_kwargs = _get_path_or_store(path_or_store, client_kwargs, mode, root)

            if 'endpoint_url' in client_kwargs and root is not None:
                s3 = s3fs.S3FileSystem(anon=True,
                                       client_kwargs=client_kwargs)
                consolidated = s3.exists(f'{root}/.zmetadata')
                path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
                if 'max_cache_size' in kwargs:
                    max_cache_size = kwargs.pop('max_cache_size')
                    if max_cache_size > 0:
                        path_or_store = zarr.LRUStoreCache(path_or_store, max_size=max_cache_size)
            else:
                consolidated = os.path.exists(os.path.join(path_or_store, '.zmetadata'))
        return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
Example #11
def open_zarr(s3_path, anon=False, cache=False):
    """Open a zarr archive and return its root."""

    s3 = s3fs.S3FileSystem(anon=anon)
    store = s3fs.S3Map(s3_path, s3=s3, check=False, create=False)
    if cache:
        lrucache = zarr.LRUStoreCache(store=store, max_size=1 << 29)
        root = zarr.group(store=lrucache)
    else:
        root = zarr.group(store=store)
    return root
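A usage sketch for the helper above; the bucket path is an example value:

root = open_zarr('my-bucket/archives/run42.zarr', anon=True, cache=True)  # 512 MiB LRU cache (1 << 29 bytes)
print(list(root.array_keys()))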
Example #12
def get_storage_map(url: str, creds: dict = None, memcache: float = None):
    fs, path, store = _get_storage_map(url, creds)
    # TODO: Make sure that fs.listdir and store.get do not cache filenames locally,
    # because otherwise additions made to S3 or GCS would go unnoticed by the program
    if (store.get(".zarray") is None and store.get(".zgroup") is None
            and len(fs.listdir(path)) > 0):
        raise NotZarrFolderException(
            "This url is not empty but not zarr url either, for safety reasons refusing to overwrite this folder"
        )
    return store if not memcache else zarr.LRUStoreCache(
        store, memcache * (2**20))
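A usage sketch; the URL is an example value. Note that memcache is given in megabytes and multiplied by 2**20 internally:

store = get_storage_map('s3://my-bucket/datasets/run1', memcache=256)  # 256 MB zarr.LRUStoreCache
root = zarr.open_group(store, mode='r')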
Example #13
 def open_data(self, data_id: str, **open_params) -> Any:
     cci_schema = self.get_open_data_params_schema(data_id)
     cci_schema.validate_instance(open_params)
     cube_kwargs, open_params = cci_schema.process_kwargs_subset(
         open_params, ('variable_names', 'time_range', 'bbox'))
     max_cache_size: int = 2**30
     chunk_store = CciChunkStore(self._cci_odp, data_id, cube_kwargs)
     if max_cache_size:
         chunk_store = zarr.LRUStoreCache(chunk_store, max_cache_size)
     ds = xr.open_zarr(chunk_store)
     ds = self._normalize_dataset(ds, cci_schema, **open_params)
     return ds
Example #14
    def __init__(self, store, cache_size=128 * (1024**2)):
        # don't cache meta-data read once
        self.cache = zarr.LRUStoreCache(store, max_size=cache_size)

        self.root = zarr.open(self.cache, mode="r")

        meta_data, recmd, time_seconds = load_meta(self.root)

        self.depth = recmd
        self.time_seconds = time_seconds
        self.sample_events = self.root["sample_events"][:]
        self.segy_filenames = self.root["segy_filenames"][:]
Example #15
 def read(self,
          path: str,
          client_kwargs: Dict[str, Any] = None,
          **kwargs) -> xr.Dataset:
     path_or_store = path
     consolidated = False
     if isinstance(path, str):
         path_or_store, consolidated = get_path_or_obs_store(path_or_store,
                                                             client_kwargs,
                                                             mode='r')
         if 'max_cache_size' in kwargs:
             max_cache_size = kwargs.pop('max_cache_size')
             if max_cache_size > 0:
                 path_or_store = zarr.LRUStoreCache(path_or_store,
                                                    max_size=max_cache_size)
     return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
Example #16
def open_from_obs(path: str,
                  endpoint_url: str = None,
                  max_cache_size: int = 2**28) -> xr.Dataset:
    """
    Open an xcube (xarray dataset) from S3 compatible object storage (OBS).

    :param path: Path having format "<bucket>/<my>/<sub>/<path>"
    :param endpoint_url: Optional URL of the OBS service endpoint. If omitted, AWS S3 service URL is used.
    :param max_cache_size: If > 0, size of a memory cache in bytes, e.g. 2**30 = one gigabyte.
           If None or size <= 0, no memory cache will be used.
    :return: an xarray dataset
    """
    s3 = s3fs.S3FileSystem(anon=True,
                           client_kwargs=dict(endpoint_url=endpoint_url))
    store = s3fs.S3Map(root=path, s3=s3, check=False)
    if max_cache_size is not None and max_cache_size > 0:
        store = zarr.LRUStoreCache(store, max_size=max_cache_size)
    return xr.open_zarr(store)
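A usage sketch; the bucket path and endpoint are example values:

ds = open_from_obs('my-bucket/cubes/demo.zarr',                      # "<bucket>/<my>/<sub>/<path>"
                   endpoint_url='https://obs.eu-de.example.com',     # omit to use the AWS S3 endpoint
                   max_cache_size=2 ** 30)                           # 1 GB in-memory LRU cache
print(list(ds.data_vars))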
Example #17
def load_dask_array_from_s3(plate_id, index, resolution='0'):
    cache_size_mb = 2048
    cfg = {
        'anon': True,
        'client_kwargs': {
            'endpoint_url': 'https://minio-dev.openmicroscopy.org/',
        },
        'root': 'idr/zarr/v0.1-extra/plate-%s.zarr/%s/%s' % (plate_id, index,
                                                             resolution)
    }
    s3 = s3fs.S3FileSystem(
        anon=cfg['anon'],
        client_kwargs=cfg['client_kwargs'],
    )
    store = s3fs.S3Map(root=cfg['root'], s3=s3, check=False)
    cached_store = zarr.LRUStoreCache(store, max_size=(cache_size_mb * 2**20))
    # data.shape is (t, c, z, y, x) by convention
    return da.from_zarr(cached_store)
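A usage sketch; the plate_id and index values are illustrative:

arr = load_dask_array_from_s3('4995', 'A/1/0', resolution='0')
print(arr.shape)  # (t, c, z, y, x) by the convention noted above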
Example #18
    def __getattr__(self, name: str) -> Any:
        if name in self._cube_config:
            if name in self._dataset_cache:
                return self._dataset_cache[name]
            else:

                dataset_descriptor = self._cube_config[name]
                fs_type = dataset_descriptor.get("FileSystem", "local")
                path = dataset_descriptor.get('Path')
                ds = None
                if not path:
                    print("Missing 'path' entry in dataset descriptor")
                if fs_type == 'obs':
                    data_format = dataset_descriptor.get('Format', 'zarr')
                    if data_format != 'zarr':
                        print("Invalid format=" + data_format +
                              "!r} in dataset descriptor ")
                    client_kwargs = {}
                    if 'Endpoint' in dataset_descriptor:
                        client_kwargs['endpoint_url'] = dataset_descriptor[
                            'Endpoint']
                    if 'Region' in dataset_descriptor:
                        client_kwargs['region_name'] = dataset_descriptor[
                            'Region']
                    s3 = s3fs.S3FileSystem(anon=True,
                                           client_kwargs=client_kwargs)
                    store = s3fs.S3Map(root=path, s3=s3, check=False)
                    cached_store = zarr.LRUStoreCache(store, max_size=2**28)
                    ds = xr.open_zarr(cached_store)
                elif fs_type == 'local':
                    if not os.path.isabs(path):
                        path = os.path.join(self.base_dir, path)
                    data_format = dataset_descriptor.get('Format', 'nc')
                    if data_format == 'nc':
                        ds = xr.open_dataset(path)
                    elif data_format == 'zarr':
                        ds = xr.open_zarr(path)
                    else:
                        print("Invalid format=" + data_format +
                              "!r} in dataset descriptor")
                self._dataset_cache[name] = ds
            return ds
        return super().__getattribute__(name)
Example #19
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        client_kwargs: Mapping[str,
                                                               Any] = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    endpoint_url, root = split_bucket_url(path)
    if endpoint_url:
        kwargs['endpoint_url'] = endpoint_url
        path = root

    client_kwargs = dict(client_kwargs or {})
    for arg_name in ['endpoint_url', 'region_name']:
        if arg_name in kwargs:
            client_kwargs[arg_name] = kwargs.pop(arg_name)

    obs_file_system = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(cached_store, consolidated=consolidated,
                             **kwargs))
        return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                obs_file_system,
                path,
                zarr_kwargs=kwargs,
                ds_id=ds_id,
                exception_type=exception_type)

    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}'
    )
Example #20
    def __init__(self, store, cache_size=512 * (1024**2)):
        """
        rss format data access.

        Parameters
        ----------
        store - Instance of a Zarr storage object;
                see s3fs.S3Map for remote S3 storage or zarr.DirectoryStore as common
                types of store.
        """

        # meta-data is read only once, so open the root on the uncached store
        self.root = zarr.open(store, mode="r")

        clear_output()
        print("Mounting line access.")

        cache = zarr.LRUStoreCache(store, max_size=cache_size)

        inline_root = zarr.open(cache, mode="r")
        self.inline_root = inline_root["inline"]

        crossline_root = zarr.open(cache, mode="r")
        self.crossline_root = crossline_root["crossline"]

        clear_output()
        print("Configuring meta-data.")

        self.bounds = self.root["bounds"]

        self.ilxl = np.vstack([
            self.root["coords"]["inlines"][:],
            self.root["coords"]["crosslines"][:],
        ]).T

        self.xy = np.vstack(
            [self.root["coords"]["cdpx"][:], self.root["coords"]["cdpy"][:]]).T

        self.kdtree = None

        clear_output()
        print("Connection complete.")
Example #21
 def open_data(self, data_id: str, **open_params) -> xr.Dataset:
     assert_instance(data_id, str, name='data_id')
     fs, root, open_params = self.load_fs(open_params)
     zarr_store = fs.get_mapper(data_id)
     cache_size = open_params.pop('cache_size', None)
     if isinstance(cache_size, int) and cache_size > 0:
         zarr_store = zarr.LRUStoreCache(zarr_store, max_size=cache_size)
     log_access = open_params.pop('log_access', None)
     if log_access:
         zarr_store = LoggingStore(zarr_store,
                                   name=f'zarr_store({data_id!r})')
     consolidated = open_params.pop('consolidated',
                                    fs.exists(f'{data_id}/.zmetadata'))
     try:
         return xr.open_zarr(zarr_store,
                             consolidated=consolidated,
                             **open_params)
     except ValueError as e:
         raise DataStoreError(f'Failed to open'
                              f' dataset {data_id!r}: {e}') from e
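A usage sketch, assuming store is an instance of the (unnamed) data store class above:

ds = store.open_data('cubes/demo.zarr',      # data_id resolved against the store's filesystem
                     cache_size=2 ** 28,     # wrap the mapper in a 256 MiB zarr.LRUStoreCache
                     log_access=True)        # additionally wrap it in a LoggingStore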
Example #22
    def open(self, mode: str = "r", cached: bool = True, cache_size_bytes: int = int(1e9)) -> None:
        """Opens a zarr dataset from disk from the path supplied in the constructor.

        Keyword Arguments:
            mode (str): Mode to open the dataset in; defaults to read-only (default: {"r"})
            cached (bool): Whether to cache files read from disk using an LRU cache. (default: {True})
            cache_size_bytes (int): Size of cache in bytes (default: {1e9} (1GB))

        Raises:
            Exception: When any of the expected arrays (frames, agents, scenes) is missing or the store couldn't be opened.
        """
        if cached:
            self.root = zarr.open_group(
                store=zarr.LRUStoreCache(zarr.DirectoryStore(self.path), max_size=cache_size_bytes), mode=mode
            )
        else:
            self.root = zarr.open_group(self.path, mode=mode)
        self.frames = self.root[FRAME_ARRAY_KEY]
        self.agents = self.root[AGENT_ARRAY_KEY]
        self.scenes = self.root[SCENE_ARRAY_KEY]
Example #23
    def __init__(
        self,
        zarr_dataset_path: str,
        cache_zarr: bool = False,
        with_history: bool = False,
        return_indices: bool = False,
        agents_from_standard_mask_only: bool = False,
    ):

        if cache_zarr:
            zarr_root = zarr.open_group(
                store=zarr.LRUStoreCache(
                    zarr.DirectoryStore(zarr_dataset_path), max_size=int(1e9)),
                mode="r",
            )
        else:
            zarr_root = zarr.open_group(zarr_dataset_path, mode="r")

        self.cumulative_sizes = zarr_root[SCENE_ARRAY_KEY][
            "frame_index_interval"][:, 1]

        if with_history:
            raise NotImplementedError
        if agents_from_standard_mask_only:
            self.sample_function = partial(
                generate_frame_sample_without_hist,
                agents=zarr_root[AGENT_ARRAY_KEY],
                tl_faces=zarr_root[TL_FACE_ARRAY_KEY],
                agents_from_standard_mask_only=True,
                mask_agent_indices=zarr_root[MASK_AGENT_INDICES_ARRAY_KEY],
            )
        else:
            self.sample_function = partial(
                generate_frame_sample_without_hist,
                agents=zarr_root[AGENT_ARRAY_KEY],
                tl_faces=zarr_root[TL_FACE_ARRAY_KEY],
            )
        self.with_history = with_history
        self.return_indices = return_indices
        self.zarr_root = zarr_root
Example #24
def open_cube(cube_config: CubeConfig,
              observer: Callable = None,
              trace_store_calls: bool = False,
              max_cache_size: int = 2 ** 30,
              **sh_kwargs) -> xr.Dataset:
    """
    Open a data cube from SentinelHub.

    This is a facade function that hides the details of opening a volatile data cube from SentinelHub.

    :param cube_config: The cube configuration.
    :param observer: An observer function or callable that is called on every request made to SentinelHub.
    :param trace_store_calls: Whether to trace and dump calls made into the Zarr store.
    :param max_cache_size: Cache size in bytes. Defaults to 1 GB. If zero or None, no caching takes place.
    :param sh_kwargs: Keyword arguments passed to the SentinelHub constructor.
    :return: the data cube represented by an xarray Dataset object.
    """
    sentinel_hub = SentinelHub(**sh_kwargs)
    cube_store = SentinelHubStore(sentinel_hub, cube_config, observer=observer, trace_store_calls=trace_store_calls)
    if max_cache_size:
        cube_store = zarr.LRUStoreCache(cube_store, max_cache_size)
    return xr.open_zarr(cube_store)
Example #25
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        s3_kwargs: Mapping[str, Any] = None,
                                        s3_client_kwargs: Mapping[str,
                                                                  Any] = None,
                                        chunk_cache_capacity: int = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    s3, root = parse_s3_fs_and_root(path,
                                    s3_kwargs=s3_kwargs,
                                    s3_client_kwargs=s3_client_kwargs,
                                    mode='r')

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=root, s3=s3, check=False)
        if chunk_cache_capacity:
            store = zarr.LRUStoreCache(store, max_size=chunk_cache_capacity)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = s3.exists(f'{root}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(store, consolidated=consolidated, **kwargs))
        return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                s3,
                root,
                zarr_kwargs=kwargs,
                ds_id=ds_id,
                chunk_cache_capacity=chunk_cache_capacity,
                exception_type=exception_type)

    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}'
    )
Example #26
import urllib.request
import matplotlib.colors

from .structure_graph import acronym_to_allen_id, allen_id_to_acronym, structure_graph, allen_id_to_tree_node
from .allen_id_label import labels_for_allen_id
from .swc_morphology import swc_morphology_geometry
from .structure_mesh import structure_mesh

from IPython.core.debugger import set_trace

_image_fs = HTTPFileSystem()
# Todo: Use AWS store after Scott / Lydia upload
_image_store = _image_fs.get_mapper(
    "https://thewtex.github.io/allen-ccf-itk-vtk-zarr/average_template_50_chunked.zarr"
)
_image_store_cached = zarr.LRUStoreCache(_image_store, max_size=None)
_image_ds = xr.open_zarr(_image_store_cached, consolidated=True)
_image_da = _image_ds.average_template_50

_label_image_fs = HTTPFileSystem()
# Todo: Use AWS store after Scott / Lydia upload
_label_image_store = _label_image_fs.get_mapper(
    "https://thewtex.github.io/allen-ccf-itk-vtk-zarr/allen_ccfv3_annotation_50_contiguous.zarr"
)
_label_image_store_cached = zarr.LRUStoreCache(_label_image_store,
                                               max_size=None)
_label_image_ds = xr.open_zarr(_label_image_store_cached, consolidated=True)
_label_image_da = _label_image_ds.allen_ccfv3_annotation


@register
Example #27
 def test_cached(self):
     store_cache = zarr.LRUStoreCache(self.store, max_size=2 * 24)
     cube = xr.open_zarr(store_cache)
     self.assert_4d_cube_is_valid(cube)
Example #28
    def get_dataset(self, ds_name: str) -> xr.Dataset:
        if ds_name in self.dataset_cache:
            ds, _, _ = self.dataset_cache[ds_name]
        else:
            dataset_descriptor = self.get_dataset_descriptor(ds_name)

            path = dataset_descriptor.get('Path')
            if not path:
                raise ServiceConfigError(
                    f"Missing 'path' entry in dataset descriptor {ds_name}")

            t1 = time.perf_counter()

            fs_type = dataset_descriptor.get('FileSystem', 'local')
            if fs_type == 'obs':
                data_format = dataset_descriptor.get('Format', 'zarr')
                if data_format != 'zarr':
                    raise ServiceConfigError(
                        f"Invalid format={data_format!r} in dataset descriptor {ds_name!r}"
                    )
                client_kwargs = {}
                if 'Endpoint' in dataset_descriptor:
                    client_kwargs['endpoint_url'] = dataset_descriptor[
                        'Endpoint']
                if 'Region' in dataset_descriptor:
                    client_kwargs['region_name'] = dataset_descriptor['Region']
                s3 = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)
                store = s3fs.S3Map(root=path, s3=s3, check=False)
                cached_store = zarr.LRUStoreCache(store, max_size=2**28)
                with log_time(f"opened remote dataset {path}"):
                    ds = xr.open_zarr(cached_store)
            elif fs_type == 'local':
                if not os.path.isabs(path):
                    path = os.path.join(self.base_dir, path)
                data_format = dataset_descriptor.get('Format', 'nc')
                if data_format == 'nc':
                    with log_time(f"opened local NetCDF dataset {path}"):
                        ds = xr.open_dataset(path)
                elif data_format == 'zarr':
                    with log_time(f"opened local zarr dataset {path}"):
                        ds = xr.open_zarr(path)
                else:
                    raise ServiceConfigError(
                        f"Invalid format={data_format!r} in dataset descriptor {ds_name!r}"
                    )
            elif fs_type == 'computed':
                if not os.path.isabs(path):
                    path = os.path.join(self.base_dir, path)
                with open(path) as fp:
                    python_code = fp.read()

                local_env = dict()
                global_env = None
                try:
                    exec(python_code, global_env, local_env)
                except Exception as e:
                    raise ServiceError(
                        f"Failed to compute dataset {ds_name!r} from {path!r}: {e}"
                    ) from e

                callable_name = dataset_descriptor.get('Function',
                                                       COMPUTE_DATASET)
                callable_args = dataset_descriptor.get('Args', [])

                callable_obj = local_env.get(callable_name)
                if callable_obj is None:
                    raise ServiceConfigError(
                        f"Invalid dataset descriptor {ds_name!r}: "
                        f"no callable named {callable_name!r} found in {path!r}"
                    )
                elif not callable(callable_obj):
                    raise ServiceConfigError(
                        f"Invalid dataset descriptor {ds_name!r}: "
                        f"object {callable_name!r} in {path!r} is not callable"
                    )

                args = list()
                for arg_value in callable_args:
                    if isinstance(arg_value, str) and len(arg_value) > 2 \
                            and arg_value.startswith('@') and arg_value.endswith('@'):
                        ref_ds_name = arg_value[1:-1]
                        if not self.get_dataset_descriptor(ref_ds_name):
                            raise ServiceConfigError(
                                f"Invalid dataset descriptor {ds_name!r}: "
                                f"argument {arg_value!r} of callable {callable_name!r} "
                                f"must reference another dataset")
                        args.append(self.get_dataset(ref_ds_name))
                    else:
                        args.append(arg_value)

                try:
                    with log_time(f"created computed dataset {ds_name}"):
                        ds = callable_obj(*args)
                except Exception as e:
                    raise ServiceError(
                        f"Failed to compute dataset {ds_name!r} "
                        f"from function {callable_name!r} in {path!r}: {e}"
                    ) from e
                if not isinstance(ds, xr.Dataset):
                    raise ServiceError(
                        f"Failed to compute dataset {ds_name!r} "
                        f"from function {callable_name!r} in {path!r}: "
                        f"expected an xarray.Dataset but got a {type(ds)}")
            else:
                raise ServiceConfigError(
                    f"Invalid fs={fs_type!r} in dataset descriptor {ds_name!r}"
                )

            tile_grid_cache = dict()
            self.dataset_cache[
                ds_name] = ds, dataset_descriptor, tile_grid_cache

            t2 = time.perf_counter()

            if TRACE_PERF:
                print(f'PERF: opening {ds_name!r} took {t2-t1} seconds')

        return ds
Example #29
    def __init__(self,
                 filename: str,
                 hdf5group: str = None,
                 hdf5file_mode: str = 'r',
                 store: Union[MutableMapping, str, Path] = None,
                 store_path: str = None,
                 store_mode: str = 'a',
                 LRU: bool = False,
                 LRU_max_size: int = 2**30,
                 max_chunksize=2 * 2**20):
        """
        Args:
            filename:                    str or File-like object, file name string or File-like object to be read by zarr
            hdf5group:                   str, hdf5 group in hdf5 file to be read by zarr
                                         along with its children. default is the root group.
            hdf5file_mode                str, subset of h5py file access modes, filename must exist
                                         'r'          readonly, default 'r'
                                         'r+'         read and write
            store:                       collections.abc.MutableMapping or str, zarr store.
                                         if string path is passed, zarr.DirectoryStore
                                         is created at the given path, if None, zarr.MemoryStore is used
            store_mode:                  store data access mode, default 'a'
                                         'r'          readonly, compatible zarr hierarchy should
                                                      already exist in the passed store
                                         'r+'         read and write, return error if file does not exist,
                                                      for updating zarr hierarchy
                                         'w'          create store, remove data if it exists
                                         'w-' or 'x'  create store, fail if exists
                                         'a'          read and write, create if it does not exist, default 'a'
            store_path:                  string, path in zarr store
            LRU:                         bool, if store is not already zarr.LRUStoreCache, add
                                         a zarr.LRUStoreCache store layer on top of currently used store
            LRU_max_size:                int, maximum zarr.LRUStoreCache cache size, only used
                                         if store is zarr.LRUStoreCache, or LRU argument is True
            max_chunksize:               maximum chunk size to use when creating zarr hierarchy, this is useful if
                                         only a small slice of data needs to be read
        """
        # Verify arguments
        if hdf5file_mode not in ('r', 'r+'):
            raise ValueError("hdf5file_mode must be 'r' or 'r+'")
        self.hdf5file_mode = hdf5file_mode

        # Verify arguments
        if not isinstance(LRU, bool):
            raise TypeError(f"Expected bool for LRU, recieved {type(LRU)}")
        self.LRU = LRU
        if not isinstance(LRU_max_size, int):
            raise TypeError(
                f"Expected int for LRU_max_size, recieved {type(LRU_max_size)}"
            )
        self.LRU_max_size = LRU_max_size
        if not isinstance(max_chunksize, int):
            raise TypeError(
                f"Expected int for max_chunksize, recieved {type(max_chunksize)}"
            )
        self.max_chunksize = max_chunksize

        # store, store_path, and store_mode are passed through to zarr
        self.store_path = store_path
        self.store_mode = store_mode
        if store is not None and LRU is True and not isinstance(
                store, zarr.LRUStoreCache):
            self.store = zarr.LRUStoreCache(store, max_size=self.LRU_max_size)
        else:
            self.store = store

        # create dictionary mapping hdf5 filter numbers to compatible zarr codec
        self._hdf5_regfilters_subset = {}
        self._fill_regfilters()

        # dictionary to hold addresses of hdf5 objects in file
        self._address_dict = {}

        # create zarr format hierarchy for datasets and attributes compatible with hdf5 file,
        # dataset contents are not copied, unless it contains variable-length strings

        self.zgroup = zarr.open_group(self.store,
                                      mode=self.store_mode,
                                      path=self.store_path)
        if self.store is None:
            self.store = self.zgroup.store

        # FileChunkStore requires uri
        if isinstance(filename, str):
            self.uri = filename
        else:
            try:
                self.uri = getattr(filename, 'path', None)
                if self.uri is None:
                    self.uri = filename.name
            except:
                self.uri = ''

        # Access hdf5 file and create zarr hierarchy
        if hdf5group is not None and not isinstance(hdf5group, str):
            raise TypeError(
                f"Expected str for hdf5group, recieved {type(hdf5group)}")
        self.hdf5group = hdf5group
        self.filename = filename
        if self.store_mode != 'r':
            self.file = h5py.File(self.filename, mode=self.hdf5file_mode)
            self.group = self.file[
                self.hdf5group] if self.hdf5group is not None else self.file
            self.create_zarr_hierarchy(self.group, self.zgroup)
            self.file.close()
        if isinstance(self.filename, str):
            self.chunkstore_file = fsspec.open(self.filename, mode='rb')
            self.chunk_store = FileChunkStore(
                self.store, chunk_source=self.chunkstore_file.open())
        else:
            self.chunk_store = FileChunkStore(self.store,
                                              chunk_source=self.filename)
        if LRU is True and not isinstance(self.chunk_store,
                                          zarr.LRUStoreCache):
            self.chunk_store = zarr.LRUStoreCache(self.chunk_store,
                                                  max_size=self.LRU_max_size)

        # open zarr group
        store_mode_cons = 'r' if self.store_mode == 'r' else 'r+'
        self.zgroup = zarr.open_group(self.store,
                                      mode=store_mode_cons,
                                      path=self.store_path,
                                      chunk_store=self.chunk_store)
Example #30
    def _create_dataset_entry(
            self, ds_id: str) -> Tuple[MultiLevelDataset, Dict[str, Any]]:

        dataset_descriptor = self.get_dataset_descriptor(ds_id)

        path = dataset_descriptor.get('Path')
        if not path:
            raise ServiceConfigError(
                f"Missing 'path' entry in dataset descriptor {ds_id}")

        t1 = time.perf_counter()

        fs_type = dataset_descriptor.get('FileSystem', 'local')
        if fs_type == 'obs':
            data_format = dataset_descriptor.get('Format', 'zarr')
            s3_client_kwargs = {}
            if 'Endpoint' in dataset_descriptor:
                s3_client_kwargs['endpoint_url'] = dataset_descriptor[
                    'Endpoint']
            if 'Region' in dataset_descriptor:
                s3_client_kwargs['region_name'] = dataset_descriptor['Region']
            obs_file_system = s3fs.S3FileSystem(anon=True,
                                                client_kwargs=s3_client_kwargs)
            if data_format == 'zarr':
                store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
                cached_store = zarr.LRUStoreCache(store, max_size=2**28)
                with measure_time(tag=f"opened remote zarr dataset {path}"):
                    ds = xr.open_zarr(cached_store)
                ml_dataset = BaseMultiLevelDataset(ds)
            elif data_format == 'levels':
                with measure_time(tag=f"opened remote levels dataset {path}"):
                    ml_dataset = ObjectStorageMultiLevelDataset(
                        ds_id,
                        obs_file_system,
                        path,
                        exception_type=ServiceConfigError)
            else:
                raise ServiceConfigError(
                    f"Invalid format={data_format!r} in dataset descriptor {ds_id!r}"
                )
        elif fs_type == 'local':
            if not os.path.isabs(path):
                path = os.path.join(self.base_dir, path)

            data_format = dataset_descriptor.get('Format', 'nc')
            if data_format == 'nc':
                with measure_time(tag=f"opened local NetCDF dataset {path}"):
                    ds = xr.open_dataset(path)
                    ml_dataset = BaseMultiLevelDataset(ds)
            elif data_format == 'zarr':
                with measure_time(tag=f"opened local zarr dataset {path}"):
                    ds = xr.open_zarr(path)
                    ml_dataset = BaseMultiLevelDataset(ds)
            elif data_format == 'levels':
                with measure_time(tag=f"opened local levels dataset {path}"):
                    ml_dataset = FileStorageMultiLevelDataset(path)
            else:
                raise ServiceConfigError(
                    f"Invalid format={data_format!r} in dataset descriptor {ds_id!r}"
                )
        elif fs_type == 'memory':
            if not os.path.isabs(path):
                path = os.path.join(self.base_dir, path)

            callable_name = dataset_descriptor.get('Function', COMPUTE_DATASET)
            input_dataset_ids = dataset_descriptor.get('InputDatasets', [])
            input_parameters = dataset_descriptor.get('InputParameters', {})

            for input_dataset_id in input_dataset_ids:
                if not self.get_dataset_descriptor(input_dataset_id):
                    raise ServiceConfigError(
                        f"Invalid dataset descriptor {ds_id!r}: "
                        f"Input dataset {input_dataset_id!r} of callable {callable_name!r} "
                        f"must reference another dataset")

            with measure_time(tag=f"opened memory dataset {path}"):
                ml_dataset = ComputedMultiLevelDataset(
                    ds_id,
                    path,
                    callable_name,
                    input_dataset_ids,
                    self.get_ml_dataset,
                    input_parameters,
                    exception_type=ServiceConfigError)

        else:
            raise ServiceConfigError(
                f"Invalid fs={fs_type!r} in dataset descriptor {ds_id!r}")

        t2 = time.perf_counter()

        if self.config.get("trace_perf", False):
            _LOG.info(f'Opening {ds_id!r} took {t2 - t1} seconds')

        return ml_dataset, dataset_descriptor