Example #1
def validate_s3_dataset(bucket_name,
                        path_in_bucket,
                        dataset_name,
                        server=None,
                        anon=True,
                        n_threads=1):

    tmp_file = "./tmp_file.n5"
    os.makedirs(tmp_file, exist_ok=True)

    fs = _get_fs(server, anon)
    # make a dummy local file by copying the relevant attributes.json
    store = s3fs.S3Map(root=path_in_bucket, s3=fs)
    attrs = store["attributes.json"].decode("utf-8")
    attrs = json.loads(attrs)
    attrs_file = os.path.join(tmp_file, "attributes.json")
    with open(attrs_file, "w") as f:
        json.dump(attrs, f)

    # make a dummy dataset by copying the dataset attributes.json
    store = s3fs.S3Map(root=os.path.join(path_in_bucket, dataset_name), s3=fs)
    try:
        attrs = store["attributes.json"].decode("utf-8")
    except KeyError:
        try:
            rmtree(tmp_file)
        except OSError:
            pass
        raise ValueError(
            f"No file {path_in_bucket}:{dataset_name} in {bucket_name}")

    attrs = json.loads(attrs)
    tmp_ds = os.path.join(tmp_file, dataset_name)
    os.makedirs(tmp_ds, exist_ok=True)
    attrs_file = os.path.join(tmp_ds, "attributes.json")
    with open(attrs_file, "w") as f:
        json.dump(attrs, f)

    print("validating chunks for s3 dataset stored at")
    if server is None:
        print(f"{bucket_name}:{path_in_bucket}:{dataset_name}")
    else:
        print(f"{server}:{bucket_name}:{path_in_bucket}:{dataset_name}")
    dataset = zarr.open(tmp_file)[dataset_name]
    corrupted_chunks = validate_chunks_s3(store,
                                          dataset,
                                          keys=iter(store),
                                          n_threads=n_threads)

    try:
        rmtree(tmp_file)
    except OSError:
        warnings.warn(
            f"Could not clean up temporary data stored in {tmp_file}")

    return corrupted_chunks
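A minimal usage sketch for the validator above (the bucket, path and dataset names are placeholders; _get_fs and validate_chunks_s3 are assumed to live in the same module):

corrupted = validate_s3_dataset("example-bucket",
                                "example-bucket/volume.n5",
                                "setup0/timepoint0/s0",
                                anon=True,
                                n_threads=8)
if corrupted:
    # each entry is a chunk key that failed to read or decode
    print(f"{len(corrupted)} corrupted chunks found")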
Example #2
    @classmethod
    def _write_test_data(cls, s3: s3fs.S3FileSystem):
        if not s3.isdir(cls.BUCKET_NAME):
            s3.mkdir(cls.BUCKET_NAME)

        data = helpers.make_test_store()
        s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_1.zarr',
                           s3=s3,
                           create=True)
        s3map.update(data)
        s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_2.zarr',
                           s3=s3,
                           create=True)
        s3map.update(data)
Example #3
    def __init__(self,
                 filename,
                 client_kwargs=None,
                 cache_size=512 * (1024**2)):
        """
        An object for accessing rss data from s3 blob storage.

        Parameters
        ----------
        filename : path to rss data object on s3.
        client_kwargs : dict containing aws_access_key_id and aws_secret_access_key or None.
        If this variable is none, anonymous access is assumed.
        cache_size : max size of the LRU cache.
        """
        print("Establishing Connection, may take a minute ......")

        anon = client_kwargs is None

        s3 = s3fs.S3FileSystem(anon=anon, client_kwargs=client_kwargs)

        clear_output()
        print("Connected to S3.")

        store = s3fs.S3Map(root=filename, s3=s3, check=False)

        super().__init__(store, cache_size=cache_size)
Example #4
def get_path_or_obs_store(path_or_url: str,
                          client_kwargs: Mapping[str, Any] = None,
                          mode: str = 'r') -> Tuple[Union[str, Dict], bool]:
    """
    If *path_or_url* is an object storage URL, return a object storage Zarr store (mapping object)
    using *client_kwargs* and *mode* and a flag indicating whether the Zarr datasets is consolidated.

    Otherwise *path_or_url* is interpreted as a local file system path, retured as-is plus
    a flag indicating whether the Zarr datasets is consolidated.

    :param path_or_url: A path or a URL.
    :param client_kwargs: Object storage client keyword arguments.
    :param mode: "r" or "w"
    :return: A tuple (path_or_obs_store, consolidated).
    """
    if is_obs_url(path_or_url):
        root, obs_fs_kwargs, obs_fs_client_kwargs = parse_obs_url_and_kwargs(
            path_or_url, client_kwargs)
        s3 = s3fs.S3FileSystem(**obs_fs_kwargs,
                               client_kwargs=obs_fs_client_kwargs)
        consolidated = mode == "r" and s3.exists(f'{root}/.zmetadata')
        return s3fs.S3Map(root=root, s3=s3, check=False,
                          create=mode == "w"), consolidated
    else:
        consolidated = os.path.exists(os.path.join(path_or_url, '.zmetadata'))
        return path_or_url, consolidated
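A hedged usage sketch for get_path_or_obs_store (the URL below is a placeholder, and whether it is recognised as an object storage URL depends on is_obs_url):

import xarray as xr

# Works for both object storage URLs and local paths.
store_or_path, consolidated = get_path_or_obs_store(
    "https://s3.example.com/example-bucket/cube.zarr", mode="r")
ds = xr.open_zarr(store_or_path, consolidated=consolidated)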
Example #5
def fix_corrupted_chunks_s3(corrupted_chunks,
                            local_dataset_path,
                            local_dataset_key,
                            bucket_name,
                            path_in_bucket,
                            dataset_name,
                            server=None,
                            anon=False):
    try:
        local_ds = zarr.open(local_dataset_path, "r")[local_dataset_key]
    except KeyError:
        raise ValueError(
            f"No dataset {local_dataset_key} in {local_dataset_path}")

    fs = _get_fs(server, anon)
    store = s3fs.S3Map(root=os.path.join(path_in_bucket, dataset_name), s3=fs)

    local_corrupted_chunks = []
    for chunk_id in corrupted_chunks:
        local_chunk_path = os.path.join(local_dataset_path, local_dataset_key,
                                        chunk_id)
        with open(local_chunk_path, "rb") as f:
            cdata = f.read()
        try:
            cdata = local_ds._decode_chunk(cdata)
        except Exception:
            local_corrupted_chunks.append(chunk_id)
        store[chunk_id] = cdata

    return local_corrupted_chunks
Example #6
def _save_zarr(dataset: xr.Dataset, url: str, profile_name: str,
               suffix: str) -> None:

    if isinstance(url, Path):
        url = str(url)

    url = url.rstrip("/")

    if url.startswith("s3://"):

        url = url.replace(" ", '-')
        url = url.replace("s3://", "")

        if url.count("/") > 1:
            raise ValueError(
                "I haven't figured out how to write groups yet, this will fail."
            )
        s3 = s3fs.S3FileSystem(profile_name=profile_name)
        root = f"{url}.{suffix}.zarr"
        store = s3fs.S3Map(root=root, s3=s3, check=False)

        dataset.to_zarr(store=store)

    else:  # assume local
        dataset.to_zarr(f"{url}.{suffix}.zarr")
Example #7
    def __call__(self, store_id: str) -> collections.MutableMapping:
        """Open a store for store identifier *store_id*."""
        import s3fs
        return s3fs.S3Map(root=self._root_pattern.format(store_id=store_id),
                          s3=self._s3,
                          check=False,
                          create=True)
Example #8
    def _get_dataset_lazily(self, index: int, **zarr_kwargs) -> xr.Dataset:
        """
        Read the dataset for the level at given *index*.

        :param index: the level index
        :param zarr_kwargs: kwargs passed to xr.open_zarr()
        :return: the dataset for the level at *index*.
        """
        ext, level_path = self._level_paths[index]
        if ext == ".link":
            with self._obs_file_system.open(level_path, "r") as fp:
                level_path = fp.read()
                # if level_path is relative, resolve it against the levels directory
                if not os.path.isabs(level_path):
                    base_dir = os.path.dirname(self._dir_path)
                    level_path = os.path.join(base_dir, level_path)

        store = s3fs.S3Map(root=level_path,
                           s3=self._obs_file_system,
                           check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(
                tag=f"opened remote dataset {level_path} for level {index}"):
            return assert_cube(xr.open_zarr(cached_store, **zarr_kwargs),
                               name=level_path)
Example #9
    def read(self, path: str, **kwargs) -> xr.Dataset:
        path_or_store = path

        if isinstance(path, str):
            endpoint_url = None
            root = None

            if 'endpoint_url' in kwargs:
                endpoint_url = kwargs.pop('endpoint_url')
                root = path
            if path.startswith("http://") or path.startswith("https://"):
                import urllib3.util
                url = urllib3.util.parse_url(path_or_store)
                if url.port is not None:
                    endpoint_url = f'{url.scheme}://{url.host}:{url.port}'
                else:
                    endpoint_url = f'{url.scheme}://{url.host}'
                root = url.path
                if root.startswith('/'):
                    root = root[1:]

            if endpoint_url and root is not None:
                s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(endpoint_url=endpoint_url))
                path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
                if 'max_cache_size' in kwargs:
                    max_cache_size = kwargs.pop('max_cache_size')
                    if max_cache_size > 0:
                        path_or_store = zarr.LRUStoreCache(path_or_store, max_size=max_cache_size)

        return xr.open_zarr(path_or_store, **kwargs)
Example #10
def get_path_or_s3_store(path_or_url: str,
                         s3_kwargs: Mapping[str, Any] = None,
                         s3_client_kwargs: Mapping[str, Any] = None,
                         mode: str = 'r') -> Tuple[Union[str, Dict], bool]:
    """
    If *path_or_url* is an object storage URL, return a object storage
    Zarr store (mapping object) using *s3_client_kwargs* and *mode* and
    a flag indicating whether the Zarr datasets is consolidated.

    Otherwise *path_or_url* is interpreted as a local file system path,
    returned as-is plus a flag indicating whether the Zarr datasets
    is consolidated.

    :param path_or_url: A path or a URL.
    :param s3_kwargs: keyword arguments for S3 file system.
    :param s3_client_kwargs: keyword arguments for S3 boto3 client.
    :param mode: access mode "r" or "w". "r" is default.
    :return: A tuple (path_or_obs_store, consolidated).
    """
    if is_s3_url(path_or_url) \
            or s3_kwargs is not None \
            or s3_client_kwargs is not None:
        s3, root = parse_s3_fs_and_root(path_or_url,
                                        s3_kwargs=s3_kwargs,
                                        s3_client_kwargs=s3_client_kwargs,
                                        mode=mode)
        consolidated = mode == "r" and s3.exists(f'{root}/.zmetadata')
        return s3fs.S3Map(root=root, s3=s3, check=False,
                          create=mode == "w"), consolidated
    else:
        consolidated = os.path.exists(os.path.join(path_or_url, '.zmetadata'))
        return path_or_url, consolidated
Example #11
    def _get_dataset_lazily(self, index: int,
                            parameters: Dict[str, Any]) -> xr.Dataset:
        """
        Read the dataset for the level at given *index*.

        :param index: the level index
        :param parameters: keyword arguments passed to xr.open_zarr()
        :return: the dataset for the level at *index*.
        """
        ext, level_path = self._level_paths[index]
        if ext == ".link":
            with self._s3_file_system.open(level_path, "r") as fp:
                level_path = fp.read()
                # if level_path is relative, resolve it against the levels directory
                if not os.path.isabs(level_path):
                    base_dir = os.path.dirname(self._dir_path)
                    level_path = os.path.join(base_dir, level_path)
        store = s3fs.S3Map(root=level_path,
                           s3=self._s3_file_system,
                           check=False)
        max_size = self.get_chunk_cache_capacity(index)
        if max_size:
            store = zarr.LRUStoreCache(store, max_size=max_size)
        with measure_time(
                tag=f"opened remote dataset {level_path} for level {index}"):
            consolidated = self._s3_file_system.exists(
                f'{level_path}/.zmetadata')
            return assert_cube(xr.open_zarr(store,
                                            consolidated=consolidated,
                                            **parameters),
                               name=level_path)
Example #12
    def read(self, path: str, **kwargs) -> xr.Dataset:
        path_or_store = path
        consolidated = False
        mode = 'read'
        root = None

        if isinstance(path, str):
            client_kwargs = {}
            if 'client_kwargs' in kwargs:
                client_kwargs = kwargs.pop('client_kwargs')
            if 'endpoint_url' in kwargs:
                client_kwargs['endpoint_url'] = kwargs.pop('endpoint_url')
                root = path
            if 'region_name' in kwargs:
                client_kwargs['region_name'] = kwargs.pop('region_name')

            path_or_store, root, client_kwargs = _get_path_or_store(path_or_store, client_kwargs, mode, root)

            if 'endpoint_url' in client_kwargs and root is not None:
                s3 = s3fs.S3FileSystem(anon=True,
                                       client_kwargs=client_kwargs)
                consolidated = s3.exists(f'{root}/.zmetadata')
                path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
                if 'max_cache_size' in kwargs:
                    max_cache_size = kwargs.pop('max_cache_size')
                    if max_cache_size > 0:
                        path_or_store = zarr.LRUStoreCache(path_or_store, max_size=max_cache_size)
            else:
                consolidated = os.path.exists(os.path.join(path_or_store, '.zmetadata'))
        return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
Example #13
def open_ml_dataset_from_object_storage(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True,
                                        client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            ds = xr.open_zarr(cached_store)
        return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                ds_id,
                obs_file_system,
                path,
                exception_type=ServiceConfigError)
Example #14
    def _zarr_get(self, uuid, x, y, z, t, c, level):
        s3 = s3fs.S3FileSystem(client_kwargs=dict(region_name=self.region))
        s3_store = s3fs.S3Map(root=f"{self.bucket}/{uuid}", s3=s3, check=False)
        group = zarr.hierarchy.open_group(store=s3_store)
        level = group.get(str(level))
        return level[t, c, z, y * self.tile_size:(y + 1) * self.tile_size,
                     x * self.tile_size:(x + 1) * self.tile_size]
Example #15
def nc2zarr(fns, zpath, s3store=True, chunks=None, parallel=True):
    '''
    Convert netCDF files to zarr format and save to a local or S3 store.

    Parameters
    ----------
    fns     : a list of netCDF file names with full paths
    zpath   : path to the local or S3 store
    s3store : whether to save to an S3 store, boolean
    chunks  : chunks used to read and write the data
    parallel: whether to use dask to read the files in parallel, boolean
    '''
    # --- remove lat/long from the list of vars to be concatenated.
    with xr.open_mfdataset(fns, parallel=True, chunks=chunks,
                           combine='nested', concat_dim='time') as ds:
        vns = list(ds.data_vars)
    for vn in ['lat', 'long']:
        if vn in vns:
            vns.remove(vn)

    with xr.open_mfdataset(fns, chunks=chunks, parallel=parallel, data_vars=vns,
                           combine='nested', concat_dim='time') as ds:
        if s3store:
            fs = s3fs.S3FileSystem(anon=False)
            ds_store = s3fs.S3Map(root=zpath, s3=fs, check=False, create=True)
        else:
            ds_store = zpath
        if chunks is not None:
            ds = ds.chunk(chunks=chunks)
        else:
            ds = ds.chunk(chunks={x: ds.chunks[x][0] for x in ds.chunks})
        compressor = zarr.Blosc(cname='zstd', clevel=4)
        encoding = {vname: {'compressor': compressor} for vname in ds.data_vars}
        ds.to_zarr(store=ds_store, encoding=encoding, consolidated=True)

    return
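A usage sketch for nc2zarr, assuming a directory of netCDF files (the file pattern, chunking and bucket path are placeholders; writing to S3 requires credentials because the function uses anon=False):

import glob

files = sorted(glob.glob("/data/nc/model_output_*.nc"))
nc2zarr(files, "example-bucket/model_output.zarr",
        s3store=True, chunks={"time": 24}, parallel=True)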
Example #16
def _get_path_or_store(path: str, client_kwargs: Dict[str, Any], mode: str, root: str):
    path_or_store = path
    anon_mode = True
    if not client_kwargs:
        client_kwargs = {}

    # client_kwargs is never None at this point, so check the keys directly
    if 'provider_access_key_id' in client_kwargs and 'provider_secret_access_key' in client_kwargs:
        anon_mode = False
        client_kwargs['aws_access_key_id'] = client_kwargs.pop('provider_access_key_id')
        client_kwargs['aws_secret_access_key'] = client_kwargs.pop('provider_secret_access_key')

    if path.startswith("https://") or path.startswith("http://"):
        import urllib3.util
        url = urllib3.util.parse_url(path_or_store)
        if url.port is not None:
            client_kwargs['endpoint_url'] = f'{url.scheme}://{url.host}:{url.port}'
        else:
            client_kwargs['endpoint_url'] = f'{url.scheme}://{url.host}'
        root = url.path
        if root.startswith('/'):
            root = root[1:]
        if "write" in mode:
            root = f's3://{root}'
        s3 = s3fs.S3FileSystem(anon=anon_mode,
                               client_kwargs=client_kwargs)
        path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
    return path_or_store, root, client_kwargs
Example #17
    def read(self, path: str, **kwargs) -> xr.Dataset:
        path_or_store = path
        consolidated = False

        if isinstance(path, str):
            region_name = None

            if 'endpoint_url' in kwargs:
                endpoint_url = kwargs.pop('endpoint_url')
                root = path
            else:
                endpoint_url, root = split_bucket_url(path)

            if 'region_name' in kwargs:
                region_name = kwargs.pop('region_name')

            if endpoint_url and root:
                s3 = s3fs.S3FileSystem(anon=True,
                                       client_kwargs=dict(
                                           endpoint_url=endpoint_url,
                                           region_name=region_name))
                consolidated = s3.exists(f'{root}/.zmetadata')
                path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
                if 'max_cache_size' in kwargs:
                    max_cache_size = kwargs.pop('max_cache_size')
                    if max_cache_size > 0:
                        path_or_store = zarr.LRUStoreCache(
                            path_or_store, max_size=max_cache_size)
            else:
                consolidated = os.path.exists(
                    os.path.join(path_or_store, '.zmetadata'))
        return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
Example #18
def load_json_from_s3(address):
    server = "/".join(address.split("/")[:3])
    root_path = "/".join(address.split("/")[3:-1])
    fname = address.split("/")[-1]
    fs = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": server})
    store = s3fs.S3Map(root=root_path, s3=fs)
    attrs = store[fname]
    attrs = json.loads(attrs.decode("utf-8"))
    return attrs
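A usage sketch for load_json_from_s3; the endpoint and object key are placeholders, and the address is expected to have the form "<endpoint>/<bucket>/<path>/<file>.json":

attrs = load_json_from_s3(
    "https://s3.example.com/example-bucket/volume.n5/attributes.json")
print(attrs)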
Example #19
def make_example_nc(nldas_path, out_file):
    if os.path.split(nldas_path)[1] == '.zattrs':
        nldas_path = os.path.split(nldas_path)[0]

    fs = s3fs.S3FileSystem(profile='ds-drb-creds', anon=False)
    nldas_path = s3fs.S3Map(nldas_path, s3=fs)
    ds = xr.open_zarr(nldas_path)
    ds = ds.isel(time=0)
    ds[['pressfc']].to_netcdf(out_file)
Example #20
def read_zarr(path, project, **kwargs):
    dataset_id = to_dataset_id(path, project)
    zarr_path = get_zarr_path(dataset_id, project, join=True)
    endpoint_url = CONFIG["store"]["endpoint_url"]
    jasmin_s3 = s3fs.S3FileSystem(anon=True,
                                  client_kwargs={"endpoint_url": endpoint_url})

    s3_store = s3fs.S3Map(root=zarr_path, s3=jasmin_s3)
    ds = xr.open_zarr(store=s3_store, consolidated=True, **kwargs)
    return ds
Example #21
def write_zarr_to_s3(
        dataset: xr.Dataset,
        full_zarr_filename: str,
        s3: s3fs.S3FileSystem,
        **write_zarr_kwargs) -> xr.backends.ZarrStore:

    prep_and_check_s3(full_zarr_filename, s3)
    store = s3fs.S3Map(
        root=full_zarr_filename, s3=s3, check=False, create=True)
    return write_zarr(dataset, store, **write_zarr_kwargs)
Example #22
def merge_zarrs(zarr_paths, output_path):

    s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='us-east-1'))
    arrays_to_merge = []
    for zarr_path in zarr_paths:
        store = s3fs.S3Map(root=zarr_path[len("s3://"):], s3=s3, check=False)
        arr = zarr.Array(store)
        arrays_to_merge.append(arr)
    merged_array = numpy.concatenate(arrays_to_merge, axis=1)
    zarr.save(output_path, merged_array)
Example #23
def open_zarr(zarr_url):
    url_comps = urlparse(zarr_url)

    endpoint = f"{url_comps.scheme}://{url_comps.netloc}"
    zarr_path = url_comps.path

    jasmin_s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})

    s3_store = s3fs.S3Map(root=zarr_path, s3=jasmin_s3)
    ds = xr.open_zarr(store=s3_store, consolidated=True)
    return ds
Example #24
def save_zarr(chunk_df, out_zarr):
    col_name = 'nldas_grid_no'
    idx_name = 'nhd_comid'
    chunks = {col_name: 10000, idx_name: 30000}
    ds = convert_df_to_dataset(chunk_df, col_name, idx_name, 'weight',
                               chunks)
    print(ds)
    print(out_zarr)
    fs = s3fs.S3FileSystem(profile='ds-drb-creds', anon=False)
    out_zarr = s3fs.S3Map(out_zarr, s3=fs)
    ds.to_zarr(out_zarr, mode='w')
Example #25
    def test_remote(self):
        import s3fs
        endpoint_url = "https://s3.eu-central-1.amazonaws.com"
        s3 = s3fs.S3FileSystem(anon=True,
                               client_kwargs=dict(endpoint_url=endpoint_url))
        s3_store = s3fs.S3Map(root="xcube-examples/OLCI-SNS-RAW-CUBE-2.zarr",
                              s3=s3,
                              check=False)
        diagnostic_store = DiagnosticStore(
            s3_store, logging_observer(log_path='remote-cube.log'))
        xr.open_zarr(diagnostic_store)
Example #26
def open_zarr(s3_path, anon=False, cache=False):
    """Open a zarr archive and return its root."""

    s3 = s3fs.S3FileSystem(anon=anon)
    store = s3fs.S3Map(s3_path, s3=s3, check=False, create=False)
    if cache:
        lrucache = zarr.LRUStoreCache(store=store, max_size=1 << 29)
        root = zarr.group(store=lrucache)
    else:
        root = zarr.group(store=store)
    return root
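A usage sketch for open_zarr above (the bucket path is a placeholder):

root = open_zarr("example-bucket/archive.zarr", anon=True, cache=True)
print(list(root.array_keys()))  # names of the arrays in the root group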
Example #27
    def load_data(self):

        units = ""
        level = self.variable.split('-')[1]
        self.variable = self.variable.split('-')[0]
        fs = s3fs.S3FileSystem(anon=True)
        run_date_str = self.run_date.strftime("%Y%m%d")
        run_hour = self.run_date.strftime("%H")
        path = join(self.path, run_date_str,
                    f'{run_date_str}_{run_hour}z_fcst.zarr', level,
                    self.variable, level)
        f = s3fs.S3Map(root=path, s3=fs, check=False)
        ds = xr.open_mfdataset([f], engine='zarr', parallel=True).load()

        if self.run_date in self.valid_dates:
            arr = ds[self.variable].values[
                self.forecast_hours[0]:self.forecast_hours[-1] + 1].astype('float32')
            forecast_hour_00_path = join(
                self.path, run_date_str,
                f'{run_date_str}_{run_hour}z_anl.zarr', level,
                self.variable.replace('1hr_', ''), level)
            fh_0_file = s3fs.S3Map(root=forecast_hour_00_path,
                                   s3=fs,
                                   check=False)
            fh_0_ds = xr.open_mfdataset([fh_0_file],
                                        engine='zarr',
                                        parallel=True).expand_dims('time')
            fh_0_arr = fh_0_ds[self.variable.replace('1hr_', '')].values
            array = np.concatenate([
                fh_0_arr, arr
            ])[self.forecast_hours[0]:self.forecast_hours[-1] + 1, :, :]
        else:
            array = ds[self.variable].values[
                self.forecast_hours[0] - 1:self.forecast_hours[-1]].astype('float32')

        if hasattr(ds[self.variable], 'units'):
            units = ds[self.variable].attrs['units']

        return array, units
Example #28
def retrieve_era5(varname, year, month, datestart, dateend):
    """
    Retrieve ERA5 Reanalysis from https://registry.opendata.aws/ecmwf-era5/.
    Open zarr file and get data in time range (datestart, dateend)

    Args:
        varname (str): variable names, like:
                       air_pressure_at_mean_sea_level
                       air_temperature_at_2_metres
                       air_temperature_at_2_metres_1hour_Maximum
                       air_temperature_at_2_metres_1hour_Minimum
                       dew_point_temperature_at_2_metres
                       eastward_wind_at_100_metres
                       eastward_wind_at_10_metres
                       integral_wrt_time_of_surface_direct_downwelling_shortwave_flux_in_air_1hour_Accumulation
                       lwe_thickness_of_surface_snow_amount
                       northward_wind_at_100_metres
                       northward_wind_at_10_metres
                       precipitation_amount_1hour_Accumulation
                       sea_surface_temperature
                       snow_density
                       surface_air_pressure
        year (int): year
        month (int): month
        datestart (str): start date, like '1987-12-02'
        dateend (str): end date, like '1987-12-02 23:59'
    
    Examples:
        year = 1987
        month = 12
        datestart = '1987-12-02'
        dateend = '1987-12-02 23:59'
        varname = 'air_temperature_at_2_metres'
        data = retrieve_era5(varname, year, month, datestart, dateend)
        print(data)
    """

    # Access S3 file system with anonymous.
    fs = s3fs.S3FileSystem(anon=True)

    # construct data file
    datestring = 'era5-pds/zarr/{year}/{month:02d}/data/'.format(year=year,
                                                                 month=month)
    datafile = datestring + varname + '.zarr/'

    # open zarr file
    data = xr.open_zarr(s3fs.S3Map(datafile, s3=fs))
    if varname in ['precipitation_amount_1hour_Accumulation']:
        data = data.sel(time1=slice(np.datetime64(datestart), np.datetime64(dateend)))
    else:
        data = data.sel(time0=slice(np.datetime64(datestart), np.datetime64(dateend)))

    return data
Example #29
    def __init__(self, url, root, **kwargs):

        super().__init__(**kwargs)

        fs = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": url})

        store = s3fs.S3Map(
            root=root,
            s3=fs,
            check=False,
        )
        self._ds = xr.open_zarr(store)
Example #30
def read_zarr(path, **kwargs):
    dataset_id = to_dataset_id(path)
    zarr_path = "/".join(split_string_at(dataset_id, ".", 4)) + ".zarr"

    endpoint_url = CONFIG["store"]["endpoint_url"]
    jasmin_s3 = s3fs.S3FileSystem(
        anon=True, client_kwargs={"endpoint_url": endpoint_url}
    )

    s3_store = s3fs.S3Map(root=zarr_path, s3=jasmin_s3)
    ds = xr.open_zarr(store=s3_store, consolidated=True, **kwargs)
    return ds