# Shared imports assumed by the snippets below; each snippet is excerpted from
# a different code base, so project-specific helpers (e.g. _get_fs,
# measure_time, CONFIG) are not defined here.
import json
import os
import warnings
from os.path import join
from shutil import rmtree
from urllib.parse import urlparse

import numpy as np
import s3fs
import xarray as xr
import zarr


def validate_s3_dataset(bucket_name, path_in_bucket, dataset_name,
                        server=None, anon=True, n_threads=1):
    tmp_file = "./tmp_file.n5"
    os.makedirs(tmp_file, exist_ok=True)
    fs = _get_fs(server, anon)

    # make a dummy local file by copying the relevant attributes.json
    store = s3fs.S3Map(root=path_in_bucket, s3=fs)
    attrs = store["attributes.json"].decode("utf-8")
    attrs = json.loads(attrs)
    attrs_file = os.path.join(tmp_file, "attributes.json")
    with open(attrs_file, "w") as f:
        json.dump(attrs, f)

    # make a dummy dataset by copying the dataset attributes.json
    store = s3fs.S3Map(root=os.path.join(path_in_bucket, dataset_name), s3=fs)
    try:
        attrs = store["attributes.json"].decode("utf-8")
    except KeyError:
        try:
            rmtree(tmp_file)
        except OSError:
            pass
        raise ValueError(
            f"No file {path_in_bucket}:{dataset_name} in {bucket_name}")
    attrs = json.loads(attrs)

    tmp_ds = os.path.join(tmp_file, dataset_name)
    os.makedirs(tmp_ds, exist_ok=True)
    attrs_file = os.path.join(tmp_ds, "attributes.json")
    with open(attrs_file, "w") as f:
        json.dump(attrs, f)

    print("validating chunks for s3 dataset stored at")
    if server is None:
        print(f"{bucket_name}:{path_in_bucket}:{dataset_name}")
    else:
        print(f"{server}:{bucket_name}:{path_in_bucket}:{dataset_name}")

    dataset = zarr.open(tmp_file)[dataset_name]
    corrupted_chunks = validate_chunks_s3(store, dataset,
                                          keys=iter(store),
                                          n_threads=n_threads)
    try:
        rmtree(tmp_file)
    except OSError:
        warnings.warn(
            f"Could not clean up temporary data stored in {tmp_file}")
    return corrupted_chunks
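# Usage sketch for validate_s3_dataset (hypothetical values): the endpoint,
# bucket, and dataset names below are placeholders, not values taken from the
# snippet above.
corrupted = validate_s3_dataset(bucket_name="example-bucket",
                                path_in_bucket="example-bucket/volume.n5",
                                dataset_name="setup0/timepoint0/s0",
                                server="https://s3.example.com",
                                anon=True, n_threads=4)
print(f"Found {len(corrupted)} corrupted chunks")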
# presumably decorated with @classmethod in the original test class,
# given the `cls` parameter
@classmethod
def _write_test_data(cls, s3: s3fs.S3FileSystem):
    if not s3.isdir(cls.BUCKET_NAME):
        s3.mkdir(cls.BUCKET_NAME)
    data = helpers.make_test_store()
    s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_1.zarr', s3=s3,
                       create=True)
    s3map.update(data)
    s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_2.zarr', s3=s3,
                       create=True)
    s3map.update(data)
def __init__(self, filename, client_kwargs=None, cache_size=512 * (1024 ** 2)):
    """
    An object for accessing rss data from s3 blob storage.

    Parameters
    ----------
    filename : str
        Path to the rss data object on s3.
    client_kwargs : dict or None
        Dict containing aws_access_key_id and aws_secret_access_key,
        or None. If None, anonymous access is assumed.
    cache_size : int
        Max size of the LRU cache in bytes.
    """
    print("Establishing connection, may take a minute ...")
    anon = client_kwargs is None
    s3 = s3fs.S3FileSystem(anon=anon, client_kwargs=client_kwargs)
    clear_output()  # requires `from IPython.display import clear_output`
    print("Connected to S3.")
    store = s3fs.S3Map(root=filename, s3=s3, check=False)
    super().__init__(store, cache_size=cache_size)
def get_path_or_obs_store(path_or_url: str,
                          client_kwargs: Mapping[str, Any] = None,
                          mode: str = 'r') -> Tuple[Union[str, Dict], bool]:
    """
    If *path_or_url* is an object storage URL, return an object storage
    Zarr store (mapping object) using *client_kwargs* and *mode*, plus a
    flag indicating whether the Zarr dataset is consolidated.

    Otherwise *path_or_url* is interpreted as a local file system path and
    returned as-is, plus a flag indicating whether the Zarr dataset is
    consolidated.

    :param path_or_url: A path or a URL.
    :param client_kwargs: Object storage client keyword arguments.
    :param mode: "r" or "w"
    :return: A tuple (path_or_obs_store, consolidated).
    """
    if is_obs_url(path_or_url):
        root, obs_fs_kwargs, obs_fs_client_kwargs = parse_obs_url_and_kwargs(
            path_or_url, client_kwargs)
        s3 = s3fs.S3FileSystem(**obs_fs_kwargs,
                               client_kwargs=obs_fs_client_kwargs)
        consolidated = mode == "r" and s3.exists(f'{root}/.zmetadata')
        return (s3fs.S3Map(root=root, s3=s3, check=False,
                           create=mode == "w"),
                consolidated)
    else:
        consolidated = os.path.exists(os.path.join(path_or_url, '.zmetadata'))
        return path_or_url, consolidated
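# Usage sketch for get_path_or_obs_store (hypothetical URL): a local path is
# returned unchanged, while an object storage URL yields an S3Map store that
# xr.open_zarr() accepts directly.
store_or_path, consolidated = get_path_or_obs_store(
    "https://s3.example.com/my-bucket/cube.zarr", mode="r")
ds = xr.open_zarr(store_or_path, consolidated=consolidated)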
def fix_corrupted_chunks_s3(corrupted_chunks, local_dataset_path,
                            local_dataset_key, bucket_name, path_in_bucket,
                            dataset_name, server=None, anon=False):
    try:
        local_ds = zarr.open(local_dataset_path, "r")[local_dataset_key]
    except KeyError:
        raise ValueError(
            f"No file {path_in_bucket}:{dataset_name} in {bucket_name}")

    fs = _get_fs(server, anon)
    store = s3fs.S3Map(root=os.path.join(path_in_bucket, dataset_name), s3=fs)

    local_corrupted_chunks = []
    for chunk_id in corrupted_chunks:
        local_chunk_path = os.path.join(local_dataset_path,
                                        local_dataset_key, chunk_id)
        with open(local_chunk_path, "rb") as f:
            cdata = f.read()
        # decode only to validate the local chunk; upload the raw encoded
        # bytes, and skip chunks that are corrupted locally as well
        try:
            local_ds._decode_chunk(cdata)
        except Exception:
            local_corrupted_chunks.append(chunk_id)
            continue
        store[chunk_id] = cdata
    return local_corrupted_chunks
def _save_zarr(dataset: xr.Dataset, url: str, profile_name: str,
               suffix: str) -> None:
    if isinstance(url, Path):
        url = str(url)
    url = url.rstrip("/")
    if url.startswith("s3://"):
        url = url.replace(" ", '-')
        url = url.replace("s3://", "")
        if url.count("/") > 1:
            raise ValueError(
                "I haven't figured out how to write groups yet, "
                "this will fail.")
        s3 = s3fs.S3FileSystem(profile_name=profile_name)
        root = f"{url}.{suffix}.zarr"
        store = s3fs.S3Map(root=root, s3=s3, check=False)
        dataset.to_zarr(store=store)
    else:
        # assume local
        dataset.to_zarr(f"{url}.{suffix}.zarr")
def __call__(self, store_id: str) -> collections.abc.MutableMapping:
    """Open a store for store identifier *store_id*."""
    # collections.MutableMapping was removed in Python 3.10; use collections.abc
    import s3fs
    return s3fs.S3Map(root=self._root_pattern.format(store_id=store_id),
                      s3=self._s3, check=False, create=True)
def _get_dataset_lazily(self, index: int, **zarr_kwargs) -> xr.Dataset:
    """
    Read the dataset for the level at given *index*.

    :param index: the level index
    :param zarr_kwargs: kwargs passed to xr.open_zarr()
    :return: the dataset for the level at *index*.
    """
    ext, level_path = self._level_paths[index]
    if ext == ".link":
        # read the link target (mode must be "r", not "w")
        with self._obs_file_system.open(level_path, "r") as fp:
            level_path = fp.read()
        # if level_path is a relative path, resolve it against the levels directory
        if not os.path.isabs(level_path):
            base_dir = os.path.dirname(self._dir_path)
            level_path = os.path.join(base_dir, level_path)
    store = s3fs.S3Map(root=level_path, s3=self._obs_file_system, check=False)
    cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
    with measure_time(
            tag=f"opened remote dataset {level_path} for level {index}"):
        return assert_cube(xr.open_zarr(cached_store, **zarr_kwargs),
                           name=level_path)
def read(self, path: str, **kwargs) -> xr.Dataset:
    path_or_store = path
    if isinstance(path, str):
        endpoint_url = None
        root = None
        if 'endpoint_url' in kwargs:
            endpoint_url = kwargs.pop('endpoint_url')
            root = path
        if path.startswith("http://") or path.startswith("https://"):
            import urllib3.util
            url = urllib3.util.parse_url(path_or_store)
            if url.port is not None:
                endpoint_url = f'{url.scheme}://{url.host}:{url.port}'
            else:
                endpoint_url = f'{url.scheme}://{url.host}'
            root = url.path
            if root.startswith('/'):
                root = root[1:]
        if endpoint_url and root is not None:
            s3 = s3fs.S3FileSystem(
                anon=True, client_kwargs=dict(endpoint_url=endpoint_url))
            path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
            if 'max_cache_size' in kwargs:
                max_cache_size = kwargs.pop('max_cache_size')
                if max_cache_size > 0:
                    path_or_store = zarr.LRUStoreCache(
                        path_or_store, max_size=max_cache_size)
    return xr.open_zarr(path_or_store, **kwargs)
def get_path_or_s3_store(path_or_url: str,
                         s3_kwargs: Mapping[str, Any] = None,
                         s3_client_kwargs: Mapping[str, Any] = None,
                         mode: str = 'r') -> Tuple[Union[str, Dict], bool]:
    """
    If *path_or_url* is an object storage URL, return an object storage
    Zarr store (mapping object) using *s3_client_kwargs* and *mode*, plus a
    flag indicating whether the Zarr dataset is consolidated.

    Otherwise *path_or_url* is interpreted as a local file system path and
    returned as-is, plus a flag indicating whether the Zarr dataset is
    consolidated.

    :param path_or_url: A path or a URL.
    :param s3_kwargs: Keyword arguments for the S3 file system.
    :param s3_client_kwargs: Keyword arguments for the S3 boto3 client.
    :param mode: Access mode "r" or "w". "r" is the default.
    :return: A tuple (path_or_obs_store, consolidated).
    """
    if is_s3_url(path_or_url) \
            or s3_kwargs is not None \
            or s3_client_kwargs is not None:
        s3, root = parse_s3_fs_and_root(path_or_url,
                                        s3_kwargs=s3_kwargs,
                                        s3_client_kwargs=s3_client_kwargs,
                                        mode=mode)
        consolidated = mode == "r" and s3.exists(f'{root}/.zmetadata')
        return (s3fs.S3Map(root=root, s3=s3, check=False,
                           create=mode == "w"),
                consolidated)
    else:
        consolidated = os.path.exists(os.path.join(path_or_url, '.zmetadata'))
        return path_or_url, consolidated
def _get_dataset_lazily(self, index: int,
                        parameters: Dict[str, Any]) -> xr.Dataset:
    """
    Read the dataset for the level at given *index*.

    :param index: the level index
    :param parameters: keyword arguments passed to xr.open_zarr()
    :return: the dataset for the level at *index*.
    """
    ext, level_path = self._level_paths[index]
    if ext == ".link":
        # read the link target (mode must be "r", not "w")
        with self._s3_file_system.open(level_path, "r") as fp:
            level_path = fp.read()
        # if level_path is a relative path, resolve it against the levels directory
        if not os.path.isabs(level_path):
            base_dir = os.path.dirname(self._dir_path)
            level_path = os.path.join(base_dir, level_path)
    store = s3fs.S3Map(root=level_path, s3=self._s3_file_system, check=False)
    max_size = self.get_chunk_cache_capacity(index)
    if max_size:
        store = zarr.LRUStoreCache(store, max_size=max_size)
    with measure_time(
            tag=f"opened remote dataset {level_path} for level {index}"):
        consolidated = self._s3_file_system.exists(f'{level_path}/.zmetadata')
        return assert_cube(xr.open_zarr(store,
                                        consolidated=consolidated,
                                        **parameters),
                           name=level_path)
def read(self, path: str, **kwargs) -> xr.Dataset:
    path_or_store = path
    consolidated = False
    mode = 'read'
    root = None
    if isinstance(path, str):
        client_kwargs = {}
        if 'client_kwargs' in kwargs:
            client_kwargs = kwargs.pop('client_kwargs')
        if 'endpoint_url' in kwargs:
            client_kwargs['endpoint_url'] = kwargs.pop('endpoint_url')
            root = path
        if 'region_name' in kwargs:
            client_kwargs['region_name'] = kwargs.pop('region_name')
        path_or_store, root, client_kwargs = _get_path_or_store(
            path_or_store, client_kwargs, mode, root)
        if 'endpoint_url' in client_kwargs and root is not None:
            s3 = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)
            consolidated = s3.exists(f'{root}/.zmetadata')
            path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
            if 'max_cache_size' in kwargs:
                max_cache_size = kwargs.pop('max_cache_size')
                if max_cache_size > 0:
                    path_or_store = zarr.LRUStoreCache(
                        path_or_store, max_size=max_cache_size)
        else:
            consolidated = os.path.exists(
                os.path.join(path_or_store, '.zmetadata'))
    return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
def open_ml_dataset_from_object_storage(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'Path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True,
                                        client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            ds = xr.open_zarr(cached_store)
        return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                ds_id, obs_file_system, path,
                exception_type=ServiceConfigError)
def _zarr_get(self, uuid, x, y, z, t, c, level):
    s3 = s3fs.S3FileSystem(client_kwargs=dict(region_name=self.region))
    s3_store = s3fs.S3Map(root=f"{self.bucket}/{uuid}", s3=s3, check=False)
    group = zarr.hierarchy.open_group(store=s3_store)
    level = group.get(str(level))
    return level[
        t, c, z,
        y * self.tile_size:(y + 1) * self.tile_size,
        x * self.tile_size:(x + 1) * self.tile_size,
    ]
def nc2zarr(fns, zpath, s3store=True, chunks=None, parallel=True):
    '''
    Convert netcdf files to zarr format and save to a local or s3 store.

    Parameters
    ----------
    fns : list
        A list of netcdf file names with full path.
    zpath : str
        Path to the local or s3 store.
    s3store : bool
        Whether to save to an s3 store.
    chunks : dict or None
        Chunks used to read and write data.
    parallel : bool
        Whether to use dask to read files in parallel.
    '''
    # remove lat/long from the list of vars to be concatenated
    with xr.open_mfdataset(fns, parallel=parallel, chunks=chunks,
                           combine='nested', concat_dim='time') as ds:
        vns = list(ds.data_vars)
        for vn in ['lat', 'long']:
            if vn in vns:
                vns.remove(vn)

    with xr.open_mfdataset(fns, chunks=chunks, parallel=parallel,
                           data_vars=vns, combine='nested',
                           concat_dim='time') as ds:
        if s3store:
            fs = s3fs.S3FileSystem(anon=False)
            ds_store = s3fs.S3Map(root=zpath, s3=fs, check=False, create=True)
        else:
            ds_store = zpath
        if chunks is not None:
            ds = ds.chunk(chunks=chunks)
        else:
            ds = ds.chunk(chunks={x: ds.chunks[x][0] for x in ds.chunks})
        compressor = zarr.Blosc(cname='zstd', clevel=4)
        encoding = {vname: {'compressor': compressor}
                    for vname in ds.data_vars}
        ds.to_zarr(store=ds_store, encoding=encoding, consolidated=True)
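# Usage sketch for nc2zarr (hypothetical paths): converts a directory of
# netcdf files into a single consolidated zarr store on s3; writing assumes
# s3 credentials are configured locally.
import glob

nc_files = sorted(glob.glob("/data/nldas/*.nc"))
nc2zarr(nc_files, "my-bucket/nldas.zarr", s3store=True,
        chunks={"time": 240}, parallel=True)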
def _get_path_or_store(path: str, client_kwargs: Dict[str, Any],
                       mode: str, root: str):
    path_or_store = path
    anon_mode = True
    if not client_kwargs:
        client_kwargs = {}
    if ('provider_access_key_id' in client_kwargs
            and 'provider_secret_access_key' in client_kwargs):
        anon_mode = False
        client_kwargs['aws_access_key_id'] = client_kwargs.pop(
            'provider_access_key_id')
        client_kwargs['aws_secret_access_key'] = client_kwargs.pop(
            'provider_secret_access_key')
    if path.startswith("https://") or path.startswith("http://"):
        import urllib3.util
        url = urllib3.util.parse_url(path_or_store)
        if url.port is not None:
            client_kwargs['endpoint_url'] = \
                f'{url.scheme}://{url.host}:{url.port}'
        else:
            client_kwargs['endpoint_url'] = f'{url.scheme}://{url.host}'
        root = url.path
        if root.startswith('/'):
            root = root[1:]
    if "write" in mode:
        root = f's3://{root}'
        s3 = s3fs.S3FileSystem(anon=anon_mode, client_kwargs=client_kwargs)
        path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
    return path_or_store, root, client_kwargs
def read(self, path: str, **kwargs) -> xr.Dataset:
    path_or_store = path
    consolidated = False
    if isinstance(path, str):
        region_name = None
        if 'endpoint_url' in kwargs:
            endpoint_url = kwargs.pop('endpoint_url')
            root = path
        else:
            endpoint_url, root = split_bucket_url(path)
        if 'region_name' in kwargs:
            region_name = kwargs.pop('region_name')
        if endpoint_url and root:
            s3 = s3fs.S3FileSystem(anon=True,
                                   client_kwargs=dict(
                                       endpoint_url=endpoint_url,
                                       region_name=region_name))
            consolidated = s3.exists(f'{root}/.zmetadata')
            path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
            if 'max_cache_size' in kwargs:
                max_cache_size = kwargs.pop('max_cache_size')
                if max_cache_size > 0:
                    path_or_store = zarr.LRUStoreCache(
                        path_or_store, max_size=max_cache_size)
        else:
            consolidated = os.path.exists(
                os.path.join(path_or_store, '.zmetadata'))
    return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
def load_json_from_s3(address):
    # split a full https address into endpoint, key prefix, and file name
    server = "/".join(address.split("/")[:3])
    root_path = "/".join(address.split("/")[3:-1])
    fname = address.split("/")[-1]
    fs = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": server})
    store = s3fs.S3Map(root=root_path, s3=fs)
    attrs = store[fname]
    attrs = json.loads(attrs.decode("utf-8"))
    return attrs
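# Usage sketch for load_json_from_s3 (hypothetical address): fetches a single
# attributes.json from an s3-compatible endpoint without downloading the
# whole container.
attrs = load_json_from_s3(
    "https://s3.example.com/example-bucket/volume.n5/attributes.json")
print(attrs)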
def make_example_nc(nldas_path, out_file):
    if os.path.split(nldas_path)[1] == '.zattrs':
        nldas_path = os.path.split(nldas_path)[0]
    fs = s3fs.S3FileSystem(profile='ds-drb-creds', anon=False)
    nldas_store = s3fs.S3Map(nldas_path, s3=fs)
    ds = xr.open_zarr(nldas_store)
    ds = ds.isel(time=0)
    ds[['pressfc']].to_netcdf(out_file)
def read_zarr(path, project, **kwargs):
    dataset_id = to_dataset_id(path, project)
    zarr_path = get_zarr_path(dataset_id, project, join=True)
    endpoint_url = CONFIG["store"]["endpoint_url"]
    jasmin_s3 = s3fs.S3FileSystem(
        anon=True, client_kwargs={"endpoint_url": endpoint_url})
    s3_store = s3fs.S3Map(root=zarr_path, s3=jasmin_s3)
    ds = xr.open_zarr(store=s3_store, consolidated=True, **kwargs)
    return ds
def write_zarr_to_s3(dataset: xr.Dataset,
                     full_zarr_filename: str,
                     s3: s3fs.S3FileSystem,
                     **write_zarr_kwargs) -> xr.backends.ZarrStore:
    prep_and_check_s3(full_zarr_filename, s3)
    store = s3fs.S3Map(root=full_zarr_filename, s3=s3,
                       check=False, create=True)
    return write_zarr(dataset, store, **write_zarr_kwargs)
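# Usage sketch for write_zarr_to_s3 (hypothetical names): assumes
# prep_and_check_s3 and write_zarr are defined in the surrounding code base
# and that s3 credentials are available to the S3FileSystem.
ds_out = xr.Dataset({"t2m": ("time", np.arange(3.0))})
s3 = s3fs.S3FileSystem()
write_zarr_to_s3(ds_out, "my-bucket/forecast.zarr", s3)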
def merge_zarrs(zarr_paths, output_path):
    s3 = s3fs.S3FileSystem(anon=True,
                           client_kwargs=dict(region_name='us-east-1'))
    arrays_to_merge = []
    for zarr_path in zarr_paths:
        # strip the "s3://" scheme; S3Map expects "bucket/key" roots here
        store = s3fs.S3Map(root=zarr_path[len("s3://"):], s3=s3, check=False)
        arr = zarr.Array(store)
        arrays_to_merge.append(arr)
    merged_array = np.concatenate(arrays_to_merge, axis=1)
    zarr.save(output_path, merged_array)
def open_zarr(zarr_url):
    url_comps = urlparse(zarr_url)
    endpoint = f"{url_comps.scheme}://{url_comps.netloc}"
    zarr_path = url_comps.path
    jasmin_s3 = s3fs.S3FileSystem(anon=True,
                                  client_kwargs={"endpoint_url": endpoint})
    s3_store = s3fs.S3Map(root=zarr_path, s3=jasmin_s3)
    ds = xr.open_zarr(store=s3_store, consolidated=True)
    return ds
def save_zarr(chunk_df, out_zarr):
    col_name = 'nldas_grid_no'
    idx_name = 'nhd_comid'
    chunks = {col_name: 10000, idx_name: 30000}
    ds = convert_df_to_dataset(chunk_df, col_name, idx_name, 'weight', chunks)
    print(ds)
    print(out_zarr)
    fs = s3fs.S3FileSystem(profile='ds-drb-creds', anon=False)
    out_store = s3fs.S3Map(out_zarr, s3=fs)
    ds.to_zarr(out_store, mode='w')
def test_remote(self):
    import s3fs
    endpoint_url = "https://s3.eu-central-1.amazonaws.com"
    s3 = s3fs.S3FileSystem(anon=True,
                           client_kwargs=dict(endpoint_url=endpoint_url))
    s3_store = s3fs.S3Map(root="xcube-examples/OLCI-SNS-RAW-CUBE-2.zarr",
                          s3=s3, check=False)
    diagnostic_store = DiagnosticStore(
        s3_store, logging_observer(log_path='remote-cube.log'))
    xr.open_zarr(diagnostic_store)
def open_zarr(s3_path, anon=False, cache=False):
    """Open a zarr archive and return its root group."""
    s3 = s3fs.S3FileSystem(anon=anon)
    store = s3fs.S3Map(s3_path, s3=s3, check=False, create=False)
    if cache:
        lru_cache = zarr.LRUStoreCache(store=store, max_size=1 << 29)
        root = zarr.group(store=lru_cache)
    else:
        root = zarr.group(store=store)
    return root
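# Usage sketch for the open_zarr variant above (hypothetical path): opening
# with cache=True wraps the store in an LRU cache, which pays off when the
# same chunks are read repeatedly.
root = open_zarr("example-bucket/volume.zarr", anon=True, cache=True)
print(list(root.keys()))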
def load_data(self):
    units = ""
    level = self.variable.split('-')[1]
    self.variable = self.variable.split('-')[0]
    fs = s3fs.S3FileSystem(anon=True)
    run_date_str = self.run_date.strftime("%Y%m%d")
    run_hour = self.run_date.strftime("%H")
    path = join(self.path, run_date_str,
                f'{run_date_str}_{run_hour}z_fcst.zarr',
                level, self.variable, level)
    f = s3fs.S3Map(root=path, s3=fs, check=False)
    ds = xr.open_mfdataset([f], engine='zarr', parallel=True).load()
    if self.run_date in self.valid_dates:
        arr = ds[self.variable].values[
            self.forecast_hours[0]:self.forecast_hours[-1] + 1
        ].astype('float32')
        forecast_hour_00_path = join(
            self.path, run_date_str,
            f'{run_date_str}_{run_hour}z_anl.zarr',
            level, self.variable.replace('1hr_', ''), level)
        fh_0_file = s3fs.S3Map(root=forecast_hour_00_path, s3=fs, check=False)
        fh_0_ds = xr.open_mfdataset([fh_0_file], engine='zarr',
                                    parallel=True).expand_dims('time')
        fh_0_arr = fh_0_ds[self.variable.replace('1hr_', '')].values
        array = np.concatenate([fh_0_arr, arr])[
            self.forecast_hours[0]:self.forecast_hours[-1] + 1, :, :]
    else:
        array = ds[self.variable].values[
            self.forecast_hours[0] - 1:self.forecast_hours[-1]
        ].astype('float32')
    if hasattr(ds[self.variable], 'units'):
        units = ds[self.variable].attrs['units']
    return array, units
def retrieve_era5(varname, year, month, datestart, dateend):
    """
    Retrieve ERA5 Reanalysis from https://registry.opendata.aws/ecmwf-era5/.
    Open the zarr file and get data in the time range (datestart, dateend).

    Args:
        varname (str): variable name, one of:
            air_pressure_at_mean_sea_level
            air_temperature_at_2_metres
            air_temperature_at_2_metres_1hour_Maximum
            air_temperature_at_2_metres_1hour_Minimum
            dew_point_temperature_at_2_metres
            eastward_wind_at_100_metres
            eastward_wind_at_10_metres
            integral_wrt_time_of_surface_direct_downwelling_shortwave_flux_in_air_1hour_Accumulation
            lwe_thickness_of_surface_snow_amount
            northward_wind_at_100_metres
            northward_wind_at_10_metres
            precipitation_amount_1hour_Accumulation
            sea_surface_temperature
            snow_density
            surface_air_pressure
        year (int): year
        month (int): month
        datestart (str): start date, like '1987-12-02'
        dateend (str): end date, like '1987-12-02 23:59'

    Examples:
        year = 1987
        month = 12
        datestart = '1987-12-02'
        dateend = '1987-12-02 23:59'
        varname = 'air_temperature_at_2_metres'
        data = retrieve_era5(varname, year, month, datestart, dateend)
        print(data)
    """
    # Access the S3 file system anonymously.
    fs = s3fs.S3FileSystem(anon=True)

    # construct the data file path
    datestring = 'era5-pds/zarr/{year}/{month:02d}/data/'.format(year=year,
                                                                 month=month)
    datafile = datestring + varname + '.zarr/'

    # open the zarr file and subset the requested time range
    # (the original dropped the result of .sel(); assign it back)
    data = xr.open_zarr(s3fs.S3Map(datafile, s3=fs))
    if varname in ['precipitation_amount_1hour_Accumulation']:
        data = data.sel(time1=slice(np.datetime64(datestart),
                                    np.datetime64(dateend)))
    else:
        data = data.sel(time0=slice(np.datetime64(datestart),
                                    np.datetime64(dateend)))
    return data
def __init__(self, url, root, **kwargs):
    super().__init__(**kwargs)
    fs = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": url})
    store = s3fs.S3Map(root=root, s3=fs, check=False)
    self._ds = xr.open_zarr(store)
def read_zarr(path, **kwargs):
    dataset_id = to_dataset_id(path)
    zarr_path = "/".join(split_string_at(dataset_id, ".", 4)) + ".zarr"
    endpoint_url = CONFIG["store"]["endpoint_url"]
    jasmin_s3 = s3fs.S3FileSystem(
        anon=True, client_kwargs={"endpoint_url": endpoint_url})
    s3_store = s3fs.S3Map(root=zarr_path, s3=jasmin_s3)
    ds = xr.open_zarr(store=s3_store, consolidated=True, **kwargs)
    return ds