async def putStorBytes(app, key, data, filter_ops=None, bucket=None):
    """ Store byte string as S3 object with given key
    """
    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash

    shuffle = -1  # auto-shuffle
    clevel = 5
    cname = None  # compressor name
    if filter_ops:
        if "compressor" in filter_ops:
            cname = filter_ops["compressor"]
        if "use_shuffle" in filter_ops and not filter_ops['use_shuffle']:
            shuffle = 0  # client indicates to turn off shuffling
        if "level" in filter_ops:
            clevel = filter_ops["level"]
    log.info(f"putStorBytes({bucket}/{key}), {len(data)} bytes shuffle: {shuffle} compressor: {cname} level: {clevel}")

    if cname:
        try:
            blosc = codecs.Blosc(cname=cname, clevel=clevel, shuffle=shuffle)
            cdata = blosc.encode(data)  # TBD: add cname in blosc constructor
            log.info(f"compressed from {len(data)} bytes to {len(cdata)} bytes using filter: {blosc.cname} with level: {blosc.clevel}")
            data = cdata
        except Exception as e:
            log.error(f"got exception using blosc encoding: {e}")
            raise HTTPInternalServerError()

    rsp = await client.put_object(key, data, bucket=bucket)

    return rsp
def process_one_tile(lat, lon):
    """
    Given lat and lon to select a region, calculate the corresponding emissions
    for each year from 2001 to 2018

    Parameters
    ----------
    lat : float
        Latitude in degrees
    lon : float
        Longitude in degrees

    Returns
    -------
    url : string
        Url where a processed tile is located
    """
    url = f"gs://carbonplan-climatetrace/v0/tiles/{lat}_{lon}.zarr"
    encoding = {"emissions": {"compressor": numcodecs.Blosc()}}
    mapper = fsspec.get_mapper(url)

    with dask.config.set(scheduler="threads"):
        ds = open_hansen_2018_tile(lat, lon)
        ds = calc_one_tile(ds)[["emissions"]]
        ds = ds.chunk({"lat": 4000, "lon": 4000, "year": 2})
        ds.to_zarr(mapper, encoding=encoding, mode="w", consolidated=True)

    return url
def test_read_zarr(self):
    from z5py.dataset import Dataset
    dtypes = list(Dataset._dtype_dict.keys())
    zarr_compressors = {'blosc': numcodecs.Blosc(),
                        'zlib': numcodecs.Zlib(),
                        'raw': None,
                        'bzip2': numcodecs.BZ2()}
    # the conda-forge version of numcodecs is not up-to-date for python 3.5
    # and GZip is missing; that's why we need to check explicitly here
    # to not fail the test
    if hasattr(numcodecs, 'GZip'):
        zarr_compressors.update({'gzip': numcodecs.GZip()})

    zarr.open(self.path)
    for dtype in dtypes:
        for compression in zarr_compressors:
            data = np.random.randint(0, 127, size=self.shape).astype(dtype)
            # write the data with zarr
            key = 'test_%s_%s' % (dtype, compression)
            ar = zarr.open(os.path.join(self.path, key), mode='w',
                           shape=self.shape, chunks=self.chunks,
                           dtype=dtype, compressor=zarr_compressors[compression])
            ar[:] = data
            # read with z5py
            out = z5py.File(self.path)[key][:]
            self.assertEqual(data.shape, out.shape)
            self.assertTrue(np.allclose(data, out))
def test_read_zarr(self):
    import numcodecs
    from z5py.dataset import Dataset
    dtypes = list(Dataset._zarr_dtype_dict.values())
    compressions = Dataset.compressors_zarr
    zarr_compressors = {
        'blosc': numcodecs.Blosc(),
        'zlib': numcodecs.Zlib(),
        'raw': None,
        'bzip2': numcodecs.BZ2()
    }

    for dtype in dtypes:
        for compression in compressions:
            data = np.random.randint(0, 127, size=self.shape).astype(dtype)
            # write the data with zarr
            key = 'test_%s_%s' % (dtype, compression)
            ar = zarr.open(os.path.join(self.path, key), mode='w',
                           shape=self.shape, chunks=self.chunks,
                           dtype=dtype, compressor=zarr_compressors[compression])
            ar[:] = data
            # read with z5py
            out = z5py.File(self.path)[key][:]
            self.assertEqual(data.shape, out.shape)
            self.assertTrue(np.allclose(data, out))
def save_da_to_zarr(da, zarr_bucket, dim_order=['time', 'x', 'y', 'variable'], zarr_mode='a'):
    da = da.transpose(*dim_order)
    da['time'] = get_time_as_unix(da)
    _, y_size, x_size, _ = da.shape
    out_store = gcsfs.GCSMap(root=zarr_bucket, gcs=gcsfs.GCSFileSystem())

    chunks = (36, y_size, x_size, 1)
    ds = xr.Dataset({'stacked_eumetsat_data': da.chunk(chunks)})

    zarr_mode_to_extra_kwargs = {
        'a': {
            'append_dim': 'time'
        },
        'w': {
            'encoding': {
                'stacked_eumetsat_data': {
                    'compressor': numcodecs.Blosc(cname='zstd', clevel=5),
                    'chunks': chunks
                }
            }
        }
    }

    assert zarr_mode in ['a', 'w'], '`zarr_mode` must be one of: `a`, `w`'
    extra_kwargs = zarr_mode_to_extra_kwargs[zarr_mode]

    ds.to_zarr(out_store, mode=zarr_mode, consolidated=True, **extra_kwargs)
    print('Saved file to zarr bucket')
    return ds
def __init__(self, fs: fsspec.AbstractFileSystem, root: str,
             compressor: Optional[numcodecs.Blosc] = None):
    self.fs = fs
    self.compressor = compressor or numcodecs.Blosc()
    self.root = root
    self._transactions = set()
    self._deleted = set()
    self.fs.mkdirs(root, exist_ok=True)
def create_coarsened_global_raster():
    with fsspec.open(HANSEN_FILE_LIST) as f:
        lines = f.read().decode().splitlines()
    print("We are working with {} different files".format(len(lines)))

    # the arrays where you'll throw your active lat/lon permutations
    lats = []
    lons = []
    encoding = {"emissions": {"compressor": numcodecs.Blosc()}}

    for line in lines:
        pieces = line.split("_")
        lat = pieces[-2]
        lon = pieces[-1].split(".")[0]
        if (lat in LATS_TO_RUN) and (lon in LONS_TO_RUN):
            lats.append(lat)
            lons.append(lon)

    all_to_do = len(lats)
    done = 0
    list_all_coarsened = []
    for lat, lon in list(zip(lats, lons)):
        try:
            # We only have data over land, so this will throw an exception if the
            # tile errors (likely for lack of data - could be improved to check
            # that it fails precisely because it is an ocean tile - aka we check
            # that all of the land cells run appropriately)
            mapper = fsspec.get_mapper(OUT_TILE_TEMPLATE.format(lat, lon))
            da_global = xr.open_zarr(mapper, consolidated=True)
            # We only want to create the mask from a single year
            da_mask = da_global.isel(year=0, drop=True)
            da_area = compute_grid_area(da_mask)
            list_all_coarsened.append(
                (da_global * da_area)
                .coarsen(lat=COARSENING_FACTOR, lon=COARSENING_FACTOR)
                .sum()
                .compute(retries=4))
        except ValueError:
            print("{} {} did not work (likely because it is ocean) booooo".format(lat, lon))
        done += 1
        print("completed {} of {} tiles".format(done, all_to_do))

    coarsened_url = OUT_RASTER_FILE
    mapper = fsspec.get_mapper(coarsened_url)
    combined_ds = xr.combine_by_coords(list_all_coarsened, compat="override", coords="minimal")
    combined_ds = combined_ds.chunk({"lat": -1, "lon": -1, "year": 1})
    task = combined_ds.to_zarr(mapper, encoding=encoding, mode="w", compute=False)
    dask.compute(task, retries=4)
def create(self, mode="w", compressor=numcodecs.Blosc("zstd", 5)):
    """
    Create or open a dataset for appending

    :param mode: "w" to create a new zarr group, overwriting any existing one;
                 any other value opens the existing group for append
    :param compressor: the numcodecs compressor to use when writing
    """
    store = zarr.NestedDirectoryStore(self.dest)
    self.zgroup = zarr.group(store, overwrite=(mode == "w"))
    self.compressor = compressor
def create(cls, path: Path, array_info: ArrayInfo) -> "ZarrArray":
    assert array_info.data_format == cls.data_format
    assert array_info.chunks_per_shard == Vec3Int.full(1), \
        "Zarr storage doesn't support sharding yet"
    zarr.create(
        shape=(array_info.num_channels, 1, 1, 1),
        chunks=(array_info.num_channels,) + array_info.chunk_size.to_tuple(),
        dtype=array_info.voxel_type,
        compressor=(numcodecs.Blosc(cname="zstd", clevel=3, shuffle=numcodecs.Blosc.SHUFFLE)
                    if array_info.compression_mode else None),
        store=_fsstore_from_path(path),
        order="F",
    )
    return ZarrArray(path)
def __init__(self, filename, overwrite=False, separate=False, out_block_type='zarr',
             keep_blocks=False, gdal_cache=512, **kwargs):
    if out_block_type == 'zarr':
        if not ZARR_INSTALLED:
            logger.exception('Zarr and numcodecs must be installed.')

    self.filename = filename
    self.overwrite = overwrite
    self.separate = separate
    self.out_block_type = out_block_type
    self.keep_blocks = keep_blocks
    self.gdal_cache = gdal_cache
    self.kwargs = kwargs

    self.d_name, f_name = os.path.split(self.filename)
    self.f_base, self.f_ext = os.path.splitext(f_name)

    self.root = None
    self.compressor = None
    self.sub_dir = None
    self.zarr_file = None

    if self.separate:
        if self.out_block_type.lower() not in ['gtiff', 'zarr']:
            logger.warning('  The output block type is not recognized. Saving blocks as zarr files.')
            self.out_block_type = 'zarr'

        self.sub_dir = os.path.join(self.d_name, 'sub_tmp_')
        self.zarr_file = os.path.join(self.sub_dir, 'data.zarr')

        self.compressor = numcodecs.Blosc(cname='zstd', clevel=3,
                                          shuffle=numcodecs.Blosc.BITSHUFFLE)

        if os.path.isdir(self.sub_dir):
            shutil.rmtree(self.sub_dir)

        os.makedirs(self.sub_dir)
def output_to_zarr(path, seq_id, sample_id, arrays, cname, clevel, shuffle):
    log('Output zarr to {!r} ...'.format(path))
    store = zarr.ZipStore(path, mode='w')
    root = zarr.group(store=store)
    callset = root.create_group(sample_id)
    seq_group = callset.require_group(seq_id)
    calldata_group = seq_group.require_group('calldata')
    variants_group = seq_group.require_group('variants')
    compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=shuffle)
    for key, value in arrays.items():
        calldata_group.create_dataset(key, data=value, compressor=compressor)
        log('Created output array: ' + repr(key))
    store.close()
def setup_output(output_path, seqid, field, example_arr, samples, cname, clevel,
                 shuffle, chunk_width):
    log('Setting up output at {!r} ...'.format(output_path))
    callset = zarr.open_group(output_path, mode='a')
    seq_group = callset.require_group(seqid)
    field_root, field_id = field.split("/")
    root_group = seq_group.require_group(field_root)
    output_shape = (example_arr.shape[0], len(samples)) + example_arr.shape[2:]
    c1 = 2**26 // np.prod((chunk_width,) + example_arr.shape[2:])
    output_chunks = (c1, chunk_width) + example_arr.chunks[2:]
    compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=shuffle)
    output_arr = root_group.empty_like(field_id, example_arr,
                                       shape=output_shape,
                                       chunks=output_chunks,
                                       overwrite=True,
                                       compressor=compressor)
    log('Created output array: ' + repr(output_arr))
    return output_arr
def vcf_to_zarr(vcf_in, tabix_exec, chrom):
    """Convert on-disk VCF to on-disk Zarr database using the scikit-allel and zarr modules

    Zarr database written to same directory as input VCF

    Args:
        vcf_in (str): Path to input VCF on disk
        tabix_exec (str): Full path to tabix executable
        chrom (str): Chromosome for which Zarr database should be created

    Returns:
        None
    """
    vcf_path = os.path.dirname(vcf_in)

    # allel.vcf_to_zarr returns a directory with the Zarr database
    # Set Zarr database outdir
    zarr_base = os.path.basename(vcf_in).split('.')[0]
    zarr_out = vcf_path + '/' + zarr_base + '.zarr'

    # Rename 'numalt' field. Required by Zarr to distinguish `NUMALT` from `numalt`
    # `numalt` is automatically computed by scikit-allel
    rename_dict = {'variants/numalt': 'variants/numalt_sci'}

    # Use the vcf_to_zarr function from scikit-allel to create the zarr database
    # Currently optimized for biallelic SNP VCFs but easy to extend functionality
    allel.vcf_to_zarr(
        input=vcf_in,
        output=zarr_out,
        overwrite=True,
        group=chrom,
        rename_fields=rename_dict,
        fields='*',
        alt_number=1,
        tabix=tabix_exec,
        region=chrom,
        compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False)
    )
def save_samples(path, mcmc, title=None):
    path = os.fspath(path)
    names = pd.Series(mcmc.column_names).str.replace(r'\.\d+$', '').unique()
    _log.info('saving %d draws of %d variables to %s',
              mcmc.chains * mcmc.draws, len(names), path)
    comp = nc.Blosc('zstd', 9, shuffle=nc.blosc.BITSHUFFLE)
    with zarr.ZipStore(path) as store:
        g = zarr.group(store)
        for name in names:
            if name.startswith('_') or name == 'log_lik':
                continue  # we don't save names prefixed with _
            draws = mcmc.get_drawset([name])
            nrows, ncols = draws.shape
            if ncols > 1:
                _log.info('saving %d draws of %d-dimensional vector %s', nrows, ncols, name)
                arr = draws.to_numpy()
            else:
                _log.info('saving %d draws of scalar %s', nrows, name)
                arr = draws.to_numpy().reshape(nrows)
            g.array(name, arr, compressor=comp)

        if 'log_lik' in names:
            _log.info('computing LPPD')
            ll = mcmc.get_drawset(['log_lik'])
            draws, dims = ll.shape
            ll_exp = logsumexp(ll, axis=0) - np.log(draws)
            ll_var = np.var(ll, axis=0)
            lppd = np.sum(ll_exp)
            pwaic = np.sum(ll_var)
            _log.info('LPPD=%.2f, pWAIC=%.2f, WAIC=%.2f', lppd, pwaic, -2 * (lppd - pwaic))
            g.array('ll_exp', ll_exp, compressor=comp)
            g.array('ll_var', ll_var, compressor=comp)
def vcf2zarr(chrom, zarr_path, vcf_path):
    """Convert vcf to zarr.

    Parameters
    ----------
    chrom : str
        chromosome to convert.
    zarr_path : str
        path to the output zarr store.
    vcf_path : str
        path to the input VCF file.

    Returns
    -------
    None.

    """
    if path.isdir(path.join(zarr_path, chrom)):
        pass
    else:
        allel.vcf_to_zarr(vcf_path, zarr_path, group=chrom, fields='*',
                          alt_number=2, log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))
    return None
def codecs(self, obj):
    codecs = []
    if obj.dtype == np.float64:
        codecs.append(nc.AsType('f4', 'f8'))
    codecs.append(nc.Blosc('zstd', 5))
    return codecs
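
# Illustration (my addition, not from the source): how a codec list like the one
# built above is typically split when creating a zarr array -- the AsType codec is
# applied as a filter (store float64 values as float32) while Blosc/zstd does the
# byte-level compression. The array and chunk sizes here are only for demonstration.
import numcodecs as nc
import numpy as np
import zarr

demo = np.random.rand(1_000).astype('f8')
z = zarr.array(demo,
               chunks=250,
               filters=[nc.AsType(encode_dtype='f4', decode_dtype='f8')],
               compressor=nc.Blosc('zstd', 5))
assert z[:].dtype == np.dtype('f8')  # values are decoded back to float64 on read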
ds["nav_lat"] = template["nav_lat"] ds["nav_lon"] = template["nav_lon"] ds del template ds compressor = numcodecs.Blosc(cname='snappy', clevel=6, shuffle=-1) encoding = {vname: {'compressor': compressor} for vname in ds.variables} outdir = '/store/albert7a/eNATL60/zarr/eNATL60-BLB002-SSH-1h-new' ds = ds.chunk(chunks=dict(time_counter=240, y=240, x=480)) print (str(datetime.datetime.now())) ds.to_zarr(outdir, encoding=encoding, mode="w") print (str(datetime.datetime.now()))
async def getStorBytes(app, key, filter_ops=None, offset=0, length=None, bucket=None):
    """ Get object identified by key and read as bytes
    """
    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash
    log.info(f"getStorBytes({bucket}/{key})")

    shuffle = 0
    compressor = None
    if filter_ops:
        log.debug(f"getStorBytes for {key} with filter_ops: {filter_ops}")
        if "is_shuffle" in filter_ops and filter_ops['is_shuffle']:
            shuffle = filter_ops['item_size']
        if "compressor" in filter_ops:
            # TBD - enable blosc compressors
            compressor = filter_ops["compressor"]

    data = await client.get_object(bucket=bucket, key=key, offset=offset, length=length)
    if data is None or len(data) == 0:
        log.info(f"no data found for {key}")
        return data

    log.info(f"read: {len(data)} bytes for key: {key}")

    if compressor:
        # compressed chunk data...
        # first check if this was compressed with blosc
        # cbuffer_metainfo returns typesize, isshuffle, and memcopied
        blosc_metainfo = codecs.blosc.cbuffer_metainfo(data)
        if blosc_metainfo[0] > 0:
            log.info(f"blosc compressed data for {key}")
            try:
                blosc = codecs.Blosc()
                udata = blosc.decode(data)
                log.info(f"uncompressed to {len(udata)} bytes")
                data = udata
            except Exception as e:
                log.error(f"got exception: {e} using blosc decompression for {key}")
                raise HTTPInternalServerError()
        elif compressor == "zlib":
            # data may have been compressed without blosc, try using zlib directly
            log.info(f"using zlib to decompress {key}")
            try:
                udata = zlib.decompress(data)
                log.info(f"uncompressed to {len(udata)} bytes")
                data = udata
            except zlib.error as zlib_error:
                log.info(f"zlib_err: {zlib_error}")
                log.error(f"unable to uncompress obj: {key}")
                raise HTTPInternalServerError()
        else:
            log.error(f"don't know how to decompress data in {compressor} format for {key}")
            raise HTTPInternalServerError()

    if shuffle > 0:
        log.debug(f"shuffle is {shuffle}")
        unshuffled = _unshuffle(shuffle, data)
        if unshuffled is not None:
            log.debug(f"unshuffled to {len(unshuffled)} bytes")
            data = unshuffled

    return data
from pathlib import Path

import zarr
import numcodecs

if hasattr(numcodecs, 'blosc'):
    numcodecs.blosc.use_threads = False

compressor = numcodecs.Blosc(cname='zstd', clevel=2,
                             shuffle=numcodecs.Blosc.BITSHUFFLE)


def to_zarr(filename, data, window, chunks, root=None):
    """
    Writes data to a zarr file

    Args:
        filename (str): The output file name.
        data (ndarray): The data to write.
        window (namedtuple): A ``rasterio.window.Window`` object.
        chunks (int or tuple): The ``zarr`` chunks.
        root (Optional[object]): The ``zarr`` root.

    Returns:
        ``str``
    """
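    # --- Sketch of a possible body (my assumption; the original body is not shown
    # in this excerpt). It stores each block as its own dataset, keyed by the
    # window's offsets, so the blocks can be mosaicked back together later, and
    # returns the file name as the docstring describes.
    if root is None:
        root = zarr.open_group(filename, mode='a')

    key = f'{window.row_off:09d}_{window.col_off:09d}'
    root.create_dataset(key,
                        data=data,
                        shape=data.shape,
                        chunks=chunks,
                        dtype=data.dtype,
                        compressor=compressor,
                        overwrite=True)

    return filename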
def compress_zarr(ts, root, variants_only=False):
    provenance_dict = provenance.get_provenance_dict({"variants_only": variants_only})

    if variants_only:
        logging.info("Using lossy variants-only compression")
        # Reduce to site topology. Note that we will remove
        # any sites, individuals and populations here that have no references.
        ts = ts.simplify(reduce_to_site_topology=True)

    tables = ts.tables

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # When using a zipfile in Zarr we get some harmless warnings. See
        # https://zarr.readthedocs.io/en/stable/api/storage.html#zarr.storage.ZipStore
        root.attrs["format_name"] = FORMAT_NAME
        root.attrs["format_version"] = FORMAT_VERSION
        root.attrs["sequence_length"] = tables.sequence_length
        root.attrs["provenance"] = provenance_dict

    columns = {}
    for key, value in tables.asdict().items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                columns[f"{key}/{sub_key}"] = sub_value
        else:
            columns[key] = value

    if variants_only:
        time = np.unique(tables.nodes.time)
        columns["node/time"] = np.searchsorted(time, tables.nodes.time)

    # Encoding array is a tuple so must be converted
    columns["encoding_version"] = np.asarray(columns["encoding_version"])

    # Sequence length is stored as an attr for compatibility with older versions of tszip
    del columns["sequence_length"]

    # Schemas, metadata and units need to be converted to arrays
    for name in columns:
        if name.endswith("metadata_schema") or name in [
            "time_units",
            "reference_sequence/data",
            "reference_sequence/url",
        ]:
            columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
        if name.endswith("metadata"):
            columns[name] = np.frombuffer(columns[name], np.int8)

    # Some columns benefit from being quantised
    coordinates = np.unique(
        np.hstack([
            [0, ts.sequence_length],
            tables.edges.left,
            tables.edges.right,
            tables.sites.position,
            tables.migrations.left,
            tables.migrations.right,
        ]))
    columns["coordinates"] = coordinates
    for name in [
        "edges/left",
        "edges/right",
        "migrations/left",
        "migrations/right",
        "sites/position",
    ]:
        columns[name] = np.searchsorted(coordinates, columns[name])

    # Some columns benefit from additional options
    delta_filter_cols = ["edges/parent", "sites/position"]

    # Note: we're not providing any options to set this here because Blosc+Zstd seems to
    # have a clear advantage in compression performance and speed. There is very little
    # difference between compression level 6 and 9, and it's extremely fast in any case,
    # so there's no point in adding complexity. The shuffle filter in particular makes a
    # big difference.
    compressor = numcodecs.Blosc(cname="zstd", clevel=9, shuffle=numcodecs.Blosc.SHUFFLE)
    for name, data in columns.items():
        Column(name, data,
               delta_filter="_offset" in name or name in delta_filter_cols
               ).compress(root, compressor)
def _ensure_datasets_exist(self, volume_config):
    dtype = volume_config["zarr"]["creation-settings"]["dtype"]
    create_if_necessary = volume_config["zarr"]["create-if-necessary"]
    writable = volume_config["zarr"]["writable"]
    if writable is None:
        writable = create_if_necessary

    mode = 'r'
    if writable:
        mode = 'a'
    self._filemode = mode

    block_shape = volume_config["zarr"]["creation-settings"]["chunk-shape"][::-1]

    global_offset = volume_config["zarr"]["global-offset"][::-1]
    bounding_box_zyx = np.array(volume_config["geometry"]["bounding-box"])[:, ::-1]
    creation_shape = np.array(volume_config["zarr"]["creation-settings"]["shape"][::-1])
    replace_default_entries(creation_shape, bounding_box_zyx[1] - global_offset)

    compression = volume_config["zarr"]["creation-settings"]["compression"]
    if compression == 'gzip':
        compressor = numcodecs.GZip()
    elif compression.startswith('blosc-'):
        cname = compression[len('blosc-'):]
        compressor = numcodecs.Blosc(cname)
    else:
        assert compression == "", f"Unimplemented compression: {compression}"

    if create_if_necessary:
        max_scale = volume_config["zarr"]["creation-settings"]["max-scale"]
        if max_scale == -1:
            if -1 in creation_shape:
                raise RuntimeError("Can't auto-determine the appropriate max-scale to create "
                                   "(or extend) the data with, because you didn't specify a "
                                   "volume creation shape (or bounding box)")
            max_scale = choose_pyramid_depth(creation_shape, 512)

        available_scales = [*range(1 + max_scale)]
    else:
        available_scales = volume_config["geometry"]["available-scales"]

        if not os.path.exists(self._path):
            raise RuntimeError(f"File does not exist: {self._path}\n"
                               "You did not specify 'create-if-necessary' in the config, so I won't create it.\n")

        if self._dataset_name and not os.path.exists(f"{self._path}/{self._dataset_name}"):
            raise RuntimeError(f"File does not exist: {self._path}/{self._dataset_name}\n"
                               "You did not specify 'create-if-necessary' in the config, so I won't create it.\n")

    for scale in available_scales:
        if scale == 0:
            name = self._dataset_name
        else:
            name = self._dataset_name[:-1] + f'{scale}'

        if name not in self.zarr_file:
            if not writable:
                raise RuntimeError(f"Dataset for scale {scale} does not exist, and you "
                                   "didn't specify 'writable' in the config, so I won't create it.")

            if dtype == "auto":
                raise RuntimeError(f"Can't create Zarr array {self._path}/{self._dataset_name}: "
                                   "No dtype specified in the config.")

            # Use 128 if the user didn't specify a chunkshape
            replace_default_entries(block_shape, 3 * [128])

            # zarr misbehaves if the chunks are larger than the shape,
            # which could happen here if we aren't careful (for higher scales).
            scaled_shape = (creation_shape // (2**scale))
            chunks = np.minimum(scaled_shape, block_shape).tolist()
            if (chunks != block_shape) and (scale == 0):
                logger.warning(f"Block shape ({block_shape}) is too large for "
                               f"the dataset shape ({creation_shape}). Shrinking block shape.")

            self._zarr_datasets[scale] = self.zarr_file.create_dataset(
                name,
                shape=scaled_shape.tolist(),
                dtype=np.dtype(dtype),
                chunks=chunks,
                compressor=compressor)
def compress_zarr(ts, root, variants_only=False):
    provenance_dict = provenance.get_provenance_dict({"variants_only": variants_only})

    if variants_only:
        logging.info("Using lossy variants-only compression")
        # Reduce to site topology and quantise node times. Note that we will remove
        # any sites, individuals and populations here that have no references.
        ts = ts.simplify(reduce_to_site_topology=True)
        tables = ts.tables
        time = np.unique(tables.nodes.time)
        node_time = np.searchsorted(time, tables.nodes.time)
    else:
        tables = ts.tables
        node_time = tables.nodes.time

    coordinates = np.unique(np.hstack([
        [0, ts.sequence_length],
        tables.edges.left,
        tables.edges.right,
        tables.sites.position,
        tables.migrations.left,
        tables.migrations.right]))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # When using a zipfile in Zarr we get some harmless warnings. See
        # https://zarr.readthedocs.io/en/stable/api/storage.html#zarr.storage.ZipStore
        root.attrs["format_name"] = FORMAT_NAME
        root.attrs["format_version"] = FORMAT_VERSION
        root.attrs["sequence_length"] = tables.sequence_length
        root.attrs["provenance"] = provenance_dict

    columns = [
        Column("coordinates", coordinates),
        Column("individuals/flags", tables.individuals.flags),
        Column("individuals/location", tables.individuals.location),
        Column("individuals/location_offset", tables.individuals.location_offset,
               delta_filter=True),
        Column("individuals/metadata", tables.individuals.metadata),
        Column("individuals/metadata_offset", tables.individuals.metadata_offset,
               delta_filter=True),
        Column("nodes/time", node_time),
        Column("nodes/flags", tables.nodes.flags),
        Column("nodes/population", tables.nodes.population),
        Column("nodes/individual", tables.nodes.individual),
        Column("nodes/metadata", tables.nodes.metadata),
        Column("nodes/metadata_offset", tables.nodes.metadata_offset,
               delta_filter=True),
        # Delta filtering makes storage slightly worse for everything except parent.
        Column("edges/left", np.searchsorted(coordinates, tables.edges.left)),
        Column("edges/right", np.searchsorted(coordinates, tables.edges.right)),
        Column("edges/parent", tables.edges.parent, delta_filter=True),
        Column("edges/child", tables.edges.child),
        Column("migrations/left", np.searchsorted(coordinates, tables.migrations.left)),
        Column("migrations/right", np.searchsorted(coordinates, tables.migrations.right)),
        Column("migrations/node", tables.migrations.node),
        Column("migrations/source", tables.migrations.source),
        Column("migrations/dest", tables.migrations.dest),
        Column("migrations/time", tables.migrations.time),
        Column("sites/position", np.searchsorted(coordinates, tables.sites.position),
               delta_filter=True),
        Column("sites/ancestral_state", tables.sites.ancestral_state),
        Column("sites/ancestral_state_offset", tables.sites.ancestral_state_offset),
        Column("sites/metadata", tables.sites.metadata),
        Column("sites/metadata_offset", tables.sites.metadata_offset),
        Column("mutations/site", tables.mutations.site),
        Column("mutations/node", tables.mutations.node),
        Column("mutations/parent", tables.mutations.parent),
        Column("mutations/derived_state", tables.mutations.derived_state),
        Column("mutations/derived_state_offset", tables.mutations.derived_state_offset),
        Column("mutations/metadata", tables.mutations.metadata),
        Column("mutations/metadata_offset", tables.mutations.metadata_offset),
        Column("populations/metadata", tables.populations.metadata),
        Column("populations/metadata_offset", tables.populations.metadata_offset),
        Column("provenances/timestamp", tables.provenances.timestamp),
        Column("provenances/timestamp_offset", tables.provenances.timestamp_offset),
        Column("provenances/record", tables.provenances.record),
        Column("provenances/record_offset", tables.provenances.record_offset),
    ]

    # Note: we're not providing any options to set this here because Blosc+Zstd seems to
    # have a clear advantage in compression performance and speed. There is very little
    # difference between compression level 6 and 9, and it's extremely fast in any case,
    # so there's no point in adding complexity. The shuffle filter in particular makes a
    # big difference.
    compressor = numcodecs.Blosc(cname='zstd', clevel=9, shuffle=numcodecs.Blosc.SHUFFLE)
    for column in columns:
        column.compress(root, compressor)
# Create and fill a caterva array using a block iterator
t0 = time()
a = cat.empty(shape, chunkshape=chunkshape, blockshape=blockshape,
              dtype=content.dtype, filename=fname_cat, cname=cname,
              clevel=clevel, filters=[filter], nthreads=nthreads)
for block, info in a.iter_write():
    block[:] = content[info.slice]
acratio = a.cratio
if persistent:
    del a
t1 = time()
print("Time for filling array (caterva, iter): %.3fs ; CRatio: %.1fx" % ((t1 - t0), acratio))

# Create and fill a zarr array
t0 = time()
compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=filter, blocksize=blocksize)
numcodecs.blosc.set_nthreads(nthreads)
if persistent:
    z = zarr.open(fname_zarr, mode='w', shape=shape, chunks=chunkshape,
                  dtype=dtype, compressor=compressor)
else:
    z = zarr.empty(shape=shape, chunks=chunkshape, dtype=dtype, compressor=compressor)
z[:] = content
zratio = z.nbytes / z.nbytes_stored
if persistent:
    del z
t1 = time()
print("Time for filling array (zarr): %.3fs ; CRatio: %.1fx" % ((t1 - t0), zratio))

# Create and fill a hdf5 array
t0 = time()
filters = tables.Filters(complevel=clevel, complib="blosc:%s" % cname, shuffle=True)
import numpy as np
import numcodecs
import zarr


def write_n5(path, shape, block_size, compressor):
    store = zarr.N5Store(path)
    data = np.arange(np.prod(shape), dtype=np.uint16)
    data = data.reshape(shape)
    data_transpose = data.transpose()
    z = zarr.zeros(
        data_transpose.shape,
        chunks=block_size[::-1],
        store=store,
        dtype=data.dtype,
        overwrite=True,
        compressor=compressor)
    z[...] = data_transpose


write_n5(path='raw', shape=[5, 4], block_size=[3, 2], compressor=None)
write_n5(path='gzip', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.GZip())
write_n5(path='bzip2', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.BZ2())
write_n5(path='xz', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.LZMA(preset=4))
write_n5(path='blosc', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.Blosc())
#                'variants/MEND',
#                'variants/MLEN',
#                'variants/MSTART',
#                'variants/SVLEN',
#                'variants/SVTYPE',
#                'variants/TSD',
#                'variants/AC',
#                'variants/AF',
#                'variants/NS',
#                'variants/AN',
#                'variants/EAS_AF',
#                'variants/EUR_AF',
#                'variants/AFR_AF',
#                'variants/AMR_AF',
#                'variants/SAS_AF',
#                'variants/DP',
#                'variants/AA',
#                'variants/VT',
#                'variants/EX_TARGET',
#                'variants/MULTI_ALLELIC']

# test_fields += ['variants/numalt', 'variants/svlen', 'variants/is_snp']
test_fields += ['variants/numalt', 'variants/is_snp', 'variants/svlen']
# test_fields = ['variants/*']

ska.vcf_to_zarr(vcf_file, vcf_file.replace('.vcf.gz', '.zarr'),
                fields=test_fields, alt_number=8, overwrite=True,
                compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))
async def getStorBytes(app, key, filter_ops=None, offset=0, length=-1, bucket=None, use_proxy=False):
    """ Get object identified by key and read as bytes
    """
    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash
    if offset is None:
        offset = 0
    if length is None:
        length = 0

    log.info(f"getStorBytes({bucket}/{key}, offset={offset}, length: {length})")

    data_cache_page_size = int(config.get("data_cache_page_size"))

    shuffle = 0
    compressor = None
    if filter_ops:
        log.debug(f"getStorBytes for {key} with filter_ops: {filter_ops}")
        if "use_shuffle" in filter_ops and filter_ops['use_shuffle']:
            shuffle = filter_ops['item_size']
            log.debug("using shuffle filter")
        if "compressor" in filter_ops:
            compressor = filter_ops["compressor"]
            log.debug(f"using compressor: {compressor}")

    if offset > 0 and use_proxy and length < data_cache_page_size:
        # use rangeget proxy
        data = await rangegetProxy(app, bucket=bucket, key=key, offset=offset, length=length)
    else:
        data = await client.get_object(bucket=bucket, key=key, offset=offset, length=length)

    if data is None or len(data) == 0:
        log.info(f"no data found for {key}")
        return data

    log.info(f"read: {len(data)} bytes for key: {key}")
    if length > 0 and len(data) != length:
        log.warn(f"requested {length} bytes but got {len(data)} bytes")

    if compressor:
        # compressed chunk data...
        # first check if this was compressed with blosc
        # cbuffer_metainfo returns typesize, isshuffle, and memcopied
        blosc_metainfo = codecs.blosc.cbuffer_metainfo(data)
        if blosc_metainfo[0] > 0:
            log.info(f"blosc compressed data for {key}")
            try:
                blosc = codecs.Blosc()
                udata = blosc.decode(data)
                log.info(f"uncompressed to {len(udata)} bytes")
                data = udata
                shuffle = 0  # blosc will unshuffle the bytes for us
            except Exception as e:
                log.error(f"got exception: {e} using blosc decompression for {key}")
                raise HTTPInternalServerError()
        elif compressor == "zlib":
            # data may have been compressed without blosc, try using zlib directly
            log.info(f"using zlib to decompress {key}")
            try:
                udata = zlib.decompress(data)
                log.info(f"uncompressed to {len(udata)} bytes")
                data = udata
            except zlib.error as zlib_error:
                log.info(f"zlib_err: {zlib_error}")
                log.error(f"unable to uncompress obj: {key}")
                raise HTTPInternalServerError()
        else:
            log.error(f"don't know how to decompress data in {compressor} format for {key}")
            raise HTTPInternalServerError()

    if shuffle > 0:
        log.debug(f"shuffle is {shuffle}")
        start_time = time.time()
        unshuffled = _unshuffle(shuffle, data)
        if unshuffled is not None:
            log.debug(f"unshuffled to {len(unshuffled)} bytes")
            data = unshuffled
        finish_time = time.time()
        log.debug(f"unshuffled {len(data)} bytes, {(finish_time - start_time):.2f} elapsed")

    return data
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division

import sys

import numcodecs as codecs
from numcodecs import blosc
import numpy as np
from numpy.testing import assert_array_equal

codec = codecs.Blosc()
data = np.arange(int(sys.argv[1]))
for i in range(int(sys.argv[2])):
    enc = codec.encode(data)
    dec = codec.decode(enc)
    arr = np.frombuffer(dec, dtype=data.dtype)
    assert_array_equal(data, arr)
def calc_obsStats(vcfpath, chrom, pops, coord_bed, zarrpath, outpath):
    """Calculate stats from a VCF file."""
    # if reuse_zarr is true
    if zarrpath.exists():
        zarrfile = zarrpath
    else:
        zarrfile = zarrpath
        allel.vcf_to_zarr(str(vcfpath), str(zarrpath), group=chrom, fields='*',
                          alt_number=2, log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))

    # load pop info
    panel = pd.read_csv(pops, sep='\t', usecols=['sampleID', 'population'])

    # load zarr
    callset = zarr.open_group(str(zarrfile), mode='r')
    samples = callset[f'{chrom}/samples'][:]
    samples_list = list(samples)
    samples_callset_index = [samples_list.index(s) for s in panel['sampleID']]
    panel['callset_index'] = samples_callset_index
    panel = panel.sort_values(by='callset_index')

    # load gt
    pos = allel.SortedIndex(callset[f'{chrom}/variants/POS'])
    gt = allel.GenotypeArray(callset[f'{chrom}/calldata/GT'])

    # separate gt for each population
    ix_s = 0
    pop_dt = {}
    pop_ix = []
    for i, p in enumerate(panel["population"].unique()):
        p_ix = panel[panel["population"] == p]["callset_index"].values
        ix_e = len(p_ix) * 2 + ix_s
        pop_ix.append(list(range(ix_s, ix_e)))
        pop_dt[p] = gt.take(p_ix, axis=1).to_haplotypes()
        ix_s = ix_e

    # combine and transpose
    haps = np.concatenate(list(pop_dt.values()), axis=1).T

    # prep progress bar
    ln_count = 0
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                ln_count += 1
    progressbar = tqdm(total=ln_count, desc="window numb", unit='window')

    # update stats_dt
    stats_dt["num_haps"] = haps.shape[0]
    stats_dt["pop_config"] = pop_ix
    stats_dt["length_bp"] = int(line.split()[-1])  # may be shorter than expected due to last window
    stats_dt["reps"] = ln_count

    # write headers
    outfile = outpath.parent / f"{outpath.stem}.Obs.pop_stats.txt"
    pops_outfile = open(outfile, 'w')
    pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt,
                                               pop_names=list(pop_dt.keys()), obs=True)

    # calc stats
    # TODO: parallel
    chrom_ls = []
    i = 0
    stat_mat = np.zeros([ln_count, len(header_ls) - 1])
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                cb_lin = line.split()
                chrom = cb_lin[0]
                chrom_ls.append(chrom)
                start = int(cb_lin[1])
                stop = int(cb_lin[2])
                len_bp = stop - start
                stats_dt["length_bp"] = len_bp
                sites = int(cb_lin[3])
                try:
                    pos_ix = pos.locate_range(start, stop)
                except KeyError:
                    continue
                pos_t = pos[pos_ix] - start
                haps_t = haps[:, pos_ix]
                counts_t = haps_t.sum(axis=0).astype(int)
                # run stats
                stats_ls = [start, stop, sites]
                popsumstats = PopSumStats(pos_t, haps_t, counts_t, stats_dt)
                for stat in stats_dt["calc_stats"]:
                    stat_fx = getattr(popsumstats, stat)
                    try:
                        ss = stat_fx()
                        # print(f"{stat} = {len(ss)}")
                    except IndexError:
                        ss = [np.nan] * len(stats_dt["pw_quants"])
                    stats_ls.extend(ss)
                try:
                    stat_mat[i, :] = stats_ls
                    i += 1
                    progressbar.update()
                except ValueError:
                    continue

    # write stats out
    stat_mean = np.round(np.nanmean(stat_mat, axis=0), 5)
    stats_str = "\t".join(map(str, stat_mean[3:]))
    pops_outfile.write(
        f"mean_{chrom}\t{int(stat_mat[0, 0])}\t{stop}\t{np.sum(stat_mat[:, 2])}\t{stats_str}\n")
    for stat in range(stat_mat.shape[0]):
        chrom = chrom_ls[stat]
        start = int(stat_mat[stat, 0])
        stop = int(stat_mat[stat, 1])
        sites = int(stat_mat[stat, 2])
        rd = [round(num, 5) for num in stat_mat[stat, 3:]]
        stats_str = "\t".join(map(str, rd))
        pops_outfile.write(f"{chrom}\t{start}\t{stop}\t{sites}\t{stats_str}\n")
    progressbar.close()
    pops_outfile.close()
    return outfile
def compress_zarr(ts, root):
    # TODO this current version is the most extreme option where we throw away
    # all the non-site information.

    # First reduce to site topology
    tables = ts.dump_tables()
    tables.simplify(reduce_to_site_topology=True)

    nodes = root.create_group("nodes")
    flags = nodes.empty("flags", shape=len(tables.nodes), dtype=np.uint8)
    flags[:] = tables.nodes.flags
    logger.debug(flags.info)

    # Get the indexes into the position array.
    pos_map = np.hstack([tables.sites.position, [tables.sequence_length]])
    pos_map[0] = 0
    left_mapped = np.searchsorted(pos_map, tables.edges.left)
    if np.any(pos_map[left_mapped] != tables.edges.left):
        raise ValueError("Invalid left coordinates")
    right_mapped = np.searchsorted(pos_map, tables.edges.right)
    if np.any(pos_map[right_mapped] != tables.edges.right):
        raise ValueError("Invalid right coordinates")

    filters = [numcodecs.Delta(dtype=np.int32, astype=np.int32)]
    compressor = numcodecs.Blosc(cname='zstd', clevel=9, shuffle=numcodecs.Blosc.SHUFFLE)

    edges = root.create_group("edges")
    parent = edges.empty("parent", shape=len(tables.edges), dtype=np.int32,
                         filters=filters, compressor=compressor)
    child = edges.empty("child", shape=len(tables.edges), dtype=np.int32,
                        filters=filters, compressor=compressor)
    left = edges.empty("left", shape=len(tables.edges), dtype=np.uint32,
                       filters=filters, compressor=compressor)
    right = edges.empty("right", shape=len(tables.edges), dtype=np.uint32,
                        filters=filters, compressor=compressor)
    parent[:] = tables.edges.parent
    child[:] = tables.edges.child
    left[:] = left_mapped
    right[:] = right_mapped

    mutations = root.create_group("mutations")
    site = mutations.empty("site", shape=len(tables.mutations), dtype=np.int32,
                           compressor=compressor)
    node = mutations.empty("node", shape=len(tables.mutations), dtype=np.int32,
                           compressor=compressor)
    site[:] = tables.mutations.site
    node[:] = tables.mutations.node
import zarr
import numcodecs

from skimage.data import astronaut

# choose chunks s.t. we do have overhanging edge-chunks
CHUNKS = (100, 100, 1)

STR_TO_COMPRESSOR = {
    'gzip': numcodecs.GZip(),
    'blosc': numcodecs.Blosc(),
    'zlib': numcodecs.Zlib()
}


def generate_zarr_format(compressors=['gzip', 'blosc', 'zlib', None]):
    path = '../data/zarr.zr'
    im = astronaut()
    f = zarr.open(path)
    for compressor in compressors:
        name = compressor if compressor is not None else 'raw'
        compressor_impl = STR_TO_COMPRESSOR[compressor] if compressor is not None else None
        f.create_dataset(name, data=im, chunks=CHUNKS, compressor=compressor_impl)


# this needs PR https://github.com/zarr-developers/zarr/pull/309
def generate_n5_format(compressors=['gzip', None]):
    path = '../data/zarr.n5'