def _write(refs, outpath, filetype=None):
    types = {"json": "json", "parquet": "parquet", "zarr": "zarr"}
    if filetype is None:
        ext = os.path.splitext(outpath)[1].lstrip(".")
        filetype = types[ext]
    elif filetype not in types:
        raise KeyError(f"filetype must be one of {set(types)}, got {filetype}")
    if filetype == "json":
        with open(outpath, "w") as f:
            json.dump(refs, f)
        return
    import pandas as pd
    references2 = {
        k: {"data": v.encode('ascii') if not isinstance(v, list) else None,
            "url": v[0] if isinstance(v, list) else None,
            "offset": v[1] if isinstance(v, list) else None,
            "size": v[2] if isinstance(v, list) else None}
        for k, v in refs['refs'].items()
    }
    # use pandas for sorting
    df = pd.DataFrame(references2.values(), index=list(references2)).sort_values("offset")
    if filetype == "zarr":
        import zarr
        import numcodecs
        # compression should be NONE, if intent is to store in single zip
        g = zarr.open_group(outpath, mode='w')
        g.attrs.update({k: v for k, v in refs.items()
                        if k in ['version', "templates", "gen"]})
        g.array(name="key", data=df.index.values, dtype="object",
                compression="zstd", object_codec=numcodecs.VLenUTF8())
        g.array(name="offset", data=df.offset.values, dtype="uint32",
                compression="zstd")
        g.array(name="size", data=df['size'].values, dtype="uint32",
                compression="zstd")
        # NOTE: the original stored df.url here; the "data" column (bytes),
        # encoded with VLenBytes, is assumed to be the intent
        g.array(name="data", data=df['data'].values, dtype="object",
                object_codec=numcodecs.VLenBytes(),
                compression="gzip")  # may be better as fixed length
        g.array(name="url", data=df.url.values, dtype="object",
                object_codec=numcodecs.VLenUTF8(), compression='gzip')
    if filetype == "parquet":
        import fastparquet
        metadata = json.dumps(
            {k: v for k, v in refs.items() if k in ['version', "templates", "gen"]}
        )
        # fastparquet.write needs the DataFrame itself, and custom_metadata must
        # be a dict of key/value strings (the "metadata" key name is an assumption)
        fastparquet.write(outpath, df, custom_metadata={"metadata": metadata},
                          compression="ZSTD")
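A minimal usage sketch for _write, assuming a kerchunk-style reference set in which each value under "refs" is either an inline data string or a [url, offset, size] triple; the file name and values here are illustrative only:

import json
import os

refs = {
    "version": 1,
    "refs": {
        ".zgroup": '{"zarr_format": 2}',           # inline data -> "data" column
        "a/0": ["s3://bucket/file.nc", 100, 800],  # external chunk -> url/offset/size
    },
}
_write(refs, "references.json")  # extension selects the JSON writer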
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    if series.dtype == object:
        group.create_dataset(
            key,
            shape=series.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        group[key][:] = series.values
    elif is_categorical_dtype(series):
        # This should work for categorical Index and Series
        categorical: pd.Categorical = series.values
        categories: np.ndarray = categorical.categories.values
        codes: np.ndarray = categorical.codes
        category_key = f"__categories/{key}"
        write_array(group, category_key, categories, dataset_kwargs=dataset_kwargs)
        write_array(group, key, codes, dataset_kwargs=dataset_kwargs)
        group[key].attrs["categories"] = category_key
        # Must coerce np.bool_ to bool for json writing
        group[category_key].attrs["ordered"] = bool(categorical.ordered)
    else:
        group[key] = series.values
def table(self, data, names=None, expectedlen=None, **kwargs):
    # setup
    names, columns = _util.check_table_like(data, names=names)
    kwargs = self._set_defaults(kwargs)
    chunks = kwargs.pop('chunks', None)
    g = zarr.group(**kwargs)

    # create columns
    for n, c in zip(names, columns):
        if chunks is None:
            chunks = default_chunks(c, expectedlen)
        if c.dtype == object:
            # peek at first value
            peek = c[0]
            if isinstance(peek, bytes):
                object_codec = numcodecs.VLenBytes()
            elif isinstance(peek, str):
                object_codec = numcodecs.VLenUTF8()
            else:
                object_codec = numcodecs.MsgPack()
        else:
            object_codec = None
        g.array(name=n, data=c, chunks=chunks, object_codec=object_codec)

    # create table
    ztbl = ZarrTable(g, names=names)
    return ztbl
def array(self, data, expectedlen=None, **kwargs):
    # setup
    data = _util.ensure_array_like(data)
    kwargs = self._set_defaults(kwargs)

    # determine chunks
    kwargs.setdefault('chunks', default_chunks(data, expectedlen))

    # determine object codec
    if data.dtype == object:
        # peek at first value
        peek = data[0]
        if isinstance(peek, bytes):
            object_codec = numcodecs.VLenBytes()
        elif isinstance(peek, str):
            object_codec = numcodecs.VLenUTF8()
        else:
            object_codec = numcodecs.MsgPack()
        kwargs.setdefault('object_codec', object_codec)

    # create
    z = zarr.array(data, **kwargs)
    return z
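The peek-based codec selection used by table and array above can be exercised without the surrounding storage class; a minimal sketch follows (the pick_object_codec helper is hypothetical, written here only to mirror that logic):

import numpy as np
import zarr
import numcodecs

def pick_object_codec(data):
    # peek at the first value, as the methods above do
    peek = data[0]
    if isinstance(peek, bytes):
        return numcodecs.VLenBytes()
    if isinstance(peek, str):
        return numcodecs.VLenUTF8()
    return numcodecs.MsgPack()

data = np.array(["A", "CG", "TTA"], dtype=object)
z = zarr.array(data, chunks=(2,), object_codec=pick_object_codec(data))
print(z[:])  # ['A' 'CG' 'TTA']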
def prepare_zarr_storage(variations, out_path):
    store = zarr.DirectoryStore(str(out_path))
    root = zarr.group(store=store, overwrite=True)
    metadata = variations.metadata
    sources = []
    targets = []
    samples_array = variations.samples
    #samples_array.compute_chunk_sizes()
    sources.append(samples_array)
    object_codec = None
    if samples_array.dtype == object:
        object_codec = numcodecs.VLenUTF8()
    dataset = zarr.create(shape=samples_array.shape, path='samples',
                          store=store, dtype=samples_array.dtype,
                          object_codec=object_codec)
    targets.append(dataset)

    variants = root.create_group(ZARR_VARIANTS_GROUP_NAME, overwrite=True)
    calls = root.create_group(ZARR_CALL_GROUP_NAME, overwrite=True)
    for field, array in variations.items():
        definition = ALLELE_ZARR_DEFINITION_MAPPINGS[field]
        field_metadata = metadata.get(field, None)
        if array is None:
            continue
        array.compute_chunk_sizes()
        sources.append(array)

        group_name = definition['group']
        group = calls if group_name == ZARR_CALL_GROUP_NAME else variants
        path = os.path.sep + os.path.join(group.path, definition['field'])
        object_codec = None
        if array.dtype == object:
            object_codec = numcodecs.VLenUTF8()
        dataset = zarr.create(shape=array.shape, path=path, store=store,
                              object_codec=object_codec, dtype=array.dtype)
        if field_metadata is not None:
            for key, value in field_metadata.items():
                dataset.attrs[key] = value
        targets.append(dataset)

    lock = SerializableLock()
    return da.store(sources, targets, compute=False, lock=lock)
def extract_splits():
    """Extracts splits to ${DATASETS_DIR}/ucf101/splits.zarr."""
    f = zarr.open(SPLITS_FINAL_DIR, 'w')
    for split in (1, 2, 3):
        g = f.create_group(str(split))
        for subset in ('train', 'test'):
            names = load_split(SPLITS_DIR, split, subset)
            g.create_dataset(subset, data=names, dtype=object,
                             object_codec=numcodecs.VLenUTF8())
    print(f'Splits saved to {SPLITS_FINAL_DIR}')
def write_array(g, key, value, dataset_kwargs={}):
    if value.dtype == object:
        g.create_dataset(
            key,
            shape=value.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        g[key][:] = value
    else:
        g.create_dataset(key, data=value, **dataset_kwargs)
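A quick round-trip check of write_array against an in-memory group; the array contents are illustrative:

import numcodecs  # used inside write_array
import numpy as np
import zarr

g = zarr.group()  # defaults to an in-memory store
write_array(g, "names", np.array(["alpha", "beta", "gamma"], dtype=object))
write_array(g, "counts", np.array([1, 2, 3]))
print(g["names"][:])   # ['alpha' 'beta' 'gamma'], via VLenUTF8
print(g["counts"][:])  # [1 2 3]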
def write_vlen_string_array_zarr(f, k, elem, dataset_kwargs=MappingProxyType({})):
    import numcodecs

    f.create_dataset(
        k,
        shape=elem.shape,
        dtype=object,
        object_codec=numcodecs.VLenUTF8(),
        **dataset_kwargs,
    )
    f[k][:] = elem
def write_array(g, key, value, dataset_kwargs=MappingProxyType({})):
    if value.dtype == object:
        g.create_dataset(
            key,
            shape=value.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        g[key][:] = value
    elif value.dtype.kind == "V":  # Structured dtype
        g.create_dataset(key, data=_to_fixed_length_strings(value), **dataset_kwargs)
    else:
        g.create_dataset(key, data=value, **dataset_kwargs)
def write_series(g, k, s, dataset_kwargs={}):
    if s.dtype == object:
        g.create_dataset(
            k,
            shape=s.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        g[k][:] = s.values
    elif is_categorical_dtype(s):
        g.create_dataset(k, shape=s.shape, dtype=s.cat.codes.dtype)
        g[k][:] = s.cat.codes
        g[k].attrs["categories"] = list(s.cat.categories)
    else:
        g[k] = s.values
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    if series.dtype == object:
        group.create_dataset(
            key,
            shape=series.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        group[key][:] = series.values
    elif is_categorical_dtype(series):
        cats = series.cat.categories.values
        codes = series.cat.codes.values
        category_key = f"__categories/{key}"
        write_array(group, category_key, cats, dataset_kwargs)
        write_array(group, key, codes, dataset_kwargs)
        group[key].attrs["categories"] = category_key
    else:
        group[key] = series.values
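A sketch exercising the categorical branch of the write_series variant above, assuming the write_array helper and its imports (numcodecs, MappingProxyType, pandas' is_categorical_dtype) from the earlier snippets are in scope:

import pandas as pd
import zarr

g = zarr.group()
s = pd.Series(pd.Categorical(["red", "green", "red"]))
write_series(g, "color", s)
print(g["color"][:])                         # integer codes, e.g. [1 0 1]
print(g["color"].attrs["categories"])        # '__categories/color'
print(g[g["color"].attrs["categories"]][:])  # ['green' 'red']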
def concat_zarrs_optimized(
    zarr_files: Sequence[str],
    output: Union[PathType, MutableMapping[str, bytes]],
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
    fix_strings: bool = False,
) -> None:
    if isinstance(output, Path):
        output = str(output)
    zarr_groups = [zarr.open_group(f) for f in zarr_files]
    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(output, mode="w")

    # copy variables that are to be rechunked
    # NOTE: this uses the _to_zarr function defined here, which is needed to avoid
    # race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if fix_strings and var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"
        arr = concatenate_and_rechunk(
            [group[var] for group in zarr_groups], dtype=dtype
        )

        _to_zarr_kwargs = dict(
            compressor=first_zarr_group[var].compressor,
            filters=first_zarr_group[var].filters,
            fill_value=None,
        )
        if not fix_strings and arr.dtype == "O":
            # We assume that all object dtypes are variable length strings
            var_len_str_codec = numcodecs.VLenUTF8()
            _to_zarr_kwargs["object_codec"] = var_len_str_codec
            # Remove from filters to avoid double encoding error
            if var_len_str_codec in first_zarr_group[var].filters:
                filters = list(first_zarr_group[var].filters)
                filters.remove(var_len_str_codec)
                _to_zarr_kwargs["filters"] = filters

        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            output,
            component=var,
            overwrite=True,
            compute=False,
            attrs=first_zarr_group[var].attrs.asdict(),
            **_to_zarr_kwargs,
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(output) as output_zarr:
        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        group_attrs = dict(first_zarr_group.attrs)
        if "max_alt_alleles_seen" in group_attrs:
            max_alt_alleles_seen = _get_max_len(zarr_groups, "max_alt_alleles_seen")
            group_attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
        output_zarr.attrs.update(group_attrs)

    # consolidate metadata
    zarr.consolidate_metadata(output)
def writeFactorData(self, factor_data, table_name, ifactor_name, if_exists="update", data_type=None):
    if data_type is None:
        data_type = _identifyDataType(factor_data.dtypes)
    if data_type == 'double':
        try:
            factor_data = factor_data.astype('float')
            data_type = 'double'
        except Exception:
            factor_data = factor_data.where(pd.notnull(factor_data), None)
            data_type = 'string'
    else:
        factor_data = factor_data.where(pd.notnull(factor_data), None)
    DTs = factor_data.index
    if pd.__version__ >= "0.20.0":
        factor_data.index = [idt.to_pydatetime().timestamp() for idt in factor_data.index]
    else:
        factor_data.index = [idt.timestamp() for idt in factor_data.index]
    TablePath = self.MainDir + os.sep + table_name
    with self._DataLock:
        if ifactor_name not in self._TableFactorDict.get(table_name, {}):
            self._TableFactorDict[table_name] = self._TableFactorDict.get(table_name, pd.Series()).append(
                pd.Series(data_type, index=[ifactor_name]))
        ZTable = zarr.open(TablePath, mode="a")
        if ifactor_name not in ZTable:
            ZFactor = ZTable.create_group(ifactor_name, overwrite=True)
            ZFactor.create_dataset("ID", shape=(factor_data.shape[1],), data=factor_data.columns.values,
                                   dtype=object, object_codec=numcodecs.VLenUTF8(), overwrite=True)
            ZFactor.create_dataset("DateTime", shape=(factor_data.shape[0],), data=factor_data.index.values,
                                   dtype="f8", overwrite=True)
            if data_type == "double":
                ZFactor.create_dataset("Data", shape=factor_data.shape, data=factor_data.values,
                                       dtype="f8", fill_value=np.nan, overwrite=True)
            elif data_type == "string":
                ZFactor.create_dataset("Data", shape=factor_data.shape, data=factor_data.values,
                                       dtype=object, object_codec=numcodecs.VLenUTF8(), overwrite=True)
            ZFactor.attrs["DataType"] = data_type
            ZTable.attrs["DataType"] = self._TableFactorDict[table_name].to_dict()
            factor_data.index = DTs
            return 0
    if if_exists == "update":
        self._updateFactorData(factor_data, table_name, ifactor_name, data_type)
    elif if_exists == "append":
        OldData = self.getTable(table_name).readFactorData(ifactor_name=ifactor_name,
                                                           ids=factor_data.columns.tolist(),
                                                           dts=DTs.tolist())
        OldData.index = factor_data.index
        factor_data = OldData.where(pd.notnull(OldData), factor_data)
        self._updateFactorData(factor_data, table_name, ifactor_name, data_type)
    factor_data.index = DTs
    return 0
def writeFactorData(self, factor_data, table_name, ifactor_name, if_exists="update", data_type=None):
    TablePath = self.MainDir + os.sep + table_name
    with self._DataLock:
        ZTable = zarr.open(TablePath, mode="a")
        if ifactor_name not in ZTable:
            factor_data, data_type = _identifyDataType(factor_data, data_type)
            ZFactor = ZTable.create_group(ifactor_name, overwrite=True)
            ZFactor.create_dataset("ID", shape=(factor_data.shape[1],), data=factor_data.columns.values,
                                   dtype=object, object_codec=numcodecs.VLenUTF8(), overwrite=True)
            ZFactor.create_dataset("DateTime", shape=(factor_data.shape[0],), data=factor_data.index.values,
                                   dtype="M8[ns]", overwrite=True)
            if data_type == "double":
                ZFactor.create_dataset("Data", shape=factor_data.shape, data=factor_data.values,
                                       dtype="f8", fill_value=np.nan, overwrite=True)
            elif data_type == "string":
                ZFactor.create_dataset("Data", shape=factor_data.shape, data=factor_data.values,
                                       dtype=object, object_codec=numcodecs.VLenUTF8(), overwrite=True)
            elif data_type == "object":
                ZFactor.create_dataset("Data", shape=factor_data.shape, data=factor_data.values,
                                       dtype=object, object_codec=numcodecs.Pickle(), overwrite=True)
            ZFactor.attrs["DataType"] = data_type
            DataType = ZTable.attrs.get("DataType", {})
            DataType[ifactor_name] = data_type
            ZTable.attrs["DataType"] = DataType
            return 0
    if if_exists == "update":
        self._updateFactorData(factor_data, table_name, ifactor_name, data_type)
    elif if_exists == "append":
        OldData = self.getTable(table_name).readFactorData(ifactor_name=ifactor_name,
                                                           ids=factor_data.columns.tolist(),
                                                           dts=factor_data.index.tolist())
        OldData.index = factor_data.index
        factor_data = OldData.where(pd.notnull(OldData), factor_data)
        self._updateFactorData(factor_data, table_name, ifactor_name, data_type)
    return 0