def __init__(self, path: str, mode: str = 'r', storage_type: str = None) -> None:
    """Initialize a Zarr file. If mode == 'w', create an empty one;
    otherwise, load from path.

    path : `str`, path for the zarr object.
    storage_type : `str`, currently only 'ZipStore' and 'NestedDirectoryStore'
        are supported. If None, 'NestedDirectoryStore' is used by default.
    """
    self.store = self.root = None
    if storage_type is None:
        storage_type = 'NestedDirectoryStore'
    if mode == 'w':
        # Create a new zarr file
        check_and_remove_existing_path(path)
        self.store = (zarr.ZipStore(path, mode='w')
                      if storage_type == 'ZipStore'
                      else zarr.NestedDirectoryStore(path))
        self.root = zarr.group(self.store, overwrite=True)
    else:
        # Load existing zarr file
        self.store = (zarr.NestedDirectoryStore(path)
                      if os.path.isdir(path)
                      else zarr.ZipStore(path, mode='r'))
        if mode == 'a' and isinstance(self.store, zarr.ZipStore):
            # A ZipStore cannot be updated in place, so convert it to a
            # directory store before opening for append.
            self._to_directory()
        self.root = zarr.open(self.store, mode=mode)

def _to_zip(self):
    if not isinstance(self.store, zarr.ZipStore):
        zip_path = self.store.path + '.zip'
        zip_store = zarr.ZipStore(zip_path, mode='w')
        zarr.copy_store(self.store, zip_store)
        zip_store.close()
        shutil.rmtree(self.store.path)
        self.store = zarr.ZipStore(zip_path, mode='r')
        self.root = zarr.open_group(self.store, mode='r')

def _to_directory(self):
    orig_path = self.store.path
    if not orig_path.endswith('.zip'):
        # Rename the zip archive first, so the unpacked directory store
        # can take over the original path.
        self.store.close()
        zip_path = orig_path + '.zip'
        check_and_remove_existing_path(zip_path)
        os.replace(orig_path, zip_path)
        self.store = zarr.ZipStore(zip_path, mode='r')
    else:
        zip_path = orig_path
    dest_path = zip_path[:-4]
    check_and_remove_existing_path(dest_path)
    dir_store = zarr.NestedDirectoryStore(dest_path)
    zarr.copy_store(self.store, dir_store)
    self.store.close()
    os.remove(zip_path)
    self.store = dir_store
    self.root = zarr.open_group(self.store)
    logger.info(
        f"Converted ZipStore zarr file {orig_path} to "
        f"NestedDirectoryStore {dest_path}."
    )

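A minimal round-trip sketch for the wrapper above. The class name `ZarrFile` and the path are assumptions (the class declaration is not shown in these snippets); only the `__init__`/`_to_zip`/`_to_directory` behavior is taken from the source.

# Hypothetical usage; ZarrFile and the path are assumed names.
import numpy as np

zf = ZarrFile('/tmp/example.zarr', mode='w', storage_type='NestedDirectoryStore')
zf.root.create_dataset('x', data=np.arange(10))
zf._to_zip()        # pack into /tmp/example.zarr.zip and reopen read-only
zf._to_directory()  # unpack back into a NestedDirectoryStore at /tmp/example.zarr
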
def init_zarr(sample_id, path, file_format, schema_version):
    """Initializes the zarr output.

    Args:
        sample_id (str): sample or cell id
        path (str): path to the zarr output
        file_format (str): zarr file format [DirectoryStore, ZipStore]
        schema_version (str): version string of this output to allow for
            parsing of future changes

    Returns:
        root (zarr.hierarchy.Group): initialized zarr group
    """
    store = None
    if file_format == "DirectoryStore":
        store = zarr.DirectoryStore(path)
    if file_format == "ZipStore":
        store = zarr.ZipStore(path, mode='w')

    # create the root group
    root = zarr.group(store, overwrite=True)
    # root.attrs['README'] = "The schema adopted in this zarr store may undergo changes in the future"
    root.attrs['sample_id'] = sample_id
    root.attrs['optimus_output_schema_version'] = schema_version

    # Create the expression_matrix group
    # root.create_group("expression_matrix", overwrite=True)
    return root

def _add_item(path: FilepathType, features: FeatureMap, input_dict: dict):
    parsed_input_dict = dict()
    for feat_name, feat in features.items():
        try:
            feat_val = input_dict[feat_name]
        except KeyError:
            raise RuntimeError(f"Feature missing ({feat_name})")
        if feat.shape is not None and np.array(feat_val).shape != feat.shape:
            raise ValueError(f"Shape mismatch for {feat_name} "
                             f"(expected {feat.shape}, "
                             f"got {np.array(feat_val).shape})")
        # Prepend a batch axis so the value can be appended to the stored array.
        feat_val = (np.expand_dims(feat_val, axis=0)
                    if isinstance(feat_val, np.ndarray)
                    else np.array([feat_val], dtype=feat.dtype)).astype(feat.dtype)
        parsed_input_dict[feat_name] = feat_val

    path = Path(path).resolve()
    lock = FileLock(_lockfile(path))
    with lock, warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        store = zarr.ZipStore(path, mode="a")
        root = zarr.group(store=store)
        for k, v in parsed_input_dict.items():
            root[k].append(v)
        store.close()

@classmethod
def read_metadata(cls, path: FilepathType) -> Metadata:
    path = Path(path).resolve()
    lock = FileLock(_lockfile(path))
    with lock, zarr.ZipStore(path, mode="r") as store:
        root = zarr.group(store=store)
        metadata = root.attrs[_METADATA_KEY]
    return Metadata.deserialize(metadata)

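Both `_add_item` and `read_metadata` derive a lock path from a `_lockfile` helper that is not shown in these snippets. A plausible minimal sketch, assuming one sibling lock file per archive and the `filelock` package already used above:

# Assumed implementation of the _lockfile helper (not shown in the source):
# one sibling lock file per zarr archive, e.g. data.zarr.zip -> data.zarr.zip.lock.
def _lockfile(path) -> str:
    return str(path) + ".lock"
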
def check_simulation_result(
    datadir: Path,
    config: Dict[str, Any],
    run: Dict[str, Any],
    xp: Any,
) -> None:
    # Extract properties for simulation
    dataset, paramset = run["dataset"], run["paramset"]
    ds_config = config["datasets"][dataset]
    ps_config = config["paramsets"][paramset]
    dataset_dir = datadir / "dataset" / dataset
    result_dir = datadir / "result" / run["name"]

    # Load simulated data
    with zarr.ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
        ds = xr.open_zarr(store, consolidated=False)
        df_covariate = load_covariates(dataset_dir)
        df_trait = load_traits(dataset_dir)
        contigs = ds["variant_contig"].values
        G = xp.asarray(ds["call_genotype"].sum(dim="ploidy").values)
        X = xp.asarray(df_covariate.values)
        Y = xp.asarray(df_trait.values)
        alphas = ps_config["alphas"]
        if alphas is not None:
            alphas = xp.asarray(alphas)

        # Define transformed traits
        res = regenie_transform(
            G.T,
            X,
            Y,
            contigs,
            variant_block_size=ps_config["variant_block_size"],
            sample_block_size=ps_config["sample_block_size"],
            normalize=True,
            add_intercept=False,
            alphas=alphas,
            orthogonalize=False,
            # Intentionally make mistakes related to these flags
            # in order to match Glow results
            _glow_adj_dof=True,
            _glow_adj_scaling=True,
            _glow_adj_alpha=True,
        )
        YBP = res["regenie_base_prediction"].data
        YMP = res["regenie_meta_prediction"].data

        # Check equality of stage 1 and 2 transformations
        check_stage_1_results(YBP, ds_config, ps_config, result_dir)
        check_stage_2_results(YMP, df_trait, result_dir)

        # Check equality of GWAS results
        X = da.from_array(X)
        Q = da.linalg.qr(X)[0]
        YR = Y - YMP
        YP = YR - Q @ (Q.T @ YR)
        stats = linear_regression(
            _dask_cupy_to_numpy(G.T),
            _dask_cupy_to_numpy(YP),
            _dask_cupy_to_numpy(Q),
        )
        check_stage_3_results(ds, stats, df_trait, result_dir)

def test_format_written(self):
    ts = msprime.simulate(10, random_seed=1)
    tszip.compress(ts, self.path)
    with zarr.ZipStore(str(self.path), mode="r") as store:
        root = zarr.group(store=store)
        self.assertEqual(root.attrs["format_name"], compression.FORMAT_NAME)
        self.assertEqual(root.attrs["format_version"], compression.FORMAT_VERSION)

def init_zarr(sample_id, path, file_format):
    """Initializes the zarr output.

    Args:
        sample_id (str): sample or cell id
        path (str): path to the zarr output
        file_format (str): zarr file format [DirectoryStore, ZipStore]

    Returns:
        zarr.hierarchy.Group: the initialized root group
    """
    store = None
    if file_format == "DirectoryStore":
        store = zarr.DirectoryStore(path)
    if file_format == "ZipStore":
        store = zarr.ZipStore(path, mode='w')

    # create the root group
    root = zarr.group(store, overwrite=True)

    # add some readme for the user
    root.attrs['README'] = (
        "The schema adopted in this zarr store may undergo "
        "changes in the future")
    root.attrs['sample_id'] = sample_id
    return root

def load(savepath, lazy: bool = False, normalize_strings: bool = True,
         use_temp: bool = False):
    """Load an xarray Dataset from a zipped zarr store.

    Args:
        savepath: Path to the zarr ZipStore file.
        lazy (bool, optional): If True, keep the dataset lazily backed by the
            store instead of loading it into memory. Defaults to False.
        normalize_strings (bool, optional): Normalize string variables in the
            loaded dataset. Defaults to True.
        use_temp (bool, optional): Unpack zip to temp file - potentially
            speeds up loading and allows overwriting the existing zarr file.
            Defaults to False.

    Returns:
        xarray.Dataset: the loaded dataset.
    """
    zarr_store = zarr.ZipStore(savepath, mode='r')
    if use_temp:
        dest = zarr.TempStore()
        zarr.copy_store(zarr_store, dest)
        zarr_store.close()
        zarr_store = dest
    dataset = xr.open_zarr(zarr_store)
    if not lazy:
        dataset.load()
        zarr_store.close()
    if normalize_strings:
        dataset = _normalize_strings(dataset)
    return dataset

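A short usage sketch for `load` above; the file name is hypothetical.

# Hypothetical usage: eagerly load a zipped zarr store via a temporary
# unpacked copy (the file name is an assumption).
ds = load("recording.zarr.zip", lazy=False, use_temp=True)
print(ds.dims)
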
def decompress(path):
    """
    Returns a decompressed tskit tree sequence read from the specified path.
    """
    store = zarr.ZipStore(path, mode='r')
    root = zarr.group(store=store)
    return decompress_zarr(root)

def init_zarr(sample_id, path, file_format):
    """Initializes the zarr output.

    Args:
        sample_id (str): sample or cell id
        path (str): path to the zarr output
        file_format (str): zarr file format [DirectoryStore, ZipStore]

    Returns:
        root (zarr.hierarchy.Group): initialized zarr group
    """
    store = None
    if file_format == "DirectoryStore":
        store = zarr.DirectoryStore(path)
    if file_format == "ZipStore":
        store = zarr.ZipStore(path, mode='w')

    # create the root group
    root = zarr.group(store, overwrite=True)

    # add some readme for the user
    root.attrs['README'] = (
        "The schema adopted in this zarr store may undergo "
        "changes in the future")
    root.attrs['sample_id'] = sample_id

    # now iterate through list of expected groups and create them
    for dataset in ZARR_GROUP:
        root.create_group(dataset, overwrite=True)
    return root

def compress(ts, path, variants_only=False):
    """
    Compresses the specified tree sequence and writes it to the specified path.
    By default, fully lossless compression is used so that tree sequences are
    identical before and after compression. By specifying the ``variants_only``
    option, a lossy compression can be used, which discards any information that
    is not needed to represent the variants (which are stored losslessly).

    :param tskit.TreeSequence ts: The input tree sequence.
    :param str path: The string or :class:`pathlib.Path` instance describing
        the location of the compressed file.
    :param bool variants_only: If True, discard all information not necessary
        to represent the variants in the input file.
    """
    destination = str(path)
    # Write the file into a temporary directory on the same file system so that
    # we can write the output atomically.
    destdir = os.path.dirname(os.path.abspath(destination))
    with tempfile.TemporaryDirectory(dir=destdir, prefix=".tszip_work_") as tmpdir:
        filename = os.path.join(tmpdir, "tmp.trees.tgz")
        logging.debug(f"Writing to temporary file {filename}")
        with zarr.ZipStore(filename, mode="w") as store:
            root = zarr.group(store=store)
            compress_zarr(ts, root, variants_only=variants_only)
        os.replace(filename, destination)
    logging.info(f"Wrote {destination}")

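The temp-directory-plus-`os.replace` idiom above is what makes the ZipStore write atomic: the archive is staged on the same filesystem and then renamed into place. A stripped-down sketch of just that pattern, assuming numpy input; `write_zip_atomically` is a hypothetical name.

import os
import tempfile

import numpy as np
import zarr

def write_zip_atomically(array: np.ndarray, destination: str) -> None:
    # Stage the archive in a temp dir on the same filesystem so that
    # os.replace is an atomic rename rather than a copy.
    destdir = os.path.dirname(os.path.abspath(destination))
    with tempfile.TemporaryDirectory(dir=destdir) as tmpdir:
        tmpfile = os.path.join(tmpdir, "tmp.zarr.zip")
        with zarr.ZipStore(tmpfile, mode="w") as store:
            root = zarr.group(store=store)
            root.create_dataset("data", data=array)
        os.replace(tmpfile, destination)
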
def create_zarr_store(ds, rootdir, ignore_vars=[], storetype='directory',
                      consolidated=True):
    """OBSOLETE

    Write each variable from a xarray Dataset ds into a new zarr ZipStore
    under the root directory rootdir, excluding optional variables from
    ignore_vars.

    PARAMETERS:
    ===========
    ds: xarray.Dataset
        input dataset
    rootdir: str
        root path to the zarr stores
    ignore_vars: list
        variables to ignore
    storetype: str
        zarr store type (directory, zip)
    consolidated: logical
        zarr option to store (default = True)

    RETURNS:
    ========
    None
    """
    for variable in ds.variables:
        if variable not in ignore_vars:
            print(f'writing {variable}')
            # create a bogus dataset to copy a single variable
            tmp = _xr.Dataset()
            tmp[variable] = ds[variable]
            # update output directory with variable name
            outputdir = rootdir.replace('<VARNAME>', variable)
            # create the output directory
            check = subprocess.check_call(f'mkdir -p {outputdir}', shell=True)
            exit_code(check)
            # create a zarr store in write mode
            store_exists = os.path.exists(f'{outputdir}/{variable}')
            if storetype == 'directory' and not store_exists:
                store = _zarr.DirectoryStore(f'{outputdir}/{variable}')
                # then copy to zarr
                tmp.to_zarr(store, consolidated=consolidated)
            elif storetype == 'zip':
                store = _zarr.ZipStore(f'{outputdir}/{variable}.zip', mode='w')
                # then copy to zarr
                tmp.to_zarr(store, mode='w', consolidated=consolidated)
            # and close store
            if storetype == 'zip':
                store.close()
            tmp.close()
    return None

def _0(obj: classo_problem) -> CLASSOProblemDirectoryFormat:
    # for output of regress
    ff = CLASSOProblemDirectoryFormat()
    zipfile = str(ff.path / 'problem.zip')
    store = zarr.ZipStore(zipfile, mode='w')
    root = zarr.open(store=store)
    to_zarr(obj, 'problem', root)
    store.close()
    return ff

def test_provenance(self):
    ts = msprime.simulate(10, random_seed=1)
    for variants_only in [True, False]:
        tszip.compress(ts, self.path, variants_only=variants_only)
        with zarr.ZipStore(str(self.path), mode='r') as store:
            root = zarr.group(store=store)
            self.assertEqual(
                root.attrs["provenance"],
                provenance.get_provenance_dict(
                    {"variants_only": variants_only}))

@classmethod
def from_array(cls, path: Path, array: np.ndarray,
               overwrite: bool = False) -> Raw:
    if path.is_file() and not overwrite:
        raise RuntimeError("File already exists")
    with zarr.ZipStore(path, compression=zipfile.ZIP_DEFLATED) as store:
        zarr.save_array(store, array)
    return cls(path)

def __len__(self):
    sizes = set()
    lock = FileLock(_lockfile(self.path))
    with lock, zarr.ZipStore(self.path, mode="r") as store:
        root = zarr.group(store=store)
        for feat_name in self.features.keys():
            sizes.add(len(root[feat_name]))
    if len(sizes) != 1:
        raise RuntimeError("Dataset corrupted!!!")
    return sizes.pop()

@contextlib.contextmanager
def load_zarr(path):
    path = str(path)
    try:
        store = zarr.ZipStore(path, mode='r')
    except zipfile.BadZipFile as bzf:
        raise exceptions.FileFormatError("File is not in tgzip format") from bzf
    root = zarr.group(store=store)
    try:
        check_format(root)
        yield root
    finally:
        store.close()

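Assuming `load_zarr` is wrapped with `contextlib.contextmanager` as above (implied by the yield/finally structure), usage would look like this; the file name is hypothetical.

# Hypothetical usage of the context-manager loader; file name is an assumption.
with load_zarr("example.trees.tsz") as root:
    print(root.attrs["format_name"])
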
def _load_dataset_file(dataset_filepath):
    """
    Loads a single dataset file given by its path

    :param dataset_filepath: path where the file is located
    :return: starting_idx: [int] - List of indices where each game starts
             x: nd.array - Numpy array which contains the game positions
             y_value: nd.array - Numpy array which describes the winner for
                 each board position
             y_policy: nd.array - Numpy array which describes the policy
                 distribution for each board state (in case of a pgn dataset
                 the move is one hot encoded)
    """
    return get_numpy_arrays(
        zarr.group(store=zarr.ZipStore(dataset_filepath, mode="r")))

def open_group(path):
    if path.endswith("h5"):
        return h5py.File(path, "r")
    elif path.endswith(("zarr2", "zarr")):
        return zarr.open_group(path, "r")
    elif path.endswith("zip"):
        zz = zarr.ZipStore(path)
        return zarr.Group(zz)
    else:
        raise ValueError(
            "Bad filepath provided: {0}. Only hdf5/zarr supported.".format(path))

def _load_dataset_file(dataset_filepath):
    """
    Loads a single dataset file given by its path

    :param dataset_filepath: path where the file is located
    :return: tuple of (start indices, x, y_value, y_policy) numpy arrays
    """
    store = zarr.ZipStore(dataset_filepath, mode='r')
    pgn_dataset = zarr.group(store=store)
    s_idcs, x, yv, yp = get_numpy_arrays(pgn_dataset)
    return s_idcs, x, yv, yp

def __getitem__(self, idx) -> DynamicNamedTuple:
    feat_names = list(self.features.keys())
    lock = FileLock(_lockfile(self.path))
    with lock, zarr.ZipStore(self.path, mode="r") as store:
        root = zarr.group(store=store)
        data_dict = {
            feat_name: root[feat_name][idx]
            for feat_name in feat_names
        }
    return (ItemViewMeta(feat_names)(**data_dict)
            if isinstance(idx, int)
            else ItemSetViewMeta(feat_names)(**data_dict))

def compress(ts, path):
    """
    Compresses the specified tree sequence and writes it to the specified path.
    """
    logging.info("Compressing to {}".format(path))
    try:
        store = zarr.ZipStore(path, mode='w')
        root = zarr.group(store=store)
        compress_zarr(ts, root)
        store.close()
    except Exception:
        # Don't leave a half-written file behind on failure.
        os.unlink(path)
        raise

def run(dataset: str, dataset_dir="data/dataset"):
    dataset_dir = Path(dataset_dir)
    plink_path = dataset_dir / dataset / "genotypes"
    zarr_path = dataset_dir / dataset / "genotypes.zarr.zip"
    ds = read_plink(path=plink_path, bim_sep="\t", fam_sep="\t")
    # Pre-compute string lengths until this is done:
    # https://github.com/pystatgen/sgkit-plink/issues/12
    ds = ds.compute()
    logger.info(f"Loaded dataset {dataset}:")
    logger.info("\n" + str(ds))
    store = zarr.ZipStore(zarr_path, mode="w")
    ds.to_zarr(store, mode="w")
    store.close()
    logger.info(f"Conversion to zarr at {zarr_path} successful")

def check_simulation_result(
    datadir: Path, config: Dict[str, Any], run: Dict[str, Any]
) -> None:
    # Extract properties for simulation
    dataset, paramset = run["dataset"], run["paramset"]
    ds_config = config["datasets"][dataset]
    ps_config = config["paramsets"][paramset]
    dataset_dir = datadir / "dataset" / dataset
    result_dir = datadir / "result" / run["name"]

    # Load simulated data
    with zarr.ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
        ds = xr.open_zarr(store)  # type: ignore[no-untyped-call]
        df_covariate = load_covariates(dataset_dir)
        df_trait = load_traits(dataset_dir)
        contigs = ds["variant_contig"].values
        G = ds["call_genotype"].sum(dim="ploidy").values
        X = df_covariate.values
        Y = df_trait.values

        # Define transformed traits
        res = regenie_transform(
            G.T,
            X,
            Y,
            contigs,
            variant_block_size=ps_config["variant_block_size"],
            sample_block_size=ps_config["sample_block_size"],
            normalize=True,
            add_intercept=False,
            alphas=ps_config["alphas"],
            orthogonalize=False,
            # Intentionally make mistakes related to these flags
            # in order to match Glow results
            _glow_adj_dof=True,
            _glow_adj_scaling=True,
            _glow_adj_alpha=True,
        )
        YBP = res["base_prediction"].data
        YMP = res["meta_prediction"].data

        # Check equality of stage 1 and 2 transformations
        check_stage_1_results(YBP, ds_config, ps_config, result_dir)
        check_stage_2_results(YMP, df_trait, result_dir)

        # Check equality of GWAS results
        YR = Y - YMP
        stats = linear_regression(G.T, X, YR)
        check_stage_3_results(ds, stats, df_trait, result_dir)

def test_backwards_compat_zarr():
    import scanpy as sc
    import zarr

    pbmc_orig = sc.datasets.pbmc68k_reduced()
    # Old zarr writer couldn't do sparse arrays
    pbmc_orig.raw._X = pbmc_orig.raw.X.toarray()
    del pbmc_orig.uns["neighbors"]
    # This was written out with anndata=0.6.22.post1
    zarrpth = HERE / "data/pbmc68k_reduced_legacy.zarr.zip"

    with zarr.ZipStore(zarrpth, mode="r") as z:
        pbmc_zarr = ad.read_zarr(z)

    assert_equal(pbmc_zarr, pbmc_orig)

def compress_zarr_dataset(data, file_path, compression='lz4', clevel=5,
                          start_idx=0, end_idx=0):
    """
    Loads in a zarr data set and exports it with a given compression type and level.

    :param data: Zarr data set which will be compressed
    :param file_path: File name path where the data will be exported
        (e.g. "./export/data.zip")
    :param compression: Compression type
    :param clevel: Compression level
    :param start_idx: Starting index of data to be exported.
    :param end_idx: If end_idx != 0 the data set will be exported up to the
        specified index, excluding the sample at end_idx
        (e.g. end_idx = len(x) will export it fully)
    :return: True if a NaN value was detected
    """
    compressor = Blosc(cname=compression, clevel=clevel, shuffle=Blosc.SHUFFLE)

    # open a dataset file and create arrays
    store = zarr.ZipStore(file_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)

    nan_detected = False
    for key in data.keys():
        if end_idx == 0:
            x = data[key]
        else:
            x = data[key][start_idx:end_idx]
        if np.isnan(x).any():
            nan_detected = True

        array_shape = list(x.shape)
        array_shape[0] = 128
        # export array
        zarr_file.create_dataset(
            name=key,
            data=x,
            shape=x.shape,
            dtype=type(x.flatten()[0]),
            chunks=array_shape,
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )
    store.close()
    logging.info("dataset was exported to: %s", file_path)
    return nan_detected

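A read-back sketch for archives written by `compress_zarr_dataset`; the path and the `"x"` key are assumptions for illustration.

# Hypothetical read-back; the path and the "x" key are assumptions.
store = zarr.ZipStore("./export/data.zip", mode="r")
group = zarr.group(store=store)
x = group["x"][:]  # materialize one array into memory
store.close()
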
def setup_input(samples, input_pattern, seqid, field):
    log('Setting up input array ...')
    input_paths = [input_pattern.format(sample=s) for s in samples]
    input_stores = [zarr.ZipStore(ip, mode='r') for ip in input_paths]
    input_roots = [zarr.group(store) for store in input_stores]
    input_arrays = [
        root[s][seqid][field] for root, s in zip(input_roots, samples)
    ]
    input_arrays = [da.from_array(a, chunks=a.chunks) for a in input_arrays]
    # here we add a dim to allow the hstack to work. must share the shape (X, 1, )
    input_arrays = [a[:, None] if a.ndim == 1 else a for a in input_arrays]
    input_array = da.hstack(input_arrays)
    log('Input array:', input_array)
    return input_array

def save_results(conn, image, data, dataset, path):
    filename, file_extension = os.path.splitext(image.getName())
    # Save the probabilities file as an image
    print("Saving Probabilities as zarr file attached to the original Image")
    name = filename + "_Probabilities_zarr.zip"
    desc = "ilastik probabilities from Image:%s" % image.getId()
    # Re-organise array from tzyxc to zctyx order expected by OMERO
    # data = data.swapaxes(0, 1).swapaxes(3, 4).swapaxes(2, 3).swapaxes(1, 2)
    namespace = "ilastik.zarr.demo"
    fp = os.path.join(path, name)
    with zarr.ZipStore(fp, mode='w') as store:
        zarr.array(data, store=store, dtype='int16',
                   compressor=zarr.Blosc(cname='zstd'))
    ann = conn.createFileAnnfromLocalFile(fp, mimetype="application/zip",
                                          ns=namespace, desc=desc)
    image.linkAnnotation(ann)