def _compute(self):
    """Materialize the rows of ``self.pcollection`` via a temporary zarr store.

    Every ``(index, row)`` element of the collection is written to the store
    as an array named after its index. The pipeline is then run to
    completion and the stored arrays are gathered into a list, positioned by
    their index.

    Returns:
        A list with ``len(self.partition_row_counts)`` entries holding the
        arrays read back from the store (``None`` where no array was found).
    """
    step_label = gensym("asndarray")
    store = zarr.TempStore()
    root = zarr.open(store, mode="w")  # TODO: allow cloud storage

    def save(indexed_row):
        # Persist a single row under its index.
        key, values = indexed_row
        # Drop any array left from an earlier materialization of this key.
        zarr.storage.rmdir(store, "/{}".format(key))
        zarr.group(store).array(str(key), values, chunks=False)

    self.pcollection | step_label >> beam.Map(save)
    run_result = self.pipeline.run()
    run_result.wait_until_finish()

    # Collect the arrays back, using each array name as its list position.
    local_rows = [None] * len(self.partition_row_counts)
    for array_name, stored in root.arrays():
        local_rows[int(array_name)] = stored
    return local_rows
def load(savepath, lazy: bool = False, normalize_strings: bool = True, use_temp: bool = False):
    """Load a dataset from a zipped zarr store.

    Args:
        savepath: Location of the zip file, passed to ``zarr.ZipStore``.
        lazy (bool, optional): If False, ``dataset.load()`` is called and the
            backing store is closed before returning. If True, the dataset
            is returned as opened and the backing store is left open.
            Defaults to False.
        normalize_strings (bool, optional): Pass the dataset through
            ``_normalize_strings`` before returning it. Defaults to True.
        use_temp (bool, optional): Copy the zip contents into a
            ``zarr.TempStore`` and open the dataset from there - potentially
            speeds up loading and allows overwriting the existing zip file.
            Defaults to False.

    Returns:
        The dataset produced by ``xr.open_zarr`` (after optional string
        normalization).
    """
    zarr_store = zarr.ZipStore(savepath, mode='r')
    if use_temp:
        dest = zarr.TempStore()
        try:
            zarr.copy_store(zarr_store, dest)
        finally:
            # Release the zip file handle even when the copy fails.
            zarr_store.close()
        zarr_store = dest

    try:
        dataset = xr.open_zarr(zarr_store)
        if not lazy:
            dataset.load()
    except Exception:
        # Do not leak the open store when opening or loading fails.
        zarr_store.close()
        raise

    if not lazy:
        # Data was loaded above, so the store is no longer needed.
        zarr_store.close()

    if normalize_strings:
        dataset = _normalize_strings(dataset)
    return dataset
def _set_defaults(self, kwargs):
    """Fill in default keyword arguments, including a temporary zarr store.

    The ``suffix``, ``prefix`` and ``dir`` entries are removed from
    ``kwargs`` and used to configure a ``zarr.TempStore``, which is installed
    under the ``'store'`` key unless the caller already provided a store.

    Args:
        kwargs: Keyword arguments to complete; first processed by the parent
            class' ``_set_defaults``.

    Returns:
        The keyword arguments with ``'store'`` set.
    """
    kwargs = super(ZarrTmpStorage, self)._set_defaults(kwargs)
    suffix = kwargs.pop('suffix', '.zarr')
    prefix = kwargs.pop('prefix', 'scikit_allel_')
    tmp_dir = kwargs.pop('dir', None)
    # Build the temporary store lazily: constructing it unconditionally (as
    # an argument to setdefault) would instantiate a TempStore even when the
    # caller supplied their own store and the new one is thrown away.
    if 'store' not in kwargs:
        kwargs['store'] = zarr.TempStore(
            suffix=suffix, prefix=prefix, dir=tmp_dir)
    return kwargs
def test_write_zarr(self, adata, adata_dist):
    """Check that a log1p-transformed distributed X round-trips through zarr.

    The metadata is written with regular anndata, the distributed ``X`` is
    written into the ``"X"`` path of the same temporary store, and the
    result read back must match ``log1p`` applied to the plain ``adata``.
    """
    log1p(adata_dist)
    temp_store = zarr.TempStore()
    chunks = adata_dist.X.chunks

    # Write metadata using regular anndata.
    adata.write_zarr(temp_store, chunks)

    # Dask arrays are written without an explicit chunks argument; any other
    # array type receives the chunks positionally.
    dist_x = adata_dist.X
    extra_args = () if isinstance(dist_x, da.Array) else (chunks,)
    dist_x.to_zarr(temp_store.dir_path("X"), *extra_args)

    # Read back as zarr (without using RDDs) and compare with adata.X.
    roundtripped = ad.read_zarr(temp_store)
    log1p(adata)
    npt.assert_allclose(roundtripped.X, adata.X)
def xd_and_temp_store(self, sc, x, xz, chunks, request):
    """Yield an ``(array, store)`` pair for the backend named by ``request.param``.

    The array is built with the matching ``zappy`` constructor, either from a
    copy of ``x`` (``*_ndarray``) or from ``xz`` (``*_zarr``). The store is a
    fresh ``zarr.TempStore`` for every backend except ``pywren_ndarray``,
    which uses an S3 mapping inside a newly created bucket that is removed
    once the generator resumes after the yield.
    """
    kind = request.param

    if kind == "direct_ndarray":
        yield zappy.direct.from_ndarray(x.copy(), chunks), zarr.TempStore()
        return

    if kind == "direct_zarr":
        yield zappy.direct.from_zarr(xz), zarr.TempStore()
        return

    if kind in ("executor_ndarray", "executor_zarr"):
        # The thread pool stays open for as long as the consumer holds the
        # yielded value.
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
            if kind == "executor_ndarray":
                xd = zappy.executor.from_ndarray(pool, x.copy(), chunks)
            else:
                xd = zappy.executor.from_zarr(pool, xz)
            yield xd, zarr.TempStore()
        return

    if kind == "spark_ndarray":
        yield zappy.spark.from_ndarray(sc, x.copy(), chunks), zarr.TempStore()
        return

    if kind == "spark_zarr":
        yield zappy.spark.from_zarr(sc, xz), zarr.TempStore()
        return

    if kind in ("beam_ndarray", "beam_zarr"):
        pipeline = beam.Pipeline(options=PipelineOptions())
        if kind == "beam_ndarray":
            xd = zappy.beam.from_ndarray(pipeline, x.copy(), chunks)
        else:
            xd = zappy.beam.from_zarr(pipeline, xz)
        yield xd, zarr.TempStore()
        return

    if kind == "pywren_ndarray":
        import uuid

        import s3fs.mapping

        s3 = s3fs.S3FileSystem()
        # Unique bucket name: fixed prefix plus a dash-free UUID.
        bucket = "%s-%s" % ("zappy-test", str(uuid.uuid4()).replace("-", ""))
        s3.mkdir(bucket)
        s3store = s3fs.mapping.S3Map("%s/%s" % (bucket, "test.zarr"), s3=s3)
        pywren_executor = zappy.executor.PywrenExecutor()
        yield zappy.executor.from_ndarray(pywren_executor, x.copy(), chunks), s3store
        # Remove the bucket and its contents once the consumer is done.
        s3.rm(bucket, recursive=True)
def test_write_zarr(self, adata, adata_dist):
    """Check that a log1p-transformed distributed X round-trips through zarr.

    Metadata is written with regular anndata, the distributed ``X`` is then
    written to the ``"X"`` path of the same temporary store, and the data
    read back must match ``log1p`` applied to the plain ``adata``.
    """
    import dask.array as da
    import zarr

    log1p(adata_dist)
    temp_store = zarr.TempStore()

    chunks = adata_dist.X.chunks
    leading = chunks[0]
    if isinstance(leading, tuple):
        # Per-block chunk sizes on the first axis: keep only the first block
        # size there and append the second axis' entry.
        chunks = (leading[0], ) + chunks[1]

    # Write metadata using regular anndata.
    adata.write_zarr(temp_store, chunks)

    dist_x = adata_dist.X
    x_path = temp_store.dir_path("X")
    if isinstance(dist_x, da.Array):
        dist_x.to_zarr(x_path, overwrite=True)
    else:
        dist_x.to_zarr(x_path, chunks)

    # Read back as zarr directly and compare with adata.X.
    roundtripped = ad.read_zarr(temp_store)
    log1p(adata)
    npt.assert_allclose(roundtripped.X, adata.X)
def test_run_batch_dim(self, dims, data, clock, parallel, scheduler):
    """Run a one-process model with ``batch_dim="batch"`` and check its output.

    The process doubles its input variable, so the ``p__out_var`` output is
    expected to equal the input data times two, carrying the ``clock``
    coordinate of the input dataset whenever ``clock`` is not None.
    """

    @xs.process
    class P:
        in_var = xs.variable(dims=[(), "x"])
        out_var = xs.variable(dims=[(), "x"], intent="out")
        idx_var = xs.index(dims="x")

        def initialize(self):
            self.idx_var = [0, 1]

        def run_step(self):
            self.out_var = self.in_var * 2

    model = xs.Model({"p": P})

    in_ds = xs.create_setup(
        model=model,
        clocks={"clock": [0, 1, 2]},
        input_vars={"p__in_var": (dims, data)},
        output_vars={"p__out_var": clock},
    )

    out_ds = in_ds.xsimlab.run(
        model=model,
        batch_dim="batch",
        parallel=parallel,
        scheduler=scheduler,
        store=zarr.TempStore(),
    )

    # The expected array only has a clock coordinate when a clock was given.
    coords = {} if clock is None else {"clock": in_ds["clock"]}
    expected = xr.DataArray(data, dims=dims, coords=coords) * 2

    xr.testing.assert_equal(out_ds["p__out_var"], expected)
def offcore_array(
    shape: Union[Tuple[int, ...], Generator[int, None, None]],
    dtype: numpy.dtype,
    force_memmap: bool = False,
    zarr_allowed: bool = False,
    no_memmap_limit: bool = True,
    max_memory_usage_ratio: float = 0.9,
):
    """
    Instantiates an array of given shape and dtype in 'off-core' fashion i.e.
    not in main memory when it does not fit. Falls back from a regular numpy
    array, to a memory mapped temporary file (deleted after the file is
    closed), to a zarr-backed array.

    Parameters
    ----------
    shape
        Shape of the requested array. Generators are materialised into a
        tuple before use.
    dtype
        Data type of the requested array.
    force_memmap
        If True, never allocate a regular in-memory numpy array, even when
        it would fit.
    zarr_allowed
        If True, a zarr-backed array may be used. Only consulted when
        `no_memmap_limit` is False.
    no_memmap_limit
        If True, a memory mapped array is used whenever the in-memory
        allocation is not chosen.
    max_memory_usage_ratio
        Fraction of the total (physical + swap) memory that the array may
        occupy and still be allocated in memory.

    Returns
    -------
    The allocated numpy array, numpy memmap or zarr array.

    Raises
    ------
    MemoryError
        If the in-memory allocation is not chosen and neither the memory
        mapped nor the zarr fallback is permitted.
    """
    # Materialise the shape once: a generator would otherwise be exhausted by
    # the size computation and be empty by the time the array is allocated.
    if isinstance(shape, (int, numpy.integer)):
        shape = (int(shape),)
    else:
        shape = tuple(shape)

    with lsection(f"Array of shape: {shape} and dtype: {dtype} requested"):

        # Use an explicit 64 bit accumulator so that large shapes do not
        # overflow on platforms where the default integer is 32 bit.
        num_elements = int(numpy.prod(shape, dtype=numpy.int64))
        size_in_bytes = num_elements * numpy.dtype(dtype).itemsize
        lprint(f'Array requested will be {(size_in_bytes / 1E6)} MB.')

        total_physical_memory_in_bytes = psutil.virtual_memory().total
        total_swap_memory_in_bytes = psutil.swap_memory().total
        total_mem_in_bytes = (total_physical_memory_in_bytes +
                              total_swap_memory_in_bytes)
        lprint(
            f'There is {int(total_physical_memory_in_bytes / 1E6)} MB of physical memory'
        )
        lprint(
            f'There is {int(total_swap_memory_in_bytes / 1E6)} MB of swap memory'
        )
        lprint(f'There is {int(total_mem_in_bytes / 1E6)} MB of total memory')

        is_enough_total_memory = (
            size_in_bytes < max_memory_usage_ratio * total_mem_in_bytes)

        if not force_memmap and is_enough_total_memory:
            lprint(
                'There is enough physical+swap memory -- we do not need to use a mem mapped array or zarr-backed array.'
            )
            array = numpy.zeros(shape, dtype=dtype)

        elif no_memmap_limit:
            lprint(
                'There is not enough physical+swap memory -- we will use a mem mapped array.'
            )
            temp_file = tempfile.NamedTemporaryFile(
                dir=OffCore.memmap_directory)
            lprint(
                f'The temporary memory mapped file is at: {temp_file.name} (but you might not be able to see it!)'
            )
            array = numpy.memmap(temp_file,
                                 dtype=dtype,
                                 mode='w+',
                                 shape=shape)

        elif zarr_allowed:
            lprint(
                'There is not enough physical+swap memory -- we will use a zarr-backed array.'
            )
            import zarr
            array = zarr.create(shape=shape,
                                dtype=dtype,
                                store=zarr.TempStore("output.zarr"))

        else:
            # Previously this path fell through to the return statement and
            # failed with an UnboundLocalError; fail with a clear message.
            raise MemoryError(
                f'Cannot allocate array of shape {shape} and dtype {dtype}: '
                'in-memory allocation was not selected and neither a memory '
                'mapped array (no_memmap_limit) nor a zarr-backed array '
                '(zarr_allowed) is permitted.')

        return array
def write_image_by_tile(
    self,
    image_name: str,
    output_dir: Union[Path, str] = "",
    write_pyramid: bool = True,
    compression: Optional[str] = "default",
    zarr_temp_dir: Optional[Union[str, Path]] = None,
) -> Optional[str]:
    """
    Write images to OME-TIFF from temp zarr store with data.

    Parameters
    ----------
    image_name: str
        file path stem of the image to be written
    output_dir: Union[str,Path]
        directory where image is to be written
    write_pyramid: bool
        whether to write a pyramid or single layer
    compression: str
        Use compression. "default" selects "jpeg" for RGB images and
        "deflate" for non-RGB images; any other value is passed through
        to the TIFF writer unchanged
    zarr_temp_dir: Path or str
        Directory to store the temporary zarr data (mostly used for debugging)

    Returns
    -------
    output_file_name: str or None
        Path to written image file. If writing fails, the error is printed
        and None is returned.
    """
    zstr = zarr.TempStore(dir=zarr_temp_dir)
    # Bound before the try block so the cleanup below can tell whether the
    # tiles were ever written (the handler used to hit an unbound name when
    # `write_tiles_to_zarr_store` itself failed).
    resample_zarray = None
    try:
        resample_zarray = self.write_tiles_to_zarr_store(zstr)
        output_file_name = str(Path(output_dir) / f"{image_name}.ome.tiff")

        if compression == "default":
            print("using default compression")
            compression = "jpeg" if self.reg_image.is_rgb else "deflate"

        (
            n_pyr_levels,
            subifds,
            _out_tile_shape,
            omexml,
        ) = self._prepare_image_info(
            image_name, write_pyramid=write_pyramid
        )

        print(f"saving to {output_file_name}")
        dask_image = da.from_zarr(resample_zarray)
        options = dict(
            tile=self.tile_shape,
            compression=compression,
            photometric="rgb" if self.reg_image.is_rgb else "minisblack",
            metadata=None,
        )

        with TiffWriter(output_file_name, bigtiff=True) as tif:
            if self.reg_image.is_rgb:
                print(
                    f"writing base layer RGB - shape: {dask_image.shape}"
                )
                tile_iterator = self._transformed_tile_generator(
                    dask_image, 0
                )
                tif.write(
                    tile_iterator,
                    subifds=subifds,
                    description=omexml,
                    shape=dask_image.shape,
                    dtype=dask_image.dtype,
                    **options,
                )
                if write_pyramid:
                    for pyr_idx in range(1, n_pyr_levels):
                        sub_res = compute_sub_res(
                            dask_image,
                            pyr_idx,
                            self.tile_shape[0],
                            self.reg_image.is_rgb,
                            self.reg_image.im_dtype,
                        )
                        print(
                            f"pyr {pyr_idx} : RGB-shape: {sub_res.shape}"
                        )
                        sub_res_tile_iterator = (
                            self._transformed_tile_generator(sub_res, 0)
                        )
                        tif.write(
                            sub_res_tile_iterator,
                            shape=sub_res.shape,
                            dtype=self.reg_image.im_dtype,
                            **options,
                            subfiletype=1,
                        )
            else:
                for channel_idx in range(self.reg_image.n_ch):
                    # Only the first channel carries the OME-XML description.
                    description = omexml if channel_idx == 0 else None
                    print(
                        f"writing channel {channel_idx} - shape: {dask_image.shape[1:]}"
                    )
                    tile_iterator = self._transformed_tile_generator(
                        dask_image, channel_idx
                    )
                    tif.write(
                        tile_iterator,
                        subifds=subifds,
                        description=description,
                        shape=dask_image.shape[1:],
                        dtype=dask_image.dtype,
                        **options,
                    )
                    if write_pyramid:
                        for pyr_idx in range(1, n_pyr_levels):
                            sub_res = compute_sub_res(
                                dask_image,
                                pyr_idx,
                                self.tile_shape[0],
                                self.reg_image.is_rgb,
                                self.reg_image.im_dtype,
                            )
                            sub_res_tile_iterator = (
                                self._transformed_tile_generator(
                                    sub_res, channel_idx
                                )
                            )
                            tif.write(
                                sub_res_tile_iterator,
                                shape=sub_res.shape[1:],
                                dtype=dask_image.dtype,
                                **options,
                                subfiletype=1,
                            )
        return output_file_name

    except Exception as e:
        # Failures are reported rather than raised, matching the existing
        # contract of this method; callers receive None.
        print(e)
        return None

    finally:
        # Always clear temporary storage, on success and on failure alike.
        # Fall back to the store created above when no array was produced.
        tmp_store = zstr if resample_zarray is None else resample_zarray.store
        try:
            tmp_store.clear()
        except FileNotFoundError:
            pass
def generate_coalescent_synthetic_data(num_samples=1000, num_bases=1e7, Ne=1e4, mu=3.5e-9, rrate=1e-8, ploidy=2,
                                       seed=57):
    """
    Simulate genotype data with msprime and store it as ``calldata/GT`` in a
    zarr directory store at ``ZARR_PATH``.

    The genotypes of every simulated variant are first collected in an
    uncompressed temporary zarr array and then copied in a single assignment
    into the final, compressed array.

    Function credits: Nick Harding
    Reference URL: https://hardingnj.github.io/2017/08/23/power-of-correct-tools.html
    """
    tree_sequence = msprime.simulate(
        sample_size=num_samples * ploidy,
        Ne=Ne,
        length=num_bases,
        recombination_rate=rrate,
        mutation_rate=mu,
        random_seed=seed,
        model="dtwf",
    )
    n_mutations = tree_sequence.get_num_mutations()

    # Print the number of mutations in tree sequence
    print("Simulated ", n_mutations, "mutations")

    print("Creating Zarr data store root")
    root = zarr.group(store=zarr.DirectoryStore(ZARR_PATH), overwrite=True)

    print('Creating Zarr Array')
    z_shape = (n_mutations, num_samples, ploidy)
    z = root.empty(
        'calldata/GT',
        shape=z_shape,
        chunks=(VARIANTS_PER_CHUNK, SAMPLES_PER_CHUNK, PLOIDY_PER_CHUNK),
        dtype='i1',
        compressor=Blosc(cname='zstd', clevel=1, shuffle=Blosc.AUTOSHUFFLE),
    )

    print('Creating temporary Zarr Array for holding data')
    temp_root = zarr.group(store=zarr.TempStore(dir='./'), overwrite=True)
    temp_z = temp_root.empty(
        'calldata/GT',
        shape=z_shape,
        chunks=(TEMP_VARIANTS_PER_CHUNK, TEMP_SAMPLES_PER_CHUNK, TEMP_PLOIDY_PER_CHUNK),
        dtype='i1',
        compressor=None,
    )

    # Report the dimensions as recorded on the final array.
    num_variants, num_samples, num_ploidy = z.shape
    print('Num Samples: {}'.format(num_samples))
    print('Num Variants: {}'.format(num_variants))
    print('Ploidy: {}'.format(num_ploidy))
    print("Variation rate: {}".format(num_variants / num_bases))

    bar = ProgressBar(n_mutations, max_width=80)

    print("Pulling variant data...")
    for processed, variant in enumerate(tree_sequence.variants()):
        # Redraw the progress bar in place before handling this variant.
        bar.numerator = processed
        print(bar, end='\r')
        sys.stdout.flush()

        temp_z[variant.index, :, :] = variant.genotypes.reshape((num_samples, ploidy))

    # Store data in final data store
    z[:, :, :] = temp_z

    print('Done.\n')
    print(z.info)