def main(cfg):
    """Rescale every training image/mask pair and store it in a zarr database.

    Reads ``data_dir``, ``out_dir``, ``version``, ``scale`` and ``chunk_size``
    from ``cfg``. For each row of the "train" table one group (named after
    the row index) is created, holding the downscaled image under ``"img"``
    and the downscaled mask under ``"target"``.
    """
    dataset = HubmapDataset(cfg["data_dir"], cfg["out_dir"])
    zarr_dir = dataset.path.out / "zarr" / cfg["version"] / f"db.zarr"
    database = zarr.group(store=zarr.DirectoryStore(zarr_dir), overwrite=False)

    # Materialise the rows up front so tqdm knows the total.
    train_rows = list(dataset.get_inf("train").iterrows())
    for id_, _ in tqdm(train_rows):
        database.create_group(id_)
        image = dataset.get_img(id_)
        mask = dataset.get_msk(id_, "target")
        full_shape = dataset.get_shape(id_)

        resize = A.Resize(
            height=int(full_shape[0] / cfg["scale"]),
            width=int(full_shape[1] / cfg["scale"]),
            p=1.0,
        )
        resized = resize(image=image, mask=mask)
        image = resized["image"]
        mask = resized["mask"]
        # Drop the intermediate dict eagerly to keep peak memory down.
        del resized
        gc.collect()

        tile = cfg["chunk_size"]
        database[id_]["img"] = zarr.array(image, chunks=(tile, tile, 3))
        database[id_]["target"] = zarr.array(mask, chunks=(cfg["chunk_size"], cfg["chunk_size"]))
        del image, mask
        gc.collect()
def convert_to_zarr(df, store_type, chunks):
    """Write a DataFrame's values to a temporary zarr store as float32.

    Parameters
    ----------
    df
        Two-dimensional pandas DataFrame to serialise.
    store_type
        Name of a store class exposed by the ``zarr`` module
        (looked up with ``getattr(zarr, store_type)``).
    chunks
        Requested ``(rows, cols)`` chunk shape; each entry is clipped to the
        corresponding DataFrame dimension.

    Returns
    -------
    The path of the temporary ``.zarr`` location that was written.
    """
    path = _get_temp_path(".zarr")
    n_rows, n_cols = df.shape
    adj_chunks = (min(n_rows, chunks[0]), min(n_cols, chunks[1]))
    store = getattr(zarr, store_type)(path)
    try:
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
        # supported replacement.
        zarr.array(df.to_numpy(), store=store, chunks=adj_chunks, dtype='f4')
    finally:
        # File-backed stores (e.g. ZipStore) only finalise their output on
        # close; stores without a close() are left untouched.
        close = getattr(store, "close", None)
        if callable(close):
            close()
    return path
def classo_to_dir(problem):
    """Serialise the PATH solution of a c-lasso problem into a directory format.

    Only the PATH solution is handled for now. When ``problem.solution.PATH``
    is a string nothing is written. Otherwise LAMBDAS and BETAS (plus SIGMAS,
    unless SIGMAS is a string) are stacked into one array and written to the
    PATH file of the result.

    Returns the ``CLASSOProblemDirectoryFormat`` instance.
    """
    result = CLASSOProblemDirectoryFormat()

    # only do it for PATH for now
    solPATH_classo, solPATH_file = problem.solution.PATH, result.PATH
    if not isinstance(solPATH_classo, str):
        components = [solPATH_classo.LAMBDAS, solPATH_classo.BETAS]
        if not isinstance(solPATH_classo.SIGMAS, str):
            components.append(solPATH_classo.SIGMAS)
        data = zarr.array(np.array(components))

        with solPATH_file.open() as fh:
            # NOTE(review): is it how it is supposed to be done ?? Confirm
            # that the object returned by zarr.array() exposes a
            # write(fh, format=...) method in this environment.
            data.write(fh, format=SolutionFormat)

    return result
def save_results(conn, image, data, dataset, path):
    """Store ``data`` as a zipped zarr array and attach it to ``image``.

    The archive is written into ``path``, named after the image (extension
    stripped), uploaded through ``conn`` as a file annotation in the
    ``ilastik.zarr.demo`` namespace, and linked to ``image``.
    ``dataset`` is accepted but not used here.
    """
    stem = os.path.splitext(image.getName())[0]

    # Save the probabilities file as an image
    print("Saving Probabilities as zarr file attached to the original Image")
    archive_name = stem + "_Probabilities_zarr.zip"
    description = "ilastik probabilities from Image:%s" % image.getId()

    # Re-organise array from tzyxc to zctyx order expected by OMERO
    # data = data.swapaxes(0, 1).swapaxes(3, 4).swapaxes(2, 3).swapaxes(1, 2)
    namespace = "ilastik.zarr.demo"
    fp = os.path.join(path, archive_name)

    # The zip store must be closed before the file is uploaded.
    with zarr.ZipStore(fp, mode='w') as store:
        zarr.array(data, store=store, dtype='int16',
                   compressor=zarr.Blosc(cname='zstd'))

    annotation = conn.createFileAnnfromLocalFile(
        fp, mimetype="application/zip", ns=namespace, desc=description)
    image.linkAnnotation(annotation)
def read_box(
    self,
    bBox: BBox,
    outer_points: Union[bool, int] = False,
    aszarr: bool = False,
) -> Union[np.ndarray, zarr.Array]:
    """Read the part of the geotiff covered by a bounding box.

    Args:
        bBox (BBox): A bounding box.
        outer_points (Union[bool, int]): Forwarded to ``get_int_box``; an
            int (n) requests n extra layers of points/pixels directly
            surrounding the bBox. Defaults to False.
        aszarr (bool): If True, return the selection wrapped in a zarr
            array. If False, return it as a numpy array held in memory.
            Defaults to False.

    Returns:
        Union[np.ndarray, zarr.Array]: The boxed section of the geotiff.
    """
    (left, top), (right, bottom) = self.get_int_box(bBox, outer_points=outer_points)
    # Rows are indexed by y, columns by x.
    window = self.read()[top:bottom, left:right]
    if not aszarr:
        return np.array(window)
    return zarr.array(window)
def array(self, data, expectedlen=None, **kwargs):
    """Create a zarr array from ``data`` using this storage's defaults.

    ``chunks`` defaults to ``default_chunks(data, expectedlen)``. For
    object-dtype data an ``object_codec`` is chosen from the first element
    (bytes -> VLenBytes, str -> VLenUTF8, anything else -> MsgPack) unless
    the caller already supplied one. All remaining keyword arguments are
    forwarded to ``zarr.array``.
    """
    # setup
    data = _util.ensure_array_like(data)
    kwargs = self._set_defaults(kwargs)

    # determine chunks
    kwargs.setdefault('chunks', default_chunks(data, expectedlen))

    # determine object codec; skip the peek when the caller already chose one
    if data.dtype == object and 'object_codec' not in kwargs:
        try:
            # peek at first value
            peek = data[0]
        except IndexError:
            # Nothing to inspect (empty data): fall through to the
            # general-purpose codec instead of failing.
            peek = None
        if isinstance(peek, bytes):
            object_codec = numcodecs.VLenBytes()
        elif isinstance(peek, str):
            object_codec = numcodecs.VLenUTF8()
        else:
            object_codec = numcodecs.MsgPack()
        kwargs['object_codec'] = object_codec

    # create
    z = zarr.array(data, **kwargs)
    return z
def get_doc_cluster_matrix(self, l: int = 0, normalize: bool = False) -> da.array:
    """Return the document-cluster matrix for hierarchy level ``l``.

    If the level has no cached ``"cluster"`` entry, the pickled hSBM model
    is loaded from ``<PROJDIR>/hSBM/<name>/model.pkl``, the matrix is
    computed, stored through ``Raw.from_dask_array`` and the object is
    saved.

    Args:
        l: Hierarchy level to read.
        normalize: If True, divide every row by its sum and return the
            result wrapped in a new zarr array.

    Raises:
        RuntimeError: The model file does not exist.
    """
    if "cluster" not in self.data[l]:
        path = common.PROJDIR / "hSBM" / self.name / "model.pkl"
        if not path.is_file():
            raise RuntimeError("File not found")
        # NOTE(review): unpickling executes arbitrary code; only load model
        # files from a trusted location.
        # Use a context manager so the file handle is closed after loading.
        with path.open("rb") as fh:
            model = dill.load(fh)

        def get_clusters(model: "sbmtm", l: int = 0) -> da.array:
            # rewrite from _sbmtm to use dask
            D = model.get_D()

            g = model.g
            state = model.state
            state_l = state.project_level(l).copy(overlap=True)
            state_l_edges = state_l.get_edge_blocks()  # labeled half-edges

            # count labeled half-edges, group-memberships
            B = state_l.get_B()

            id_d = np.zeros(g.edge_index_range, dtype=np.dtype(int))
            id_b = np.zeros(g.edge_index_range, dtype=np.dtype(int))
            weig = np.zeros(g.edge_index_range, dtype=np.dtype(int))

            for i, e in enumerate(g.edges()):
                id_b[i], _ = state_l_edges[e]
                id_d[i] = int(e.source())
                weig[i] = g.ep["count"][e]

            n_db = sparse.COO(
                [id_d, id_b], weig, shape=(D, B), fill_value=0
            )  # number of half-edges incident on document-node d and labeled as cluster

            del weig
            del id_b
            del id_d

            # keep only the columns that have at least one half-edge
            ind_d = np.where(np.sum(n_db, axis=0) > 0)[0]
            n_db = n_db[:, ind_d]
            del ind_d

            # Mixture of clusters into documents P(d | c)
            p_td_d = n_db / np.sum(n_db, axis=0).todense()[np.newaxis, :]

            return da.array(p_td_d).map_blocks(lambda b: b.todense(), dtype=np.dtype(float))

        self.data[l]["cluster"] = Raw.from_dask_array(
            self.path / f"clusters{l}.zarr.zip", get_clusters(model, l))
        self.save()

    doc_cluster = self.data[l]["cluster"].get()
    if normalize:
        return zarr.array(doc_cluster / (doc_cluster[:].sum(axis=1))[:, np.newaxis])
    return doc_cluster
def _write_existing_rootgroup(self, xarr: xr.Dataset, data_loc_copy: Union[list, np.ndarray], var_name: str,
                              dims_of_arrays: dict, chunksize: tuple, timlength: int, timaxis: int,
                              startingshp: tuple):
    """
    A slightly different operation than _write_new_dataset_rootgroup.  To write to an existing rootgroup array,
    we use the data_loc as an index and create a new zarr array from the xarray Dataarray.  The data_loc is only
    used if the var is a time based array.

    Parameters
    ----------
    xarr
        data to write to zarr
    data_loc_copy
        either [start time index, end time index] for xarr, ex: [0,1000] if xarr time dimension is 1000 long, or
        np.array([4,5,6,7,1,2...]) for when data might not be continuous and we need to use a boolean mask.
        When a list is given, its end index is corrected in place for a short final write.
    var_name
        variable name
    dims_of_arrays
        where keys are array names and values list of dims/shape.
        Example: 'beampointingangle': [['time', 'sector', 'beam'], (5000, 3, 400)]
    chunksize
        chunk shape used to create the zarr array
    timlength
        Length of the time dimension for the input xarray Dataset
    timaxis
        index of the time dimension
    startingshp
        desired shape for the rootgroup array, might be modified later for total beams if necessary.  If None,
        the rootgroup array is not resized.
    """

    # array to be written
    xarr_data = xarr[var_name].values
    if startingshp is not None:
        startingshp = self._write_adjust_max_beams(startingshp)
        self.rootgroup[var_name].resize(startingshp)

    if isinstance(data_loc_copy, list):  # [start index, end index]
        # the last write will often be less than the block size.  This is allowed in the zarr store, but we
        #    need to correct the index for it.
        if timlength != data_loc_copy[1] - data_loc_copy[0]:
            data_loc_copy[1] = data_loc_copy[0] + timlength

        # location for new data, assume constant chunksize (as we are doing this outside of this function)
        chunk_time_range = slice(data_loc_copy[0], data_loc_copy[1])
        # use the chunk_time_range for writes unless this variable is a non-time dim array (beam for example).
        # Select the time axis by position: looking the axis up by its size (shape.index(size)) picks the
        # wrong axis whenever two dimensions happen to have the same length.
        var_shape = dims_of_arrays[var_name][1]
        chunk_idx = tuple(chunk_time_range if axis == timaxis else slice(0, size)
                          for axis, size in enumerate(var_shape))
        self.rootgroup[var_name][chunk_idx] = zarr.array(xarr_data, shape=var_shape, chunks=chunksize)
    else:
        # np.array([4,5,6,1,2,3,8,9...]), indices of the new data, might not be sorted
        sorted_order = data_loc_copy.argsort()
        xarr_data = xarr_data[sorted_order]
        data_loc_copy = data_loc_copy[sorted_order]
        # build the mask from the stored shape only; np.zeros_like on the zarr array would first read the
        # entire array into memory
        zarr_mask = np.zeros(self.rootgroup[var_name].shape, dtype=bool)
        zarr_mask[data_loc_copy] = True
        # seems to require me to ravel first, examples only show setting with integer, not sure what is going on here
        self.rootgroup[var_name].set_mask_selection(zarr_mask, xarr_data.ravel())
def get_doc_topic_matrix(
    self,
    skip_hash_check: bool = False,
    normalize: bool = False,
) -> zarr.array:
    """Return the stored ``"doc_topic"`` matrix.

    With ``normalize=True`` every row is divided by its sum and the result
    is returned as a new zarr array; otherwise the stored object is returned
    unchanged. ``skip_hash_check`` is accepted but not used here.
    """
    matrix = self.data["doc_topic"].get()
    if not normalize:
        return matrix
    row_totals = matrix[:].sum(axis=1)
    return zarr.array(matrix / row_totals[:, np.newaxis])
def get_topic_word_matrix(
    self,
    skip_hash_check: bool = False,
    normalize: bool = False,
) -> zarr.array:
    """Return the stored ``"topic_word"`` matrix.

    With ``normalize=True`` every row is divided by its sum and the result
    is returned as a new zarr array; otherwise the stored object is returned
    unchanged. ``skip_hash_check`` is accepted but not used here.
    """
    matrix = self.data["topic_word"].get()
    if not normalize:
        return matrix
    row_totals = matrix[:].sum(axis=1)
    return zarr.array(matrix / row_totals[:, np.newaxis])
def save(self, file_name = 'sim'):
    """Serialise the object to ``state/<file_name>.zarr`` and return the group.

    The group is opened with mode ``'w'``. ``self.values`` is written to an
    int32 dataset named ``'values'`` of shape
    ``(L_with_boundary, L_with_boundary)``; ``L`` and ``save_every`` are
    stored as group attributes.

    NOTE: ``data_acquisition`` is not persisted by this method.
    """
    root = zarr.open_group('state/' + file_name + '.zarr', mode = 'w')
    values = root.create_dataset('values',
                                 shape = (self.L_with_boundary, self.L_with_boundary),
                                 chunks = (10, 10),
                                 dtype = 'i4')
    # Write into the dataset that lives in the store. Rebinding the name to
    # zarr.array(self.values) would only create a detached in-memory array
    # and leave the stored dataset empty.
    # assumes self.values has shape (L_with_boundary, L_with_boundary) — TODO confirm
    values[...] = self.values
    root.attrs['L'] = self.L
    root.attrs['save_every'] = self.save_every
    return root
def array(self, data, expectedlen=None, **kwargs):
    """Create a zarr array from ``data`` using this storage's defaults.

    ``chunks`` falls back to ``default_chunks(data, expectedlen)`` when the
    caller does not provide it; every other keyword argument is forwarded
    to ``zarr.array`` after ``_set_defaults`` has been applied.
    """
    data = _util.ensure_array_like(data)
    options = self._set_defaults(kwargs)

    # The fallback is always computed, matching setdefault's eager argument.
    fallback_chunks = default_chunks(data, expectedlen)
    if 'chunks' not in options:
        options['chunks'] = fallback_chunks

    return zarr.array(data, **options)
def test_numpy_writeable():
    """A write made through the HTTP store must be visible in the served array."""
    # Source data wrapped in a (writeable) in-memory zarr array.
    source = np.random.rand(1024, 1024)
    served = zarr.array(source)

    # Serve it through a Starlette app.
    app = Starlette(routes=[create_zarr_route(served)])

    # Write via the remote view, then compare both sides.
    remote = zarr.open_array(HTTPStore(TestClient(app)))
    remote[:50, :50] = 2
    np.testing.assert_allclose(remote[:], served[:])
def test(directory_path):
    """Write successively downsampled copies of a zarr array next to it.

    The array at ``directory_path`` is loaded fully into memory, repeatedly
    reduced with ``countless`` and each result is saved under
    ``<directory_path>/<level>``, counting the level index down to 0.
    """
    base = zarr.open(directory_path, mode='r+')
    chunk_counts = np.asarray(base.shape) / np.asarray(base.chunks)
    if np.unique(chunk_counts).size != 1:
        print("not all dimensions reduce equally; should never happen?")

    # already have first level of pyramid, hence the -1
    first_level = int(chunk_counts[0] ** (1 / 2)) - 1

    image = base[:]  # currently must fit into RAM, needs to scale
    for level in range(first_level, -1, -1):
        image = countless(image)
        reduced = zarr.array(image, chunks=base.chunks)
        zarr.save(os.path.join(directory_path, str(level)), reduced)
def __init__(self, store, data=None, store_type='sqlite', **kwarg):
    """Open (or create) a zarr group and optionally fill it from a record array.

    Parameters
    ----------
    store
        Location of the store; every ``-`` in it is replaced by ``/``.
    data
        Optional numpy array with a void ('V') dtype. Each named field is
        written to the group as its own array.
    store_type
        ``'sqlite'`` selects ``zarr.SQLiteStore``; any other value selects
        ``zarr.DirectoryStore``.
    **kwarg
        Forwarded to ``zarr.open_group``.

    Raises
    ------
    ValueError
        The parent directory of ``store`` is not readable.
    """
    self._store_type = store_type
    store = store.replace('-', '/')
    if not os.access(os.path.dirname(store), mode=os.R_OK):
        raise ValueError('Library does not exists')
    if store_type == 'sqlite':
        self.store = zarr.SQLiteStore(store)
    else:
        self.store = zarr.DirectoryStore(store)
    self.group = zarr.open_group(store=self.store, **kwarg)

    if isinstance(data, np.ndarray) and data.dtype.kind == 'V':
        # Only the SQLite store is driven through a cursor; using it for
        # the directory store would fail with AttributeError.
        transactional = store_type == 'sqlite'
        if transactional:
            self.store.cursor.execute('BEGIN TRANSACTION')
        try:
            # names is None for a void dtype without fields
            for col in data.dtype.names or ():
                self.group[col] = zarr.array(data[col])
        except BaseException:
            # Do not leave a half-written batch or an open transaction.
            if transactional:
                self.store.cursor.execute('ROLLBACK')
            raise
        else:
            if transactional:
                self.store.cursor.execute('COMMIT')
def spots_with_flow(config, spots):
    """Estimate spots from a flow prediction, reusing a cached one if valid.

    The input comes either from TIFF files (``config.tiff_input``) or from
    zarr stores (``config.zpath_input``). In the zarr case an MD5 digest of
    the model file, the two input frames and the patch size is compared
    with the stored hash for ``config.timepoint - 1``; on a match the stored
    flow is reused instead of running the model. The prediction is scaled
    by ``config.flow_norm_factor`` and handed to
    ``_estimate_spots_with_flow``, whose result is returned.
    """
    prediction = None
    if hasattr(config, 'tiff_input') and config.tiff_input is not None:
        img_input = np.array([skimage.io.imread(f) for f in config.tiff_input])
    elif config.zpath_input is not None:
        za_input = zarr.open(config.zpath_input, mode='a')
        za_flow = zarr.open(config.zpath_flow, mode='a')
        za_hash = zarr.open(config.zpath_flow_hashes, mode='a')

        # Hash the model file in 4096-byte chunks, see
        # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file#answer-3431838
        hash_md5 = hashlib.md5()
        with open(config.model_path, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b''):
                hash_md5.update(chunk)
        # Mix in the digest of the two frames used as input ...
        za_md5 = zarr.array(
            za_input[config.timepoint - 1:config.timepoint + 1]).digest('md5')
        hash_md5.update(za_md5)
        # ... and the patch size, so a change in any of them invalidates the cache.
        hash_md5.update(json.dumps(config.patch_size).encode('utf-8'))
        model_md5 = hash_md5.digest()
        if model_md5 == za_hash[config.timepoint - 1]:
            # Cache hit: reuse the stored flow.
            prediction = za_flow[config.timepoint - 1]
        else:
            img_input = np.array([
                normalize_zero_one(za_input[i].astype('float32'))
                for i in range(config.timepoint - 1, config.timepoint + 1)
            ])
    # NOTE(review): if neither input branch above runs, img_input is unbound here — confirm callers.
    if prediction is None:
        try:
            prediction = _get_flow_prediction(img_input,
                                              config.timepoint,
                                              config.model_path,
                                              config.keep_axials,
                                              config.device,
                                              config.flow_norm_factor,
                                              config.patch_size)
        finally:
            # Always release cached GPU memory, even when prediction fails.
            torch.cuda.empty_cache()
        # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
        # confirm this write-back belongs inside `if prediction is None`.
        # NOTE(review): za_flow, za_hash and model_md5 are only bound in the zarr
        # branch; on the tiff_input path these lines look like they would raise
        # NameError — verify.
        if config.output_prediction:
            za_flow[config.timepoint - 1] = prediction
            za_hash[config.timepoint - 1] = model_md5
        else:
            # Overwrite the stored hash with 0 when the prediction is not saved.
            za_hash[config.timepoint - 1] = 0

    # Restore to voxel unit
    for d in range(prediction.shape[0]):
        prediction[d] *= config.flow_norm_factor[d]

    res_spots = _estimate_spots_with_flow(spots, prediction, config.scales)
    return res_spots
def test_numpy_read_only():
    """A read-only array is readable over HTTP but rejects remote writes."""
    # Source data wrapped in a read-only in-memory zarr array.
    source = np.random.rand(1024, 1024)
    frozen = zarr.array(source, read_only=True)

    # Serve it through a Starlette app.
    app = Starlette(routes=[create_zarr_route(frozen)])

    # The remote view must return the original values ...
    remote = zarr.open_array(HTTPStore(TestClient(app)))
    np.testing.assert_allclose(remote[:], source)

    # ... and refuse writes.
    with pytest.raises(ValueError):
        remote[:50, :50] = 10
def _test_pairwise_distance(metric):
    """Check ``pairwise_distance`` against scipy's ``pdist`` on every backend.

    Covers numpy, cuda, dask, dask-with-cuda and zarr inputs. Each result
    must have the expected container type, match the scipy reference and
    have a floating-point dtype.
    """
    # Simulate some data, N.B., oriented such that we want to compute
    # distance between columns.
    samples = np.random.randint(low=0, high=3, size=(100, 10), dtype=np.int8)

    # Reference result from scipy.
    reference = spd.pdist(samples.T, metric=metric)

    # numpy input
    result = pairwise_distance(samples, metric=metric)
    assert isinstance(result, np.ndarray)
    assert_allclose(reference, result)
    assert result.dtype.kind == "f"

    # cuda input
    on_device = cuda.to_device(samples)
    result = pairwise_distance(on_device, metric=metric)
    assert isinstance(result, type(on_device))
    assert_allclose(reference, result.copy_to_host())
    assert result.dtype.kind == "f"

    # dask input
    chunked = da.from_array(samples, chunks=(10, 5))
    result = pairwise_distance(chunked, metric=metric)
    assert isinstance(result, da.Array)
    computed = result.compute(scheduler="single-threaded")
    assert_allclose(reference, computed)
    assert result.dtype.kind == "f"

    # dask input backed by cuda blocks
    chunked_on_device = chunked.rechunk((10, -1)).map_blocks(cuda.to_device)
    result = pairwise_distance(chunked_on_device, metric=metric)
    assert isinstance(result, da.Array)
    computed = result.compute(scheduler="single-threaded")
    assert_allclose(reference, computed)
    assert result.dtype.kind == "f"

    # zarr input
    stored = zarr.array(samples, chunks=(10, 5))
    result = pairwise_distance(stored, metric=metric)
    assert isinstance(result, da.Array)
    assert_allclose(reference, result.compute())
    assert result.dtype.kind == "f"
def test_count_alleles():
    """Check ``genotypes_count_alleles`` on every backend and on bad arguments."""
    genotypes = np.array(
        [[[0, 0], [0, 1], [2, 2]],
         [[-1, 0], [1, -1], [-1, -1]]],
        dtype=np.int8,
    )
    expected = np.array([[3, 1, 2], [1, 1, 0]], dtype="i4")

    # numpy input
    counts = genotypes_count_alleles(genotypes, max_allele=2)
    assert isinstance(counts, np.ndarray)
    assert_array_equal(expected, counts)

    # cuda input
    on_device = cuda.to_device(genotypes)
    counts = genotypes_count_alleles(on_device, max_allele=2)
    assert isinstance(counts, type(on_device))
    assert_array_equal(expected, counts.copy_to_host())

    # dask input
    chunked = da.from_array(genotypes, chunks=(1, 2, -1))
    counts = genotypes_count_alleles(chunked, max_allele=2)
    assert isinstance(counts, da.Array)
    assert_array_equal(expected, counts.compute())

    # zarr input
    stored = zarr.array(genotypes, chunks=(1, 2, None))
    counts = genotypes_count_alleles(stored, max_allele=2)
    assert isinstance(counts, da.Array)
    assert_array_equal(expected, counts.compute())

    # dask input backed by cuda blocks
    chunked_on_device = chunked.map_blocks(cuda.to_device)
    counts = genotypes_count_alleles(chunked_on_device, max_allele=2)
    assert isinstance(counts, da.Array)
    assert_array_equal(expected, counts.compute(scheduler="single-threaded"))

    # invalid max_allele values
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker
        genotypes_count_alleles(genotypes, max_allele="foo")
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker
        genotypes_count_alleles(genotypes, max_allele=[1])
    with pytest.raises(ValueError):
        genotypes_count_alleles(genotypes, max_allele=128)
def compress_layer(layer_array: np.array):
    """
    Pick the smaller of two encodings for a layer: a COO coordinate list or
    a zstd-compressed zarr array.

    Parameters
    ----------
    layer_array: Numpy array
        Original array version of the layer data

    Returns
    -------
    layer_shape: tuple
        Tuple containing shape of the layer_array
    compressed_layer: np.array or zarr array
        COO list (coordinates stacked with the data values, as uint16) when
        is_sparse is True, zarr array with zstd compression otherwise
    is_sparse: bool
        True if the COO list takes no more bytes than the stored zarr array.
    """
    layer_shape = tuple(layer_array.shape)

    # Candidate 1: coordinate list, coords and values joined in one array.
    coo = sparse.COO(layer_array)
    coo_array = np.append(coo.coords, np.array([coo.data]), axis=0)
    coo_array = coo_array.astype(np.uint16)

    # Candidate 2: zstd-compressed zarr array; higher clevel takes more time.
    # Leading axes get chunk length 1, the last two are chunked by 1024.
    zarr_chunks = (1,) * (len(layer_array.shape) - 2) + (1024, 1024)
    zarr_array = zarr.array(
        layer_array.astype(np.uint16),
        chunks=zarr_chunks,
        compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE),
    )

    is_sparse = zarr_array.nbytes_stored >= coo_array.nbytes
    compressed_layer = coo_array if is_sparse else zarr_array
    return layer_shape, compressed_layer, is_sparse
def _create_array(self, data, **kwargs):
    """Create a zarr array holding ``data``.

    Without a ``path`` keyword the array is built by ``zarr.array``. With a
    ``path`` it is opened through ``zarr.open`` in mode ``'w'`` using the
    shape (and, unless given, the dtype) of ``data``, then filled.
    ``chunks`` defaults to ``default_chunks(data)``.
    """
    kwargs.setdefault('chunks', default_chunks(data))

    if 'path' not in kwargs:
        return zarr.array(data, **kwargs)

    kwargs['mode'] = 'w'
    kwargs['shape'] = data.shape
    # ensure dtype is specified
    if not kwargs.get('dtype', None):
        kwargs['dtype'] = data.dtype
    target = zarr.open(**kwargs)
    target[:] = data
    return target
def test_to_haplotypes():
    """Check ``genotypes_to_haplotypes`` on every backend and on bad input."""
    genotypes = np.array(
        [[[0, 0], [0, 1], [2, 2]],
         [[-1, 0], [1, -1], [-1, -1]]],
        dtype=np.int8,
    )
    expected = np.array(
        [[0, 0, 0, 1, 2, 2],
         [-1, 0, 1, -1, -1, -1]],
        dtype=np.int8,
    )

    # numpy input, C order then F order
    for candidate in (genotypes, np.asfortranarray(genotypes)):
        haplotypes = genotypes_to_haplotypes(candidate)
        assert isinstance(haplotypes, np.ndarray)
        assert_array_equal(expected, haplotypes)

    # dask input
    chunked = da.from_array(genotypes, chunks=(1, 2, -1))
    haplotypes = genotypes_to_haplotypes(chunked)
    assert isinstance(haplotypes, da.Array)
    assert_array_equal(expected, haplotypes.compute())

    # zarr input
    stored = zarr.array(genotypes, chunks=(1, 2, 2))
    haplotypes = genotypes_to_haplotypes(stored)
    assert isinstance(haplotypes, da.Array)
    assert_array_equal(expected, haplotypes.compute())

    # invalid inputs
    with pytest.raises(TypeError):
        # Wrong type.
        genotypes_to_haplotypes("foo")
    with pytest.raises(TypeError):
        # Wrong dtype.
        genotypes_to_haplotypes(genotypes.astype("f4"))
    with pytest.raises(ValueError):
        # Wrong ndim.
        genotypes_to_haplotypes(genotypes[0])
def serve(source, *, name=None, allowed_origins=None, **kwargs):
    """Start an HTTP server exposing a zarr hierarchy or numpy array as zarr.

    Parameters
    ----------
    source : zarr.Array, zarr.Group, or np.ndarray
        Source data to serve over HTTP. The underlying store of a
        zarr.Array or zarr.Group is used to forward requests. If a numpy
        array is provided, an in-memory zarr array is created and its
        store is wrapped.
    name : str
        Path prefix for underlying store keys (e.g. "data.zarr"). If
        provided, routes are mounted under ``/<name>``.
    allowed_origins : list of str, optional
        List of allowed origins (as strings). Use wildcard "*" to allow all.
    **kwargs : keyword arguments
        All extra keyword arguments are forwarded to uvicorn.run

    Raises
    ------
    TypeError
        ``source`` is none of the supported types.
    """
    # A numpy array has no store of its own: wrap it in an in-memory zarr array.
    if isinstance(source, np.ndarray):
        source = zarr.array(source)

    if not isinstance(source, (zarr.Array, zarr.Group)):
        raise TypeError(
            "Source is not one of numpy.ndarray, zarr.Array, or zarr.Group.")

    zarr_route = create_zarr_route(source)
    if name is None:
        routes = [zarr_route]
    else:
        routes = [Mount("/" + name, routes=[zarr_route])]
    server = Starlette(routes=routes)

    if allowed_origins:
        cors_options = dict(
            allow_origins=allowed_origins,
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"],
        )
        server.add_middleware(CORSMiddleware, **cors_options)

    uvicorn.run(server, **kwargs)
def test_zarr(selenium):
    """Smoke-test zarr: creation, assignment, save/load and a Blosc compressor."""
    import numpy as np
    import zarr
    from numcodecs import Blosc

    # creation
    zeros = zarr.zeros((1000, 1000), chunks=(100, 100), dtype="i4")
    assert zeros.shape == (1000, 1000)

    # assignment
    zeros[0, :] = np.arange(1000)
    assert zeros[0, 1] == 1

    # save / load round trip
    written = np.arange(10)
    zarr.save("/tmp/example.zarr", written)
    loaded = zarr.load("/tmp/example.zarr")
    np.testing.assert_equal(written, loaded)

    # explicit compressor
    codec = Blosc(cname="zstd", clevel=3, shuffle=Blosc.BITSHUFFLE)
    grid = np.arange(10000, dtype="i4").reshape(100, 100)
    compressed = zarr.array(grid, chunks=(10, 10), compressor=codec)
    assert compressed.compressor == codec
def zarr_im_rgb_np():
    """Return a zarr array of random uint8 values with shape (2048, 2048, 3).

    Values are drawn with ``np.random.randint(0, 255, ...)``.
    """
    pixels = np.random.randint(0, 255, (2048, 2048, 3), dtype=np.uint8)
    return zarr.array(pixels)
def zarr_im_mch_np():
    """Return a zarr array of random uint16 values with shape (3, 2048, 2048).

    Values are drawn with ``np.random.randint(0, 255, ...)``.
    """
    pixels = np.random.randint(0, 255, (3, 2048, 2048), dtype=np.uint16)
    return zarr.array(pixels)
""" generate test data for zarr-js """ import zarr from numpy import arange from numcodecs.zlib import Zlib # 1d.contiguous.compressed.i2 store = zarr.DirectoryStore('data/1d.contiguous.compressed.i2.zarr') z = zarr.array([1, 2, 3, 4], dtype='i2', store=store, chunks=(4, ), compressor=Zlib()) # 1d.contiguous.uncompressed.i2 store = zarr.DirectoryStore('data/1d.contiguous.uncompressed.i2.zarr') z = zarr.array([1, 2, 3, 4], dtype='i2', store=store, chunks=(4, ), compressor=None) # 1d.contiguous.compressed.i4 store = zarr.DirectoryStore('data/1d.contiguous.compressed.i4.zarr') z = zarr.array([1, 2, 3, 4], dtype='i4', store=store, chunks=(4, ), compressor=Zlib())
def _test_gt_func(f, gt, expect, compare, **kwargs):
    """Run ``f`` over a genotype array on every backend, in 3D and then 2D.

    numpy inputs must yield a numpy result accepted by ``compare``; dask
    and zarr inputs must yield a dask array equal to ``expect`` once
    computed. Finally ``f`` must reject a wrong type, dtype and ndim.
    """

    def check_eager(arr):
        # numpy in -> numpy out, checked with the caller's comparison
        actual = f(arr, **kwargs)
        assert isinstance(actual, np.ndarray)
        compare(expect, actual)

    def check_lazy(arr):
        # dask/zarr in -> dask out, checked after computing
        actual = f(arr, **kwargs)
        assert isinstance(actual, da.Array)
        assert_array_equal(expect, actual.compute())

    # ---- 3D tests ----
    assert gt.ndim == 3
    check_eager(gt)                                     # numpy array
    check_eager(np.asfortranarray(gt))                  # numpy array, Fortran order
    check_lazy(da.from_array(gt, chunks=(1, 2, -1)))    # dask array
    check_lazy(zarr.array(data=gt, chunks=(1, 2, None)))  # zarr array

    # ---- Reshape to test as 2D ----
    gt = gt.reshape((-1, gt.shape[2]))
    if expect.ndim == 3:
        expect = expect.reshape((gt.shape[0], -1))
    elif expect.ndim == 2:
        expect = expect.reshape(-1)

    check_eager(gt)                                     # numpy array
    check_lazy(da.from_array(gt, chunks=(2, -1)))       # dask array
    check_lazy(zarr.array(data=gt))                     # zarr array

    # ---- Exceptions ----
    with pytest.raises(TypeError):
        # Wrong type.
        f("foo", **kwargs)
    with pytest.raises(TypeError):
        # Wrong dtype.
        f(gt.astype("f4"), **kwargs)
    with pytest.raises(ValueError):
        # Wrong ndim.
        f(gt[0], **kwargs)
from numcodecs import Blosc

compressor = Blosc(cname='snappy')


@contextmanager
def timeit(title=''):
    """Print ``title`` and the elapsed wall time of the wrapped block."""
    began = perf_counter()
    yield
    elapsed = perf_counter() - began
    print(title, elapsed)


def data(i):
    """Return one of three 90-character strings, selected by ``i % 3``."""
    patterns = ('abc' * 30, 'def' * 30, 'ijk' * 30)
    return patterns[i % 3]


N = 100_000
# From here on ``data`` is the list of generated strings, not the function.
data = list(map(data, range(N)))

gr = zarr.group()
arr = zarr.array(data, dtype=str)
# gr['names'] = arr

st = zarr.open('text_store.zarr')
with timeit('store'):
    zarr.copy(arr, st, 'names')

# Plain-text copy of the same data, one string per line.
with open('text_file.txt', 'w') as fh:
    fh.write('\n'.join(data))

print(st['names'][101] == data[101])
def setup_instance(self, data):
    """Wrap ``data`` in a zarr array (chunks ``(2, None)``) and return it as an AlleleCountsDaskArray."""
    return AlleleCountsDaskArray(zarr.array(data, chunks=(2, None)))
def test_array():
    """``array`` must preserve the shape, dtype and contents of a 1-D numpy input."""
    source = np.arange(100)
    created = array(source, chunks=10)
    eq(source.shape, created.shape)
    eq(source.dtype, created.dtype)
    assert_array_equal(source, created[:])
def _test_ac_func(f, ac, expect, compare):
    """Run ``f`` over an allele-counts array on every backend, in 3D and then 2D.

    numpy inputs must yield a numpy result; dask and zarr inputs must yield
    a dask array. Every result is checked with ``compare`` and must have
    the dtype of ``expect``. Finally ``f`` must reject a wrong type, dtype
    and ndim.
    """

    def check_eager(arr):
        # numpy in -> numpy out
        actual = f(arr)
        assert isinstance(actual, np.ndarray)
        compare(expect, actual)
        assert expect.dtype == actual.dtype

    def check_lazy(arr):
        # dask/zarr in -> dask out, compared after computing
        actual = f(arr)
        assert isinstance(actual, da.Array)
        compare(expect, actual.compute())
        assert expect.dtype == actual.dtype

    # ---- 3D tests ----
    assert ac.ndim == 3
    check_eager(ac)                                     # numpy array
    check_eager(np.asfortranarray(ac))                  # numpy array, Fortran order
    check_lazy(da.from_array(ac, chunks=(1, 2, -1)))    # dask array
    check_lazy(zarr.array(data=ac))                     # zarr array

    # ---- Reshape to test as 2D ----
    ac = ac.reshape((-1, ac.shape[2]))
    if expect.ndim == 3:
        expect = expect.reshape(ac.shape)
    elif expect.ndim == 2:
        expect = expect.reshape(-1)

    check_eager(ac)                                     # numpy array
    check_lazy(da.from_array(ac, chunks=(2, -1)))       # dask array
    check_lazy(zarr.array(data=ac))                     # zarr array

    # ---- Errors ----
    with pytest.raises(TypeError):
        # Wrong type.
        f("foo")
    with pytest.raises(TypeError):
        # Wrong dtype.
        f(ac.astype("f4"))
    with pytest.raises(ValueError):
        # Wrong ndim.
        f(ac[0])