def test_info():

    # setup
    g = zarr.group(store=dict(), chunk_store=dict(),
                   synchronizer=zarr.ThreadSynchronizer())
    g.create_group('foo')
    z = g.zeros('bar', shape=10, filters=[numcodecs.Adler32()])

    # test group info
    items = g.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Read-only', 'Synchronizer type', 'Store type',
        'Chunk store type', 'No. members', 'No. arrays', 'No. groups',
        'Arrays', 'Groups', 'Name'
    ])
    assert_list_equal(expected_keys, keys)

    # test array info
    items = z.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Data type', 'Shape', 'Chunk shape', 'Order', 'Read-only',
        'Filter [0]', 'Compressor', 'Synchronizer type', 'Store type',
        'Chunk store type', 'No. bytes', 'No. bytes stored', 'Storage ratio',
        'Chunks initialized', 'Name'
    ])
    assert_list_equal(expected_keys, keys)

def main():
    # Command line args are in sys.argv[1], sys.argv[2], ...
    # sys.argv[0] is the script name itself and can be ignored

    # parse arguments
    parser = argparse.ArgumentParser(
        description="Runs Conway's Game of Life simulation.")

    # add arguments
    parser.add_argument("--grid-size", dest="N", required=False)
    parser.add_argument("--mov-file", dest="movfile", required=False)
    parser.add_argument("--interval", dest="interval", required=False)
    parser.add_argument("--glider", action="store_true", required=False)
    parser.add_argument("--gosper", action="store_true", required=False)
    parser.add_argument("--port")
    args = parser.parse_args()

    # set grid size
    N = 100
    if args.N and int(args.N) > 8:
        N = int(args.N)

    # set animation update interval
    update_interval = 1
    if args.interval:
        update_interval = int(args.interval)

    # declare grid
    grid = np.array([])

    # check if "glider" demo flag is specified
    if args.glider:
        grid = np.zeros(N * N).reshape(N, N)
        add_glider(1, 1, grid)
    elif args.gosper:
        grid = np.zeros(N * N).reshape(N, N)
        add_gosper_glider_gun(10, 10, grid)
    else:
        # populate grid with random on/off - more off than on
        grid = random_grid(N)

    store = zarr.RedisStore(port=args.port)
    root = zarr.group(store=store, overwrite=True)

    t = 0
    while True:
        arr = root.zeros(f"{t}", shape=grid.shape, chunks=(25, 25))
        arr[...] = grid
        t += 1
        time.sleep(update_interval)
        grid = update(grid, N)
        print(t, grid)

def fromzarr(path, group=None, dataset=None, chunk_size=None):
    import zarr

    try:
        # since v2.11.0, zarr converts mutable mappings to KVStore
        from zarr.storage import KVStore as zarr_kvstore
    except ImportError:  # pragma: no cover
        zarr_kvstore = None

    if isinstance(path, zarr.Array):
        arr = path
        if zarr_kvstore is None and isinstance(arr.store, FSMap):  # pragma: no cover
            root = arr.store.root
            path, dataset = root.rsplit("/", 1)
        elif zarr_kvstore and isinstance(arr.store, zarr_kvstore):
            root = arr.store._mutable_mapping.root
            path, dataset = root.rsplit("/", 1)
        else:
            path = arr.store.path
            if "/" in arr.path and group is None:
                group = arr.path.rsplit("/", 1)[0]
            dataset = arr.basename
            if not dataset:
                path, dataset = path.rsplit("/", 1)
        shape = arr.shape
    elif isinstance(path, str):
        fs = get_fs(path, None)
        fs_map = FSMap(path, fs)

        if group is None and dataset is None:
            arr = zarr.open(fs_map)
            if isinstance(arr, zarr.Array):
                return fromzarr(arr, chunk_size=chunk_size)

        g = zarr.group(store=fs_map)
        arr = g[TensorFromZarr.get_path(group, dataset)]
        shape = arr.shape
    else:
        raise TypeError("`path` passed has wrong type, "
                        "expected str or zarr.Array, "
                        f"got {type(path)}")

    chunk_size = chunk_size if chunk_size is not None else arr.chunks
    op = TensorFromZarr(filename=path, group=group, dataset=dataset,
                        dtype=arr.dtype)
    return op(shape, chunk_size=chunk_size, order=TensorOrder(arr.order))

def write_dataset_zarr(dataset, path, key='images'):
    """
    Given a PyTorch Dataset or array_like, write a Zarr dataset.

    We assume that the dataset returns either a single image, or a tuple
    whose first entry is an image. For example, in order to return both an
    image and a set of labels, the dataset can return those as a pair of
    torch Tensors. Note that the names of the members of the tuple can be
    overridden with the argument 'key'.
    """
    try:
        import zarr, lmdb
    except ImportError:
        print('Please install the zarr and lmdb libraries to use '
              'write_dataset_zarr.')
        raise
    from .utils import tqdm

    if not isinstance(key, tuple):
        # make key a tuple if it's not already
        key = (key, )

    store = zarr.DirectoryStore(path)
    root = zarr.group(store=store, overwrite=True)

    # determine size needed for each zarr array
    ds0 = dataset[0]
    if not isinstance(ds0, tuple):
        ds0 = (ds0, )

    # check that the length of the tuple matches the number of keys
    if len(ds0) != len(key):
        raise Exception(f"Dataset returns tuple with {len(ds0)} entries, "
                        f"but only {len(key)} keys given")

    ds = []
    for d, k in zip(ds0, key):
        dtype = d.dtype
        if isinstance(d, torch.Tensor):
            # need a numpy dtype, not a torch one
            dtype = d.view(-1)[0].cpu().numpy().dtype
        sh = d.shape
        ds.append(root.zeros('/' + k,
                             shape=(len(dataset), *sh),
                             chunks=(1, *sh),
                             dtype=dtype))

    for i, di in enumerate(tqdm(dataset)):
        if not isinstance(di, (tuple, list)):
            di = [di]
        for I, dsi in zip(di, ds):
            if isinstance(I, torch.Tensor):
                I = I.cpu().numpy()
            dsi[i, ...] = I

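# Hypothetical usage sketch for write_dataset_zarr (the dataset, path, and keys
# below are made up, and the function is assumed importable from its module):
# wrap a list of (image, label) tensor pairs in a torch Dataset and write it to
# a Zarr directory store under two keys.
import torch
from torch.utils.data import TensorDataset

images = torch.rand(8, 3, 32, 32)
labels = torch.randint(0, 10, (8,))
toy_dataset = TensorDataset(images, labels)

# writes ./toy.zarr/images with shape (8, 3, 32, 32) and ./toy.zarr/labels with shape (8,)
write_dataset_zarr(toy_dataset, './toy.zarr', key=('images', 'labels'))
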
def __create_group(
    self, store: MutableMapping, base: np.ndarray, pyramid: List[np.ndarray]
) -> zarr.hierarchy.Group:
    """Create group and datasets."""
    grp = zarr.group(store)
    grp.create_dataset("base", data=base)
    series = []
    for i, dataset in enumerate(pyramid):
        if i == 0:
            path = "base"
        else:
            path = "%s" % i
            grp.create_dataset(path, data=pyramid[i])
        series.append({"path": path})
    return grp

def test_copy_all():
    """
    https://github.com/zarr-developers/zarr-python/issues/269

    copy_all used to not copy attributes as `.keys()` does not return
    hidden `.zattrs`.
    """
    original_group = zarr.group(store=MemoryStore(), overwrite=True)
    original_group.attrs["info"] = "group attrs"
    original_subgroup = original_group.create_group("subgroup")
    original_subgroup.attrs["info"] = "sub attrs"

    destination_group = zarr.group(store=MemoryStore(), overwrite=True)

    # copy everything, including attributes, from one memory store to another
    copy_all(
        original_group,
        destination_group,
        dry_run=False,
    )

    assert 'subgroup' in destination_group
    assert destination_group.attrs["info"] == "group attrs"
    assert destination_group.subgroup.attrs["info"] == "sub attrs"

def compress_zarr_dataset(data, file_path, compression='lz4', clevel=5,
                          start_idx=0, end_idx=0):
    """
    Loads in a zarr data set and exports it with a given compression type and level.

    :param data: Zarr data set which will be compressed
    :param file_path: File name path where the data will be exported
     (e.g. "./export/data.zip")
    :param compression: Compression type
    :param clevel: Compression level
    :param start_idx: Starting index of data to be exported.
    :param end_idx: If end_idx != 0 the data set will be exported up to the
     specified index, excluding the sample at end_idx
     (e.g. end_idx = len(x) will export it fully)
    :return: True if a NaN value was detected
    """
    compressor = Blosc(cname=compression, clevel=clevel, shuffle=Blosc.SHUFFLE)

    # open a dataset file and create arrays
    store = zarr.ZipStore(file_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)

    nan_detected = False
    for key in data.keys():
        if end_idx == 0:
            x = data[key]
        else:
            x = data[key][start_idx:end_idx]

        if np.isnan(x).any():
            nan_detected = True

        array_shape = list(x.shape)
        array_shape[0] = 128

        # export array
        zarr_file.create_dataset(
            name=key,
            data=x,
            shape=x.shape,
            dtype=type(x.flatten()[0]),
            chunks=array_shape,
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )
    store.close()
    logging.info("dataset was exported to: %s", file_path)
    return nan_detected

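# Hypothetical usage sketch for compress_zarr_dataset (all names are made up,
# and the function is assumed importable from its module). For simplicity the
# input here is a plain dict of numpy arrays, which exposes the same
# keys()/indexing interface the function relies on.
import numpy as np

data = {"x_train": np.random.rand(1024, 8).astype(np.float32),
        "y_train": np.random.randint(0, 2, size=(1024,)).astype(np.float32)}

# re-export the arrays into a zstd-compressed zip store
had_nan = compress_zarr_dataset(data, "./data.zip", compression="zstd", clevel=6)
print("NaN detected:", had_nan)
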
def test_select_mask():

    # Setup.
    pos = np.arange(100)
    gt = np.random.randint(low=-1, high=4, size=(100, 10))
    mask = np.zeros(100, dtype=bool)
    mask[1:99:3] = True

    # Numpy array.
    for a in pos, gt:
        expect = a.compress(mask, axis=0)
        actual = select_mask(a, mask, axis=0)
        assert isinstance(actual, np.ndarray)
        assert_array_equal(expect, actual)

    # Dask array.
    for a in pos, gt:
        expect = a.compress(mask, axis=0)
        d = da.from_array(a)
        actual = select_mask(d, mask, axis=0)
        assert isinstance(actual, da.Array)
        assert_array_equal(expect, actual.compute())

    # With mask as dask array.
    actual = select_mask(d, da.from_array(mask), axis=0)
    assert isinstance(actual, da.Array)
    assert_array_equal(expect, actual.compute())

    # Numpy group.
    g = DictGroup({"variants": {"POS": pos}, "calldata": {"GT": gt}})
    actual = select_mask(g, mask, axis=0)
    assert isinstance(actual, GroupSelection)
    assert isinstance(actual["variants"]["POS"], np.ndarray)
    assert isinstance(actual["calldata"]["GT"], np.ndarray)
    assert_array_equal(pos.compress(mask, axis=0), actual["variants"]["POS"])
    assert_array_equal(gt.compress(mask, axis=0), actual["calldata"]["GT"])

    # Zarr group.
    g = zarr.group()
    g.create_dataset("variants/POS", data=pos)
    g.create_dataset("calldata/GT", data=gt)
    actual = select_mask(g, mask, axis=0)
    assert isinstance(actual, GroupSelection)
    assert isinstance(actual["variants"]["POS"], da.Array)
    assert isinstance(actual["calldata"]["GT"], da.Array)
    assert_array_equal(pos.compress(mask, axis=0),
                       actual["variants"]["POS"].compute())
    assert_array_equal(gt.compress(mask, axis=0),
                       actual["calldata"]["GT"].compute())

def generate_gt_data(sample: str, chromosome_gt: dict, chunks=1):
    root = zarr.group()
    sample_group = root.create_group(sample)
    sample_group.create_groups(*chromosome_gt.keys())
    for i in sample_group:
        gt = chromosome_gt[i]
        sites = len(gt)
        gt_data = [[x] for x in gt]
        calldata = sample_group[i].create_group("calldata")
        calldata.create_dataset('GT', shape=(sites, 1, 2),
                                chunks=(chunks, 1, 2), dtype='int8',
                                data=gt_data)
    return root

def setUp(self):
    self.ndims = 7
    num_datasets = 3
    self.temp_dir_zarr = tempfile.TemporaryDirectory(suffix=".zgroup")
    self.zarr_group = zarr.group(store=self.temp_dir_zarr.name,
                                 overwrite=True)
    self.dset_list = list(
        self.zarr_group.create_dataset(
            name='zarray' + str(i),
            data=np.random.rand(*self.srand.choices(
                range(1, 90 // self.ndims), k=self.ndims)))
        for i in range(num_datasets))
    self.dsetview_list = list(
        DatasetView(self.dset_list[i]) for i in range(num_datasets))
    print(LazyOpszarrTest)

def execute(cls, ctx, op):
    import zarr

    axis_offsets = op.axis_offsets
    shape = op.outputs[0].shape

    fs = get_fs(op.filename, None)
    fs_map = FSMap(op.filename, fs)

    root = zarr.group(store=fs_map)
    path = cls.get_path(op.group, op.dataset)
    arr = root[path]
    data = arr[tuple(slice(offset, offset + size)
                     for offset, size in zip(axis_offsets, shape))]
    ctx[op.outputs[0].key] = data

def setup_input(samples, input_pattern, seqid, field):
    log('Setting up input array ...')
    input_paths = [input_pattern.format(sample=s) for s in samples]
    input_stores = [zarr.ZipStore(ip, mode='r') for ip in input_paths]
    input_roots = [zarr.group(store) for store in input_stores]
    input_arrays = [
        root[s][seqid][field] for root, s in zip(input_roots, samples)
    ]
    input_arrays = [da.from_array(a, chunks=a.chunks) for a in input_arrays]
    # here we add a dim to allow the hstack to work; arrays must share the shape (X, 1)
    input_arrays = [a[:, None] if a.ndim == 1 else a for a in input_arrays]
    input_array = da.hstack(input_arrays)
    log('Input array:', input_array)
    return input_array

def check_array_setup(samples, input_pattern, seqid, field):
    log('Determining number of variants ...')
    path = input_pattern.format(sample=samples[0])
    callset = zarr.group(zarr.ZipStore(path, mode='r'))
    # expect sample name in hierarchy
    try:
        array = callset[samples[0]][seqid][field]
    except KeyError:
        field = field.replace("calldata/", "variants/")
        array = callset[samples[0]][seqid][field]
        log("{field} found in `variants` not `calldata`".format(field=field))
    n_variants = array.shape[0]
    log('Found {:,} variants.'.format(n_variants))
    return array, field

def __init__(self, h5f: BinaryIO, url: str, xarray: bool = False,
             spec=1, inline_threshold=0):
    # Open HDF5 file in read mode...
    lggr.debug(f'HDF5 file: {h5f}')
    self.input_file = h5f
    lggr.debug(f'xarray: {xarray}')
    self.spec = spec
    self.inline = inline_threshold
    self._h5f = h5py.File(h5f, mode='r')
    self._xr = xarray

    self.store = {}
    self._zroot = zarr.group(store=self.store, overwrite=True)

    self._uri = url
    lggr.debug(f'HDF5 file URI: {self._uri}')

def prepare_zarr_storage(variations, out_path):
    store = zarr.DirectoryStore(str(out_path))
    root = zarr.group(store=store, overwrite=True)
    metadata = variations.metadata
    sources = []
    targets = []

    samples_array = variations.samples
    # samples_array.compute_chunk_sizes()
    sources.append(samples_array)

    object_codec = None
    if samples_array.dtype == object:
        object_codec = numcodecs.VLenUTF8()
    dataset = zarr.create(shape=samples_array.shape, path='samples',
                          store=store, dtype=samples_array.dtype,
                          object_codec=object_codec)
    targets.append(dataset)

    variants = root.create_group(ZARR_VARIANTS_GROUP_NAME, overwrite=True)
    calls = root.create_group(ZARR_CALL_GROUP_NAME, overwrite=True)
    for field, array in variations.items():
        definition = ALLELE_ZARR_DEFINITION_MAPPINGS[field]
        field_metadata = metadata.get(field, None)
        array = variations[field]
        if array is None:
            continue
        array.compute_chunk_sizes()
        sources.append(array)

        group_name = definition['group']
        group = calls if group_name == ZARR_CALL_GROUP_NAME else variants
        path = os.path.sep + os.path.join(group.path, definition['field'])

        object_codec = None
        if array.dtype == object:
            object_codec = numcodecs.VLenUTF8()
        dataset = zarr.create(shape=array.shape, path=path, store=store,
                              object_codec=object_codec, dtype=array.dtype)
        if field_metadata is not None:
            for key, value in field_metadata.items():
                dataset.attrs[key] = value
        targets.append(dataset)

    lock = SerializableLock()
    return da.store(sources, targets, compute=False, lock=lock)

def load(cls, path: PathType):
    """Load existing DirectoryStore state into a MemoryStore object."""
    memory_store = zarr.MemoryStore()
    directory_store = zarr.DirectoryStore(path)
    zarr.convenience.copy_store(source=directory_store, dest=memory_store)
    group = zarr.group(store=memory_store)
    zdim = group[cls._filesystem.pars].shape[1]
    return MemoryStore(params=zdim, zarr_store=memory_store)

def create_tile_directory(self, resolution, width, height):
    tile_directory = os.path.join(self.slide_directory, str(resolution))
    if self.file_type in ("n5", "zarr"):
        tile_directory = os.path.join(self.slide_directory,
                                      "pyramid.%s" % self.file_type)
        self.zarr_store = zarr.DirectoryStore(tile_directory)
        if self.file_type == "n5":
            self.zarr_store = zarr.N5Store(tile_directory)
        self.zarr_group = zarr.group(store=self.zarr_store)
        self.zarr_group.create_dataset(
            str(resolution), shape=(3, height, width),
            chunks=(None, self.tile_height, self.tile_width), dtype='B')
    else:
        os.mkdir(tile_directory)
    return tile_directory

def table(self, data, names=None, expectedlen=None, **kwargs):

    # setup
    names, columns = _util.check_table_like(data, names=names)
    kwargs = self._set_defaults(kwargs)
    g = zarr.group(**kwargs)

    # create columns
    chunks = kwargs.get('chunks', None)
    for n, c in zip(names, columns):
        if chunks is None:
            chunks = default_chunks(c, expectedlen)
        g.array(name=n, data=c, chunks=chunks)

    # create table
    ztbl = ZarrTable(g, names=names)
    return ztbl

def create_tile_directory(self, series, resolution, width, height):
    tile_directory = os.path.join(self.slide_directory,
                                  "data.%s" % self.file_type)
    self.zarr_store = zarr.DirectoryStore(tile_directory)
    if self.file_type == "n5":
        self.zarr_store = zarr.N5Store(tile_directory)
    self.zarr_group = zarr.group(store=self.zarr_store)
    self.zarr_group.attrs['bioformats2raw.layout'] = LAYOUT_VERSION

    # important to explicitly set the chunk size to 1 for non-XY dims
    # setting to None may cause all planes to be chunked together
    # ordering is TZCYX and hard-coded since Z and T are not present
    self.zarr_group.create_dataset(
        "%s/%s" % (str(series), str(resolution)),
        shape=(1, 1, 3, height, width),
        chunks=(1, 1, 1, self.tile_height, self.tile_width), dtype='B')

def test_select_values():

    # Setup.
    pos = np.arange(1, 300, 3)
    gt = np.random.randint(low=-1, high=4, size=(100, 10))
    query = [31, 61]

    # Numpy array.
    for a in pos, gt:
        expect = a[[10, 20]]
        actual = select_values(a, pos, query, axis=0)
        assert isinstance(actual, np.ndarray)
        assert_array_equal(expect, actual)

    # Dask array.
    for a in pos, gt:
        expect = a[[10, 20]]
        d = da.from_array(a)
        actual = select_values(d, pos, query, axis=0)
        assert isinstance(actual, da.Array)
        assert_array_equal(expect, actual.compute())

    # Numpy group.
    g = DictGroup({"variants": {"POS": pos}, "calldata": {"GT": gt}})
    actual = select_values(g, "variants/POS", query, axis=0)
    assert isinstance(actual, GroupSelection)
    assert isinstance(actual["variants"]["POS"], np.ndarray)
    assert isinstance(actual["calldata"]["GT"], np.ndarray)
    assert_array_equal(pos[[10, 20]], actual["variants"]["POS"])
    assert_array_equal(gt[[10, 20]], actual["calldata"]["GT"])

    # Zarr group.
    g = zarr.group()
    g.create_dataset("variants/POS", data=pos)
    g.create_dataset("calldata/GT", data=gt)
    actual = select_values(g, "variants/POS", query, axis=0)
    assert isinstance(actual, GroupSelection)
    assert isinstance(actual["variants"]["POS"], da.Array)
    assert isinstance(actual["calldata"]["GT"], da.Array)
    assert_array_equal(pos[[10, 20]], actual["variants"]["POS"].compute())
    assert_array_equal(gt[[10, 20]], actual["calldata"]["GT"].compute())

    # Errors.
    with pytest.raises(KeyError):
        select_values(gt, pos, query=[1, 999], axis=0)

def __init__(
    self,
    executor,
    dag,
    input,
    shape,
    chunks,
    dtype,
    partition_row_counts=None,
    intermediate_store=None,
):
    ZappyArray.__init__(self, shape, chunks, dtype, partition_row_counts)
    self.executor = executor
    self.dag = dag
    self.input = input
    if intermediate_store is None:
        intermediate_store = zarr.group()
    self.intermediate_store = intermediate_store

def output_to_zarr(path, seq_id, sample_id, arrays, cname, clevel, shuffle):
    log('Output zarr to {!r} ...'.format(path))
    store = zarr.ZipStore(path, mode='w')
    root = zarr.group(store=store)
    callset = root.create_group(sample_id)
    seq_group = callset.require_group(seq_id)
    calldata_group = seq_group.require_group('calldata')
    variants_group = seq_group.require_group('variants')
    compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=shuffle)
    for key, value in arrays.items():
        calldata_group.create_dataset(key, data=value, compressor=compressor)
        log('Created output array: ' + repr(key))
    store.close()

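# Hypothetical usage sketch for output_to_zarr (sample/sequence names, the
# output path, and the calldata array are made up; the function and its `log`
# helper are assumed importable from the module above): write one sample's
# genotype calls for chromosome arm 2L into a zipped zarr hierarchy.
import numpy as np
import numcodecs

arrays = {'GT': np.random.randint(0, 2, size=(1000, 1, 2), dtype='i1')}
output_to_zarr('sample1_2L.zarr.zip', seq_id='2L', sample_id='sample1',
               arrays=arrays, cname='zstd', clevel=5,
               shuffle=numcodecs.Blosc.AUTOSHUFFLE)
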
def load(input_obj):
    """Load a TrxFile from a directory, a .zip/.trx file, or an existing zarr store."""
    trx = TrxFile()
    if isinstance(input_obj, str):
        if os.path.isdir(input_obj):
            store = zarr.storage.DirectoryStore(input_obj)
        elif os.path.isfile(input_obj) and \
                os.path.splitext(input_obj)[1] in ['.zip', '.trx']:
            store = zarr.ZipStore(input_obj)
        else:
            raise ValueError('Invalid input path/filename.')
    else:
        store = input_obj

    trx._zcontainer = zarr.group(store=store, overwrite=False)
    trx.storage = store

    return trx

def __init__(
    self,
    params: Union[int, list],
    zarr_store: Union[zarr.MemoryStore, zarr.DirectoryStore],
    simulator=None,
    sync_path: Optional[PathType] = None,
):
    """Initialize Store content dimensions.

    Args:
        params (list of strings or int): List of parameter names. If int,
            ['z0', 'z1', ...] is used.
        zarr_store: zarr storage.
        sync_path: path to the cache lock files. Must be accessible to all
            processes working on the cache.
    """
    self._zarr_store = zarr_store
    self._simulator = simulator

    if isinstance(params, int):
        params = ["z%i" % i for i in range(params)]
    self.params = params

    synchronizer = zarr.ProcessSynchronizer(sync_path) if sync_path else None
    self._root = zarr.group(store=self.zarr_store, synchronizer=synchronizer)

    logging.debug(" params = %s" % str(params))

    if set(["samples", "metadata"]) == set(self._root.keys()):
        logging.info("Loading existing store.")
        self._update()
    elif len(self._root.keys()) == 0:
        logging.info("Creating new store.")
        self._setup_new_zarr_store(
            len(self.params), simulator.sim_shapes, self._root
        )
        logging.debug(" sim_shapes = %s" % str(simulator.sim_shapes))
    else:
        raise KeyError(
            "The zarr storage is corrupted. It should either be empty or "
            "only have the keys ['samples', 'metadata']."
        )

    self._lock = None
    if sync_path is not None:
        self._setup_lock(sync_path)

def read_features(cls, path: FilepathType) -> FeatureMap:
    path = Path(path).resolve()
    lock = FileLock(_lockfile(path))
    with lock, zarr.ZipStore(path, mode="r") as store:
        root = zarr.group(store=store)
        features = root.attrs[_FEATURES_KEY]
        raw_features = FeatureMap.deserialize(features)
        if len(cls.features) == 0:
            return raw_features
        for required_feat_name, required_feat in cls.features.items():
            if (required_feat_name not in raw_features
                    or raw_features[required_feat_name] != required_feat):
                raise RuntimeError(
                    "Dataset stored at %s is incompatible with %s"
                    % (path, cls))
        return cls.features

def test_rechunk_group(tmp_path, executor, source_store, target_store,
                       temp_store):
    if source_store.startswith("mapper"):
        fsspec = pytest.importorskip("fsspec")
        store_source = fsspec.get_mapper(str(tmp_path) + source_store)
        target_store = fsspec.get_mapper(str(tmp_path) + target_store)
        temp_store = fsspec.get_mapper(str(tmp_path) + temp_store)
    else:
        store_source = str(tmp_path / source_store)
        target_store = str(tmp_path / target_store)
        temp_store = str(tmp_path / temp_store)

    group = zarr.group(store_source)
    group.attrs["foo"] = "bar"
    # 800 byte chunks
    a = group.ones("a", shape=(5, 10, 20), chunks=(1, 10, 20), dtype="f4")
    a.attrs["foo"] = "bar"
    b = group.ones("b", shape=(20,), chunks=(10,), dtype="f4")
    b.attrs["foo"] = "bar"

    max_mem = 1600  # should force a two-step plan for a
    target_chunks = {"a": (5, 10, 4), "b": (20,)}

    rechunked = api.rechunk(
        group,
        target_chunks,
        max_mem,
        target_store,
        temp_store=temp_store,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)

    target_group = zarr.open(target_store)
    assert "a" in target_group
    assert "b" in target_group
    assert dict(group.attrs) == dict(target_group.attrs)

    rechunked.execute()
    for aname in target_chunks:
        assert target_group[aname].chunks == target_chunks[aname]
        a_tar = dsa.from_zarr(target_group[aname])
        assert dsa.equal(a_tar, 1).all().compute()

def to_filename(self, filename):
    """
    Stores the greyordinate data to the given filename.

    Type of storage is determined by the extension of the filename:

    - .dscalar/dconn/dlabel.nii: CIFTI file
    - .h5/hdf5/he2/he5: HDF5 file representing CIFTI data
    - .zarr: zarr file representing CIFTI data
    - .gii: GIFTI file (only stores surface data; raises error if more than
      one surface is represented in the greyordinates)
    - .nii: NIFTI file (only stores the volumetric data)

    :param filename: target filename
    """
    if hasExt(filename, ('.dscalar.nii', '.dconn.nii', '.dlabel.nii')):
        self.to_cifti().to_filename(filename)
    elif hasExt(filename, ('.h5', '.hdf5', '.he2', '.he5')):
        import h5py
        with h5py.File(filename, 'w') as f:
            self.to_hdf5(f)
    elif hasExt(filename, ('.zarr', )):
        import zarr
        f = zarr.group(filename)
        self.to_hdf5(f)
    elif hasExt(filename, ('.gii', )):
        surfaces = np.unique(
            self.brain_model_axis.name[self.brain_model_axis.surface_mask])
        if len(surfaces) > 1:
            raise ValueError(
                f"Can not write to GIFTI file as more than one surface has "
                f"been defined: {surfaces}")
        if len(surfaces) == 0:
            raise ValueError(
                "Can not write to GIFTI file as no surface has been provided")
        write_gifti(filename, [self.surface(surfaces[0])], surfaces[0])
    elif hasExt(filename, ('.nii.gz', '.nii')):
        self.volume().to_filename(filename)
    else:
        raise IOError(
            f"Extension of {filename} not recognized for NIFTI, GIFTI, or "
            f"CIFTI file")

def create_tile_directory(self, series, resolution, width, height):
    dimension_separator = '/'
    if not self.nested:
        dimension_separator = '.'
    self.zarr_store = FSStore(self.slide_directory,
                              dimension_separator=dimension_separator,
                              normalize_keys=True,
                              auto_mkdir=True)
    self.zarr_group = zarr.group(store=self.zarr_store)
    self.zarr_group.attrs['bioformats2raw.layout'] = LAYOUT_VERSION

    # important to explicitly set the chunk size to 1 for non-XY dims
    # setting to None may cause all planes to be chunked together
    # ordering is TZCYX and hard-coded since Z and T are not present
    self.zarr_group.create_dataset(
        "%s/%s" % (str(series), str(resolution)),
        shape=(1, 1, 3, height, width),
        chunks=(1, 1, 1, self.tile_height, self.tile_width), dtype='B')

def _parse_optimus_bundle(self, bundle_dir, bundle_manifest_path):
    """
    Parses optimus analysis files into PSV rows for cell and expression
    Redshift tables.
    """
    keys = self._parse_keys(bundle_dir)
    file_uuid = [f for f in json.load(open(bundle_manifest_path))["files"]
                 if f["name"].endswith(".zattrs")][0]["uuid"]
    file_version = [f for f in json.load(open(bundle_manifest_path))["files"]
                    if f["name"].endswith(".zattrs")][0]["version"]

    emptydrops_result = {}
    with open(os.path.join(bundle_dir, "empty_drops_result.csv")) as emptydrops_file:
        reader = csv.DictReader(emptydrops_file)
        for row in reader:
            emptydrops_result[row["CellId"]] = {
                "total_umi_count": int(row["Total"]),
                "is_cell": row["IsCell"] == "TRUE",
            }

    # read expression matrix from zarr
    store = DCPZarrStore(bundle_dir=bundle_dir)
    root = zarr.group(store=store)
    n_cells = root.expression_matrix.cell_id.shape[0]
    chunk_size = root.expression_matrix.cell_id.chunks[0]
    n_chunks = root.expression_matrix.cell_id.nchunks

    cell_lines = set()
    expression_lines = []
    logger.info(f"Optimus bundle has {n_cells} cells and {n_chunks} chunks.")
    for i in range(n_chunks):
        self._parse_optimus_chunk(
            keys=keys,
            file_uuid=file_uuid,
            file_version=file_version,
            root=root,
            start_row=chunk_size * i,
            end_row=(i + 1) * chunk_size if (i + 1) * chunk_size < n_cells else n_cells,
            cell_lines=cell_lines,
            expression_lines=expression_lines,
            emptydrops_result=emptydrops_result
        )
    return cell_lines, expression_lines

def __init__(self, path, transforms=None):
    self.path = path
    self.keys = ('images', 'labels')
    assert os.path.exists(path), 'file `{}` does not exist!'.format(path)
    with zarr.LMDBStore(path) as store:
        zarr_db = zarr.group(store=store)
        self.num_examples = zarr_db['labels'].shape[0]
    self.datasets = None
    if transforms is None:
        transforms = {
            'labels': lambda v: torch.tensor(v, dtype=torch.long),
            'images': lambda v: torch.tensor(
                (v - 127.5) / 127.5, dtype=torch.float32)
        }
    self.transforms = transforms

def __init__(self, h5f, xarray=False):
    # Open HDF5 file in read mode...
    lggr.debug(f'HDF5 file: {h5f}')
    lggr.debug(f'xarray: {xarray}')
    self._h5f = h5py.File(h5f, mode='r')
    self._xr = xarray

    self.store = {}
    self._zroot = zarr.group(store=self.store, overwrite=True)

    # Figure out HDF5 file's URI...
    if hasattr(h5f, 'name'):
        self._uri = h5f.name
    elif hasattr(h5f, 'url'):
        parts = urlparse(h5f.url())
        self._uri = urlunparse(parts[:3] + ('', ) * 3)
    else:
        self._uri = None
    lggr.debug(f'Source URI: {self._uri}')

def convert_data_set(path, data_set, batch_size=1000):
    loader = DataLoader(data_set, batch_size=batch_size, shuffle=False,
                        num_workers=4)
    num_examples = len(data_set)
    os.makedirs(path, exist_ok=True)
    with zarr.LMDBStore(path) as store:
        root = zarr.group(store=store, overwrite=True)
        images_set = root.zeros('images',
                                shape=(num_examples, 3, 96, 96),
                                chunks=(1, None, None, None),
                                dtype='u1')
        labels_set = root.zeros('labels',
                                shape=(num_examples, ),
                                chunks=(1, ),
                                dtype='u1')
        current_iter = 0
        for images, labels in tqdm(loader):
            size = images.shape[0]
            images_set[current_iter:current_iter + size] = images
            labels_set[current_iter:current_iter + size] = labels
            current_iter += size

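# Hypothetical usage sketch for convert_data_set (the dataset choice and paths
# are assumptions, and the function is assumed importable from its module):
# torchvision's STL-10 training split yields 3x96x96 uint8 images, matching the
# array shapes created above. Note this downloads the STL-10 archive on first use.
from torchvision import datasets, transforms

stl10 = datasets.STL10(root='./stl10', split='train', download=True,
                       transform=transforms.PILToTensor())
convert_data_set('./stl10_lmdb', stl10, batch_size=500)
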
def ImportDataTable(self, tableid):
    with self._calculationObject.LogHeader('Importing 2D datatable {0}'.format(tableid)):
        DQXUtils.CheckValidTableIdentifier(tableid)
        self._calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(self._datasetId))
        self._calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(self._datasetId))

        max_line_count = None
        if self._maxLineCount > 0:
            max_line_count = self._maxLineCount

        table_settings = self.getSettings(tableid)

        settingsFile, data_file = self._getDataFiles(tableid)
        zarr_file = zarr.DirectoryStore(data_file)
        zarr_file = zarr.group(zarr_file)

        # Check that the referenced tables exist and have the primary key specified.
        if table_settings['columnDataTable']:
            columnTableSettings = SettingsDataTable()
            columnTableSettings.loadFile(
                os.path.join(self._datasetFolder, 'datatables',
                             table_settings['columnDataTable'], 'settings'))
            columnProperties = [prop['id'] for prop in columnTableSettings['properties']]
            # if table_settings['columnIndexField'] not in columnProperties:
            #     raise Exception(table_settings['columnDataTable'] + ' does not have property ' + table_settings['columnIndexField'])

        if table_settings['rowDataTable']:
            rowTableSettings = SettingsDataTable()
            rowTableSettings.loadFile(
                os.path.join(self._datasetFolder, 'datatables',
                             table_settings['rowDataTable'], 'settings'))
            rowProperties = [prop['id'] for prop in rowTableSettings['properties']]
            if table_settings['rowIndexField'] not in rowProperties:
                raise Exception(table_settings['rowDataTable'] + ' does not have property ' +
                                table_settings['rowIndexField'])

        if table_settings['showInGenomeBrowser']:
            if not columnTableSettings['isPositionOnGenome']:
                raise Exception(table_settings['columnDataTable'] +
                                ' is not a genomic position based table (IsPositionOnGenome in config), '
                                'but you have asked to use this table as a column index on a '
                                'genome browseable 2D array.')

        if not self._importSettings['ConfigOnly']:
            # Insert an index column into the index tables
            if table_settings['columnDataTable']:
                # Assume that index field has been created on import in LoadTable - it's much faster
                # We could just run the command and ignore the error raised if it already exists
                # sql = "ALTER TABLE `{0}` ADD `{1}_column_index` INT DEFAULT NULL;".format(table_settings['columnDataTable'], tableid)
                # self._execSql(sql)
                self._dao.insert2DIndexes(zarr_file, "column", tableid,
                                          table_settings,
                                          columnTableSettings['primKey'],
                                          max_line_count)
            if table_settings['rowDataTable']:
                self._dao.insert2DIndexes(zarr_file, "row", tableid,
                                          table_settings,
                                          rowTableSettings['primKey'],
                                          None)

            ImpUtils.mkdir(os.path.join(self._config.getBaseDir(), '2D_data'))
            path_join = os.path.join(self._config.getBaseDir(), '2D_data',
                                     self._datasetId + '_' + tableid + '.zarr')
            try:
                os.remove(path_join)
            except OSError:
                pass
            print("Symlinking 2D data")
            os.symlink(data_file, path_join)

def open(self):
    if self.conn is None:
        self.conn = zarr.group()
    self.attrs = self.conn.attrs