def test_compress():
    """Compare da.compress with np.compress and check shape/chunk knowledge.

    Conditions are supplied as NumPy arrays, a plain list and dask arrays,
    along ``axis=None``, ``0`` and ``1``.
    """
    source = np.arange(25).reshape((5, 5))
    lazy_source = da.from_array(source, chunks=(2, 2))

    full_cond = np.array([True, False, True, False, True])
    short_cond = np.array([True, False])
    list_cond = [True, False]
    lazy_full_cond = da.from_array(full_cond, chunks=3)
    lazy_short_cond = da.from_array(short_cond, chunks=2)

    # (condition handed to NumPy, condition handed to dask)
    cases = [
        (full_cond, full_cond),
        (short_cond, short_cond),
        (list_cond, list_cond),
        (full_cond, lazy_full_cond),
        (short_cond, lazy_short_cond),
        (list_cond, lazy_short_cond),
    ]

    for np_cond, cond in cases:
        cond_is_lazy = isinstance(cond, da.Array)
        for axis in [None, 0, 1]:
            result = da.compress(cond, lazy_source, axis=axis)
            assert_eq(np.compress(np_cond, source, axis=axis), result)

            # With axis=None the input is flattened, so the compressed
            # dimension of the result is dimension 0.
            dim = 0 if axis is None else axis
            if cond_is_lazy:
                # A dask condition means the size of the compressed
                # dimension is unknown (nan) until the result is computed.
                assert np.isnan(result.shape[dim]).all()
                assert np.isnan(result.chunks[dim]).all()
            else:
                # A condition that is not a dask array gives a known size
                # along the compressed dimension, i.e. not nan.
                assert np.count_nonzero(cond) == result.shape[dim]
                assert not np.isnan(result.chunks[dim]).any()

    # An out-of-range axis is rejected for both 1-d and 2-d conditions.
    for bad_cond in ([True, False], [[True], [False]]):
        with pytest.raises(ValueError):
            da.compress(bad_cond, lazy_source, axis=100)
def dask_compress(indexer, data, axis):
    """Wrapper for dask.array.compress() which computes chunk sizes faster.

    Parameters
    ----------
    indexer : dask.array.Array
        Condition array; its length must equal ``data.shape[axis]``. The
        per-block sum of this array is used as the number of selected items,
        so it is expected to hold boolean (or 0/1) values.
    data : dask.array.Array
        Array to select from.
    axis : int
        Axis of `data` along which to select. Negative values count from the
        last axis.

    Returns
    -------
    v : dask.array.Array
        Result of ``da.compress(indexer, data, axis=axis)`` with its chunk
        sizes along `axis` filled in.

    Notes
    -----
    Calling this function computes `indexer` (one sum per chunk) eagerly.
    """

    # sanity checks
    assert isinstance(data, da.Array)
    assert isinstance(indexer, da.Array)
    assert isinstance(axis, int)
    assert indexer.shape[0] == data.shape[axis]

    # Normalise a negative axis. The chunk tuple is rebuilt below by comparing
    # positions from enumerate() with `axis`; a negative value would never
    # match and the result would silently keep the input's chunk sizes.
    if axis < 0:
        axis += data.ndim

    old_chunks = data.chunks
    axis_old_chunks = old_chunks[axis]
    axis_n_chunks = len(axis_old_chunks)

    # apply the indexing operation
    v = da.compress(indexer, data, axis=axis)

    # need to compute chunks sizes in order to know dimension sizes;
    # would normally do v.compute_chunk_sizes() but that is slow for
    # multidimensional arrays, so hack something more efficient:
    # align the indexer with the data chunks along `axis` and count the
    # selected items in each block.
    selected_per_block = (
        indexer.rechunk((axis_old_chunks,))
        .map_blocks(
            lambda b: np.sum(b, keepdims=True),
            chunks=((1,) * axis_n_chunks,),
        )
        .compute()
    )
    # store plain Python ints rather than NumPy scalars in the chunks tuple
    axis_new_chunks = tuple(int(n) for n in selected_per_block)

    new_chunks = tuple(
        axis_new_chunks if i == axis else c for i, c in enumerate(old_chunks)
    )
    # overwrite the unknown (nan) chunk sizes on the result in place
    v._chunks = new_chunks

    return v
def snp_sites(self, seq_id, field=None, site_mask=None, site_filters="dt_20200416"):
    """Access SNP site data (positions and alleles).

    Parameters
    ----------
    seq_id : str
        Chromosome arm, e.g., "3R".
    field : {"POS", "REF", "ALT"}, optional
        Array to access. If not provided, all three arrays POS, REF, ALT
        will be returned as a tuple.
    site_mask : {"gamb_colu_arab", "gamb_colu", "arab"}, optional
        Site filters mask to apply. If not provided, no filtering is done.
    site_filters : str
        Site filters analysis version; only used when `site_mask` is given.

    Returns
    -------
    d : dask.array.Array or tuple of dask.array.Array

    """
    if field is not None:
        # single named field
        variants = self._open_snp_sites()[seq_id]["variants"]
        z = variants[field]
        ret = da.from_array(z, chunks=z.chunks)
    else:
        # POS, REF, ALT, each loaded unfiltered; any mask is applied below
        ret = tuple(
            self.snp_sites(seq_id=seq_id, field=name, site_mask=None)
            for name in ("POS", "REF", "ALT")
        )

    if site_mask is None:
        return ret

    # the filter is computed eagerly and then applied along the first axis
    loc_pass = self.site_filters(
        seq_id=seq_id, mask=site_mask, analysis=site_filters
    ).compute()
    if isinstance(ret, tuple):
        return tuple(da.compress(loc_pass, arr, axis=0) for arr in ret)
    return da.compress(loc_pass, ret, axis=0)
def load_variants(self, seq_id, field="POS", mask=None):
    """Load a variants array for `seq_id` as a dask array.

    Parameters
    ----------
    seq_id : str
        Group name used to build the ``{seq_id}/variants/{field}`` path.
    field : str
        Name of the variants array to load.
    mask : dask.array.Array, optional
        If given, rows are selected with ``da.compress(mask, arr, axis=0)``
        and the resulting chunk sizes are computed.

    Returns
    -------
    arr : dask.array.Array
    """
    sites_root = self.__class__._data_catalog.ag3.snp_sites.to_zarr()
    variants = da.from_zarr(sites_root[f"{seq_id}/variants/{field}"])

    if mask is None:
        return variants

    assert isinstance(mask, da.core.Array), "mask must be a dask_array"
    return da.compress(mask, variants, axis=0).compute_chunk_sizes()
def snp_genotypes(
    self,
    seq_id,
    sample_sets="v3_wild",
    field="GT",
    site_mask=None,
    site_filters="dt_20200416",
):
    """Access SNP genotypes and associated data.

    Parameters
    ----------
    seq_id : str
        Chromosome arm, e.g., "3R".
    sample_sets : str or list of str
        Can be a sample set identifier (e.g., "AG1000G-AO") or a list of
        sample set identifiers (e.g., ["AG1000G-BF-A", "AG1000G-BF-B"]) or a
        release identifier (e.g., "v3") or a list of release identifiers.
    field : {"GT", "GQ", "AD", "MQ"}
        Array to access.
    site_mask : {"gamb_colu_arab", "gamb_colu", "arab"}, optional
        Site filters mask to apply. If not provided, no filtering is done.
    site_filters : str, optional
        Site filters analysis version; only used when `site_mask` is given.

    Returns
    -------
    d : dask.array.Array

    """
    sample_sets = self._prep_sample_sets_arg(sample_sets=sample_sets)

    if not isinstance(sample_sets, str):
        # several sample sets: load each unfiltered, then join along axis 1
        per_set = [
            self.snp_genotypes(seq_id=seq_id, sample_sets=s, field=field)
            for s in sample_sets
        ]
        d = da.concatenate(per_set, axis=1)
    else:
        # single sample set
        calldata = self._open_snp_genotypes(sample_set=sample_sets)[seq_id]["calldata"]
        z = calldata[field]
        d = da.from_array(z, chunks=z.chunks)

    if site_mask is None:
        return d

    # the filter is computed eagerly and then applied along the first axis
    loc_pass = self.site_filters(
        seq_id=seq_id, mask=site_mask, analysis=site_filters
    ).compute()
    return da.compress(loc_pass, d, axis=0)
def test_compress():
    """Check da.compress on a cupy-backed dask array against np.compress."""
    device_data = cupy.random.randint(0, 3, size=(10, 10))
    lazy_data = da.from_array(device_data, chunks=(20, 5))
    condition = cupy.asarray([True])

    actual = da.compress(condition, lazy_data, axis=0)

    # cupy.compress is not implemented, but the dask implementation does not
    # rely on np.compress -- so move the original data back to the host and
    # compare da.compress with np.compress.
    expected = np.compress(condition.tolist(), device_data.tolist(), axis=0)
    assert_eq(expected, actual)
def test_compress():
    """Compare da.compress with np.compress and check for unknown chunks.

    Conditions are supplied as NumPy arrays, a plain list and dask arrays,
    along ``axis=None``, ``0`` and ``1``.
    """
    source = np.arange(25).reshape((5, 5))
    lazy_source = da.from_array(source, chunks=(2, 2))

    full_cond = np.array([True, False, True, False, True])
    short_cond = np.array([True, False])
    list_cond = [True, False]
    lazy_full_cond = da.from_array(full_cond, chunks=3)
    lazy_short_cond = da.from_array(short_cond, chunks=2)

    # (condition handed to NumPy, condition handed to dask)
    cases = [
        (full_cond, full_cond),
        (short_cond, short_cond),
        (list_cond, list_cond),
        (full_cond, lazy_full_cond),
        (short_cond, lazy_short_cond),
        (list_cond, lazy_short_cond),
    ]

    for np_cond, cond in cases:
        cond_is_lazy = isinstance(cond, da.Array)
        for axis in [None, 0, 1]:
            result = da.compress(cond, lazy_source, axis=axis)
            assert_eq(np.compress(np_cond, source, axis=axis), result)
            if not cond_is_lazy:
                continue
            # With a dask condition the chunk sizes along the compressed
            # dimension (dimension 0 when axis is None) are expected to be nan.
            dim = 0 if axis is None else axis
            assert np.isnan(result.chunks[dim]).all()

    # An out-of-range axis is rejected for both 1-d and 2-d conditions.
    for bad_cond in ([True, False], [[True], [False]]):
        with pytest.raises(ValueError):
            da.compress(bad_cond, lazy_source, axis=100)
def load_sample_set_calldata(self, seq_id, sample_set, field="GT", mask=None):
    """Load a calldata array for one or several sample sets as a dask array.

    Parameters
    ----------
    seq_id : str
        Group name used to build the ``{seq_id}/calldata/{field}`` path.
    sample_set : str or list of str
        A single sample set, or a list of sample sets whose arrays are
        concatenated along axis 1.
    field : str
        Name of the calldata array to load.
    mask : dask.array.Array, optional
        If given, rows are selected with ``da.compress(mask, arr, axis=0)``
        and the resulting chunk sizes are computed.

    Returns
    -------
    arr : dask.array.Array

    Raises
    ------
    ValueError
        If `sample_set` is neither a string nor a list.
    """
    if isinstance(sample_set, str):
        catalog_entry = self.__class__._data_catalog.ag3.snp_genotypes(
            sample_set=sample_set
        )
        root = catalog_entry.to_zarr()
        arr = da.from_zarr(root[f"{seq_id}/calldata/{field}"])
    elif isinstance(sample_set, list):
        # load each set unmasked; the mask is applied once after joining
        per_set = [
            self.load_sample_set_calldata(seq_id, name, field=field, mask=None)
            for name in sample_set
        ]
        arr = da.concatenate(per_set, axis=1)
    else:
        raise ValueError("sample_set must be a string, or a list of strings")

    if mask is None:
        return arr

    assert isinstance(mask, da.core.Array), "mask must be a dask_array"
    return da.compress(mask, arr, axis=0).compute_chunk_sizes()
def load_variants_array(self, seq_id, field="POS", mask=None):
    """Load a variants array from the release's "sites" zarr group.

    The group is opened from ``release_dir / "snp_genotypes" / "all" /
    "sites"`` through ``self.gcs.get_mapper``.

    Parameters
    ----------
    seq_id : str
        Group name used to build the ``{seq_id}/variants/{field}`` path.
    field : str
        Name of the variants array to load.
    mask : dask.array.Array, optional
        If given, rows are selected with ``da.compress(mask, arr, axis=0)``
        and the resulting chunk sizes are computed.

    Returns
    -------
    arr : dask.array.Array
    """
    sites_path = self.release_dir / "snp_genotypes" / "all" / "sites"

    # need to open as mapping if this on cloud
    mapper = self.gcs.get_mapper(sites_path.as_posix())
    sites_root = zarr.Group(mapper)
    variants = da.from_zarr(sites_root[f"{seq_id}/variants/{field}"])

    if mask is None:
        return variants

    assert isinstance(mask, da.core.Array), "mask must be a dask_array"
    return da.compress(mask, variants, axis=0).compute_chunk_sizes()
def load_calldata_by_sampleset(self, seq_id, sampleset, field="GT", mask=None):
    """Load a calldata array for one or several sample sets.

    Parameters
    ----------
    seq_id : str
        Group name used to build the ``{seq_id}/calldata/{field}`` path.
    sampleset : str or list of str
        A single sample set, opened from ``release_dir / "snp_genotypes" /
        "all" / sampleset``, or a list of sample sets whose arrays are
        concatenated along axis 1.
    field : str
        Name of the calldata array to load.
    mask : dask.array.Array, optional
        If given, rows are selected with ``da.compress(mask, arr, axis=0)``
        and the resulting chunk sizes are computed.

    Returns
    -------
    arr : dask.array.Array or allel.GenotypeDaskArray
        Wrapped in ``allel.GenotypeDaskArray`` when `field` is "GT".

    Raises
    ------
    ValueError
        If `sampleset` is neither a string nor a list.
    """
    if isinstance(sampleset, str):
        path = self.release_dir / "snp_genotypes" / "all" / sampleset

        # need to open as mapping if this on cloud
        storez = self.gcs.get_mapper(path.as_posix())
        calldata = zarr.Group(storez)
        arr = da.from_zarr(calldata[f"{seq_id}/calldata/{field}"])

    elif isinstance(sampleset, list):
        # load each set unmasked; the mask is applied once after joining
        # NOTE(review): for field == "GT" each recursive call returns an
        # allel.GenotypeDaskArray, so da.concatenate receives wrappers rather
        # than plain dask arrays -- confirm this is intended.
        arr = da.concatenate(
            [
                self.load_calldata_by_sampleset(seq_id, s, field=field, mask=None)
                for s in sampleset
            ],
            axis=1,
        )

    else:
        raise ValueError("sampleset must be a string, or a list of strings")

    if mask is not None:
        assert isinstance(mask, da.core.Array), "mask must be a dask_array"
        arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

    if field == "GT":
        arr = allel.GenotypeDaskArray(arr)

    return arr
def compress(self, condition, axis=None):
    """Apply ``da.compress(condition, self, axis=axis)``.

    When the result has the same number of dimensions as `self`, it is
    passed through ``view_subclass(result, type(self))`` before being
    returned; otherwise the plain result is returned.
    """
    result = da.compress(condition, self, axis=axis)
    if len(result.shape) != len(self.shape):
        # dimensionality changed (e.g. axis=None flattens): no re-wrapping
        return result
    return view_subclass(result, type(self))
def select_mask(a, mask, *, axis=0):
    """Select from `a` along `axis` using `mask` via ``da.compress``.

    `a` is first passed through ``ensure_dask_array`` and `mask` through
    ``ensure_dask_or_numpy_array``.
    """
    data = ensure_dask_array(a)
    condition = ensure_dask_or_numpy_array(mask)
    return da.compress(condition, data, axis=axis)