Example #1
import numpy as np
import pytest
import dask.array as da
from dask.array.utils import assert_eq


def test_compress():
    x = np.arange(25).reshape((5, 5))
    a = da.from_array(x, chunks=(2, 2))

    c1 = np.array([True, False, True, False, True])
    c2 = np.array([True, False])
    c3 = [True, False]
    dc1 = da.from_array(c1, chunks=3)
    dc2 = da.from_array(c2, chunks=2)

    for c, dc in [(c1, c1), (c2, c2), (c3, c3), (c1, dc1), (c2, dc2),
                  (c3, dc2)]:
        for axis in [None, 0, 1]:
            res = da.compress(dc, a, axis=axis)
            assert_eq(np.compress(c, x, axis=axis), res)
            if isinstance(dc, da.Array):
                # If condition is a dask array then we expect the size of the
                # compressed axis to be nan, because we won't know it until
                # the result is computed.
                axis = axis or 0
                assert np.isnan(res.shape[axis]).all()
                assert np.isnan(res.chunks[axis]).all()
            else:
                # If condition is not a dask array then we expect the size of the
                # compressed axis to be known, i.e., not nan.
                axis = axis or 0
                assert np.count_nonzero(dc) == res.shape[axis]
                assert not np.isnan(res.chunks[axis]).any()

    with pytest.raises(ValueError):
        da.compress([True, False], a, axis=100)

    with pytest.raises(ValueError):
        da.compress([[True], [False]], a, axis=100)
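The key behaviour exercised above is that a dask-array condition leaves the compressed dimension unknown. A minimal sketch of restoring known chunk sizes afterwards, using the standard dask API (not part of the test itself):

import numpy as np
import dask.array as da

a = da.from_array(np.arange(25).reshape((5, 5)), chunks=(2, 2))
cond = da.from_array(np.array([True, False, True, False, True]), chunks=3)

res = da.compress(cond, a, axis=0)
assert np.isnan(res.shape[0])            # size unknown until computed

res = res.compute_chunk_sizes()          # triggers a compute to fill in the sizes
assert res.shape[0] == 3                 # three True values in the condition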
Example #2
import numpy as np
import dask.array as da


def dask_compress(indexer, data, axis):
    """Wrapper for dask.array.compress() which computes chunk sizes faster."""

    # sanity checks
    assert isinstance(data, da.Array)
    assert isinstance(indexer, da.Array)
    assert isinstance(axis, int)
    assert indexer.shape[0] == data.shape[axis]
    old_chunks = data.chunks
    axis_old_chunks = old_chunks[axis]
    axis_n_chunks = len(axis_old_chunks)

    # apply the indexing operation
    v = da.compress(indexer, data, axis=axis)

    # need to compute chunk sizes in order to know dimension sizes;
    # would normally do v.compute_chunk_sizes() but that is slow for
    # multidimensional arrays, so hack something more efficient

    axis_new_chunks = tuple(
        indexer.rechunk((axis_old_chunks,))
        .map_blocks(
            lambda b: np.sum(b, keepdims=True),
            chunks=((1,) * axis_n_chunks,),
        )
        .compute()
    )
    new_chunks = tuple(
        [axis_new_chunks if i == axis else c for i, c in enumerate(old_chunks)]
    )
    v._chunks = new_chunks

    return v
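A quick usage sketch for the wrapper above (hypothetical data): rather than computing every output chunk, as v.compute_chunk_sizes() would, only one small count per indexer chunk is computed.

import numpy as np
import dask.array as da

data = da.from_array(np.arange(100).reshape((20, 5)), chunks=(5, 5))
indexer = da.from_array(np.arange(20) % 2 == 0, chunks=5)

v = dask_compress(indexer, data, axis=0)
assert v.shape == (10, 5)                         # sizes known up front
assert not any(np.isnan(c) for c in v.chunks[0])  # no nan chunks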
Example #3
    def snp_sites(self,
                  seq_id,
                  field=None,
                  site_mask=None,
                  site_filters="dt_20200416"):
        """Access SNP site data (positions and alleles).

        Parameters
        ----------
        seq_id : str
            Chromosome arm, e.g., "3R".
        field : {"POS", "REF", "ALT"}, optional
            Array to access. If not provided, all three arrays POS, REF, ALT will be returned as a tuple.
        site_mask : {"gamb_colu_arab", "gamb_colu", "arab"}
            Site filters mask to apply.
        site_filters : str
            Site filters analysis version.

        Returns
        -------
        d : dask.array.Array or tuple of dask.array.Array

        """

        if field is None:
            # return POS, REF, ALT
            ret = tuple(
                self.snp_sites(seq_id=seq_id, field=f, site_mask=None)
                for f in ("POS", "REF", "ALT"))

        else:
            root = self._open_snp_sites()
            z = root[seq_id]["variants"][field]
            ret = da.from_array(z, chunks=z.chunks)

        if site_mask is not None:
            filter_pass = self.site_filters(seq_id=seq_id,
                                            mask=site_mask,
                                            analysis=site_filters).compute()
            if isinstance(ret, tuple):
                ret = tuple(da.compress(filter_pass, d, axis=0) for d in ret)
            else:
                ret = da.compress(filter_pass, ret, axis=0)

        return ret
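A hypothetical usage sketch, assuming an instance ag3 of the class this method belongs to:

pos, ref, alt = ag3.snp_sites(seq_id="3R")
pos_pass = ag3.snp_sites(seq_id="3R", field="POS", site_mask="gamb_colu_arab")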
    def load_variants(self, seq_id, field="POS", mask=None):

        calldata = self.__class__._data_catalog.ag3.snp_sites.to_zarr()
        arr = da.from_zarr(calldata[f"{seq_id}/variants/{field}"])

        if mask is not None:
            assert isinstance(mask, da.core.Array), "mask must be a dask array"
            arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

        return arr
Example #5
    def snp_genotypes(
        self,
        seq_id,
        sample_sets="v3_wild",
        field="GT",
        site_mask=None,
        site_filters="dt_20200416",
    ):
        """Access SNP genotypes and associated data.

        Parameters
        ----------
        seq_id : str
            Chromosome arm, e.g., "3R".
        sample_sets : str or list of str
            Can be a sample set identifier (e.g., "AG1000G-AO") or a list of sample set
            identifiers (e.g., ["AG1000G-BF-A", "AG1000G-BF-B"]) or a release identifier (e.g.,
            "v3") or a list of release identifiers.
        field : {"GT", "GQ", "AD", "MQ"}
            Array to access.
        site_mask : {"gamb_colu_arab", "gamb_colu", "arab"}
            Site filters mask to apply.
        site_filters : str, optional
            Site filters analysis version.

        Returns
        -------
        d : dask.array.Array

        """

        sample_sets = self._prep_sample_sets_arg(sample_sets=sample_sets)

        if isinstance(sample_sets, str):
            # single sample set
            root = self._open_snp_genotypes(sample_set=sample_sets)
            z = root[seq_id]["calldata"][field]
            d = da.from_array(z, chunks=z.chunks)

        else:
            # concatenate multiple sample sets
            ds = [
                self.snp_genotypes(seq_id=seq_id, sample_sets=c, field=field)
                for c in sample_sets
            ]
            d = da.concatenate(ds, axis=1)

        if site_mask is not None:
            filter_pass = self.site_filters(seq_id=seq_id,
                                            mask=site_mask,
                                            analysis=site_filters).compute()
            d = da.compress(filter_pass, d, axis=0)

        return d
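A hypothetical usage sketch with the same ag3-style object: two sample sets are concatenated along the samples axis (axis=1), then the site mask compresses the variants axis (axis=0).

gt = ag3.snp_genotypes(seq_id="3R",
                       sample_sets=["AG1000G-BF-A", "AG1000G-BF-B"],
                       site_mask="gamb_colu")
# for field="GT" the result has shape (variants, samples, ploidy)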
Example #6
import cupy
import numpy as np
import dask.array as da
from dask.array.utils import assert_eq


def test_compress():
    carr = cupy.random.randint(0, 3, size=(10, 10))

    darr = da.from_array(carr, chunks=(20, 5))

    c = cupy.asarray([True])
    res = da.compress(c, darr, axis=0)

    # cupy.compress is not implemented, but the dask implementation does not
    # rely on np.compress -- move the original data back to host and
    # compare da.compress with np.compress
    assert_eq(np.compress(c.tolist(), carr.tolist(), axis=0), res)
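The same point as a minimal sketch (assumes cupy is installed): because da.compress goes through dask's own slicing machinery rather than np.compress, the computed result should keep its cupy backing.

import cupy
import dask.array as da

carr = cupy.arange(20).reshape((4, 5))
darr = da.from_array(carr, chunks=(2, 5))
res = da.compress(cupy.asarray([True, False, True, False]), darr, axis=0)
assert isinstance(res.compute(), cupy.ndarray)   # chunks stay on the device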
Example #8
import numpy as np
import pytest
import dask.array as da
from dask.array.utils import assert_eq


def test_compress():
    x = np.arange(25).reshape((5, 5))
    a = da.from_array(x, chunks=(2, 2))

    c1 = np.array([True, False, True, False, True])
    c2 = np.array([True, False])
    c3 = [True, False]
    dc1 = da.from_array(c1, chunks=3)
    dc2 = da.from_array(c2, chunks=2)

    for c, dc in [(c1, c1), (c2, c2), (c3, c3),
                  (c1, dc1), (c2, dc2), (c3, dc2)]:
        for axis in [None, 0, 1]:
            res = da.compress(dc, a, axis=axis)
            assert_eq(np.compress(c, x, axis=axis), res)
            if isinstance(dc, da.Array):
                axis = axis or 0
                assert np.isnan(res.chunks[axis]).all()

    with pytest.raises(ValueError):
        da.compress([True, False], a, axis=100)

    with pytest.raises(ValueError):
        da.compress([[True], [False]], a, axis=100)
Example #9
    def load_sample_set_calldata(self, seq_id, sample_set, field="GT", mask=None):

        if isinstance(sample_set, str):
            calldata = self.__class__._data_catalog.ag3.snp_genotypes(
                sample_set=sample_set).to_zarr()
            arr = da.from_zarr(calldata[f"{seq_id}/calldata/{field}"])

        elif isinstance(sample_set, list):
            arr = da.concatenate(
                [self.load_sample_set_calldata(seq_id, s, field=field, mask=None)
                 for s in sample_set], axis=1)
        else:
            raise ValueError("sample_set must be a string, or a list of strings")

        if mask is not None:
            assert isinstance(mask, da.core.Array), "mask must be a dask array"
            arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

        return arr
Example #10
    def load_variants_array(self, seq_id, field="POS", mask=None):
        """
        release_pa

        """

        path = self.release_dir / "snp_genotypes" / "all" / "sites"

        # need to open as a mapping if this is on cloud storage
        storez = self.gcs.get_mapper(path.as_posix())
        calldata = zarr.Group(storez)

        arr = da.from_zarr(calldata[f"{seq_id}/variants/{field}"])

        if mask is not None:
            assert isinstance(mask, da.core.Array), "mask must be a dask array"
            arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

        return arr
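The "open as a mapping" pattern used above, in isolation (hypothetical bucket path; assumes gcsfs and zarr are installed):

import gcsfs
import zarr
import dask.array as da

gcs = gcsfs.GCSFileSystem(token="anon")            # anonymous, read-only access
store = gcs.get_mapper("example-bucket/snp_genotypes/all/sites")  # hypothetical path
root = zarr.Group(store)
pos = da.from_zarr(root["3R/variants/POS"])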
Example #11
    def load_calldata_by_sampleset(self,
                                   seq_id,
                                   sampleset,
                                   field="GT",
                                   mask=None):

        if isinstance(sampleset, str):
            path = self.release_dir / "snp_genotypes" / "all" / sampleset

            # need to open as a mapping if this is on cloud storage
            storez = self.gcs.get_mapper(path.as_posix())
            calldata = zarr.Group(storez)

            arr = da.from_zarr(calldata[f"{seq_id}/calldata/{field}"])

        elif isinstance(sampleset, list):
            arr = da.concatenate([
                self.load_calldata_by_sampleset(
                    seq_id, s, field=field, mask=None)
                for s in sampleset
            ], axis=1)
        else:
            raise ValueError(
                "sampleset must be a string, or a list of strings")

        if mask is not None:
            assert isinstance(mask, da.core.Array), "mask must be a dask array"
            arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

        if field == "GT":
            arr = allel.GenotypeDaskArray(arr)

        return arr
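A hypothetical usage sketch (assumes scikit-allel is installed; the loader instance is hypothetical): with field="GT" the result comes back wrapped as an allel.GenotypeDaskArray, so scikit-allel's lazy genotype methods apply directly.

gt = loader.load_calldata_by_sampleset("3R", ["AG1000G-AO"], field="GT")
ac = gt.count_alleles(max_allele=3)   # lazy allele counts via scikit-allel
ac = ac.compute()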
Example #12
    def compress(self, condition, axis=None):
        out = da.compress(condition, self, axis=axis)
        if len(out.shape) == len(self.shape):
            out = view_subclass(out, type(self))
        return out
Example #13
def select_mask(a, mask, *, axis=0):
    a = ensure_dask_array(a)
    mask = ensure_dask_or_numpy_array(mask)
    return da.compress(mask, a, axis=axis)