Example #1
def isin(element, test_elements, assume_unique=False, invert=False):
    element = da.asarray(element)
    test_elements = da.asarray(test_elements)
    element_axes = tuple(range(element.ndim))
    test_axes = tuple(i + element.ndim for i in range(test_elements.ndim))
    mapped = da.atop(_isin_kernel, element_axes + test_axes,
                     element, element_axes,
                     test_elements, test_axes,
                     adjust_chunks={axis: lambda _: 1
                                    for axis in test_axes},
                     dtype=bool,
                     assume_unique=assume_unique)
    result = mapped.any(axis=test_axes)
    if invert:
        result = ~result
    return result
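
For context, a brief usage sketch of the public dask.array.isin API, which a blockwise implementation like the one above backs (assuming a Dask release that provides da.isin):

import numpy as np
import dask.array as da

element = da.from_array(np.arange(10), chunks=4)
mask = da.isin(element, [2, 3, 5, 7])   # lazy boolean array, same shape as `element`
print(mask.compute())                   # [False False  True  True False  True False  True False False]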
Example #2
def test_boolean_numpy_array_slicing():
    with pytest.raises(IndexError):
        da.asarray(range(2))[np.array([True])]
    with pytest.raises(IndexError):
        da.asarray(range(2))[np.array([False, False, False])]
    x = np.arange(5)
    ind = np.array([True, False, False, False, True])
    assert_eq(da.asarray(x)[ind], x[ind])
    # https://github.com/dask/dask/issues/3706
    ind = np.array([True])
    assert_eq(da.asarray([0])[ind], np.arange(1)[ind])
Example #3
def test_tile_basic(reps):
    a = da.asarray([0, 1, 2])
    b = [[1, 2], [3, 4]]

    assert_eq(np.tile(a.compute(), reps), da.tile(a, reps))
    assert_eq(np.tile(b, reps), da.tile(b, reps))
Example #4
def regenie_transform(
    G: ArrayLike,
    X: ArrayLike,
    Y: ArrayLike,
    contigs: ArrayLike,
    *,
    variant_block_size: Optional[Union[int, Tuple[int, ...]]] = None,
    sample_block_size: Optional[Union[int, Tuple[int, ...]]] = None,
    alphas: Optional[ArrayLike] = None,
    add_intercept: bool = True,
    orthogonalize: bool = False,
    normalize: bool = False,
    _glow_adj_dof: bool = False,
    _glow_adj_alpha: bool = False,
    _glow_adj_scaling: bool = False,
) -> Dataset:
    """Regenie trait transformation.

    Parameters
    ----------
    G
        [array-like, shape: (M, N)]
        Genotype data array, `M` samples by `N` variants.
    X
        [array-like, shape: (M, C)]
        Covariate array, `M` samples by `C` covariates.
    Y
        [array-like, shape: (M, O)]
        Outcome array, `M` samples by `O` outcomes.
    contigs
        [array-like, shape: (N,)]
        Variant contigs as a monotonically increasing integer contig index.

    See the `regenie` function for documentation on remaining fields.

    Returns
    -------
    A dataset containing the following variables:

    - `base_prediction` (blocks, alphas, samples, outcomes): Stage 1
        predictions from ridge regression reduction.
    - `meta_prediction` (samples, outcomes): Stage 2 predictions from
        the best meta estimator trained on the out-of-sample Stage 1
        predictions.
    - `loco_prediction` (contigs, samples, outcomes): LOCO predictions
        resulting from Stage 2 predictions ignoring effects for variant
        blocks on held out contigs. This will be absent if the
        data provided does not contain at least 2 contigs.

    Raises
    ------
    ValueError
        If `G`, `X`, and `Y` do not have the same size along
        the first (samples) dimension.
    """
    if not G.shape[0] == X.shape[0] == Y.shape[0]:
        raise ValueError(
            "All data arrays must have same size along first (samples) dimension "
            f"(shapes provided: G={G.shape}, X={X.shape}, Y={Y.shape})")
    n_sample = Y.shape[0]
    n_variant = G.shape[1]

    if alphas is not None:
        alphas = np.asarray(alphas)

    G, X, Y = da.asarray(G), da.asarray(X), da.asarray(Y)
    contigs = da.asarray(contigs)

    # Set default block sizes if not provided
    if variant_block_size is None:
        # Block in groups of 1000, unless dataset is small
        # enough to default to 2 blocks (typically for tests)
        variant_block_size = min(1000, n_variant // 2)
    if sample_block_size is None:
        # Break into 10 chunks of approximately equal size
        sample_block_size = tuple(
            split_array_chunks(n_sample, min(10, n_sample)))
        assert sum(sample_block_size) == n_sample

    if normalize:
        # See: https://github.com/projectglow/glow/issues/255
        dof = 1 if _glow_adj_dof else 0
        G = (G - G.mean(axis=0)) / G.std(axis=0, ddof=dof)
        Y = (Y - Y.mean(axis=0)) / Y.std(axis=0)
        X = (X - X.mean(axis=0)) / X.std(axis=0)

    if add_intercept:
        X = da.concatenate([da.ones((X.shape[0], 1), dtype=X.dtype), X],
                           axis=1)

    # TODO: Test this after finding out whether or not there was a good reason
    # it was precluded in glow by unit covariate regularization:
    # https://github.com/projectglow/glow/issues/266
    if orthogonalize:  # pragma: no cover
        G = G - X @ da.linalg.lstsq(X, G)[0]
        Y = Y - X @ da.linalg.lstsq(X, Y)[0]
        G = G / G.std(axis=0)
        Y = Y / Y.std(axis=0)
        X = da.zeros(shape=(n_sample, 0), dtype=G.dtype)

    variant_chunk_start, variant_chunk_size = _variant_block_indexes(
        variant_block_size, contigs)
    G = G.rechunk(chunks=(sample_block_size, tuple(variant_chunk_size)))
    X = X.rechunk(chunks=(sample_block_size, -1))
    Y = Y.rechunk(chunks=(sample_block_size, -1))

    YP1 = _stage_1(G, X, Y, alphas=alphas)
    B2, YP2 = _stage_2(
        YP1,
        X,
        Y,
        alphas=alphas,
        _glow_adj_alpha=_glow_adj_alpha,
        _glow_adj_scaling=_glow_adj_scaling,
    )
    YP3 = _stage_3(B2, YP1, X, Y, contigs, variant_chunk_start)

    data_vars: Dict[Hashable, Any] = {}
    data_vars[variables.base_prediction] = xr.DataArray(
        YP1,
        dims=("blocks", "alphas", "samples", "outcomes"),
        attrs={"description": DESC_BASE_PRED},
    )
    data_vars[variables.meta_prediction] = xr.DataArray(
        YP2,
        dims=("samples", "outcomes"),
        attrs={"description": DESC_META_PRED})
    if YP3 is not None:
        data_vars[variables.loco_prediction] = xr.DataArray(
            YP3,
            dims=("contigs", "samples", "outcomes"),
            attrs={"description": DESC_LOCO_PRED},
        )
    return create_dataset(data_vars)
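
As a standalone illustration (independent of sgkit), a minimal sketch of the lazy column standardization and block rechunking pattern used in the `normalize` and rechunk steps above; the array and block sizes are arbitrary:

import numpy as np
import dask.array as da

rs = np.random.RandomState(0)
G = da.from_array(rs.normal(size=(100, 40)), chunks=(100, 40))

# Standardize columns lazily, as in the `normalize` branch above
G = (G - G.mean(axis=0)) / G.std(axis=0)

# Rechunk rows into sample blocks and columns into variant blocks
G = G.rechunk(((25, 25, 25, 25), (10, 10, 10, 10)))
print(G.chunks)   # ((25, 25, 25, 25), (10, 10, 10, 10))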
Example #5
def count_call_alleles(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Compute per sample allele counts from genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_allele_count_spec`
    of allele counts with shape (variants, samples, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_call_alleles(ds)["call_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[1, 1],
            [0, 2]],
    <BLANKLINE>
           [[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[2, 0],
            [2, 0]]], dtype=uint8)
    """
    variables.validate(ds, {call_genotype: variables.call_genotype_spec})
    n_alleles = ds.dims["alleles"]
    G = da.asarray(ds[call_genotype])
    shape = (G.chunks[0], G.chunks[1], n_alleles)
    N = da.empty(n_alleles, dtype=np.uint8)
    new_ds = create_dataset({
        variables.call_allele_count: (
            ("variants", "samples", "alleles"),
            da.map_blocks(count_alleles,
                          G,
                          N,
                          chunks=shape,
                          drop_axis=2,
                          new_axis=2),
        )
    })
    return conditional_merge_datasets(ds, new_ds, merge)
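
The map_blocks call above passes a dummy array (N) plus explicit chunks, drop_axis and new_axis so Dask knows the shape and dtype of each output block. A generic sketch of that pattern, with a hypothetical per-block counting function:

import numpy as np
import dask.array as da

def count_values(g, _n, n_values=2):
    # count occurrences of 0 .. n_values - 1 along the trailing axis of each block
    out = np.empty(g.shape[:2] + (n_values,), dtype=np.uint8)
    for v in range(n_values):
        out[..., v] = (g == v).sum(axis=-1)
    return out

G = da.from_array(np.array([[[0, 1], [1, 1]],
                            [[0, 0], [1, 0]]]), chunks=((1, 1), 2, 2))
N = da.empty(2, dtype=np.uint8)   # dummy argument fixing the size of the new axis
counts = da.map_blocks(count_values, G, N,
                       chunks=(G.chunks[0], G.chunks[1], 2),
                       drop_axis=2, new_axis=2, dtype=np.uint8)
print(counts.compute())   # per-call counts of values 0 and 1, shape (2, 2, 2)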
Example #6
    def gradient(f, *varargs, **kwargs):
        f = da.asarray(f)

        kwargs["edge_order"] = math.ceil(kwargs.get("edge_order", 1))
        if kwargs["edge_order"] > 2:
            raise ValueError("edge_order must be less than or equal to 2.")

        drop_result_list = False
        axis = kwargs.pop("axis", None)
        if axis is None:
            axis = tuple(range(f.ndim))
        elif isinstance(axis, Integral):
            drop_result_list = True
            axis = (axis,)

        axis = validate_axis(axis, f.ndim)

        if len(axis) != len(set(axis)):
            raise ValueError("duplicate axes not allowed")

        axis = tuple(ax % f.ndim for ax in axis)

        if varargs == ():
            varargs = (1,)
        if len(varargs) == 1:
            varargs = len(axis) * varargs
        if len(varargs) != len(axis):
            raise TypeError(
                "Spacing must either be a single scalar, or a scalar / "
                "1d-array per axis"
            )

        if issubclass(f.dtype.type, (np.bool8, Integral)):
            f = f.astype(float)
        elif issubclass(f.dtype.type, Real) and f.dtype.itemsize < 4:
            f = f.astype(float)

        results = []
        for i, ax in enumerate(axis):
            for c in f.chunks[ax]:
                if np.min(c) < kwargs["edge_order"] + 1:
                    raise ValueError(
                        'Chunk size must be larger than edge_order + 1. '
                        'Minimum chunk for axis {} is {}. Rechunk to '
                        'proceed.'.format(ax, np.min(c)))

            if np.isscalar(varargs[i]):
                array_locs = None
            else:
                if isinstance(varargs[i], da.Array):
                    raise NotImplementedError(
                        'dask array coordinates are not supported.')
                # coordinate position for each block taking overlap into
                # account
                chunk = np.array(f.chunks[ax])
                array_loc_stop = np.cumsum(chunk) + 1
                array_loc_start = array_loc_stop - chunk - 2
                array_loc_stop[-1] -= 1
                array_loc_start[0] = 0
                array_locs = (array_loc_start, array_loc_stop)

            results.append(f.map_overlap(
                _gradient_kernel,
                dtype=f.dtype,
                depth={j: 1 if j == ax else 0 for j in range(f.ndim)},
                boundary="none",
                coord=varargs[i],
                axis=ax,
                array_locs=array_locs,
                grad_kwargs=kwargs,
            ))

        if drop_result_list:
            results = results[0]

        return results
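
A short usage sketch of the public dask.array.gradient wrapper (assuming a Dask release that provides it), checked against numpy.gradient:

import numpy as np
import dask.array as da

f = da.from_array(np.arange(16, dtype=float).reshape(4, 4), chunks=2)
dy, dx = da.gradient(f)   # one array per axis, mirroring np.gradient
np.testing.assert_allclose(dy.compute(), np.gradient(f.compute())[0])
print(dx.compute()[0])    # [1. 1. 1. 1.]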
Example #7
ax.set_aspect('equal')
plt.show()

# +
fig, ax = plt.subplots(figsize=[10, 10], constrained_layout=True)
base_extent = np.array(
    [-dims[1] // 2, dims[1] // 2, -dims[2] // 2, dims[2] // 2])

ax.scatter(*cpc,
           c=cropindices,
           cmap='nipy_spectral',
           zorder=5,
           linewidths=1,
           edgecolors='black')
cfac = 4
coarse_mask = da.coarsen(np.all, da.asarray(mask), {0: cfac, 1: cfac})
cropdata = da.coarsen(np.mean, data[cropindices], {1: cfac, 2: cfac}).persist()

xlim, ylim = np.array([ax.get_xlim(), ax.get_ylim()])
vmin, vmax = da.nanmin(cropdata).compute(), da.nanmax(cropdata).compute()
for i in range(len(cropdata)):
    plt.imshow(
        np.where(coarse_mask, cropdata[i], np.nan).T,
        extent=base_extent +
        np.array([cpc[0, i], cpc[0, i], cpc[1, i], cpc[1, i]]),
        origin='lower',
        #alpha=0.5,
        cmap='gray',
        vmin=vmin,
        vmax=vmax,
    )
aligned_prior_fluxes = aligned_prior_fluxes.transpose("flux_time", "dim_y",
                                                      "dim_x", "realization")
aligned_influences = aligned_influences.transpose("observation", "flux_time",
                                                  "dim_y", "dim_x")
write_progress_message("Rechunked to square")
aligned_influences = aligned_influences.fillna(0)
aligned_true_fluxes.astype(np.float32).load()
aligned_prior_fluxes.astype(np.float32).load()
aligned_influences.astype(np.float32).load()
write_progress_message("Loaded data")

# 23 min for seven towers over a month
# 11 min next run
# This includes the realignment time deferred with dask above
sparse_influences = sparse.COO(aligned_influences.values)
aligned_influences.data = da.asarray(sparse_influences)

print(datetime.datetime.now(UTC).strftime("%c"), "Converted to COO")
flush_output_streams()

posterior_var_atts = aligned_prior_fluxes.attrs.copy()
posterior_var_atts.update(
    dict(
        long_name="posterior_fluxes",
        units=PRIOR_FLUXES_MATCHED[PRIOR_FLUX_NAME].attrs["units"],
        description="posterior fluxes using dask for a month",
        origin="OI using dask for a month",
        prior_flux_name=PRIOR_FLUX_NAME,
        flux_window=FLUX_WINDOW,
        observation_window=OBS_WINDOW,
        ancillary_variables="reduced_posterior_covariance",
Example #9
def test_unique():
    a = [1, 2, 2]
    assert_allclose(unique(a), [1, 2])

    a = da.asarray(a)
    assert_allclose(unique(a), [1, 2])
Example #10
    def test_inverse_transform(self, array):

        a = dpp.LabelEncoder()
        assert_eq_ar(a.inverse_transform(a.fit_transform(array)),
                     da.asarray(array))
Example #11
def individual_heterozygosity(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute per call individual heterozygosity.

    Individual heterozygosity is the probability that two alleles
    drawn at random without replacement, from an individual at a
    given site, are not identical in state. Therefore, individual
    heterozygosity is defined for diploid and polyploid calls but
    will return nan in the case of haploid calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.
    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_heterozygosity_spec`
    of per genotype observed heterozygosity with shape (variants, samples)
    containing values within the interval [0, 1] or nan if ploidy < 2.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.individual_heterozygosity(ds)["call_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1., 1.],
           [1., 0.],
           [1., 1.],
           [0., 0.]])
    """
    ds = define_variable_if_absent(ds, variables.call_allele_count,
                                   call_allele_count, count_call_alleles)
    variables.validate(ds,
                       {call_allele_count: variables.call_allele_count_spec})

    AC = da.asarray(ds.call_allele_count)
    K = AC.sum(axis=-1)
    # use nan denominator to avoid divide by zero with K - 1
    K2 = da.where(K > 1, K, np.nan)
    AF = AC / K2[..., None]
    HI = (1 - da.sum(AF**2, axis=-1)) * (K / (K2 - 1))
    new_ds = create_dataset(
        {variables.call_heterozygosity: (("variants", "samples"), HI)})
    return conditional_merge_datasets(ds, new_ds, merge)
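
A quick numpy check of the allele-frequency arithmetic above, using one heterozygous, one homozygous and one haploid call:

import numpy as np

AC = np.array([[1, 1], [0, 2], [1, 0]], dtype=float)   # het, hom-alt, haploid
K = AC.sum(axis=-1)
K2 = np.where(K > 1, K, np.nan)        # nan denominator avoids divide by zero
AF = AC / K2[..., None]
HI = (1 - np.sum(AF ** 2, axis=-1)) * (K / (K2 - 1))
print(HI)                              # [ 1.  0. nan]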
Example #12
def test_asarray():
    y = da.asarray(xr.DataArray([1, 2, 3.0]))
    assert isinstance(y, da.Array)
    assert_eq(y, y)
Example #13
def segment(
    image,
    channels,
    model_type,
    diameter,
    fast_mode=False,
    use_anisotropy=True,
    iou_depth=2,
    iou_threshold=0.7,
):
    """Use cellpose to segment nuclei in fluorescence data.

    Parameters
    ----------
    image : array of shape (z, y, x, channel)
        Image used for detection of objects
    channels : array of int with size 2
        See cellpose
    model_type : str
        "cyto" or "nuclei"
    diameter : tuple of size 3
        Approximate diameter (in pixels) of a segmented region, i.e. cell width
    fast_mode : bool
        In fast mode, network averaging, tiling, and augmentation are turned off.
    use_anisotropy : bool
        If true, use anisotropy parameter of cellpose
    iou_depth: dask depth parameter
        Number of pixels of overlap to use in intersection-over-union calculation when
        linking segments across neighboring, overlapping dask chunk regions.
    iou_threshold: float
        Minimum intersection-over-union in neighboring, overlapping dask chunk regions
        to be considered the same segment.  The region for calculating IOU is given by the
        iou_depth parameter.

    Returns:
        segments : array of int32 with same shape as input
            Each segmented cell is assigned a number and all its pixels contain that value (0 is background)
    """
    assert image.ndim == 4, image.ndim
    assert image.shape[-1] in {1, 2}, image.shape
    assert diameter[1] == diameter[2], diameter

    diameter_yx = diameter[1]
    anisotropy = diameter[0] / diameter[1] if use_anisotropy else None

    image = da.asarray(image)
    image = image.rechunk({-1: -1})  # color channel is chunked together

    depth = tuple(np.ceil(diameter).astype(np.int64))
    boundary = "reflect"

    # No chunking in channel direction
    image = da.overlap.overlap(image, depth + (0, ), boundary)

    block_iter = zip(
        np.ndindex(*image.numblocks),
        map(
            functools.partial(operator.getitem, image),
            da.core.slices_from_chunks(image.chunks),
        ),
    )

    labeled_blocks = np.empty(image.numblocks[:-1], dtype=object)
    total = None
    for index, input_block in block_iter:
        labeled_block, n = dask.delayed(segment_chunk, nout=2)(
            input_block,
            channels,
            model_type,
            diameter_yx,
            anisotropy,
            fast_mode,
            index,
        )

        shape = input_block.shape[:-1]
        labeled_block = da.from_delayed(labeled_block,
                                        shape=shape,
                                        dtype=np.int32)

        n = dask.delayed(np.int32)(n)
        n = da.from_delayed(n, shape=(), dtype=np.int32)

        total = n if total is None else total + n

        block_label_offset = da.where(labeled_block > 0, total, np.int32(0))
        labeled_block += block_label_offset

        labeled_blocks[index[:-1]] = labeled_block
        total += n

    # Put all the blocks together
    block_labeled = da.block(labeled_blocks.tolist())

    depth = da.overlap.coerce_depth(len(depth), depth)

    if np.prod(block_labeled.numblocks) > 1:
        iou_depth = da.overlap.coerce_depth(len(depth), iou_depth)

        if any(iou_depth[ax] > depth[ax] for ax in depth.keys()):
            raise DistSegError("iou_depth (%s) > depth (%s)" %
                               (iou_depth, depth))

        trim_depth = {k: depth[k] - iou_depth[k] for k in depth.keys()}
        block_labeled = da.overlap.trim_internal(block_labeled,
                                                 trim_depth,
                                                 boundary=boundary)
        block_labeled = link_labels(
            block_labeled,
            total,
            iou_depth,
            iou_threshold=iou_threshold,
        )

        block_labeled = da.overlap.trim_internal(block_labeled,
                                                 iou_depth,
                                                 boundary=boundary)

    else:
        block_labeled = da.overlap.trim_internal(block_labeled,
                                                 depth,
                                                 boundary=boundary)

    return block_labeled
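
Independent of cellpose, a minimal sketch of the overlap/trim round trip that the chunk handling above builds on:

import numpy as np
import dask.array as da

x = da.from_array(np.arange(36).reshape(6, 6), chunks=3)
xo = da.overlap.overlap(x, depth=(1, 1), boundary="reflect")   # 1-pixel halo per block
print(xo.chunks)                                               # ((5, 5), (5, 5))
xt = da.overlap.trim_internal(xo, {0: 1, 1: 1}, boundary="reflect")
print(bool((xt == x).all().compute()))                         # True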
Example #14
def dask_hist1d(
    a: Array, bins=None, range=None, normed=False, weights=None, density=None
):
    """
    Blocked variant of :func:`numpy.histogram`, but using the fast-histogram module.

    Parameters
    ----------
    a : array_like
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars, optional
        Either an iterable specifying the ``bins`` or the number of ``bins``
        and a ``range`` argument is required as computing ``min`` and ``max``
        over blocked arrays is an expensive operation that must be performed
        explicitly.
        If `bins` is an int, it defines the number of equal-width
        bins in the given range (10, by default). If `bins` is a
        sequence, it defines a monotonically increasing array of bin edges,
        including the rightmost edge, allowing for non-uniform bin widths.
    range : (float, float), optional
        The lower and upper range of the bins.  If not provided, range
        is simply ``(a.min(), a.max())``.  Values outside the range are
        ignored. The first element of the range must be less than or
        equal to the second. `range` affects the automatic bin
        computation as well. While bin width is computed to be optimal
        based on the actual data within `range`, the bin count will fill
        the entire range including portions containing no data.
    normed : bool, optional
        This is equivalent to the ``density`` argument, but produces incorrect
        results for unequal bin widths. It should not be used.
    weights : array_like, optional
        A dask.array.Array of weights, of the same block structure as ``a``.  Each value in
        ``a`` only contributes its associated weight towards the bin count
        (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unity
        width are chosen; it is not a probability *mass* function.
        Overrides the ``normed`` keyword if given.
        If ``density`` is True, ``bins`` cannot be a single-number delayed
        value. It must be a concrete number, or a (possibly-delayed)
        array/sequence of the bin edges.
    Returns
    -------
    hist : dask Array
        The values of the histogram. See `density` and `weights` for a
        description of the possible semantics.
    bin_edges : dask Array of dtype float
        Return the bin edges ``(length(hist)+1)``.


    Examples
    --------
    Using number of bins and range:

    >>> import dask.array as da
    >>> import numpy as np
    >>> x = da.from_array(np.arange(10000), chunks=10)
    >>> h, bins = da.histogram(x, bins=10, range=[0, 10000])
    >>> bins
    array([    0.,  1000.,  2000.,  3000.,  4000.,  5000.,  6000.,  7000.,
            8000.,  9000., 10000.])
    >>> h.compute()
    array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])

    Explicitly specifying the bins:

    >>> h, bins = da.histogram(x, bins=np.array([0, 5000, 10000]))
    >>> bins
    array([    0,  5000, 10000])
    >>> h.compute()
    array([5000, 5000])
    """
    if isinstance(bins, Array):
        scalar_bins = bins.ndim == 0
        # ^ `np.ndim` is not implemented by Dask array.
    elif isinstance(bins, Delayed):
        scalar_bins = bins._length is None or bins._length == 1
    else:
        scalar_bins = np.ndim(bins) == 0

    if bins is None or (scalar_bins and range is None):
        raise ValueError(
            "dask.array.histogram requires either specifying "
            "bins as an iterable or specifying both a range and "
            "the number of bins"
        )

    if weights is not None and weights.chunks != a.chunks:
        raise ValueError("Input array and weights must have the same chunked structure")

    if normed is not False:
        raise ValueError(
            "The normed= keyword argument has been deprecated. "
            "Please use density instead. "
            "See the numpy.histogram docstring for more information."
        )

    if density and scalar_bins and isinstance(bins, (Array, Delayed)):
        raise NotImplementedError(
            "When `density` is True, `bins` cannot be a scalar Dask object. "
            "It must be a concrete number or a (possibly-delayed) array/sequence of bin edges."
        )

    for argname, val in [("bins", bins), ("range", range), ("weights", weights)]:
        if not isinstance(val, (Array, Delayed)) and is_dask_collection(val):
            raise TypeError(
                "Dask types besides Array and Delayed are not supported "
                "for `histogram`. For argument `{}`, got: {!r}".format(argname, val)
            )

    if range is not None:
        try:
            if len(range) != 2:
                raise ValueError(
                    f"range must be a sequence or array of length 2, but got {len(range)} items"
                )
            if isinstance(range, (Array, np.ndarray)) and range.shape != (2,):
                raise ValueError(
                    f"range must be a 1-dimensional array of two items, but got an array of shape {range.shape}"
                )
        except TypeError:
            raise TypeError(
                f"Expected a sequence or array for range, not {range}"
            ) from None

    token = tokenize(a, bins, range, weights, density)
    name = "histogram-sum-" + token

    if scalar_bins:
        bins = _linspace_from_delayed(range[0], range[1], bins + 1)
        # ^ NOTE `range[1]` is safe because of the above check, and the initial check
        # that range must not be None if `scalar_bins`
    else:
        if not isinstance(bins, (Array, np.ndarray)):
            bins = asarray(bins)
        if bins.ndim != 1:
            raise ValueError(
                f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}"
            )

    (bins_ref, range_ref), deps = unpack_collections([bins, range])

    # Map the histogram to all bins, forming a 2D array of histograms, stacked for each chunk
    if weights is None:
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref)
            for i, k in enumerate(flatten(a.__dask_keys__()))
        }
        dtype = np.histogram([])[0].dtype
    else:
        a_keys = flatten(a.__dask_keys__())
        w_keys = flatten(weights.__dask_keys__())
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref, w)
            for i, (k, w) in enumerate(zip(a_keys, w_keys))
        }
        dtype = weights.dtype

    deps = (a,) + deps
    if weights is not None:
        deps += (weights,)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=deps)

    # Turn graph into a 2D Array of shape (nchunks, nbins)
    nchunks = len(list(flatten(a.__dask_keys__())))
    nbins = bins.size - 1  # since `bins` is 1D
    chunks = ((1,) * nchunks, (nbins,))
    mapped = Array(graph, name, chunks, dtype=dtype)

    # Sum over chunks to get the final histogram
    n = mapped.sum(axis=0)

    # We need to replicate normed and density options from numpy
    if density is not None:
        if density:
            db = asarray(np.diff(bins).astype(float), chunks=n.chunks)
            return n / db / n.sum(), bins
        else:
            return n, bins
    else:
        return n, bins
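
A hedged usage sketch of the function above, assuming its helper _block_fast_hist1d (referenced in the graph construction) and the fast-histogram dependency are available from the same module:

import numpy as np
import dask.array as da

x = da.from_array(np.random.RandomState(0).normal(size=10_000), chunks=1_000)
h, edges = dask_hist1d(x, bins=50, range=(-4, 4))
print(h.compute().sum())   # close to 10000; values outside `range` are dropped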
Example #15
def observed_heterozygosity(
    ds: Dataset,
    *,
    call_heterozygosity: Hashable = variables.call_heterozygosity,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort observed heterozygosity.

    The observed heterozygosity of a cohort is the mean of individual
    heterozygosity values among all samples of that cohort as described
    in :func:`individual_heterozygosity`. Calls with a nan value for
    individual heterozygosity are ignored when calculating the cohort
    mean.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling
    this function.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_heterozygosity
        Input variable name holding call_heterozygosity as defined by
        :data:`sgkit.variables.call_heterozygosity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`individual_heterozygosity`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.
    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_observed_heterozygosity_spec`
    of per cohort observed heterozygosity with shape (variants, cohorts)
    containing values within the interval [0, 1] or nan.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.5, 1. ],
        [1. , 0.5],
        [0. , 1. ],
        [0.5, 0.5],
        [0.5, 0.5]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1.5, 2.5],
        [1. , 1. ]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_heterozygosity,
        call_heterozygosity,
        individual_heterozygosity,
    )
    variables.validate(
        ds, {call_heterozygosity: variables.call_heterozygosity_spec})
    hi = da.asarray(ds[call_heterozygosity])
    sc = da.asarray(ds[sample_cohort])
    n_cohorts = sc.max().compute() + 1
    shape = (hi.chunks[0], n_cohorts)
    n = da.zeros(n_cohorts, dtype=np.uint8)
    ho = da.map_blocks(
        _cohort_observed_heterozygosity,
        hi,
        sc,
        n,
        chunks=shape,
        drop_axis=1,
        new_axis=1,
        dtype=np.float64,
    )
    if has_windows(ds):
        ho_sum = window_statistic(
            ho,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=ho.dtype,
            axis=0,
        )
        new_ds = create_dataset({
            variables.stat_observed_heterozygosity: (
                ("windows", "cohorts"),
                ho_sum,
            )
        })
    else:
        new_ds = create_dataset({
            variables.stat_observed_heterozygosity: (
                ("variants", "cohorts"),
                ho,
            )
        })
    return conditional_merge_datasets(ds, new_ds, merge)
Example #16
def pbs(
    ds: Dataset,
    *,
    stat_Fst: Hashable = variables.stat_Fst,
    cohorts: Optional[Sequence[Union[Tuple[int, int, int],
                                     Tuple[str, str, str]]]] = None,
    merge: bool = True,
) -> Dataset:
    """Compute the population branching statistic (PBS) between cohort triples.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    stat_Fst
        Fst variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_Fst_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`Fst`.
    cohorts
        The cohort triples to compute statistics for, specified as a sequence of
        tuples of cohort indexes or IDs. None (the default) means compute statistics
        for all cohorts.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the PBS value between cohort triples, as defined by
    :data:`sgkit.variables.stat_pbs_spec`.
    Shape (variants, cohorts, cohorts, cohorts), or
    (windows, cohorts, cohorts, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=6)

    >>> # Divide samples into three named cohorts
    >>> n_cohorts = 3
    >>> sample_cohort = np.repeat(range(n_cohorts), ds.dims["samples"] // n_cohorts)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")
    >>> cohort_names = [f"co_{i}" for i in range(n_cohorts)]
    >>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})

    >>> # Divide into two windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
    array([ 0.      , -0.160898])
    """

    ds = define_variable_if_absent(ds, variables.stat_Fst, stat_Fst, Fst)
    variables.validate(ds, {stat_Fst: variables.stat_Fst_spec})

    fst = ds[variables.stat_Fst]
    fst = fst.clip(min=0, max=(1 - np.finfo(float).epsneg))

    t = -np.log(1 - fst)
    n_cohorts = ds.dims["cohorts"]
    n_windows = ds.dims["windows"]
    assert_array_shape(t, n_windows, n_cohorts, n_cohorts)

    # calculate PBS triples
    t = da.asarray(t)
    shape = (t.chunks[0], n_cohorts, n_cohorts, n_cohorts)

    cohorts = cohorts or list(itertools.combinations(range(n_cohorts),
                                                     3))  # type: ignore
    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts_0", None))

    p = da.map_blocks(lambda t: _pbs_cohorts(t, ct),
                      t,
                      chunks=shape,
                      new_axis=3,
                      dtype=np.float64)
    assert_array_shape(p, n_windows, n_cohorts, n_cohorts, n_cohorts)

    new_ds = create_dataset({
        variables.stat_pbs:
        (["windows", "cohorts_0", "cohorts_1", "cohorts_2"], p)
    })
    return conditional_merge_datasets(ds, new_ds, merge)
Example #17
    def test_lazy_nop(self):
        src = self.realistic_cube[:2, :3, :10, :10]
        src.data = da.asarray(src.data, chunks=((1, 1), (2, 1), (10,), (10,)))
        res = regrid_area_weighted(src, src)
        self.assertTrue(res.has_lazy_data())
        self.assertEqual(res, src)
Example #18
results = []
for variable_name, variable in ds.data_vars.items():
    y = delayed(convert_variable)(variable_name, variable)
    results.append(y)

total = delayed(sum)(results)
total.visualize()
# +
# %%time

print(total.compute(), "data variables converted")
# -

for variable_name in ["lat", "lon", "prob"]:
    variable = ds[variable_name]
    zarr_path = target_path.joinpath(variable_name)
    if zarr_path.exists():
        shutil.rmtree(zarr_path)
    da.asarray(variable.data).to_zarr(str(zarr_path))

variable_name = "time"
time = ds[variable_name]
time_in_days = np.array(time[0], dtype="datetime64[D]") + np.arange(
    (len(time) - 1) * days_per_month + 1)
time_in_days.shape

zarr_path = target_path.joinpath(variable_name)
if zarr_path.exists():
    shutil.rmtree(zarr_path)
da.from_array(time_in_days).to_zarr(str(zarr_path))
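
Reading a stored variable back is the mirror of the writes above; a minimal sketch, with the zarr directory name purely illustrative:

import pathlib
import dask.array as da

target_path = pathlib.Path("output.zarr")             # illustrative path, not from the original script
lat = da.from_zarr(str(target_path.joinpath("lat")))  # lazy read of a previously written array
print(lat.shape, lat.dtype)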
Example #19
def gwas_linear_regression(
    ds: Dataset,
    *,
    dosage: Hashable,
    covariates: Union[Hashable, Sequence[Hashable]],
    traits: Union[Hashable, Sequence[Hashable]],
    add_intercept: bool = True,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Run linear regression to identify continuous trait associations with genetic variants.

    This method solves OLS regressions for each variant simultaneously and reports
    effect statistics as defined in [1]. This is facilitated by the removal of
    sample (i.e. person/individual) covariates through orthogonal projection
    of both the genetic variant and phenotype data [2]. A consequence of this
    rotation is that effect sizes and significances cannot be reported for
    covariates, only variants.

    Parameters
    ----------
    ds
        Dataset containing necessary dependent and independent variables.
    dosage
        Name of genetic dosage variable.
        Defined by :data:`sgkit.variables.dosage_spec`.
    covariates
        Names of covariate variables (1D or 2D).
        Defined by :data:`sgkit.variables.covariates_spec`.
    traits
        Names of trait variables (1D or 2D).
        Defined by :data:`sgkit.variables.traits_spec`.
    add_intercept
        Add intercept term to covariate set, by default True.
    call_genotype
        Input variable name holding call_genotype.
        Defined by :data:`sgkit.variables.call_genotype_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Warnings
    --------
    Regression statistics from this implementation are only valid when an
    intercept is present. The `add_intercept` flag is a convenience for adding one
    when not already present, but there is currently no parameterization for
    intercept-free regression.

    Additionally, both covariate and trait arrays will be rechunked to have blocks
    along the sample (row) dimension but not the column dimension (i.e.
    they must be tall and skinny).

    Returns
    -------
    Dataset containing (N = num variants, O = num traits):

    variant_linreg_beta : [array-like, shape: (N, O)]
        Beta values associated with each variant and trait
    variant_linreg_t_value : [array-like, shape: (N, O)]
        T statistics for each beta
    variant_linreg_p_value : [array-like, shape: (N, O)]
        P values as float in [0, 1]

    References
    ----------
    - [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements
        of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition.
        Springer Science & Business Media.
    - [2] Loh, Po-Ru, George Tucker, Brendan K. Bulik-Sullivan, Bjarni J. Vilhjálmsson,
        Hilary K. Finucane, Rany M. Salem, Daniel I. Chasman, et al. 2015. “Efficient
        Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.”
        Nature Genetics 47 (3): 284–90.

    """
    if isinstance(covariates, Hashable):
        covariates = [covariates]
    if isinstance(traits, Hashable):
        traits = [traits]

    variables.validate(
        ds,
        {dosage: variables.dosage_spec},
        {c: variables.covariates_spec for c in covariates},
        {t: variables.traits_spec for t in traits},
    )

    G = _get_loop_covariates(ds, dosage=dosage, call_genotype=call_genotype)

    if len(covariates) == 0:
        if add_intercept:
            X = da.ones((ds.dims["samples"], 1), dtype=np.float32)
        else:
            raise ValueError("add_intercept must be True if no covariates specified")
    else:
        X = da.asarray(concat_2d(ds[list(covariates)], dims=("samples", "covariates")))
        if add_intercept:
            X = da.concatenate([da.ones((X.shape[0], 1), dtype=X.dtype), X], axis=1)
    # Note: dask qr decomp (used by lstsq) requires no chunking in one
    # dimension, and because dim 0 will be far greater than the number
    # of covariates for the large majority of use cases, chunking
    # should be removed from dim 1. Also, dim 0 should have the same chunking
    # as G dim 1, so that when XLP is computed in linear_regression() the
    # two arrays have the same chunking.
    X = X.rechunk((G.chunksize[1], -1))

    Y = da.asarray(concat_2d(ds[list(traits)], dims=("samples", "traits")))
    # Like covariates, traits must also be tall-skinny arrays
    Y = Y.rechunk((None, -1))

    res = linear_regression(G.T, X, Y)
    new_ds = create_dataset(
        {
            variables.variant_linreg_beta: (("variants", "traits"), res.beta),
            variables.variant_linreg_t_value: (("variants", "traits"), res.t_value),
            variables.variant_linreg_p_value: (("variants", "traits"), res.p_value),
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)
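
A minimal end-to-end sketch, assuming the public sgkit entry points (sg.simulate_genotype_call_dataset, sg.gwas_linear_regression); the dosage, covariate and trait variable names are illustrative, not part of the API:

import numpy as np
import xarray as xr
import sgkit as sg

ds = sg.simulate_genotype_call_dataset(n_variant=10, n_sample=20, seed=0)
rng = np.random.default_rng(0)
ds["call_dosage"] = ds["call_genotype"].sum(dim="ploidy").astype("float64")
ds["sample_covariate"] = xr.DataArray(rng.normal(size=20), dims="samples")
ds["sample_trait"] = xr.DataArray(rng.normal(size=20), dims="samples")
res = sg.gwas_linear_regression(ds, dosage="call_dosage",
                                covariates="sample_covariate",
                                traits="sample_trait")
print(res["variant_linreg_p_value"].shape)   # (10, 1)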
Example #20
def hardy_weinberg_test(ds: Dataset,
                        *,
                        genotype_count: Optional[Hashable] = None,
                        ploidy: Optional[int] = None,
                        alleles: Optional[int] = None,
                        merge: bool = True) -> Dataset:
    """Exact test for HWE as described in Wigginton et al. 2005 [1].

    Parameters
    ----------
    ds
        Dataset containing genotype calls or precomputed genotype counts.
    genotype_count
        Name of variable containing precomputed genotype counts, by default
        None. If not provided, these counts will be computed automatically
        from genotype calls. If present, must correspond to an (`N`, 3) array
        where `N` is equal to the number of variants and the 3 columns contain
        heterozygous, homozygous reference, and homozygous alternate counts
        (in that order) across all samples for a variant.
    ploidy
        Genotype ploidy, defaults to ``ploidy`` dimension of provided dataset.
        If the `ploidy` dimension is not present, then this value must be set explicitly.
        Currently HWE calculations are only supported for diploid datasets,
        i.e. ``ploidy`` must equal 2.
    alleles
        Genotype allele count, defaults to ``alleles`` dimension of provided dataset.
        If the `alleles` dimension is not present, then this value must be set explicitly.
        Currently HWE calculations are only supported for biallelic datasets,
        i.e. ``alleles`` must equal 2.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.

    Returns
    -------
    Dataset containing (N = num variants):

    variant_hwe_p_value : [array-like, shape: (N,)]
        P values from HWE test for each variant as float in [0, 1].

    References
    ----------
    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
        Human Genetics 76 (5): 887–93.

    Raises
    ------
    NotImplementedError
        If ploidy of provided dataset != 2
    NotImplementedError
        If maximum number of alleles in provided dataset != 2
    """
    ploidy = ploidy or ds.dims.get("ploidy")
    if not ploidy:
        raise ValueError(
            "`ploidy` parameter must be set when not present as dataset dimension."
        )
    if ploidy != 2:
        raise NotImplementedError(
            "HWE test only implemented for diploid genotypes")

    alleles = alleles or ds.dims.get("alleles")
    if not alleles:
        raise ValueError(
            "`alleles` parameter must be set when not present as dataset dimension."
        )
    if alleles != 2:
        raise NotImplementedError(
            "HWE test only implemented for biallelic genotypes")

    # Use precomputed genotype counts if provided
    if genotype_count is not None:
        variables.validate(ds, {genotype_count: variables.genotype_count_spec})
        obs = list(da.asarray(ds[genotype_count]).T)
    # Otherwise compute genotype counts from calls
    else:
        ds = count_genotypes(ds, dim="samples")
        obs = [
            da.asarray(ds[v]) for v in
            ["variant_n_het", "variant_n_hom_ref", "variant_n_hom_alt"]
        ]
    p = da.map_blocks(hardy_weinberg_p_value_vec_jit, *obs)
    new_ds = create_dataset({variables.variant_hwe_p_value: ("variants", p)})
    return conditional_merge_datasets(ds, new_ds, merge)
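
A minimal usage sketch, assuming the public sgkit entry point and the simulated diploid, biallelic dataset used in the other examples:

import sgkit as sg

ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=10, seed=3)
p = sg.hardy_weinberg_test(ds)["variant_hwe_p_value"].values
print(p.shape)   # (5,)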
Example #21
def linear_regression(
    XL: ArrayLike, XC: ArrayLike, Y: ArrayLike
) -> LinearRegressionResult:
    """Efficient linear regression estimation for multiple covariate sets

    Parameters
    ----------
    XL
        [array-like, shape: (M, N)]
        "Loop" covariates for which N separate regressions will be run
    XC
        [array-like, shape: (M, P)]
        "Core" covariates included in the regressions for each loop
        covariate. All P core covariates are used in each of the N
        loop covariate regressions.
    Y
        [array-like, shape: (M, O)]
        Continuous outcomes

    Returns
    -------
    Dataclass containing:

    beta : [array-like, shape: (N, O)]
        Beta values associated with each loop covariate and outcome
    t_value : [array-like, shape: (N, O)]
        T statistics for each beta
    p_value : [array-like, shape: (N, O)]
        P values as float in [0, 1]
    """
    XL, XC = da.asarray(XL), da.asarray(XC)  # Coerce for `lstsq`
    if set([x.ndim for x in [XL, XC, Y]]) != {2}:
        raise ValueError("All arguments must be 2D")
    n_core_covar, n_loop_covar, n_obs, n_outcome = (
        XC.shape[1],
        XL.shape[1],
        Y.shape[0],
        Y.shape[1],
    )
    dof = n_obs - n_core_covar - 1
    if dof < 1:
        raise ValueError(
            "Number of observations (N) too small to calculate sampling statistics. "
            "N must be greater than number of core covariates (C) plus one. "
            f"Arguments provided: N={n_obs}, C={n_core_covar}."
        )

    # Apply orthogonal projection to eliminate core covariates
    # Note: QR factorization or SVD should be used here to find
    # what are effectively OLS residuals rather than matrix inverse
    # to avoid need for MxM array; additionally, dask.lstsq fails
    # with numpy arrays
    LS = XC @ da.linalg.lstsq(XC, XL)[0]
    assert XL.chunksize == LS.chunksize
    XLP = XL - LS
    assert XLP.shape == (n_obs, n_loop_covar)
    YP = Y - XC @ da.linalg.lstsq(XC, Y)[0]
    assert YP.shape == (n_obs, n_outcome)

    # Estimate coefficients for each loop covariate
    # Note: A key assumption here is that 0-mean residuals
    # from projection require no extra terms in variance
    # estimate for loop covariates (columns of G), which is
    # only true when an intercept is present.
    XLPS = (XLP ** 2).sum(axis=0, keepdims=True).T
    assert XLPS.shape == (n_loop_covar, 1)
    B = (XLP.T @ YP) / XLPS
    assert B.shape == (n_loop_covar, n_outcome)

    # Compute residuals for each loop covariate and outcome separately
    YR = YP[:, np.newaxis, :] - XLP[..., np.newaxis] * B[np.newaxis, ...]
    assert YR.shape == (n_obs, n_loop_covar, n_outcome)
    RSS = (YR ** 2).sum(axis=0)
    assert RSS.shape == (n_loop_covar, n_outcome)
    # Get t-statistics for coefficient estimates
    T = B / np.sqrt(RSS / dof / XLPS)
    assert T.shape == (n_loop_covar, n_outcome)
    # Match to p-values
    # Note: t dist not implemented in Dask so this must be delayed,
    # see https://github.com/dask/dask/issues/6857
    P = da.map_blocks(
        lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64"
    )
    assert P.shape == (n_loop_covar, n_outcome)

    return LinearRegressionResult(beta=B, t_value=T, p_value=P)
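
A small sanity check of the projection logic, assuming the function above is in scope: with an intercept as the only core covariate, the estimate should match an ordinary least-squares fit on [intercept, loop covariate] (the Frisch-Waugh-Lovell result):

import numpy as np
import dask.array as da

rs = np.random.RandomState(0)
n = 100
XL = rs.normal(size=(n, 1))            # one loop covariate
XC = np.ones((n, 1))                   # core covariate: intercept only
Y = 2.0 * XL + rs.normal(size=(n, 1))

res = linear_regression(da.asarray(XL), da.asarray(XC), da.asarray(Y))
beta_direct = np.linalg.lstsq(np.hstack([XC, XL]), Y, rcond=None)[0][1, 0]
print(float(res.beta.compute()[0, 0]), beta_direct)   # approximately equal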
Example #22
def test_ms_create(tmp_path, chunks, num_chans, corr_types, sources):
    # Set up
    rs = np.random.RandomState(42)

    ms_path = tmp_path / "create.ms"

    ms_table_name = str(ms_path)
    ant_table_name = "::".join((ms_table_name, "ANTENNA"))
    ddid_table_name = "::".join((ms_table_name, "DATA_DESCRIPTION"))
    pol_table_name = "::".join((ms_table_name, "POLARIZATION"))
    spw_table_name = "::".join((ms_table_name, "SPECTRAL_WINDOW"))
    # SOURCE is an optional MS sub-table
    src_table_name = "::".join((ms_table_name, "SOURCE"))

    ms_datasets = []
    ant_datasets = []
    ddid_datasets = []
    pol_datasets = []
    spw_datasets = []
    src_datasets = []

    # For comparison
    all_data_desc_id = []
    all_data = []

    # Create ANTENNA dataset of 64 antennas
    # Each column in the ANTENNA has a fixed shape so we
    # can represent all rows with one dataset
    na = 64
    position = da.random.random((na, 3)) * 10000
    offset = da.random.random((na, 3))
    names = np.array(['ANTENNA-%d' % i for i in range(na)], dtype=np.object)
    ds = Dataset({
        'POSITION': (("row", "xyz"), position),
        'OFFSET': (("row", "xyz"), offset),
        'NAME': (("row", ), da.from_array(names, chunks=na)),
    })
    ant_datasets.append(ds)

    # Create SOURCE datasets
    for s, (name, direction, rest_freq) in enumerate(sources):
        dask_num_lines = da.full((1, ), len(rest_freq), dtype=np.int32)
        dask_direction = da.asarray(direction)[None, :]
        dask_rest_freq = da.asarray(rest_freq)[None, :]
        dask_name = da.asarray(np.asarray([name], dtype=np.object), chunks=1)
        ds = Dataset({
            "NUM_LINES": (("row", ), dask_num_lines),
            "NAME": (("row", ), dask_name),
            "REST_FREQUENCY": (("row", "line"), dask_rest_freq),
            "DIRECTION": (("row", "dir"), dask_direction),
        })
        src_datasets.append(ds)

    # Create POLARISATION datasets.
    # Dataset per output row required because column shapes are variable
    for r, corr_type in enumerate(corr_types):
        dask_num_corr = da.full((1, ), len(corr_type), dtype=np.int32)
        dask_corr_type = da.from_array(corr_type,
                                       chunks=len(corr_type))[None, :]
        ds = Dataset({
            "NUM_CORR": (("row", ), dask_num_corr),
            "CORR_TYPE": (("row", "corr"), dask_corr_type),
        })

        pol_datasets.append(ds)

    # Create multiple MeerKAT L-band SPECTRAL_WINDOW datasets
    # Dataset per output row required because column shapes are variable
    for num_chan in num_chans:
        dask_num_chan = da.full((1, ), num_chan, dtype=np.int32)
        dask_chan_freq = da.linspace(.856e9,
                                     2 * .856e9,
                                     num_chan,
                                     chunks=num_chan)[None, :]
        dask_chan_width = da.full((1, num_chan), .856e9 / num_chan)

        ds = Dataset({
            "NUM_CHAN": (("row", ), dask_num_chan),
            "CHAN_FREQ": (("row", "chan"), dask_chan_freq),
            "CHAN_WIDTH": (("row", "chan"), dask_chan_width),
        })

        spw_datasets.append(ds)

    # For each cartesian product of SPECTRAL_WINDOW and POLARIZATION
    # create a corresponding DATA_DESCRIPTION.
    # Each column has fixed shape so we handle all rows at once
    spw_ids, pol_ids = zip(
        *product(range(len(num_chans)), range(len(corr_types))))
    dask_spw_ids = da.asarray(np.asarray(spw_ids, dtype=np.int32))
    dask_pol_ids = da.asarray(np.asarray(pol_ids, dtype=np.int32))
    ddid_datasets.append(
        Dataset({
            "SPECTRAL_WINDOW_ID": (("row", ), dask_spw_ids),
            "POLARIZATION_ID": (("row", ), dask_pol_ids),
        }))

    # Now create the associated MS dataset
    for ddid, (spw_id, pol_id) in enumerate(zip(spw_ids, pol_ids)):
        # Infer row, chan and correlation shape
        row = sum(chunks['row'])
        chan = spw_datasets[spw_id].CHAN_FREQ.shape[1]
        corr = pol_datasets[pol_id].CORR_TYPE.shape[1]

        # Create some dask vis data
        dims = ("row", "chan", "corr")
        np_data = (rs.normal(size=(row, chan, corr)) +
                   1j * rs.normal(size=(row, chan, corr))).astype(np.complex64)

        data_chunks = tuple((chunks['row'], chan, corr))
        dask_data = da.from_array(np_data, chunks=data_chunks)
        # Create dask ddid column
        dask_ddid = da.full(row, ddid, chunks=chunks['row'], dtype=np.int32)
        dataset = Dataset({
            'DATA': (dims, dask_data),
            'DATA_DESC_ID': (("row", ), dask_ddid)
        })
        ms_datasets.append(dataset)
        all_data.append(dask_data)
        all_data_desc_id.append(dask_ddid)

    ms_writes = xds_to_table(ms_datasets, ms_table_name, columns="ALL")
    ant_writes = xds_to_table(ant_datasets, ant_table_name, columns="ALL")
    pol_writes = xds_to_table(pol_datasets, pol_table_name, columns="ALL")
    spw_writes = xds_to_table(spw_datasets, spw_table_name, columns="ALL")
    ddid_writes = xds_to_table(ddid_datasets, ddid_table_name, columns="ALL")
    source_writes = xds_to_table(src_datasets, src_table_name, columns="ALL")

    dask.compute(ms_writes, ant_writes, pol_writes, spw_writes, ddid_writes,
                 source_writes)

    # Check ANTENNA table correctly created
    with pt.table(ant_table_name, ack=False) as A:
        assert_array_equal(A.getcol("NAME"), names)
        assert_array_equal(A.getcol("POSITION"), position)
        assert_array_equal(A.getcol("OFFSET"), offset)

        required_desc = pt.required_ms_desc("ANTENNA")
        required_columns = set(k for k in required_desc.keys()
                               if not k.startswith("_"))

        assert set(A.colnames()) == set(required_columns)

    # Check POLARIZATION table correctly created
    with pt.table(pol_table_name, ack=False) as P:
        for r, corr_type in enumerate(corr_types):
            assert_array_equal(P.getcol("CORR_TYPE", startrow=r, nrow=1),
                               [corr_type])
            assert_array_equal(P.getcol("NUM_CORR", startrow=r, nrow=1),
                               [len(corr_type)])

        required_desc = pt.required_ms_desc("POLARIZATION")
        required_columns = set(k for k in required_desc.keys()
                               if not k.startswith("_"))

        assert set(P.colnames()) == set(required_columns)

    # Check SPECTRAL_WINDOW table correctly created
    with pt.table(spw_table_name, ack=False) as S:
        for r, num_chan in enumerate(num_chans):
            assert_array_equal(
                S.getcol("NUM_CHAN", startrow=r, nrow=1)[0], num_chan)
            assert_array_equal(
                S.getcol("CHAN_FREQ", startrow=r, nrow=1)[0],
                np.linspace(.856e9, 2 * .856e9, num_chan))
            assert_array_equal(
                S.getcol("CHAN_WIDTH", startrow=r, nrow=1)[0],
                np.full(num_chan, .856e9 / num_chan))

        required_desc = pt.required_ms_desc("SPECTRAL_WINDOW")
        required_columns = set(k for k in required_desc.keys()
                               if not k.startswith("_"))

        assert set(S.colnames()) == set(required_columns)

    # We should get a cartesian product out
    with pt.table(ddid_table_name, ack=False) as D:
        spw_id, pol_id = zip(
            *product(range(len(num_chans)), range(len(corr_types))))
        assert_array_equal(pol_id, D.getcol("POLARIZATION_ID"))
        assert_array_equal(spw_id, D.getcol("SPECTRAL_WINDOW_ID"))

        required_desc = pt.required_ms_desc("DATA_DESCRIPTION")
        required_columns = set(k for k in required_desc.keys()
                               if not k.startswith("_"))

        assert set(D.colnames()) == set(required_columns)

    with pt.table(src_table_name, ack=False) as S:
        for r, (name, direction, rest_freq) in enumerate(sources):
            assert_array_equal(S.getcol("NAME", startrow=r, nrow=1)[0], [name])
            assert_array_equal(S.getcol("REST_FREQUENCY", startrow=r, nrow=1),
                               [rest_freq])
            assert_array_equal(S.getcol("DIRECTION", startrow=r, nrow=1),
                               [direction])

    with pt.table(ms_table_name, ack=False) as T:
        # DATA_DESC_ID's are all the same shape
        assert_array_equal(T.getcol("DATA_DESC_ID"),
                           da.concatenate(all_data_desc_id))

        # DATA is variably shaped (on DATA_DESC_ID) so we
        # compare each one separately.
        for ddid, data in enumerate(all_data):
            ms_data = T.getcol("DATA", startrow=ddid * row, nrow=row)
            assert_array_equal(ms_data, data)

        required_desc = pt.required_ms_desc()
        required_columns = set(k for k in required_desc.keys()
                               if not k.startswith("_"))

        # Check we have the required columns
        assert set(T.colnames()) == required_columns.union(
            ["DATA", "DATA_DESC_ID"])
Ejemplo n.º 23
0
    def gradient(f, *varargs, **kwargs):
        f = da.asarray(f)

        kwargs["edge_order"] = math.ceil(kwargs.get("edge_order", 1))
        if kwargs["edge_order"] > 2:
            raise ValueError("edge_order must be less than or equal to 2.")

        drop_result_list = False
        axis = kwargs.pop("axis", None)
        if axis is None:
            axis = tuple(range(f.ndim))
        elif isinstance(axis, Integral):
            drop_result_list = True
            axis = (axis, )

        axis = validate_axis(axis, f.ndim)

        if len(axis) != len(set(axis)):
            raise ValueError("duplicate axes not allowed")

        axis = tuple(ax % f.ndim for ax in axis)

        if varargs == ():
            varargs = (1, )
        if len(varargs) == 1:
            varargs = len(axis) * varargs
        if len(varargs) != len(axis):
            raise TypeError(
                "Spacing must either be a single scalar, or a scalar / "
                "1d-array per axis")

        if issubclass(f.dtype.type, (np.bool8, Integral)):
            f = f.astype(float)
        elif issubclass(f.dtype.type, Real) and f.dtype.itemsize < 4:
            f = f.astype(float)

        results = []
        for i, ax in enumerate(axis):
            for c in f.chunks[ax]:
                if np.min(c) < kwargs["edge_order"] + 1:
                    raise ValueError(
                        'Chunk size must be at least edge_order + 1. '
                        'Minimum chunk for axis {} is {}. Rechunk to '
                        'proceed.'.format(ax, np.min(c)))

            if np.isscalar(varargs[i]):
                array_locs = None
            else:
                if isinstance(varargs[i], da.Array):
                    raise NotImplementedError(
                        'dask array coordinates are not supported.')
                # coordinate position for each block taking overlap into
                # account
                chunk = np.array(f.chunks[ax])
                array_loc_stop = np.cumsum(chunk) + 1
                array_loc_start = array_loc_stop - chunk - 2
                array_loc_stop[-1] -= 1
                array_loc_start[0] = 0
                array_locs = (array_loc_start, array_loc_stop)

            results.append(
                f.map_overlap(
                    _gradient_kernel,
                    dtype=f.dtype,
                    depth={j: 1 if j == ax else 0
                           for j in range(f.ndim)},
                    boundary="none",
                    coord=varargs[i],
                    axis=ax,
                    array_locs=array_locs,
                    grad_kwargs=kwargs,
                ))

        if drop_result_list:
            results = results[0]

        return results
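A minimal usage sketch (an addition, not part of the original snippet): released dask exposes the same behaviour as da.gradient, so the chunked result should agree with NumPy on the computed values.

import numpy as np
import dask.array as da

# Hedged check: the chunked gradient should match np.gradient.
x = np.random.random((10, 12))
d = da.from_array(x, chunks=(5, 6))

expected = np.gradient(x, axis=0)
actual = da.gradient(d, axis=0)
np.testing.assert_allclose(actual.compute(), expected)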
Ejemplo n.º 24
0
def count_cohort_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort allele counts from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_count_spec`
    of allele counts with shape (variants, cohorts, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.count_cohort_alleles(ds)["cohort_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[3, 1],
            [2, 2]],
    <BLANKLINE>
            [[2, 2],
            [3, 1]],
    <BLANKLINE>
            [[2, 2],
            [2, 2]],
    <BLANKLINE>
            [[1, 3],
            [1, 3]],
    <BLANKLINE>
            [[3, 1],
            [1, 3]]])
    """
    ds = define_variable_if_absent(ds, variables.call_allele_count,
                                   call_allele_count, count_call_alleles)
    variables.validate(ds,
                       {call_allele_count: variables.call_allele_count_spec})

    n_variants = ds.dims["variants"]
    n_alleles = ds.dims["alleles"]

    AC, SC = da.asarray(ds.call_allele_count), da.asarray(ds.sample_cohort)
    n_cohorts = SC.max().compute() + 1  # 0-based indexing
    C = da.empty(n_cohorts, dtype=np.uint8)

    G = da.asarray(ds.call_genotype)
    shape = (G.chunks[0], n_cohorts, n_alleles)

    AC = da.map_blocks(_count_cohort_alleles,
                       AC,
                       SC,
                       C,
                       chunks=shape,
                       dtype=np.int32)
    assert_array_shape(AC, n_variants, n_cohorts * AC.numblocks[1],
                       n_alleles * AC.numblocks[2])

    # Stack the blocks and sum across them
    # (which will only work because each chunk is guaranteed to have same size)
    AC = da.stack([AC.blocks[:, i]
                   for i in range(AC.numblocks[1])]).sum(axis=0)
    assert_array_shape(AC, n_variants, n_cohorts, n_alleles)

    new_ds = create_dataset({
        variables.cohort_allele_count: (("variants", "cohorts", "alleles"), AC)
    })
    return conditional_merge_datasets(ds, new_ds, merge)
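As a plain-NumPy sketch of the reduction performed here (not the actual _count_cohort_alleles kernel), per-sample allele counts are summed within each cohort; the toy data mirrors the first variant of the docstring example.

import numpy as np

# Per-sample allele counts for one variant: S0=0/0, S1=1/0, S2=1/0, S3=0/1
call_allele_count = np.array([[[2, 0], [1, 1], [1, 1], [1, 1]]])  # (variants, samples, alleles)
sample_cohort = np.array([0, 0, 1, 1])

n_cohorts = sample_cohort.max() + 1
cohort_allele_count = np.zeros(
    (call_allele_count.shape[0], n_cohorts, call_allele_count.shape[2]), dtype=np.int32)
for c in range(n_cohorts):
    cohort_allele_count[:, c, :] = call_allele_count[:, sample_cohort == c, :].sum(axis=1)
# cohort_allele_count -> [[[3, 1], [2, 2]]], matching the first row of the doctest above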
Ejemplo n.º 25
0
    ds['call_dosage_mask'] = ds['call_genotype_probability_mask']
    return ds


def rechunk_to_zarr(
        ds: Dataset,
        store: Union[PathType, MutableMapping],  # type: ignore[type-arg]
        *,
        mode: str = "w",
        chunk_length: int = 10_000,
        chunk_width: int = 10_000,
        compressor: Any = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
        compute: bool = True) -> ZarrStore:
    ds = pack_variables(ds)
    for v in ['call_genotype_probability', 'call_genotype_probability_mask']:
        chunk_size = da.asarray(ds[v]).chunksize[0]
        if chunk_length % chunk_size != 0:
            raise ValueError(
                f"Chunk size in variant dimension for variable '{v}' ({chunk_size}) "
                f"must evenly divide target chunk size {chunk_length}")
        ds[v] = ds[v].chunk(chunks=dict(samples=chunk_width))
    encoding = encode_variables(ds, compressor=compressor)
    return ds.to_zarr(store, mode=mode, encoding=encoding, compute=compute)


def rechunk_from_zarr(store: Union[PathType, MutableMapping],
                      chunk_length: int = 10_000,
                      chunk_width: int = 10_000,
                      mask_and_scale: bool = True):
    ds = xr.open_zarr(store, mask_and_scale=mask_and_scale)
    for v in ['call_genotype_probability', 'call_genotype_probability_mask']:
Ejemplo n.º 26
0
def threshold_local(image, block_size, method='gaussian', offset=0,
                    mode='reflect', param=None, cval=0):
    """Compute a threshold mask image based on local pixel neighborhood.

    Also known as adaptive or dynamic thresholding [1]_. The threshold value is
    the weighted mean of the local neighborhood of a pixel, minus a constant.
    Alternatively, the threshold can be determined dynamically by a given
    function, using the 'generic' method.

    Parameters
    ----------
    image : (N, M) dask ndarray
        Input image.
    block_size : int or list/tuple/array
        Size of pixel neighborhood which is used to calculate the
        threshold value.
        (1) A single value for use in all dimensions or
        (2) A tuple, list, or array with length equal to image.ndim
    method : {'generic', 'gaussian', 'mean', 'median'}, optional
        Method used to determine adaptive threshold for local neighbourhood in
        weighted mean image.

        * 'generic': use custom function (see `param` parameter)
        * 'gaussian': apply gaussian filter (see `param` parameter for custom\
                      sigma value)
        * 'mean': apply arithmetic mean filter
        * 'median': apply median rank filter

        By default the 'gaussian' method is used.
    offset : float, optional
        Constant subtracted from weighted mean of neighborhood to calculate
        the local threshold value. Default offset is 0.
    mode : {'reflect', 'constant', 'nearest', 'mirror', 'wrap'}, optional
        The mode parameter determines how the array borders are handled, where
        cval is the value when mode is equal to 'constant'.
        Default is 'reflect'.
    param : {int, function}, optional
        Either specify sigma for 'gaussian' method or function object for
        'generic' method. This function takes the flat array of the local
        neighbourhood as a single argument and returns the calculated
        threshold for the centre pixel.
    cval : float, optional
        Value to fill past edges of input if mode is 'constant'.

    Returns
    -------
    threshold : (N, M) dask ndarray
        Threshold image. All pixels in the input image higher than the
        corresponding pixel in the threshold image are considered foreground.

    References
    ----------
    .. [1] https://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=threshold

    Examples
    --------
    >>> import dask.array as da
    >>> image = da.random.random((1000, 1000), chunks=(100, 100))
    >>> result = threshold_local(image, 15, 'gaussian')
    """  # noqa

    image = image.astype(np.float64)

    if method == 'generic':
        if not callable(param):
            raise ValueError("Must include a valid function to use as the "
                             "'param' keyword argument.")
        thresh_image = _generic.generic_filter(image, param, block_size,
                                               mode=mode, cval=cval)
    elif method == 'gaussian':
        if param is None:
            sigma = (da.asarray(block_size) - 1) / 6.0
        else:
            sigma = param
        thresh_image = _gaussian.gaussian_filter(image, sigma, mode=mode,
                                                 cval=cval)
    elif method == 'mean':
        thresh_image = _generic.generic_filter(
            image, dispatch_threshold_local_mean(image), block_size, mode=mode,
            cval=cval)
    elif method == 'median':
        thresh_image = _order.median_filter(image, block_size, mode=mode,
                                            cval=cval)
    else:
        raise ValueError("Invalid method specified. Please use `generic`, "
                         "`gaussian`, `mean`, or `median`.")
    return thresh_image - offset
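A short follow-up sketch (hypothetical usage, building on the docstring above): pixels brighter than their local threshold form the foreground mask.

import dask.array as da

image = da.random.random((1000, 1000), chunks=(100, 100))
# Pixels above the local (gaussian-weighted) neighbourhood mean are foreground.
mask = image > threshold_local(image, block_size=15, method='gaussian', offset=0.0)
foreground = mask.compute()  # boolean ndarray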
Ejemplo n.º 27
0
    raise ValueError("data can only be rechunked to monthly and yearly zarr archives for now")
ds
# -

chunk_dict = {"lat": 1, "lon": 1, "prob": 1}
ds_rechunked = ds.chunk(chunk_dict)
ds_rechunked

# +
# overwrite potentially existing zarr files?

overwrite = False # if False, raises zarr.errors.ContainsArrayError if zarr archive already exists

# +
# ignore the noisy nanny and worker messages: just wait, computations are running (see dashboard for status)

results = []
for variable_name, variable in tqdm(ds_rechunked.variables.items()):
    zarr_dir_path = target_path.joinpath(variable_name)
    zarr_dir_name = str(zarr_dir_path)
    print(zarr_dir_name)
    
    if overwrite and zarr_dir_path.exists():
        print('overwriting...')
        shutil.rmtree(zarr_dir_path)

    da.asarray(variable.data).to_zarr(zarr_dir_name)
# -


Ejemplo n.º 28
0
def window_statistic(
    values: ArrayLike,
    statistic: Callable[..., ArrayLike],
    window_starts: ArrayLike,
    window_stops: ArrayLike,
    dtype: DType,
    chunks: Any = None,
    new_axis: Union[None, int, Iterable[int]] = None,
    **kwargs: Any,
) -> da.Array:

    values = da.asarray(values)
    desired_chunks = chunks or values.chunks

    window_lengths = window_stops - window_starts
    depth = np.max(window_lengths)  # type: ignore[no-untyped-call]

    # Dask will raise an error if the last chunk size is smaller than the depth
    # Workaround by rechunking to combine the last two chunks in first axis
    # See https://github.com/dask/dask/issues/6597
    if depth > values.chunks[0][-1]:
        chunk0 = values.chunks[0]
        new_chunk0 = tuple(list(chunk0[:-2]) + [chunk0[-2] + chunk0[-1]])
        values = values.rechunk({0: new_chunk0})

    chunks = values.chunks[0]

    rel_window_starts, windows_per_chunk = _get_chunked_windows(
        chunks, window_starts, window_stops)

    # Add depth for map_overlap
    rel_window_starts = rel_window_starts + depth
    rel_window_stops = rel_window_starts + window_lengths

    chunk_offsets = _sizes_to_start_offsets(windows_per_chunk)

    def blockwise_moving_stat(x: ArrayLike,
                              block_info: Any = None) -> ArrayLike:
        if block_info is None or len(block_info) == 0:
            return np.array([])
        chunk_number = block_info[0]["chunk-location"][0]
        chunk_offset_start = chunk_offsets[chunk_number]
        chunk_offset_stop = chunk_offsets[chunk_number + 1]
        chunk_window_starts = rel_window_starts[
            chunk_offset_start:chunk_offset_stop]
        chunk_window_stops = rel_window_stops[
            chunk_offset_start:chunk_offset_stop]
        out = np.array([
            statistic(x[i:j], **kwargs)
            for i, j in zip(chunk_window_starts, chunk_window_stops)
        ])
        return out

    if values.ndim == 1:
        new_chunks = (tuple(windows_per_chunk), )
    else:
        # depth is 0 except in first axis
        depth = {0: depth}
        # new chunks are same except in first axis
        new_chunks = tuple([tuple(windows_per_chunk)] +
                           list(desired_chunks[1:]))  # type: ignore
    return values.map_overlap(
        blockwise_moving_stat,
        dtype=dtype,
        chunks=new_chunks,
        depth=depth,
        boundary=0,
        trim=False,
        new_axis=new_axis,
    )
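A minimal sketch of the intended behaviour (assuming window_statistic and its helpers are importable as above): summing values inside two windows given by absolute start/stop indices.

import numpy as np
import dask.array as da

x = da.arange(10, chunks=5)
starts = np.array([0, 5])
stops = np.array([5, 10])

sums = window_statistic(x, np.sum, starts, stops, dtype=x.dtype)
# Each window is reduced independently; expected result: array([10, 35])
print(sums.compute())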
Ejemplo n.º 29
0
def Fst(
    ds: Dataset,
    *,
    estimator: Optional[str] = None,
    stat_divergence: Hashable = variables.stat_divergence,
    merge: bool = True,
) -> Dataset:
    """Compute Fst between pairs of cohorts.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    estimator
        Determines the formula to use for computing Fst.
        If None (the default), or ``Hudson``, Fst is calculated
        using the method of Hudson (1992) elaborated by Bhatia et al. (2013)
        (the same estimator as scikit-allel).
        Other supported estimators include ``Nei`` (1986)
        (the same estimator as tskit).
    stat_divergence
        Divergence variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_divergence_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`divergence`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Fst value between pairs of cohorts, as defined by
    :data:`sgkit.variables.stat_Fst_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing
    information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[        nan, -0.16666667],
            [-0.16666667,         nan]],
    <BLANKLINE>
        [[        nan, -0.16666667],
            [-0.16666667,         nan]],
    <BLANKLINE>
        [[        nan, -0.33333333],
            [-0.33333333,         nan]],
    <BLANKLINE>
        [[        nan, -0.33333333],
            [-0.33333333,         nan]],
    <BLANKLINE>
        [[        nan,  0.2       ],
            [ 0.2       ,         nan]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[        nan, -0.22222222],
            [-0.22222222,         nan]],
    <BLANKLINE>
        [[        nan,  0.        ],
            [ 0.        ,         nan]]])
    """
    known_estimators = {"Hudson": _Fst_Hudson, "Nei": _Fst_Nei}
    if estimator is not None and estimator not in known_estimators:
        raise ValueError(
            f"Estimator '{estimator}' is not a known estimator: {known_estimators.keys()}"
        )
    estimator = estimator or "Hudson"
    ds = define_variable_if_absent(ds, variables.stat_divergence,
                                   stat_divergence, divergence)
    variables.validate(ds, {stat_divergence: variables.stat_divergence_spec})

    n_cohorts = ds.dims["cohorts"]
    gs = da.asarray(ds.stat_divergence)
    shape = (gs.chunks[0], n_cohorts, n_cohorts)
    fst = da.map_blocks(known_estimators[estimator],
                        gs,
                        chunks=shape,
                        dtype=np.float64)
    # TODO: reinstate assert (first dim could be either variants or windows)
    # assert_array_shape(fst, n_windows, n_cohorts, n_cohorts)
    new_ds = create_dataset(
        {variables.stat_Fst: (("windows", "cohorts_0", "cohorts_1"), fst)})
    return conditional_merge_datasets(ds, new_ds, merge)
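Following the estimator parameter documented above, a small hedged sketch selecting the Nei (1986) estimator instead of the default Hudson one:

import numpy as np
import sgkit as sg
import xarray as xr

ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)
ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], 2), dims="samples")

# Same shape as the Hudson result: (variants, cohorts, cohorts)
fst_nei = sg.Fst(ds, estimator="Nei")["stat_Fst"]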
Ejemplo n.º 30
0
def test_tile_np_kroncompare_examples(shape, reps):
    x = np.random.random(shape)
    d = da.asarray(x)

    assert_eq(np.tile(x, reps), da.tile(d, reps))
Ejemplo n.º 31
0
def pairwise_distance(
    x: ArrayLike,
    metric: str = "euclidean",
) -> np.ndarray:
    """Calculates the pairwise distance between all pairs of row vectors in the
    given two dimensional array x.

    To illustrate the algorithm consider the following (4, 5) two dimensional array:

    [e.00, e.01, e.02, e.03, e.04]
    [e.10, e.11, e.12, e.13, e.14]
    [e.20, e.21, e.22, e.23, e.24]
    [e.30, e.31, e.32, e.33, e.34]

    The rows of the above matrix are the set of vectors. Now let's label all
    the vectors as v0, v1, v2, v3.

    The result will be a two-dimensional symmetric matrix containing the
    distance between all pairs. Since there are 4 vectors, calculating the
    distance between each vector and every other vector will result in 16
    distances, and the resulting array will be of size (4, 4) as follows:

    [v0.v0, v0.v1, v0.v2, v0.v3]
    [v1.v0, v1.v1, v1.v2, v1.v3]
    [v2.v0, v2.v1, v2.v2, v2.v3]
    [v3.v0, v3.v1, v3.v2, v3.v3]

    The (i, j) position in the resulting array (matrix) denotes the distance
    between vi and vj vectors.

    Negative and nan values are treated as missing values and are ignored
    in all distance metric calculations.

    Parameters
    ----------
    x
        [array-like, shape: (M, N)]
        An array-like two-dimensional matrix. The rows are the
        vectors used for comparison, i.e. for pairwise distance.
    metric
        The distance metric to use. The distance function can be
        'euclidean' or 'correlation'.

    Returns
    -------

    [array-like, shape: (M, M)]
    A two dimensional distance matrix, which will be symmetric. The dimension
    will be (M, M). The (i, j) position in the resulting array
    (matrix) denotes the distance between ith and jth row vectors
    in the input array.

    Examples
    --------

    >>> from sgkit.distance.api import pairwise_distance
    >>> import dask.array as da
    >>> x = da.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]]).rechunk(2, 2)
    >>> pairwise_distance(x, metric='euclidean')
    array([[0.        , 2.44948974, 4.69041576],
           [2.44948974, 0.        , 5.47722558],
           [4.69041576, 5.47722558, 0.        ]])

    >>> import numpy as np
    >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
    >>> pairwise_distance(x, metric='euclidean')
    array([[0.        , 2.44948974, 4.69041576],
           [2.44948974, 0.        , 5.47722558],
           [4.69041576, 5.47722558, 0.        ]])

    >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
    >>> pairwise_distance(x, metric='correlation')
    array([[1.11022302e-16, 2.62956526e-01, 2.82353505e-03],
           [2.62956526e-01, 0.00000000e+00, 2.14285714e-01],
           [2.82353505e-03, 2.14285714e-01, 0.00000000e+00]])
    """

    try:
        metric_ufunc = getattr(metrics, metric)
    except AttributeError:
        raise NotImplementedError(f"Given metric: {metric} is not implemented.")

    x = da.asarray(x)
    x_distance = da.blockwise(
        # Lambda wraps reshape for broadcast
        lambda _x, _y: metric_ufunc(_x[:, None, :], _y),
        "jk",
        x,
        "ji",
        x,
        "ki",
        dtype="float64",
        concatenate=True,
    )
    x_distance = da.triu(x_distance, 1) + da.triu(x_distance).T
    return x_distance.compute()
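The blockwise lambda above leans on NumPy broadcasting to compare every row of one block with every row of another. A NumPy-only sketch of the same trick for the Euclidean case (this plain formula is an illustration, not sgkit's compiled metric kernel):

import numpy as np

x = np.array([[6, 4, 1], [4, 5, 2], [9, 7, 3]], dtype=float)

# x[:, None, :] has shape (3, 1, 3); against x[None, :, :] of shape (1, 3, 3)
# it broadcasts to all row pairs at once.
diff = x[:, None, :] - x[None, :, :]      # shape (3, 3, 3)
dist = np.sqrt((diff ** 2).sum(axis=-1))  # shape (3, 3), symmetric
# dist[0, 1] is approximately 2.44948974, matching the docstring example above.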
Ejemplo n.º 32
0
    def potential_energy(self):
        """
        Specific potential energy calculation.

        Calculates the specific potential energy
        of dark matter, star and gas particles.

        Returns
        -------
        gx : `galaxy object`
            New galaxy instance with the specific potential energy calculated
            for stars, dark matter and gas particles.

        Examples
        --------
        This returns the specific potential energy of stars, dark matter and
        gas particles.

        >>> import galaxychop as gc
        >>> galaxy = gc.Galaxy(...)
        >>> gpot = galaxy.potential_energy()
        >>> pot_s, pot_dm, pot_g = gpot.pot_s, gpot.pot_dm, gpot.pot_g

        Note
        ----
        If the potentials are provided when the `galaxy` object is instantiated,
        then the calculation of `potential_energy` will raise a `ValueError`.
        """
        m_s = self.arr_.m_s
        x_s = self.arr_.x_s
        y_s = self.arr_.y_s
        z_s = self.arr_.z_s

        m_dm = self.arr_.m_dm
        x_dm = self.arr_.x_dm
        y_dm = self.arr_.y_dm
        z_dm = self.arr_.z_dm

        m_g = self.arr_.m_g
        x_g = self.arr_.x_g
        y_g = self.arr_.y_g
        z_g = self.arr_.z_g

        pot_s = self.arr_.pot_s
        pot_dm = self.arr_.pot_dm
        pot_g = self.arr_.pot_g

        eps_s = self.arr_.eps_s
        eps_dm = self.arr_.eps_dm
        eps_g = self.arr_.eps_g

        potential = np.concatenate([pot_s, pot_dm, pot_g])

        if np.all(potential == 0.0):
            x = np.hstack((x_s, x_dm, x_g))
            y = np.hstack((y_s, y_dm, y_g))
            z = np.hstack((z_s, z_dm, z_g))
            m = np.hstack((m_s, m_dm, m_g))
            eps = np.max([eps_s, eps_dm, eps_g])

            pot = utils.potential(
                da.asarray(x, chunks=100),
                da.asarray(y, chunks=100),
                da.asarray(z, chunks=100),
                da.asarray(m, chunks=100),
                da.asarray(eps),
            )

            num_s = len(m_s)
            num = len(m_s) + len(m_dm)

            pot_s = pot[:num_s]
            pot_dm = pot[num_s:num]
            pot_g = pot[num:]

            new = attr.asdict(self, recurse=False)
            del new["arr_"]
            new.update(
                pot_s=-pot_s * (u.km / u.s) ** 2,
                pot_dm=-pot_dm * (u.km / u.s) ** 2,
                pot_g=-pot_g * (u.km / u.s) ** 2,
            )

            return Galaxy(**new)

        else:
            raise ValueError("Potentials are already calculated")
Ejemplo n.º 33
0
def Tajimas_D(
    ds: Dataset,
    *,
    variant_allele_count: Hashable = variables.variant_allele_count,
    stat_diversity: Hashable = variables.stat_diversity,
    merge: bool = True,
) -> Dataset:
    """Compute Tajimas' D for a genotype call dataset.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    variant_allele_count
        Variant allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.variant_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_variant_alleles`.
    stat_diversity
        Diversity variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_diversity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`diversity`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Tajima's D value, as defined by :data:`sgkit.variables.stat_Tajimas_D_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.88883234, 2.18459998],
           [2.18459998, 0.88883234],
           [2.18459998, 2.18459998],
           [0.88883234, 0.88883234],
           [0.88883234, 0.88883234]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[2.40517586, 2.40517586],
           [1.10393559, 1.10393559]])
    """
    ds = define_variable_if_absent(ds, variables.variant_allele_count,
                                   variant_allele_count, count_variant_alleles)
    ds = define_variable_if_absent(ds, variables.stat_diversity,
                                   stat_diversity, diversity)
    variables.validate(
        ds,
        {
            variant_allele_count: variables.variant_allele_count_spec,
            stat_diversity: variables.stat_diversity_spec,
        },
    )

    ac = ds[variant_allele_count]
    ac = da.asarray(ac)

    # count segregating. Note that this uses the definition in tskit,
    # which is the number of alleles - 1. In the biallelic case this
    # gives us the number of non-monomorphic sites.
    S = (ac > 0).sum(axis=1) - 1

    if has_windows(ds):
        S = window_statistic(
            S,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=S.dtype,
            axis=0,
        )

    # assume number of chromosomes sampled is constant for all variants
    # NOTE: even though ac has dtype uint, we promote the sum to float
    #       because the computation below requires floats
    n = ac.sum(axis=1, dtype="float").max()

    # (n-1)th harmonic number
    a1 = (1 / da.arange(1, n)).sum()

    # calculate Watterson's theta (absolute value)
    theta = S / a1

    # get diversity
    div = ds[stat_diversity]

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here we want the absolute difference
    d = div - theta[:, np.newaxis]

    # calculate the denominator (standard deviation)
    a2 = (1 / (da.arange(1, n)**2)).sum()
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)
    d_stdev = da.sqrt((e1 * S) + (e2 * S * (S - 1)))

    # Let IEEE decide the semantics of division by zero here. The return value
    # will be -inf, nan or +inf, depending on the value of the numerator.
    # Currently this will raise a RuntimeWarning, if we divide by zero.
    D = d / d_stdev[:, np.newaxis]

    if has_windows(ds):
        new_ds = create_dataset(
            {variables.stat_Tajimas_D: (["windows", "cohorts"], D.data)})
    else:
        new_ds = create_dataset(
            {variables.stat_Tajimas_D: (["variants", "cohorts"], D.data)})
    return conditional_merge_datasets(ds, new_ds, merge)
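To make the constants above concrete, here is a plain-NumPy sketch of the same formula for a single toy cohort (n, S and pi are made-up values, not taken from any dataset):

import numpy as np

n, S, pi = 8.0, 5.0, 3.2  # hypothetical: chromosomes sampled, segregating sites, diversity

a1 = (1 / np.arange(1, n)).sum()
a2 = (1 / np.arange(1, n) ** 2).sum()
theta = S / a1                      # Watterson's theta (absolute value)

b1 = (n + 1) / (3 * (n - 1))
b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
c1 = b1 - 1 / a1
c2 = b2 - (n + 2) / (a1 * n) + a2 / a1 ** 2
e1 = c1 / a1
e2 = c2 / (a1 ** 2 + a2)

D = (pi - theta) / np.sqrt(e1 * S + e2 * S * (S - 1))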