Python define_variable_if_absent Examples, sgkit.utils.define_variable_if_absent Python Examples

Example #1

0

Show file

File: test_utils.py Project: timothymillar/sgkit

def test_define_variable_if_absent():
    ds = xr.Dataset(dict(x=xr.DataArray(da.zeros(100))))

    def def_y(ds):
        ds["y"] = xr.DataArray(da.zeros(100))
        return ds

    ds = define_variable_if_absent(ds, "x", "x", lambda ds: ds)
    assert "x" in ds
    assert "y" not in ds

    with pytest.raises(
            ValueError,
            match=
            r"Variable 'z' with non-default name is missing and will not be automatically defined.",
    ):
        define_variable_if_absent(ds, "y", "z", def_y)

    ds = define_variable_if_absent(ds, "y", "y", def_y)
    assert "x" in ds
    assert "y" in ds

    ds2 = define_variable_if_absent(ds, "y", None, def_y)
    assert "x" in ds2
    assert "y" in ds2

Example #2

0

Show file

def count_variant_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute allele count from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.variant_allele_count_spec`
    of allele counts with shape (variants, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_variant_alleles(ds)["variant_allele_count"].values # doctest: +SKIP
    array([[2, 2],
           [1, 3],
           [2, 2],
           [4, 0]], dtype=uint64)
    """
    ds = define_variable_if_absent(ds, variables.call_allele_count,
                                   call_allele_count, count_call_alleles)
    variables.validate(ds,
                       {call_allele_count: variables.call_allele_count_spec})

    new_ds = create_dataset({
        variables.variant_allele_count:
        ds[call_allele_count].sum(dim="samples")
    })
    return conditional_merge_datasets(ds, new_ds, merge)

Example #3

0

Show file

File: aggregation.py Project: tomwhite/sgkit

def infer_call_ploidy(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    call_genotype_non_allele: Hashable = variables.call_genotype_non_allele,
    merge: bool = True,
) -> Dataset:
    """Infer the ploidy of each call genotype based on the number of
    non-allele values in each call genotype.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    call_genotype_non_allele
        Input variable name holding call_genotype_non_allele as defined by
        :data:`sgkit.variables.call_genotype_non_allele_spec`.
        If the variable is not present in ``ds``, it will be computed
        assuming that allele values less than -1 are non-alleles in mixed ploidy
        datasets, or that no non-alleles are present in fixed ploidy datasets.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_ploidy_spec`.
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_genotype_non_allele,
        call_genotype_non_allele,
        infer_non_alleles,
    )
    mixed_ploidy = ds[variables.call_genotype].attrs.get("mixed_ploidy", False)
    if mixed_ploidy:
        call_ploidy = (~ds[call_genotype_non_allele]).sum(
            axis=-1)  # type: ignore[operator]
    else:
        ploidy = ds[variables.call_genotype].shape[-1]
        call_ploidy = xr.full_like(ds[variables.call_genotype][..., 0], ploidy)

    new_ds = create_dataset({variables.call_ploidy: call_ploidy})
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)

Example #4

0

Show file

def infer_sample_ploidy(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    call_ploidy: Hashable = variables.call_ploidy,
    merge: bool = True,
) -> Dataset:
    """Infer the ploidy of each sample across all variants based on
    the number of non-allele values in call genotypes.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    call_ploidy
        Input variable name holding call_ploidy as defined by
        :data:`sgkit.variables.call_ploidy_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`infer_call_ploidy`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.sample_ploidy_spec`.
    """
    ds = define_variable_if_absent(ds, variables.call_ploidy, call_ploidy,
                                   infer_call_ploidy)
    # validate against spec
    mixed_ploidy = ds[variables.call_genotype].attrs.get("mixed_ploidy", False)
    if mixed_ploidy:
        sample_ploidy_fixed = (ds[call_ploidy][0, :] == ds[call_ploidy]).all(
            axis=-1)
        sample_ploidy = xr.where(sample_ploidy_fixed, ds[call_ploidy][0, :],
                                 -1)  # type: ignore[no-untyped-call]
    else:
        ploidy = ds[variables.call_genotype].shape[-1]
        sample_ploidy = xr.full_like(ds[call_ploidy][0, ...], ploidy)

    new_ds = create_dataset({variables.sample_ploidy: sample_ploidy})
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)

Example #5

0

Show file

def Fst(
    ds: Dataset,
    *,
    estimator: Optional[str] = None,
    stat_divergence: Hashable = variables.stat_divergence,
    merge: bool = True,
) -> Dataset:
    """Compute Fst between pairs of cohorts.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    estimator
        Determines the formula to use for computing Fst.
        If None (the default), or ``Hudson``, Fst is calculated
        using the method of Hudson (1992) elaborated by Bhatia et al. (2013),
        (the same estimator as scikit-allel).
        Other supported estimators include ``Nei`` (1986), (the same estimator
        as tskit).
    stat_divergence
        Divergence variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_divergence_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`divergence`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Fst value between pairs of cohorts, as defined by
    :data:`sgkit.variables.stat_Fst_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing
    information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[        nan, -0.16666667],
            [-0.16666667,         nan]],
    <BLANKLINE>
        [[        nan, -0.16666667],
            [-0.16666667,         nan]],
    <BLANKLINE>
        [[        nan, -0.33333333],
            [-0.33333333,         nan]],
    <BLANKLINE>
        [[        nan, -0.33333333],
            [-0.33333333,         nan]],
    <BLANKLINE>
        [[        nan,  0.2       ],
            [ 0.2       ,         nan]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[        nan, -0.22222222],
            [-0.22222222,         nan]],
    <BLANKLINE>
        [[        nan,  0.        ],
            [ 0.        ,         nan]]])
    """
    known_estimators = {"Hudson": _Fst_Hudson, "Nei": _Fst_Nei}
    if estimator is not None and estimator not in known_estimators:
        raise ValueError(
            f"Estimator '{estimator}' is not a known estimator: {known_estimators.keys()}"
        )
    estimator = estimator or "Hudson"
    ds = define_variable_if_absent(ds, variables.stat_divergence,
                                   stat_divergence, divergence)
    variables.validate(ds, {stat_divergence: variables.stat_divergence_spec})

    n_cohorts = ds.dims["cohorts"]
    gs = da.asarray(ds.stat_divergence)
    shape = (gs.chunks[0], n_cohorts, n_cohorts)
    fst = da.map_blocks(known_estimators[estimator],
                        gs,
                        chunks=shape,
                        dtype=np.float64)
    # TODO: reinstate assert (first dim could be either variants or windows)
    # assert_array_shape(fst, n_windows, n_cohorts, n_cohorts)
    new_ds = create_dataset(
        {variables.stat_Fst: (("windows", "cohorts_0", "cohorts_1"), fst)})
    return conditional_merge_datasets(ds, new_ds, merge)

Example #6

0

Show file

def diversity(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute diversity from cohort allele counts.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    cohort_allele_count
        Cohort allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the diversity values, as defined by :data:`sgkit.variables.stat_diversity_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.5       , 0.66666667],
        [0.66666667, 0.5       ],
        [0.66666667, 0.66666667],
        [0.5       , 0.5       ],
        [0.5       , 0.5       ]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1.83333333, 1.83333333],
        [1.        , 1.        ]])
    """
    ds = define_variable_if_absent(ds, variables.cohort_allele_count,
                                   cohort_allele_count, count_cohort_alleles)
    variables.validate(
        ds, {cohort_allele_count: variables.cohort_allele_count_spec})

    ac = ds[cohort_allele_count]
    an = ac.sum(axis=2)
    n_pairs = an * (an - 1) / 2
    n_same = (ac * (ac - 1) / 2).sum(axis=2)
    n_diff = n_pairs - n_same
    # replace zeros to avoid divide by zero error
    n_pairs_na = n_pairs.where(n_pairs != 0)
    pi = n_diff / n_pairs_na

    if has_windows(ds):
        div = window_statistic(
            pi,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=pi.dtype,
            axis=0,
        )
        new_ds = create_dataset(
            {variables.stat_diversity: (
                ("windows", "cohorts"),
                div,
            )})
    else:
        new_ds = create_dataset(
            {variables.stat_diversity: (
                ("variants", "cohorts"),
                pi,
            )})
    return conditional_merge_datasets(ds, new_ds, merge)

Example #7

0

Show file

def divergence(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute divergence between pairs of cohorts.

    The entry at (i, j) is the divergence between for cohort i and cohort j,
    except for the case where i and j are the same, in which case the entry
    is the diversity for cohort i.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    cohort_allele_count
        Cohort allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the divergence value between pairs of cohorts, as defined by
    :data:`sgkit.variables.stat_divergence_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing
    information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.5       , 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
        [[0.66666667, 0.5       ],
            [0.5       , 0.5       ]],
    <BLANKLINE>
        [[0.66666667, 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
        [[0.5       , 0.375     ],
            [0.375     , 0.5       ]],
    <BLANKLINE>
        [[0.5       , 0.625     ],
            [0.625     , 0.5       ]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1.83333333, 1.5       ],
            [1.5       , 1.83333333]],
    <BLANKLINE>
        [[1.        , 1.        ],
            [1.        , 1.        ]]])
    """

    ds = define_variable_if_absent(ds, variables.cohort_allele_count,
                                   cohort_allele_count, count_cohort_alleles)
    variables.validate(
        ds, {cohort_allele_count: variables.cohort_allele_count_spec})
    ac = ds[cohort_allele_count]

    n_variants = ds.dims["variants"]
    n_cohorts = ds.dims["cohorts"]
    ac = da.asarray(ac)
    shape = (ac.chunks[0], n_cohorts, n_cohorts)
    d = da.map_blocks(_divergence, ac, chunks=shape, dtype=np.float64)
    assert_array_shape(d, n_variants, n_cohorts, n_cohorts)

    if has_windows(ds):
        div = window_statistic(
            d,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=d.dtype,
            axis=0,
        )
        new_ds = create_dataset({
            variables.stat_divergence: (
                ("windows", "cohorts_0", "cohorts_1"),
                div,
            )
        })
    else:
        new_ds = create_dataset({
            variables.stat_divergence: (
                ("variants", "cohorts_0", "cohorts_1"),
                d,
            )
        })
    return conditional_merge_datasets(ds, new_ds, merge)

Example #8

0

Show file

File: ibs.py Project: timothymillar/sgkit

def identity_by_state(
    ds: Dataset,
    *,
    call_allele_frequency: Hashable = variables.call_allele_frequency,
    merge: bool = True,
) -> Dataset:
    """Compute identity by state (IBS) probabilities between
    all pairs of samples.

    The IBS probability between a pair of individuals is the
    probability that a randomly drawn allele from the first individual
    is identical in state with a randomly drawn allele from the second
    individual at a single random locus.

    Parameters
    ----------
    ds
        Dataset containing call genotype alleles.
    call_allele_frequency
        Input variable name holding call_allele_frequency as defined by
        :data:`sgkit.variables.call_allele_frequency_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`call_allele_frequencies`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_identity_by_state_spec`
    which is a matrix of pairwise IBS probabilities among all samples.
    The dimensions are named ``samples_0`` and ``samples_1``.

    Raises
    ------
    NotImplementedError
        If the variable holding call_allele_frequency is chunked along the
        samples dimension.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=2, n_sample=3, seed=2)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2
    variants
    0         0/0  1/1  1/0
    1         1/1  1/1  1/0
    >>> sg.identity_by_state(ds)["stat_identity_by_state"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1. , 0.5, 0.5],
           [0.5, 1. , 0.5],
           [0.5, 0.5, 0.5]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_allele_frequency,
        call_allele_frequency,
        call_allele_frequencies,
    )
    variables.validate(
        ds, {call_allele_frequency: variables.call_allele_frequency_spec}
    )
    af = da.asarray(ds[call_allele_frequency])
    if len(af.chunks[1]) > 1:
        raise NotImplementedError(
            "identity_by_state does not support chunking in the samples dimension"
        )
    af0 = da.where(da.isnan(af), 0.0, af)
    num = da.einsum("ixj,iyj->xy", af0, af0)
    called = da.nansum(af, axis=-1)
    count = da.einsum("ix,iy->xy", called, called)
    denom = da.where(count == 0, np.nan, count)
    new_ds = create_dataset(
        {
            variables.stat_identity_by_state: (
                ("samples_0", "samples_1"),
                num / denom,
            )
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)

Example #9

0

Show file

File: ibs.py Project: timothymillar/sgkit

def Weir_Goudet_beta(
    ds: Dataset,
    *,
    stat_identity_by_state: Hashable = variables.stat_identity_by_state,
    merge: bool = True,
) -> Dataset:
    """Estimate pairwise beta between all pairs of samples as described
    in Weir and Goudet 2017 [1].

    Beta is the kinship scaled by the average kinship of all pairs of
    individuals in the dataset such that the non-diagonal (non-self) values
    sum to zero.

    Beta may be corrected to more accurately reflect pedigree based kinship
    estimates using the formula
    :math:`\\hat{\\beta}^c=\\frac{\\hat{\\beta}-\\hat{\\beta}_0}{1-\\hat{\\beta}_0}`
    where :math:`\\hat{\\beta}_0` is the estimated beta between samples which are
    known to be unrelated [1].

    Parameters
    ----------
    ds
        Genotype call dataset.
    stat_identity_by_state
        Input variable name holding stat_identity_by_state as defined
        by :data:`sgkit.variables.stat_identity_by_state_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`identity_by_state`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_Weir_Goudet_beta_spec`
    which is a matrix of estimated pairwise kinship relative to the average
    kinship of all pairs of individuals in the dataset.
    The dimensions are named ``samples_0`` and ``samples_1``.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=3, n_sample=3, n_allele=10, seed=3)
    >>> # sample 2 "inherits" alleles from samples 0 and 1
    >>> ds.call_genotype.data[:, 2, 0] = ds.call_genotype.data[:, 0, 0]
    >>> ds.call_genotype.data[:, 2, 1] = ds.call_genotype.data[:, 1, 0]
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2
    variants
    0         7/1  8/6  7/8
    1         9/5  3/6  9/3
    2         8/8  8/3  8/8
    >>> # estimate beta
    >>> ds = sg.Weir_Goudet_beta(ds).compute()
    >>> ds.stat_Weir_Goudet_beta.values # doctest: +NORMALIZE_WHITESPACE
    array([[ 0.5 , -0.25,  0.25],
           [-0.25,  0.25,  0.  ],
           [ 0.25,  0.  ,  0.5 ]])
    >>> # correct beta assuming least related samples are unrelated
    >>> beta = ds.stat_Weir_Goudet_beta
    >>> beta0 = beta.min()
    >>> beta_corrected = (beta - beta0) / (1 - beta0)
    >>> beta_corrected.values # doctest: +NORMALIZE_WHITESPACE
    array([[0.6, 0. , 0.4],
           [0. , 0.4, 0.2],
           [0.4, 0.2, 0.6]])

    References
    ----------
    [1] - Bruce, S. Weir, and Jérôme Goudet 2017.
    "A Unified Characterization of Population Structure and Relatedness."
    Genetics 206 (4): 2085-2103.
    """
    ds = define_variable_if_absent(
        ds, variables.stat_identity_by_state, stat_identity_by_state, identity_by_state
    )
    variables.validate(
        ds, {stat_identity_by_state: variables.stat_identity_by_state_spec}
    )
    ibs = ds[stat_identity_by_state].data
    # average matching is the mean of non-diagonal elements
    num = da.nansum(da.tril(ibs, -1))
    denom = da.nansum(da.tril(~da.isnan(ibs), -1))
    avg = num / denom
    beta = (ibs - avg) / (1 - avg)
    new_ds = create_dataset(
        {
            variables.stat_Weir_Goudet_beta: (
                ("samples_0", "samples_1"),
                beta,
            )
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)

Example #10

0

Show file

File: aggregation.py Project: hammer/sgkit

def individual_heterozygosity(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute per call individual heterozygosity.

    Individual heterozygosity is the probability that two alleles
    drawn at random without replacement, from an individual at a
    given site, are not identical in state. Therefore, individual
    heterozygosity is defined for diploid and polyploid calls but
    will return nan in the case of haploid calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.
    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_heterozygosity_spec`
    of per genotype observed heterozygosity with shape (variants, samples)
    containing values within the interval [0, 1] or nan if ploidy < 2.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.individual_heterozygosity(ds)["call_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1., 1.],
           [1., 0.],
           [1., 1.],
           [0., 0.]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    AC = da.asarray(ds.call_allele_count)
    K = AC.sum(axis=-1)
    # use nan denominator to avoid divide by zero with K - 1
    K2 = da.where(K > 1, K, np.nan)
    AF = AC / K2[..., None]
    HI = (1 - da.sum(AF ** 2, axis=-1)) * (K / (K2 - 1))
    new_ds = create_dataset(
        {variables.call_heterozygosity: (("variants", "samples"), HI)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)

Example #11

0

Show file

def Tajimas_D(
    ds: Dataset,
    *,
    variant_allele_count: Hashable = variables.variant_allele_count,
    stat_diversity: Hashable = variables.stat_diversity,
    merge: bool = True,
) -> Dataset:
    """Compute Tajimas' D for a genotype call dataset.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    variant_allele_count
        Variant allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.variant_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_variant_alleles`.
    stat_diversity
        Diversity variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_diversity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`diversity`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Tajimas' D value, as defined by :data:`sgkit.variables.stat_Tajimas_D_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.88883234, 2.18459998],
           [2.18459998, 0.88883234],
           [2.18459998, 2.18459998],
           [0.88883234, 0.88883234],
           [0.88883234, 0.88883234]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[2.40517586, 2.40517586],
           [1.10393559, 1.10393559]])
    """
    ds = define_variable_if_absent(ds, variables.variant_allele_count,
                                   variant_allele_count, count_variant_alleles)
    ds = define_variable_if_absent(ds, variables.stat_diversity,
                                   stat_diversity, diversity)
    variables.validate(
        ds,
        {
            variant_allele_count: variables.variant_allele_count_spec,
            stat_diversity: variables.stat_diversity_spec,
        },
    )

    ac = ds[variant_allele_count]
    ac = da.asarray(ac)

    # count segregating. Note that this uses the definition in tskit,
    # which is the number of alleles - 1. In the biallelic case this
    # gives us the number of non-monomorphic sites.
    S = (ac > 0).sum(axis=1) - 1

    if has_windows(ds):
        S = window_statistic(
            S,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=S.dtype,
            axis=0,
        )

    # assume number of chromosomes sampled is constant for all variants
    # NOTE: even tho ac has dtype uint, we promote the sum to float
    #       because the computation below requires floats
    n = ac.sum(axis=1, dtype="float").max()

    # (n-1)th harmonic number
    a1 = (1 / da.arange(1, n)).sum()

    # calculate Watterson's theta (absolute value)
    theta = S / a1

    # get diversity
    div = ds[stat_diversity]

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here we want the absolute difference
    d = div - theta[:, np.newaxis]

    # calculate the denominator (standard deviation)
    a2 = (1 / (da.arange(1, n)**2)).sum()
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)
    d_stdev = da.sqrt((e1 * S) + (e2 * S * (S - 1)))

    # Let IEEE decide the semantics of division by zero here. The return value
    # will be -inf, nan or +inf, depending on the value of the numerator.
    # Currently this will raise a RuntimeWarning, if we divide by zero.
    D = d / d_stdev[:, np.newaxis]

    if has_windows(ds):
        new_ds = create_dataset(
            {variables.stat_Tajimas_D: (["windows", "cohorts"], D.data)})
    else:
        new_ds = create_dataset(
            {variables.stat_Tajimas_D: (["variants", "cohorts"], D.data)})
    return conditional_merge_datasets(ds, new_ds, merge)

Example #12

0

Show file

File: aggregation.py Project: hammer/sgkit

def count_cohort_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort allele counts from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_count_spec`
    of allele counts with shape (variants, cohorts, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.count_cohort_alleles(ds)["cohort_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[3, 1],
            [2, 2]],
    <BLANKLINE>
            [[2, 2],
            [3, 1]],
    <BLANKLINE>
            [[2, 2],
            [2, 2]],
    <BLANKLINE>
            [[1, 3],
            [1, 3]],
    <BLANKLINE>
            [[3, 1],
            [1, 3]]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    n_variants = ds.dims["variants"]
    n_alleles = ds.dims["alleles"]

    AC, SC = da.asarray(ds[call_allele_count]), da.asarray(ds[sample_cohort])
    n_cohorts = SC.max().compute() + 1  # 0-based indexing
    C = da.empty(n_cohorts, dtype=np.uint8)

    G = da.asarray(ds.call_genotype)
    shape = (G.chunks[0], n_cohorts, n_alleles)

    AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32)
    assert_array_shape(
        AC, n_variants, n_cohorts * AC.numblocks[1], n_alleles * AC.numblocks[2]
    )

    # Stack the blocks and sum across them
    # (which will only work because each chunk is guaranteed to have same size)
    AC = da.stack([AC.blocks[:, i] for i in range(AC.numblocks[1])]).sum(axis=0)
    assert_array_shape(AC, n_variants, n_cohorts, n_alleles)

    new_ds = create_dataset(
        {variables.cohort_allele_count: (("variants", "cohorts", "alleles"), AC)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)

Example #13

0

Show file

def observed_heterozygosity(
    ds: Dataset,
    *,
    call_heterozygosity: Hashable = variables.call_heterozygosity,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort observed heterozygosity.

    The observed heterozygosity of a cohort is the mean of individual
    heterozygosity values among all samples of that cohort as described
    in :func:`individual_heterozygosity`. Calls with a nan value for
    individual heterozygosity are ignored when calculating the cohort
    mean.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling
    this function.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_heterozygosity
        Input variable name holding call_heterozygosity as defined by
        :data:`sgkit.variables.call_heterozygosity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`individual_heterozygosity`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.
    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_observed_heterozygosity_spec`
    of per cohort observed heterozygosity with shape (variants, cohorts)
    containing values within the inteval [0, 1] or nan.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.5, 1. ],
        [1. , 0.5],
        [0. , 1. ],
        [0.5, 0.5],
        [0.5, 0.5]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1.5, 2.5],
        [1. , 1. ]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_heterozygosity,
        call_heterozygosity,
        individual_heterozygosity,
    )
    variables.validate(
        ds, {call_heterozygosity: variables.call_heterozygosity_spec})
    hi = da.asarray(ds[call_heterozygosity])
    sc = da.asarray(ds[sample_cohort])
    n_cohorts = sc.max().compute() + 1
    shape = (hi.chunks[0], n_cohorts)
    n = da.zeros(n_cohorts, dtype=np.uint8)
    ho = da.map_blocks(
        _cohort_observed_heterozygosity,
        hi,
        sc,
        n,
        chunks=shape,
        drop_axis=1,
        new_axis=1,
        dtype=np.float64,
    )
    if has_windows(ds):
        ho_sum = window_statistic(
            ho,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=ho.dtype,
            axis=0,
        )
        new_ds = create_dataset({
            variables.stat_observed_heterozygosity: (
                ("windows", "cohorts"),
                ho_sum,
            )
        })
    else:
        new_ds = create_dataset({
            variables.stat_observed_heterozygosity: (
                ("variants", "cohorts"),
                ho,
            )
        })
    return conditional_merge_datasets(ds, new_ds, merge)

Example #14

0

Show file

def Tajimas_D(
    ds: Dataset,
    *,
    variant_allele_count: Hashable = variables.variant_allele_count,
    stat_diversity: Hashable = variables.stat_diversity,
    merge: bool = True,
) -> Dataset:
    """Compute Tajimas' D for a genotype call dataset.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    variant_allele_count
        Variant allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.variant_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_variant_alleles`.
    stat_diversity
        Diversity variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_diversity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`diversity`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Tajimas' D value, as defined by :data:`sgkit.variables.stat_Tajimas_D_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[-3.35891429, -2.96698697],
        [-2.96698697, -3.35891429],
        [-2.96698697, -2.96698697],
        [-3.35891429, -3.35891429],
        [-3.35891429, -3.35891429]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[-0.22349574, -0.22349574],
        [-2.18313233, -2.18313233]])
    """
    ds = define_variable_if_absent(ds, variables.variant_allele_count,
                                   variant_allele_count, count_variant_alleles)
    ds = define_variable_if_absent(ds, variables.stat_diversity,
                                   stat_diversity, diversity)
    variables.validate(
        ds,
        {
            variant_allele_count: variables.variant_allele_count_spec,
            stat_diversity: variables.stat_diversity_spec,
        },
    )

    ac = ds[variant_allele_count]

    # count segregating
    S = ((ac > 0).sum(axis=1) > 1).sum()

    # assume number of chromosomes sampled is constant for all variants
    # NOTE: even tho ac has dtype uint, we promote the sum to float
    #       because the computation below requires floats
    n = ac.sum(axis=1, dtype="float").max()

    # (n-1)th harmonic number
    a1 = (1 / da.arange(1, n)).sum()

    # calculate Watterson's theta (absolute value)
    theta = S / a1

    # get diversity
    div = ds[stat_diversity]

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here we want the absolute difference
    d = div - theta

    # calculate the denominator (standard deviation)
    a2 = (1 / (da.arange(1, n)**2)).sum()
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)
    d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1)))

    if d_stdev == 0:
        D = np.nan
    else:
        # finally calculate Tajima's D
        D = d / d_stdev

    new_ds = create_dataset({variables.stat_Tajimas_D: D})
    return conditional_merge_datasets(ds, new_ds, merge)

Example #15

0

Show file

File: aggregation.py Project: hammer/sgkit

def call_allele_frequencies(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute per sample allele frequencies from genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_allele_frequency_spec`
    of allele frequencies with shape (variants, samples, alleles) and values
    corresponding to the frequency of non-missing occurrences of each allele.

    Examples
    --------
    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0
    >>> sg.call_allele_frequencies(ds)["call_allele_frequency"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.5, 0.5],
            [0.5, 0.5]],
    <BLANKLINE>
           [[0.5, 0.5],
            [0. , 1. ]],
    <BLANKLINE>
           [[0.5, 0.5],
            [0.5, 0.5]],
    <BLANKLINE>
           [[1. , 0. ],
            [1. , 0. ]]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})
    AC = ds[call_allele_count]
    K = AC.sum(dim="alleles")
    # avoid divide by zero
    AF = AC / xr.where(K > 0, K, np.nan)  # type: ignore[no-untyped-call]
    new_ds = create_dataset({variables.call_allele_frequency: AF})
    return conditional_merge_datasets(ds, new_ds, merge)

Example #16

0

Show file

def pbs(
    ds: Dataset,
    *,
    stat_Fst: Hashable = variables.stat_Fst,
    cohorts: Optional[Sequence[Union[Tuple[int, int, int],
                                     Tuple[str, str, str]]]] = None,
    merge: bool = True,
) -> Dataset:
    """Compute the population branching statistic (PBS) between cohort triples.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    stat_Fst
        Fst variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_Fst_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`Fst`.
    cohorts
        The cohort triples to compute statistics for, specified as a sequence of
        tuples of cohort indexes or IDs. None (the default) means compute statistics
        for all cohorts.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the PBS value between cohort triples, as defined by
    :data:`sgkit.variables.stat_pbs_spec`.
    Shape (variants, cohorts, cohorts, cohorts), or
    (windows, cohorts, cohorts, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=6)

    >>> # Divide samples into three named cohorts
    >>> n_cohorts = 3
    >>> sample_cohort = np.repeat(range(n_cohorts), ds.dims["samples"] // n_cohorts)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")
    >>> cohort_names = [f"co_{i}" for i in range(n_cohorts)]
    >>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})

    >>> # Divide into two windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
    array([ 0.      , -0.160898])
    """

    ds = define_variable_if_absent(ds, variables.stat_Fst, stat_Fst, Fst)
    variables.validate(ds, {stat_Fst: variables.stat_Fst_spec})

    fst = ds[variables.stat_Fst]
    fst = fst.clip(min=0, max=(1 - np.finfo(float).epsneg))

    t = -np.log(1 - fst)
    n_cohorts = ds.dims["cohorts"]
    n_windows = ds.dims["windows"]
    assert_array_shape(t, n_windows, n_cohorts, n_cohorts)

    # calculate PBS triples
    t = da.asarray(t)
    shape = (t.chunks[0], n_cohorts, n_cohorts, n_cohorts)

    cohorts = cohorts or list(itertools.combinations(range(n_cohorts),
                                                     3))  # type: ignore
    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts_0", None))

    p = da.map_blocks(lambda t: _pbs_cohorts(t, ct),
                      t,
                      chunks=shape,
                      new_axis=3,
                      dtype=np.float64)
    assert_array_shape(p, n_windows, n_cohorts, n_cohorts, n_cohorts)

    new_ds = create_dataset({
        variables.stat_pbs:
        (["windows", "cohorts_0", "cohorts_1", "cohorts_2"], p)
    })
    return conditional_merge_datasets(ds, new_ds, merge)

Example #17

0

Show file

File: aggregation.py Project: timothymillar/sgkit

def cohort_allele_frequencies(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute allele frequencies for each cohort.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    cohort_allele_count
        Input variable name holding cohort_allele_count as defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_frequency_spec`
    of allele frequencies with shape (variants, cohorts, alleles) and values
    corresponding to the frequency of non-missing occurrences of each allele.

    Examples
    --------
    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.cohort_allele_frequencies(ds)["cohort_allele_frequency"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.75, 0.25],
            [0.5 , 0.5 ]],
    <BLANKLINE>
            [[0.5 , 0.5 ],
            [0.75, 0.25]],
    <BLANKLINE>
            [[0.5 , 0.5 ],
            [0.5 , 0.5 ]],
    <BLANKLINE>
            [[0.25, 0.75],
            [0.25, 0.75]],
    <BLANKLINE>
            [[0.75, 0.25],
            [0.25, 0.75]]])
    """
    ds = define_variable_if_absent(ds, variables.cohort_allele_count,
                                   cohort_allele_count, count_cohort_alleles)
    variables.validate(
        ds, {cohort_allele_count: variables.cohort_allele_count_spec})
    AC = ds[cohort_allele_count]
    AF = AC / AC.sum(dim="alleles")
    new_ds = create_dataset({variables.cohort_allele_frequency: AF})
    return conditional_merge_datasets(ds, new_ds, merge)