Exemple #1
0
def count_genotypes(
    ds: Dataset,
    dim: Dimension,
    call_genotype: Hashable = variables.call_genotype,
    call_genotype_mask: Hashable = variables.call_genotype_mask,
    merge: bool = True,
) -> Dataset:
    """Count heterozygous, homozygous and non-reference calls along ``dim``.

    Parameters
    ----------
    ds
        Dataset containing genotype calls and their missingness mask.
    dim
        Dimension that the per-call indicators are summed over.
    call_genotype
        Name of the genotype call variable, validated against
        ``variables.call_genotype_spec``.
    call_genotype_mask
        Name of the genotype mask variable, validated against
        ``variables.call_genotype_mask_spec``.
    merge
        Passed to ``conditional_merge_datasets`` along with ``ds`` and the
        newly computed variables.

    Returns
    -------
    The result of ``conditional_merge_datasets`` for the variables
    ``<prefix>_n_het``, ``<prefix>_n_hom_ref``, ``<prefix>_n_hom_alt`` and
    ``<prefix>_n_non_ref``, where ``<prefix>`` is ``_swap(dim)[:-1]``.
    """
    variables.validate(
        ds,
        {
            call_genotype_mask: variables.call_genotype_mask_spec,
            call_genotype: variables.call_genotype_spec,
        },
    )
    prefix = _swap(dim)[:-1]
    genotypes = ds[call_genotype]
    # A call counts as missing as soon as any one of its alleles is masked.
    any_missing = ds[call_genotype_mask].any(dim="ploidy")

    is_hom_ref = (genotypes == 0).all(dim="ploidy")
    # Homozygous alternate: every allele is non-reference and equals the first.
    is_hom_alt = ((genotypes > 0) & (genotypes[..., 0] == genotypes)).all(
        dim="ploidy"
    )
    is_non_ref = (genotypes > 0).any(dim="ploidy")
    is_het = ~(is_hom_alt | is_hom_ref)

    def _count(flags: "xr.DataArray") -> "xr.DataArray":
        # Missing calls are forced to False so they are never counted; without
        # this a partially missing call would be tallied as heterozygous.
        return xr.where(any_missing, False, flags).sum(dim=dim)  # type: ignore[no-untyped-call]

    counts = {
        f"{prefix}_n_het": _count(is_het),
        f"{prefix}_n_hom_ref": _count(is_hom_ref),
        f"{prefix}_n_hom_alt": _count(is_hom_alt),
        f"{prefix}_n_non_ref": _count(is_non_ref),
    }
    return conditional_merge_datasets(ds, create_dataset(counts), merge)
Exemple #2
0
def convert_probability_to_call(
    ds: Dataset,
    call_genotype_probability: str = variables.call_genotype_probability,
    threshold: float = 0.9,
    merge: bool = True,
) -> Dataset:
    """
    Turn genotype probabilities into hard genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype probabilities, such as from :func:`sgkit.io.bgen.read_bgen`.
    call_genotype_probability
        Name of the genotype probability variable to convert, as defined by
        :data:`sgkit.variables.call_genotype_probability_spec`.
    threshold
        Probability threshold in [0, 1] that at least one genotype probability
        must meet or exceed for a call to be made; otherwise all values are
        -1 (missing). A threshold of 0 disables any effect it has.
        Default value is 0.9.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - `call_genotype` (variants, samples, ploidy): Converted hard calls.
        Defined by :data:`sgkit.variables.call_genotype_spec`.

    - `call_genotype_mask` (variants, samples, ploidy): Mask for converted hard calls.
        Defined by :data:`sgkit.variables.call_genotype_mask_spec`.

    Raises
    ------
    ValueError
        If ``threshold`` is outside [0, 1].
    NotImplementedError
        If the ``genotypes`` dimension does not have exactly 3 entries.
    """
    threshold_in_range = 0 <= threshold <= 1
    if not threshold_in_range:
        raise ValueError(
            f"Threshold must be float in [0, 1], not {threshold}.")

    spec_by_name = {
        call_genotype_probability: variables.call_genotype_probability_spec
    }
    variables.validate(ds, spec_by_name)

    if ds.dims["genotypes"] != 3:
        raise NotImplementedError(
            f"Hard call conversion only supported for diploid, biallelic genotypes; "
            f"num genotypes in provided probabilities array = {ds.dims['genotypes']}."
        )

    probabilities = da.asarray(ds[call_genotype_probability])
    # The genotypes axis must sit in a single chunk; collapse it if it is split.
    genotype_axis_chunks = probabilities.chunks[2]
    if len(genotype_axis_chunks) > 1:
        probabilities = probabilities.rechunk((None, None, -1))

    # NOTE(review): presumably a placeholder whose length (2) fixes the size of
    # the output ploidy axis -- confirm against _convert_probability_to_call.
    ploidy_placeholder = da.empty(2, dtype=np.uint8)
    hard_calls = _convert_probability_to_call(
        probabilities, ploidy_placeholder, threshold
    )

    call_dims = ("variants", "samples", "ploidy")
    new_ds = create_dataset({
        variables.call_genotype: (call_dims, hard_calls),
        # Negative calls are the missing ones.
        variables.call_genotype_mask: (call_dims, hard_calls < 0),
    })
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #3
0
def count_variant_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Sum per-sample allele counts into per-variant allele counts.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.variant_allele_count_spec`
    of allele counts with shape (variants, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_variant_alleles(ds)["variant_allele_count"].values # doctest: +SKIP
    array([[2, 2],
           [1, 3],
           [2, 2],
           [4, 0]], dtype=uint64)
    """
    # Make sure the per-call counts exist before validating them.
    ds = define_variable_if_absent(
        ds,
        variables.call_allele_count,
        call_allele_count,
        count_call_alleles,
    )
    variables.validate(
        ds, {call_allele_count: variables.call_allele_count_spec}
    )

    # Collapsing the samples dimension leaves one count per variant and allele.
    per_variant_counts = ds[call_allele_count].sum(dim="samples")
    new_ds = create_dataset(
        {variables.variant_allele_count: per_variant_counts}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #4
0
def test_variables__multiple_specs(dummy_ds: xr.Dataset) -> None:
    """Validation accepts several spec mappings and reports the failing variable."""
    matching = ArrayLikeSpec("baz", "baz doc", kind="i", ndim=1)
    mismatching = ArrayLikeSpec("baz", "baz doc", kind="i", ndim=2)

    # Each of these mappings is expected to validate without raising.
    accepted_mappings = (
        {"foo": matching, "bar": matching},
        {"foo": matching},
        {"bar": matching},
    )
    for mapping in accepted_mappings:
        variables.validate(dummy_ds, mapping)

    expected_message = "bar does not match the spec"
    # A single mapping carrying the wrong spec is rejected ...
    with pytest.raises(ValueError, match=expected_message):
        variables.validate(dummy_ds, {"bar": mismatching})
    # ... and so is a wrong spec passed as an additional positional mapping.
    with pytest.raises(ValueError, match=expected_message):
        variables.validate(dummy_ds, {"foo": matching}, {"bar": mismatching})
Exemple #5
0
def test_variables__validate_by_name(dummy_ds: xr.Dataset) -> None:
    """A registered spec can be used for validation by variable name alone."""
    foo_spec = ArrayLikeSpec("foo", "foo doc", kind="i", ndim=1)
    try:
        # Precondition: nothing is registered under "foo" yet.
        assert "foo" not in SgkitVariables.registered_variables

        registered_name, registered_spec = SgkitVariables.register_variable(
            foo_spec
        )

        assert "foo" in SgkitVariables.registered_variables
        assert registered_name == "foo"
        assert registered_spec == foo_spec
        variables.validate(dummy_ds, "foo")
    finally:
        # Undo the registration so it does not leak out of this test.
        SgkitVariables.registered_variables.pop("foo", None)
        assert "foo" not in SgkitVariables.registered_variables
Exemple #6
0
def test_variables__whole_ds(dummy_ds: xr.Dataset) -> None:
    """Validate a dataset with no explicit specs after registering its variables."""
    foo_spec = ArrayLikeSpec("foo", "foo doc", kind="i", ndim=1)
    bar_spec = ArrayLikeSpec("bar", "bar doc", kind="i", ndim=1)
    try:
        SgkitVariables.register_variable(foo_spec)
        # Registering the same spec a second time must be rejected.
        with pytest.raises(ValueError, match="`foo` already registered"):
            SgkitVariables.register_variable(foo_spec)
        SgkitVariables.register_variable(bar_spec)

        variables.validate(dummy_ds)
    finally:
        # Remove both registrations whether or not the assertions passed.
        for registered_name in ("foo", "bar"):
            SgkitVariables.registered_variables.pop(registered_name, None)
Exemple #7
0
def infer_call_genotype_fill(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Compute a boolean mask of fill (non-allele) values in genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype, validated against
        ``variables.call_genotype_spec``. Must be present in ``ds``.
    merge
        Passed to ``conditional_merge_datasets`` along with ``ds`` and the
        newly computed variable.

    Returns
    -------
    A dataset containing ``variables.call_genotype_fill``. The mask is True
    where a genotype value is below -1 when the genotype variable carries a
    truthy ``mixed_ploidy`` attribute, and False everywhere otherwise.
    """
    variables.validate(ds, {call_genotype: variables.call_genotype_spec})
    # Read everything from the variable named by ``call_genotype`` so that a
    # custom variable name is honoured for the ``mixed_ploidy`` flag as well.
    genotypes = ds[call_genotype]
    mixed_ploidy = genotypes.attrs.get("mixed_ploidy", False)
    if mixed_ploidy:
        call_genotype_fill = genotypes < -1
    else:
        call_genotype_fill = xr.full_like(genotypes, False, "b1")
    new_ds = create_dataset({variables.call_genotype_fill: call_genotype_fill})
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
Exemple #8
0
def pca_stats(ds: Dataset,
              est: BaseEstimator,
              *,
              merge: bool = True) -> Dataset:
    """Extract attributes from PCA estimator.

    Parameters
    ----------
    ds
        Dataset that the extracted statistics are optionally merged into.
    est
        Estimator from which ``components_``, ``explained_variance_`` and
        ``explained_variance_ratio_`` are read via ``_get``.
    merge
        Passed to ``conditional_merge_datasets`` along with ``ds`` and the
        newly computed variables.

    Returns
    -------
    A dataset holding every statistic for which ``_get`` returned a value
    other than None, plus ``variables.sample_pca_loading`` when both the
    components and the explained variance are available.
    """
    candidates = {
        variables.sample_pca_component: (
            ("variants", "components"),
            _get(est, "components_", fn=lambda v: v.T),
        ),
        variables.sample_pca_explained_variance: (
            "components",
            _get(est, "explained_variance_"),
        ),
        variables.sample_pca_explained_variance_ratio: (
            "components",
            _get(est, "explained_variance_ratio_"),
        ),
    }
    # Keep only the statistics for which a value was obtained.
    new_ds = Dataset(
        {name: entry for name, entry in candidates.items() if entry[1] is not None}
    )
    # Test membership with the same constants used to insert and read the
    # variables, rather than separately spelled string literals.
    has_components = variables.sample_pca_component in new_ds
    has_variance = variables.sample_pca_explained_variance in new_ds
    if has_components and has_variance:
        # Loadings are the components scaled by sqrt(explained variance).
        new_ds[variables.sample_pca_loading] = new_ds[
            variables.sample_pca_component] * np.sqrt(
                new_ds[variables.sample_pca_explained_variance].data)
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
Exemple #9
0
def allele_frequency(
    ds: Dataset,
    call_genotype_mask: Hashable,
    variant_allele_count: Hashable,
) -> Dataset:
    """Compute per-variant allele totals and allele frequencies.

    Parameters
    ----------
    ds
        Dataset containing the genotype mask and, optionally, variant allele
        counts.
    call_genotype_mask
        Name of the genotype mask variable; unmasked entries are counted as
        observed alleles.
    variant_allele_count
        Name of the variant allele count variable. When present in ``ds`` it
        is validated against ``variables.variant_allele_count_spec`` and used
        directly; otherwise the counts are computed with
        ``count_variant_alleles`` and included in the output.

    Returns
    -------
    A dataset with ``variables.variant_allele_total`` and
    ``variables.variant_allele_frequency``, and additionally
    ``variables.variant_allele_count`` when it had to be computed here.
    """
    outputs: Dict[Hashable, Any] = {}

    if variant_allele_count not in ds:
        # Counts are absent: derive them and return them with the results.
        allele_counts = count_variant_alleles(ds, merge=False)[
            variables.variant_allele_count
        ]
        outputs[variables.variant_allele_count] = allele_counts
    else:
        variables.validate(
            ds, {variant_allele_count: variables.variant_allele_count_spec}
        )
        allele_counts = ds[variant_allele_count]

    # Flatten samples x ploidy into one axis, then count the unmasked entries.
    flat_mask = ds[call_genotype_mask].stack(calls=("samples", "ploidy"))
    allele_totals = (~flat_mask).sum(dim="calls")
    assert allele_totals.shape == (ds.dims["variants"],)

    outputs[variables.variant_allele_total] = allele_totals
    outputs[variables.variant_allele_frequency] = allele_counts / allele_totals
    return create_dataset(outputs)
Exemple #10
0
def infer_call_ploidy(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    call_genotype_non_allele: Hashable = variables.call_genotype_non_allele,
    merge: bool = True,
) -> Dataset:
    """Infer the ploidy of each call genotype based on the number of
    non-allele values in each call genotype.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    call_genotype_non_allele
        Input variable name holding call_genotype_non_allele as defined by
        :data:`sgkit.variables.call_genotype_non_allele_spec`.
        If the variable is not present in ``ds``, it will be computed
        assuming that allele values less than -1 are non-alleles in mixed ploidy
        datasets, or that no non-alleles are present in fixed ploidy datasets.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_ploidy_spec`.
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_genotype_non_allele,
        call_genotype_non_allele,
        infer_non_alleles,
    )
    # Use the variable named by ``call_genotype`` (not the default name) so
    # that the documented parameter actually takes effect.
    genotypes = ds[call_genotype]
    mixed_ploidy = genotypes.attrs.get("mixed_ploidy", False)
    if mixed_ploidy:
        # Ploidy of a call is the number of entries that are real alleles.
        call_ploidy = (~ds[call_genotype_non_allele]).sum(
            axis=-1)  # type: ignore[operator]
    else:
        # Fixed ploidy: every call has the full length of the last axis.
        ploidy = genotypes.shape[-1]
        call_ploidy = xr.full_like(genotypes[..., 0], ploidy)

    new_ds = create_dataset({variables.call_ploidy: call_ploidy})
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
Exemple #11
0
def pca_transform(
    ds: Dataset,
    est: BaseEstimator,
    *,
    variable: str = "call_alternate_allele_count",
    check_missing: bool = True,
    merge: bool = True,
) -> Dataset:
    """Apply PCA estimator to new data.

    Parameters
    ----------
    ds
        Dataset providing the allele counts to project.
    est
        Estimator whose ``transform`` method is applied to the transposed
        allele counts.
    variable
        Name of the allele count variable handed to ``_allele_counts``.
    check_missing
        Forwarded to ``_allele_counts``.
    merge
        Passed to ``conditional_merge_datasets`` along with ``ds`` and the
        newly computed variable.

    Returns
    -------
    A dataset containing ``variables.sample_pca_projection`` with dimensions
    (samples, components).
    """
    allele_counts = _allele_counts(ds, variable, check_missing=check_missing)
    # The estimator receives the transpose of the allele count array.
    transposed_counts = da.asarray(allele_counts).T
    projection = est.transform(transposed_counts)

    projection_dims = ("samples", "components")
    new_ds = Dataset(
        {variables.sample_pca_projection: (projection_dims, projection)}
    )
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
Exemple #12
0
def infer_sample_ploidy(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    call_ploidy: Hashable = variables.call_ploidy,
    merge: bool = True,
) -> Dataset:
    """Infer the ploidy of each sample across all variants based on
    the number of non-allele values in call genotypes.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    call_ploidy
        Input variable name holding call_ploidy as defined by
        :data:`sgkit.variables.call_ploidy_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`infer_call_ploidy`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.sample_ploidy_spec`.
    For mixed ploidy data a sample whose call ploidy is not the same at
    every variant is assigned -1.
    """
    ds = define_variable_if_absent(ds, variables.call_ploidy, call_ploidy,
                                   infer_call_ploidy)
    # Use the variable named by ``call_genotype`` (not the default name) so
    # that the documented parameter actually takes effect.
    genotypes = ds[call_genotype]
    mixed_ploidy = genotypes.attrs.get("mixed_ploidy", False)
    if mixed_ploidy:
        # Compare every call's ploidy with the ploidy at the first variant.
        # Operand order matters: the 1-D first-variant slice comes first, so
        # the variant dimension is expected to end up last and ``axis=-1``
        # reduces over variants (assumes call_ploidy is (variants, samples)).
        sample_ploidy_fixed = (ds[call_ploidy][0, :] == ds[call_ploidy]).all(
            axis=-1)
        sample_ploidy = xr.where(sample_ploidy_fixed, ds[call_ploidy][0, :],
                                 -1)  # type: ignore[no-untyped-call]
    else:
        # Fixed ploidy: every sample has the full length of the last axis.
        ploidy = genotypes.shape[-1]
        sample_ploidy = xr.full_like(ds[call_ploidy][0, ...], ploidy)

    new_ds = create_dataset({variables.sample_ploidy: sample_ploidy})
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
Exemple #13
0
def individual_heterozygosity(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute per call individual heterozygosity.

    Individual heterozygosity is the probability that two alleles
    drawn at random without replacement, from an individual at a
    given site, are not identical in state. Therefore, individual
    heterozygosity is defined for diploid and polyploid calls but
    will return nan in the case of haploid calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_heterozygosity_spec`
    of per genotype observed heterozygosity with shape (variants, samples)
    containing values within the interval [0, 1] or nan if ploidy < 2.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.individual_heterozygosity(ds)["call_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1., 1.],
           [1., 0.],
           [1., 1.],
           [0., 0.]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    # Look the counts up by the caller-supplied name; attribute access
    # (``ds.call_allele_count``) would always read the default variable.
    AC = da.asarray(ds[call_allele_count])
    # Number of observed alleles per call.
    K = AC.sum(axis=-1)
    # use nan denominator to avoid divide by zero with K - 1
    K2 = da.where(K > 1, K, np.nan)
    AF = AC / K2[..., None]
    # (1 - sum of squared frequencies), scaled by K / (K - 1) to account for
    # sampling without replacement.
    HI = (1 - da.sum(AF ** 2, axis=-1)) * (K / (K2 - 1))
    new_ds = create_dataset(
        {variables.call_heterozygosity: (("variants", "samples"), HI)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #14
0
def sample_stats(
    ds: Dataset,
    *,
    call_genotype_mask: Hashable = variables.call_genotype_mask,
    call_genotype: Hashable = variables.call_genotype,
    variant_allele_count: Hashable = variables.variant_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute quality control sample statistics from genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype_mask
        Input variable name holding call_genotype_mask.
        Defined by :data:`sgkit.variables.call_genotype_mask_spec`
        Must be present in ``ds``.
    call_genotype
        Input variable name holding call_genotype.
        Defined by :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    variant_allele_count
        Input variable name holding variant_allele_count,
        as defined by :data:`sgkit.variables.variant_allele_count_spec`.
        Accepted as part of the signature; the body of this function
        does not read it.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - :data:`sgkit.variables.sample_n_called_spec` (samples):
      The number of variants with called genotypes.
    - :data:`sgkit.variables.sample_call_rate_spec` (samples):
      The fraction of variants with called genotypes.
    - :data:`sgkit.variables.sample_n_het_spec` (samples):
      The number of variants with heterozygous calls.
    - :data:`sgkit.variables.sample_n_hom_ref_spec` (samples):
      The number of variants with homozygous reference calls.
    - :data:`sgkit.variables.sample_n_hom_alt_spec` (samples):
      The number of variants with homozygous alternate calls.
    - :data:`sgkit.variables.sample_n_non_ref_spec` (samples):
      The number of variants that are not homozygous reference calls.
    """
    required_specs = {
        call_genotype: variables.call_genotype_spec,
        call_genotype_mask: variables.call_genotype_mask_spec,
    }
    variables.validate(ds, required_specs)

    # Both statistics are aggregated over the variants dimension.
    call_rate_ds = call_rate(
        ds, dim="variants", call_genotype_mask=call_genotype_mask
    )
    genotype_count_ds = count_genotypes(
        ds,
        dim="variants",
        call_genotype=call_genotype,
        call_genotype_mask=call_genotype_mask,
        merge=False,
    )
    new_ds = xr.merge([call_rate_ds, genotype_count_ds])
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
Exemple #15
0
def call_allele_frequencies(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute the allele frequencies within each individual genotype call.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_allele_frequency_spec`
    of allele frequencies with shape (variants, samples, alleles) and values
    corresponding to the frequency of non-missing occurrences of each allele.

    Examples
    --------
    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0
    >>> sg.call_allele_frequencies(ds)["call_allele_frequency"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.5, 0.5],
            [0.5, 0.5]],
    <BLANKLINE>
           [[0.5, 0.5],
            [0. , 1. ]],
    <BLANKLINE>
           [[0.5, 0.5],
            [0.5, 0.5]],
    <BLANKLINE>
           [[1. , 0. ],
            [1. , 0. ]]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_allele_count,
        call_allele_count,
        count_call_alleles,
    )
    variables.validate(
        ds, {call_allele_count: variables.call_allele_count_spec}
    )

    allele_counts = ds[call_allele_count]
    alleles_per_call = allele_counts.sum(dim="alleles")
    # Calls with no observed alleles get a NaN denominator, so the result is
    # NaN rather than a division by zero.
    denominator = xr.where(alleles_per_call > 0, alleles_per_call, np.nan)  # type: ignore[no-untyped-call]
    frequencies = allele_counts / denominator

    new_ds = create_dataset({variables.call_allele_frequency: frequencies})
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #16
0
def count_cohort_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort allele counts from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_count_spec`
    of allele counts with shape (variants, cohorts, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.count_cohort_alleles(ds)["cohort_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[3, 1],
            [2, 2]],
    <BLANKLINE>
           [[2, 2],
            [3, 1]],
    <BLANKLINE>
           [[2, 2],
            [2, 2]],
    <BLANKLINE>
           [[1, 3],
            [1, 3]],
    <BLANKLINE>
           [[3, 1],
            [1, 3]]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    n_variants = ds.dims["variants"]
    n_alleles = ds.dims["alleles"]

    AC, SC = da.asarray(ds[call_allele_count]), da.asarray(ds[sample_cohort])
    n_cohorts = SC.max().compute() + 1  # 0-based indexing
    # Placeholder array; only its length (the number of cohorts) is set here.
    C = da.empty(n_cohorts, dtype=np.uint8)

    # Output blocks keep the variant chunking of the array being mapped. Using
    # AC's own chunks avoids requiring a ``call_genotype`` variable when only
    # allele counts are supplied, and stays correct if the two are chunked
    # differently along variants.
    shape = (AC.chunks[0], n_cohorts, n_alleles)

    AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32)
    assert_array_shape(
        AC, n_variants, n_cohorts * AC.numblocks[1], n_alleles * AC.numblocks[2]
    )

    # Stack the blocks and sum across them
    # (which will only work because each chunk is guaranteed to have same size)
    AC = da.stack([AC.blocks[:, i] for i in range(AC.numblocks[1])]).sum(axis=0)
    assert_array_shape(AC, n_variants, n_cohorts, n_alleles)

    new_ds = create_dataset(
        {variables.cohort_allele_count: (("variants", "cohorts", "alleles"), AC)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #17
0
def test_variables__no_spec(dummy_ds: xr.Dataset) -> None:
    """Validating by name fails for an unregistered data variable only."""
    expected_message = "No array spec registered for foo"
    with pytest.raises(ValueError, match=expected_message):
        variables.validate(dummy_ds, "foo")

    # no spec needed for coordinates or indexes
    variables.validate(dummy_ds, "bar")
Exemple #18
0
def ld_matrix(
    ds: Dataset,
    *,
    dosage: Hashable = variables.dosage,
    threshold: Optional[float] = None,
    variant_score: Optional[Hashable] = None,
) -> DataFrame:
    """Compute a sparse linkage disequilibrium (LD) matrix.

    This method computes the Rogers Huff R2 value for each pair of variants in
    a window, and returns those that exceed the provided threshold, as a sparse
    matrix dataframe.

    The work is split by chunk of the dosage array along its first axis: one
    delayed ``_ld_matrix`` call is created per chunk and the per-chunk
    dataframes are concatenated.

    Parameters
    ----------
    ds
        Dataset containing genotype dosages. Must already be windowed with :func:`window`.
    dosage
        Name of genetic dosage variable.
        Defined by :data:`sgkit.variables.dosage_spec`.
    threshold
        R2 threshold below which no variant pairs will be returned. This should almost
        always be something at least slightly above 0 to avoid the large density very
        near zero LD present in most datasets.
    variant_score
        Optional name of variable to use to prioritize variant selection
        (e.g. minor allele frequency). Defaults to None.
        Defined by :data:`sgkit.variables.variant_score_spec`.

    Returns
    -------
    Upper triangle (including diagonal) of LD matrix as COO in dataframe.  Fields:

    - ``i``: Row (variant) index 1
    - ``j``: Row (variant) index 2
    - ``value``: R2 value
    - ``cmp``: If ``variant_score`` is provided, this is 1, 0, or -1 indicating whether or not ``i > j`` (1), ``i < j`` (-1), or ``i == j`` (0)

    Raises
    ------
    ValueError
        If the dataset is not windowed.
    """

    if not has_windows(ds):
        raise ValueError(
            "Dataset must be windowed for ld_matrix. See the sgkit.window function."
        )

    variables.validate(ds, {dosage: variables.dosage_spec})

    x = da.asarray(ds[dosage])

    # A missing threshold is passed on as NaN.
    # NOTE(review): because of `or`, an explicit threshold of 0.0 is also
    # turned into NaN -- confirm that is intended.
    threshold = threshold or np.nan

    if variant_score is not None:
        variables.validate(ds, {variant_score: variables.variant_score_spec})
        scores = da.asarray(ds[variant_score])
    else:
        scores = None

    # Find windows in each chunk
    window_starts = ds.window_start.values
    window_stops = ds.window_stop.values
    window_lengths = window_stops - window_starts

    # Chunk sizes along the first axis of the dosage array.
    chunks = x.chunks[0]
    # NOTE(review): presumably the absolute start offset of each chunk --
    # confirm against _sizes_to_start_offsets.
    chunk_starts = _sizes_to_start_offsets(chunks)
    rel_window_starts, windows_per_chunk = _get_chunked_windows(
        chunks, window_starts, window_stops)
    rel_window_stops = rel_window_starts + window_lengths
    # Indexed below at chunk_index and chunk_index + 1 to delimit the windows
    # that belong to a chunk.
    chunk_offset_indexes = _sizes_to_start_offsets(windows_per_chunk)

    def to_ld_df(x: ArrayLike, chunk_index: int) -> DataFrame:
        """Build the delayed LD dataframe for the windows of one chunk."""
        chunk_offset_index_start = chunk_offset_indexes[chunk_index]
        chunk_offset_index_stop = chunk_offset_indexes[chunk_index + 1]
        chunk_window_starts = rel_window_starts[
            chunk_offset_index_start:chunk_offset_index_stop]
        chunk_window_stops = rel_window_stops[
            chunk_offset_index_start:chunk_offset_index_stop]
        # A chunk without any windows contributes an empty slice.
        max_stop = np.max(
            chunk_window_stops) if len(chunk_window_stops) > 0 else 0
        abs_chunk_start = chunk_starts[chunk_index]
        abs_chunk_end = abs_chunk_start + max_stop  # this may extend into later chunks
        block_x = x[abs_chunk_start:abs_chunk_end]
        block_scores = (scores[abs_chunk_start:abs_chunk_end]
                        if scores is not None else None)

        # Look at the next window (not in this chunk) to find out where to stop processing
        # windows in this chunk (see _ld_matrix_jit)
        if len(window_starts) == chunk_offset_index_stop:
            # if there are no more windows, then need to process the all windows in this chunk entirely
            chunk_max_window_start = max_stop
        else:
            # otherwise only process up the start of the next window
            chunk_max_window_start = (window_starts[chunk_offset_index_stop] -
                                      chunk_starts[chunk_index])

        index_dtype = np.int32
        value_dtype = np.float32

        # Defer the computation; nothing is evaluated until the dataframe is.
        f = dask.delayed(_ld_matrix)(
            block_x,
            chunk_window_starts,
            chunk_window_stops,
            abs_chunk_start,
            chunk_max_window_start,
            index_dtype,
            value_dtype,
            threshold=threshold,
            scores=block_scores,
        )
        # Column names and dtypes of the resulting dataframe; ``cmp`` is only
        # present when scores were supplied.
        meta = [("i", index_dtype), ("j", index_dtype), ("value", value_dtype)]
        if scores is not None:
            meta.append(("cmp", np.int8))
        return dd.from_delayed([f], meta=meta)

    # One dataframe per chunk, concatenated in chunk order.
    return dd.concat([
        to_ld_df(x, chunk_index)
        for chunk_index in range(len(windows_per_chunk))
    ])
Exemple #19
0
def identity_by_state(
    ds: Dataset,
    *,
    call_allele_frequency: Hashable = variables.call_allele_frequency,
    merge: bool = True,
) -> Dataset:
    """Estimate pairwise identity by state (IBS) probabilities for samples.

    For two individuals, the IBS probability is the chance that one allele
    drawn at random from the first individual is identical in state to one
    allele drawn at random from the second individual, at a single randomly
    chosen locus.

    Parameters
    ----------
    ds
        Dataset containing call genotype alleles.
    call_allele_frequency
        Name of the input variable holding call_allele_frequency, as
        defined by :data:`sgkit.variables.call_allele_frequency_spec`.
        When ``ds`` does not contain this variable it is computed with
        :func:`call_allele_frequencies`.
    merge
        If True (the default), the computed output variables are merged
        with the input dataset and the combined dataset is returned;
        otherwise only the computed output variables are returned.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_identity_by_state_spec`,
    a matrix of pairwise IBS probabilities among all samples whose
    dimensions are named ``samples_0`` and ``samples_1``.

    Raises
    ------
    NotImplementedError
        If the call_allele_frequency variable has more than one chunk along
        the samples dimension.

    Warnings
    --------
    Datasets that are chunked along the samples dimension are not
    currently supported by this method.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=2, n_sample=3, seed=2)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2
    variants
    0         0/0  1/1  1/0
    1         1/1  1/1  1/0
    >>> sg.identity_by_state(ds)["stat_identity_by_state"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1. , 0.5, 0.5],
           [0.5, 1. , 0.5],
           [0.5, 0.5, 0.5]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_allele_frequency,
        call_allele_frequency,
        call_allele_frequencies,
    )
    variables.validate(
        ds, {call_allele_frequency: variables.call_allele_frequency_spec}
    )
    freq = da.asarray(ds[call_allele_frequency])
    n_sample_chunks = len(freq.chunks[1])
    if n_sample_chunks > 1:
        raise NotImplementedError(
            "identity_by_state does not support chunking in the samples dimension"
        )

    # NaN frequencies are zeroed so they add nothing to the matching sum.
    freq_filled = da.where(da.isnan(freq), 0.0, freq)
    # Sum over the first and last axes of the per-pair frequency products.
    matching = da.einsum("ixj,iyj->xy", freq_filled, freq_filled)

    # NaN-ignoring total frequency along the last axis, then its pairwise
    # product summed over the first axis, used as the normaliser.
    called_weight = da.nansum(freq, axis=-1)
    pair_weight = da.einsum("ix,iy->xy", called_weight, called_weight)

    # A zero normaliser becomes NaN so that the result is NaN rather than
    # the outcome of a division by zero.
    ibs = matching / da.where(pair_weight == 0, np.nan, pair_weight)

    new_ds = create_dataset(
        {variables.stat_identity_by_state: (("samples_0", "samples_1"), ibs)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #20
0
def count_call_alleles(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Count the alleles carried by each sample at each variant.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Name of the input variable holding call_genotype, as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        This variable must already exist in ``ds``.
    merge
        If True (the default), the computed output variables are merged
        with the input dataset and the combined dataset is returned;
        otherwise only the computed output variables are returned.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_allele_count_spec`
    of allele counts with shape (variants, samples, alleles), where each
    value is the number of non-missing occurrences of the given allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_call_alleles(ds)["call_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[1, 1],
            [0, 2]],
    <BLANKLINE>
           [[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[2, 0],
            [2, 0]]], dtype=uint8)
    """
    variables.validate(ds, {call_genotype: variables.call_genotype_spec})
    allele_total = ds.dims["alleles"]
    genotypes = da.asarray(ds[call_genotype])

    # Uninitialised uint8 array of length `alleles` handed to count_alleles
    # next to the genotypes; presumably only its length and dtype matter to
    # size the output -- confirm against count_alleles.
    allele_axis_stub = da.empty(allele_total, dtype=np.uint8)

    # Variant and sample chunking is kept; the third axis (index 2) is
    # dropped and re-created with a single chunk of length `allele_total`.
    out_chunks = (genotypes.chunks[0], genotypes.chunks[1], allele_total)
    counts = da.map_blocks(
        count_alleles,
        genotypes,
        allele_axis_stub,
        chunks=out_chunks,
        drop_axis=2,
        new_axis=2,
    )

    new_ds = create_dataset(
        {variables.call_allele_count: (("variants", "samples", "alleles"), counts)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #21
0
def test_variables__validate_by_dummy_spec(dummy_ds: xr.Dataset) -> None:
    """Validating the dummy dataset against a bare spec must not raise."""
    foo_spec = ArrayLikeSpec("foo", "foo doc", kind="i", ndim=1)
    variables.validate(dummy_ds, foo_spec)
Exemple #22
0
def test_variables__no_present_in_ds(dummy_ds: xr.Dataset) -> None:
    """Validating under the name ``foobarbaz`` must raise a ValueError."""
    baz_spec = ArrayLikeSpec("baz", "baz doc", kind="i", ndim=1)
    name_to_spec = {"foobarbaz": baz_spec}
    with pytest.raises(ValueError, match="foobarbaz not present in"):
        variables.validate(dummy_ds, name_to_spec)
Exemple #23
0
def test_variables__alternative_names(dummy_ds: xr.Dataset) -> None:
    """A spec named ``baz`` may be applied to ``foo`` and ``bar`` without raising."""
    baz_spec = ArrayLikeSpec("baz", "baz doc", kind="i", ndim=1)
    name_to_spec = {"foo": baz_spec, "bar": baz_spec}
    variables.validate(dummy_ds, name_to_spec)
Exemple #24
0
def test_variables__invalid_spec_fails(dummy_ds: xr.Dataset) -> None:
    """Validating ``foo`` against a spec with ``ndim=2`` must raise a ValueError."""
    two_dim_spec = ArrayLikeSpec("foo", "foo doc", kind="i", ndim=2)
    expected_message = "foo does not match the spec"
    with pytest.raises(ValueError, match=expected_message):
        variables.validate(dummy_ds, two_dim_spec)
Exemple #25
0
def hardy_weinberg_test(
    ds: Dataset,
    *,
    genotype_counts: Optional[Hashable] = None,
    ploidy: Optional[int] = None,
    alleles: Optional[int] = None,
    merge: bool = True,
) -> Dataset:
    """Run the exact Hardy-Weinberg equilibrium test of Wigginton et al. 2005 [1].

    Parameters
    ----------
    ds
        Dataset containing genotype calls or precomputed genotype counts.
    genotype_counts
        Name of a variable holding precomputed genotype counts, by default
        None. When omitted, the counts are computed from the genotype calls.
        When given, it must correspond to an (`N`, 3) array where `N` is the
        number of variants and the 3 columns hold heterozygous, homozygous
        reference, and homozygous alternate counts (in that order) across
        all samples for a variant.
    ploidy
        Genotype ploidy; defaults to the ``ploidy`` dimension of ``ds``.
        Must be passed explicitly when ``ds`` has no `ploidy` dimension.
        Only diploid datasets are currently supported, i.e. ``ploidy``
        must equal 2.
    alleles
        Genotype allele count; defaults to the ``alleles`` dimension of
        ``ds``. Must be passed explicitly when ``ds`` has no `alleles`
        dimension. Only biallelic datasets are currently supported, i.e.
        ``alleles`` must equal 2.
    merge
        If True (the default), the computed output variables are merged
        with the input dataset and the combined dataset is returned;
        otherwise only the computed output variables are returned.
        See :ref:`dataset_merge` for more details.

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.

    Returns
    -------
    Dataset containing (N = num variants):

    variant_hwe_p_value : [array-like, shape: (N,)]
        P values from HWE test for each variant as float in [0, 1].

    References
    ----------
    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
        Human Genetics 76 (5): 887–93.

    Raises
    ------
    ValueError
        If ``ploidy`` is not given and ``ds`` has no ``ploidy`` dimension.
    ValueError
        If ``alleles`` is not given and ``ds`` has no ``alleles`` dimension.
    NotImplementedError
        If the resolved ploidy != 2
    NotImplementedError
        If the resolved number of alleles != 2
    """
    # An explicit argument wins; otherwise fall back to the dataset dimension.
    resolved_ploidy = ploidy or ds.dims.get("ploidy")
    if not resolved_ploidy:
        raise ValueError(
            "`ploidy` parameter must be set when not present as dataset dimension."
        )
    if resolved_ploidy != 2:
        raise NotImplementedError("HWE test only implemented for diploid genotypes")

    resolved_alleles = alleles or ds.dims.get("alleles")
    if not resolved_alleles:
        raise ValueError(
            "`alleles` parameter must be set when not present as dataset dimension."
        )
    if resolved_alleles != 2:
        raise NotImplementedError("HWE test only implemented for biallelic genotypes")

    if genotype_counts is None:
        # No precomputed counts: derive them from the genotype calls.
        ds = count_genotypes(ds, dim="samples")
        count_names = ["variant_n_het", "variant_n_hom_ref", "variant_n_hom_alt"]
        observed = [da.asarray(ds[name]) for name in count_names]
    else:
        # Precomputed counts: transpose so that iterating yields one array
        # per count column.
        variables.validate(ds, {genotype_counts: variables.genotype_counts_spec})
        observed = list(da.asarray(ds[genotype_counts]).T)

    p_values = da.map_blocks(hardy_weinberg_p_value_vec_jit, *observed)
    new_ds = create_dataset({variables.variant_hwe_p_value: ("variants", p_values)})
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #26
0
def test_variables_in_multi_index(dummy_ds: xr.Dataset) -> None:
    """Validation must not raise when ``foo`` is part of a multi index."""
    # Combine "foo" and "bar" into a multi index named "ind".
    indexed_ds = dummy_ds.set_index({"ind": ("foo", "bar")})

    foo_spec = ArrayLikeSpec("foo", "foo doc", kind="i", ndim=1)
    variables.validate(indexed_ds, foo_spec)
Exemple #27
0
def Weir_Goudet_beta(
    ds: Dataset,
    *,
    stat_identity_by_state: Hashable = variables.stat_identity_by_state,
    merge: bool = True,
) -> Dataset:
    """Estimate the pairwise beta of Weir and Goudet 2017 [1] for all
    pairs of samples.

    Beta is kinship rescaled by the average kinship over all pairs of
    individuals in the dataset, so that the non-diagonal (non-self) values
    sum to zero.

    To better reflect pedigree based kinship estimates, beta may be
    corrected with the formula
    :math:`\\hat{\\beta}^c=\\frac{\\hat{\\beta}-\\hat{\\beta}_0}{1-\\hat{\\beta}_0}`
    where :math:`\\hat{\\beta}_0` is the estimated beta between samples which are
    known to be unrelated [1].

    Parameters
    ----------
    ds
        Genotype call dataset.
    stat_identity_by_state
        Name of the input variable holding stat_identity_by_state, as
        defined by :data:`sgkit.variables.stat_identity_by_state_spec`.
        When ``ds`` does not contain this variable it is computed with
        :func:`identity_by_state`.
    merge
        If True (the default), the computed output variables are merged
        with the input dataset and the combined dataset is returned;
        otherwise only the computed output variables are returned.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_Weir_Goudet_beta_spec`,
    a matrix of estimated pairwise kinship relative to the average kinship
    of all pairs of individuals in the dataset.
    The dimensions are named ``samples_0`` and ``samples_1``.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=3, n_sample=3, n_allele=10, seed=3)
    >>> # sample 2 "inherits" alleles from samples 0 and 1
    >>> ds.call_genotype.data[:, 2, 0] = ds.call_genotype.data[:, 0, 0]
    >>> ds.call_genotype.data[:, 2, 1] = ds.call_genotype.data[:, 1, 0]
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2
    variants
    0         7/1  8/6  7/8
    1         9/5  3/6  9/3
    2         8/8  8/3  8/8
    >>> # estimate beta
    >>> ds = sg.Weir_Goudet_beta(ds).compute()
    >>> ds.stat_Weir_Goudet_beta.values # doctest: +NORMALIZE_WHITESPACE
    array([[ 0.5 , -0.25,  0.25],
           [-0.25,  0.25,  0.  ],
           [ 0.25,  0.  ,  0.5 ]])
    >>> # correct beta assuming least related samples are unrelated
    >>> beta = ds.stat_Weir_Goudet_beta
    >>> beta0 = beta.min()
    >>> beta_corrected = (beta - beta0) / (1 - beta0)
    >>> beta_corrected.values # doctest: +NORMALIZE_WHITESPACE
    array([[0.6, 0. , 0.4],
           [0. , 0.4, 0.2],
           [0.4, 0.2, 0.6]])

    References
    ----------
    [1] - Bruce, S. Weir, and Jérôme Goudet 2017.
    "A Unified Characterization of Population Structure and Relatedness."
    Genetics 206 (4): 2085-2103.
    """
    ds = define_variable_if_absent(
        ds,
        variables.stat_identity_by_state,
        stat_identity_by_state,
        identity_by_state,
    )
    variables.validate(
        ds, {stat_identity_by_state: variables.stat_identity_by_state_spec}
    )
    ibs = ds[stat_identity_by_state].data

    # Average matching: mean of the non-NaN entries strictly below the
    # diagonal, i.e. each non-self pair is considered once.
    lower_values = da.tril(ibs, -1)
    lower_is_valid = da.tril(~da.isnan(ibs), -1)
    mean_matching = da.nansum(lower_values) / da.nansum(lower_is_valid)

    scaled = (ibs - mean_matching) / (1 - mean_matching)
    new_ds = create_dataset(
        {variables.stat_Weir_Goudet_beta: (("samples_0", "samples_1"), scaled)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #28
0
def pc_relate(ds: xr.Dataset,
              *,
              maf: float = 0.01,
              call_genotype: Hashable = variables.call_genotype,
              call_genotype_mask: Hashable = variables.call_genotype_mask,
              sample_pcs: Hashable = variables.sample_pcs,
              merge: bool = True) -> xr.Dataset:
    """Compute PC-Relate as described in Conomos, et al. 2016 [1].

    This method computes the kinship coefficient matrix. The kinship coefficient for
    a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
    a random allele selected from ``i`` and a random allele selected from ``j`` at
    a locus are IBD. Several of the most common family relationships and their
    corresponding kinship coefficient:

    +--------------------------------------------------+---------------------+
    | Relationship                                     | Kinship coefficient |
    +==================================================+=====================+
    | Individual-self                                  | 1/2                 |
    +--------------------------------------------------+---------------------+
    | full sister/full brother                         | 1/4                 |
    +--------------------------------------------------+---------------------+
    | mother/father/daughter/son                       | 1/4                 |
    +--------------------------------------------------+---------------------+
    | grandmother/grandfather/granddaughter/grandson   | 1/8                 |
    +--------------------------------------------------+---------------------+
    | aunt/uncle/niece/nephew                          | 1/8                 |
    +--------------------------------------------------+---------------------+
    | first cousin                                     | 1/16                |
    +--------------------------------------------------+---------------------+
    | half-sister/half-brother                         | 1/8                 |
    +--------------------------------------------------+---------------------+

    Parameters
    ----------
    ds
        Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC)

        - genotype calls: (SxVxD)
        - genotype calls mask: (SxVxD)
        - sample PCs: (PCxS)
    maf
        individual minor allele frequency filter. If an individual's estimated
        individual-specific minor allele frequency at a SNP is less than this value,
        that SNP will be excluded from the analysis for that individual.
        The default value is 0.01. Must be between (0.0, 1.0).
    call_genotype
        Input variable name holding call_genotype.
        Defined by :data:`sgkit.variables.call_genotype_spec`.
    call_genotype_mask
        Input variable name holding call_genotype_mask.
        Defined by :data:`sgkit.variables.call_genotype_mask_spec`
    sample_pcs
        Input variable name holding sample_pcs.
        Defined by :data:`sgkit.variables.sample_pcs_spec`
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.
    This version is compatible with the R implementation of PC Relate
    method from the GENESIS package version 2.18.0.

    Returns
    -------
    Dataset containing (S = num samples):

    :data:`sgkit.variables.pc_relate_phi_spec`: (S,S) ArrayLike
    pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].

    References
    ----------
    [1] - Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
    "Model-Free Estimation of Recent Genetic Relatedness."
    American Journal of Human Genetics 98 (1): 127–48.

    Raises
    ------
    ValueError
        If ploidy of provided dataset != 2
    ValueError
        If maximum number of alleles in provided dataset != 2
    ValueError
        Input dataset is missing any of the required variables
    ValueError
        If maf is not in (0.0, 1.0)
    """
    # Argument and dimension checks. The ploidy/alleles checks only apply
    # when the corresponding dimension exists in the dataset.
    if maf <= 0.0 or maf >= 1.0:
        raise ValueError("MAF must be between (0.0, 1.0)")
    if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
        raise ValueError("PC Relate only works for diploid genotypes")
    if "alleles" in ds.dims and ds.dims["alleles"] != 2:
        raise ValueError("PC Relate only works for biallelic genotypes")
    variables.validate(
        ds,
        {
            call_genotype: variables.call_genotype_spec,
            call_genotype_mask: variables.call_genotype_mask_spec,
            sample_pcs: variables.sample_pcs_spec,
        },
    )

    # NOTE(review): the helpers below are defined elsewhere; judging by their
    # names they reduce the ploidy axis and replace masked calls with the
    # per-variant mean -- confirm against their definitions.
    call_g, call_g_mask = _collapse_ploidy(ds, call_genotype,
                                           call_genotype_mask)
    imputed_call_g = _impute_genotype_call_with_variant_mean(
        call_g, call_g_mask)

    # 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length _s_ vector of 1s, and β = (β1,...,βD)^T
    # is a length D vector of regression coefficients for each of the PCs
    pcs = ds[sample_pcs]
    # Prepend a row of ones so the regression includes the intercept term β0.
    pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs],
                          axis=0)
    # Note: dask qr decomp requires no chunking in one dimension, and because number of
    # components should be smaller than number of samples in most cases, we disable
    # chunking on components
    pcsi = pcsi.T.rechunk((None, -1))

    q, r = da.linalg.qr(pcsi)
    # mu, eq: 3
    # inv(2 * r) equals inv(r) / 2, so `mu` is half of the least-squares fit
    # of the imputed genotype values on the PCs (hence the name `half_beta`).
    half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
    mu = pcsi.dot(half_beta).T
    # phi, eq: 4
    # Exclude entries whose fitted value lies outside (maf, 1 - maf), as well
    # as entries flagged by the genotype mask.
    mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
    mu_mask = da.ma.masked_array(mu, mask=mask)
    variance = mu_mask * (1.0 - mu_mask)
    # Masked entries are filled with 0 so they add nothing to the gramians.
    variance = da.ma.filled(variance, fill_value=0.0)
    stddev = da.sqrt(variance)
    centered_af = call_g / 2 - mu_mask
    centered_af = da.ma.filled(centered_af, fill_value=0.0)
    # NOTE: gramian could be a performance bottleneck, and we could explore
    #       performance improvements like (or maybe sth else):
    #       * calculating only the pairs we are interested in
    #       * using an optimized einsum.
    # Sanity checks on internal shapes (note: asserts are skipped under -O).
    assert centered_af.shape == call_g.shape
    assert stddev.shape == call_g.shape
    phi = gramian(centered_af) / gramian(stddev)
    # NOTE: phi is of shape (S x S), S = num samples
    assert phi.shape == (call_g.shape[1], ) * 2
    new_ds = create_dataset(
        {variables.pc_relate_phi: (("sample_x", "sample_y"), phi)})
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #29
0
def filter_partial_calls(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Turn partially missing genotype calls into fully missing calls.

    Parameters
    ----------
    ds
        Genotype call dataset such as from
        :func:`sgkit.create_genotype_call_dataset`.
    call_genotype
        Name of the input variable holding call_genotype, as defined by
        :data:`sgkit.variables.call_genotype_spec`
    merge
        If True (the default), the computed output variables are merged
        with the input dataset and the combined dataset is returned;
        otherwise only the computed output variables are returned.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    Dataset containing `call_genotype_complete` and
    `call_genotype_complete_mask`, in which every partial genotype call
    has been replaced with a completely missing genotype call.

    Examples
    --------
    >>> import sgkit as sg
    >>> from sgkit.testing import simulate_genotype_call_dataset
    >>> ds = simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1, missing_pct=0.3)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         ./0  ./.
    1         ./0  1/1
    2         0/1  ./0
    3         ./0  0/0
    >>> ds2 = filter_partial_calls(ds)
    >>> ds2['call_genotype'] = ds2['call_genotype_complete']
    >>> ds2['call_genotype_mask'] = ds2['call_genotype_complete_mask']
    >>> sg.display_genotypes(ds2) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         ./.  ./.
    1         ./.  1/1
    2         0/1  ./.
    3         ./.  0/0


    Notes
    -----
    The initial `call_genotype` and `call_genotype_mask` variables remain in
    the returned dataset. Because many sgkit functions default to
    `call_genotype` and/or `call_genotype_mask`, these variables must be
    overwritten (see the example), or the new variables passed explicitly as
    function arguments, in order to remove partial calls from further
    analysis.
    """
    variables.validate(ds, {call_genotype: variables.call_genotype_spec})
    genotypes = ds[call_genotype]
    is_mixed = genotypes.attrs.get("mixed_ploidy", False)
    if not is_mixed:
        # Any negative allele along the last axis marks the whole call.
        to_blank = (genotypes < 0).any(axis=-1)
    else:
        # Only -1 marks a call; alleles below -1 are excluded by the
        # `>= -1` term and therefore keep their original value.
        to_blank = (genotypes == -1).any(axis=-1) & (genotypes >= -1)
    completed = xr.where(to_blank, -1, genotypes)  # type: ignore[no-untyped-call]
    new_ds = create_dataset(
        {
            variables.call_genotype_complete: completed,
            variables.call_genotype_complete_mask: completed < 0,
        }
    )
    # Carry the mixed-ploidy flag over to the completed genotype variable.
    new_ds[variables.call_genotype_complete].attrs["mixed_ploidy"] = is_mixed
    return conditional_merge_datasets(ds, new_ds, merge)
Exemple #30
0
def cohort_allele_frequencies(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute per-cohort allele frequencies.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    cohort_allele_count
        Name of the input variable holding cohort_allele_count, as defined
        by :data:`sgkit.variables.cohort_allele_count_spec`.
        When ``ds`` does not contain this variable it is computed with
        :func:`count_cohort_alleles`.
    merge
        If True (the default), the computed output variables are merged
        with the input dataset and the combined dataset is returned;
        otherwise only the computed output variables are returned.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_frequency_spec`
    of allele frequencies with shape (variants, cohorts, alleles), where
    each value is the frequency of non-missing occurrences of the given
    allele.

    Examples
    --------
    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.cohort_allele_frequencies(ds)["cohort_allele_frequency"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.75, 0.25],
            [0.5 , 0.5 ]],
    <BLANKLINE>
            [[0.5 , 0.5 ],
            [0.75, 0.25]],
    <BLANKLINE>
            [[0.5 , 0.5 ],
            [0.5 , 0.5 ]],
    <BLANKLINE>
            [[0.25, 0.75],
            [0.25, 0.75]],
    <BLANKLINE>
            [[0.75, 0.25],
            [0.25, 0.75]]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.cohort_allele_count,
        cohort_allele_count,
        count_cohort_alleles,
    )
    variables.validate(ds, {cohort_allele_count: variables.cohort_allele_count_spec})
    counts = ds[cohort_allele_count]
    # Normalise each count by the total over the "alleles" dimension.
    totals = counts.sum(dim="alleles")
    frequencies = counts / totals
    new_ds = create_dataset({variables.cohort_allele_frequency: frequencies})
    return conditional_merge_datasets(ds, new_ds, merge)