Beispiel #1
0
def phase_parents_by_transmission(g, window_size):
    """Phase parent genotypes from a trio or cross, given progeny genotypes
    already phased by Mendelian transmission.

    Parameters
    ----------
    g : GenotypeArray
        Genotype array, with parents as first two columns and progeny as
        remaining columns, where progeny genotypes are already phased.
    window_size : int
        Number of previous heterozygous sites to include when phasing each
        parent. A number somewhere between 10 and 100 may be appropriate,
        depending on levels of heterozygosity and quality of data.

    Returns
    -------
    g : GenotypeArray
        Genotype array with parents phased where possible.

    """

    # setup
    check_type(g, GenotypeArray)
    check_dtype(g.values, 'i1')
    check_ploidy(g.ploidy, 2)
    if g.is_phased is None:
        raise ValueError('genotype array must first have progeny phased by transmission')
    check_min_samples(g.n_samples, 3)

    # run the phasing
    is_phased = g.is_phased.view('u1')
    phase_parents_by_transmission_int8(g.values, is_phased, window_size)

    # outputs
    return g
Beispiel #2
0
def phase_by_transmission(g, window_size, copy=True):
    """Phase genotypes in a trio or cross where possible using Mendelian
    transmission.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, 2)
        Genotype array, with parents as first two columns and progeny as
        remaining columns.
    window_size : int
        Number of previous heterozygous sites to include when phasing each
        parent. A number somewhere between 10 and 100 may be appropriate,
        depending on levels of heterozygosity and quality of data.
    copy : bool, optional
        If False, attempt to phase genotypes in-place. Note that this is
        only possible if the input array has int8 dtype, otherwise a copy is
        always made regardless of this parameter.

    Returns
    -------
    g : GenotypeArray
        Genotype array with progeny phased where possible.

    """

    # setup
    g = np.asarray(g, dtype='i1')
    g = GenotypeArray(g, copy=copy)
    g._values = memoryview_safe(g.values)
    check_ploidy(g.ploidy, 2)
    check_min_samples(g.n_samples, 3)

    # phase the progeny
    is_phased = _opt_phase_progeny_by_transmission(g.values)
    g.is_phased = np.asarray(is_phased).view(bool)

    # phase the parents
    _opt_phase_parents_by_transmission(g.values, is_phased, window_size)

    return g
Beispiel #3
0
def phase_progeny_by_transmission(g):
    """Phase progeny genotypes from a trio or cross using Mendelian
    transmission.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, 2)
        Genotype array, with parents as first two columns and progeny as
        remaining columns.

    Returns
    -------
    g : ndarray, int8, shape (n_variants, n_samples, 2)
        Genotype array with progeny phased where possible.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([
    ...     [[0, 0], [0, 0], [0, 0]],
    ...     [[1, 1], [1, 1], [1, 1]],
    ...     [[0, 0], [1, 1], [0, 1]],
    ...     [[1, 1], [0, 0], [0, 1]],
    ...     [[0, 0], [0, 1], [0, 0]],
    ...     [[0, 0], [0, 1], [0, 1]],
    ...     [[0, 1], [0, 0], [0, 1]],
    ...     [[0, 1], [0, 1], [0, 1]],
    ...     [[0, 1], [1, 2], [0, 1]],
    ...     [[1, 2], [0, 1], [1, 2]],
    ...     [[0, 1], [2, 3], [0, 2]],
    ...     [[2, 3], [0, 1], [1, 3]],
    ...     [[0, 0], [0, 0], [-1, -1]],
    ...     [[0, 0], [0, 0], [1, 1]],
    ... ], dtype='i1')
    >>> g = allel.phase_progeny_by_transmission(g)
    >>> print(g.to_str(row_threshold=None))
    0/0 0/0 0|0
    1/1 1/1 1|1
    0/0 1/1 0|1
    1/1 0/0 1|0
    0/0 0/1 0|0
    0/0 0/1 0|1
    0/1 0/0 1|0
    0/1 0/1 0/1
    0/1 1/2 0|1
    1/2 0/1 2|1
    0/1 2/3 0|2
    2/3 0/1 3|1
    0/0 0/0 ./.
    0/0 0/0 1/1
    >>> g.is_phased
    array([[False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False, False],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False, False],
           [False, False, False]])

    """

    # setup
    g = GenotypeArray(g, dtype='i1', copy=True)
    check_ploidy(g.ploidy, 2)
    check_min_samples(g.n_samples, 3)

    # run the phasing
    # N.B., a copy has already been made, so no need to make memoryview safe
    is_phased = _opt_phase_progeny_by_transmission(g.values)
    g.is_phased = np.asarray(is_phased).view(bool)

    # outputs
    return g
Beispiel #4
0
def mendel_errors(parent_genotypes, progeny_genotypes):
    """Locate genotype calls not consistent with Mendelian transmission of
    alleles.

    Parameters
    ----------
    parent_genotypes : array_like, int, shape (n_variants, 2, 2)
        Genotype calls for the two parents.
    progeny_genotypes : array_like, int, shape (n_variants, n_progeny, 2)
        Genotype calls for the progeny.

    Returns
    -------
    me : ndarray, int, shape (n_variants, n_progeny)
        Count of Mendel errors for each progeny genotype call.

    Examples
    --------
    The following are all consistent with Mendelian transmission. Note that a
    value of 0 is returned for missing calls::

        >>> import allel
        >>> import numpy as np
        >>> genotypes = np.array([
        ...     # aa x aa -> aa
        ...     [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [1, 1], [1, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[2, 2], [2, 2], [2, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x ab -> aa or ab
        ...     [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [0, 2], [0, 0], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 1], [1, 1], [0, 1], [-1, -1], [-1, -1]],
        ...     # aa x bb -> ab
        ...     [[0, 0], [1, 1], [0, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [2, 2], [0, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [2, 2], [1, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x bc -> ab or ac
        ...     [[0, 0], [1, 2], [0, 1], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 2], [0, 1], [1, 2], [-1, -1], [-1, -1]],
        ...     # ab x ab -> aa or ab or bb
        ...     [[0, 1], [0, 1], [0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[1, 2], [1, 2], [1, 1], [1, 2], [2, 2], [-1, -1]],
        ...     [[0, 2], [0, 2], [0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     # ab x bc -> ab or ac or bb or bc
        ...     [[0, 1], [1, 2], [0, 1], [0, 2], [1, 1], [1, 2]],
        ...     [[0, 1], [0, 2], [0, 0], [0, 1], [0, 1], [1, 2]],
        ...     # ab x cd -> ac or ad or bc or bd
        ...     [[0, 1], [2, 3], [0, 2], [0, 3], [1, 2], [1, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0]])

    The following are cases of 'non-parental' inheritance where one or two
    alleles are found in the progeny that are not present in either parent.
    Note that the number of errors may be 1 or 2 depending on the number of
    non-parental alleles::

        >>> genotypes = np.array([
        ...     # aa x aa -> ab or ac or bb or cc
        ...     [[0, 0], [0, 0], [0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[1, 1], [1, 1], [0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[2, 2], [2, 2], [0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # aa x ab -> ac or bc or cc
        ...     [[0, 0], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [0, 1], [1, 2], [0, 2], [2, 2], [2, 2]],
        ...     # aa x bb -> ac or bc or cc
        ...     [[0, 0], [1, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [2, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [2, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x ab -> ac or bc or cc
        ...     [[0, 1], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 2], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 2], [1, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x bc -> ad or bd or cd or dd
        ...     [[0, 1], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 1], [0, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 2], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     # ab x cd -> ae or be or ce or de
        ...     [[0, 1], [2, 3], [0, 4], [1, 4], [2, 4], [3, 4]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 1]])

    The following are cases of 'hemi-parental' inheritance, where progeny
    appear to have inherited two copies of an allele found only once in one of
    the parents::

        >>> genotypes = np.array([
        ...     # aa x ab -> bb
        ...     [[0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     [[1, 1], [0, 1], [0, 0], [-1, -1]],
        ...     # ab x bc -> aa or cc
        ...     [[0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # ab x cd -> aa or bb or cc or dd
        ...     [[0, 1], [2, 3], [0, 0], [1, 1]],
        ...     [[0, 1], [2, 3], [2, 2], [3, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 0],
               [1, 0],
               [1, 0],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    The following are cases of 'uni-parental' inheritance, where progeny
    appear to have inherited both alleles from a single parent::

        >>> genotypes = np.array([
        ...     # aa x bb -> aa or bb
        ...     [[0, 0], [1, 1], [0, 0], [1, 1]],
        ...     [[0, 0], [2, 2], [0, 0], [2, 2]],
        ...     [[1, 1], [2, 2], [1, 1], [2, 2]],
        ...     # aa x bc -> aa or bc
        ...     [[0, 0], [1, 2], [0, 0], [1, 2]],
        ...     [[1, 1], [0, 2], [1, 1], [0, 2]],
        ...     # ab x cd -> ab or cd
        ...     [[0, 1], [2, 3], [0, 1], [2, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    """

    # setup
    parent_genotypes = GenotypeArray(parent_genotypes)
    progeny_genotypes = GenotypeArray(progeny_genotypes)
    check_ploidy(parent_genotypes.ploidy, 2)
    check_ploidy(progeny_genotypes.ploidy, 2)

    # transform into per-call allele counts
    max_allele = max(parent_genotypes.max(), progeny_genotypes.max())
    parent_gc = parent_genotypes.to_allele_counts(max_allele=max_allele, dtype='i1')
    progeny_gc = progeny_genotypes.to_allele_counts(max_allele=max_allele, dtype='i1')

    # detect nonparental and hemiparental inheritance by comparing allele
    # counts between parents and progeny
    max_progeny_gc = parent_gc.clip(max=1).sum(axis=1)
    max_progeny_gc = max_progeny_gc[:, np.newaxis, :]
    me = (progeny_gc - max_progeny_gc).clip(min=0).sum(axis=2)

    # detect uniparental inheritance by finding cases where no alleles are
    # shared between parents, then comparing progeny allele counts to each
    # parent
    p1_gc = parent_gc[:, 0, np.newaxis, :]
    p2_gc = parent_gc[:, 1, np.newaxis, :]
    # find variants where parents don't share any alleles
    is_shared_allele = (p1_gc > 0) & (p2_gc > 0)
    no_shared_alleles = ~np.any(is_shared_allele, axis=2)
    # find calls where progeny genotype is identical to one or the other parent
    me[no_shared_alleles &
       (np.all(progeny_gc == p1_gc, axis=2) |
        np.all(progeny_gc == p2_gc, axis=2))] = 1

    # retrofit where either or both parent has a missing call
    me[np.any(parent_genotypes.is_missing(), axis=1)] = 0

    return me