Example #1
0
    def calc(self):
        """
        Computes summary statistics for the two populations of sequences and
        stores each of them as an attribute of this object. Nothing is
        returned.

        Inputs are read from ``self.pop1_seqs``, ``self.pop2_seqs``,
        ``self.combined_seqs``, ``self.state_alphabet`` and
        ``self.ignore_uncertain``.

        Attributes set
        --------------
        average_number_of_pairwise_differences
            Hickerson 2006 "pi": value computed over the combined sequences.
        average_number_of_pairwise_differences_between
            Hickerson 2006 "pi_b": between-population value.
        average_number_of_pairwise_differences_within
            Hickerson 2006 "pi_w": sum of the two within-population values.
        average_number_of_pairwise_differences_net
            Hickerson 2006 "pi_net": between minus within.
        num_segregating_sites
            Hickerson 2006 "S", computed over the combined sequences.
        wattersons_theta
            Hickerson 2006 "theta": S divided by the harmonic sum
            1/1 + 1/2 + ... + 1/(n - 1), with n the number of combined
            sequences.
        wakeleys_psi
            Wakeley 1996 statistic.
        tajimas_d
            Tajima's D, as computed by ``_tajimas_d``.

        Raises
        ------
        ZeroDivisionError
            If any of the divisors below is zero (e.g. a population for which
            ``combinatorics.choose(len(seqs), 2)`` is zero, or a within- or
            combined-population average of zero in the Wakeley term).
        """
        # The second value returned by the helper is not used here.
        diffs_x, _, sq_diff_x = _count_differences(
            self.pop1_seqs, self.state_alphabet, self.ignore_uncertain)
        diffs_y, _, sq_diff_y = _count_differences(
            self.pop2_seqs, self.state_alphabet, self.ignore_uncertain)

        # Number of sequence pairs within each population; computed once and
        # reused for both the averages and the variances.
        pairs_x = combinatorics.choose(len(self.pop1_seqs), 2)
        pairs_y = combinatorics.choose(len(self.pop2_seqs), 2)

        d_x = diffs_x / pairs_x
        d_y = diffs_y / pairs_y
        d_xy = self._average_number_of_pairwise_differences_between_populations()

        # presumably sq_diff_* are sums of squared pairwise differences, which
        # makes these "mean of squares minus squared mean" variances -- verify
        # against _count_differences.
        s2_x = (float(sq_diff_x) / pairs_x) - (d_x ** 2)
        s2_y = (float(sq_diff_y) / pairs_y) - (d_y ** 2)
        s2_xy = self._variance_of_pairwise_differences_between_populations(
            d_xy)

        n = len(self.combined_seqs)
        n_x = float(len(self.pop1_seqs))
        n_y = float(len(self.pop2_seqs))
        a = float(n * (n - 1))
        ax = n_x * (n_x - 1)
        ay = n_y * (n_y - 1)
        k = _average_number_of_pairwise_differences(self.combined_seqs,
                                                    self.state_alphabet,
                                                    self.ignore_uncertain)

        # Hickerson 2006: pi #
        self.average_number_of_pairwise_differences = k

        # Hickerson 2006: pi_b #
        self.average_number_of_pairwise_differences_between = d_xy

        # Hickerson 2006: pi_w #
        self.average_number_of_pairwise_differences_within = d_x + d_y

        # Hickerson 2006: pi_net #
        self.average_number_of_pairwise_differences_net = d_xy - (d_x + d_y)

        # Hickerson 2006: S #
        self.num_segregating_sites = _num_segregating_sites(
            self.combined_seqs, self.state_alphabet, self.ignore_uncertain)

        # Hickerson 2006: theta #
        a1 = sum(1.0 / i for i in range(1, n))
        self.wattersons_theta = float(self.num_segregating_sites) / a1

        # Wakeley 1996 #
        self.wakeleys_psi = (1.0 / a) * (
            ax * (math.sqrt(s2_x) / d_x)
            + ay * (math.sqrt(s2_y) / d_y)
            + (2 * n_x * n_y * math.sqrt(s2_xy) / k))

        # Tajima's D #
        self.tajimas_d = _tajimas_d(
            n, self.average_number_of_pairwise_differences,
            self.num_segregating_sites)
Example #2
0
def hypergeometric_pmf(x, m, n, k):
    """
    Given a population consisting of ``m`` items of class M and ``n`` items of class N,
    this returns the probability of observing ``x`` items of class M when sampling
    ``k`` times without replacement from the entire population (i.e., {M,N})

            p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)

    For large arguments the binomial coefficients are integers too big to be
    converted to a float, even though their ratio is an ordinary probability.
    In that case the ratio is formed exactly before converting to a float,
    instead of failing with ``OverflowError``.

    Raises
    ------
    ZeroDivisionError
        If ``choose(m+n, k)`` is zero.
    """
    favorable = combinatorics.choose(m, x) * combinatorics.choose(n, k - x)
    total = combinatorics.choose(m + n, k)
    try:
        # Fast path: identical to the plain float division whenever both
        # counts fit in a float.
        return float(favorable) / total
    except OverflowError:
        # One of the counts does not fit in a float; reduce the exact
        # integer ratio first and convert only the (small) result.
        from fractions import Fraction
        return float(Fraction(favorable, total))
Example #3
0
def hypergeometric_pmf(x, m, n, k):
    """
    Given a population consisting of ``m`` items of class M and ``n`` items of class N,
    this returns the probability of observing ``x`` items of class M when sampling
    ``k`` times without replacement from the entire population (i.e., {M,N})

            p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)

    For large arguments the binomial coefficients are integers too big to be
    converted to a float, even though their ratio is an ordinary probability.
    In that case the ratio is formed exactly before converting to a float,
    instead of failing with ``OverflowError``.

    Raises
    ------
    ZeroDivisionError
        If ``choose(m+n, k)`` is zero.
    """
    numerator = combinatorics.choose(m, x) * combinatorics.choose(n, k - x)
    denominator = combinatorics.choose(m + n, k)
    try:
        # Fast path: identical to the plain float division whenever both
        # counts fit in a float.
        return float(numerator) / denominator
    except OverflowError:
        # One of the counts does not fit in a float; reduce the exact
        # integer ratio first and convert only the (small) result.
        from fractions import Fraction
        return float(Fraction(numerator, denominator))
Example #4
0
    def calc(self):
        """
        Computes summary statistics for the two populations of sequences and
        stores each of them as an attribute of this object. Nothing is
        returned.

        Inputs are read from ``self.pop1_seqs``, ``self.pop2_seqs``,
        ``self.combined_seqs``, ``self.state_alphabet`` and
        ``self.ignore_uncertain``.

        Attributes set
        --------------
        average_number_of_pairwise_differences
            Hickerson 2006 "pi": value computed over the combined sequences.
        average_number_of_pairwise_differences_between
            Hickerson 2006 "pi_b": between-population value.
        average_number_of_pairwise_differences_within
            Hickerson 2006 "pi_w": sum of the two within-population values.
        average_number_of_pairwise_differences_net
            Hickerson 2006 "pi_net": between minus within.
        num_segregating_sites
            Hickerson 2006 "S", computed over the combined sequences.
        wattersons_theta
            Hickerson 2006 "theta": S divided by the harmonic sum
            1/1 + 1/2 + ... + 1/(n - 1), with n the number of combined
            sequences.
        wakeleys_psi
            Wakeley 1996 statistic.
        tajimas_d
            Tajima's D, as computed by ``_tajimas_d``.

        Raises
        ------
        ZeroDivisionError
            If any of the divisors below is zero (e.g. a population for which
            ``combinatorics.choose(len(seqs), 2)`` is zero, or a within- or
            combined-population average of zero in the Wakeley term).
        """
        # The second value returned by the helper is not used here.
        diffs_x, _, sq_diff_x = _count_differences(
            self.pop1_seqs, self.state_alphabet, self.ignore_uncertain)
        diffs_y, _, sq_diff_y = _count_differences(
            self.pop2_seqs, self.state_alphabet, self.ignore_uncertain)

        # Number of sequence pairs within each population; computed once and
        # reused for both the averages and the variances.
        pairs_x = combinatorics.choose(len(self.pop1_seqs), 2)
        pairs_y = combinatorics.choose(len(self.pop2_seqs), 2)

        d_x = diffs_x / pairs_x
        d_y = diffs_y / pairs_y
        d_xy = self._average_number_of_pairwise_differences_between_populations()

        # presumably sq_diff_* are sums of squared pairwise differences, which
        # makes these "mean of squares minus squared mean" variances -- verify
        # against _count_differences.
        s2_x = (float(sq_diff_x) / pairs_x) - (d_x ** 2)
        s2_y = (float(sq_diff_y) / pairs_y) - (d_y ** 2)
        s2_xy = self._variance_of_pairwise_differences_between_populations(d_xy)

        n = len(self.combined_seqs)
        n_x = float(len(self.pop1_seqs))
        n_y = float(len(self.pop2_seqs))
        a = float(n * (n - 1))
        ax = n_x * (n_x - 1)
        ay = n_y * (n_y - 1)
        k = _average_number_of_pairwise_differences(
            self.combined_seqs, self.state_alphabet, self.ignore_uncertain)

        # Hickerson 2006: pi #
        self.average_number_of_pairwise_differences = k

        # Hickerson 2006: pi_b #
        self.average_number_of_pairwise_differences_between = d_xy

        # Hickerson 2006: pi_w #
        self.average_number_of_pairwise_differences_within = d_x + d_y

        # Hickerson 2006: pi_net #
        self.average_number_of_pairwise_differences_net = d_xy - (d_x + d_y)

        # Hickerson 2006: S #
        self.num_segregating_sites = _num_segregating_sites(
            self.combined_seqs,
            self.state_alphabet,
            self.ignore_uncertain)

        # Hickerson 2006: theta #
        a1 = sum(1.0 / i for i in range(1, n))
        self.wattersons_theta = float(self.num_segregating_sites) / a1

        # Wakeley 1996 #
        self.wakeleys_psi = (1.0 / a) * (
            ax * (math.sqrt(s2_x) / d_x)
            + ay * (math.sqrt(s2_y) / d_y)
            + (2 * n_x * n_y * math.sqrt(s2_xy) / k))

        # Tajima's D #
        self.tajimas_d = _tajimas_d(
            n,
            self.average_number_of_pairwise_differences,
            self.num_segregating_sites)
Example #5
0
def hypergeometric_pmf(x, m, n, k):
    """
    Given a population consisting of ``m`` items of class M and ``n`` items of class N,
    this returns the probability of observing ``x`` items of class M when sampling
    ``k`` times without replacement from the entire population (i.e., {M,N})

            p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)

    The ratio is evaluated in log space: converting the product of binomial
    coefficients directly to a float fails with 'OverflowError: long int too
    large to convert to float' for large numbers.

    An outcome for which one of the numerator coefficients is zero has
    probability 0.0. (NOTE(review): this assumes ``combinatorics.choose``
    returns 0 for an impossible selection rather than raising -- confirm.)

    Raises
    ------
    ValueError
        From ``math.log`` if ``choose(m+n, k)`` is not positive.
    """
    ways_m = combinatorics.choose(m, x)
    ways_n = combinatorics.choose(n, k - x)
    ways_total = combinatorics.choose(m + n, k)
    # log(0) is undefined, so an impossible outcome must be answered before
    # moving to log space. An invalid denominator still falls through to
    # math.log and raises as before.
    if ways_total > 0 and (ways_m == 0 or ways_n == 0):
        return 0.0
    a = math.log(ways_m)
    b = math.log(ways_n)
    c = math.log(ways_total)
    return math.exp(a + b - c)
Example #6
0
def hypergeometric_pmf(x, m, n, k):
    """
    Given a population consisting of ``m`` items of class M and ``n`` items of class N,
    this returns the probability of observing ``x`` items of class M when sampling
    ``k`` times without replacement from the entire population (i.e., {M,N})

            p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)

    The ratio is evaluated in log space: converting the product of binomial
    coefficients directly to a float fails with 'OverflowError: long int too
    large to convert to float' for large numbers.

    An outcome for which one of the numerator coefficients is zero has
    probability 0.0. (NOTE(review): this assumes ``combinatorics.choose``
    returns 0 for an impossible selection rather than raising -- confirm.)

    Raises
    ------
    ValueError
        From ``math.log`` if ``choose(m+n, k)`` is not positive.
    """
    from_m = combinatorics.choose(m, x)
    from_n = combinatorics.choose(n, k - x)
    from_all = combinatorics.choose(m + n, k)
    # log(0) is undefined, so an impossible outcome must be answered before
    # moving to log space. An invalid denominator still falls through to
    # math.log and raises as before.
    if from_all > 0 and (from_m == 0 or from_n == 0):
        return 0.0
    log_m = math.log(from_m)
    log_n = math.log(from_n)
    log_all = math.log(from_all)
    return math.exp(log_m + log_n - log_all)
Example #7
0
def expected_tmrca(n_genes, pop_size=None, n_to_coalesce=2):
    """
    Expected (mean) waiting time until ``n_to_coalesce`` genes coalesce in a
    sample of ``n_genes`` genes, optionally scaled by ``pop_size``.

    The unscaled expectation is ``1 / choose(n_genes, n_to_coalesce)``.

    Parameters
    ----------
    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in the
        population: 2 * N in a diploid population of N individuals, or N in a
        haploid population of N individuals. If |None|, the unscaled
        expectation is returned.
    n_to_coalesce : integer
        The number of genes in the sample whose coalescence is being waited
        for.

    Returns
    -------
    k : float
        The expected waiting time (in continuous time) for ``n_to_coalesce``
        genes to coalesce out of a sample of ``n_genes``, multiplied by
        ``pop_size`` when one is given.

    """
    n_groupings = combinatorics.choose(n_genes, n_to_coalesce)
    unscaled = 1.0 / n_groupings
    if pop_size is None:
        return unscaled
    return unscaled * pop_size
Example #8
0
def expected_tmrca(n_genes, pop_size=None, n_to_coalesce=2):
    """
    Mean of the Time to the Most Recent Common Ancestor of ``n_to_coalesce``
    genes in a sample of ``n_genes``: ``1 / choose(n_genes, n_to_coalesce)``,
    multiplied by ``pop_size`` if a population size is supplied.

    Parameters
    ----------
    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in the
        population: 2 * N in a diploid population of N individuals, or N in a
        haploid population of N individuals. When left as |None| no scaling
        is applied.
    n_to_coalesce : integer
        The number of genes in the sample whose coalescence is being waited
        for.

    Returns
    -------
    k : float
        The expected waiting time (in continuous time) for ``n_to_coalesce``
        genes to coalesce out of a sample of ``n_genes``, scaled by
        ``pop_size`` when it is not |None|.

    """
    mean_wait = 1.0 / combinatorics.choose(n_genes, n_to_coalesce)
    return mean_wait if pop_size is None else mean_wait * pop_size
Example #9
0
def time_to_coalescence(n_genes,
        pop_size=None,
        n_to_coalesce=2,
        rng=None):
    r"""
    A random draw from the "Kingman distribution" (continuous time version):
    the waiting time until ``n_to_coalesce`` of the ``n_genes`` lineages in a
    sample coalesce.

    The waiting time is drawn from an exponential distribution with rate
    ${n\_genes \choose n\_to\_coalesce}$ (and hence an expectation of
    $\frac{1}{{n\_genes \choose n\_to\_coalesce}}$), and is then multiplied by
    ``pop_size``.

    ``pop_size`` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals. If ``pop_size`` is 0 or
    None, no scaling is applied and time is in haploid population units; i.e.
    where 1 unit of time equals 2N generations for a diploid population of
    size N, or N generations for a haploid population of size N. Otherwise
    time is in generations.

    Parameters
    ----------
    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in the
        population: 2 * N in a diploid population of N individuals, or N in a
        haploid population of N individuals.
    n_to_coalesce : integer
        The waiting time that will be returned will be the waiting time for
        this number of genes in the sample to coalesce.
    rng : ``Random`` object
        The random number generator instance to use. If |None|, the module's
        ``GLOBAL_RNG`` is used.

    Returns
    -------
    k : float
        A randomly-generated waiting time (in continuous time) for
        ``n_to_coalesce`` genes to coalesce out of a sample of ``n_genes`` in a
        population of ``pop_size`` genes.
    """
    if rng is None:
        rng = GLOBAL_RNG
    # A missing (or zero) population size means "leave the draw unscaled".
    time_units = pop_size if pop_size else 1.0
    rate = combinatorics.choose(n_genes, n_to_coalesce)
    return rng.expovariate(rate) * time_units
Example #10
0
def _average_number_of_pairwise_differences(char_sequences, state_alphabet, ignore_uncertain=True):
    r"""
    Returns $k$ (Tajima 1983; Wakely 1996), calculated for a set of sequences:

    k = \frac{\left(\sum \sum k_{ij}\right)}{n \choose 2}

    where $k_{ij}$ is the number of pairwise differences between the
    $i$th and $j$th sequence, and $n$ is the number of DNA sequences
    sampled.

    The numerator is the first value returned by ``_count_differences`` for
    ``char_sequences``; the denominator is
    ``combinatorics.choose(len(char_sequences), 2)``.
    """
    # Only the summed differences are needed; the other two returned values
    # are discarded.
    sum_diff, _, _ = _count_differences(char_sequences, state_alphabet, ignore_uncertain)
    return sum_diff / combinatorics.choose(len(char_sequences), 2)
Example #11
0
def _average_number_of_pairwise_differences(char_sequences, state_alphabet, ignore_uncertain=True):
    r"""
    Returns $k$ (Tajima 1983; Wakely 1996), calculated for a set of sequences:

    k = \frac{\left(\sum \sum k_{ij}\right)}{n \choose 2}

    where $k_{ij}$ is the number of pairwise differences between the
    $i$th and $j$th sequence, and $n$ is the number of DNA sequences
    sampled.

    The numerator is the first value returned by ``_count_differences`` for
    ``char_sequences``; the denominator is
    ``combinatorics.choose(len(char_sequences), 2)``.
    """
    # Only the summed differences are needed; the other two returned values
    # are discarded.
    total_diff, _, _ = _count_differences(char_sequences, state_alphabet, ignore_uncertain)
    n_pairs = combinatorics.choose(len(char_sequences), 2)
    return total_diff / n_pairs
 def test_all_distinct_mapped_taxa_pairs(self):
     """
     Both ways of enumerating distinct taxon pairs on ``self.pdm`` (the
     ``_all_distinct_mapped_taxa_pairs`` collection and
     ``distinct_taxon_pair_iter()``) must yield every unordered pair of taxa
     in the tree's namespace exactly once.
     """
     num_taxa = len(self.tree.taxon_namespace)
     pair_sources = (
         iter(self.pdm._all_distinct_mapped_taxa_pairs),
         self.pdm.distinct_taxon_pair_iter(),
     )
     for pair_source in pair_sources:
         pairs_so_far = set()
         taxa_so_far = set()
         for first, second in pair_source:
             # Unordered key, so (a, b) and (b, a) count as the same pair.
             pair_key = frozenset([first, second])
             self.assertIn(first, self.pdm._mapped_taxa)
             self.assertIn(first, self.tree.taxon_namespace)
             self.assertIn(second, self.pdm._mapped_taxa)
             self.assertIn(second, self.tree.taxon_namespace)
             self.assertNotIn(pair_key, pairs_so_far)
             pairs_so_far.add(pair_key)
             taxa_so_far.update((first, second))
         # Every taxon was seen, and the number of pairs is "n choose 2".
         self.assertEqual(len(taxa_so_far), num_taxa)
         self.assertEqual(len(pairs_so_far),
                          combinatorics.choose(num_taxa, 2))
Example #13
0
def discrete_time_to_coalescence(n_genes,
                                 pop_size=None,
                                 n_to_coalesce=2,
                                 rng=None):
    """
    A random draw from the "Kingman distribution" (discrete time version): Time
    to go from ``n_genes`` genes to ``n_genes``-1 genes in a discrete-time
    Wright-Fisher population of ``pop_size`` genes; i.e. waiting time until
    ``n_to_coalesce`` of the ``n_genes`` lineages coalesce in a population of
    ``pop_size`` genes.

    The number of generations is drawn from a geometric distribution whose
    per-generation success probability is
    ``choose(n_genes, n_to_coalesce) / pop_size`` (capped at 1), so that its
    mean, ``pop_size / choose(n_genes, n_to_coalesce)``, agrees with the mean
    of the continuous-time draw scaled to generations.

    Parameters
    ----------

    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in the
        population: 2 * N in a diploid population of N individuals, or N in a
        haploid population of N individuals. Required: a discrete-generation
        waiting time is not defined without it.
    n_to_coalesce : integer
        The waiting time that will be returned will be the waiting time for
        this number of genes in the sample to coalesce.
    rng : ``Random`` object
        The random number generator instance. If |None|, the module's
        ``GLOBAL_RNG`` is used.

    Returns
    -------
    k : integer
        A randomly-generated waiting time (in discrete generations) for
        ``n_to_coalesce`` genes to coalesce out of a sample of ``n_genes`` in a
        population of ``pop_size`` genes.

    Raises
    ------
    ValueError
        If ``pop_size`` is |None|, zero or negative, or if
        ``choose(n_genes, n_to_coalesce)`` is not positive.

    """
    if not pop_size or pop_size < 0:
        raise ValueError(
            "pop_size must be a positive number of genes, got %r" % (pop_size,))
    if rng is None:
        rng = GLOBAL_RNG
    n_groupings = combinatorics.choose(n_genes, n_to_coalesce)
    if n_groupings <= 0:
        raise ValueError(
            "cannot coalesce %r genes out of a sample of %r"
            % (n_to_coalesce, n_genes))
    # Chance that a coalescence happens in any one generation; it is a
    # probability, so it cannot exceed 1 even in a very small population.
    p = min(1.0, float(n_groupings) / pop_size)
    # NOTE(review): assumes probability.geometric_rv accepts an ``rng``
    # keyword, like the other samplers taking an rng here -- confirm.
    return probability.geometric_rv(p, rng=rng)
Example #14
0
def discrete_time_to_coalescence(n_genes,
                                 pop_size=None,
                                 n_to_coalesce=2,
                                 rng=None):
    """
    A random draw from the "Kingman distribution" (discrete time version): Time
    to go from ``n_genes`` genes to ``n_genes``-1 genes in a discrete-time
    Wright-Fisher population of ``pop_size`` genes; i.e. waiting time until
    ``n_to_coalesce`` of the ``n_genes`` lineages coalesce in a population of
    ``pop_size`` genes.

    The number of generations is drawn from a geometric distribution whose
    per-generation success probability is
    ``choose(n_genes, n_to_coalesce) / pop_size`` (capped at 1), so that its
    mean, ``pop_size / choose(n_genes, n_to_coalesce)``, agrees with the mean
    of the continuous-time draw scaled to generations.

    Parameters
    ----------

    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in the
        population: 2 * N in a diploid population of N individuals, or N in a
        haploid population of N individuals. Required: a discrete-generation
        waiting time is not defined without it.
    n_to_coalesce : integer
        The waiting time that will be returned will be the waiting time for
        this number of genes in the sample to coalesce.
    rng : ``Random`` object
        The random number generator instance. If |None|, the module's
        ``GLOBAL_RNG`` is used.

    Returns
    -------
    k : integer
        A randomly-generated waiting time (in discrete generations) for
        ``n_to_coalesce`` genes to coalesce out of a sample of ``n_genes`` in a
        population of ``pop_size`` genes.

    Raises
    ------
    ValueError
        If ``pop_size`` is |None|, zero or negative, or if
        ``choose(n_genes, n_to_coalesce)`` is not positive.

    """
    if not pop_size or pop_size < 0:
        raise ValueError(
            "pop_size must be a positive number of genes, got %r" % (pop_size,))
    if rng is None:
        rng = GLOBAL_RNG
    n_groupings = combinatorics.choose(n_genes, n_to_coalesce)
    if n_groupings <= 0:
        raise ValueError(
            "cannot coalesce %r genes out of a sample of %r"
            % (n_to_coalesce, n_genes))
    # Chance that a coalescence happens in any one generation; it is a
    # probability, so it cannot exceed 1 even in a very small population.
    p = min(1.0, float(n_groupings) / pop_size)
    # NOTE(review): assumes probability.geometric_rv accepts an ``rng``
    # keyword, like the other samplers taking an rng here -- confirm.
    return probability.geometric_rv(p, rng=rng)