Beispiel #1
0
def time_to_coalescence(n_genes,
        pop_size=None,
        haploid=True,
        rng=None):
    """
    A random draw from the "Kingman distribution" (continuous time version):
    time to go from `n_genes` genes to `n_genes` - 1 genes; i.e. the waiting
    time until two lineages coalesce.  This is a random number with an
    exponential distribution with a rate of (`n_genes` choose 2).

    `pop_size` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals.
    If `pop_size` is 1 or 0 or None, then the draw is returned unscaled, in
    haploid population units; i.e. where 1 unit of time equals 2N generations
    for a diploid population of size N, or N generations for a haploid
    population of size N. Otherwise the draw is multiplied by `pop_size`, so
    that time is in generations.

    `haploid` is not used by this function; it is kept so that existing
    callers passing it continue to work.

    `rng` is the random number generator used for the draw (it must provide
    `expovariate`); if None, the module-level `GLOBAL_RNG` is used.
    """
    if rng is None:
        rng = GLOBAL_RNG
    # Scale by 1 (i.e. stay in population units) when no population size is
    # given, rather than multiplying the draw by None or 0.
    time_units = pop_size if pop_size else 1
    rate = probability.binomial_coefficient(n_genes, 2)
    tmrca = rng.expovariate(rate)
    return tmrca * time_units
Beispiel #2
0
def time_to_coalescence(n_genes,
        pop_size=None,
        haploid=True,
        rng=None):
    """
    A random draw from the "Kingman distribution" (continuous time version):
    time to go from `n_genes` genes to `n_genes` - 1 genes; i.e. the waiting
    time until two lineages coalesce.  This is a random number with an
    exponential distribution with a rate of (`n_genes` choose 2).

    `pop_size` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals.
    If `pop_size` is 1 or 0 or None, then the draw is returned unscaled, in
    haploid population units; i.e. where 1 unit of time equals 2N generations
    for a diploid population of size N, or N generations for a haploid
    population of size N. Otherwise the draw is multiplied by `pop_size`, so
    that time is in generations.

    `haploid` is not used by this function; it is kept so that existing
    callers passing it continue to work.

    `rng` is the random number generator used for the draw (it must provide
    `expovariate`); if None, the module-level `GLOBAL_RNG` is used.
    """
    if rng is None:
        rng = GLOBAL_RNG
    # Scale by 1 (i.e. stay in population units) when no population size is
    # given, rather than multiplying the draw by None or 0.
    time_units = pop_size if pop_size else 1
    rate = probability.binomial_coefficient(n_genes, 2)
    tmrca = rng.expovariate(rate)
    return tmrca * time_units
Beispiel #3
0
def discrete_time_to_coalescence(n_genes,
                                 pop_size=None,
                                 rng=None):
    """
    A random draw from the "Kingman distribution" (discrete time version):
    time to go from `n_genes` genes to `n_genes` - 1 genes; i.e. the waiting
    time until two lineages coalesce.

    The draw is made with `probability.geometric_rv`, using a per-step
    probability of (`n_genes` choose 2) / `time_units`, and the result is
    multiplied by `time_units`. `time_units` is 2 * `pop_size`, or 1 if
    `pop_size` is 0 or None.

    `pop_size` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals, or
    N in a haploid population of N individuals.

    `rng` is the random number generator passed on to
    `probability.geometric_rv`; if None, the module-level `GLOBAL_RNG` is
    used.

    NOTE(review): when `pop_size` is 0 or None the per-step probability is
    (`n_genes` choose 2) itself, which exceeds 1 for `n_genes` > 2 -- confirm
    that this is what `probability.geometric_rv` is expected to receive.
    """
    if not pop_size:
        time_units = 1
    else:
        time_units = pop_size * 2
    if rng is None:
        rng = GLOBAL_RNG
    p = float(probability.binomial_coefficient(n_genes, 2)) / time_units
    # Forward the resolved generator so that a caller-supplied (e.g. seeded)
    # `rng` actually drives the draw instead of being silently ignored.
    tmrca = probability.geometric_rv(p, rng=rng)
    return tmrca * time_units
Beispiel #4
0
def discrete_time_to_coalescence(n_genes,
                                 pop_size=None,
                                 rng=None):
    """
    A random draw from the "Kingman distribution" (discrete time version):
    time to go from `n_genes` genes to `n_genes` - 1 genes; i.e. the waiting
    time until two lineages coalesce.

    The draw is made with `probability.geometric_rv`, using a per-step
    probability of (`n_genes` choose 2) / `time_units`, and the result is
    multiplied by `time_units`. `time_units` is 2 * `pop_size`, or 1 if
    `pop_size` is 0 or None.

    `pop_size` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals, or
    N in a haploid population of N individuals.

    `rng` is the random number generator passed on to
    `probability.geometric_rv`; if None, the module-level `GLOBAL_RNG` is
    used.

    NOTE(review): when `pop_size` is 0 or None the per-step probability is
    (`n_genes` choose 2) itself, which exceeds 1 for `n_genes` > 2 -- confirm
    that this is what `probability.geometric_rv` is expected to receive.
    """
    if not pop_size:
        time_units = 1
    else:
        time_units = pop_size * 2
    if rng is None:
        rng = GLOBAL_RNG
    p = float(probability.binomial_coefficient(n_genes, 2)) / time_units
    # Forward the resolved generator so that a caller-supplied (e.g. seeded)
    # `rng` actually drives the draw instead of being silently ignored.
    tmrca = probability.geometric_rv(p, rng=rng)
    return tmrca * time_units
Beispiel #5
0
    def calc(self):
        """
        Computes summary statistics for the two populations and stores them
        as attributes of this object; nothing is returned.

        Reads `self.pop1_seqs`, `self.pop2_seqs`, `self.combined_seqs`,
        `self.state_alphabet` and `self.ignore_uncertain`, and sets:

            - `average_number_of_pairwise_differences` (pi)
            - `average_number_of_pairwise_differences_between` (pi_b)
            - `average_number_of_pairwise_differences_within` (pi_w)
            - `average_number_of_pairwise_differences_net` (pi_net)
            - `num_segregating_sites` (S)
            - `wattersons_theta`
            - `wakeleys_psi`
            - `tajimas_d`
        """
        # Within-population sums of differences and of squared differences;
        # the middle element of each result is not needed here.
        total_x, _, total_sq_x = _count_differences(
            self.pop1_seqs, self.state_alphabet, self.ignore_uncertain)
        total_y, _, total_sq_y = _count_differences(
            self.pop2_seqs, self.state_alphabet, self.ignore_uncertain)

        # Mean pairwise differences within each population.
        pairs_x = probability.binomial_coefficient(len(self.pop1_seqs), 2)
        d_x = total_x / pairs_x
        pairs_y = probability.binomial_coefficient(len(self.pop2_seqs), 2)
        d_y = total_y / pairs_y

        # Mean pairwise differences between the two populations.
        d_xy = self._average_number_of_pairwise_differences_between_populations()

        # Variances, computed as (mean of squares) - (square of mean).
        s2_x = (float(total_sq_x) / pairs_x) - (d_x ** 2)
        s2_y = (float(total_sq_y) / pairs_y) - (d_y ** 2)
        s2_xy = self._variance_of_pairwise_differences_between_populations(d_xy)

        # Sample sizes and the ordered-pair counts used to weight Wakeley's psi.
        n = len(self.combined_seqs)
        n_x = float(len(self.pop1_seqs))
        n_y = float(len(self.pop2_seqs))
        a = float(n * (n - 1))
        ax = float(n_x * (n_x - 1))
        ay = float(n_y * (n_y - 1))

        k = _average_number_of_pairwise_differences(
            self.combined_seqs, self.state_alphabet, self.ignore_uncertain)

        # Hickerson 2006: pi #
        self.average_number_of_pairwise_differences = k

        # Hickerson 2006: pi_b #
        self.average_number_of_pairwise_differences_between = d_xy

        # Hickerson 2006: pi_w #
        self.average_number_of_pairwise_differences_within = d_x + d_y

        # Hickerson 2006: pi_net #
        self.average_number_of_pairwise_differences_net = d_xy - (d_x + d_y)

        # Hickerson 2006: S #
        self.num_segregating_sites = _num_segregating_sites(
            self.combined_seqs, self.state_alphabet, self.ignore_uncertain)

        # Hickerson 2006: theta #
        harmonic = sum(1.0 / i for i in range(1, n))
        self.wattersons_theta = float(self.num_segregating_sites) / harmonic

        # Wakeley 1996 #
        # NOTE(review): divides by d_x, d_y and k, so a population with no
        # pairwise differences raises ZeroDivisionError here -- confirm that
        # callers guarantee variation.
        scale = float(1) / a
        within_x = ax * (math.sqrt(s2_x) / d_x)
        within_y = ay * (math.sqrt(s2_y) / d_y)
        between = 2 * n_x * n_y * math.sqrt(s2_xy) / k
        self.wakeleys_psi = scale * (within_x + within_y + between)

        # Tajima's D #
        self.tajimas_d = _tajimas_d(
            n,
            self.average_number_of_pairwise_differences,
            self.num_segregating_sites)
Beispiel #6
0
def _average_number_of_pairwise_differences(char_vectors, state_alphabet, ignore_uncertain=True):
    r"""
    Returns $k$ (Tajima 1983; Wakeley 1996), calculated for a set of sequences:

        k = \frac{\left(\sum \sum k_{ij}\right)}{n \choose 2}

    where $k_{ij}$ is the number of pairwise differences between the
    $i$th and $j$th sequence, and $n$ is the number of DNA sequences
    sampled.

    `char_vectors` is the collection of sequences; `state_alphabet` and
    `ignore_uncertain` are passed through unchanged to `_count_differences`,
    whose first return value is used as the numerator.
    """
    # Only the summed differences are needed; the other two values returned
    # by `_count_differences` are discarded.
    sum_diff, _, _ = _count_differences(char_vectors, state_alphabet, ignore_uncertain)
    return sum_diff / probability.binomial_coefficient(len(char_vectors), 2)
Beispiel #7
0
def expected_tmrca(n_genes, pop_size=None):
    """
    Expected (mean) waiting time until two of `n_genes` lineages coalesce,
    computed as 1 / (`n_genes` choose 2).

    `n_genes` is the number of genes in the sample.
    `pop_size` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals.
    If `pop_size` is 1 or 0 or None, then time is in haploid population units;
    i.e. where 1 unit of time equals 2N generations for a diploid population of
    size N, or N generations for a haploid population of size N. Otherwise the
    value is multiplied by `pop_size`, so that time is in generations.
    """
    nc2 = probability.binomial_coefficient(n_genes, 2)
    tmrca = (float(1)/nc2)
    # No (or zero) population size: stay in population units instead of
    # multiplying by None (TypeError) or collapsing the result to 0.
    if not pop_size:
        return tmrca
    return tmrca * pop_size
Beispiel #8
0
def _average_number_of_pairwise_differences(char_vectors,
                                            state_alphabet,
                                            ignore_uncertain=True):
    r"""
    Returns $k$ (Tajima 1983; Wakeley 1996), calculated for a set of sequences:

        k = \frac{\left(\sum \sum k_{ij}\right)}{n \choose 2}

    where $k_{ij}$ is the number of pairwise differences between the
    $i$th and $j$th sequence, and $n$ is the number of DNA sequences
    sampled.

    `char_vectors` is the collection of sequences; `state_alphabet` and
    `ignore_uncertain` are passed through unchanged to `_count_differences`,
    whose first return value is used as the numerator.
    """
    # Only the summed differences are needed; the other two values returned
    # by `_count_differences` are discarded.
    sum_diff, _, _ = _count_differences(char_vectors,
                                        state_alphabet,
                                        ignore_uncertain)
    return sum_diff / probability.binomial_coefficient(len(char_vectors), 2)
Beispiel #9
0
def expected_tmrca(n_genes, pop_size=None, rng=None):
    """
    Expected (mean) waiting time until two of `n_genes` lineages coalesce,
    computed as 1 / (`n_genes` choose 2).

    `n_genes` is the number of genes in the sample.
    `pop_size` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals.
    If `pop_size` is 1 or 0 or None, then time is in haploid population units;
    i.e. where 1 unit of time equals 2N generations for a diploid population of
    size N, or N generations for a haploid population of size N. Otherwise the
    value is multiplied by `pop_size`, so that time is in generations.

    `rng` is not used (the result is deterministic); the parameter is kept so
    that existing callers passing it continue to work.
    """
    nc2 = probability.binomial_coefficient(n_genes, 2)
    tmrca = (float(1)/nc2)
    # No (or zero) population size: stay in population units instead of
    # multiplying by None (TypeError) or collapsing the result to 0.
    if not pop_size:
        return tmrca
    return tmrca * pop_size
Beispiel #10
0
    def kl_divergence_coalescent_waiting_times(allele_waiting_time_dist, haploid_pop_size):
        r"""
        `allele_waiting_time_dist` is a dictionary with number of alleles as
        keys and a list of waiting times associated with that number of
        alleles as values. `haploid_pop_size` is the population size in terms
        of total numbers of genes. This returns the KL-divergence between the
        distribution of waiting times and the Kingman coalescent distribution:

            D_{\mathrm{KL}}(P\|Q) = \sum_i P(i) \log \frac{P(i)}{Q(i)}.

        Here P(i) is taken as (k choose 2) / `haploid_pop_size` for each
        number of alleles k, and Q(i) is a kernel density estimate obtained
        from `de_hoon_lib.pdf` on the waiting times recorded for that k.
        """
        d_kl = 0.0
        for k, wts in allele_waiting_time_dist.items():
            p = float(probability.binomial_coefficient(k, 2)) / haploid_pop_size
            # NOTE(review): the loop variable is not used and `q` is
            # recomputed from the same arguments on every pass, so each
            # waiting time contributes an identical term -- confirm that
            # this is intended.
            for _ in wts:
                # Kernel names accepted by `de_hoon_lib.pdf`, per the notes
                # kept with this code:
                #   'E' / 'Epanechnikov' (default), 'U' / 'Uniform',
                #   'T' / 'Triangle', 'G' / 'Gaussian',
                #   'B' / 'Biweight' (quartic), '3' / 'Triweight',
                #   'C' / 'Cosine'
                q = de_hoon_lib.pdf(wts, [k], kernel = 'Gaussian')
                # Avoid division by zero in the log ratio below.
                if q == 0:
                    q = 1e-100
                d_kl += p * math.log(p/q)
        return d_kl
Beispiel #11
0
    def kl_divergence_coalescent_waiting_times(allele_waiting_time_dist, haploid_pop_size):
        r"""
        `allele_waiting_time_dist` is a dictionary with number of alleles as
        keys and a list of waiting times associated with that number of
        alleles as values. `haploid_pop_size` is the population size in terms
        of total numbers of genes. This returns the KL-divergence between the
        distribution of waiting times and the Kingman coalescent distribution:

            D_{\mathrm{KL}}(P\|Q) = \sum_i P(i) \log \frac{P(i)}{Q(i)}.

        Here P(i) is taken as (k choose 2) / `haploid_pop_size` for each
        number of alleles k, and Q(i) is a kernel density estimate obtained
        from `de_hoon_lib.pdf` on the waiting times recorded for that k.
        """
        d_kl = 0.0
        for k, wts in allele_waiting_time_dist.items():
            p = float(probability.binomial_coefficient(k, 2)) / haploid_pop_size
            # NOTE(review): the loop variable is not used and `q` is
            # recomputed from the same arguments on every pass, so each
            # waiting time contributes an identical term -- confirm that
            # this is intended.
            for _ in wts:
                # Kernel names accepted by `de_hoon_lib.pdf`, per the notes
                # kept with this code:
                #   'E' / 'Epanechnikov' (default), 'U' / 'Uniform',
                #   'T' / 'Triangle', 'G' / 'Gaussian',
                #   'B' / 'Biweight' (quartic), '3' / 'Triweight',
                #   'C' / 'Cosine'
                q = de_hoon_lib.pdf(wts, [k], kernel = 'Gaussian')
                # Avoid division by zero in the log ratio below.
                if q == 0:
                    q = 1e-100
                d_kl += p * math.log(p/q)
        return d_kl
Beispiel #12
0
    def calc(self):
        """
        Computes summary statistics for the two populations and stores them
        as attributes of this object; nothing is returned.

        Reads `self.pop1_seqs`, `self.pop2_seqs`, `self.combined_seqs`,
        `self.state_alphabet` and `self.ignore_uncertain`, and sets:

            - `average_number_of_pairwise_differences` (pi)
            - `average_number_of_pairwise_differences_between` (pi_b)
            - `average_number_of_pairwise_differences_within` (pi_w)
            - `average_number_of_pairwise_differences_net` (pi_net)
            - `num_segregating_sites` (S)
            - `wattersons_theta`
            - `wakeleys_psi`
            - `tajimas_d`
        """
        # Within-population sums of differences and of squared differences;
        # the middle element of each result is not needed here.
        total_x, _, total_sq_x = _count_differences(
            self.pop1_seqs, self.state_alphabet, self.ignore_uncertain)
        total_y, _, total_sq_y = _count_differences(
            self.pop2_seqs, self.state_alphabet, self.ignore_uncertain)

        # Mean pairwise differences within each population.
        pairs_x = probability.binomial_coefficient(len(self.pop1_seqs), 2)
        d_x = total_x / pairs_x
        pairs_y = probability.binomial_coefficient(len(self.pop2_seqs), 2)
        d_y = total_y / pairs_y

        # Mean pairwise differences between the two populations.
        d_xy = self._average_number_of_pairwise_differences_between_populations()

        # Variances, computed as (mean of squares) - (square of mean).
        s2_x = (float(total_sq_x) / pairs_x) - (d_x ** 2)
        s2_y = (float(total_sq_y) / pairs_y) - (d_y ** 2)
        s2_xy = self._variance_of_pairwise_differences_between_populations(d_xy)

        # Sample sizes and the ordered-pair counts used to weight Wakeley's psi.
        n = len(self.combined_seqs)
        n_x = float(len(self.pop1_seqs))
        n_y = float(len(self.pop2_seqs))
        a = float(n * (n - 1))
        ax = float(n_x * (n_x - 1))
        ay = float(n_y * (n_y - 1))

        k = _average_number_of_pairwise_differences(
            self.combined_seqs, self.state_alphabet, self.ignore_uncertain)

        # Hickerson 2006: pi #
        self.average_number_of_pairwise_differences = k

        # Hickerson 2006: pi_b #
        self.average_number_of_pairwise_differences_between = d_xy

        # Hickerson 2006: pi_w #
        self.average_number_of_pairwise_differences_within = d_x + d_y

        # Hickerson 2006: pi_net #
        self.average_number_of_pairwise_differences_net = d_xy - (d_x + d_y)

        # Hickerson 2006: S #
        self.num_segregating_sites = _num_segregating_sites(
            self.combined_seqs, self.state_alphabet, self.ignore_uncertain)

        # Hickerson 2006: theta #
        harmonic = sum(1.0 / i for i in range(1, n))
        self.wattersons_theta = float(self.num_segregating_sites) / harmonic

        # Wakeley 1996 #
        # NOTE(review): divides by d_x, d_y and k, so a population with no
        # pairwise differences raises ZeroDivisionError here -- confirm that
        # callers guarantee variation.
        scale = float(1) / a
        within_x = ax * (math.sqrt(s2_x) / d_x)
        within_y = ay * (math.sqrt(s2_y) / d_y)
        between = 2 * n_x * n_y * math.sqrt(s2_xy) / k
        self.wakeleys_psi = scale * (within_x + within_y + between)

        # Tajima's D #
        self.tajimas_d = _tajimas_d(
            n,
            self.average_number_of_pairwise_differences,
            self.num_segregating_sites)