def calc(self):
    """
    Computes summary statistics for the two populations of sequences held
    by this object and stores them as attributes.

    The statistics are derived from ``self.pop1_seqs``, ``self.pop2_seqs``
    and ``self.combined_seqs``. The following attributes are set:

        - ``average_number_of_pairwise_differences`` (Hickerson 2006: pi)
        - ``average_number_of_pairwise_differences_between`` (pi_b)
        - ``average_number_of_pairwise_differences_within`` (pi_w)
        - ``average_number_of_pairwise_differences_net`` (pi_net)
        - ``num_segregating_sites`` (S)
        - ``wattersons_theta`` (theta)
        - ``wakeleys_psi`` (Wakeley 1996)
        - ``tajimas_d``

    Nothing is returned.
    """
    # Only the first and third elements of each result are used below.
    total_x, _, total_sq_x = _count_differences(
        self.pop1_seqs, self.state_alphabet, self.ignore_uncertain
    )
    total_y, _, total_sq_y = _count_differences(
        self.pop2_seqs, self.state_alphabet, self.ignore_uncertain
    )

    # Number of distinct sequence pairs within each population.
    pairs_x = probability.binomial_coefficient(len(self.pop1_seqs), 2)
    d_x = total_x / pairs_x
    pairs_y = probability.binomial_coefficient(len(self.pop2_seqs), 2)
    d_y = total_y / pairs_y
    d_xy = self._average_number_of_pairwise_differences_between_populations()

    # Variances, computed as mean-of-squares minus square-of-mean.
    s2_x = (float(total_sq_x) / pairs_x) - (d_x ** 2)
    s2_y = (float(total_sq_y) / pairs_y) - (d_y ** 2)
    s2_xy = self._variance_of_pairwise_differences_between_populations(d_xy)

    n = len(self.combined_seqs)
    n_x = float(len(self.pop1_seqs))
    n_y = float(len(self.pop2_seqs))
    weight_all = float(n * (n - 1))
    weight_x = float(n_x * (n_x - 1))
    weight_y = float(n_y * (n_y - 1))
    k = _average_number_of_pairwise_differences(
        self.combined_seqs, self.state_alphabet, self.ignore_uncertain
    )

    # Hickerson 2006: pi #
    self.average_number_of_pairwise_differences = k

    # Hickerson 2006: pi_b #
    self.average_number_of_pairwise_differences_between = d_xy

    # Hickerson 2006: pi_w #
    self.average_number_of_pairwise_differences_within = d_x + d_y

    # Hickerson 2006: pi_net #
    self.average_number_of_pairwise_differences_net = d_xy - (d_x + d_y)

    # Hickerson 2006: S #
    self.num_segregating_sites = _num_segregating_sites(
        self.combined_seqs, self.state_alphabet, self.ignore_uncertain
    )

    # Hickerson 2006: theta #
    a1 = sum(1.0 / i for i in range(1, n))
    self.wattersons_theta = float(self.num_segregating_sites) / a1

    # Wakeley 1996 #
    psi_scale = float(1) / weight_all
    psi_x = weight_x * (math.sqrt(s2_x) / d_x)
    psi_y = weight_y * (math.sqrt(s2_y) / d_y)
    psi_xy = 2 * n_x * n_y * math.sqrt(s2_xy) / k
    self.wakeleys_psi = psi_scale * (psi_x + psi_y + psi_xy)

    # Tajima's D #
    self.tajimas_d = _tajimas_d(
        n,
        self.average_number_of_pairwise_differences,
        self.num_segregating_sites,
    )
def expected_tmrca(n_genes, pop_size=None, n_to_coalesce=2):
    """
    Expected (mean) value for the Time to the Most Recent Common Ancestor of
    ``n_to_coalesce`` genes in a sample of ``n_genes`` drawn from a
    population of ``pop_size`` genes.

    Parameters
    ----------
    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in
        the population: 2 * N in a diploid population of N individuals, or
        N in a haploid population of N individuals. If `None`, the
        expectation is returned without being scaled by a population size.
    n_to_coalesce : integer
        The expected waiting time that will be returned will be the
        waiting time for this number of genes in the sample to coalesce.

    Returns
    -------
    k : float
        The expected waiting time (in continuous time) for
        ``n_to_coalesce`` genes to coalesce out of a sample of ``n_genes``
        in a population of ``pop_size`` genes.
    """
    n_combinations = probability.binomial_coefficient(n_genes, n_to_coalesce)
    expectation = float(1) / n_combinations
    if pop_size is None:
        return expectation
    return expectation * pop_size
def time_to_coalescence(n_genes, pop_size=None, n_to_coalesce=2, rng=None):
    r"""
    A random draw from the "Kingman distribution" (continuous time version):
    the waiting time until ``n_to_coalesce`` of the ``n_genes`` gene lineages
    in a sample coalesce in a continuous-time Wright-Fisher population of
    ``pop_size`` genes.

    With $n$ = ``n_genes``, $k$ = ``n_to_coalesce`` and $N$ = ``pop_size``,
    this function draws a random number from an exponential distribution
    with rate ${n \choose k}$ and multiplies it by $N$, so that the returned
    waiting time has an expectation of $\frac{N}{n \choose k}$.

    ``pop_size`` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals. If ``pop_size`` is 1 or 0
    or `None`, then time is in haploid population units; i.e. where 1 unit
    of time equals 2N generations for a diploid population of size N, or N
    generations for a haploid population of size N. Otherwise time is in
    generations.

    Parameters
    ----------
    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in
        the population: 2 * N in a diploid population of N individuals, or
        N in a haploid population of N individuals.
    n_to_coalesce : integer
        The waiting time that will be returned will be the waiting time for
        this number of genes in the sample to coalesce.
    rng : `Random`
        The random number generator instance to use. If `None`, the
        module-level ``GLOBAL_RNG`` is used.

    Returns
    -------
    k : float
        A randomly-generated waiting time (in continuous time) for
        ``n_to_coalesce`` genes to coalesce out of a sample of ``n_genes``
        in a population of ``pop_size`` genes.
    """
    if rng is None:
        rng = GLOBAL_RNG
    # A missing (None) or zero population size leaves the draw unscaled,
    # i.e. in units of the haploid population size.
    time_units = pop_size if pop_size else 1.0
    rate = probability.binomial_coefficient(n_genes, n_to_coalesce)
    tmrca = rng.expovariate(rate)
    return tmrca * time_units
def _average_number_of_pairwise_differences(char_sequences, state_alphabet, ignore_uncertain=True): """ Returns $k$ (Tajima 1983; Wakely 1996), calculated for a set of sequences: k = \frac{\right(\sum \sum \k_{ij}\left)}{n \choose 2} where $k_{ij}$ is the number of pairwise differences between the $i$th and $j$th sequence, and $n$ is the number of DNA sequences sampled. """ sum_diff, mean_diff, sq_diff = _count_differences(char_sequences, state_alphabet, ignore_uncertain) return sum_diff / probability.binomial_coefficient(len(char_sequences), 2)
def discrete_time_to_coalescence(n_genes, pop_size=None, n_to_coalesce=2, rng=None):
    """
    A random draw from the "Kingman distribution" (discrete time version):
    the number of generations until ``n_to_coalesce`` of the ``n_genes``
    gene lineages in a sample coalesce in a discrete-time Wright-Fisher
    population of ``pop_size`` genes.

    The per-generation probability of the coalescence event is taken to be
    the number of ways of choosing ``n_to_coalesce`` lineages out of
    ``n_genes``, divided by ``pop_size`` (capped at 1), and the waiting
    time is a geometric random variate with that probability.

    Parameters
    ----------
    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in
        the population: 2 * N in a diploid population of N individuals, or
        N in a haploid population of N individuals. Required: a
        discrete-generation waiting time is undefined without it.
    n_to_coalesce : integer
        The waiting time that will be returned will be the waiting time for
        this number of genes in the sample to coalesce.
    rng : `Random`
        The random number generator instance. If `None`, the module-level
        ``GLOBAL_RNG`` is used.

    Returns
    -------
    k : integer
        A randomly-generated waiting time (in discrete generations) for
        ``n_to_coalesce`` genes to coalesce out of a sample of ``n_genes``
        in a population of ``pop_size`` genes.

    Raises
    ------
    ValueError
        If ``pop_size`` is `None` or is not a positive number.
    """
    # Validate first: the per-generation probability is undefined without
    # a positive population size.
    if pop_size is None or pop_size <= 0:
        raise ValueError(
            "pop_size must be a positive number of genes, but found: %r" % (pop_size,)
        )
    if rng is None:
        rng = GLOBAL_RNG
    n_combinations = probability.binomial_coefficient(n_genes, n_to_coalesce)
    # Cap at 1.0 so that a sample that is large relative to the population
    # still yields a valid probability.
    p = min(1.0, float(n_combinations) / pop_size)
    # The geometric draw is already a count of generations, so it is not
    # rescaled by the population size.
    # NOTE(review): assumes probability.geometric_rv accepts an ``rng``
    # keyword -- confirm against the probability module.
    return probability.geometric_rv(p, rng=rng)