def calc_coverage_threshold(cov_dict):
    '''
    Calculate a minimum coverage threshold for each key in cov_dict.

    Each value of cov_dict must provide 'mean' and 'variance' entries
    (anything float() accepts). A negative binomial distribution is fitted
    to those two moments (see the end of the 'alternative parameterization'
    section of the Negative binomial page and the scipy negative binomial
    documentation for details of the calculation), and the threshold is the
    5th percentile of that distribution.

    Returns a dict mapping each key of cov_dict to
    {'threshold': str, 'threshold_p': str}, where 'threshold_p' is the
    cumulative probability of the fitted distribution at the threshold.

    Raises ValueError if an entry does not satisfy variance > mean > 0,
    because the moment-matching formulas below are undefined otherwise.
    '''
    threshold_dict = {}
    for g in cov_dict:
        mean = float(cov_dict[g]['mean'])
        var = float(cov_dict[g]['variance'])
        ## the negative binomial fit needs overdispersion (var > mean) and a
        ## positive mean; otherwise n is a division by zero, negative or 0.
        ## written as a chained comparison so that NaN inputs are rejected too.
        if not (var > mean > 0):
            raise ValueError(
                'cannot fit a negative binomial for {!r}: need '
                'variance > mean > 0, got mean={!r}, variance={!r}'.format(
                    g, mean, var))
        ## method-of-moments conversion to scipy's (n, p) parameterization.
        q = (var-mean)/var
        n = mean**2/(var-mean)
        p = 1 - q
        ## internal sanity check that the conversion reproduces the moments.
        assert(isclose(nbinom.mean(n,p), mean))
        assert(isclose(nbinom.var(n,p), var))
        ## find the integer threshold that includes ~95% of the distribution,
        ## excluding 5% on the left hand side.
        my_threshold = nbinom.ppf(0.05,n,p)
        my_threshold_p = nbinom.cdf(my_threshold,n,p)
        threshold_dict[g] = {'threshold':str(my_threshold),
                             'threshold_p':str(my_threshold_p)}
    return threshold_dict
def test_mran_var_p2(self):
    """Compare zinegbin mean/var against scipy's nbinom for arguments (7, 1, 2).

    convert_params(7, 1, 2) yields the (n, p) pair handed to scipy's nbinom;
    the zinegbin moments are evaluated with a trailing 0 (presumably the
    zero-inflation weight -- confirm against the zinegbin signature), and
    both moments must agree to a relative tolerance of 1e-10.
    """
    zinb = sm.distributions.zinegbin
    size, prob = zinb.convert_params(7, 1, 2)

    expected_mean = nbinom.mean(size, prob)
    expected_var = nbinom.var(size, prob)

    actual_mean = zinb.mean(7, 1, 2, 0)
    actual_var = zinb.var(7, 1, 2, 0)

    assert_allclose(expected_mean, actual_mean, rtol=1e-10)
    assert_allclose(expected_var, actual_var, rtol=1e-10)
def test_mean_var(self):
    """Compare zinegbin._mean/_var against scipy's nbinom moments.

    Exercised once with the scalar 9 and once with the array [1, 5, 10] as
    first argument. The trailing 0 passed to _mean/_var is presumably the
    zero-inflation weight -- confirm against the zinegbin signature.
    """
    zinb = sm.distributions.zinegbin
    first_args = (9, np.array([1, 5, 10]))
    for m in first_args:
        size, prob = zinb.convert_params(m, 1, 1)

        expected_mean = nbinom.mean(size, prob)
        expected_var = nbinom.var(size, prob)

        actual_mean = zinb._mean(m, 1, 1, 0)
        actual_var = zinb._var(m, 1, 1, 0)

        assert_allclose(expected_mean, actual_mean, rtol=1e-10)
        assert_allclose(expected_var, actual_var, rtol=1e-10)
def calc_2X_coverage_threshold(cov_dict):
    '''
    Calculate a 2X coverage threshold for each key in cov_dict, based on a
    likelihood ratio between the empirical Nbinom(mu,disp) 1X coverage
    distribution and a theoretical Poisson(2*mu) 2X coverage distribution.

    Each value of cov_dict must provide 'mean' and 'variance' entries
    (anything float() accepts). See the end of the 'alternative
    parameterization' section of the Negative binomial page and the scipy
    negative binomial documentation for details of the parameter conversion.

    The threshold is the smallest integer coverage x in
    [int(mean), int(2*mean)) at which
        Poisson(2*mean).pmf(x) / Nbinom.pmf(x) > 10.
    Note that the plain likelihood ratio is compared against 10, not its
    logarithm.

    Returns a dict keyed by REL ID (keys of cov_dict are translated through
    an internal lookup table) whose values are
    {'threshold', 'threshold_p0', 'threshold_p1', 'lratio'}, all as strings.

    Raises ValueError if an entry does not satisfy variance > mean > 0, or if
    no coverage value in the scanned range reaches the likelihood ratio
    cutoff. Raises KeyError if a key of cov_dict is not in the lookup table.
    '''
    ## to convert my IDs to REL IDs.
    rel_name = {'RM3-130-1':'REL11734','RM3-130-2':'REL11735',
                'RM3-130-3':'REL11736','RM3-130-4':'REL11737',
                'RM3-130-5':'REL11738','RM3-130-6':'REL11739',
                'RM3-130-7':'REL11740','RM3-130-8':'REL11741',
                'RM3-130-9':'REL11742','RM3-130-10':'REL11743',
                'RM3-130-11':'REL11744','RM3-130-12':'REL11745',
                'RM3-130-13':'REL11746','RM3-130-14':'REL11747',
                'RM3-130-15':'REL11748','RM3-130-16':'REL11749',
                'RM3-130-17':'REL11750','RM3-130-18':'REL11751',
                'RM3-130-19':'REL11752','RM3-130-20':'REL11753',
                'RM3-130-21':'REL11754','RM3-130-22':'REL11755',
                'RM3-130-23':'REL11756','RM3-130-24':'REL11757',
                'REL4397':'REL4397', 'REL4398':'REL4398',
                'REL288':'REL288','REL291':'REL291',
                'REL296':'REL296','REL298':'REL298'}

    threshold_dict = {}
    for g in cov_dict:
        mean = float(cov_dict[g]['mean'])
        var = float(cov_dict[g]['variance'])
        ## the negative binomial fit needs overdispersion (var > mean) and a
        ## positive mean; the chained comparison also rejects NaN inputs.
        if not (var > mean > 0):
            raise ValueError(
                'cannot fit a negative binomial for {!r}: need '
                'variance > mean > 0, got mean={!r}, variance={!r}'.format(
                    g, mean, var))
        ## method-of-moments conversion to scipy's (n, p) parameterization.
        q = (var-mean)/var
        n = mean**2/(var-mean)
        p = 1 - q
        ## internal sanity check that the conversion reproduces the moments.
        assert(isclose(nbinom.mean(n,p), mean))
        assert(isclose(nbinom.var(n,p), var))
        ## scan upward from the 1X mean towards the 2X mean and stop at the
        ## first coverage where the 2X model is >10 times more likely.
        ## the result is tracked per genome so that a failed scan can never
        ## fall back on values left over from a previous genome.
        found = None
        for x in range(int(mean),int(2*mean)):
            p0 = nbinom.pmf(x,n,p)
            p1 = poisson.pmf(x,2*mean)
            lratio = p1/p0
            if lratio > 10:
                found = (x, p0, p1, lratio)
                break
        if found is None:
            raise ValueError(
                'no coverage in [{}, {}) gives a likelihood ratio > 10 '
                'for {!r}'.format(int(mean), int(2*mean), g))
        my_threshold, my_threshold_p0, my_threshold_p1, my_lratio = found
        threshold_dict[rel_name[g]] = {'threshold':str(my_threshold),
                                       'threshold_p0':str(my_threshold_p0),
                                       'threshold_p1':str(my_threshold_p1),
                                       'lratio':str(my_lratio)}
    return threshold_dict
def ComputeNBMeanVar(ExprPar):
    '''
    Return the (mean, variance) pair of a negative binomial distribution.

    ExprPar is indexed positionally: ExprPar[0] is p and ExprPar[1] is n,
    which are passed to nbinom.mean / nbinom.var as (n, p).
    '''
    prob, size = ExprPar[0], ExprPar[1]
    return nbinom.mean(size, prob), nbinom.var(size, prob)
def test_inversion_diffs(self):
    """Check the average number of positions changed by Chromosome.inversion().

    Runs 1000 inversions on fresh chromosomes, counts the positions at which
    the sequence differs afterwards, and asserts that the observed average
    lies strictly inside a tolerance band around a computed expectation.
    """
    cfg = AppSettings()
    reps = 1000

    # Observed number of differing positions, one entry per trial.
    observed = []
    for _ in range(0, reps):
        chrom = Chromosome()
        before = chrom.sequence
        chrom.inversion()
        mismatches = [1 for a, b in zip(before, chrom.sequence) if a != b]
        observed.append(len(mismatches))

    # Assumes the length of an inversion is drawn from a negative binomial
    # distribution. The probability of each length is evaluated until more
    # than 99.99% of the distribution is accounted for. The expected number
    # of differences for a length is weighted by the probability of that
    # length, and the weighted values are summed to give the overall
    # expectation.
    mut_len = cfg.genetics.mutation_length
    length_probs = []
    weighted_diffs = []  # expected differences, one term per length
    length = 0
    while sum(length_probs) <= 0.9999:
        prob = nbinom.pmf(length, 1, (1 - mut_len / (1 + mut_len)))
        length_probs.append(prob)
        per_length = math.floor(
            length / 2) * (1 - 1 / len(Chromosome.nucleotides())) * 2
        weighted_diffs.append(prob * per_length)
        length += 1
    expected_delta = sum(weighted_diffs)

    # The tolerance treats the count as a binomial variable (probability of
    # differences at a given length) multiplied by a negative binomial
    # variable (probability of a length) and combines their variances as
    #   var(x) * var(y) + var(x) * mean(y) + mean(x) * var(y)
    # http://www.odelama.com/data-analysis/Commonly-Used-Math-Formulas/
    # NOTE(review): the usual identity for independent variables squares the
    # means, and nbinom.var below uses (n=mutation_length, p=m/(1+m)) while
    # the pmf above uses (n=1, p=1-m/(1+m)). Left exactly as written; confirm
    # whether this heuristic is intended.
    binom_n = mut_len
    var_binom = binom.var(binom_n, 1 / (len(Chromosome.nucleotides())))
    mean_nbinom = mut_len
    var_nbinom = nbinom.var(mut_len, mean_nbinom / (1 + mean_nbinom))
    var = var_binom * var_nbinom + \
        var_binom * mean_nbinom + \
        binom_n * var_nbinom

    observed_delta = sum(observed) / reps
    # Five standard errors of the mean under the variance computed above.
    conf_99 = ((var / reps)**(1 / 2)) * 5
    lower = expected_delta - conf_99
    upper = expected_delta + conf_99
    assert lower < observed_delta < upper
def test_random_chromosome_length(self):
    """Check that random chromosomes have the configured average length.

    Builds 1000 chromosomes and asserts that their mean sequence length is
    within four standard errors of cfg.genetics.chromosome_length, using the
    variance of nbinom(1, p) with p chosen so that the mean equals the
    configured length.
    """
    reps = 1000
    cfg = AppSettings()

    lengths = [len(Chromosome().sequence) for _ in range(0, reps)]
    mean_length = float(sum(lengths)) / len(lengths)

    expected_length = cfg.genetics.chromosome_length
    # Success probability for which nbinom(1, p) has mean expected_length.
    p = 1 - (expected_length / (1 + expected_length))
    conf_99 = (nbinom.var(1, p) / reps)**(1 / 2) * 4

    lower = expected_length - conf_99
    upper = expected_length + conf_99
    assert lower <= mean_length <= upper
def test_insertion_length(self):
    """Tests that insertion mutations are of the correct average length.

    Performs 1000 insertions on fresh chromosomes and asserts that the mean
    growth in sequence length lies strictly within four standard errors of
    cfg.genetics.mutation_length.

    NOTE(review): the tolerance assumes insertion lengths follow a negative
    binomial with n=1 and mean mutation_length -- confirm against
    Chromosome.insertion.
    """
    cfg = AppSettings()
    reps = 1000
    deltas = []
    for _ in range(0, reps):
        dna = Chromosome()
        init_length = len(dna.sequence)
        dna.insertion()
        deltas.append(len(dna.sequence) - init_length)
    expected_delta = cfg.genetics.mutation_length
    # scipy's nbinom(n, p) has mean n * (1 - p) / p, so a mean of m with n=1
    # needs p = 1 - m / (1 + m). Using m / (1 + m) instead would describe a
    # distribution with mean 1 / m and give the wrong variance.
    p = 1 - (expected_delta / (1 + expected_delta))
    var = nbinom.var(1, p)
    # Four standard errors of the sample mean.
    conf_99 = ((var / reps)**(1 / 2)) * 4
    observed_delta = (sum(deltas) / reps)
    assert (expected_delta - conf_99) < observed_delta < (expected_delta +
                                                          conf_99)
def test_deletion_length(self):
    """Test that deletions return the correct average length.

    Performs 1000 deletions on fresh chromosomes and asserts that the mean
    reduction in sequence length lies strictly within a tolerance band around
    cfg.genetics.mutation_length.

    NOTE(review): the tolerance assumes deletion lengths follow a negative
    binomial with n=1 and mean mutation_length -- confirm against
    Chromosome.deletion.
    """
    cfg = AppSettings()
    reps = 1000
    deltas = []
    for _ in range(0, reps):
        dna = Chromosome()
        init_length = len(dna.sequence)
        dna.deletion()
        deltas.append(init_length - len(dna.sequence))
    expected_delta = cfg.genetics.mutation_length
    # scipy's nbinom(n, p) has mean n * (1 - p) / p, so a mean of m with n=1
    # needs p = 1 - m / (1 + m). Using m / (1 + m) instead would describe a
    # distribution with mean 1 / m and give the wrong variance.
    p = 1 - (expected_delta / (1 + expected_delta))
    var = nbinom.var(1, p)
    # Because there is a little slop around short strings or positions near
    # the end of the string, the standard error is multiplied by 10 just to
    # limit the number of failing tests.
    conf_99 = ((var / reps)**(1 / 2)) * 10
    observed_delta = sum(deltas) / reps
    assert (expected_delta - conf_99) < observed_delta < (expected_delta +
                                                          conf_99)
def var(self, n, p):
    """Return nbinom.var(self, n, p), forwarding all three arguments in order.

    NOTE(review): `self` is passed through as the first positional argument.
    That is only meaningful if `nbinom` is a class whose unbound `var` is
    being called; if it is the scipy.stats.nbinom instance, `self` would be
    taken as the first shape parameter -- confirm which is in scope.
    """
    return nbinom.var(self, n, p)