def calc_coverage_threshold(cov_dict):
    '''
    Calculate a minimum coverage threshold for each key in cov_dict.

    Each value of cov_dict must have 'mean' and 'variance' entries that
    float() accepts. A negative binomial distribution is fitted to them by
    the method of moments, and the threshold is the 5% quantile of that
    distribution, i.e. roughly 95% of the fitted distribution lies at or
    above the threshold.
    see end of 'alternative parameterization' section of Negative binomial page
    and scipy negative binomial documentation for details of calculation.

    Returns a dict mapping each key of cov_dict to
    {'threshold': str, 'threshold_p': str}, where 'threshold_p' is the
    fitted CDF evaluated at the threshold.

    Raises ValueError if the variance does not exceed a positive mean,
    because the negative binomial parameters are undefined in that case.
    '''
    threshold_dict = {}
    for g, stats in cov_dict.items():
        mean = float(stats['mean'])
        var = float(stats['variance'])

        ## the moment-matching below divides by var and by (var - mean), and
        ## needs 0 < p < 1 and n > 0. Fail with a clear message rather than
        ## with a ZeroDivisionError or an opaque assertion failure.
        if not var > mean > 0:
            raise ValueError(
                'cannot fit a negative binomial for {!r}: need '
                'variance > mean > 0, got mean={}, variance={}'.format(
                    g, mean, var))

        q = (var - mean) / var
        n = mean**2 / (var - mean)
        p = 1 - q

        ## assert that I did the math correctly.
        assert isclose(nbinom.mean(n, p), mean)
        assert isclose(nbinom.var(n, p), var)

        ## find the integer threshold that includes ~95% of REL606 distribution,
        ## excluding 5% on the left hand side.
        my_threshold = nbinom.ppf(0.05, n, p)
        my_threshold_p = nbinom.cdf(my_threshold, n, p)
        threshold_dict[g] = {'threshold': str(my_threshold),
                             'threshold_p': str(my_threshold_p)}
    return threshold_dict
Exemple #2
0
 def test_mran_var_p2(self):
     # Compare zinegbin's mean and variance for arguments (7, 1, 2, 0)
     # with scipy's nbinom evaluated at the (n, p) that convert_params
     # returns for (7, 1, 2). The trailing 0 is presumably the
     # zero-inflation weight, in which case both should agree -- confirm.
     zinb = sm.distributions.zinegbin
     n, p = zinb.convert_params(7, 1, 2)
     expected_mean = nbinom.mean(n, p)
     expected_var = nbinom.var(n, p)
     actual_mean = zinb.mean(7, 1, 2, 0)
     actual_var = zinb.var(7, 1, 2, 0)
     assert_allclose(expected_mean, actual_mean, rtol=1e-10)
     assert_allclose(expected_var, actual_var, rtol=1e-10)
Exemple #3
0
 def test_mean_var(self):
     # For one scalar and one array-valued first argument, compare
     # zinegbin's private _mean/_var (called with a trailing 0, presumably
     # the zero-inflation weight -- confirm) with scipy's nbinom evaluated
     # at the (n, p) returned by convert_params.
     zinb = sm.distributions.zinegbin
     for m in [9, np.array([1, 5, 10])]:
         n, p = zinb.convert_params(m, 1, 1)
         expected_mean = nbinom.mean(n, p)
         expected_var = nbinom.var(n, p)
         actual_mean = zinb._mean(m, 1, 1, 0)
         actual_var = zinb._var(m, 1, 1, 0)
         assert_allclose(expected_mean, actual_mean, rtol=1e-10)
         assert_allclose(expected_var, actual_var, rtol=1e-10)
def calc_2X_coverage_threshold(cov_dict):
    '''
    Calculate a 2X coverage threshold for each key in cov_dict, based on a
    likelihood ratio between the empirical Nbinom(mu,disp) 1X coverage
    distribution and a theoretical Poisson(2*mu) 2X coverage distribution.
    see end of 'alternative parameterization' section of Negative binomial page
    and scipy negative binomial documentation for details of calculation.

    Each value of cov_dict must have 'mean' and 'variance' entries that
    float() accepts. The threshold is the smallest integer x in
    [int(mean), int(2*mean)) for which

        Poisson(2*mean).pmf(x) / Nbinom.pmf(x) > 10.

    NOTE: the comparison is made on the raw likelihood ratio, not on its
    logarithm.

    Returns a dict keyed by REL name (the keys of cov_dict translated through
    an internal lookup table) whose values are
    {'threshold', 'threshold_p0', 'threshold_p1', 'lratio'}, all as strings:
    the threshold, the negative binomial pmf and the Poisson pmf at the
    threshold, and their ratio.

    Raises ValueError if the variance does not exceed a positive mean, or if
    no coverage value in the scanned range reaches the required ratio.
    Raises KeyError if a key of cov_dict is missing from the lookup table.
    '''

    ## to convert my IDs to REL IDs.
    rel_name = {'RM3-130-1':'REL11734','RM3-130-2':'REL11735',
                'RM3-130-3':'REL11736','RM3-130-4':'REL11737',
                'RM3-130-5':'REL11738','RM3-130-6':'REL11739',
                'RM3-130-7':'REL11740','RM3-130-8':'REL11741',
                'RM3-130-9':'REL11742','RM3-130-10':'REL11743',
                'RM3-130-11':'REL11744','RM3-130-12':'REL11745',
                'RM3-130-13':'REL11746','RM3-130-14':'REL11747',
                'RM3-130-15':'REL11748','RM3-130-16':'REL11749',
                'RM3-130-17':'REL11750','RM3-130-18':'REL11751',
                'RM3-130-19':'REL11752','RM3-130-20':'REL11753',
                'RM3-130-21':'REL11754','RM3-130-22':'REL11755',
                'RM3-130-23':'REL11756','RM3-130-24':'REL11757',
                'REL4397':'REL4397', 'REL4398':'REL4398',
                'REL288':'REL288','REL291':'REL291','REL296':'REL296','REL298':'REL298'}

    threshold_dict = {}
    for g in cov_dict:
        mean = float(cov_dict[g]['mean'])
        var = float(cov_dict[g]['variance'])

        ## the moment-matching below divides by var and by (var - mean), and
        ## needs 0 < p < 1 and n > 0. Fail with a clear message rather than
        ## with a ZeroDivisionError or an opaque assertion failure.
        if not var > mean > 0:
            raise ValueError(
                'cannot fit a negative binomial for {!r}: need '
                'variance > mean > 0, got mean={}, variance={}'.format(
                    g, mean, var))

        q = (var - mean) / var
        n = mean**2 / (var - mean)
        p = 1 - q

        ## assert that I did the math correctly.
        assert isclose(nbinom.mean(n, p), mean)
        assert isclose(nbinom.var(n, p), var)

        ## scan upward from the 1X mean and stop at the first coverage value
        ## where the 2X (Poisson) model is more than 10 times as likely as
        ## the 1X (negative binomial) model.
        for x in range(int(mean), int(2 * mean)):
            p0 = nbinom.pmf(x, n, p)
            p1 = poisson.pmf(x, 2 * mean)
            lratio = p1 / p0
            if lratio > 10:
                break
        else:
            ## no threshold found for this sample. Never fall through to
            ## values left over from a previous sample.
            raise ValueError(
                'no coverage value in [{}, {}) gives a likelihood ratio '
                '> 10 for {!r}'.format(int(mean), int(2 * mean), g))

        ## x, p0, p1 and lratio all refer to the threshold found above.
        threshold_dict[rel_name[g]] = {'threshold': str(x),
                                       'threshold_p0': str(p0),
                                       'threshold_p1': str(p1),
                                       'lratio': str(lratio)}
    return threshold_dict
Exemple #5
0
def ComputeNBMeanVar(ExprPar):
    '''
    Return the (mean, variance) pair of a negative binomial distribution.

    ExprPar is an indexable pair laid out as (p, n): element 0 is the
    parameter p and element 1 is the parameter n.
    '''
    n, p = ExprPar[1], ExprPar[0]
    return nbinom.mean(n, p), nbinom.var(n, p)
Exemple #6
0
    def test_inversion_diffs(self):
        """Check the average number of positions changed by an inversion.

        Performs `reps` inversions on fresh chromosomes, counts the
        positions that differ before and after each one, and asserts that
        the observed mean lies within five standard errors of the value
        expected from a negative binomial model of the inversion length.
        """
        cfg = AppSettings()

        reps = 1000
        deltas = []  # observed number of differences

        for _ in range(0, reps):
            dna = Chromosome()
            old_seq = dna.sequence
            dna.inversion()
            # Count the positions whose value differs after the inversion.
            deltas.append(
                sum(1 for a, b in zip(old_seq, dna.sequence) if a != b))

        pmfs = []
        expected_deltas = []  # expected differences

        # Assumes the length of an inversion is drawn from a negative binomial
        # distribution. Calculates the probability of each length until more
        # than 99.99% of the distribution is accounted for. The expected
        # number of differences for each length is multiplied by the
        # probability of that length, and the sum of those products gives the
        # expected differences overall.
        k = 0
        while sum(pmfs) <= 0.9999:
            pmf = nbinom.pmf(k, 1, (1 - cfg.genetics.mutation_length /
                                    (1 + cfg.genetics.mutation_length)))
            pmfs.append(pmf)

            # floor(k / 2) * 2 positions, each weighted by
            # 1 - 1 / (alphabet size). Presumably: the positions swapped by
            # reversing a run of length k, times the chance that two
            # nucleotides differ -- confirm against Chromosome.inversion.
            diffs = math.floor(
                k / 2) * (1 - 1 / len(Chromosome.nucleotides())) * 2
            expected_deltas.append(pmf * diffs)
            k += 1

        expected_delta = sum(expected_deltas)

        # Since we are multiplying the binomial distribution (probability of
        # differences at a given length) by a negative binomial distribution
        # (probability of a length), we use the variance of a product of two
        # independent random variables, written here as
        # Var(X * Y) = var(x) * var(y) + var(x) * mean(y) + mean(x) * var(y)
        # http://www.odelama.com/data-analysis/Commonly-Used-Math-Formulas/
        # NOTE(review): the usual identity for independent X and Y squares the
        # means: Var(XY) = Var(X)Var(Y) + Var(X)E[Y]^2 + Var(Y)E[X]^2. The
        # code below uses the unsquared means -- confirm this is intended.

        mean_binom = cfg.genetics.mutation_length
        var_binom = binom.var(mean_binom, 1 / (len(Chromosome.nucleotides())))

        # NOTE(review): the pmf above uses nbinom with n = 1 and
        # p = 1 - m / (1 + m), while this variance uses n = m and
        # p = m / (1 + m) -- confirm which parameterization is intended.
        mean_nbinom = cfg.genetics.mutation_length
        var_nbinom = nbinom.var(cfg.genetics.mutation_length,
                                mean_nbinom / (1 + mean_nbinom))

        var = var_binom * var_nbinom + \
              var_binom * mean_nbinom + \
              mean_binom * var_nbinom

        observed_delta = sum(deltas) / reps
        # Despite the name, the tolerance is five standard errors of the mean.
        conf_99 = ((var / reps)**(1 / 2)) * 5
        assert expected_delta - conf_99 < observed_delta < expected_delta + conf_99
Exemple #7
0
    def test_random_chromosome_length(self):
        """Check that freshly created chromosomes have the configured
           average length.

           The mean length of `reps` new chromosomes must lie within four
           standard errors of cfg.genetics.chromosome_length, where the
           standard error comes from the variance of nbinom(1, p) with
           p = 1 - L / (1 + L)."""
        reps = 1000
        cfg = AppSettings()
        lengths = [len(Chromosome().sequence) for _ in range(0, reps)]

        observed = float(sum(lengths)) / len(lengths)
        target = cfg.genetics.chromosome_length

        success_prob = 1 - (target / (1 + target))
        # Despite the original name "conf_99", this is four standard errors.
        half_width = (nbinom.var(1, success_prob) / reps)**(1 / 2) * 4
        lower = target - half_width
        upper = target + half_width
        assert lower <= observed <= upper
Exemple #8
0
    def test_insertion_length(self):
        """Tests that insertion mutations are of the correct length.

        Performs `reps` insertions on fresh chromosomes and asserts that the
        mean growth in sequence length lies within four standard errors of
        cfg.genetics.mutation_length.
        """
        cfg = AppSettings()
        reps = 1000
        deltas = []

        for _ in range(0, reps):
            dna = Chromosome()
            init_length = len(dna.sequence)
            dna.insertion()
            deltas.append(len(dna.sequence) - init_length)

        expected_delta = cfg.genetics.mutation_length

        # scipy's nbinom(n, p) has mean n * (1 - p) / p, so with n = 1 a mean
        # of expected_delta requires p = 1 / (1 + expected_delta), written
        # here as 1 - m / (1 + m). Using m / (1 + m) instead describes a
        # distribution with mean 1 / m, whose variance only matches when
        # m == 1.
        p = 1 - expected_delta / (1 + expected_delta)
        var = nbinom.var(1, p)

        # Despite the name, the tolerance is four standard errors of the mean.
        conf_99 = ((var / reps)**(1 / 2)) * 4
        observed_delta = (sum(deltas) / reps)
        assert (expected_delta - conf_99) < observed_delta < (expected_delta +
                                                              conf_99)
Exemple #9
0
    def test_deletion_length(self):
        """Test that deletions return the correct average length.

        Performs `reps` deletions on fresh chromosomes and asserts that the
        mean reduction in sequence length lies within ten standard errors
        of cfg.genetics.mutation_length.
        """
        cfg = AppSettings()
        reps = 1000
        deltas = []

        for _ in range(0, reps):
            dna = Chromosome()
            init_length = len(dna.sequence)
            dna.deletion()
            deltas.append(init_length - len(dna.sequence))

        expected_delta = cfg.genetics.mutation_length

        # scipy's nbinom(n, p) has mean n * (1 - p) / p, so with n = 1 a mean
        # of expected_delta requires p = 1 / (1 + expected_delta), written
        # here as 1 - m / (1 + m). Using m / (1 + m) instead describes a
        # distribution with mean 1 / m, whose variance only matches when
        # m == 1.
        p = 1 - expected_delta / (1 + expected_delta)
        var = nbinom.var(1, p)

        # Because there is a little slop around short strings or positions near the
        # end of the string, I multiply
        # the confidence by 10 just to limit the number of failing tests.
        conf_99 = ((var / reps)**(1 / 2)) * 10
        observed_delta = sum(deltas) / reps
        assert (expected_delta - conf_99) < observed_delta < (expected_delta +
                                                              conf_99)
Exemple #10
0
 def var(self, n, p):
     # Delegate directly to nbinom.var and return its result.
     # NOTE(review): `self` is forwarded as the first positional argument,
     # ahead of n and p -- confirm that this is what nbinom.var expects here.
     return nbinom.var(self, n, p)