def test_analyse_sample_zero(self):
        ''' test we raise an error if the de novo count is zero
        '''
        rates = WeightedChoice()
        rates.add_choice(200, 1e-5, 'A', 'G')
        rates.add_choice(201, 2e-5, 'C', 'T')

        severity = [5, 10]
        with self.assertRaises(ValueError):
            analyse(rates, severity, 0, 0, iterations=10000)

    def test_analyse_mismatch(self):
        ''' test for error when the rates and severity lengths are different
        '''

        rates = WeightedChoice()
        rates.add_choice(200, 1e-5, 'A', 'G')
        rates.add_choice(201, 2e-5, 'C', 'T')

        severity = [5, 10, 5]

        with self.assertRaises(ValueError):
            analyse(rates, severity, 8, 1, iterations=100000)
Example #3
    def test___init__(self):
        """ check that __init__() initiates the object correctly
        """

        choices = WeightedChoice()

        # check that an object without any possible choices has a cumulative
        # sum of 0, but returns a choice of -1
        self.assertEqual(choices.get_summed_rate(), 0)
        self.assertEqual(choices.choice(), -1)

        # check that the type is set correctly
        self.assertEqual(type(choices), WeightedChoice)
class TestSimulationsPy(unittest.TestCase):
    """ unit test the simulation functions
    """
    
    def setUp(self):
        """
        """
        
        # set up a range of possible positions, all with a uniform probability
        # of being selected
        self.choices = WeightedChoice()
        for x in range(1000):
            self.choices.add_choice(x, 0.0001)
        
        self.iterations = 100000
    
    def test_analyse_de_novos_dispersed(self):
        """ test analyse_de_novos() works correctly for dispersed de novos
        """
        
        # spread sites throughout a 1000 bp transcript
        positions = [100, 300, 600]
        distances = get_distances(positions)
        observed = geomean(distances)
        
        p_val = analyse_de_novos(self.choices, self.iterations, len(positions), observed)
        
        self.assertAlmostEqual(p_val, 0.635, places=2)
    
    def test_analyse_de_novos_clustered(self):
        """ test analyse_de_novos() works correctly for clustered de novos
        """
        
        # cluster sites within 20 bp in a 1000 bp transcript
        positions = [100, 110, 120]
        distances = get_distances(positions)
        observed = geomean(distances)
        
        p_val = analyse_de_novos(self.choices, 1000000, len(positions), observed)
        
        self.assertAlmostEqual(p_val, 0.002, places=3)
    
    def test_simulate_distribution(self):
        ''' check that simulate_distribution works correctly
        '''
        
        # repeated function calls should give different samples
        first = simulate_distribution(self.choices, iterations=5, de_novos_count=3)
        second = simulate_distribution(self.choices, iterations=5, de_novos_count=3)
        
        self.assertNotEqual(first, second)
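
The two clustering tests above lean on get_distances() and geomean(). As a
rough guide to what those helpers compute, here is a plain-Python sketch,
assuming pairwise distances and a plain geometric mean (the real code may
adjust for zero distances between de novos at the same position):

from itertools import combinations
from math import exp, log

def get_distances_sketch(positions):
    # absolute distance between every pair of de novo sites
    return [abs(a - b) for a, b in combinations(positions, 2)]

def geomean_sketch(distances):
    # geometric mean, computed as the exponent of the mean log distance
    return exp(sum(log(x) for x in distances) / len(distances))

# e.g. the clustered positions [100, 110, 120] give pairwise distances
# [10, 20, 10], with a geometric mean of about 12.6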
Example #6
    def test_choice(self):
        """ test that choice() works correctly.
        
        Since WeightedChoice is a weighted random sampler, we can't rely on
        getting exact values out, but repeated samples should return each value
        in proportion to its weight. The deviation from the expected proportion
        shrinks with larger sample sets, but at the cost of making the test
        take > 1 second for 1 million samples, or > 10 s for 10 million
        samples.
        """

        iterations = 1000000

        choices = WeightedChoice()
        choices.add_choice(1, 1)
        choices.add_choice(2, 5)
        s = [choices.choice() for x in range(iterations)]
        self.assertAlmostEqual(s.count(1) / len(s), 0.1667, places=2)

        # add another choice, then check that all of the choices have been
        # sampled at the expected proportions
        choices.add_choice(3, 4)
        s = [choices.choice() for x in range(iterations)]
        self.assertAlmostEqual(s.count(1) / len(s), 0.100, places=2)
        self.assertAlmostEqual(s.count(2) / len(s), 0.500, places=2)
        self.assertAlmostEqual(s.count(3) / len(s), 0.400, places=2)

        # check that all the choices have been made from the inserted values
        self.assertEqual(set(s), set([1, 2, 3]))

    def test_analyse_bigger(self):
        ''' test a more realistically sized data set
        '''

        seed(0)
        rates = WeightedChoice()
        pos = sorted(set([randint(1000, 3000) for x in range(2000)]))

        for x in pos:
            rates.add_choice(x, uniform(1e-10, 1e-7), 'A', 'G')

        severity = [randint(0, 40) for x in pos]

        p = analyse(rates, severity, 150, 4, iterations=10000)
        self.assertAlmostEqual(p, 3e-4, places=2)

def analyse_gene(ensembl, mut_dict, cadd, symbol, de_novos, constraint,
                 weights):
    ''' analyse the severity of de novos found in a gene
    
    Args:
        ensembl: EnsemblRequest object, for transcript coordinates and sequence
        mut_dict: list of sequence-context mutation probabilities.
        cadd: pysam.TabixFile object for CADD scores (SNVs only)
        symbol: HGNC symbol for current gene
        de_novos: list of de novo mutations observed in current gene. Each entry
            is a dict with 'pos', 'ref', 'alt', and 'consequence' keys.
        constraint: regional constraint data for the gene, used via
            get_constrained_positions() to identify constrained sites.
        weights: dictionary of objects to weight CADD severity scores. We have
            different weights for protein-truncating and protein-altering
            variants, and within the protein-altering variants, different
            weights for variants in constrained and unconstrained regions.
    
    Returns:
        p-value for the observed total severity with respect to a null
        distribution of severities for the gene.
    '''

    sites = [x['pos'] for x in de_novos]
    try:
        # create gene/transcript for de novo mutations
        transcripts = load_gene(ensembl, symbol, sites)
    except IndexError:
        return 'NA'

    # get per site/allele mutation rates
    rates_by_cq = get_site_sampler(transcripts, mut_dict)

    chrom = transcripts[0].get_chrom()

    # get per site/allele severity scores, weighted by the enrichment of
    # missense in known dominant genes at different severity thresholds
    constrained = get_constrained_positions(ensembl, constraint, symbol)
    severity = get_severity(cadd, chrom, rates_by_cq, weights, constrained)

    # convert the rates per site per consequence to rates per site
    rates = WeightedChoice()
    for cq in sorted(rates_by_cq):
        rates.append(rates_by_cq[cq])

    # get summed score for observed de novos
    observed = sum(get_severity(cadd, chrom, de_novos, weights, constrained))

    # simulate distribution of summed scores within transcript
    return analyse(rates, severity, observed, len(de_novos), 1000000)
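
For orientation, a hedged sketch of how analyse_gene() might be wired up. The
import path, constructor arguments, file path, gene symbol, and placeholder
inputs below are illustrative assumptions, not part of the source:

import pysam
from denovonear.ensembl_requester import EnsemblRequest  # assumed import path

ensembl = EnsemblRequest('cache', 'grch37')  # assumed constructor arguments
mut_dict = ...    # sequence-context mutation probabilities
cadd = pysam.TabixFile('cadd_snvs.tsv.gz')  # hypothetical CADD scores path
constraint = ...  # regional constraint data, per get_constrained_positions()
weights = ...     # severity weight objects, as described in the docstring

# de novos carry the 'pos'/'ref'/'alt'/'consequence' keys the code expects
de_novos = [{'pos': 1234567, 'ref': 'A', 'alt': 'G',
             'consequence': 'missense_variant'}]

p = analyse_gene(ensembl, mut_dict, cadd, 'ARID1B', de_novos,
                 constraint, weights)
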
def get_site_sampler(transcripts, mut_dict):
    ''' get per position and alt allele mutation probability sampler.
    
    We need to be able to sample each site within a gene, where the probability
    of sampling a given site is proportional to the sequence-context derived
    mutation probability. We use denovonear.weights.WeightedChoice for this,
    which wraps a C++ class for quick sampling. We use the SiteRates class to
    derive the per site/allele probabilities for different consequence
    categories. We combine the categories of interest into a single object, so
    we can sample across the full transcript at once. This also allows for
    multiple transcripts for a single gene, by taking the union of transcripts.
    
    Args:
        transcripts: list of Transcript objects for a gene.
        mut_dict: list of sequence-context mutation probabilities.
    
    Returns:
        dictionary of denovonear WeightedChoice objects, one per consequence
        category, containing the mutation probabilities per position and alt
        allele.
    '''

    consequences = ['nonsense', 'missense', 'splice_lof']
    all_rates = {}
    for cq in consequences:
        all_rates[cq] = WeightedChoice()

    combined_tx = None
    for tx in transcripts:

        rates = SiteRates(tx,
                          mut_dict,
                          masked_sites=combined_tx,
                          cds_coords=False)
        if combined_tx is None:
            combined_tx = tx
        else:
            combined_tx += tx

        for cq in consequences:
            all_rates[cq].append(rates[cq])

    return all_rates
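
The per-consequence samplers returned here can be merged back into a single
sampler, as analyse_gene() does above. A minimal sketch of that use, assuming
transcripts and mut_dict are already in hand:

rates_by_cq = get_site_sampler(transcripts, mut_dict)

combined = WeightedChoice()
for cq in sorted(rates_by_cq):
    combined.append(rates_by_cq[cq])

# draw a single mutated site along with its alleles, e.g.
# {'pos': 1234, 'ref': 'A', 'alt': 'G', 'offset': 0}
site = combined.choice_with_alleles()
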
    def test_analyse(self):
        ''' test that we run the simulations correctly
        '''

        rates = WeightedChoice()
        rates.add_choice(200, 1e-5, 'A', 'G')
        rates.add_choice(201, 2e-5, 'C', 'T')
        rates.add_choice(202, 1e-5, 'C', 'G')

        severity = [5, 10, 5]

        # define a test where the observed score will fall at the midpoint of
        # the simulated null distribution
        p = analyse(rates, severity, 8, 1, iterations=100000)
        self.assertAlmostEqual(p, 0.5, places=2)

        # now check when we sample two de novo mutations
        p = analyse(rates, severity, 15, 2, iterations=100000)
        self.assertAlmostEqual(p, 0.25, places=2)

    def test_analyse_extreme_p_value(self):
        ''' test when the observed severity score exceeds all possible values
        '''

        rates = WeightedChoice()
        rates.add_choice(200, 1e-5, 'A', 'G')
        rates.add_choice(201, 2e-5, 'C', 'T')
        rates.add_choice(202, 1e-5, 'C', 'G')

        severity = [5, 10, 5]

        # now check when the observed severity score exceeds all possible
        # values from the severity distribution. This test gives an absurdly
        # small p-value of 1e-6, but that is because the observed value is
        # unachievable given the existing severity scores. In practice the
        # observed score will always be theoretically achievable in the null
        # distribution, since the observed score is calculated from the
        # existing scores.
        p = analyse(rates, severity, 20, 1, iterations=100000)
        self.assertAlmostEqual(p, 1e-6, places=4)
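
The expected p-values above pin down what analyse() does: sample as many sites
as there are de novos in each iteration, sum their severity scores, and count
how often the simulated sum beats the observed one. Below is a self-contained
sketch of that comparison, using random.choices() in place of the package's
C++ sampler; the strict comparison reproduces the 0.25 expectation in
test_analyse(), while the floor on the returned p-value is an assumption (the
real implementation is compiled, and its tie and floor conventions are not
shown here):

import random

def analyse_sketch(probs, severity, observed, count, iterations=10000):
    ''' sketch: estimate P(simulated summed severity > observed) '''
    if len(probs) == 0 or len(probs) != len(severity):
        raise ValueError('rates and severity must be non-empty and matched')
    if count == 0:
        raise ValueError('need at least one de novo mutation')

    sites = range(len(probs))
    hits = 0
    for _ in range(iterations):
        sampled = random.choices(sites, weights=probs, k=count)
        # strictly greater: two draws over scores [5, 10, 5] give sums of
        # 10, 15 or 20, so P(sum > 15) = 0.25, as in test_analyse() above
        if sum(severity[i] for i in sampled) > observed:
            hits += 1
    # avoid reporting a p-value of exactly zero
    return max(hits, 1) / iterations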
Example #15
    def test_add_choice(self):
        """ test that add_choice() works correctly
        """

        # check the cumulative sum while adding in new values
        choices = WeightedChoice()
        choices.add_choice(1, 1)
        self.assertEqual(choices.get_summed_rate(), 1)
        choices.add_choice(2, 5)
        self.assertEqual(choices.get_summed_rate(), 6)
        choices.add_choice(3, 10)
        self.assertEqual(choices.get_summed_rate(), 16)

        # check that it works for unsorted probabilities
        choices = WeightedChoice()
        choices.add_choice(1, 1)
        choices.add_choice(2, 10)
        choices.add_choice(3, 5)
        self.assertEqual(choices.get_summed_rate(), 16)

        # check for very low values, with very high precision (but not
        # necessarily exactly equal)
        choices = WeightedChoice()
        choices.add_choice(1, 5e-9)
        choices.add_choice(2, 1e-8)
        choices.add_choice(3, 1.000000000000005e-10)
        self.assertAlmostEqual(choices.get_summed_rate(),
                               1.51000000000000005e-8,
                               places=23)
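
The cumulative sums asserted above are the basis of fast weighted sampling. A
pure-Python sketch of the standard technique the C++ extension is assumed to
use (binary search over the running totals); it also reproduces the
empty-object behaviour checked in test___init__():

import random
from bisect import bisect_right

class WeightedChoiceSketch:
    def __init__(self):
        self.names = []
        self.cumulative = []  # running totals of the weights

    def add_choice(self, name, prob):
        total = self.cumulative[-1] if self.cumulative else 0.0
        self.names.append(name)
        self.cumulative.append(total + prob)

    def get_summed_rate(self):
        return self.cumulative[-1] if self.cumulative else 0.0

    def choice(self):
        if not self.cumulative:
            return -1  # matches the empty-object check in test___init__()
        # pick a point along the summed weights, then binary search for the
        # choice whose cumulative interval contains it
        r = random.uniform(0.0, self.cumulative[-1])
        i = bisect_right(self.cumulative, r)
        return self.names[min(i, len(self.names) - 1)]
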
    def test_analyse_empty(self):
        ''' check we raise an error if the rates and severity are empty
        '''

        with self.assertRaises(ValueError):
            analyse(WeightedChoice(), [], 8, 1, iterations=10000)
Example #20
    def test_choice_small_numbers(self):
        """ test that choice() works correctly.
        """

        iterations = 1000000

        # very small numbers at the end still have expected proportions
        choices = WeightedChoice()
        choices.add_choice(1, 1)
        choices.add_choice(2, 5)
        choices.add_choice(3, 0.0001)
        s = [choices.choice() for x in range(iterations)]
        self.assertAlmostEqual(s.count(3) / len(s), 0.0001, places=3)

        # very small numbers at the start still have expected proportions
        choices = WeightedChoice()
        choices.add_choice(1, 0.0001)
        choices.add_choice(2, 1)
        choices.add_choice(3, 5)
        s = [choices.choice() for x in range(iterations)]
        self.assertAlmostEqual(s.count(1) / len(s), 0.0001, places=3)

        # check that the sampling works correctly at low weight values
        choices = WeightedChoice()

        numbers = range(1000, 3000)
        small = [x * 0.000000000001 for x in numbers]
        for (name, prob) in zip(numbers, small):
            choices.add_choice(name, prob)

        s = [choices.choice() for x in range(iterations)]
        self.assertAlmostEqual(s.count(numbers[0]) / len(s), 0.0001, places=3)
Example #21
    def test_choice_with_alleles(self):
        """ test that choice_with_alleles() works correctly.
        """

        # if you add a choice with alleles, then check that we get back alleles,
        # and that they are the same
        choices = WeightedChoice()
        choices.add_choice(1, 1, "A", "T")
        self.assertEqual(choices.choice_with_alleles(), {
            'alt': 'T',
            'ref': 'A',
            'pos': 1,
            'offset': 0
        })
        self.assertEqual(choices.choice(), 1)

        # if you add choices without alleles, then default the alleles to "N"
        choices = WeightedChoice()
        choices.add_choice(1, 1)
        self.assertEqual(choices.choice_with_alleles(), {
            'alt': 'N',
            'ref': 'N',
            'pos': 1,
            'offset': 0
        })

        # make sure you can't add multi-base alleles to the choices. Each call
        # gets its own assertRaises context, since the second call would never
        # run if placed after the first inside one block.
        with self.assertRaises(TypeError):
            choices.add_choice(1, 1, "AA", "A")
        with self.assertRaises(TypeError):
            choices.add_choice(1, 1, "A", "AG")

        # make sure non-zero offsets are returned correctly
        choices = WeightedChoice()
        choices.add_choice(1, 1, "A", "T", 3)
        self.assertEqual(choices.choice_with_alleles(), {
            'alt': 'T',
            'ref': 'A',
            'pos': 1,
            'offset': 3
        })
        self.assertEqual(choices.choice(), 1)
Example #22
    def test_append(self):
        """ test that append() works correctly
        """

        # construct two objects
        a = WeightedChoice()
        a.add_choice(1, 0.5)

        b = WeightedChoice()
        b.add_choice(2, 1)

        # add one object to the other
        a.append(b)

        # check that the first object has changed correctly, but the other
        # remains unchanged
        self.assertEqual(a.get_summed_rate(), 1.5)
        self.assertEqual(b.get_summed_rate(), 1.0)