def test_analyse_sample_zero(self):
    ''' check that analyse() raises an error for a de novo count of zero '''
    sampler = WeightedChoice()
    sampler.add_choice(200, 1e-5, 'A', 'G')
    sampler.add_choice(201, 2e-5, 'C', 'T')
    scores = [5, 10]
    # sampling zero de novos per iteration is meaningless, so must fail
    with self.assertRaises(ValueError):
        analyse(sampler, scores, 0, 0, iterations=10000)
def test_analyse_mismatch(self):
    ''' check that analyse() errors when rates and severity lengths differ '''
    sampler = WeightedChoice()
    sampler.add_choice(200, 1e-5, 'A', 'G')
    sampler.add_choice(201, 2e-5, 'C', 'T')
    # three severity scores but only two rate entries
    scores = [5, 10, 5]
    with self.assertRaises(ValueError):
        analyse(sampler, scores, 8, 1, iterations=100000)
def test_analyse(self):
    ''' check that the simulations run correctly '''
    sampler = WeightedChoice()
    sampler.add_choice(200, 1e-5, 'A', 'G')
    sampler.add_choice(201, 2e-5, 'C', 'T')
    sampler.add_choice(202, 1e-5, 'C', 'G')
    scores = [5, 10, 5]

    # an observed score of 8 sits at the midpoint of the simulated null
    # distribution, so the p-value should be ~0.5
    p = analyse(sampler, scores, 8, 1, iterations=100000)
    self.assertAlmostEqual(p, 0.5, places=2)

    # sampling two de novo mutations per iteration
    p = analyse(sampler, scores, 15, 2, iterations=100000)
    self.assertAlmostEqual(p, 0.25, places=2)
def test_analyse_bigger(self):
    ''' check a more realistically sized data set '''
    seed(0)
    sampler = WeightedChoice()
    # build ~2000 candidate sites with random positions and rates; the
    # fixed seed keeps the expected p-value reproducible
    positions = sorted(set([randint(1000, 3000) for _ in range(2000)]))
    for site in positions:
        sampler.add_choice(site, uniform(1e-10, 1e-7), 'A', 'G')
    scores = [randint(0, 40) for _ in positions]

    p = analyse(sampler, scores, 150, 4, iterations=10000)
    self.assertAlmostEqual(p, 3e-4, places=2)
def analyse_gene(ensembl, mut_dict, cadd, symbol, de_novos, constraint, weights):
    ''' analyse the severity of de novos found in a gene

    Args:
        ensembl: EnsemblRequest object, for transcript coordinates and sequence
        mut_dict: list of sequence-context mutation probabilities.
        cadd: pysam.TabixFile object for CADD scores (SNVs only)
        symbol: HGNC symbol for current gene
        de_novos: list of de novo mutations observed in current gene. Each
            entry is a dict with 'pos', 'ref', 'alt', and 'consequence' keys.
        constraint: regional constraint data, passed to
            get_constrained_positions() to identify constrained sites within
            the gene.
        weights: dictionary of objects to weight CADD severity scores. We have
            different weights for protein-truncating and protein-altering
            variants, and within the protein-altering variants, different
            weights for variants in constrained and unconstrained regions.

    Returns:
        p-value for the observed total severity with respect to a null
        distribution of severities for the gene, or 'NA' if no transcript can
        be found for the gene at the de novo sites.
    '''
    sites = [x['pos'] for x in de_novos]
    try:
        # create gene/transcript for de novo mutations
        transcripts = load_gene(ensembl, symbol, sites)
    except IndexError:
        # no usable transcript covering the sites; flag the gene as untestable
        return 'NA'

    # get per site/allele mutation rates
    rates_by_cq = get_site_sampler(transcripts, mut_dict)
    chrom = transcripts[0].get_chrom()

    # get per site/allele severity scores, weighted by enrichment of missense
    # in known dominant at different severity thresholds
    constrained = get_constrained_positions(ensembl, constraint, symbol)
    severity = get_severity(cadd, chrom, rates_by_cq, weights, constrained)

    # convert the rates per site per consequence to rates per site
    rates = WeightedChoice()
    for cq in sorted(rates_by_cq):
        rates.append(rates_by_cq[cq])

    # get summed score for observed de novos
    observed = sum(get_severity(cadd, chrom, de_novos, weights, constrained))

    # simulate distribution of summed scores within transcript
    return analyse(rates, severity, observed, len(de_novos), 1000000)
def test_analyse_extreme_p_value(self):
    ''' test when the observed severity score exceeds all possible values '''
    sampler = WeightedChoice()
    sampler.add_choice(200, 1e-5, 'A', 'G')
    sampler.add_choice(201, 2e-5, 'C', 'T')
    sampler.add_choice(202, 1e-5, 'C', 'G')
    scores = [5, 10, 5]

    # Here the observed severity score exceeds every value obtainable from
    # the severity distribution. This gives an absurd p-value of 1e-6, but
    # only because the observed value is unachievable given the existing
    # severity scores. In practice the observed score is always
    # theoretically achievable in the null distribution, since it is
    # calculated from the existing scores.
    p = analyse(sampler, scores, 20, 1, iterations=100000)
    self.assertAlmostEqual(p, 1e-6, places=4)
def test_analyse_empty(self):
    ''' check that empty rates and severity lists raise an error '''
    with self.assertRaises(ValueError):
        # a sampler with no choices and no severity scores cannot be analysed
        analyse(WeightedChoice(), [], 8, 1, iterations=10000)