def test_analyse_sample_zero(self):
        ''' analyse() must raise an error when the de novo count is zero
        '''
        choices = WeightedChoice()
        choices.add_choice(200, 1e-5, 'A', 'G')
        choices.add_choice(201, 2e-5, 'C', 'T')

        scores = [5, 10]

        # sampling zero de novo mutations is meaningless, so expect an error
        with self.assertRaises(ValueError):
            analyse(choices, scores, 0, 0, iterations=10000)
    def test_analyse_mismatch(self):
        ''' analyse() must reject rates and severity lists of unequal length
        '''
        choices = WeightedChoice()
        choices.add_choice(200, 1e-5, 'A', 'G')
        choices.add_choice(201, 2e-5, 'C', 'T')

        # three severity scores, but only two rate entries
        scores = [5, 10, 5]

        with self.assertRaises(ValueError):
            analyse(choices, scores, 8, 1, iterations=100000)
    def test_analyse(self):
        ''' check that the simulations produce the expected p-values
        '''
        choices = WeightedChoice()
        for site, rate, ref, alt in [(200, 1e-5, 'A', 'G'),
                (201, 2e-5, 'C', 'T'), (202, 1e-5, 'C', 'G')]:
            choices.add_choice(site, rate, ref, alt)

        scores = [5, 10, 5]

        # an observed score at the midpoint of the simulated null
        # distribution should give a p-value of 0.5
        p = analyse(choices, scores, 8, 1, iterations=100000)
        self.assertAlmostEqual(p, 0.5, places=2)

        # check the p-value when two de novo mutations are sampled
        p = analyse(choices, scores, 15, 2, iterations=100000)
        self.assertAlmostEqual(p, 0.25, places=2)
    def test_analyse_bigger(self):
        ''' run the analysis on a more realistically sized data set
        '''
        # fix the seed so the randomly built data set is reproducible
        seed(0)

        # build up to 2000 unique sites with small, random mutation rates
        choices = WeightedChoice()
        sites = sorted(set([randint(1000, 3000) for i in range(2000)]))

        for site in sites:
            choices.add_choice(site, uniform(1e-10, 1e-7), 'A', 'G')

        scores = [randint(0, 40) for i in sites]

        p = analyse(choices, scores, 150, 4, iterations=10000)
        self.assertAlmostEqual(p, 3e-4, places=2)
def analyse_gene(ensembl, mut_dict, cadd, symbol, de_novos, constraint,
                 weights):
    ''' analyse the severity of de novos found in a gene
    
    Args:
        ensembl: EnsemblRequest object, for transcript coordinates and sequence
        mut_dict: list of sequence-context mutation probabilities.
        cadd: pysam.TabixFile object for CADD scores (SNVs only)
        symbol: HGNC symbol for current gene
        de_novos: list of de novo mutations observed in current gene. Each entry
            is a dict with 'pos', 'ref', 'alt', and 'consequence' keys.
        constraint: regional constraint data for the gene, passed through to
            get_constrained_positions() to identify constrained positions.
        weights: dictionary of objects to weight CADD severity scores. We have
            different weights for protein-truncating and protein-altering
            variants, and within the protein-altering variants, different
            weights for variants in constrained and unconstrained regions.
    
    Returns:
        p-value for the observed total severity with respect to a null
        distribution of severities for the gene, or the string 'NA' when no
        transcript can be loaded for the gene.
    '''

    sites = [x['pos'] for x in de_novos]
    try:
        # create gene/transcript for de novo mutations
        transcripts = load_gene(ensembl, symbol, sites)
    except IndexError:
        # no usable transcript could be found, so no p-value can be computed
        return 'NA'

    # get per site/allele mutation rates
    rates_by_cq = get_site_sampler(transcripts, mut_dict)

    chrom = transcripts[0].get_chrom()

    # get per site/allele severity scores, weighted by enrichment of missense
    # in known dominant at different severity thresholds
    constrained = get_constrained_positions(ensembl, constraint, symbol)
    severity = get_severity(cadd, chrom, rates_by_cq, weights, constrained)

    # convert the rates per site per consequence to rates per site
    # (sorted so the order of appended choices is deterministic)
    rates = WeightedChoice()
    for cq in sorted(rates_by_cq):
        rates.append(rates_by_cq[cq])

    # get summed score for observed de novos
    observed = sum((get_severity(cadd, chrom, de_novos, weights, constrained)))

    # simulate distribution of summed scores within transcript
    return analyse(rates, severity, observed, len(de_novos), 1000000)
    def test_analyse_extreme_p_value(self):
        ''' check the p-value when the observed score exceeds all null values
        '''
        choices = WeightedChoice()
        choices.add_choice(200, 1e-5, 'A', 'G')
        choices.add_choice(201, 2e-5, 'C', 'T')
        choices.add_choice(202, 1e-5, 'C', 'G')

        scores = [5, 10, 5]

        # An observed severity beyond anything in the null distribution gives
        # an absurd p-value of 1e-6, because the observed value is
        # unachievable given the existing severity scores. In practice the
        # observed score will always be theoretically achievable in the null
        # distribution, since the observed score is calculated from the
        # existing scores.
        p = analyse(choices, scores, 20, 1, iterations=100000)
        self.assertAlmostEqual(p, 1e-6, places=4)
    def test_analyse_empty(self):
        ''' analyse() must raise an error if the rates and severity are empty
        '''
        # with no sites to sample from, the simulation cannot run
        with self.assertRaises(ValueError):
            analyse(WeightedChoice(), [], 8, 1, iterations=10000)