Example #1
0
 def test_get_p_value_lofs(self):
     """ check for loss of function rates

     Runs get_p_value() once with the consequence 'loss_of_function' and
     once with 'lof', each on a separately constructed copy of the gene, and
     expects the same observed value and p-values within 0.017 of each other.
     """

     iterations = 10000
     de_novos = [5, 6]

     # construct two different tx objects, for the same gene, since otherwise
     # the p-values from different runs are the same
     tx1 = self.construct_gene()
     rates1 = self.get_rates(tx1)

     cq = 'loss_of_function'
     (obs_1, p_1) = get_p_value(tx1, rates1, iterations, cq, de_novos)

     tx2 = self.construct_gene()
     rates2 = self.get_rates(tx2)

     # make sure we can use the string 'lof' to get loss-of-function rates.
     # NOTE: due to randomly sampling, this will fail ~0.1% of the time,
     # purely by chance. If this fails, first try rerunning the tests.
     cq = 'lof'
     (obs_2, p_2) = get_p_value(tx2, rates2, iterations, cq, de_novos)

     self.assertEqual(obs_1, obs_2)
     # assertLess reports the actual difference when the check fails, which
     # a bare assertTrue on the comparison would hide
     self.assertLess(abs(p_1 - p_2), 0.017)
Example #2
0
 def test_get_p_value(self):
     """ check get_p_value() for two missense de novos at the same position

     Expects a p-value below 0.04 and an observed value of '0.0'.
     """

     iterations = 10000
     cq = 'missense'
     de_novos = [5, 5]
     (obs, p_value) = get_p_value(self.transcript, self.rates, iterations, cq, de_novos)

     # assertLess shows the offending p-value on failure, unlike assertTrue
     self.assertLess(p_value, 0.04)
     self.assertEqual(obs, '0.0')
Example #3
0
 def test_get_p_value_single_de_novo(self):
     """ check get_p_value() when only one de novo is supplied
     """

     # a transcript with a single de novo should not be assessed, so both
     # returned values are expected to be the 'NA' placeholder
     single_site = [5]
     n_sims = 10000

     obs, p_value = get_p_value(self.transcript, self.rates, n_sims, 'missense', single_site)

     self.assertEqual((obs, p_value), ('NA', 'NA'))
Example #4
0
 def test_get_p_value_nonsignificant(self):
     """ check for de novos spread across the gene

     With missense de novos at positions 5 and 58, expects a p-value of
     exactly 1.0 and an observed value of '53.0'.
     """

     iterations = 10000
     cq = 'missense'
     de_novos = [5, 58]

     (obs, p_value) = get_p_value(self.transcript, self.rates, iterations, cq, de_novos)

     # assertEqual reports the actual p-value on failure, unlike assertTrue
     self.assertEqual(p_value, 1.0)
     self.assertEqual(obs, '53.0')
Example #5
0
 def test_get_p_value_single_de_novo(self):
     """ check get_p_value() when only one de novo is supplied
     """

     # a transcript with a single de novo should not be assessed, so both
     # returned values are expected to be NaN
     single_site = [5]
     n_sims = 10000

     result = get_p_value(self.transcript, self.rates, n_sims, 'missense', single_site)
     obs, p_value = result

     for value in (obs, p_value):
         self.assertTrue(math.isnan(value))
Example #6
0
async def cluster_de_novos(symbol, de_novos, ensembl, iterations=1000000, mut_dict=None):
    """ test the de novos of a single gene for proximity clustering
    
    Args:
        symbol: HGNC symbol for a gene
        de_novos: dictionary of de novo positions for the gene, indexed by
            functional type. The "missense" and "nonsense" entries are used.
        ensembl: EnsemblRequest object, for obtaining info from ensembl
        iterations: number of simulations to run
        mut_dict: dictionary of mutation rates, indexed by trinucleotide
            sequence. Loaded via load_mutation_rates() when not supplied.
    
    Returns:
        None if loading the gene raised an IndexError (the error is printed).
        Otherwise a dictionary with "miss_prob" and "nons_prob" (the
        per-transcript P values passed through fishers_method()), plus
        "miss_dist" and "nons_dist" (the per-transcript distances, joined
        into comma-separated strings).
    """
    
    rates_by_context = load_mutation_rates() if mut_dict is None else mut_dict
    
    remaining_missense = de_novos["missense"]
    remaining_nonsense = de_novos["nonsense"]
    
    # fetch the minimal set of transcripts needed to contain all the de novos;
    # an IndexError means no suitable coding transcript could be found
    try:
        transcripts = await load_gene(ensembl, symbol, remaining_missense + remaining_nonsense)
    except IndexError as error:
        print(error)
        return None
    
    miss_probs, nons_probs = [], []
    miss_dists, nons_dists = [], []
    
    for tx in transcripts:
        in_tx_missense = get_de_novos_in_transcript(tx, remaining_missense)
        in_tx_nonsense = get_de_novos_in_transcript(tx, remaining_nonsense)
        
        site_rates = SiteRates(tx, rates_by_context)
        
        miss_dist, miss_prob = get_p_value(tx, site_rates, iterations, "missense", in_tx_missense)
        nons_dist, nons_prob = get_p_value(tx, site_rates, iterations, "lof", in_tx_nonsense)
        
        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)
        
        # drop the de novos that this transcript accounted for, so later
        # transcripts are analysed with independent events. NOTE THAT THIS
        # MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS IF
        # THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        remaining_missense = [pos for pos in remaining_missense if pos not in in_tx_missense]
        remaining_nonsense = [pos for pos in remaining_nonsense if pos not in in_tx_nonsense]
    
    # flatten the per-transcript distances into comma-separated strings
    joined_dists = {
        "miss_dist": ",".join(map(str, miss_dists)),
        "nons_dist": ",".join(map(str, nons_dists)),
    }
    
    # combine the per-transcript P values for each consequence type
    combined = {
        "miss_prob": fishers_method(miss_probs),
        "nons_prob": fishers_method(nons_probs),
    }
    combined.update(joined_dists)
    
    return combined
Example #7
0
def cluster_de_novos(symbol, de_novos, iterations=1000000, ensembl=None, mut_dict=None):
    """ test the de novos of a single gene for proximity clustering
    
    Args:
        symbol: HGNC symbol for a gene
        de_novos: dictionary of de novo positions for the gene, indexed by
            functional type. The "missense" and "nonsense" entries are used.
        iterations: number of simulations to run
        ensembl: EnsemblRequest object, for obtaining info from ensembl.
            Defaults to EnsemblRequest('cache', 'grch37') when not supplied.
        mut_dict: dictionary of mutation rates, indexed by trinucleotide
            sequence. Loaded via load_mutation_rates() when not supplied.
    
    Returns:
        None if loading the gene raised an IndexError (the error is printed).
        Otherwise the result of combine_p_values() on the per-transcript
        "miss_prob" and "nons_prob" lists, updated with "miss_dist" and
        "nons_dist" (the per-transcript distances, joined into
        comma-separated strings).
    """
    
    if ensembl is None:
        ensembl = EnsemblRequest('cache', 'grch37')
    
    rates_by_context = load_mutation_rates() if mut_dict is None else mut_dict
    
    remaining_missense = de_novos["missense"]
    remaining_nonsense = de_novos["nonsense"]
    
    # fetch the minimal set of transcripts needed to contain all the de novos;
    # an IndexError means no suitable coding transcript could be found
    try:
        transcripts = load_gene(ensembl, symbol, remaining_missense + remaining_nonsense)
    except IndexError as error:
        print(error)
        return None
    
    miss_probs, nons_probs = [], []
    miss_dists, nons_dists = [], []
    
    for tx in transcripts:
        in_tx_missense = get_de_novos_in_transcript(tx, remaining_missense)
        in_tx_nonsense = get_de_novos_in_transcript(tx, remaining_nonsense)
        
        site_rates = SiteRates(tx, rates_by_context)
        
        miss_dist, miss_prob = get_p_value(tx, site_rates, iterations, "missense", in_tx_missense)
        nons_dist, nons_prob = get_p_value(tx, site_rates, iterations, "lof", in_tx_nonsense)
        
        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)
        
        # drop the de novos that this transcript accounted for, so later
        # transcripts are analysed with independent events. NOTE THAT THIS
        # MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS IF
        # THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        remaining_missense = [pos for pos in remaining_missense if pos not in in_tx_missense]
        remaining_nonsense = [pos for pos in remaining_nonsense if pos not in in_tx_nonsense]
    
    # flatten the per-transcript distances into comma-separated strings
    joined_dists = {
        "miss_dist": ",".join(map(str, miss_dists)),
        "nons_dist": ",".join(map(str, nons_dists)),
    }
    
    # combine the per-transcript P values for each consequence type
    combined = combine_p_values({"miss_prob": miss_probs, "nons_prob": nons_probs})
    combined.update(joined_dists)
    
    return combined