def test_get_p_value_lofs(self):
    """Check that 'lof' and 'loss_of_function' give matching results.

    get_p_value() is run once per consequence string, each time on a freshly
    constructed transcript, and the two results are compared.

    NOTE: because the p-values come from random sampling, the original
    author estimated that this fails ~0.1% of the time purely by chance.
    If it fails, first try rerunning the tests.
    """
    n_simulations = 10000
    positions = [5, 6]

    # Use a separate transcript object (for the same gene) per run, since
    # otherwise the p-values from different runs are the same.
    long_tx = self.construct_gene()
    long_rates = self.get_rates(long_tx)
    long_obs, long_p = get_p_value(
        long_tx, long_rates, n_simulations, 'loss_of_function', positions)

    # Second run: the short 'lof' string should select the same rates.
    short_tx = self.construct_gene()
    short_rates = self.get_rates(short_tx)
    short_obs, short_p = get_p_value(
        short_tx, short_rates, n_simulations, 'lof', positions)

    self.assertEqual(long_obs, short_obs)
    # The two simulated p-values only need to agree within sampling noise.
    self.assertTrue(abs(long_p - short_p) < 0.017)
def test_get_p_value(self):
    """Check get_p_value() for two missense de novos at the same position.

    Both de novos sit at position 5; the test expects the observed value
    to be the string '0.0' and the p-value to fall below 0.04.
    """
    n_simulations = 10000
    positions = [5, 5]

    obs, p_value = get_p_value(
        self.transcript, self.rates, n_simulations, 'missense', positions)

    self.assertTrue(p_value < 0.04)
    self.assertEqual(obs, '0.0')
def test_get_p_value_single_de_novo(self):
    """Check that a transcript with a single de novo is not assessed.

    With only one de novo, get_p_value() is expected to return the
    placeholder pair ('NA', 'NA') rather than a distance and p-value.
    """
    # NOTE(review): a second definition with this same name appears later
    # in this file and expects NaN values instead of 'NA'. If both live in
    # the same class the later one replaces this one - confirm which
    # expectation is current.
    n_simulations = 10000
    lone_position = [5]
    placeholder = ('NA', 'NA')

    obs, p_value = get_p_value(
        self.transcript, self.rates, n_simulations, 'missense', lone_position)

    self.assertEqual((obs, p_value), placeholder)
def test_get_p_value_nonsignificant(self):
    """Check get_p_value() for de novos spread across the gene.

    The two missense de novos are at positions 5 and 58; the test expects
    the observed value to be the string '53.0' and a p-value of exactly 1.0.
    """
    n_simulations = 10000
    spread_positions = [5, 58]

    obs, p_value = get_p_value(
        self.transcript, self.rates, n_simulations, 'missense',
        spread_positions)

    self.assertTrue(p_value == 1.0)
    self.assertEqual(obs, '53.0')
def test_get_p_value_single_de_novo(self):
    """Check that a transcript with a single de novo is not assessed.

    With only one de novo, get_p_value() is expected to return NaN for
    both the observed value and the p-value.
    """
    # NOTE(review): an earlier definition with this same name appears in
    # this file and expects ('NA', 'NA') instead of NaN. If both live in the
    # same class this one replaces the earlier one - confirm which
    # expectation is current.
    n_simulations = 10000
    lone_position = [5]

    obs, p_value = get_p_value(
        self.transcript, self.rates, n_simulations, 'missense', lone_position)

    self.assertTrue(math.isnan(obs))
    self.assertTrue(math.isnan(p_value))
async def cluster_de_novos(symbol, de_novos, ensembl, iterations=1000000, mut_dict=None):
    """Analyse proximity clustering of de novos within a single gene.

    Args:
        symbol: HGNC symbol for a gene.
        de_novos: dictionary of de novo positions for the gene, indexed by
            functional type. The "missense" and "nonsense" entries are read.
        ensembl: EnsemblRequest object, passed to load_gene() for obtaining
            info from ensembl.
        iterations: number of simulations to run, passed to get_p_value().
        mut_dict: dictionary of mutation rates, indexed by trinucleotide
            sequence. Loaded with load_mutation_rates() when None.

    Returns:
        A dictionary with the keys "miss_prob" and "nons_prob" (the
        per-transcript P values combined with fishers_method()) and
        "miss_dist" and "nons_dist" (the per-transcript distances as
        comma-separated strings). Returns None, after printing the error,
        if load_gene() raises an IndexError. How missing values are
        represented is determined by get_p_value().
    """
    # NOTE(review): this file also contains a non-async definition named
    # cluster_de_novos - confirm which one is meant to be in use.
    if mut_dict is None:
        mut_dict = load_mutation_rates()

    pending_missense = de_novos["missense"]
    pending_nonsense = de_novos["nonsense"]

    # Per the original author, load_gene() gives the minimum set of
    # transcripts required to contain all the de novos, and signals with an
    # IndexError when no coding transcript contains them.
    try:
        transcripts = await load_gene(
            ensembl, symbol, pending_missense + pending_nonsense)
    except IndexError as error:
        print(error)
        return None

    miss_dists, nons_dists = [], []
    miss_probs, nons_probs = [], []
    for tx in transcripts:
        tx_missense = get_de_novos_in_transcript(tx, pending_missense)
        tx_nonsense = get_de_novos_in_transcript(tx, pending_nonsense)
        site_rates = SiteRates(tx, mut_dict)

        miss_dist, miss_prob = get_p_value(
            tx, site_rates, iterations, "missense", tx_missense)
        nons_dist, nons_prob = get_p_value(
            tx, site_rates, iterations, "lof", tx_nonsense)

        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)

        # Drop the de novos analysed in this transcript so that subsequent
        # transcripts are analysed with independent events. NOTE THAT THIS
        # MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS
        # IF THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        pending_missense = [
            pos for pos in pending_missense if pos not in tx_missense]
        pending_nonsense = [
            pos for pos in pending_nonsense if pos not in tx_nonsense]

    # Distances are reported as one comma-separated string per class.
    joined_dists = {
        "miss_dist": ",".join(map(str, miss_dists)),
        "nons_dist": ",".join(map(str, nons_dists)),
    }

    # Collapse the per-transcript P values into a single value per class.
    summary = {
        "miss_prob": fishers_method(miss_probs),
        "nons_prob": fishers_method(nons_probs),
    }
    summary.update(joined_dists)

    return summary
def cluster_de_novos(symbol, de_novos, iterations=1000000, ensembl=None, mut_dict=None):
    """Analyse proximity clustering of de novos within a single gene.

    Args:
        symbol: HGNC symbol for a gene.
        de_novos: dictionary of de novo positions for the gene, indexed by
            functional type. The "missense" and "nonsense" entries are read.
        iterations: number of simulations to run, passed to get_p_value().
        ensembl: EnsemblRequest object, passed to load_gene() for obtaining
            info from ensembl. When None, EnsemblRequest('cache', 'grch37')
            is created.
        mut_dict: dictionary of mutation rates, indexed by trinucleotide
            sequence. Loaded with load_mutation_rates() when None.

    Returns:
        The result of combine_p_values() on the per-transcript "miss_prob"
        and "nons_prob" lists, updated with "miss_dist" and "nons_dist" (the
        per-transcript distances as comma-separated strings). Returns None,
        after printing the error, if load_gene() raises an IndexError. How
        missing values are represented is determined by get_p_value().
    """
    # NOTE(review): this file also contains an async definition named
    # cluster_de_novos - confirm which one is meant to be in use.
    if ensembl is None:
        ensembl = EnsemblRequest('cache', 'grch37')

    if mut_dict is None:
        mut_dict = load_mutation_rates()

    pending_missense = de_novos["missense"]
    pending_nonsense = de_novos["nonsense"]

    # Per the original author, load_gene() gives the minimum set of
    # transcripts required to contain all the de novos, and signals with an
    # IndexError when no coding transcript contains them.
    try:
        transcripts = load_gene(
            ensembl, symbol, pending_missense + pending_nonsense)
    except IndexError as error:
        print(error)
        return None

    miss_dists, nons_dists = [], []
    miss_probs, nons_probs = [], []
    for tx in transcripts:
        tx_missense = get_de_novos_in_transcript(tx, pending_missense)
        tx_nonsense = get_de_novos_in_transcript(tx, pending_nonsense)
        site_rates = SiteRates(tx, mut_dict)

        miss_dist, miss_prob = get_p_value(
            tx, site_rates, iterations, "missense", tx_missense)
        nons_dist, nons_prob = get_p_value(
            tx, site_rates, iterations, "lof", tx_nonsense)

        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)

        # Drop the de novos analysed in this transcript so that subsequent
        # transcripts are analysed with independent events. NOTE THAT THIS
        # MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS
        # IF THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        pending_missense = [
            pos for pos in pending_missense if pos not in tx_missense]
        pending_nonsense = [
            pos for pos in pending_nonsense if pos not in tx_nonsense]

    # Distances are reported as one comma-separated string per class.
    joined_dists = {
        "miss_dist": ",".join(map(str, miss_dists)),
        "nons_dist": ",".join(map(str, nons_dists)),
    }

    # Hand the per-transcript P values, keyed by class, to the combiner.
    per_transcript_probs = {"miss_prob": miss_probs, "nons_prob": nons_probs}
    summary = combine_p_values(per_transcript_probs)
    summary.update(joined_dists)

    return summary