def get_rates_by_constraint(constraint, cache_dir, threshold=1e-4, ratio=1.0):
    ''' collect per-gene mutation rates inside and outside constrained regions

    Args:
        constraint: pandas DataFrame of regional constraint; grouped by its
            'transcript' column, and its 'gene' and 'chr' columns are read.
        cache_dir: folder handed to EnsemblRequest for caching.
        threshold: passed through to get_constrained_positions().
        ratio: passed through to get_constrained_positions().

    Returns:
        dict with 'constrained' and 'unconstrained' keys, each a list holding
        one rates entry per transcript (annotated with 'symbol', 'chrom' and
        'length').
    '''
    categories = ('constrained', 'unconstrained')
    consequences = ('nonsense', 'missense', 'synonymous', 'splice_lof',
        'splice_region')

    rates = {category: [] for category in categories}
    mut_dict = load_mutation_rates()
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    for tx_id, group in constraint.groupby('transcript'):
        # transcript IDs may carry a version suffix, which is dropped here
        unversioned_id = tx_id.split('.')[0]
        tx = construct_gene_object(ensembl, unversioned_id)
        sites = SiteRates(tx, mut_dict)
        constrained_sites = get_constrained_positions(tx, group, threshold,
            ratio)

        # hand over a fresh list for every transcript
        gene_rates = get_gene_rates(tx, sites, list(consequences),
            constrained_sites)

        # every row in the group shares the gene and chromosome, so the first
        # row is representative
        symbol = group['gene'].tolist()[0]
        chrom = group['chr'].tolist()[0]

        # now add the gene rates to the larger list of all genes
        for category in categories:
            entry = gene_rates[category]
            entry['symbol'] = symbol
            entry['chrom'] = chrom
            entry['length'] = tx.chrom_pos_to_cds(tx.get_cds_end())['pos']
            rates[category].append(entry)

    return rates
def main():
    ''' run the per-gene severity analysis and write one P value per gene

    Reads the command line options, loads the regional constraint, the de
    novo mutations and the mutation rates, then writes a tab-separated table
    (symbol, severity_p_value) to args.output.
    '''
    args = get_options()

    ensembl = EnsemblRequest(args.cache, args.genome_build)
    cadd = pysam.TabixFile(args.cadd)

    try:
        constraint = load_regional_constraint(args.constraint)

        # open de novo mutations
        all_de_novos = open_mutations(args.de_novos)
        mut_dict = load_mutation_rates()

        # the context manager guarantees the results are flushed and the handle
        # is closed, even if analysing one of the genes raises
        with open(args.output, 'w') as output:
            output.write('symbol\tseverity_p_value\n')

            for symbol in all_de_novos:
                # skip entries without a usable gene symbol
                if symbol in ['', '.']:
                    continue

                print(symbol)
                de_novos = all_de_novos[symbol]
                p_value = analyse_gene(ensembl, mut_dict, cadd, symbol,
                    de_novos, constraint, WEIGHTS)
                output.write('{}\t{}\n'.format(symbol, p_value))
    finally:
        # release the tabix file handle whether or not the analysis succeeded
        cadd.close()
def annotate_constraint(data, constraint_path, threshold=1e-3, ratio=0.4,
        cache_dir=None):
    ''' annotate per-site rates by whether the site is under regional constraint

    Args:
        data: pandas DataFrame of per-site rates; its 'symbol' and 'pos'
            columns are read. NOTE: a 'constrained' column is added to (or
            reset on) this frame in place, as before.
        constraint_path: path handed to load_regional_constraint().
        threshold: passed through to get_constrained_positions().
        ratio: passed through to get_constrained_positions().
        cache_dir: folder handed to EnsemblRequest for caching. When omitted,
            a module-level `cache_dir` is used if one is defined (the earlier
            behaviour), otherwise 'cache'.

    Returns:
        a new DataFrame with one row per input row that has a symbol, where
        'constrained' is True for positions inside constrained regions. Empty
        input gives back an (empty) copy of data rather than raising.
    '''
    # the earlier code read `cache_dir` without defining it, which raises a
    # NameError unless the module happens to define a global of that name
    if cache_dir is None:
        cache_dir = globals().get('cache_dir', 'cache')

    # default to unconstrained
    data['constrained'] = False

    constraint = load_regional_constraint(constraint_path)
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    # build the lookup once, rather than once per gene
    constrained_genes = set(constraint['gene'])

    modified = []
    for symbol, group in data.groupby('symbol'):
        if symbol in constrained_genes:
            regional = constraint[constraint['gene'] == symbol]
            tx_id = list(regional['transcript'])[0]
            # transcript IDs may carry a version suffix, which is dropped here
            tx = construct_gene_object(ensembl, tx_id.split('.')[0])
            sites = get_constrained_positions(tx, regional, threshold, ratio)
        else:
            sites = set()

        # work on an explicit copy, so we never write into a slice of data
        group = group.copy()
        gene_constraint = group['constrained'].copy()
        gene_constraint.loc[group['pos'].isin(sites)] = True
        group['constrained'] = gene_constraint
        modified.append(group)

    # pandas.concat() raises a ValueError when given nothing to concatenate
    if not modified:
        return data.copy()

    return pandas.concat(modified)
def main():
    ''' parse the command line options and dispatch to the chosen sub-command

    The function selected on the command line (args.func) is called with the
    Ensembl requester, the mutation rates, the open output handle and the
    parsed arguments.
    '''
    args = get_options()

    ensembl = EnsemblRequest(args.cache_folder, args.genome_build.lower())
    mut_dict = load_mutation_rates(args.rates)

    # the context manager makes sure the output is flushed and closed, even if
    # the sub-command raises part way through
    with open(args.out, "wt") as output:
        args.func(ensembl, mut_dict, output, args)
def main():
    ''' write per-site mutation rates for the known dominant genes

    Loads the genes from args.known, gets the rates for each gene, and writes
    them as a single gzipped, tab-separated table to args.output.
    '''
    args = get_options()

    ensembl = EnsemblRequest('cache', 'grch37')
    mut_dict = load_mutation_rates()

    dominant = load_dominant(args.known)

    # an empty frame fixes the column order, and still gives a header line if
    # there are no genes to process
    data = pandas.DataFrame(
        columns=['symbol', 'chrom', 'pos', 'ref', 'alt', 'cq', 'prob'])
    data['pos'] = data['pos'].astype(int)

    # DataFrame.append() was removed in pandas 2.0, and copied the whole table
    # on every call; collect the per-gene tables and join them once instead
    frames = [data]
    for symbol in dominant:
        print(symbol)
        rates = get_gene_rates(symbol, ensembl, mut_dict)
        # NOTE(review): assumes get_gene_rates() returns a DataFrame (or a list
        # of row dicts) - confirm against its definition
        frames.append(pandas.DataFrame(rates))

    data = pandas.concat(frames, ignore_index=True)

    with gzip.open(args.output, 'wt') as handle:
        data.to_csv(handle, sep='\t', index=False)
def setUpClass(cls):
    """ create a temporary cache folder and an EnsemblRequest that uses it

    unittest calls this hook on the test class rather than on an instance,
    so the first argument is named cls.
    NOTE(review): the @classmethod decorator is expected directly above this
    definition - confirm.
    """
    cls.temp_dir = tempfile.mkdtemp()
    cls.ensembl = EnsemblRequest(cls.temp_dir, genome_build="grch37")
class TestEnsemblRequestPy(unittest.TestCase):
    """ unit test the EnsemblRequest class

    The requester is built once for the whole class, caching into a temporary
    folder that is deleted after the last test has run.
    """

    @classmethod
    def setUpClass(cls):
        """ create the temporary cache folder and the shared requester
        """
        cls.temp_dir = tempfile.mkdtemp()
        cls.ensembl = EnsemblRequest(cls.temp_dir, genome_build="grch37")

    @classmethod
    def tearDownClass(cls):
        """ remove the temporary cache folder
        """
        shutil.rmtree(cls.temp_dir)

    def test_open_url(self):
        """ test that open_url() works correctly
        """

        headers = {"Content-Type": "application/json"}
        # the URL is given in full, so the expected record below reports the
        # assembly served by rest.ensembl.org (GRCh38)
        url = "http://rest.ensembl.org/overlap/id/ENSG00000172320?feature=gene"

        # keep the response headers apart from the request headers
        (response, status_code, response_headers) = self.ensembl.open_url(
            url, headers)

        response = json.loads(response)

        self.assertEqual(status_code, 200)
        self.assertEqual(response, [{
            "source": "ensembl_havana",
            "logic_name": "ensembl_havana_gene",
            "feature_type": "gene",
            "external_name": "OR5A1",
            "seq_region_name": "11",
            "strand": 1,
            "id": "ENSG00000172320",
            "gene_id": "ENSG00000172320",
            "version": 3,
            "assembly_name": "GRCh38",
            "description": "olfactory receptor family 5 subfamily A member 1 [Source:HGNC Symbol;Acc:HGNC:8319]",
            "end": 59451380,
            "biotype": "protein_coding",
            "start": 59436469}]
            )

    def test_get_genes_for_hgnc_id(self):
        """ test that get_genes_for_hgnc_id() works correctly
        """

        genes = self.ensembl.get_genes_for_hgnc_id("KMT2A")

        self.assertEqual(genes, ['ENSG00000118058', 'ENSG00000267910'])

    def test_get_previous_symbol(self):
        """ test that get_previous_symbol() works correctly
        """

        prev = self.ensembl.get_previous_symbol("KMT2A")
        self.assertEqual(prev, ["MLL"])

        # make a check for a gene with multiple documents, to check that we
        # don't raise an error
        prev = self.ensembl.get_previous_symbol("KRT16P1")
        self.assertEqual(prev, ["KRT14P"])

    def test_get_transcript_ids_for_ensembl_gene_ids(self):
        """ test that get_transcript_ids_for_ensembl_gene_ids() works correctly
        """

        hgnc = ["KMT2A", "MLL"]
        ensg = ['ENSG00000118058', 'ENSG00000267910']
        enst = self.ensembl.get_transcript_ids_for_ensembl_gene_ids(ensg, hgnc)

        # compare as sets, so the order of the transcripts does not matter
        self.assertEqual(set(enst), {'ENST00000534358', 'ENST00000531904',
            'ENST00000389506', 'ENST00000354520', 'ENST00000532204',
            'ENST00000529852', 'ENST00000527869', 'ENST00000533790',
            'ENST00000392873'})

    def test_get_genomic_seq_for_transcript(self):
        """ check that get_genomic_seq_for_transcript() works correctly
        """

        seq = self.ensembl.get_genomic_seq_for_transcript("ENST00000302030",
            expand=0)

        self.assertEqual(seq, ('11', 59210617, 59211667, '+', 'CTTGTCCTTGTGGTCC'
            'ACGGGAAGCATGTCCATAACCAAAGCCTGGAACAGCTCATCAGTGACCATGTTCATCCTCCTGGGA'
            'TTCACAGACCATCCAGAACTCCAGGCCCTCCTCTTTGTGACCTTCCTGGGCATCTATCTTACCACC'
            'CTGGCCTGGAACCTGGCCCTCATTTTTCTGATCAGAGGTGACACCCATCTGCACACACCCATGTAC'
            'TTCTTCCTAAGCAACTTATCTTTCATTGACATCTGCTACTCTTCTGCTGTGGCTCCCAATATGCTC'
            'ACTGACTTCTTCTGGGAGCAGAAGACCATATCATTTGTGGGCTGTGCTGCTCAGTTTTTTTTCTTT'
            'GTCGGCATGGGTCTGTCTGAGTGCCTCCTCCTGACTGCTATGGCATACGACCGATATGCAGCCATC'
            'TCCAGCCCCCTTCTCTACCCCACTATCATGACCCAGGGCCTCTGTACACGCATGGTGGTTGGGGCA'
            'TATGTTGGTGGCTTCCTGAGCTCCCTGATCCAGGCCAGCTCCATATTTAGGCTTCACTTTTGCGGA'
            'CCCAACATCATCAACCACTTCTTCTGCGACCTCCCACCAGTCCTGGCTCTGTCTTGCTCTGACACC'
            'TTCCTCAGTCAAGTGGTGAATTTCCTCGTGGTGGTCACTGTCGGAGGAACATCGTTCCTCCAACTC'
            'CTTATCTCCTATGGTTACATAGTGTCTGCGGTCCTGAAGATCCCTTCAGCAGAGGGCCGATGGAAA'
            'GCCTGCAACACGTGTGCCTCGCATCTGATGGTGGTGACTCTGCTGTTTGGGACAGCCCTTTTCGTG'
            'TACTTGCGACCCAGCTCCAGCTACTTGCTAGGCAGGGACAAGGTGGTGTCTGTTTTCTATTCATTG'
            'GTGATCCCCATGCTGAACCCTCTCATTTACAGTTTGAGGAACAAAGAGATCAAGGATGCCCTGTGG'
            'AAGGTGTTGGAAAGGAAGAAAGTGTTTTCTTAGGTCATGCGTAGAAACTTATTTATCCAAACTGCT'
            'GGAGAATTAAACAATCCAAGCCTTCACCTCCACCTCTGCCTCAGG'))

    def test_get_cds_seq_for_transcript(self):
        """ check that get_cds_seq_for_transcript() works correctly
        """

        seq = self.ensembl.get_cds_seq_for_transcript("ENST00000302030")

        self.assertEqual(seq, 'ATGTCCATAACCAAAGCCTGGAACAGCTCATCAGTGACCATGTTCATC'
            'CTCCTGGGATTCACAGACCATCCAGAACTCCAGGCCCTCCTCTTTGTGACCTTCCTGGGCATCTAT'
            'CTTACCACCCTGGCCTGGAACCTGGCCCTCATTTTTCTGATCAGAGGTGACACCCATCTGCACACA'
            'CCCATGTACTTCTTCCTAAGCAACTTATCTTTCATTGACATCTGCTACTCTTCTGCTGTGGCTCCC'
            'AATATGCTCACTGACTTCTTCTGGGAGCAGAAGACCATATCATTTGTGGGCTGTGCTGCTCAGTTT'
            'TTTTTCTTTGTCGGCATGGGTCTGTCTGAGTGCCTCCTCCTGACTGCTATGGCATACGACCGATAT'
            'GCAGCCATCTCCAGCCCCCTTCTCTACCCCACTATCATGACCCAGGGCCTCTGTACACGCATGGTG'
            'GTTGGGGCATATGTTGGTGGCTTCCTGAGCTCCCTGATCCAGGCCAGCTCCATATTTAGGCTTCAC'
            'TTTTGCGGACCCAACATCATCAACCACTTCTTCTGCGACCTCCCACCAGTCCTGGCTCTGTCTTGC'
            'TCTGACACCTTCCTCAGTCAAGTGGTGAATTTCCTCGTGGTGGTCACTGTCGGAGGAACATCGTTC'
            'CTCCAACTCCTTATCTCCTATGGTTACATAGTGTCTGCGGTCCTGAAGATCCCTTCAGCAGAGGGC'
            'CGATGGAAAGCCTGCAACACGTGTGCCTCGCATCTGATGGTGGTGACTCTGCTGTTTGGGACAGCC'
            'CTTTTCGTGTACTTGCGACCCAGCTCCAGCTACTTGCTAGGCAGGGACAAGGTGGTGTCTGTTTTC'
            'TATTCATTGGTGATCCCCATGCTGAACCCTCTCATTTACAGTTTGAGGAACAAAGAGATCAAGGAT'
            'GCCCTGTGGAAGGTGTTGGAAAGGAAGAAAGTGTTTTCTTAG')

    def test_get_protein_seq_for_transcript(self):
        """ test that get_protein_seq_for_transcript() works correctly
        """

        seq = self.ensembl.get_protein_seq_for_transcript("ENST00000302030")

        self.assertEqual(seq, 'MSITKAWNSSSVTMFILLGFTDHPELQALLFVTFLGIYLTTLAWNLAL'
            'IFLIRGDTHLHTPMYFFLSNLSFIDICYSSAVAPNMLTDFFWEQKTISFVGCAAQFFFFVGMGLSE'
            'CLLLTAMAYDRYAAISSPLLYPTIMTQGLCTRMVVGAYVGGFLSSLIQASSIFRLHFCGPNIINHF'
            'FCDLPPVLALSCSDTFLSQVVNFLVVVTVGGTSFLQLLISYGYIVSAVLKIPSAEGRWKACNTCAS'
            'HLMVVTLLFGTALFVYLRPSSSYLLGRDKVVSVFYSLVIPMLNPLIYSLRNKEIKDALWKVLERKK'
            'VFS')

    def test_get_genomic_seq_for_region(self):
        """ test that get_genomic_seq_for_region() works correctly
        """

        # note that this test uses GRCh37 coordinates
        seq = self.ensembl.get_genomic_seq_for_region('11', 59210617, 59210637)
        self.assertEqual(seq, 'CTTGTCCTTGTGGTCCACGGG')

    def test_get_chrom_for_transcript(self):
        """ test that get_chrom_for_transcript() works correctly
        """

        chrom = self.ensembl.get_chrom_for_transcript("ENST00000534358",
            "KMT2A")
        self.assertEqual(chrom, "11")

    def test_get_exon_ranges_for_transcript(self):
        """ test that get_exon_ranges_for_transcript() works correctly
        """

        exons = self.ensembl.get_exon_ranges_for_transcript("ENST00000534358")
        self.assertEqual(exons, [(118307205, 118307659), (118339490,
            118339559), (118342377, 118345030), (118347520, 118347697),
            (118348682, 118348916), (118350889, 118350953), (118352430,
            118352807), (118353137, 118353210), (118354898, 118355029),
            (118355577, 118355690), (118359329, 118359475), (118360507,
            118360602), (118360844, 118360964), (118361911, 118362033),
            (118362459, 118362643), (118363772, 118363945), (118365003,
            118365113), (118365409, 118365482), (118366415, 118366608),
            (118366976, 118367082), (118368651, 118368788), (118369085,
            118369243), (118370018, 118370135), (118370550, 118370628),
            (118371702, 118371862), (118372387, 118372572), (118373113,
            118377361), (118378244, 118378324), (118379851, 118379915),
            (118380663, 118380833), (118382666, 118382740), (118390333,
            118390507), (118390672, 118390779), (118391517, 118391600),
            (118392003, 118392132), (118392612, 118397539)])

    def test_get_cds_ranges_for_transcript(self):
        """ test that get_cds_ranges_for_transcript() works correctly
        """

        cds = self.ensembl.get_cds_ranges_for_transcript("ENST00000534358")
        self.assertEqual(cds, [(118307228, 118307659), (118339490,
            118339559), (118342377, 118345030), (118347520, 118347697),
            (118348682, 118348916), (118350889, 118350953), (118352430,
            118352807), (118353137, 118353210), (118354898, 118355029),
            (118355577, 118355690), (118359329, 118359475), (118360507,
            118360602), (118360844, 118360964), (118361911, 118362033),
            (118362459, 118362643), (118363772, 118363945), (118365003,
            118365113), (118365409, 118365482), (118366415, 118366608),
            (118366976, 118367082), (118368651, 118368788), (118369085,
            118369243), (118370018, 118370135), (118370550, 118370628),
            (118371702, 118371862), (118372387, 118372572), (118373113,
            118377361), (118378244, 118378324), (118379851, 118379915),
            (118380663, 118380833), (118382666, 118382740), (118390333,
            118390507), (118390672, 118390779), (118391517, 118391600),
            (118392003, 118392132), (118392612, 118392887)])

    def test_rate_limit_ensembl_requests(self):
        """ test that rate_limit_ensembl_requests() works correctly
        """

        # pretend that the previous request was made just now
        current_time = time.time()
        self.ensembl.prior_time = current_time

        self.ensembl.rate_limit_ensembl_requests()
        delta = self.ensembl.prior_time - current_time

        # assertGreaterEqual reports both values if the check fails
        self.assertGreaterEqual(delta, self.ensembl.rate_limit)
def cluster_de_novos(symbol, de_novos, iterations=1000000, ensembl=None, mut_dict=None):
    """ analyse the proximity clustering of de novos within a single gene

    Args:
        symbol: HGNC symbol for a gene
        de_novos: dictionary of de novo positions for the gene, indexed by
            functional type (the "missense" and "nonsense" entries are used)
        iterations: number of simulations to run
        ensembl: EnsemblRequest object, for obtaining info from ensembl. One
            is created with default settings if this is not given.
        mut_dict: dictionary of mutation rates, indexed by trinucleotide
            sequence. The default rates are loaded if this is not given.

    Returns:
        the dictionary from combine_p_values(), extended with "miss_dist" and
        "nons_dist" entries holding the comma-separated per-transcript
        distances. Returns None if loading the transcripts for the gene raised
        an IndexError.
    """

    if ensembl is None:
        ensembl = EnsemblRequest('cache', 'grch37')

    if mut_dict is None:
        mut_dict = load_mutation_rates()

    remaining_missense = de_novos["missense"]
    remaining_nonsense = de_novos["nonsense"]

    # load the smallest set of transcripts needed to contain all the de novos.
    # If no coding transcript contains the de novos, we cannot analyse the gene.
    try:
        transcripts = load_gene(ensembl, symbol,
            remaining_missense + remaining_nonsense)
    except IndexError as error:
        print(error)
        return None

    miss_dists, nons_dists = [], []
    miss_probs, nons_probs = [], []

    for tx in transcripts:
        in_tx_missense = get_de_novos_in_transcript(tx, remaining_missense)
        in_tx_nonsense = get_de_novos_in_transcript(tx, remaining_nonsense)

        rates = SiteRates(tx, mut_dict)

        # missense is checked before loss-of-function for every transcript
        miss_result = get_p_value(tx, rates, iterations, "missense",
            in_tx_missense)
        nons_result = get_p_value(tx, rates, iterations, "lof", in_tx_nonsense)

        (miss_dist, miss_prob) = miss_result
        (nons_dist, nons_prob) = nons_result

        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)

        # drop the de novos analysed in this transcript, so that subsequent
        # transcripts are analysed with independent events. NOTE THAT THIS
        # MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS IF
        # THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        remaining_missense = [site for site in remaining_missense
            if site not in in_tx_missense]
        remaining_nonsense = [site for site in remaining_nonsense
            if site not in in_tx_nonsense]

    # the distances are reported as one comma-separated string per type
    dists = {
        "miss_dist": ",".join(map(str, miss_dists)),
        "nons_dist": ",".join(map(str, nons_dists)),
    }

    combined = combine_p_values({"miss_prob": miss_probs,
        "nons_prob": nons_probs})
    combined.update(dists)

    return combined
def setUpClass(cls):
    """ prepare the fixtures shared by every test in the class

    Sets up a temporary cache folder, an EnsemblRequest using that folder
    for the grch37 build, and the default mutation rates.
    """
    scratch_dir = tempfile.mkdtemp()

    cls.temp_dir = scratch_dir
    cls.ensembl = EnsemblRequest(scratch_dir, 'grch37')
    cls.mut_dict = load_mutation_rates()