def get_rates_by_constraint(constraint, cache_dir, threshold=1e-4, ratio=1.0):
    ''' collect per-transcript mutation rates in and out of constrained regions

    Args:
        constraint: table with 'transcript', 'gene' and 'chr' columns. It is
            grouped by 'transcript', and each group is processed once.
        cache_dir: passed to EnsemblRequest.
        threshold: forwarded to get_constrained_positions.
        ratio: forwarded to get_constrained_positions.

    Returns:
        dict with 'constrained' and 'unconstrained' keys. Each value is a list
        with one entry per transcript, taken from get_gene_rates and annotated
        with 'symbol', 'chrom' and 'length'.
    '''
    categories = ('constrained', 'unconstrained')
    consequences = ('nonsense', 'missense', 'synonymous', 'splice_lof',
        'splice_region')

    rates = {category: [] for category in categories}
    mut_dict = load_mutation_rates()
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    for tx_id, group in constraint.groupby('transcript'):
        # only the part of the transcript ID before the first '.' is used
        tx = construct_gene_object(ensembl, tx_id.split('.')[0])
        sites = SiteRates(tx, mut_dict)
        constrained_sites = get_constrained_positions(tx, group, threshold, ratio)
        gene_rates = get_gene_rates(tx, sites, list(consequences),
            constrained_sites)

        # the annotations are identical for both categories of this transcript
        symbol = list(group['gene'])[0]
        chrom = list(group['chr'])[0]
        length = tx.chrom_pos_to_cds(tx.get_cds_end())['pos']

        # now add the gene rates to the larger list of all genes
        for category in categories:
            entry = gene_rates[category]
            entry['symbol'] = symbol
            entry['chrom'] = chrom
            entry['length'] = length
            rates[category].append(entry)

    return rates
def annotate_constraint(data, constraint_path, threshold=1e-3, ratio=0.4):
    ''' annotate per-site rates by whether the site is under regional constraint

    Args:
        data: pandas DataFrame with 'symbol' and 'pos' columns. A boolean
            'constrained' column (all False) is added to it in place.
        constraint_path: passed to load_regional_constraint, whose result must
            provide 'gene' and 'transcript' columns.
        threshold: forwarded to get_constrained_positions.
        ratio: forwarded to get_constrained_positions.

    Returns:
        DataFrame built by concatenating the per-symbol groups of data, where
        'constrained' is True for rows whose 'pos' is among the constrained
        positions found for that symbol. If data has no groups, an empty copy
        of data (including the 'constrained' column) is returned.
    '''
    # default to unconstrained
    data['constrained'] = False

    constraint = load_regional_constraint(constraint_path)
    # NOTE(review): cache_dir is not a parameter of this function, so it has to
    # resolve as a module-level name - confirm it is defined in this module.
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    # build the gene lookup once, rather than once per group
    constrained_genes = set(constraint['gene'])

    modified = []
    for symbol, group in data.groupby('symbol'):
        if symbol in constrained_genes:
            regional = constraint[constraint['gene'] == symbol]
            # the first transcript listed for the gene is used, keeping only
            # the part of its ID before the first '.'
            tx_id = list(regional['transcript'])[0]
            tx = construct_gene_object(ensembl, tx_id.split('.')[0])
            sites = get_constrained_positions(tx, regional, threshold, ratio)
        else:
            sites = set()

        # every row starts as False, so the flag is exactly the membership
        # test. assign() returns a new frame, so the groupby slice is never
        # written to.
        modified.append(group.assign(constrained=group['pos'].isin(sites)))

    if not modified:
        # pandas.concat raises ValueError when given nothing to concatenate
        return data.copy()

    return pandas.concat(modified)
def get_constrained_positions(ensembl, constraint, symbol, threshold=1e-4, ratio_threshold=1.0):
    ''' build an interval tree of the constrained regions for a gene

    Args:
        ensembl: passed to construct_gene_object.
        constraint: mapping keyed by gene symbol. Each entry provides 'tx' and
            a 'regions' list whose items have 'chisq', 'ratio' and 'pos'.
        symbol: gene symbol to look up in constraint.
        threshold: regions whose chi-squared p-value (1 degree of freedom) is
            above this are skipped.
        ratio_threshold: regions whose 'ratio' is above this are skipped.

    Returns:
        IntervalTree with one interval (value True) per retained region. The
        tree is empty if symbol is not in constraint.
    '''
    tree = IntervalTree()
    if symbol not in constraint:
        return tree

    gene = constraint[symbol]
    tx = construct_gene_object(ensembl, gene['tx'])
    on_reverse_strand = tx.get_strand() == '-'

    for region in gene['regions']:
        p_value = chi2.sf(region['chisq'], df=1)
        # keep only regions passing both the significance and ratio filters
        if p_value > threshold or region['ratio'] > ratio_threshold:
            continue

        lower, upper = aa_to_chrom(tx, region['pos'])
        if on_reverse_strand:
            # coordinates come back in the opposite order on the minus strand
            lower, upper = upper, lower

        # presumably +1 because IntervalTree excludes the upper bound - confirm
        tree[lower:upper + 1] = True

    return tree
def test_construct_gene_object(self):
    """ check construct_gene_object against the reference transcript

    The object built for ENST00000242577 must compare equal to the transcript
    from self.set_transcript(), and both the genomic and CDS sequences of the
    two objects must match.
    """
    built = construct_gene_object(self.ensembl, "ENST00000242577")
    reference = self.set_transcript()

    self.assertEqual(built, reference)
    self.assertEqual(
        built.get_genomic_sequence(),
        reference.get_genomic_sequence(),
    )
    self.assertEqual(
        built.get_cds_sequence(),
        reference.get_cds_sequence(),
    )
def get_mutation_rates(transcripts, mut_dict, ensembl):
    """ sum mutation rates per functional category across transcripts

    Each transcript is built in turn. Its site rates are computed with the
    transcripts merged so far passed as masked_sites, and its summed rate for
    every category is added to the running totals.

    Args:
        transcripts: list of transcript IDs for a gene
        mut_dict: dictionary of local sequence context mutation rates
        ensembl: EnsemblRequest object, to retrieve information from Ensembl.

    Returns:
        tuple of (rates dict keyed by category, merged transcript, coding
        distance between the merged transcript's CDS start and CDS end)

    Raises:
        ValueError: "anomalous_coding_sequence" if a transcript's CDS length is
            not a multiple of three, or "no tx found" if no transcript was
            usable.
    """
    consequences = ('missense', 'nonsense', 'splice_lof', 'splice_region',
        'synonymous')
    rates = dict.fromkeys(consequences, 0)
    merged = None

    for tx_id in transcripts:
        # transcripts that cannot be constructed are skipped
        try:
            tx = construct_gene_object(ensembl, tx_id)
        except ValueError:
            continue

        if len(tx.get_cds_sequence()) % 3:
            raise ValueError("anomalous_coding_sequence")

        # ignore mitochondrial genes
        if tx.get_chrom() == "MT":
            continue

        # rates are computed against the merge of the earlier transcripts,
        # before this transcript is folded into it
        sites = SiteRates(tx, mut_dict, masked_sites=merged)
        merged = tx + merged

        for cq in consequences:
            rates[cq] += sites[cq].get_summed_rate()

    if merged is None:
        raise ValueError('no tx found')

    cds_start = merged.get_cds_start()
    cds_end = merged.get_cds_end()
    length = merged.get_coding_distance(cds_start, cds_end)

    return rates, merged, length
def get_mutation_rates(transcripts, mut_dict, ensembl):
    """ total the mutation rates per functional category for a gene

    Args:
        transcripts: list of transcript IDs for a gene
        mut_dict: dictionary of local sequence context mutation rates
        ensembl: EnsemblRequest object, to retrieve information from Ensembl.

    Returns:
        tuple of (rates dict keyed by category, merged transcript, and the
        'pos' value of the coding distance for the merged transcript's CDS end)

    Raises:
        ValueError: "anomalous_coding_sequence" when a transcript's CDS length
            is not divisible by three, "no tx found" when every transcript was
            skipped.
    """
    categories = ('missense', 'nonsense', 'splice_lof', 'splice_region',
        'synonymous')
    rates = {category: 0 for category in categories}

    union = None
    for tx_id in transcripts:
        try:
            tx = construct_gene_object(ensembl, tx_id)
        except ValueError:
            # unusable transcript, move on to the next one
            continue

        cds_length = len(tx.get_cds_sequence())
        if cds_length % 3 != 0:
            raise ValueError("anomalous_coding_sequence")

        # ignore mitochondrial genes
        if tx.get_chrom() == "MT":
            continue

        # the union so far is handed over as masked_sites, then extended
        # with the current transcript
        sites = SiteRates(tx, mut_dict, masked_sites=union)
        union = tx + union

        for category in categories:
            summed = sites[category].get_summed_rate()
            rates[category] += summed

    if union is None:
        raise ValueError('no tx found')

    # NOTE(review): get_coding_distance is called with a single position here
    # and indexed with 'pos' - confirm this matches the Transcript API in use.
    cds_end = union.get_cds_end()
    length = union.get_coding_distance(cds_end)['pos']

    return rates, union, length
def get_transcripts(symbol, ensembl):
    ''' build the usable Transcript objects for a gene

    Args:
        symbol: HGNC symbol for a gene
        ensembl: EnsemblRequest object, to retrieve gene data with

    Returns:
        list of objects from construct_gene_object, ordered by descending
        value in the mapping returned by get_transcript_ids (presumably the
        transcript size, so longest first). IDs for which construction raises
        ValueError are left out.
    '''
    transcript_ids = get_transcript_ids(ensembl, symbol)
    ranked_ids = sorted(transcript_ids, key=transcript_ids.get, reverse=True)

    usable = []
    for tx_id in ranked_ids:
        try:
            usable.append(construct_gene_object(ensembl, tx_id))
        except ValueError:
            # skip transcripts that cannot be constructed
            continue

    return usable
def get_mutation_rates(gene_id, transcripts, mut_dict, ensembl):
    """ determines per-consequence mutation rates for a gene

    This can estimate a mutation rate from the union of transcripts for a gene.
    This is a biased estimate of the mutation rate, where the estimate is
    biased towards the rate from the first-ranked transcripts, which I
    prioritise by how many de novos they contain, and how long the coding
    sequence is.

    This isn't a problem when different transcripts have the same coding
    sequence within their shared regions, as the rates will come out the same.
    The rates may differ when two transcripts share an overlapping region but
    not in the same frame, so that the sites that are missense and nonsense
    differ between transcripts, and thus would produce different estimates of
    the mutation rate.

    Args:
        gene_id: ID for the current gene (can be a transcript ID, if we are
            examining single transcripts only, or can be a HGNC ID, if we are
            examining the union of mutation rates from multiple transcripts for
            a single gene).
        transcripts: dictionary of transcript ID lists, indexed by gene_id
        mut_dict: dictionary of local sequence context mutation rates
        ensembl: EnsemblRequest object, to retrieve information from Ensembl.

    Returns:
        tuple of (chrom, length, missense, nonsense, splice_lof, splice_region,
        synonymous). chrom and length are both "NA" when no transcript for the
        gene was usable, in which case all of the rates are 0.

    Raises:
        ValueError: "anomalous_coding_sequence" if a transcript's CDS length is
            not a multiple of three.
    """
    consequences = ("missense", "nonsense", "splice_lof", "splice_region",
        "synonymous")
    totals = dict.fromkeys(consequences, 0)

    combined_transcript = None
    for transcript_id in transcripts[gene_id]:
        # get the gene coordinates, sequence etc, but if the transcript is
        # unusable (hence raises an error), simply move to the next transcript
        try:
            transcript = construct_gene_object(ensembl, transcript_id)
        except ValueError:
            continue

        if len(transcript.get_cds_sequence()) % 3 != 0:
            raise ValueError("anomalous_coding_sequence")

        # ignore mitochondrial genes, since mitochondrial mutation rates differ
        # from autosomal and allosomal mutation rates
        if transcript.get_chrom() == "MT":
            continue

        if combined_transcript is None:
            sites = SiteRates(transcript, mut_dict)
            combined_transcript = transcript
        else:
            # mask the sites already covered by the earlier transcripts
            sites = SiteRates(transcript, mut_dict,
                masked_sites=combined_transcript)
            combined_transcript += transcript

        # add the cumulative probability from this transcript's sites to the
        # appropriate mutation rate. Sometimes we won't have any sites for a
        # transcript, as all the sites will have been captured in previous
        # transcripts.
        for cq in consequences:
            totals[cq] += sites[cq].get_summed_rate()

    # without any usable transcript there is nothing to query, so both fields
    # fall back to the "NA" sentinel instead of calling a method on None
    chrom = "NA"
    length = "NA"
    if combined_transcript is not None:
        chrom = combined_transcript.get_chrom()
        length = combined_transcript.get_coding_distance(
            combined_transcript.get_cds_start(),
            combined_transcript.get_cds_end())

    return (chrom, length, totals["missense"], totals["nonsense"],
        totals["splice_lof"], totals["splice_region"], totals["synonymous"])