def get_rates_by_constraint(constraint, cache_dir, threshold=1e-4, ratio=1.0):
    ''' collect per-transcript mutation rates inside and outside constrained regions

    Args:
        constraint: table that supports groupby('transcript') and has 'gene'
            and 'chr' columns (presumably a pandas DataFrame - confirm
            against the caller).
        cache_dir: passed to EnsemblRequest together with the 'grch37' build.
        threshold: forwarded unchanged to get_constrained_positions.
        ratio: forwarded unchanged to get_constrained_positions.

    Returns:
        dict with 'constrained' and 'unconstrained' keys, each holding a list
        with one entry per transcript group, annotated with 'symbol', 'chrom'
        and 'length'.
    '''

    categories = ('constrained', 'unconstrained')
    consequences = (
        'nonsense', 'missense', 'synonymous', 'splice_lof', 'splice_region',
    )

    rates = {category: [] for category in categories}
    mut_dict = load_mutation_rates()
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    for tx_id, group in constraint.groupby('transcript'):
        # only the part of the ID before the first '.' is used for the lookup
        base_id = tx_id.split('.')[0]
        tx = construct_gene_object(ensembl, base_id)
        sites = SiteRates(tx, mut_dict)

        constrained_sites = get_constrained_positions(
            tx, group, threshold, ratio)

        # hand over a fresh list for each transcript
        gene_rates = get_gene_rates(
            tx, sites, list(consequences), constrained_sites)

        # annotate the rates for this gene, then file them under the
        # matching category in the combined results
        for category in categories:
            gene_rates[category]['symbol'] = list(group['gene'])[0]
            gene_rates[category]['chrom'] = list(group['chr'])[0]
            cds_end = tx.chrom_pos_to_cds(tx.get_cds_end())
            gene_rates[category]['length'] = cds_end['pos']

            rates[category].append(gene_rates[category])

    return rates
Exemple #2
0
def annotate_constraint(data, constraint_path, threshold=1e-3, ratio=0.4):
    ''' annotate per-site rates by whether the site is under regional constraint

    Args:
        data: table with 'symbol' and 'pos' columns (used via groupby and
            isin, so presumably a pandas DataFrame). A 'constrained' column is
            added to it in place, defaulting to False.
        constraint_path: passed to load_regional_constraint; the loaded table
            must provide 'gene' and 'transcript' columns.
        threshold: forwarded unchanged to get_constrained_positions.
        ratio: forwarded unchanged to get_constrained_positions.

    Returns:
        the per-gene groups of data concatenated back together, with
        'constrained' set to True for rows whose 'pos' is among the constrained
        positions of the gene. If data has no groups, data itself is returned
        (with the all-False 'constrained' column).
    '''
    # default to unconstrained
    data['constrained'] = False

    constraint = load_regional_constraint(constraint_path)
    # NOTE(review): cache_dir is neither a parameter nor a local of this
    # function, so it has to exist as a module-level name - confirm.
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    # build the lookup of constrained genes once, rather than once per gene
    constrained_genes = set(constraint['gene'])

    modified = []
    for symbol, group in data.groupby('symbol'):
        if symbol not in constrained_genes:
            sites = set()
        else:
            regional = constraint[constraint['gene'] == symbol]
            tx_id = list(regional['transcript'])[0]
            tx = construct_gene_object(ensembl, tx_id.split('.')[0])
            sites = get_constrained_positions(tx, regional, threshold, ratio)

        # work on a copy, so we never assign into a slice of the input table.
        # Every row starts out False, so the membership test is the full answer.
        group = group.copy()
        group['constrained'] = group['pos'].isin(sites)

        modified.append(group)

    if not modified:
        # pandas.concat raises ValueError when given nothing to concatenate
        return data

    return pandas.concat(modified)
def get_constrained_positions(ensembl, constraint, symbol, threshold=1e-4, ratio_threshold=1.0):
    ''' build an interval tree of the constrained regions for a gene

    Args:
        ensembl: passed to construct_gene_object to build the transcript.
        constraint: mapping from gene symbol to an entry holding a 'tx' ID and
            a list of 'regions', each with 'chisq', 'ratio' and 'pos' fields.
        symbol: key of the gene to look up in constraint.
        threshold: regions with a chi-squared (df=1) p-value above this value
            are skipped.
        ratio_threshold: regions with a 'ratio' above this value are skipped.

    Returns:
        IntervalTree covering start to end (inclusive) of every region that
        passes both filters. Empty if the symbol is not in constraint.
    '''

    tree = IntervalTree()

    if symbol in constraint:
        gene = constraint[symbol]
        tx = construct_gene_object(ensembl, gene['tx'])

        for region in gene['regions']:
            p_value = chi2.sf(region['chisq'], df=1)
            if p_value > threshold or region['ratio'] > ratio_threshold:
                continue

            first, last = aa_to_chrom(tx, region['pos'])
            # coordinates come back in the opposite order on the minus strand
            if tx.get_strand() == '-':
                low, high = last, first
            else:
                low, high = first, last

            # the slice end is exclusive, so extend by one to include the end
            tree[low:high + 1] = True

    return tree
Exemple #4
0
 def test_construct_gene_object(self):
     """ check that construct_gene_object matches the expected transcript
     """
     
     observed = construct_gene_object(self.ensembl, "ENST00000242577")
     reference = self.set_transcript()
     
     # the transcripts should compare equal, and so should their sequences
     self.assertEqual(observed, reference)
     self.assertEqual(
         observed.get_genomic_sequence(),
         reference.get_genomic_sequence())
     self.assertEqual(
         observed.get_cds_sequence(),
         reference.get_cds_sequence())
Exemple #5
0
 def test_construct_gene_object(self):
     """ compare a constructed transcript against the fixture transcript
     """
     
     tx_id = "ENST00000242577"
     built = construct_gene_object(self.ensembl, tx_id)
     wanted = self.set_transcript()
     
     self.assertEqual(built, wanted)
     
     # genomic and coding sequences are compared separately from the objects
     genomic_built = built.get_genomic_sequence()
     self.assertEqual(genomic_built, wanted.get_genomic_sequence())
     cds_built = built.get_cds_sequence()
     self.assertEqual(cds_built, wanted.get_cds_sequence())
Exemple #6
0
def get_mutation_rates(transcripts, mut_dict, ensembl):
    """ sum mutation rates per functional category across transcripts
    
    Each transcript is masked by the transcripts merged so far, before its
    rates are added to the totals.
    
    Args:
        transcripts: iterable of transcript IDs for a gene
        mut_dict: dictionary of local sequence context mutation rates
        ensembl: EnsemblRequest object, to retrieve information from Ensembl.
    
    Returns:
        tuple of (rates, merged transcript, and transcript CDS length)
    
    Raises:
        ValueError: if a transcript has a CDS length which is not a multiple
            of three, or if no transcript could be used.
    """

    consequences = (
        'missense', 'nonsense', 'splice_lof', 'splice_region', 'synonymous',
    )
    rates = dict.fromkeys(consequences, 0)
    merged = None

    for tx_id in transcripts:
        # transcripts which cannot be constructed are skipped
        try:
            tx = construct_gene_object(ensembl, tx_id)
        except ValueError:
            continue

        if len(tx.get_cds_sequence()) % 3 != 0:
            raise ValueError("anomalous_coding_sequence")

        # mitochondrial genes are excluded
        if tx.get_chrom() == "MT":
            continue

        # mask with the transcripts seen so far, then fold this one in
        sites = SiteRates(tx, mut_dict, masked_sites=merged)
        merged = tx + merged

        for cq in consequences:
            rates[cq] += sites[cq].get_summed_rate()

    if merged is None:
        raise ValueError('no tx found')

    cds_start = merged.get_cds_start()
    cds_end = merged.get_cds_end()
    length = merged.get_coding_distance(cds_start, cds_end)

    return rates, merged, length
Exemple #7
0
def get_mutation_rates(transcripts, mut_dict, ensembl):
    """ total the mutation rates per functional category for transcripts
    
    Sites are masked by the previously merged transcripts before the rates
    for each transcript are added.
    
    Args:
        transcripts: iterable of transcript IDs for a gene
        mut_dict: dictionary of local sequence context mutation rates
        ensembl: EnsemblRequest object, to retrieve information from Ensembl.
    
    Returns:
        tuple of (rates, merged transcript, and transcript CDS length)
    
    Raises:
        ValueError: if a transcript has a CDS length which is not a multiple
            of three, or if none of the transcripts were usable.
    """
    
    categories = ('missense', 'nonsense', 'splice_lof', 'splice_region',
        'synonymous')
    rates = {category: 0 for category in categories}
    union = None
    
    for tx_id in transcripts:
        # skip any transcript that cannot be constructed
        try:
            tx = construct_gene_object(ensembl, tx_id)
        except ValueError:
            continue
        
        cds_length = len(tx.get_cds_sequence())
        if cds_length % 3 != 0:
            raise ValueError("anomalous_coding_sequence")
        
        # ignore mitochondrial genes
        if tx.get_chrom() == "MT":
            continue
        
        sites = SiteRates(tx, mut_dict, masked_sites=union)
        union = tx + union
        
        for category in categories:
            rates[category] += sites[category].get_summed_rate()
    
    if union is None:
        raise ValueError('no tx found')
    
    # the CDS length is the coding position of the CDS end
    end_distance = union.get_coding_distance(union.get_cds_end())
    length = end_distance['pos']
    
    return rates, union, length
def get_transcripts(symbol, ensembl):
    ''' build the Transcript objects for a gene
    
    Args:
        symbol: HGNC symbol for a gene
        ensembl: EnsemblRequest object, to retrieve gene data with
    
    Returns:
        list of Transcript objects, ordered by descending value of the mapping
        from get_transcript_ids (presumably transcript size, so the longest
        transcripts come first). Transcripts which raise ValueError during
        construction are left out.
    '''

    sizes = get_transcript_ids(ensembl, symbol)
    ranked_ids = sorted(sizes, key=sizes.get, reverse=True)

    usable = []
    for tx_id in ranked_ids:
        try:
            usable.append(construct_gene_object(ensembl, tx_id))
        except ValueError:
            # transcript could not be constructed, move on to the next one
            continue

    return usable
Exemple #9
0
def get_mutation_rates(gene_id, transcripts, mut_dict, ensembl):
    """ determines mutation rates per functional category for a gene
    
    This can estimate a mutation rate from the union of transcripts for a gene.
    This is a biased estimate of the mutation rate, where the mutation rate
    estimate is biased towards the rate from the first-ranked transcripts,
    which I prioritise by how many de novos they contain, and how long the
    coding sequence is.
    
    This isn't a problem when different transcripts have the same coding
    sequence within their shared regions, as the rates will come out the same,
    but may differ if two transcripts share an overlapping region, but not in
    the same frame, so that the sites that are missense, and nonsense will
    differ between transcripts, and thus would produce different estimates of
    the mutation rate.
    
    Args:
        gene_id: ID for the current gene (can be a transcript ID, if we are
            examining single transcripts only, or can be a HGNC ID, if we are
            examining the union of mutation rates from multiple transcripts for
            a single gene).
        transcripts: dictionary of transcripts for a gene, indexed by gene_id
        mut_dict: dictionary of local sequence context mutation rates
        ensembl: EnsemblRequest object, to retrieve information from Ensembl.
    
    Returns:
        tuple of (chrom, length, missense, nonsense, splice_lof, splice_region,
        synonymous). chrom and length are both "NA" if none of the transcripts
        for the gene could be used.
    
    Raises:
        ValueError: if a transcript has a CDS length which is not a multiple
            of three.
    """
    
    consequences = ("missense", "nonsense", "splice_lof", "splice_region",
        "synonymous")
    rates = dict.fromkeys(consequences, 0)
    combined_transcript = None
    
    for transcript_id in transcripts[gene_id]:
        
        # get the gene coordinates, sequence etc, but if the transcript is
        # unusable (hence raises an error), simply move to the next transcript
        try:
            transcript = construct_gene_object(ensembl, transcript_id)
        except ValueError:
            continue
        
        if len(transcript.get_cds_sequence()) % 3 != 0:
            raise ValueError("anomalous_coding_sequence")
        
        # ignore mitochondrial genes, since mitochondrial mutation rates differ
        # from autosomal and allosomal mutation rates
        if transcript.get_chrom() == "MT":
            continue
        
        if combined_transcript is None:
            sites = SiteRates(transcript, mut_dict)
            combined_transcript = transcript
        else:
            sites = SiteRates(transcript, mut_dict, masked_sites=combined_transcript)
            combined_transcript += transcript
        
        # add the cumulative probability from the sampled sites to the
        # appropriate mutation rate. Sometimes we won't have any sites for a
        # transcript, as all the sites will have been captured in previous
        # transcripts.
        for cq in consequences:
            rates[cq] += sites[cq].get_summed_rate()
    
    # without any usable transcript there is nothing to query, so check for
    # that before touching the combined transcript
    chrom = "NA"
    length = "NA"
    if combined_transcript is not None:
        chrom = combined_transcript.get_chrom()
        length = combined_transcript.get_coding_distance(
            combined_transcript.get_cds_start(), combined_transcript.get_cds_end())
    
    return (chrom, length, rates["missense"], rates["nonsense"],
        rates["splice_lof"], rates["splice_region"], rates["synonymous"])