def main():
    """Compute a severity p-value per gene and write them to a TSV file.

    Reads the command-line options, loads the de novo mutations grouped by
    gene symbol, and calls ``analyse_gene`` once per symbol. One
    ``symbol<TAB>p_value`` row is written to ``args.output`` per gene,
    after a header line.
    """
    args = get_options()

    ensembl = EnsemblRequest(args.cache, args.genome_build)
    cadd = pysam.TabixFile(args.cadd)

    constraint = load_regional_constraint(args.constraint)

    # open de novo mutations
    all_de_novos = open_mutations(args.de_novos)

    mut_dict = load_mutation_rates()

    # the context manager guarantees that rows already written are flushed
    # and the handle is closed, even if analysing one of the genes raises
    with open(args.output, 'w') as output:
        output.write('symbol\tseverity_p_value\n')
        for symbol in all_de_novos:
            # skip placeholder symbols for variants without a gene
            if symbol in ('', '.'):
                continue

            print(symbol)
            de_novos = all_de_novos[symbol]
            p_value = analyse_gene(ensembl, mut_dict, cadd, symbol, de_novos,
                                   constraint, WEIGHTS)
            output.write('{}\t{}\n'.format(symbol, p_value))
def get_rates_by_constraint(constraint, cache_dir, threshold=1e-4, ratio=1.0):
    ''' collect per-gene mutation rates inside and outside constrained regions

    Args:
        constraint: table of regional constraint; it is grouped by its
            'transcript' column, and the 'gene' and 'chr' columns are read.
        cache_dir: folder passed to EnsemblRequest as its cache location.
        threshold: passed through to get_constrained_positions().
        ratio: passed through to get_constrained_positions().

    Returns:
        dictionary with 'constrained' and 'unconstrained' keys, each holding a
        list with one entry per transcript. Every entry is the matching value
        from get_gene_rates(), annotated with 'symbol', 'chrom' and 'length'.
    '''

    categories = ('constrained', 'unconstrained')
    consequences = (
        'nonsense', 'missense', 'synonymous', 'splice_lof', 'splice_region',
    )

    rates = {category: [] for category in categories}
    mut_dict = load_mutation_rates()
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    for tx_id, group in constraint.groupby('transcript'):
        # transcript IDs carry a version suffix, which is removed before lookup
        unversioned_id = tx_id.split('.')[0]
        tx = construct_gene_object(ensembl, unversioned_id)
        sites = SiteRates(tx, mut_dict)

        constrained_sites = get_constrained_positions(tx, group, threshold,
                                                      ratio)
        gene_rates = get_gene_rates(tx, sites, list(consequences),
                                    constrained_sites)

        # the annotations are identical for both categories of a transcript
        symbol = next(iter(group['gene']))
        chrom = next(iter(group['chr']))
        length = tx.chrom_pos_to_cds(tx.get_cds_end())['pos']

        # now add the gene rates to the larger list of all genes
        for category in categories:
            per_category = gene_rates[category]
            per_category['symbol'] = symbol
            per_category['chrom'] = chrom
            per_category['length'] = length

            rates[category].append(per_category)

    return rates
Beispiel #3
0
def annotate_constraint(data, constraint_path, threshold=1e-3, ratio=0.4,
                        cache_dir='cache'):
    ''' annotate per-site rates by whether the site is under regional constraint

    Args:
        data: pandas DataFrame of per-site rates with 'symbol' and 'pos'
            columns. A boolean 'constrained' column is added in place.
        constraint_path: path handed to load_regional_constraint().
        threshold: passed through to get_constrained_positions().
        ratio: passed through to get_constrained_positions().
        cache_dir: folder passed to EnsemblRequest as its cache location. The
            'cache' default mirrors other EnsemblRequest call sites.
            NOTE(review): the original read an undefined name `cache_dir`;
            confirm no module-level global was relied upon.

    Returns:
        DataFrame made by concatenating the per-symbol groups of `data`, where
        'constrained' is True for sites whose 'pos' is among the constrained
        positions of the gene. If `data` has no rows it is returned as is.
    '''
    # default to unconstrained
    data['constrained'] = False

    constraint = load_regional_constraint(constraint_path)
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    # build the lookup once, rather than once per gene
    constrained_genes = set(constraint['gene'])

    modified = []
    for symbol, group in data.groupby('symbol'):
        if symbol not in constrained_genes:
            sites = set()
        else:
            regional = constraint[constraint['gene'] == symbol]
            tx_id = list(regional['transcript'])[0]
            tx = construct_gene_object(ensembl, tx_id.split('.')[0])
            sites = get_constrained_positions(tx, regional, threshold, ratio)

        # work on a copy, so the assignment cannot hit a view of `data`
        group = group.copy()
        group['constrained'] = group['pos'].isin(sites)

        modified.append(group)

    if not modified:
        # pandas.concat raises a ValueError when given no objects
        return data

    return pandas.concat(modified)
Beispiel #4
0
def main():
    """Run the sub-command selected on the command line.

    Builds the Ensembl requester and the mutation rate dictionary from the
    parsed options, then hands them to ``args.func`` together with the opened
    output file and the options themselves.
    """
    args = get_options()

    ensembl = EnsemblRequest(args.cache_folder, args.genome_build.lower())
    mut_dict = load_mutation_rates(args.rates)

    # close (and flush) the output even when the sub-command raises
    with open(args.out, "wt") as output:
        args.func(ensembl, mut_dict, output, args)
Beispiel #5
0
async def runner():
    """Configure logging, then await the sub-command chosen on the command line.

    The sub-command coroutine (``args.func``) receives the rate-limited
    requester, the mutation rate dictionary, the opened output file, and the
    parsed options.
    """
    args = get_options()
    logging.basicConfig(
        filename=args.log,
        format='%(asctime)-15s %(message)s',
        level=logging.INFO,
    )

    async with RateLimiter(per_second=15) as ensembl:
        mut_dict = load_mutation_rates(args.rates)
        with open(args.out, "wt") as output:
            await args.func(ensembl, mut_dict, output, args)
Beispiel #6
0
def main():
    """Run the sub-command selected on the command line.

    Builds the Ensembl requester and the mutation rate dictionary from the
    parsed options, then hands them to ``args.func`` together with the opened
    output file and the options themselves.
    """
    args = get_options()

    ensembl = EnsemblRequest(args.cache_folder, args.genome_build.lower())
    mut_dict = load_mutation_rates(args.rates)

    # close (and flush) the output even when the sub-command raises
    with open(args.out, "wt") as output:
        args.func(ensembl, mut_dict, output, args)
def main():
    """Tabulate rates for every gene from ``load_dominant`` into a gzipped TSV.

    ``get_gene_rates`` is called once per gene, the results are stacked into a
    single table, and the table is written tab-separated to ``args.output``.
    """
    args = get_options()

    ensembl = EnsemblRequest('cache', 'grch37')
    mut_dict = load_mutation_rates()

    dominant = load_dominant(args.known)

    # an empty, typed frame goes first, so that the column order is fixed (and
    # the header is still written) even when there are no genes
    data = pandas.DataFrame(
        columns=['symbol', 'chrom', 'pos', 'ref', 'alt', 'cq', 'prob'])
    data['pos'] = data['pos'].astype(int)

    # Collect the per-gene tables and concatenate once. DataFrame.append
    # copied the whole table per gene, and was removed in pandas 2.0.
    # NOTE(review): assumes get_gene_rates returns a DataFrame - confirm.
    tables = [data]
    for symbol in dominant:
        print(symbol)
        tables.append(get_gene_rates(symbol, ensembl, mut_dict))

    data = pandas.concat(tables, ignore_index=True)

    with gzip.open(args.output, 'wt') as handle:
        data.to_csv(handle, sep='\t', index=False)
Beispiel #8
0
def main():
    """Write per-transcript mutation rates to a tab-separated file.

    Rates are computed for each key of the loaded transcripts (or genes) and
    log-transformed before writing. A ValueError for an entry is written into
    its row in place of the rates; entries raising a KeyError are skipped.
    Once the file is complete, ``include_indel_rates`` is run on it.
    """
    input_transcripts, input_genes, output_file, rates_file, cache_dir, \
        genome_build = get_options()

    # load all the data
    ensembl = EnsemblRequest(cache_dir, genome_build)
    mut_dict = load_mutation_rates(rates_file)

    if input_transcripts is not None:
        transcripts = load_transcripts(input_transcripts)
    else:
        transcripts = load_genes(input_genes)

    # the context manager closes the file even if an unexpected error escapes
    # the loop, so rows written so far are not lost
    with open(output_file, "w") as output:
        output.write("transcript_id\tchrom\tlength\tmissense_rate\tnonsense_rate\t"
            "splice_lof_rate\tsplice_region_rate\tsynonymous_rate\n")

        for gene_id in sorted(transcripts):
            print(gene_id)
            try:
                rates = get_mutation_rates(gene_id, transcripts, mut_dict, ensembl)

                chrom = rates[0]
                length = rates[1]
                rates = rates[2:]
                # log transform the rates, to keep them consistent with the
                # rates from Daly et al.
                line = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(
                    gene_id, chrom, length, *log_transform(rates))
            except ValueError as error:
                # record why the rates could not be determined
                line = "{0}\t{1}\n".format(gene_id, error)
            except KeyError:
                # ignore genes with odd genomic sequence eg ENST00000436041 in GRCh37
                continue

            output.write(line)

    # the file has to be closed before the indel rates are added to it
    include_indel_rates(output_file)
Beispiel #9
0
async def cluster_de_novos(symbol, de_novos, ensembl, iterations=1000000, mut_dict=None):
    """ analyse proximity clustering of de novos in a single gene

    Args:
        symbol: HGNC symbol for a gene
        de_novos: dictionary of de novo positions for the gene, with
            "missense" and "nonsense" keys
        ensembl: object for obtaining info from ensembl, passed to load_gene()
        iterations: number of simulations to run
        mut_dict: dictionary of mutation rates. Loaded with
            load_mutation_rates() if not provided.

    Returns:
        None if load_gene() raises an IndexError (the error is printed).
        Otherwise a dictionary with "miss_prob" and "nons_prob" (per-transcript
        P values combined with fishers_method()), plus "miss_dist" and
        "nons_dist" (per-transcript distances as comma-separated strings).
    """

    if mut_dict is None:
        mut_dict = load_mutation_rates()

    remaining_missense = de_novos["missense"]
    remaining_nonsense = de_novos["nonsense"]

    # load the minimum set of transcripts required to contain all the de
    # novos, unless we can't find any coding transcripts that contain them.
    try:
        transcripts = await load_gene(ensembl, symbol,
            remaining_missense + remaining_nonsense)
    except IndexError as error:
        print(error)
        return None

    miss_dists, nons_dists = [], []
    miss_probs, nons_probs = [], []

    for tx in transcripts:
        tx_missense = get_de_novos_in_transcript(tx, remaining_missense)
        tx_nonsense = get_de_novos_in_transcript(tx, remaining_nonsense)

        site_rates = SiteRates(tx, mut_dict)

        miss_result = get_p_value(tx, site_rates, iterations, "missense", tx_missense)
        nons_result = get_p_value(tx, site_rates, iterations, "lof", tx_nonsense)

        miss_dist, miss_prob = miss_result
        nons_dist, nons_prob = nons_result

        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)

        # drop the de novos analysed in this transcript, so that later
        # transcripts are assessed with independent events. NOTE THAT THIS
        # MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS IF
        # THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        remaining_missense = [x for x in remaining_missense if x not in tx_missense]
        remaining_nonsense = [x for x in remaining_nonsense if x not in tx_nonsense]

    dists = {
        "miss_dist": ",".join(map(str, miss_dists)),
        "nons_dist": ",".join(map(str, nons_dists)),
    }

    result = {
        "miss_prob": fishers_method(miss_probs),
        "nons_prob": fishers_method(nons_probs),
    }
    result.update(dists)

    return result
Beispiel #10
0
 def get_rates(self, tx):
     """ build the SiteRates object for a transcript

     The sequence context mutation rates are loaded with their defaults on
     every call, then handed to SiteRates along with the transcript.
     """
     context_rates = load_mutation_rates()
     site_rates = SiteRates(tx, context_rates)

     return site_rates
Beispiel #11
0
def cluster_de_novos(symbol, de_novos, iterations=1000000, ensembl=None, mut_dict=None):
    """ analyse proximity clustering of de novos in a single gene

    Args:
        symbol: HGNC symbol for a gene
        de_novos: dictionary of de novo positions for the gene, with
            "missense" and "nonsense" keys
        iterations: number of simulations to run
        ensembl: EnsemblRequest object, for obtaining info from ensembl.
            Defaults to EnsemblRequest('cache', 'grch37').
        mut_dict: dictionary of mutation rates. Loaded with
            load_mutation_rates() if not provided.

    Returns:
        None if load_gene() raises an IndexError (the error is printed).
        Otherwise the result of combine_p_values() on the per-transcript
        "miss_prob" and "nons_prob" lists, updated with "miss_dist" and
        "nons_dist" (per-transcript distances as comma-separated strings).
    """

    if ensembl is None:
        ensembl = EnsemblRequest('cache', 'grch37')

    if mut_dict is None:
        mut_dict = load_mutation_rates()

    remaining_missense = de_novos["missense"]
    remaining_nonsense = de_novos["nonsense"]

    # load the minimum set of transcripts required to contain all the de
    # novos, unless we can't find any coding transcripts that contain them.
    try:
        transcripts = load_gene(ensembl, symbol,
            remaining_missense + remaining_nonsense)
    except IndexError as error:
        print(error)
        return None

    miss_dists, nons_dists = [], []
    miss_probs, nons_probs = [], []

    for tx in transcripts:
        tx_missense = get_de_novos_in_transcript(tx, remaining_missense)
        tx_nonsense = get_de_novos_in_transcript(tx, remaining_nonsense)

        site_rates = SiteRates(tx, mut_dict)

        miss_result = get_p_value(tx, site_rates, iterations, "missense", tx_missense)
        nons_result = get_p_value(tx, site_rates, iterations, "lof", tx_nonsense)

        miss_dist, miss_prob = miss_result
        nons_dist, nons_prob = nons_result

        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)

        # drop the de novos analysed in this transcript, so that later
        # transcripts are assessed with independent events. NOTE THAT THIS
        # MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS IF
        # THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        remaining_missense = [x for x in remaining_missense if x not in tx_missense]
        remaining_nonsense = [x for x in remaining_nonsense if x not in tx_nonsense]

    dists = {
        "miss_dist": ",".join(map(str, miss_dists)),
        "nons_dist": ",".join(map(str, nons_dists)),
    }

    combined = combine_p_values({"miss_prob": miss_probs, "nons_prob": nons_probs})
    combined.update(dists)

    return combined
Beispiel #12
0
 def setUpClass(cls):
     """ create the fixtures stored on the class

     Sets `temp_dir` (a fresh temporary directory), `ensembl` (an
     EnsemblRequest using that directory, for 'grch37'), and `mut_dict` (the
     result of load_mutation_rates() with its defaults).
     """
     # NOTE(review): no @classmethod decorator is visible in this view, and
     # nothing here removes temp_dir - confirm both are handled elsewhere.
     cls.temp_dir = tempfile.mkdtemp()
     cls.ensembl = EnsemblRequest(cls.temp_dir, 'grch37')
     cls.mut_dict = load_mutation_rates()