예제 #1
0
def annotate(df, cols):
    """Annotate ``df`` with the transcript reported as longest per gene.

    ``cols`` selects the lookup mode:

    * one column name -- the column holds gene identifiers. Each gene is
      located with ``LlamaEnsembl.get_gene_pos`` and the resulting region
      is queried through ``UCSCapi.query``.
    * three column names -- chromosome, start and end, in that order. A
      ``region`` column (``chr<chrom>:<start>-<end>``) is added to ``df``
      in place, every region is queried, and one result row is produced
      for each gene the query reports.

    Args:
        df: pandas DataFrame holding the column(s) named in ``cols``.
            In three-column mode it gains a ``region`` column.
        cols: sequence of one or three column names of ``df``.

    Returns:
        ``df`` outer-merged with the lookup results (columns ``query`` and
        ``transcripts``, plus ``genes`` in three-column mode), with fully
        duplicated rows removed.

    Raises:
        ValueError: if ``cols`` does not contain exactly one or three
            names.
    """
    def get_region(chrom, start, end):
        # str() so that numeric chromosome labels (1, 2, ...) are accepted;
        # the 'chr' prefix is only added when it is not already present.
        chrom = str(chrom)
        prefix = '' if chrom.startswith('chr') else 'chr'
        return "{}{}:{}-{}".format(prefix, chrom, start, end)

    # Fail early with a clear message; previously an unsupported length
    # surfaced as an unbound `mergecol` at merge time.
    if len(cols) not in (1, 3):
        raise ValueError(
            "cols must name 1 column (genes) or 3 columns "
            "(chrom, start, end); got {}".format(len(cols)))

    llama = LlamaEnsembl()
    ucsc = UCSCapi()
    dd = {'query': [], 'transcripts': []}
    if len(cols) == 1:
        # Set before the loop so it is defined even when there are no genes.
        mergecol = cols[0]
        for gene in df[mergecol].values:
            chrom, start, end = llama.get_gene_pos(gene)
            # NOTE(review): 'chr' is always prepended here -- presumably
            # get_gene_pos returns a bare chromosome name; confirm.
            res = ucsc.query("chr{}:{}-{}".format(chrom, start, end))
            dd['query'].append(gene)
            dd['transcripts'].append(res.longest(gene)['transcript'])
    else:
        # Set before the loop so it is defined even when no region
        # yields a gene.
        mergecol = 'region'
        dd['genes'] = []
        # Built column-wise: works on an empty frame and formats each
        # value with its own column's dtype.
        df['region'] = [
            get_region(chrom, start, end)
            for chrom, start, end in zip(df[cols[0]], df[cols[1]],
                                         df[cols[2]])
        ]
        for region in df['region'].values:
            res = ucsc.query(region)
            for gene in res.genes():
                dd['genes'].append(gene)
                dd['query'].append(region)
                dd['transcripts'].append(res.longest(gene)['transcript'])
    ndf = df.merge(pd.DataFrame(dd),
                   left_on=mergecol,
                   right_on='query',
                   how='outer')
    return ndf[~ndf.duplicated()]
예제 #2
0
def test_gene_transcripts():
    """Smoke test: look up BRCA1 and print its ``longest`` transcript.

    Resolves the gene position with ``LlamaEnsembl``, queries that region
    through ``UCSCapi`` and prints the ``'transcript'`` entry of whatever
    ``longest`` returns for the gene. Nothing is asserted.
    """
    ensembl_client = LlamaEnsembl()
    ucsc_client = UCSCapi()
    gene = 'BRCA1'
    position = ensembl_client.get_gene_pos(gene)
    # Unpack explicitly so a malformed position fails here, not in format().
    chrom, start, end = position
    region = "chr{}:{}-{}".format(chrom, start, end)
    result = ucsc_client.query(region)
    longest = result.longest(gene)
    print(longest['transcript'])