def annotate(df, cols):
    """
    Get the longest transcript for a gene column or for genomic regions.

    ``cols`` selects the lookup mode:

    * one column name -- the column holds gene names. Each gene is located
      with ``LlamaEnsembl.get_gene_pos``, the resulting region is queried
      through ``UCSCapi`` and the gene's longest transcript is recorded.
    * three column names -- chromosome, start and end columns. A ``region``
      column (``chr<chrom>:<start>-<end>``) is added to ``df`` in place,
      every region is queried and the longest transcript of each gene
      returned by the query is recorded.

    :param df: pandas dataframe to annotate (gains a ``region`` column in
        three-column mode)
    :param cols: list of one (gene) or three (chrom, start, end) column names
    :return: ``df`` outer-merged with the lookup results (``query``,
        ``transcripts`` and, in region mode, ``genes``) with fully
        duplicated rows removed
    :raises ValueError: if ``cols`` does not hold exactly one or three names
    """
    # Validate before any API client is created: an unsupported length would
    # otherwise skip both branches and leave the merge key unbound.
    if len(cols) not in (1, 3):
        raise ValueError(
            "cols must contain 1 (gene) or 3 (chrom, start, end) column "
            "names, got {}".format(len(cols)))

    def get_regions(row, cols):
        # str() so chromosome columns holding numbers (e.g. 7) work too.
        chrom = str(row[cols[0]])
        prefix = '' if chrom.startswith('chr') else 'chr'
        return "{}{}:{}-{}".format(prefix, chrom, row[cols[1]], row[cols[2]])

    llama = LlamaEnsembl()
    ucsc = UCSCapi()
    dd = {'query': [], 'transcripts': []}

    if len(cols) == 1:
        mergecol = cols[0]
        for gene in df[mergecol].values:
            chrom, start, end = llama.get_gene_pos(gene)
            res = ucsc.query("chr{}:{}-{}".format(chrom, start, end))
            dd['query'].append(gene)
            dd['transcripts'].append(res.longest(gene)['transcript'])
    else:
        mergecol = 'region'
        dd['genes'] = []
        # NOTE: this writes the region column into the caller's dataframe.
        df[mergecol] = df.apply(get_regions, axis=1, args=(cols,))
        for region in df[mergecol].values:
            res = ucsc.query(region)
            # One output row per gene found in the region; the three lists
            # must stay the same length to build the lookup dataframe.
            for gene in res.genes():
                dd['genes'].append(gene)
                dd['query'].append(region)
                dd['transcripts'].append(res.longest(gene)['transcript'])

    ndf = df.merge(pd.DataFrame(dd), left_on=mergecol, right_on='query',
                   how='outer')
    # Repeated queries produce identical merged rows; keep the first of each.
    return ndf[~ndf.duplicated()]
def test_gene_transcripts():
    """
    Smoke test: print the longest transcript found for BRCA1.

    The gene position comes from ``LlamaEnsembl.get_gene_pos``; the region
    built from it is queried through ``UCSCapi``. Nothing is asserted.
    """
    ensembl = LlamaEnsembl()
    browser = UCSCapi()
    symbol = 'BRCA1'
    chrom, start, end = ensembl.get_gene_pos(symbol)
    region = "chr{}:{}-{}".format(chrom, start, end)
    longest = browser.query(region).longest(symbol)
    print(longest['transcript'])
def annotate(self):
    """
    Annotate the instance's data frame.

    :return: ``self.df`` joined with the frame returned by
        ``LlamaEnsembl.annotate_dataframe(self.df)``
    """
    annotations = LlamaEnsembl().annotate_dataframe(self.df)
    return self.df.join(annotations)
def test_annotation_hg38():
    """
    Check ``annotate_dataframe`` on a ``LlamaEnsembl(genome='hg38')`` instance.

    Every row's ``genes`` and ``exons`` annotation must equal the expected
    ``gene_exp`` / ``exon_exp`` value stored next to the coordinates.
    """
    llama = LlamaEnsembl(genome='hg38')
    fixture = {
        'CHROM': ['chr1', 'chr17', '7', '7'],
        'START': [153358330, 43092618, 116771890, 116773071],
        'END': [153358350, 43092648, 116771920, 116773072],
        'gene_exp': ['S100A9', 'BRCA1', 'MET', 'MET'],
        'exon_exp': ['2', '10', '14', ''],
        'strand_exp': ['+', '-', '+', '+'],
    }
    df = pd.DataFrame(fixture)
    ndf = llama.annotate_dataframe(df)
    mdf = pd.concat([df, ndf], axis=1)
    print(mdf)

    # Rows where the annotation disagrees with the expectation.
    gene_mismatches = mdf.loc[mdf['genes'] != mdf['gene_exp']]
    assert len(gene_mismatches) == 0
    exon_mismatches = mdf.loc[mdf['exons'] != mdf['exon_exp']]
    assert len(exon_mismatches) == 0
def test_annotation():
    """
    Check ``annotate_dataframe`` on a default-constructed ``LlamaEnsembl``.

    Every row's ``genes`` and ``exons`` annotation must equal the expected
    ``gene_exp`` / ``exon_exp`` value stored next to the coordinates.
    """
    llama = LlamaEnsembl()
    fixture = {
        'CHROM': ['chr1', 'chr17', '7', '7'],
        'START': [153330766, 41244573, 116412043, 116412044],
        'END': [153330797, 41245953, 116412043, 116412044],
        'gene_exp': ['S100A9', 'BRCA1', 'MET', 'MET'],
        'exon_exp': ['2', '10', '14', ''],
        'strand_exp': ['+', '-', '+', '+'],
    }
    df = pd.DataFrame(fixture)
    ndf = llama.annotate_dataframe(df)
    mdf = pd.concat([df, ndf], axis=1)
    print(mdf)

    # Rows where the annotation disagrees with the expectation.
    gene_mismatches = mdf.loc[mdf['genes'] != mdf['gene_exp']]
    assert len(gene_mismatches) == 0
    exon_mismatches = mdf.loc[mdf['exons'] != mdf['exon_exp']]
    assert len(exon_mismatches) == 0
def test_rsid():
    """
    Smoke test: annotate a fixed list of rsIDs and print the result.

    ``annotate_variants`` is called with the extra columns ``MAF`` and
    ``ambiguity``. Nothing is asserted.
    """
    llama = LlamaEnsembl()
    # Explicit list instead of splitting a text block on '\n', so every
    # identifier is passed as its own element regardless of how the block
    # was laid out.
    rsidstr = [
        'rs1517114',
        'rs4646',
        'rs55886062',
        'rs3918290',
        'rs67376798',
        'rs75017182',
        'rs115232898',
        'rs1801158',
        'rs11615',
        'rs1800566',
        'rs7779029',
        'rs151264360',
        'rs4148323',
        'rs8175347',
    ]
    df = llama.annotate_variants(rsidstr, extra_cols=['MAF', 'ambiguity'])
    print(df)
def test_cds_convert():
    """
    Smoke test: print ``get_cds_region`` results for two changes on NM_015506.2.

    Both lookups are performed before anything is printed. Nothing is
    asserted.
    """
    llama = LlamaEnsembl()
    transcript = 'NM_015506.2'
    changes = ('c.1A>G', 'c.445_446del')
    regions = [llama.get_cds_region(transcript, change) for change in changes]
    for region in regions:
        print(region)