예제 #1
0
def annotate(df, cols):
    """
    Annotate genes or genomic regions with their longest transcript.

    The lookup mode depends on the number of column names in ``cols``:

    * one column: ``df[cols[0]]`` holds gene names. Each gene is located
      with ``LlamaEnsembl.get_gene_pos`` and the resulting
      ``chr<chrom>:<start>-<end>`` region is queried through ``UCSCapi``.
    * three columns: taken as chromosome, start and end. A ``region``
      column (``chr<chrom>:<start>-<end>``) is added to ``df`` in place,
      each region is queried, and one result row is produced per gene
      reported for that region.

    :param df: pandas dataframe to annotate (gains a ``region`` column
        in the three-column mode)
    :param cols: list with one (gene) or three (chromosome, start, end)
        column names
    :return: ``df`` outer-merged with the lookup results (``query``,
        ``transcripts`` and, for regions, ``genes``), duplicated rows
        removed
    :raises ValueError: if ``cols`` does not hold one or three names
    """
    # Reject unsupported input before any API client is created; without
    # this check the merge below failed on an unbound ``mergecol``.
    if len(cols) not in (1, 3):
        raise ValueError(
            "cols must name one gene column or three region columns "
            "(chromosome, start, end); got {} column(s)".format(len(cols)))

    def get_regions(row, cols):
        # Only add the 'chr' prefix when the chromosome value lacks it.
        prefix = '' if row[cols[0]].startswith('chr') else 'chr'
        return "{}{}:{}-{}".format(prefix, row[cols[0]], row[cols[1]],
                                   row[cols[2]])

    llama = LlamaEnsembl()
    ucsc = UCSCapi()
    dd = {'query': [], 'transcripts': []}
    if len(cols) == 1:
        # Bind the merge key before looping so that a dataframe without
        # rows still reaches the merge.
        mergecol = cols[0]
        for gene in df[mergecol].values:
            chrom, start, end = llama.get_gene_pos(gene)
            res = ucsc.query("chr{}:{}-{}".format(chrom, start, end))
            dd['query'].append(gene)
            dd['transcripts'].append(res.longest(gene)['transcript'])
    else:
        # Same reasoning: regions for which no gene is reported must not
        # leave the merge key undefined.
        mergecol = 'region'
        dd['genes'] = []
        df['region'] = df.apply(get_regions, axis=1, args=(cols,))
        for region in df['region'].values:
            res = ucsc.query(region)
            for gene in res.genes():
                dd['genes'].append(gene)
                dd['query'].append(region)
                dd['transcripts'].append(res.longest(gene)['transcript'])
    # Outer merge keeps input rows that produced no lookup result.
    ndf = df.merge(pd.DataFrame(dd),
                   left_on=mergecol,
                   right_on='query',
                   how='outer')
    return ndf[~ndf.duplicated()]
예제 #2
0
def test_gene_transcripts():
    """
    Print the longest transcript found for BRCA1.

    The gene position comes from ``LlamaEnsembl.get_gene_pos``; the
    matching ``chr<chrom>:<start>-<end>`` region is then queried through
    ``UCSCapi`` and the ``transcript`` entry of the longest hit is printed.
    """
    ensembl = LlamaEnsembl()
    ucsc_client = UCSCapi()
    gene_name = 'BRCA1'
    position = ensembl.get_gene_pos(gene_name)
    chrom, start, end = position
    region = "chr{}:{}-{}".format(chrom, start, end)
    hits = ucsc_client.query(region)
    longest = hits.longest(gene_name)
    print(longest['transcript'])
예제 #3
0
 def annotate(self):
     """
     Annotate the data frame held by this instance.

     ``self.df`` is passed to ``LlamaEnsembl.annotate_dataframe`` and the
     frame that call returns is joined onto ``self.df``.

     :return: ``self.df`` joined with the annotation frame
     """
     annotations = LlamaEnsembl().annotate_dataframe(self.df)
     return self.df.join(annotations)
예제 #4
0
def test_annotation_hg38():
    """
    Check gene and exon annotation with ``LlamaEnsembl(genome='hg38')``.

    Four regions are annotated with ``annotate_dataframe``; the returned
    ``genes`` and ``exons`` columns must match the expected ``gene_exp``
    and ``exon_exp`` columns on every row.
    """
    annotator = LlamaEnsembl(genome='hg38')
    fixture = {
        'CHROM': ['chr1', 'chr17', '7', '7'],
        'START': [153358330, 43092618, 116771890, 116773071],
        'END': [153358350, 43092648, 116771920, 116773072],
        'gene_exp': ['S100A9', 'BRCA1', 'MET', 'MET'],
        'exon_exp': ['2', '10', '14', ''],
        'strand_exp': ['+', '-', '+', '+'],
    }
    regions = pd.DataFrame(fixture)
    annotated = annotator.annotate_dataframe(regions)
    combined = pd.concat([regions, annotated], axis=1)
    print(combined)
    # Each (annotated, expected) column pair must agree on all rows.
    for got, want in (('genes', 'gene_exp'), ('exons', 'exon_exp')):
        mismatches = combined[combined[got] != combined[want]]
        assert len(mismatches) == 0
예제 #5
0
def test_annotation():
    """
    Check gene and exon annotation with a default ``LlamaEnsembl()``.

    Four regions are annotated with ``annotate_dataframe``; the returned
    ``genes`` and ``exons`` columns must match the expected ``gene_exp``
    and ``exon_exp`` columns on every row.
    """
    annotator = LlamaEnsembl()
    fixture = {
        'CHROM': ['chr1', 'chr17', '7', '7'],
        'START': [153330766, 41244573, 116412043, 116412044],
        'END': [153330797, 41245953, 116412043, 116412044],
        'gene_exp': ['S100A9', 'BRCA1', 'MET', 'MET'],
        'exon_exp': ['2', '10', '14', ''],
        'strand_exp': ['+', '-', '+', '+'],
    }
    regions = pd.DataFrame(fixture)
    annotated = annotator.annotate_dataframe(regions)
    combined = pd.concat([regions, annotated], axis=1)
    print(combined)
    # Each (annotated, expected) column pair must agree on all rows.
    for got, want in (('genes', 'gene_exp'), ('exons', 'exon_exp')):
        mismatches = combined[combined[got] != combined[want]]
        assert len(mismatches) == 0
예제 #6
0
def test_rsid():
    """
    Annotate a fixed list of rsIDs and print the resulting frame.

    The identifiers are passed to ``LlamaEnsembl.annotate_variants`` with
    the extra columns ``MAF`` and ``ambiguity``. Nothing is asserted; the
    result is only printed.
    """
    annotator = LlamaEnsembl()
    rsids = """rs1517114
    rs4646
    rs55886062
    rs3918290
    rs67376798
    rs75017182
    rs115232898
    rs1801158
    rs11615
    rs1800566
    rs7779029
    rs151264360
    rs4148323
    rs8175347"""
    # Continuation lines of the literal carry source indentation; strip it.
    identifiers = list(map(str.lstrip, rsids.split('\n')))
    annotated = annotator.annotate_variants(identifiers,
                                            extra_cols=['MAF', 'ambiguity'])
    print(annotated)
예제 #7
0
def test_cds_convert():
    """
    Print the result of ``get_cds_region`` for two variants.

    Both lookups use transcript ``NM_015506.2``, with the variant
    descriptions ``c.1A>G`` and ``c.445_446del``. Both calls are made
    before anything is printed.
    """
    converter = LlamaEnsembl()
    variants = ('c.1A>G', 'c.445_446del')
    regions = [converter.get_cds_region('NM_015506.2', variant)
               for variant in variants]
    for region in regions:
        print(region)