from pyensembl import EnsemblRelease
from pandas import DataFrame

# Export every gene in Ensembl release 95 as (gene_id, gene_name) pairs.
ensembl = EnsemblRelease(95)

# One row per gene returned by the release, in the order genes() yields them.
gene_rows = [(g.gene_id, g.gene_name) for g in ensembl.genes()]
df = DataFrame(gene_rows, columns=['gene_id', 'gene_name'])

# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv('ensembl_to_gene_symbol.csv', index=False)
# Example #2
# 0
    'Y': "NC_000024"
}

# GRCh37 primary-assembly sequence IDs, taken from
# http://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.25/#/def_asm_Primary_Assembly
# Each entry is a versioned accession written as "<accession>.<version>".
NCBI_IDS_GRCh37 = {
    'NC_000001.10',
    'NC_000002.11',
    'NC_000003.11',
    'NC_000004.11',
    'NC_000005.9',
    'NC_000006.11',
    'NC_000007.13',
    'NC_000008.10',
    'NC_000009.11',
    'NC_000010.10',
    'NC_000011.9',
    'NC_000012.11',
    'NC_000013.10',
    'NC_000014.8',
    'NC_000015.9',
    'NC_000016.9',
    'NC_000017.10',
    'NC_000018.9',
    'NC_000019.9',
    'NC_000020.10',
    'NC_000021.8',
    'NC_000022.10',
    'NC_000023.10',
    'NC_000024.9',
}

# Open Ensembl release 75, asking pyensembl to fetch its data if needed.
data = EnsemblRelease(75, auto_download=True)

# Genes on contig 1, with no strand specified.
_genes = data.genes(contig=1, strand=None)

# Only the first ten genes are processed below.
list_of_genes = _genes[:10]

for item in list_of_genes:
    # Example of str(item):
    # Gene(id=ENSG00000196188, name=CTSE, biotype=protein_coding, location=1:206317459-206332104)
    #
    # Read the fields from the Gene object's attributes instead of splitting
    # that printed form: the printed layout is not a stable interface, and a
    # ',', '=' or ')' inside any field would silently corrupt the split.
    _id = item.id
    _name = item.name
    _biotype = item.biotype

    # Coordinates are kept as text so these names hold the same kind of
    # values that splitting the printed form would have produced.
    _chr = str(item.contig)
    _start = str(item.start)
    _stop = str(item.end)
    _location = '{}:{}-{}'.format(_chr, _start, _stop)