def buildIdentifierMap(query_species):
    '''build map of external gene identifiers to ensembl gene ids.

    The biomart dataset ``<query_species>_gene_ensembl`` is queried for
    the columns ``ensembl_gene_id``, ``entrezgene``, ``hgnc_id`` and
    ``hgnc_symbol``. Every non-empty value found in one of the three
    external columns becomes a key of the returned map.

    Identifiers equal to ``NA`` or the empty string are ignored, as are
    rows whose ensembl gene id starts with ``LRG_``.

    Arguments
    ---------
    query_species : string
        Prefix of the biomart dataset name.

    Returns
    -------
    map_identifiers : dict
        Maps each external identifier (as a string) to a list of
        ``(ensembl_gene_id, column_name)`` tuples. The order within
        each list is arbitrary, as the tuples are collected in a set.
    '''
    gene_column = 'ensembl_gene_id'
    xref_columns = ('entrezgene', 'hgnc_id', 'hgnc_symbol')

    rows = Biomart.biomart_iterator(
        (gene_column,) + xref_columns,
        dataset="%s_gene_ensembl" % query_species)

    collected = collections.defaultdict(set)
    for row in rows:
        ensid = row[gene_column]
        for column in xref_columns:
            xid = str(row[column])
            # The identifier test comes first so that the ensembl id is
            # only inspected for rows that carry a usable identifier.
            if xid in ("NA", "") or ensid.startswith('LRG_'):
                continue
            collected[xid].add((ensid, column))

    # convert the sets to lists
    return {xid: list(hits) for xid, hits in collected.items()}
def buildIdentifierMap(query_species):
    '''build a lookup from external identifiers to ensembl genes.

    Data are read from the biomart dataset
    ``<query_species>_gene_ensembl`` using the columns
    ``ensembl_gene_id``, ``entrezgene``, ``hgnc_id`` and
    ``hgnc_symbol``.

    Values of ``NA`` or the empty string are not recorded, and neither
    are entries whose ensembl gene id begins with ``LRG_``.

    Arguments
    ---------
    query_species : string
        Prefix of the biomart dataset name.

    Returns
    -------
    map_identifiers : dict
        Keys are external identifiers converted to strings; values are
        lists of ``(ensembl_gene_id, column_name)`` tuples in arbitrary
        order.
    '''
    columns = ('ensembl_gene_id', 'entrezgene', 'hgnc_id', 'hgnc_symbol')
    external_columns = columns[1:]

    records = Biomart.biomart_iterator(
        columns,
        dataset="%s_gene_ensembl" % query_species)

    identifier_sets = collections.defaultdict(set)
    for record in records:
        ensid = record['ensembl_gene_id']
        for column in external_columns:
            xid = str(record[column])
            # Keep only real identifiers that belong to non-LRG genes;
            # the ensembl id is checked last, as before.
            if xid != "NA" and xid != "" and not ensid.startswith('LRG_'):
                identifier_sets[xid].add((ensid, column))

    # convert to lists
    map_identifiers = {}
    for xid, members in identifier_sets.items():
        map_identifiers[xid] = list(members)
    return map_identifiers
def buildOrthologyMap(query_species,
                      target_species,
                      filter_type="ortholog_one2one"):
    '''build map of genes in query species to those in target species.

    The biomart dataset ``<query_species>_gene_ensembl`` is queried for
    the ensembl gene id together with the homolog gene, orthology type
    and orthology confidence columns of `target_species`. Only rows
    whose orthology type equals `filter_type` are kept.

    Arguments
    ---------
    query_species : string
        Prefix of the biomart dataset name.
    target_species : string
        Prefix of the ``*_homolog_*`` biomart columns.
    filter_type : string
        Orthology type a row must have to be included.

    Returns
    -------
    map_query2target : dict
        Maps ensembl gene ids of the query species to the homolog gene
        of the target species. If a query gene occurs in several
        matching rows, the last row wins.
    '''
    gene_column = 'ensembl_gene_id'
    homolog_column = '%s_homolog_ensembl_gene' % target_species
    type_column = '%s_homolog_orthology_type' % target_species
    confidence_column = '%s_homolog_orthology_confidence' % target_species

    rows = Biomart.biomart_iterator(
        (gene_column, homolog_column, type_column, confidence_column),
        dataset="%s_gene_ensembl" % query_species)

    map_query2target = {}
    for row in rows:
        if row[type_column] != filter_type:
            continue
        query_gene, target_gene = row[gene_column], row[homolog_column]
        map_query2target[query_gene] = target_gene
    return map_query2target
def buildOrthologyMap(query_species, target_species,
                      filter_type="ortholog_one2one"):
    '''build map of genes in query species to those in target species.

    Rows are fetched from the biomart dataset
    ``<query_species>_gene_ensembl`` and restricted to those whose
    ``<target_species>_homolog_orthology_type`` equals `filter_type`.

    Arguments
    ---------
    query_species : string
        Prefix of the biomart dataset name.
    target_species : string
        Prefix of the ``*_homolog_*`` biomart columns.
    filter_type : string
        Orthology type to retain.

    Returns
    -------
    map_query2target : dict
        Keys are values of ``ensembl_gene_id``; values are the
        corresponding ``<target_species>_homolog_ensembl_gene``. Later
        rows overwrite earlier rows with the same key.
    '''
    # Column names depend only on the target species, so build them once.
    query_key = 'ensembl_gene_id'
    target_key = '%s_homolog_ensembl_gene' % target_species
    orthology_key = '%s_homolog_orthology_type' % target_species
    columns = (query_key,
               target_key,
               orthology_key,
               '%s_homolog_orthology_confidence' % target_species)

    data = Biomart.biomart_iterator(
        columns, dataset="%s_gene_ensembl" % query_species)

    return {
        entry[query_key]: entry[target_key]
        for entry in data
        if entry[orthology_key] == filter_type
    }
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations package.

    .. note::

        Since KEGG is no longer publicly available, this is not
        up-to-date and may be removed from bioconductor in future
        releases.

    The table written to outfile has the following columns:
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name``, ``evidence``.
    ``ontology`` is always ``kegg`` and ``evidence`` is always ``NA``.

    Genes whose name in ``KEGGEXTID2PATHID`` cannot be converted to an
    integer, or for which biomart supplies no ensembl gene id, are
    skipped.

    Arguments
    ---------
    outfile : string
        Output filename; a tab-separated table is written to it.
    mart : string
        Passed to biomart as the ``biomart`` argument.
    host : string
        Passed to biomart as the ``host`` argument.
    biomart_dataset : string
        Biomart dataset to query. A warning is issued unless it starts
        with one of ``rnorvegicus``, ``scerevisiae``, ``hsapiens`` or
        ``mmusculus``.
    '''
    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    entrez2ensembl = Biomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")
    # NOTE(review): the lookup below uses integer keys, so this assumes
    # biomart delivers ``entrezgene`` as integers - confirm.
    entrez2ensembl = dict((x['entrezgene'], x['ensembl_gene_id'])
                          for x in entrez2ensembl)
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # The numeric part of a pathway id is the key into pathid2name.
    pathid_pattern = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally guarantees the output is flushed and closed even if a
    # pathway lookup fails halfway through; only write() and close()
    # are assumed of the returned handle.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathid_pattern.match(pathway).group(1)
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations package.

    .. note::

        Since KEGG is no longer publicly available, this is not
        up-to-date and may be removed from bioconductor in future
        releases.

    The table written to outfile has the following columns:
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name``, ``evidence``.

    Entries of ``KEGGEXTID2PATHID`` whose name is not an integer, or
    that have no ensembl gene id in the biomart mapping, are omitted.

    Arguments
    ---------
    outfile : string
        Output filename in :term:`tsv` format.
    mart : string
        Name of the biomart
    host : string
        Host name of the biomart server
    biomart_dataset : string
        Biomart dataset
    '''
    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    # Generates an iterator containing the data from biomart
    entrez2ensembl = Biomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")
    # NOTE(review): keys are later compared against int(gene), so
    # ``entrezgene`` is presumably returned as an integer - confirm.
    entrez2ensembl = dict((x['entrezgene'], x['ensembl_gene_id'])
                          for x in entrez2ensembl)
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    # dict() consumes the zip directly; no intermediate list is needed.
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # Compiled once: extracts the digits that follow the letter prefix
    # of a pathway id.
    pathid_pattern = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # Close the handle on every exit path so that buffered rows reach
    # the file even when an error interrupts the loop.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathid_pattern.match(pathway).group(1)
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        outf.close()