def yield_uniprot_gene_mappings(gene_ids):
    """Yield (Ensembl gene ref, Swissprot ref) pairs for the given gene ids.

    Runs a BioMart query against the 'mmusculus_gene_ensembl' dataset,
    filtered on the comma-joined ``gene_ids``, requesting the
    'ensembl_gene_id' and 'unified_uniprot_accession' attributes.  Rows whose
    accession column is empty are skipped.

    ``gene_ids`` must be an iterable of strings (it is passed to ``str.join``).
    """
    import csv
    import biopsy.identifiers.biomart as biomart

    query = biomart.new_query()
    mouse_dataset = biomart.add_dataset(query, 'mmusculus_gene_ensembl')
    biomart.add_filter(mouse_dataset, 'ensembl_gene_id', ",".join(gene_ids))
    for attribute in ('ensembl_gene_id', 'unified_uniprot_accession'):
        biomart.add_attribute(mouse_dataset, attribute)

    for record in csv.reader(biomart.execute_query(query), delimiter=','):
        accession = record[1]
        if not accession:
            # No accession reported for this gene.
            continue
        gene_ref = biopsy.DbRef.parse_as(record[0], biopsy.db.ensembl)
        protein_ref = biopsy.DbRef.parse_as(accession, biopsy.db.swissprot)
        yield gene_ref, protein_ref
def proteins_for_species(species):
    """Map the Ensembl genes of a BioMart dataset to (transcript, protein) pairs.

    Two BioMart queries are run against the dataset named by ``species``:

    1. 'uniprot_swissprot_accession' -- each non-empty accession is parsed as
       a Swissprot reference.
    2. 'protein' -- each non-empty accession found in
       ``entrez.mouse_proteins().acc2id`` contributes one Entrez protein
       reference per mapped id.

    Returns a ``cookbook.DictOfSets`` keyed by Ensembl gene reference whose
    entries are ``(transcript_ref, protein_ref)`` tuples.

    NOTE(review): a function with this same name is defined again right after
    this one; if both live in the same module the later definition is the one
    that takes effect.
    """
    from . import entrez

    mapping = cookbook.DictOfSets()

    # First pass: Swissprot accessions.
    swissprot_query = biomart.new_query()
    swissprot_dataset = biomart.add_dataset(swissprot_query, species)
    for attribute in ('ensembl_gene_id',
                      'ensembl_transcript_id',
                      'uniprot_swissprot_accession'):
        biomart.add_attribute(swissprot_dataset, attribute)
    for record in csv.reader(biomart.execute_query(swissprot_query), delimiter=','):
        gene = biopsy.transfac.DbRef.parse_as(record[0], biopsy.transfac.db.ensembl)
        transcript = biopsy.transfac.DbRef.parse_as(record[1], biopsy.transfac.db.ensembl)
        if not record[2]:
            continue
        swissprot = biopsy.transfac.DbRef.parse_as(record[2], biopsy.transfac.db.swissprot)
        mapping[gene].add((transcript, swissprot))

    # Second pass: Entrez protein accessions.
    entrez_query = biomart.new_query()
    entrez_dataset = biomart.add_dataset(entrez_query, species)
    for attribute in ('ensembl_gene_id', 'ensembl_transcript_id', 'protein'):
        biomart.add_attribute(entrez_dataset, attribute)
    for record in csv.reader(biomart.execute_query(entrez_query), delimiter=','):
        gene = biopsy.transfac.DbRef.parse_as(record[0], biopsy.transfac.db.ensembl)
        transcript = biopsy.transfac.DbRef.parse_as(record[1], biopsy.transfac.db.ensembl)
        if not record[2]:
            continue
        accession = record[2]
        if accession not in entrez.mouse_proteins().acc2id:
            continue
        for protein_id in entrez.mouse_proteins().acc2id[accession]:
            entrez_ref = T.DbRef(T.db.entrez_protein, "", protein_id)
            mapping[gene].add((transcript, entrez_ref))

    return mapping
def proteins_for_species(species):
    """Map Ensembl genes to their Swissprot and Entrez protein references.

    Two BioMart queries are run against the dataset named by ``species``,
    each requesting 'ensembl_gene_id' and 'ensembl_transcript_id' plus one
    protein attribute:

    * 'uniprot_swissprot_accession' -- every non-empty accession is parsed
      as a Swissprot reference.
    * 'protein' -- every non-empty accession that is present in
      ``entrez.mouse_proteins().acc2id`` yields one Entrez protein reference
      per id it maps to; unknown accessions are ignored.

    Returns a ``cookbook.DictOfSets`` keyed by Ensembl gene reference whose
    entries are ``(transcript_ref, protein_ref)`` tuples.

    NOTE(review): the Entrez lookup always uses ``entrez.mouse_proteins()``
    whatever ``species`` is passed -- confirm this is intended for non-mouse
    datasets.
    """
    from . import entrez

    result = cookbook.DictOfSets()

    # Swissprot accessions straight from BioMart.
    query = biomart.new_query()
    dataset = biomart.add_dataset(query, species)
    biomart.add_attribute(dataset, 'ensembl_gene_id')
    biomart.add_attribute(dataset, 'ensembl_transcript_id')
    biomart.add_attribute(dataset, 'uniprot_swissprot_accession')
    for row in csv.reader(biomart.execute_query(query), delimiter=','):
        # Gene and transcript ids are parsed for every row, as before, so a
        # malformed id is still reported even when the protein column is empty.
        gene_ref = biopsy.transfac.DbRef.parse_as(row[0], biopsy.transfac.db.ensembl)
        transcript_ref = biopsy.transfac.DbRef.parse_as(row[1], biopsy.transfac.db.ensembl)
        if not row[2]:
            continue
        protein_ref = biopsy.transfac.DbRef.parse_as(row[2], biopsy.transfac.db.swissprot)
        result[gene_ref].add((transcript_ref, protein_ref))

    # Entrez protein accessions, translated to ids through the entrez table.
    query = biomart.new_query()
    dataset = biomart.add_dataset(query, species)
    biomart.add_attribute(dataset, 'ensembl_gene_id')
    biomart.add_attribute(dataset, 'ensembl_transcript_id')
    biomart.add_attribute(dataset, 'protein')
    # Fetch the accession table once rather than up to twice per row.
    acc2id = entrez.mouse_proteins().acc2id
    for row in csv.reader(biomart.execute_query(query), delimiter=','):
        gene_ref = biopsy.transfac.DbRef.parse_as(row[0], biopsy.transfac.db.ensembl)
        transcript_ref = biopsy.transfac.DbRef.parse_as(row[1], biopsy.transfac.db.ensembl)
        protein_acc = row[2]
        if not protein_acc:
            continue
        # Membership is tested before indexing so that an unknown accession
        # never triggers a lookup (and thus never inserts a key, should the
        # table create entries on access).
        if protein_acc in acc2id:
            for protein_id in acc2id[protein_acc]:
                ref = T.DbRef(T.db.entrez_protein, "", protein_id)
                result[gene_ref].add((transcript_ref, ref))

    return result
def _mgi_ids_from_biomart():
    """Yield (Ensembl gene ref, MGI value) pairs for mouse genes via BioMart.

    Queries the 'mmusculus_gene_ensembl' dataset for 'ensembl_gene_id' and
    'external_gene_id'.  For each row whose second column is non-empty and is
    a key of ``mgi.acc2id()``, yields the parsed Ensembl gene reference
    together with whatever ``mgi.acc2id()`` maps that key to.  Other rows are
    skipped.

    NOTE(review): ``mgi`` is expected to be available at module level; the
    previous version imported ``entrez`` here but never used it.
    """
    # Build the BioMart query for gene id -> external gene id and execute it.
    query = biomart.new_query()
    dataset = biomart.add_dataset(query, 'mmusculus_gene_ensembl')
    biomart.add_attribute(dataset, 'ensembl_gene_id')
    biomart.add_attribute(dataset, 'external_gene_id')

    # Fetch the accession table once instead of twice per row.
    acc2id = mgi.acc2id()
    for row in csv.reader(biomart.execute_query(query), delimiter=','):
        # Parsed for every row, as before, so malformed gene ids still surface.
        gene_ref = biopsy.transfac.DbRef.parse_as(row[0], biopsy.transfac.db.ensembl)
        mgi_acc = row[1]
        if not mgi_acc:
            continue
        if mgi_acc in acc2id:
            yield gene_ref, acc2id[mgi_acc]
def get_orthologs(species1, species2):
    """Return a mapping from Ensembl genes of species1 to genes of species2.

    The BioMart query runs against the dataset of ``species2`` and requests
    'ensembl_gene_id' plus the '<short_name>_ensembl_gene' attribute built
    from ``species[species1].short_name``.  Rows with an empty second column
    are skipped.  The second column is used as the key and the first column
    as the member added under it, so the result is a ``cookbook.DictOfSets``
    keyed by the species1 gene reference.

    The query is also passed to ``biomart.print_query`` with the file name
    'query.xml' before it is executed.
    """
    orthologs = cookbook.DictOfSets()

    # Build the BioMart query and execute it.
    query = biomart.new_query()
    target_dataset = biomart.add_dataset(query, species[species2].dataset())
    biomart.add_attribute(target_dataset, 'ensembl_gene_id')
    biomart.add_attribute(
        target_dataset,
        '%s_ensembl_gene' % species[species1].short_name,
    )
    biomart.print_query(query, 'query.xml')

    for record in csv.reader(biomart.execute_query(query), delimiter=','):
        if not record[1]:
            # No ortholog reported for this gene.
            continue
        # Column 0 comes from the species2 dataset, column 1 is the species1
        # ortholog attribute (assuming columns follow attribute order).
        species2_gene = biopsy.transfac.DbRef.parse_as(record[0], biopsy.transfac.db.ensembl)
        species1_gene = biopsy.transfac.DbRef.parse_as(record[1], biopsy.transfac.db.ensembl)
        orthologs[species1_gene].add(species2_gene)

    return orthologs