def build_protein_maps(accessions):
    """
    Build two lookup tables of protein ids from accession records.

    @param accessions: Iterable of (id, protein accession, protein id)
        triples.
    @return: A pair (protein_acc_map, gene_protein_map). The first is keyed
        by the protein accession text before its first '.', the second by
        the record's id; each value is the set of protein ids seen for
        that key.
    """
    by_accession = cookbook.DictOfSets()
    by_gene = cookbook.DictOfSets()
    for record in accessions:
        gene_id, accession, protein_id = record
        # Only the part of the accession before the first '.' is used as key.
        base_accession = accession.split('.')[0]
        by_accession[base_accession].add(protein_id)
        by_gene[gene_id].add(protein_id)
    return by_accession, by_gene
def get_protein_map():
    """
    Query Entrez to get a map from its protein accessions to ids and xrefs

    @return: A ProteinMap whose acc_2_id maps each accession to the set of
        '.acc' values of its ids, and whose xrefs maps each of those '.acc'
        values to the set of references reported alongside it.
    """
    protein_map = ProteinMap(
        acc_2_id=cookbook.DictOfSets(),
        xrefs=cookbook.DictOfSets(),
    )
    for accession, entry, references in refs_for_mouse_protein_accs():
        protein_map.acc_2_id[accession].add(entry.acc)
        # Added one at a time so that an entry without references does not
        # get touched in the xrefs table.
        for reference in references:
            protein_map.xrefs[entry.acc].add(reference)
    return protein_map
def get_protein_accession_map():
    """
    Query Entrez to get a map from its protein accession to ids

    Searches the Entrez 'protein' database for 'mouse[orgn]' and downloads
    the accessions of the hits in batches, pairing each downloaded line with
    the database id at the same position in the search result.

    @return: A cookbook.DictOfSets keyed by accession (the text before the
        first '.'); each value is the set of entrez_protein DbRefs parsed
        from the paired ids.
    """
    from Bio.EUtils import HistoryClient

    search_term = 'mouse[orgn]'
    #search_term = 'MYOD[Gene name] AND mouse[orgn]'

    # get a handle to the results
    client = HistoryClient.HistoryClient()
    results = client.search(db='protein', term=search_term)
    results_size = len(results)
    dbids = results.dbids.ids
    print('# results: %d' % results_size)

    # download them bit by bit
    acc_2_id = cookbook.DictOfSets()
    step = 10000
    for start in xrange(0, results_size, step):
        size = min(step, results_size - start)
        end = start + size
        results.retstart = start
        # retmax is a record count in the E-utilities API, not an end index,
        # so request exactly this batch. Setting it to `end` asked for more
        # records on every iteration than the zip below consumes.
        # NOTE(review): assumes Bio.EUtils forwards retstart/retmax to the
        # request - confirm.
        results.retmax = size
        print('Getting %d->%d' % (start, end))
        fetched = results.efetch(retmode='text', rettype='acc')
        for db_id, acc in zip(dbids[start:end], fetched):
            acc = acc.strip().split('.')[0]
            print('%s %s' % (acc, db_id))
            acc_2_id[acc].add(
                biopsy.transfac.DbRef.parse_as(
                    db_id, biopsy.transfac.db.entrez_protein))
    return acc_2_id
def build_mgi_go_map():
    """
    Build a map between MGI identifiers and GO ontologies.

    Reads the tab-delimited file named by gene_association_filename,
    ignoring blank rows and rows whose first field starts with '!'.

    @return: A cookbook.DictOfSets keyed by the second column parsed as an
        MGI DbRef; each value is the set of fifth-column entries found for
        that key (presumably GO ids - confirm against the file format).
    """
    import csv
    import cookbook
    from biopsy import DbRef, db

    result = cookbook.DictOfSets()
    handle = open(gene_association_filename, "r")
    # try/finally guarantees the file is closed even if a row fails to parse,
    # and keeps working on interpreters that predate the `with` statement.
    try:
        for row in csv.reader(handle, delimiter='\t'):
            # csv yields [] for a blank line; skip it along with '!' rows.
            if not row or row[0].startswith('!'):
                continue
            result[DbRef.parse_as(row[1], db.mgi)].add(row[4])
    finally:
        handle.close()
    return result
def _mgi_ids_from_biomart():
    """
    Goes to biomart to map all mouse genes to MGI ids

    Queries the 'mmusculus_gene_ensembl' dataset for the 'ensembl_gene_id'
    and 'external_gene_id' attributes.

    @return: Generator of (gene_ref, mgi_id) pairs, where gene_ref is the
        first column parsed as an ensembl DbRef and mgi_id is the entry
        that mgi.acc2id() holds for the second column. Rows whose second
        column is empty or unknown to mgi.acc2id() produce nothing.
    """
    # build biomart query for the external gene ids and execute
    query = biomart.new_query()
    dataset = biomart.add_dataset(query, 'mmusculus_gene_ensembl')
    biomart.add_attribute(dataset, 'ensembl_gene_id')
    biomart.add_attribute(dataset, 'external_gene_id')
    for row in csv.reader(biomart.execute_query(query), delimiter=','):
        gene_ref = biopsy.transfac.DbRef.parse_as(
            row[0], biopsy.transfac.db.ensembl)
        mgi_acc = row[1]
        if not mgi_acc:
            continue
        # Fetch the lookup table once per row rather than once for the
        # membership test and again for the indexing.
        acc2id = mgi.acc2id()
        if mgi_acc in acc2id:
            yield gene_ref, acc2id[mgi_acc]
def get_orthologs(species1, species2):
    """
    Returns a dict mapping ensembl genes of species 1 to genes of species 2.

    The biomart query runs against the dataset of species2 and requests its
    'ensembl_gene_id' plus the '<short_name>_ensembl_gene' attribute built
    from species1's short name. Assuming biomart returns the columns in the
    order the attributes were added, keys come from the species1 column and
    values from the species2 column; rows with an empty species1 column are
    skipped.

    @param species1: Key into `species` for the genes used as dict keys.
    @param species2: Key into `species` for the dataset that is queried.
    @return: A cookbook.DictOfSets of ensembl DbRefs.
    """
    orthologs = cookbook.DictOfSets()

    # build biomart query and execute
    query = biomart.new_query()
    dataset = biomart.add_dataset(query, species[species2].dataset())
    biomart.add_attribute(dataset, 'ensembl_gene_id')
    ortholog_attribute = '%s_ensembl_gene' % species[species1].short_name
    biomart.add_attribute(dataset, ortholog_attribute)
    biomart.print_query(query, 'query.xml')

    rows = csv.reader(biomart.execute_query(query), delimiter=',')
    for row in rows:
        if not row[1]:
            continue
        species2_gene = biopsy.transfac.DbRef.parse_as(
            row[0], biopsy.transfac.db.ensembl)
        species1_gene = biopsy.transfac.DbRef.parse_as(
            row[1], biopsy.transfac.db.ensembl)
        orthologs[species1_gene].add(species2_gene)
    return orthologs
def _gene_transcript_rows(species, attribute):
    """
    Query biomart for one attribute alongside gene and transcript ids.

    @param species: Dataset name passed to biomart.add_dataset.
    @param attribute: Name of the third biomart attribute to request.
    @return: Generator of (gene_ref, transcript_ref, value) triples, one per
        result row. The two refs are the first two columns parsed as ensembl
        DbRefs; value is the raw third column and may be empty.
    """
    query = biomart.new_query()
    dataset = biomart.add_dataset(query, species)
    biomart.add_attribute(dataset, 'ensembl_gene_id')
    biomart.add_attribute(dataset, 'ensembl_transcript_id')
    biomart.add_attribute(dataset, attribute)
    for row in csv.reader(biomart.execute_query(query), delimiter=','):
        gene_ref = biopsy.transfac.DbRef.parse_as(
            row[0], biopsy.transfac.db.ensembl)
        transcript_ref = biopsy.transfac.DbRef.parse_as(
            row[1], biopsy.transfac.db.ensembl)
        yield gene_ref, transcript_ref, row[2]


def proteins_for_species(species):
    """
    Goes to biomart to map all genes in species to Swissprot and Entrez
    proteins

    Runs two biomart queries against the same dataset: one for
    'uniprot_swissprot_accession' and one for 'protein'. Accessions from the
    second query are kept only if entrez.mouse_proteins().acc2id knows them.

    @param species: Dataset name passed to biomart.add_dataset.
    @return: A cookbook.DictOfSets keyed by ensembl gene DbRef; each value is
        a set of (transcript DbRef, protein DbRef) pairs.
    """
    from . import entrez

    result = cookbook.DictOfSets()

    # swissprot proteins
    swissprot_rows = _gene_transcript_rows(
        species, 'uniprot_swissprot_accession')
    for gene_ref, transcript_ref, swissprot_acc in swissprot_rows:
        if swissprot_acc:
            protein_ref = biopsy.transfac.DbRef.parse_as(
                swissprot_acc, biopsy.transfac.db.swissprot)
            result[gene_ref].add((transcript_ref, protein_ref))

    # entrez proteins
    # NOTE(review): the lookup below always uses entrez.mouse_proteins(),
    # whatever `species` is - confirm this is intended for non-mouse datasets.
    entrez_rows = _gene_transcript_rows(species, 'protein')
    for gene_ref, transcript_ref, protein_acc in entrez_rows:
        if not protein_acc:
            continue
        acc2id = entrez.mouse_proteins().acc2id
        if protein_acc in acc2id:
            for protein_id in acc2id[protein_acc]:
                # NOTE(review): T is presumably an alias for biopsy.transfac
                # defined elsewhere in this module - confirm.
                ref = T.DbRef(T.db.entrez_protein, "", protein_id)
                result[gene_ref].add((transcript_ref, ref))
    return result
def get_protein_accession_map_2():
    """
    Query Entrez summaries to get a map from protein accession to ids

    Searches the Entrez 'protein' database for 'mouse[orgn]' and walks the
    hits in batches, reading each summary entry's 'Caption' item and id.

    @return: A cookbook.DictOfSets keyed by the caption text before its
        first '.'; each value is the set of entrez_protein DbRefs parsed
        from the matching entry ids.
    """
    search_term = 'mouse[orgn]'
    #search_term = 'MYOD[Gene name] AND mouse[orgn]'

    client = HistoryClient.HistoryClient()
    results = client.search(db='protein', term=search_term)
    results_size = len(results)
    print('# results: %d' % results_size)

    acc_2_ids = cookbook.DictOfSets()
    step = 5000
    for start in xrange(0, results_size, step):
        results.retstart = start
        results.retmax = min(step, results_size - start)
        #results.retmax = 1000
        # Separate name for the timer so the batch offset is not clobbered.
        started_at = time.time()
        summary = results.summary()
        print('Retrieving summary: %f secs' % (time.time() - started_at))
        for entry in summary:
            acc = entry.dataitems['Caption'].encode().strip().split('.')[0]
            acc_2_ids[acc].add(
                biopsy.transfac.DbRef.parse_as(
                    entry.id.encode(), biopsy.transfac.db.entrez_protein))
    # Hand the map back to the caller, as get_protein_accession_map does.
    return acc_2_ids