コード例 #1
0
def build_protein_maps(accessions):
    """
    Group protein ids by unversioned protein accession and by leading id.

    @param accessions: iterable of (id, protein accession, protein id)
        triples.
    @return: A pair of cookbook.DictOfSets. The first is keyed by the protein
        accession with everything from the first '.' onwards removed, the
        second by the first element of each triple; both hold sets of protein
        ids.
    """
    by_accession = cookbook.DictOfSets()
    by_leading_id = cookbook.DictOfSets()
    for leading_id, versioned_acc, protein_id in accessions:
        # 'NP_1234.2' and 'NP_1234.3' both end up under 'NP_1234'
        unversioned_acc = versioned_acc.split('.', 1)[0]
        by_accession[unversioned_acc].add(protein_id)
        by_leading_id[leading_id].add(protein_id)
    return by_accession, by_leading_id
コード例 #2
0
def get_protein_map():
    """
    Build a ProteinMap from the (accession, id, refs) triples produced by
    refs_for_mouse_protein_accs().

    @return: A ProteinMap whose acc_2_id maps each accession to the set of
        id.acc values seen for it, and whose xrefs maps each id.acc to the set
        of references that came with it. An id with no references gets no
        xrefs entry.
    """
    protein_map = ProteinMap(acc_2_id=cookbook.DictOfSets(),
                             xrefs=cookbook.DictOfSets())
    for accession, entry, references in refs_for_mouse_protein_accs():
        entry_acc = entry.acc
        protein_map.acc_2_id[accession].add(entry_acc)
        # add one by one so that xrefs is only touched when there is
        # something to store
        for reference in references:
            protein_map.xrefs[entry_acc].add(reference)
    return protein_map
コード例 #3
0
    def get_protein_accession_map():
        """
        Query Entrez to get a map from its protein accession to ids

        Searches the 'protein' database for 'mouse[orgn]', then downloads the
        accessions in pages of 10000 and pairs them positionally with the
        database ids of the search result.

        @return: A cookbook.DictOfSets keyed by accession (with everything
            from the first '.' onwards removed) holding the ids parsed as
            biopsy.transfac.db.entrez_protein references.
        """
        from Bio.EUtils import HistoryClient
        search_term = 'mouse[orgn]'

        # get a handle to the results
        client = HistoryClient.HistoryClient()
        results = client.search(db='protein', term=search_term)
        results_size = len(results)
        dbids = results.dbids.ids
        print('# results: %d' % results_size)

        # download them bit by bit
        acc_2_id = cookbook.DictOfSets()
        step = 10000
        for start in xrange(0, results_size, step):
            size = min(step, results_size - start)
            end = start + size
            results.retstart = start
            # E-utilities define retmax as the number of records to return,
            # not an end offset; asking for `end` records made every page
            # after the first request more than the zip() below consumes.
            results.retmax = size
            print('Getting %d->%d' % (start, end))
            for protein_id, acc in zip(dbids[start:end],
                                       results.efetch(retmode='text',
                                                      rettype='acc')):
                # one accession per line; drop the newline and the version
                acc = acc.strip().split('.')[0]
                print('%s %s' % (acc, protein_id))
                acc_2_id[acc].add(
                    biopsy.transfac.DbRef.parse_as(
                        protein_id, biopsy.transfac.db.entrez_protein))

        return acc_2_id
コード例 #4
0
def build_mgi_go_map():
    """
    Read the tab-separated file named by gene_association_filename and group
    its fifth column by the MGI reference parsed from its second column.

    Lines whose first field starts with '!' are treated as comments and
    skipped, as are blank lines.

    @return: A map between MGI identifiers and GO ontologies: a
        cookbook.DictOfSets keyed by DbRef.parse_as(column 2, db.mgi) whose
        values are sets of the column 5 strings.
    """
    import csv
    import cookbook
    from biopsy import DbRef, db
    result = cookbook.DictOfSets()
    association_file = open(gene_association_filename, "r")
    # try/finally rather than `with` so the handle is always released without
    # requiring a newer interpreter than the rest of this module does
    try:
        for row in csv.reader(association_file, delimiter='\t'):
            # csv yields an empty list for a blank line; row[0] would raise
            if not row or row[0].startswith('!'):
                continue
            result[DbRef.parse_as(row[1], db.mgi)].add(row[4])
    finally:
        association_file.close()
    return result
コード例 #5
0
ファイル: ensembl.py プロジェクト: pombredanne/biopsy
def _mgi_ids_from_biomart():
    """Goes to biomart to map all mouse genes to MGI ids.

    This is a generator, so no query is made until it is iterated. For every
    row of the 'mmusculus_gene_ensembl' dataset whose 'external_gene_id'
    column is non-empty and is a key of mgi.acc2id(), it yields the pair
    (ensembl gene DbRef, mgi.acc2id()[external_gene_id]).
    """
    # NOTE(review): entrez is not referenced below; the import is kept in case
    # it is relied on for side effects -- confirm and remove if it is not.
    from . import entrez

    # build biomart query for ensembl gene id / external gene id and execute
    query = biomart.new_query()
    dataset = biomart.add_dataset(query, 'mmusculus_gene_ensembl')
    biomart.add_attribute(dataset, 'ensembl_gene_id')
    biomart.add_attribute(dataset, 'external_gene_id')
    for row in csv.reader(biomart.execute_query(query), delimiter=','):
        gene_ref = biopsy.transfac.DbRef.parse_as(row[0],
                                                  biopsy.transfac.db.ensembl)
        mgi_acc = row[1]
        if not mgi_acc:
            continue
        # fetch the accession table once per row rather than once for the
        # membership test and again for the lookup
        acc2id = mgi.acc2id()
        if mgi_acc in acc2id:
            yield gene_ref, acc2id[mgi_acc]
コード例 #6
0
ファイル: ensembl.py プロジェクト: pombredanne/biopsy
def get_orthologs(species1, species2):
    """Returns a dict mapping ensembl genes of species 1 to genes of species 2.

    The query runs against the dataset of species2 and asks for its own gene
    id plus the '<species1 short name>_ensembl_gene' attribute. Rows with an
    empty second column are ignored. The result is a cookbook.DictOfSets
    keyed by the reference parsed from the second column, holding the set of
    references parsed from the first column.
    """
    orthologs = cookbook.DictOfSets()

    # build biomart query and execute
    query = biomart.new_query()
    dataset = biomart.add_dataset(query, species[species2].dataset())
    homolog_attribute = '%s_ensembl_gene' % species[species1].short_name
    biomart.add_attribute(dataset, 'ensembl_gene_id')
    biomart.add_attribute(dataset, homolog_attribute)
    biomart.print_query(query, 'query.xml')

    rows = csv.reader(biomart.execute_query(query), delimiter=',')
    for row in rows:
        if not row[1]:
            continue
        dataset_gene = biopsy.transfac.DbRef.parse_as(
            row[0], biopsy.transfac.db.ensembl)
        homolog_gene = biopsy.transfac.DbRef.parse_as(
            row[1], biopsy.transfac.db.ensembl)
        orthologs[homolog_gene].add(dataset_gene)

    return orthologs
コード例 #7
0
ファイル: ensembl.py プロジェクト: pombredanne/biopsy
def proteins_for_species(species):
    """Goes to biomart to map all genes in species to their proteins.

    Two queries are run against the dataset named by `species`, each asking
    for gene id, transcript id and one protein column:

    - 'uniprot_swissprot_accession', parsed as a swissprot reference;
    - 'protein', looked up in entrez.mouse_proteins().acc2id and turned into
      one entrez_protein reference per id found there. This lookup uses the
      mouse protein table whatever `species` is.

    Returns a cookbook.DictOfSets keyed by ensembl gene reference whose
    values are sets of (transcript reference, protein reference) pairs.
    """
    from . import entrez

    def _ref_rows(protein_attribute):
        # Run one query; yield (gene ref, transcript ref, protein column).
        query = biomart.new_query()
        dataset = biomart.add_dataset(query, species)
        for attribute in ('ensembl_gene_id',
                          'ensembl_transcript_id',
                          protein_attribute):
            biomart.add_attribute(dataset, attribute)
        for row in csv.reader(biomart.execute_query(query), delimiter=','):
            gene = biopsy.transfac.DbRef.parse_as(
                row[0], biopsy.transfac.db.ensembl)
            transcript = biopsy.transfac.DbRef.parse_as(
                row[1], biopsy.transfac.db.ensembl)
            yield gene, transcript, row[2]

    proteins = cookbook.DictOfSets()

    # swissprot accessions map straight onto references
    for gene, transcript, accession in _ref_rows(
            'uniprot_swissprot_accession'):
        if accession:
            swissprot_ref = biopsy.transfac.DbRef.parse_as(
                accession, biopsy.transfac.db.swissprot)
            proteins[gene].add((transcript, swissprot_ref))

    # entrez protein accessions have to be resolved to ids first
    for gene, transcript, accession in _ref_rows('protein'):
        if not accession:
            continue
        if accession in entrez.mouse_proteins().acc2id:
            for protein_id in entrez.mouse_proteins().acc2id[accession]:
                entrez_ref = T.DbRef(T.db.entrez_protein, "", protein_id)
                proteins[gene].add((transcript, entrez_ref))

    return proteins
コード例 #8
0
    def get_protein_accession_map_2():
        """
        Collect a map from protein accession to Entrez protein references
        using search summaries.

        Searches the 'protein' database for 'mouse[orgn]', walks the result
        set in pages of 5000 and, for every summary entry, adds the entry id
        (parsed as a biopsy.transfac.db.entrez_protein reference) to the set
        keyed by the entry's 'Caption' with everything from the first '.'
        onwards removed.

        NOTE(review): no return statement is visible in this section, so as
        shown the function returns None and acc_2_ids is discarded; confirm
        whether the definition continues or a `return acc_2_ids` is missing.
        """
        # NOTE(review): ET is not referenced in the visible part of this
        # function -- confirm whether the import is still needed.
        import elementtree.ElementTree as ET
        search_term = 'mouse[orgn]'
        #search_term = 'MYOD[Gene name] AND mouse[orgn]'

        # run the search once; the pages below are read from this result set
        client = HistoryClient.HistoryClient()
        results = client.search(db='protein', term=search_term)
        results_size = len(results)
        print '# results: %d' % results_size

        acc_2_ids = cookbook.DictOfSets()
        step = 5000
        for start in xrange(0, results_size, step):
            # select the page: offset, then number of records (the last page
            # may be shorter than step)
            results.retstart = start
            results.retmax = min(step, results_size - start)
            #results.retmax = 1000
            # NOTE(review): this rebinds the loop variable `start` to a
            # timestamp. The xrange iteration is unaffected, but `start` no
            # longer holds the page offset below this line.
            start = time.time()
            summary = results.summary()
            print 'Retrieving summary: %f secs' % (time.time() - start)
            for entry in summary:
                # strip whitespace and the version suffix from the caption
                acc = entry.dataitems['Caption'].encode().strip().split('.')[0]
                acc_2_ids[acc].add(
                    biopsy.transfac.DbRef.parse_as(
                        entry.id.encode(), biopsy.transfac.db.entrez_protein))