def testFetchTaxaAnnotations(self):
    """
    test the GoTerm and GoAnnotation tables

    Calls fetch_taxa_annotations for self.testID (useIea=False) and checks
    that the uniprot entry 'Q9Y765' is annotated with 'GO:0018343'.
    """

    print("fetching annotations for taxa")
    # only the uniprot-keyed annotations are inspected by this test
    _, uniprotAnnots = fetch_taxa_annotations([self.testID], self.engine,
                                              useIea=False, verbose=True)

    # assertIn reports the missing term and the container on failure,
    # whereas assertTrue would only report "False is not true"
    self.assertIn('GO:0018343', uniprotAnnots['Q9Y765'])
def summarize(self, refTaxon, termsPath):
    """
    GO object summary and sanity check

    refTaxon  -- taxon id of the reference taxon; it is converted to a string,
                 must be present in self.taxaList and is compared against
                 Taxon.ncbi_id
    termsPath -- path passed to self.load_dicts to obtain gene2go and go2gene

    Raises an Exception when refTaxon is not in self.taxaList.
    Returns nothing; the summary is printed.
    """

    refTaxon = str(refTaxon)
    if refTaxon not in self.taxaList:
        raise Exception("refTaxon not present in taxaList")

    gene2go, go2gene = self.load_dicts(termsPath=termsPath)

    ## collect the genes (ncbi_id -> db id) of every taxon in taxaList
    gene2id = {}
    refGenes = {}  # stays empty when the reference taxon has no Taxon row
    conn = self.engine.connect()
    try:
        s = select([Taxon.id, Taxon.ncbi_id, Taxon.name]).where(
            Taxon.ncbi_id.in_(self.taxaList))
        taxaQueries = conn.execute(s).fetchall()

        for tquery in taxaQueries:
            s = select([Gene.id, Gene.ncbi_id], Gene.taxa_id == tquery['id'])
            geneRows = conn.execute(s).fetchall()
            taxaDict = dict((str(r['ncbi_id']), str(r['id'])) for r in geneRows)
            if str(tquery['ncbi_id']) == refTaxon:
                refGenes = taxaDict.copy()
            print("there are %s genes from %s (%s)" % (len(taxaDict), tquery['name'],
                                                       tquery['ncbi_id']))
            gene2id.update(taxaDict)
    finally:
        # always hand the connection back, even if a query fails
        conn.close()

    ## check for unmatched genes
    # iterate over the gene keys; iterating over items() would test
    # (gene, terms) tuples against gene2id and never match
    unmatched = sum(1 for gene in gene2go if gene not in gene2id)

    print("Summary")
    print("IEA annotations: %s" % self.useIea)
    print("total genes in combined taxa: %s" % (len(gene2id)))
    if unmatched > 0:
        print("WARNING: there were unmatched genes unmatched: %s" % unmatched)
    print("total genes with at least one annotation: %s" % (len(gene2go)))
    print("total unique annotations: %s" % (len(go2gene)))
    print("---------------------")

    ## annotations that come from the reference taxon alone
    _gene2go, _prot2go = fetch_taxa_annotations([refTaxon], self.engine,
                                                aspect=self.aspect,
                                                useIea=self.useIea)
    total = sum(len(terms) for terms in _gene2go.values())
    print('RefTaxa genes: %s' % (len(refGenes)))
    print('Only RefTaxa: %s annotated genes, %s total annotations' % (len(_gene2go), total))

    total = sum(len(terms) for terms in gene2go.values())
    print('With additional taxa: %s annotated genes, %s total annotations' % (len(gene2go), total))

    # the value is a fraction (annotated genes / reference genes); guard
    # against a reference taxon without genes instead of dividing by zero
    if refGenes:
        fraction = float(len(gene2go)) / float(len(refGenes))
    else:
        fraction = float('nan')
    print('Percent annotation: %s' % fraction)
def testFetchTaxaAnnotations(self):
    """
    test the GoTerm and GoAnnotation tables

    Fetches the annotations of self.testID through fetch_taxa_annotations
    (useIea=False, verbose=True) and verifies that 'GO:0018343' is among
    the terms returned for the uniprot entry 'Q9Y765'.
    """

    print("fetching annotations for taxa")
    # the gene-keyed half of the result is not needed here
    _, uniprotAnnots = fetch_taxa_annotations([self.testID], self.engine,
                                              useIea=False, verbose=True)

    # assertIn gives a descriptive failure message (term and container)
    self.assertIn('GO:0018343', uniprotAnnots['Q9Y765'])
def create_dicts(self, termsPath, accepted=None):
    """
    get the go2gene and gene2go dictionaries

    termsPath  -- path of the pickle file that receives [gene2go, go2gene]
    'accepted' -- list of genes that restrict included terms to a particular list;
                  when empty or None every gene is kept

    Raises an Exception when self.aspect is not one of 'biological_process',
    'molecular_function' or 'cellular_component'.
    """

    # resolve the pickle module locally so the block runs on Python 2
    # (cPickle) as well as Python 3 (pickle)
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    ## error checking
    if self.aspect not in ('biological_process', 'molecular_function', 'cellular_component'):
        raise Exception("Invalid aspect specified: %s" % self.aspect)

    ## gene2go
    print("...creating gene2go dictionary -- this may take several minutes or longer depending on the number of genes")
    _gene2go, _ = fetch_taxa_annotations(self.taxaList, self.engine,
                                         aspect=self.aspect,
                                         useIea=self.useIea)

    print("...creating go2gene dictionary -- this may take several minutes")
    # a set makes the per-gene membership test O(1); an empty/None
    # 'accepted' means "no restriction", as before
    acceptedGenes = set(accepted) if accepted else None

    go2gene = {}
    gene2go = {}
    for gene, terms in _gene2go.items():
        if acceptedGenes is not None and gene not in acceptedGenes:
            continue
        gene2go[gene] = terms
        for term in terms:
            go2gene.setdefault(term, set()).add(gene)

    # store plain lists rather than sets in the pickled dictionary
    go2gene = dict((term, list(genes)) for term, genes in go2gene.items())

    ## pickle the dictionaries
    # binary mode is required by pickle on Python 3; 'with' closes the
    # file even when dumping fails
    with open(termsPath, 'wb') as handle:
        pickle.dump([gene2go, go2gene], handle)
def create_dicts(self, termsPath, accepted=None):
    """
    get the go2gene and gene2go dictionaries

    termsPath  -- path of the pickle file that receives [gene2go, go2gene]
    'accepted' -- list of genes that restrict included terms to a particular list;
                  when empty or None every gene is kept

    Raises an Exception when self.aspect is not one of 'biological_process',
    'molecular_function' or 'cellular_component'.
    """

    ## error checking
    if self.aspect not in ('biological_process', 'molecular_function', 'cellular_component'):
        raise Exception("Invalid aspect specified: %s" % self.aspect)

    ## gene2go
    print("...creating gene2go dictionary -- this may take several minutes or longer depending on the number of genes")
    _gene2go, _ = fetch_taxa_annotations(self.taxaList, self.engine,
                                         aspect=self.aspect,
                                         useIea=self.useIea)

    print("...creating go2gene dictionary -- this may take several minutes")
    # build the lookup set once instead of scanning the list for every gene;
    # an empty/None 'accepted' still means "keep everything"
    acceptedGenes = set(accepted) if accepted else None

    go2gene = {}
    gene2go = {}
    for gene, terms in _gene2go.items():
        if acceptedGenes is not None and gene not in acceptedGenes:
            continue
        gene2go[gene] = terms
        for term in terms:
            go2gene.setdefault(term, set()).add(gene)

    # the pickled go2gene maps each term to a list of genes
    go2gene = {term: list(genes) for term, genes in go2gene.items()}

    ## pickle the dictionaries
    # 'with' guarantees the file is closed even if pickling raises
    with open(termsPath, 'wb') as handle:
        pickle.dump([gene2go, go2gene], handle)
def summarize(self, refTaxon, termsPath):
    """
    GO object summary and sanity check

    refTaxon  -- taxon id of the reference taxon; converted to a string,
                 required to be in self.taxaList and matched against
                 Taxon.ncbi_id
    termsPath -- path given to self.load_dicts, which returns gene2go and go2gene

    Raises an Exception when refTaxon is not in self.taxaList.
    Nothing is returned; all figures are printed.
    """

    refTaxon = str(refTaxon)
    if refTaxon not in self.taxaList:
        raise Exception("refTaxon not present in taxaList")

    gene2go, go2gene = self.load_dicts(termsPath=termsPath)

    ## map gene ncbi ids to database ids for all taxa in taxaList
    gene2id = {}
    refGenes = {}  # remains empty if no Taxon row matches refTaxon
    conn = self.engine.connect()
    try:
        s = select([Taxon.id, Taxon.ncbi_id, Taxon.name]).where(
            Taxon.ncbi_id.in_(self.taxaList))
        taxaQueries = conn.execute(s).fetchall()

        for tquery in taxaQueries:
            s = select([Gene.id, Gene.ncbi_id], Gene.taxa_id == tquery['id'])
            geneRows = conn.execute(s).fetchall()
            taxaDict = dict((str(r['ncbi_id']), str(r['id'])) for r in geneRows)
            if str(tquery['ncbi_id']) == refTaxon:
                refGenes = taxaDict.copy()
            print("there are %s genes from %s (%s)" % (len(taxaDict), tquery['name'],
                                                       tquery['ncbi_id']))
            gene2id.update(taxaDict)
    finally:
        # release the connection whether or not the queries succeeded
        conn.close()

    ## check for unmatched genes
    # plain iteration / 'in' work on Python 2 and 3 (iterkeys and has_key
    # exist on Python 2 only)
    unmatched = sum(1 for gene in gene2go if gene not in gene2id)

    print("Summary")
    print("IEA annotations: %s" % self.useIea)
    print("total genes in combined taxa: %s" % (len(gene2id)))
    if unmatched > 0:
        print("WARNING: there were unmatched genes unmatched: %s" % unmatched)
    print("total genes with at least one annotation: %s" % (len(gene2go)))
    print("total unique annotations: %s" % (len(go2gene)))
    print("---------------------")

    ## annotations of the reference taxon on its own
    _gene2go, _prot2go = fetch_taxa_annotations([refTaxon], self.engine,
                                                aspect=self.aspect,
                                                useIea=self.useIea)
    total = sum(len(terms) for terms in _gene2go.values())
    print('RefTaxa genes: %s' % (len(refGenes)))
    print('Only RefTaxa: %s annotated genes, %s total annotations' % (len(_gene2go), total))

    total = sum(len(terms) for terms in gene2go.values())
    print('With additional taxa: %s annotated genes, %s total annotations' % (len(gene2go), total))

    # printed as a fraction of the reference genes; avoid a ZeroDivisionError
    # when the reference taxon has no genes
    if refGenes:
        fraction = float(len(gene2go)) / float(len(refGenes))
    else:
        fraction = float('nan')
    print('Percent annotation: %s' % fraction)
def enrichment_hypergeo(termList, entityList, species, useIea=True, asGenes=True,
                        aspect="biological_process", verbose=True):
    """
    termList -- are the terms to be tested
    species -- an ncbi taxa id
    entityList -- gene or uniprot ids
    useIea, aspect, verbose -- forwarded to fetch_taxa_annotations
    asGenes -- when True the gene annotations are used, otherwise the
               uniprot annotations

    What is the probability of finding a given number of terms if we randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with
    phyper(x,k,M-k,N)
    hypergeom.pmf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value.
    The value is np.nan when any of x, M, N or k is zero, which includes
    terms that annotate no entity of the species.
    """

    ## connect to db and get annotations for the species
    session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species], engine, useIea=useIea,
                                                       verbose=verbose, aspect=aspect)

    # the explicit comparison is kept so that only True (or a value equal
    # to it) selects the gene annotations, exactly as before
    entity2go = geneAnnots if asGenes == True else uniprotAnnots

    ## invert the mapping: term -> set of entities annotated by it
    go2entity = {}
    for entity, terms in entity2go.items():
        for term in terms:
            go2entity.setdefault(term, set()).add(entity)

    print("total go terms - %s" % (len(go2entity)))
    print("total entities - %s" % (len(entity2go)))

    ## set variables
    M = len(entity2go)
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## find k and x
        # a term without annotations yields k == 0 (-> nan below) rather
        # than a KeyError
        annotated = go2entity.get(testTerm, ())
        k = len(annotated)
        # duplicates in entityList are counted each time they occur
        x = sum(1 for entity in entityList if entity in annotated)

        ## get a p-value
        if 0 in (x, M, N, k):
            pvalue = np.nan
        else:
            cdf = hypergeom.cdf(x, M, k, N, loc=0)
            # NOTE(review): cdf is > 0 in practically every case, so the
            # value is 2 * (1 - cdf), which exceeds 1 when cdf < 0.5 --
            # confirm whether a threshold of 0.5 was intended
            if cdf > 0:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf

        results[testTerm] = pvalue

    return results
import sys,time
from sqlalchemy.sql import select
from htsint.database import db_connect,fetch_annotations,fetch_taxa_annotations
from htsint.database import Taxon,taxa_mapper,Gene,gene_mapper

# the database session/engine and a connection are opened as soon as the
# script is run
session,engine = db_connect()
conn = engine.connect()

# disabled timing run for fetch_annotations
#timeStart = time.time()
#annotations = fetch_annotations(['31251'],engine,idType='ncbi',useIea=False,aspect='biological_process')
#print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
#print annotations

# time a fetch_taxa_annotations call and print the elapsed time as HH:MM:SS
##7091(small), 7227(large)
timeStart = time.time()
annotations,goTerms = fetch_taxa_annotations(['7227'],engine,idType='ncbi',useIea=False,aspect='biological_process')
print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
#print annotations

# the script stops here; nothing below runs while this call is in place
sys.exit()

###########
# table scan set-up (currently unreachable because of the sys.exit() above)
widget = Gene#Taxon
print("scanning %s"%widget.__tablename__)
timeStart = time.time()
myDict = {}
def enrichment_hypergeo(termList, entityList, species, useIea=True, asGenes=True,
                        aspect='biological_process', verbose=True):
    '''
    termList -- are the terms to be tested
    species -- an ncbi taxa id
    entityList -- gene or uniprot ids
    useIea, aspect, verbose -- forwarded to fetch_taxa_annotations
    asGenes -- when True the gene annotations are used, otherwise the
               uniprot annotations

    What is the probability of finding a given number of terms if we randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with
    phyper(x,k,M-k,N)
    hypergeom.pmf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value.
    The value is np.nan when any of x, M, N or k is zero, which includes
    terms that annotate no entity of the species.
    '''

    ## connect to db and get annotations for the species
    session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species], engine, useIea=useIea,
                                                       verbose=verbose, aspect=aspect)

    # keep the explicit comparison: only True (or a value equal to it)
    # selects the gene annotations, as in the previous implementation
    entity2go = geneAnnots if asGenes == True else uniprotAnnots

    ## invert the mapping: term -> set of entities annotated by it
    go2entity = {}
    for entity, terms in entity2go.items():
        for term in terms:
            go2entity.setdefault(term, set()).add(entity)

    print('total go terms - %s' % (len(go2entity)))
    print('total entities - %s' % (len(entity2go)))

    ## set variables
    M = len(entity2go)
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## find k and x
        # unknown terms give k == 0 and therefore nan, instead of raising
        # a KeyError before the zero check is reached
        annotated = go2entity.get(testTerm, ())
        k = len(annotated)
        # every occurrence in entityList counts, duplicates included
        x = sum(1 for entity in entityList if entity in annotated)

        ## get a p-value
        if 0 in (x, M, N, k):
            pvalue = np.nan
        else:
            cdf = hypergeom.cdf(x, M, k, N, loc=0)
            # NOTE(review): with cdf > 0 almost always true the result is
            # 2 * (1 - cdf), which is larger than 1 for cdf < 0.5 --
            # confirm whether the comparison should be against 0.5
            if cdf > 0:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf

        results[testTerm] = pvalue

    return results
from sqlalchemy.sql import select
from htsint.database import db_connect, fetch_annotations, fetch_taxa_annotations
from htsint.database import Taxon, taxa_mapper, Gene, gene_mapper

# NOTE(review): 'time' and 'sys' are used below but are not imported in the
# lines visible here -- confirm they are imported earlier in the file

# the database session/engine and a connection are opened as soon as the
# script is run
session, engine = db_connect()
conn = engine.connect()

# disabled timing run for fetch_annotations
#timeStart = time.time()
#annotations = fetch_annotations(['31251'],engine,idType='ncbi',useIea=False,aspect='biological_process')
#print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
#print annotations

# time a fetch_taxa_annotations call and print the elapsed time as HH:MM:SS
##7091(small), 7227(large)
timeStart = time.time()
annotations, goTerms = fetch_taxa_annotations(['7227'], engine, idType='ncbi', useIea=False, aspect='biological_process')
print("end: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))
#print annotations

# the script stops here; nothing below runs while this call is in place
sys.exit()

###########
# table scan set-up (currently unreachable because of the sys.exit() above)
widget = Gene  #Taxon
print("scanning %s" % widget.__tablename__)
timeStart = time.time()
myDict = {}
# selects the id and ncbi_id columns of the chosen table
s = select([widget.id, widget.ncbi_id])