def shuffleAnnotation(self, allFA, t=10):
    """
    Randomly re-shuffle GO annotations across the annotation sets of each FA.

    For every aspect except 'All_aspects_of_GO', all annotations are pooled,
    shuffled and dealt back so that each gene product is handed as many terms
    as it had before. The coherence of the annotation sets should change; a
    set may shrink when identical terms are dealt to the same gene product.
    The other statistics should remain unchanged or be marginally affected.

    allFA -- iterable of functional annotations; FA.GPtoGO[aspect] and
             FA.GOtoGP[aspect] are replaced for each processed aspect.
    t     -- unused; kept so that existing callers keep working.
    """
    for FA in allFA:
        logger.info("\t%s" % (FA.name))

        for aspect in allAspect:
            if aspect == 'All_aspects_of_GO':
                continue

            current = FA.GPtoGO[aspect]

            # Remember how many terms each gene product owns before pooling.
            owners = [(gp, len(current[gp])) for gp in current.keys()]

            pool = list(flatten(current.values()))
            shuffle(pool)

            # Deal consecutive slices of the shuffled pool back to the owners.
            # (Replaces a groupby() whose key function advanced a side iterator.)
            GPtoGO, GOtoGP = dict(), dict()
            start = 0
            for gp, size in owners:
                if size == 0:
                    # Gene products without annotation are dropped, as before.
                    continue
                GPtoGO[gp] = set(pool[start:start + size])
                start += size
                for go in GPtoGO[gp]:
                    GOtoGP.setdefault(go, set()).add(gp)

            FA.GPtoGO[aspect] = GPtoGO
            FA.GOtoGP[aspect] = GOtoGP
def sampleAnnotation(self, allFA):
    """
    Randomly sample GO annotations, keeping the number of draws per
    annotation set unchanged.

    For every aspect except 'All_aspects_of_GO', each gene product receives
    as many terms as it had, drawn with replacement from the nodes returned
    by FA.G.get_NodesfromAspect(aspect). A set may end up smaller when the
    same term is drawn twice for one gene product.

    allFA -- iterable of functional annotations; FA.GPtoGO[aspect] and
             FA.GOtoGP[aspect] are replaced for each processed aspect.
    """
    for FA in allFA:
        logger.info("\t%s" % (FA.name))

        for aspect in allAspect:
            if aspect == 'All_aspects_of_GO':
                continue

            current = FA.GPtoGO[aspect]

            # Number of terms to draw for each gene product, and in total.
            owners = [(gp, len(current[gp])) for gp in current.keys()]
            nbValues = 0
            for gp, size in owners:
                nbValues += size

            allNodes = FA.G.get_NodesfromAspect(aspect)
            sampled = take(allNodes, randint(0, len(allNodes), nbValues))

            # Hand consecutive slices of the sample to the gene products.
            # (Replaces a groupby() whose key function advanced a side iterator.)
            GPtoGO, GOtoGP = dict(), dict()
            start = 0
            for gp, size in owners:
                if size == 0:
                    # Gene products without annotation are dropped, as before.
                    continue
                GPtoGO[gp] = set(sampled[start:start + size])
                start += size
                for go in GPtoGO[gp]:
                    GOtoGP.setdefault(go, set()).add(gp)

            FA.GPtoGO[aspect] = GPtoGO
            FA.GOtoGP[aspect] = GOtoGP
def funcSim(self, allFA):
    """
    Compute the semantic similarity between commonly annotated gene products
    for all possible pairs of FA.

    The result is stored in self['funcSim'] as
    funcSim[aspect][(name1, name2)][gene] = (score1, score2), where the two
    scores are the first two items of the list returned by
    GOSet_PWSimilarity. For "All_aspects_of_GO" each score is the mean of the
    per-aspect scores available for that gene.
    """
    allName = "All_aspects_of_GO"
    singleAspects = [a for a in allAspect if not a == allName]

    # Order the two FAs of every pair by name, once for all aspects.
    pairs = [tuple(sorted(twoFAs, key=lambda FA: FA.name))
             for twoFAs in combinations(allFA, 2)]

    funcSim = dict()
    for aspect in singleAspects:
        funcSim[aspect] = dict()

        for FA1, FA2 in pairs:
            logger.info("\tbetween %s and %s for %s" % (FA1.name, FA2.name, aspect))

            D = dict()
            for g in self.getCommonGene(FA1, FA2, aspect):
                # Only the per-direction scores are kept; the first result is ignored.
                _, l = GOSet_PWSimilarity(FA1.G, FA1.GPtoGO[aspect][g], FA2.GPtoGO[aspect][g])
                D[g] = (l[0], l[1])

            funcSim[aspect][(FA1.name, FA2.name)] = D

    funcSim[allName] = dict()
    for FA1, FA2 in pairs:
        logger.info("\tbetween %s and %s for %s" % (FA1.name, FA2.name, allName))
        key = (FA1.name, FA2.name)

        D = dict()
        for g in self.getCommonGene(FA1, FA2, allName):
            # Collect the scores of every aspect in which this gene was compared.
            scores = [funcSim[a][key][g] for a in singleAspects if g in funcSim[a][key]]
            D[g] = (mean([s[0] for s in scores]), mean([s[1] for s in scores]))

        funcSim[allName][key] = D

    self['funcSim'] = funcSim
def add(self, statistics, plotType):
    """
    Register a plot function of the given type for a statistics.

    Returns False (after a warning) when the statistics is not registered in
    rS. Otherwise the statistics is recorded, the plot function is attached
    to self.cls when the plot type is "Multiple" or "Histo3D" (a warning is
    logged for any other type), the plot type is recorded, and True is
    returned.
    """
    logger.info("Registering plot function %s" % statistics)

    if not rS.isRegistered(statistics):
        logger.handleWarning("Caution, the statistics is unknown : %s" % statistics)
        return False

    self.all.append(statistics)

    # Pick the factory for the plot function and the pattern of the attribute name.
    if plotType == "Multiple":
        factory, attrPattern = self.getMultiPlotFunction, "%s"
    elif plotType == "Histo3D":
        factory, attrPattern = self.getHisto3DPlotFunction, "%sHisto3D"
    else:
        factory, attrPattern = None, None
        logger.handleWarning("Caution, the statistics plot type is unknown : %s" % plotType)

    if factory is not None:
        plotMethod = new.instancemethod(factory(statistics), None, self.cls)
        setattr(self.cls, attrPattern % plotMethod.__name__, plotMethod)

    # The plot type is recorded even when it is unknown.
    self.types.setdefault(statistics, set()).add(plotType)
    self.allTypes.add(plotType)

    return True
def Multiple(self, allFA, statistics=None, figName=None, lloc="upper right", doGrid=False):
    """
    Draw one group of bars per aspect, one bar per FA, for a statistics.

    A list-valued statistic is reduced to its mean before plotting; any other
    value is plotted as is. The figure is saved to figName, which defaults to
    "<outDir>/Multi_<statistics>_<name>.<ext>".

    Returns the name of the saved figure.
    """
    lLabel = [aspect.replace("_", " ") for aspect in allAspect]

    logger.info("\t%s" % statistics)

    lBar = list()
    for aspect in allAspect:
        data = []
        for FA in allFA:
            value = FA[statistics][aspect]
            # isinstance also accepts subclasses of list
            if isinstance(value, list):
                data.append(mean(value))
            else:
                data.append(value)
        lBar.append(data)

    multiBar(lBar, lLabel, [FA.name for FA in allFA], self.xlabel,
             "%s %s" % (rS.getName(statistics), rS.getUnit(statistics)),
             lloc=lloc, grid=doGrid)

    if figName is None:
        figName = "%s/Multi_%s_%s.%s" % (self.outDir, statistics, self.name, self.ext)

    savefig(figName)

    return figName
def add(self, statistics, name, unit="", types=None):
    """
    Register a statistics function.

    The statistics is appended to self.all and its display name, unit and
    types are stored in self.name, self.unit and self.types.
    """
    logger.info("Registering statistics function %s" % statistics)

    self.all.append(statistics)
    self.name[statistics], self.unit[statistics], self.types[statistics] = name, unit, types
def checkValidity(self):
    """
    Check that GPtoGO and GOtoGP mirror each other for every aspect of GPtoGO.

    Every (gene product, GO term) pair found in one mapping must be present
    in the other. Each missing pair is reported with a warning, including
    the case where the whole entry is absent from the other mapping.

    Returns True when all aspects are consistent, False otherwise.
    """
    logger.info("Name :\t%s" % self.name)

    allValid = True
    for aspect in self.GPtoGO:
        valid = True
        forward = self.GPtoGO[aspect]
        # An aspect or key absent from the other mapping is an inconsistency
        # to report, not a reason to fail with a KeyError.
        backward = self.GOtoGP.get(aspect, {})

        for gp in forward:
            for go in forward[gp]:
                if gp not in backward.get(go, ()):
                    logger.handleWarning("%s not found in GOtoGP[%s][%s]" % (gp, aspect, go))
                    valid = False

        for go in backward:
            for gp in backward[go]:
                if go not in forward.get(gp, ()):
                    logger.handleWarning("%s not found in GPtoGO[%s][%s]" % (go, aspect, gp))
                    valid = False

        if valid:
            logger.info("%s : is valid" % (aspect))

        allValid = allValid and valid

    return allValid
def save(self, fileName):
    """
    Save every item of this object into a shelve file.

    The file name itself is stored under the key 'fileName' before the items
    are copied, so an item of the same name takes precedence. On success
    self.status is set to "Saved"; any error is reported as a warning.
    """
    import shelve

    try:
        logger.info("File :\t%s" % fileName)

        shelf = shelve.open(fileName, protocol=-1)
        try:
            shelf['fileName'] = fileName
            for k, v in self.items():
                shelf[k] = v
        finally:
            # Close the shelf even when an item cannot be stored
            shelf.close()

        self.status = "Saved"
    except Exception as e:
        logger.handleWarning("Unable to save project %s: %s" % (fileName, str(e)))
def load(self, fileName):
    """
    Load every item of a shelve file into this object.

    On success self.status is set to "Loaded"; any error is reported as a
    warning.
    """
    import shelve

    try:
        logger.info("File :\t%s" % fileName)

        # NOTE(security): shelve relies on pickle; only open files from a trusted source.
        shelf = shelve.open(fileName, protocol=-1)
        try:
            for k, v in shelf.items():
                self[k] = v
        finally:
            # Close the shelf even when an item cannot be read
            shelf.close()

        self.status = "Loaded"
    except Exception as e:
        logger.handleWarning("Unable to load project %s: %s" % (fileName, str(e)))
def read(self, fileName="", fileType=""):
    """
    Read functional annotations from a file.

    Available file formats are GO annotation file (GAF), Blast2GO (B2G),
    Affymetrix (AFFY), ArrayIDer (AID), SCOP, GNISD, or GP2GO / GO2GP for a
    simple GPid to GOids mapping.

    fileName -- file to read; when empty the file name already stored on the
                object is used.
    fileType -- format of the file; when empty the stored file type is used.

    Raises ValueError when the file type is unknown. Any error raised while
    extracting the annotations is passed to logger.handleFatal.
    """
    logger.info("Name :\t%s" % self.name)

    if fileName != "":
        self.__dict__['fileName'] = fileName
    if fileType != "":
        self.__dict__['fileType'] = fileType

    fileType = IO.IOType.get(self.fileType, self.fileType)
    if fileType not in IO.IOType.values():
        print("Sorry, unknown file type : %s" % fileType)
        raise ValueError("unknown file type : %s" % fileType)

    # Report the file actually read, also when the stored name is used.
    shownName = fileName
    if shownName == "":
        shownName = self.__dict__.get('fileName', fileName)

    logger.info("%s file : \t%s " % (fileType, shownName))

    try:
        extract, extraArgs = None, dict()

        if fileType == "GAF":
            extract = IO.extract_GAF
        elif fileType == "B2G":
            extract = IO.extract_GP2GO
        elif fileType == "AFFY":
            extract = IO.extract_Affy
        elif fileType == "AID":
            extract = IO.extract_AID
        elif fileType == "SCOP":
            extract = IO.extract_SCOP
        elif fileType == "GNISD":
            #gene networks in seed development format http://seedgenenetwork.net/annotate#arabidopsis
            extract = IO.extract_Affy
            extraArgs = dict(GO_columns=[6, 7, 8], filetype="GNIS-Affy",
                             delimiter=' ', quoting=csv.QUOTE_MINIMAL)
        elif fileType == "GP2GO":
            extract = IO.extract_GP2GO
        elif fileType == "GO2GP":
            extract = IO.extract_GO2GP

        # A known type without extractor leaves the annotations untouched.
        if extract is not None:
            self.GPtoGO, self.GOtoGP = extract(self.fileName, self.G,
                                               refSet=self.refSet, **extraArgs)
    except Exception as e:
        logger.handleFatal("Unable to read file %s: %s" % (shownName, str(e)))
def add(self, FA):
    """
    Merge the annotations of another functional annotation into this one.

    For every aspect of FA, the GO terms of each gene product (and the gene
    products of each GO term) are added to the matching sets of this object,
    creating missing entries. The set of annotated gene products self['GA']
    is then rebuilt, the status is set to "Loaded" and the mean size of the
    annotation sets is logged for every aspect of self.G.
    """
    logger.info("Name :\t%s" % self.name)

    for aspect in FA.GPtoGO:
        merged = self.GPtoGO.setdefault(aspect, dict())
        for gp in FA.GPtoGO[aspect]:
            merged.setdefault(gp, set()).update(FA.GPtoGO[aspect][gp])

    for aspect in FA.GOtoGP:
        merged = self.GOtoGP.setdefault(aspect, dict())
        for go in FA.GOtoGP[aspect]:
            merged.setdefault(go, set()).update(FA.GOtoGP[aspect][go])

    # Find the set of annotated gene products
    annotated = set()
    for a in self.G.aspect:
        annotated.update(self.GPtoGO[a].keys())
    self['GA'] = annotated

    logger.info("%d gene products are annotated" % (len(self['GA'])))
    self.status = "Loaded"

    for a in self.G.aspect:
        logger.info("%s : %.2f annotations per set" % (a, mean([len(self.GPtoGO[a][gp]) for gp in self.GPtoGO[a]])))
def inter(self, FA):
    """
    Restrict this functional annotation to what it shares with another one.

    For every aspect, entries missing from FA are removed and the remaining
    sets are intersected with those of FA; an aspect unknown to FA is
    emptied. Emptied sets are kept. The set of annotated gene products
    self['GA'] is then rebuilt, the status is set to "Loaded" and the mean
    size of the annotation sets is logged for every aspect of self.G.
    """
    logger.info("Name :\t%s" % self.name)

    for aspect in self.GPtoGO:
        if aspect not in FA.GPtoGO:
            self.GPtoGO[aspect] = dict()
            continue

        mine, other = self.GPtoGO[aspect], FA.GPtoGO[aspect]
        # Iterate over a copy of the keys: entries are deleted on the way.
        for gp in list(mine.keys()):
            if gp in other:
                mine[gp].intersection_update(other[gp])
            else:
                del mine[gp]

    for aspect in self.GOtoGP:
        if aspect not in FA.GOtoGP:
            self.GOtoGP[aspect] = dict()
            continue

        mine, other = self.GOtoGP[aspect], FA.GOtoGP[aspect]
        for go in list(mine.keys()):
            if go in other:
                mine[go].intersection_update(other[go])
            else:
                del mine[go]

    # Find the set of annotated gene products
    annotated = set()
    for a in self.G.aspect:
        annotated.update(self.GPtoGO[a].keys())
    self['GA'] = annotated

    logger.info("%d gene products are annotated" % (len(self['GA'])))
    self.status = "Loaded"

    for a in self.G.aspect:
        logger.info("%s : %.2f annotations per set" % (a, mean([len(self.GPtoGO[a][gp]) for gp in self.GPtoGO[a]])))
def removeGP(self, GP, myAspects=None):
    """
    Remove gene products from this functional annotation.

    GP        -- gene products to remove.
    myAspects -- aspects to remove them from; all aspects of GPtoGO when None.

    GOtoGP is rebuilt from GPtoGO for every aspect of self.G, the set of
    annotated gene products self['GA'] is rebuilt, the status is set to
    "Loaded" and the mean size of the annotation sets is logged.
    """
    logger.info("Name :\t%s" % self.name)

    if myAspects is None:
        myAspects = self.GPtoGO

    for aspect in myAspects:
        annotations = self.GPtoGO[aspect]
        for g in GP:
            # Gene products that are not annotated are silently ignored
            annotations.pop(g, None)

    # Rebuild the reverse mapping from what is left
    self.GOtoGP = dict()
    for aspect in self.G.aspect:
        self.GOtoGP[aspect] = dict()
        for gp in self.GPtoGO[aspect]:
            for go in self.GPtoGO[aspect][gp]:
                self.GOtoGP[aspect].setdefault(go, set()).add(gp)

    # Find the set of annotated gene products
    annotated = set()
    for a in self.G.aspect:
        annotated.update(self.GPtoGO[a].keys())
    self['GA'] = annotated

    logger.info("%d gene products are annotated" % (len(self['GA'])))
    # Was misspelled "Loaed", unlike the other methods setting this status.
    self.status = "Loaded"

    for a in self.G.aspect:
        logger.info("%s : %.2f annotations per set" % (a, mean([len(self.GPtoGO[a][gp]) for gp in self.GPtoGO[a]])))
def loadFA(G, norganism, dbcur, drepli, drepli_lab, taxid, aspects=aspects, metrics=metrics, analysisList=analysisList):
    """
    Load the functional annotation of an organism from a database and analyse it.

    The reference set is the union of all the labels listed in drepli_lab.
    The annotation is read through dbcur for the replicons that are the keys
    of drepli, the largest annotation sets are computed and logged, and every
    analysis of analysisList is executed.

    taxid, aspects and metrics are not used here.

    Returns the functional annotation.
    """
    # Gather the labels of all replicons into one reference set
    allLabels = set()
    for labels in drepli_lab.values():
        allLabels.update(labels)

    reference = RefSet(organism=norganism, inSet=allLabels, refType="DB")

    FA = FuncAnnot(norganism, reference, G, organism=norganism)
    FA.read_from_db(dbcur, replicons=drepli.keys())

    analyser = AnalyseFA()
    analyser.largestSet([FA])

    logger.info("Largest sets of annotations:")
    logger.info("\t%d for %s" % (FA['largestSet']['All_aspects_of_GO'], FA.name))

    batchExecute(analysisList, analyser, [FA])

    # (A disabled variant rebuilt drepli_lab from a gene label file and
    # returned it along with FA.)
    return FA
def venn_NS(v, tit=None):
    """
    Log the values of a Venn diagram, shortest keys first.

    v   -- dictionary mapping a key (names joined with '@') to a value.
    tit -- title logged before the values.

    The last line logged is 100 minus the sum of all values.
    """
    logger.info(tit)

    # sorted() is stable, so keys of equal length keep the order of the
    # dictionary, and the keys are never converted to a numpy array.
    for key in sorted(v.keys(), key=len):
        logger.info("%s \t: %.2f" % (key.replace('@', '\t ^ '), v[key]))

    logger.info("TOTAL\t: %.2f" % (100.0 - sum(v.values())))
def recall(self, allFA):
    """
    Verspoor et al. (2006) hierarchical recall.

    The first FA of allFA is used as a gold standard. For every other FA,
    every aspect except "All_aspects_of_GO" and every gene product annotated
    in both, each gold standard term is scored by the largest fraction of its
    ancestors shared with the ancestors of one term of the FA; the recall of
    the gene product is the mean of these scores.

    The result is stored in self['recall'] as
    recall[aspect][(FA.name, GS.name)][gene].
    """
    #The first FA is used as a Gold Standard
    GS = allFA[0]

    recall = dict()
    for aspect in allAspect:
        if aspect == "All_aspects_of_GO":
            continue

        recall[aspect] = dict()

        for FA in allFA[1:]:
            perGene = recall[aspect][(FA.name, GS.name)] = dict()

            logger.info("\t%s vs %s for %s" % (FA.name, GS.name, aspect))

            for g in self.getCommonGene(FA, GS, aspect):
                # Ancestors of the evaluated terms do not depend on the gold
                # standard term, so they are computed once per gene product.
                allAncestors = [set(FA.G.get_Ancestors(go)) for go in FA.GPtoGO[aspect][g]]

                r = list()
                for gs in GS.GPtoGO[aspect][g]:
                    aGS = set(FA.G.get_Ancestors(gs))
                    # NOTE(review): raises ZeroDivisionError if a term has no
                    # ancestor -- confirm get_Ancestors never returns nothing.
                    naGS = len(aGS)

                    shared = [(1.0 * len(aGS.intersection(aGO))) / naGS for aGO in allAncestors]

                    # Nothing evaluated means nothing recalled
                    if shared:
                        r.append(max(shared))
                    else:
                        r.append(0.0)

                perGene[g] = mean(r)

    self['recall'] = recall
def readGOoboXML(fileName, force=False, prefix="GO"):
    """
    Read a GO graph from an OBO XML file, using a serialized copy when possible.

    The serialized copy "<fileName>.pic" is loaded unless force is True, the
    copy does not exist, or it cannot be read. Otherwise the OBO file is
    parsed and the serialized copy is written again.

    fileName -- OBO XML file (resolved through checkForZip before reading).
    force    -- parse the OBO file even when a serialized copy exists.
    prefix   -- passed to get_GOGraph.

    Returns the GO graph, or None when reading failed and logger.handleFatal
    returned. Any error raised while parsing or saving is passed to
    logger.handleFatal.
    """
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    picName = "%s.pic" % fileName
    G = None

    if not os.path.exists(picName):
        force = True

    if not force:
        try:
            logger.info("Reading serialized OBO file : %s" % picName)
            with open(picName, "rb") as f:
                # NOTE(security): unpickling runs code from the file; the
                # serialized copy must come from a trusted location.
                G = pickle.load(f)
        except (IOError, EOFError) as inst:
            # Unreadable or empty copy: fall back to the OBO file
            print(str(type(inst)) + " for " + picName)
            force = True

    try:
        if force:
            fileName = checkForZip(fileName)
            if not os.path.exists(fileName):
                raise IOError(fileName + " does not exist and is required ")

            logger.info("Reading OBO file : %s" % fileName)
            G = get_GOGraph(readFile(fileName, mode="r"), prefix=prefix)
            G.fileName = fileName

            with open(picName, "wb") as f:
                logger.info("Saving serialized OBO file")
                pickle.dump(G, f, -1)
    except Exception as e:
        logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))

    # Callers assign the result, so the graph has to be handed back.
    return G
# Example session: analyse one functional annotation of the platypus and
# print its statistics.
# NOTE(review): `logger` is used below but is not imported in this snippet;
# presumably it is provided by the surrounding module -- confirm.
from AIGO.ReferenceSet import RefSet
from AIGO.FunctionalAnnotation import FuncAnnot
from AIGO.go.OBO import readGOoboXML
from AIGO.Analyse import AnalyseFA
from AIGO.Report import ReportFA
from AIGO.utils.Execute import batchExecute

# Reference set of gene products, read from a text file
refSet = RefSet(organism="platypus", fileName="platypus.refSet", refType="Text")

# Gene Ontology graph
G = readGOoboXML("go_daily-termdb.obo-xml")

# Functional annotation, read from a GO annotation file
FA = FuncAnnot("platypusProject", refSet, G, organism="platypus")
FA.read("platypus.gaf", "GAF")

# The largest annotation sets are computed first and logged
analyseFA = AnalyseFA()
analyseFA.largestSet([FA])
logger.info("Largest sets of annotations:")
logger.info("\t%d for %s" % (FA['largestSet']['All_aspects_of_GO'], FA.name))

# Statistics to compute, in this order
batchList = [
    "coverage", "richness", "numberAnnot", "redundancy", "specificity",
    "informationContent", "hPrecision"
]
batchExecute(batchList, analyseFA, [FA])

# Print the statistics; no output directory is given
reportFA = ReportFA(outDir=None, name="platypusProject", organism="platypus")
reportFA.printStatistics([FA], batchList)
def add(self, fileName, refType="Fasta"):
    """
    Add the gene products listed in a file to this reference set.

    fileName -- file to read (resolved through checkForZip before reading).
    refType  -- format of the file:
                "Fasta" : identifiers are taken from the record names,
                "Text"  : first column of a ';' separated file,
                "GAF"   : "<taxon>.<DB Object Symbol>" of every row,
                "AFFY"  : first column of an Affymetrix annotation file,
                          control sequences excluded.

    The file name and type are recorded in self.fileName and self.refType,
    which become lists once more than one file has been added. A missing
    file, an unknown type or any read error is passed to logger.handleFatal;
    a warning is logged when the set is still empty after reading.
    """
    # Keep track of every file used to build the set
    if self.fileName == '':
        self.fileName = fileName
        self.refType = refType
    elif isinstance(self.fileName, list):
        self.fileName.append(fileName)
        self.refType.append(refType)
    else:
        self.fileName = [self.fileName, fileName]
        self.refType = [self.refType, refType]

    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        logger.handleFatal(fileName + " does not exist and is required ")

    logger.info("Organism :\t%s" % self.organism)
    logger.info("%s file :\t%s " % (refType, fileName))

    try:
        #Use fasta file to define the reference set
        if refType == "Fasta":
            from Bio import SeqIO
            allID = set([rec.name.split(";")[0].split(":")[-1]
                         for rec in SeqIO.parse(readFile(fileName), "fasta")])
            self.update(allID)

        #Use a simple text file to define the reference set, first column is chosen by default
        elif refType == "Text":
            allID = set([r[0] for r in csv.reader(readFile(fileName), delimiter=";")])
            self.update(allID)

        #Use a GO annotation file to define the reference set
        elif refType == "GAF":
            from AIGO.IO import readGAF_2
            data, GAF_col = readGAF_2(fileName)

            # Locate the two columns once instead of once per row
            taxonCol = GAF_col.index("Taxon(|taxon)")
            symbolCol = GAF_col.index("DB Object Symbol")

            # The first six characters of the taxon column are dropped
            allID = set([".".join([row[taxonCol][6:], row[symbolCol]]) for row in data])
            self.update(allID)

        #Use a Affymetrix annotation file to define the reference set
        elif refType == "AFFY":
            f = readFile(fileName)

            # Skip the leading '#' lines; the first other line is the column
            # header and is skipped too. startswith() also copes with a file
            # that ends before any header.
            row = f.readline()
            while row.startswith('#'):
                row = f.readline()

            allID = set()
            for row in csv.reader(f):
                #Read gene product id if not control sequence
                if "Control sequence".upper() != row[4].upper():
                    allID.add(row[0])
            self.update(allID)

        else:
            print("Sorry, unknown file type !!")
            raise ValueError("unknown reference type : %s" % refType)

        if len(self) == 0:
            logger.handleWarning("No gene products loaded")

    except Exception as e:
        logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))
def compareEvidence(projectDir):
    """
    Compare electronically inferred and manually curated annotations to
    experimental annotations.

    projectDir -- root directory; annotations are read from
                  <projectDir>/EvidenceCode/<organism>/, the ontology from
                  <projectDir>/OBO/, and results are written below
                  <projectDir>/Graph/allSpecies and
                  <projectDir>/Export/allSpecies.

    Three merged functional annotations are built and compared: "EXP2"
    (experimental), "AHC" (merge of IC, ISS and NAS) and "IEA". Statistics
    are analysed, plotted and exported, and the precision and recall of
    every evidence code against EXP2 are plotted in PrecisionVSRecall.png.
    """
    projectName = "EvidenceCode"
    organism = "allSpecies"

    logger.info("◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦")
    logger.info("This function compare electronically infered and manually curated annotations to experimental annotations.")
    logger.info("◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦")
    logger.info("name of the project : %s " % projectName)

    refSet = RefSet(organism=organism)
    allOrg = [
        "Arabidopsis_thaliana", "Drosophila_melanogaster",
        "Mycobacterium_tuberculosis_ATCC_25618", "Schizosaccharomyces_pombe",
        "Bos_taurus", "Escherichia_coli_ATCC_27325",
        "Mycobacterium_tuberculosis_Oshkosh", "Caenorhabditis_elegans",
        "Escherichia_coli_MG1655", "Oryza_sativa", "Synechocystis_sp",
        "Candida_albicans_SC5314", "Gallus_gallus",
        "Pseudomonas_fluorescens_Pf-5", "Danio_rerio", "Homo_sapiens",
        "Rattus_norvegicus"
    ]

    for refOrg in allOrg:
        # Define the set of gene products: every organism contributes the
        # gene products of its experimental annotation file
        fileName = "%s/EvidenceCode/%s/two_experimental_evidence.goa" % (
            projectDir, refOrg)
        refSet.add(fileName=fileName, refType="GAF")

    # Read the GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    # Read all annotations
    fileType = "GAF"
    # NOTE(review): evidenceCodes is never read in this function
    evidenceCodes = ["EXP2", "IC", "TAS", "ISS", "NAS", "IEA"]
    allFA = dict()

    #-----------------------------------------------
    # Read Functional annotations obtained by experiments:
    # one FA per organism, merged into a single "EXP2" annotation
    pipeName = "EXP2"
    EXP2 = FuncAnnot(pipeName, refSet, G, organism=organism)
    for refOrg in allOrg:
        fileName = "%s/EvidenceCode/%s/two_experimental_evidence.goa" % (
            projectDir, refOrg)
        FA = FuncAnnot(pipeName, refSet, G, organism=refOrg)
        FA.read(fileName, fileType=fileType)
        EXP2.add(FA)
    allFA[pipeName] = EXP2

    #-----------------------------------------------
    # Read Functional annotations obtained by human curation:
    # one merged annotation per evidence code, organisms without a file
    # for that code are skipped
    for pipeName in ["IC", "TAS", "ISS", "NAS"]:
        EV = FuncAnnot(pipeName, refSet, G, organism=organism)
        for refOrg in allOrg:
            fileName = "%s/EvidenceCode/%s/%s.goa" % (projectDir, refOrg,
                                                      pipeName)
            if not os.path.exists(fileName):
                continue
            FA = FuncAnnot(pipeName, refSet, G, organism=refOrg)
            FA.read(fileName, fileType=fileType)
            EV.add(FA)
        allFA[pipeName] = EV

    # Merge FAs Assigned by Human Curator.
    # TAS is read above but left out of the merge (see the disabled line).
    FA = FuncAnnot("AHC", refSet, G, organism=organism)
    #for evidence in ["IC", "TAS", "ISS", "NAS"]:
    for evidence in ["IC", "ISS", "NAS"]:
        FA.add(allFA[evidence])
    allFA["AHC"] = FA

    #-----------------------------------------------
    # Read Functional annotations obtained without human curation
    for pipeName in ["IEA"]:
        EV = FuncAnnot(pipeName, refSet, G, organism=organism)
        for refOrg in allOrg:
            fileName = "%s/EvidenceCode/%s/%s.goa" % (projectDir, refOrg,
                                                      pipeName)
            if not os.path.exists(fileName):
                continue
            FA = FuncAnnot(pipeName, refSet, G, organism=refOrg)
            FA.read(fileName, fileType=fileType)
            EV.add(FA)
        allFA[pipeName] = EV

    #-----------------------------------------------
    # The three annotations compared in the analyses below
    listFA = ["EXP2", "AHC", "IEA"]

    # Analyse Functional annotations
    analyseFA = AnalyseFA()
    batchList = [
        "obsolete", "unconnected", "removeUnconnected", "coverage", "richness",
        "numberAnnot", "coherence", "redundancy", "removeRedundancy",
        "compactness", "specificity", "informationContent"
    ]
    batchExecute(batchList, analyseFA,
                 [allFA[evidence] for evidence in listFA])

    # Plot statistics of Functional annotations
    outDir = "%s/Graph/%s" % (projectDir, organism)
    createDir(outDir)
    plotFA = PlotFA(xlabel="Evidence Codes", outDir=outDir, name=projectName,
                    organism=organism)
    batchExecute(batchList, plotFA,
                 [allFA[evidence] for evidence in listFA], doGrid=True)

    batchList = ["coherenceHisto2D", "numberAnnotHisto2D"]
    batchExecute(batchList, plotFA,
                 [allFA[evidence] for evidence in listFA], doGrid=True)

    # Compare Functional annotations
    compareFA = CompareFA()
    batchList = ["venn", "funcSim"]
    batchExecute(batchList, compareFA,
                 [allFA[evidence] for evidence in listFA])

    batchList = ["recall", "precision"]
    batchExecute(batchList, compareFA,
                 [allFA[evidence] for evidence in listFA])

    # Plot statistics of the comparison between Functional annotations.
    # NOTE(review): "funcSymSim" is plotted whereas "funcSim" was computed
    # above -- confirm that this plot name is the intended one.
    batchList = ["venn", "funcSymSim"]
    batchExecute(batchList, plotFA, compareFA,
                 [allFA[evidence] for evidence in listFA])

    batchList = ["recall", "precision"]
    batchExecute(batchList, plotFA, compareFA,
                 [allFA[evidence] for evidence in listFA])

    #-----------------------------------------------
    # Export statistics to Excel
    outDir = "%s/Export/%s" % (projectDir, organism)
    createDir(outDir)
    exportList = [
        "unconnected", "coverage", "richness", "numberAnnot", "coherence",
        "compactness", "specificity", "informationContent", "redundancy"
    ]
    reportFA = ReportFA(outDir=outDir, name=projectName, organism=organism)
    reportFA.printStatistics([allFA[evidence] for evidence in listFA],
                             exportList)
    reportFA.saveStatistics([allFA[evidence] for evidence in listFA],
                            exportList)

    #-----------------------------------------------
    # Individual contributions of evidence codes:
    # the (gene product, GO term) pairs supported by each code
    logger.info("=================================================")
    logger.info("Invididual contributions of evidence codes")
    contribution = dict()
    for ec in ["IC", "TAS", "ISS", "NAS", "IEA"]:
        contribution[ec] = set([
            (gp, go)
            for aspect in ["cellular_component", "molecular_function",
                           "biological_process"]
            for gp in allFA[ec].GPtoGO[aspect]
            for go in allFA[ec].GPtoGO[aspect][gp]
        ])

    # The total only counts the four curated codes; the IEA pairs collected
    # above are not used.
    # NOTE(review): the division below fails if no curated annotation was read.
    total_Annotation = sum(
        [len(contribution[ec]) for ec in ["IC", "TAS", "ISS", "NAS"]])
    for ec in ["IC", "TAS", "ISS", "NAS"]:
        logger.info("\t%.02f %% of the annotations are supported by %s" %
                    (100. * len(contribution[ec]) / total_Annotation, ec))

    # Recall and precision are computed again, this time for every evidence
    # code; EXP2 comes first in the list
    batchList = ["recall", "precision"]
    batchExecute(batchList, compareFA, [
        allFA[evidence]
        for evidence in ["EXP2", "ISS", "TAS", "NAS", "IC", "AHC", "IEA"]
    ])
    logger.info("Done")

    #-----------------------------------------------
    # Plotting precision and recall for each evidence code
    logger.info("=================================================")
    logger.info("Plotting precision and recall for each evidence code")
    reference = "EXP2"
    #plotEvidence=["AHC", "IEA"]
    plotEvidence = ["ISS", "TAS", "NAS", "IC", "AHC", "IEA"]

    # One marker per evidence code, one colour per aspect
    evidenceMarker = dict(zip(plotEvidence, ['s', 'd', 'D', '*', 'p', 'h']))
    evidenceSize = dict(zip(plotEvidence, [8, 8, 8, 8, 15, 15]))
    aspectColor = dict(zip(allAspect, ["blue", "green", "red", "cyan"]))

    # NOTE(review): fig is never read afterwards
    fig = figure(figsize=(8, 8))
    for evidence in plotEvidence:
        for aspect in allAspect:
            if aspect == "All_aspects_of_GO":
                continue
            allX = compareFA['recall'][aspect][(evidence, reference)].values()
            allY = compareFA['precision'][aspect][(evidence,
                                                   reference)].values()

            # Mean over the gene products, with the standard error as error bar
            meanX = mean(allX)
            errX = std(allX) / sqrt(len(allX))
            meanY = mean(allY)
            errY = std(allY) / sqrt(len(allY))

            errorbar(meanX, meanY, xerr=errX, yerr=errY, alpha=0.9, hold=True,
                     mfc=aspectColor[aspect], ecolor=aspectColor[aspect],
                     marker=evidenceMarker[evidence],
                     ms=evidenceSize[evidence])

    xlabel("Verspoor Hierarchical Recall")
    ylabel("Verspoor Hierarchical Precision")

    # Legend: one entry per aspect followed by one entry per evidence code.
    # NOTE(review): the first three legend colours are hard-coded; whether
    # they match aspectColor depends on the order of allAspect -- confirm.
    allMarker = ['o', 'o', 'o', 's', 'd', 'D', '*', 'p', 'h']
    allColor = [
        "green", "red", "cyan", "white", "white", "white", "white", "white",
        "white"
    ]
    allLabel = [
        aspect.replace("_", " ") for aspect in allAspect
        if not aspect == "All_aspects_of_GO"
    ]
    allLabel.extend(plotEvidence)
    foo = [
        Line2D(arange(5), arange(5), ls='-', marker=m, color=c, label=l)
        for m, c, l in zip(allMarker, allColor, allLabel)
    ]
    leg = legend(foo, allLabel, loc="upper left", numpoints=1)
    leg.legendPatch.set_alpha(0.5)
    grid()

    outDir = "%s/Graph/%s" % (projectDir, organism)
    createDir(outDir)
    figName = "%s/PrecisionVSRecall.png" % outDir
    savefig(figName)
    logger.info("Done")
def GOFrequencyBovinePipelines(projectDir):
    """
    Plot the frequency of GO terms in three bovine functional annotations.

    projectDir -- root directory; the reference set, the ontology and the
                  annotations are read from its ReferenceSet, OBO and
                  Annotation sub-directories, and one figure per pipeline
                  and aspect is written to <projectDir>/Graph/bovine.

    For every GO term of an aspect the frequency is log(1 + number of gene
    products annotated with the term), rescaled so that the most frequent
    term gets 256.
    """
    projectName = "bovinePipeline"
    organism = "bovine"

    logger.info("◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦")
    logger.info("This function plot the frequency of GO terms from 3 functional annotations for a Bovine array")
    logger.info("◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦")
    logger.info("name of the project : %s " % projectName)

    #Read bovine microarray probe set to define the set of gene products
    fileName = "%s/ReferenceSet/%s.fasta" % (projectDir, organism)
    refSet = RefSet(organism=organism, fileName=fileName, refType="Fasta")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    #Read Functional annotations
    allFileName = list()
    allFileName.append("%s/Annotation/Affy_%s.na31.annot.csv" % (projectDir, organism))
    allFileName.append("%s/Annotation/B2G_%s.annot" % (projectDir, organism))
    allFileName.append("%s/Annotation/AID_%s.txt" % (projectDir, organism))
    allPipeName = ["AFFY", "B2G", "AID"]
    # The pipeline names are also the names of the file types
    allFileType = allPipeName

    pipeline = dict()
    for pipeName, fileName, fileType in zip(allPipeName, allFileName, allFileType):
        FA = FuncAnnot(pipeName, refSet, G, organism=organism)
        FA.read(fileName, fileType=fileType)
        pipeline[pipeName] = FA

    #----------------------------------------------
    #Plot frequency of GO terms in a radial graph
    outDir = "%s/Graph/%s" % (projectDir, organism)
    # The figures are saved here, so the directory has to exist
    createDir(outDir)

    logger.info("=================================================")
    logger.info("Plotting frequency of GO terms")
    logger.info("directory : %s" % outDir)

    for aspect in G.aspect:
        logger.info("%s : " % aspect)

        # The terms of the aspect are the same for every pipeline
        nodes = G.get_NodesfromAspect(aspect)

        # The object returned for one pipeline is handed to the next call
        A = None
        for pipeName in allPipeName:
            FA = pipeline[pipeName]

            counts = array([log(1 + len(FA.GOtoGP[aspect].get(go, []))) for go in nodes])

            # Rescale to [0, 256]; all frequencies are zero when no term of
            # the aspect is annotated.
            peak = max(counts) if len(counts) else 0
            if peak > 0:
                levels = [int(round(n)) for n in counts / peak * 256.]
            else:
                levels = [0] * len(counts)

            freq = dict(zip(nodes, levels))

            figName = "%s/Frequency_%s_%s.png" % (outDir, FA.name, aspect)
            A = G.plot_FrequencyGraph(aspect, freq, figName=figName, ttl="", graphviz=A)

    logger.info("◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦")
    logger.info("")
def worseFunctionalSimilarity(projectDir):
    """
    Identify the ten most different annotation sets between Affymetrix and
    Blast2GO for a Bovine array.

    projectDir -- root directory; the reference set, the ontology and the
                  annotations are read from its ReferenceSet, OBO and
                  Annotation sub-directories, and the figures are written to
                  <projectDir>/Graph/bovine/WorseFuncSim.

    For every aspect, the gene products annotated by both pipelines are
    ranked by functional similarity (mean of the two scores returned by
    GOSet_PWSimilarity) and the induced graphs of the least similar ones are
    plotted; fewer than ten are plotted when fewer are shared.
    """
    from AIGO.Similarity import GOSet_PWSimilarity

    projectName = "bovinePipeline"
    organism = "bovine"

    logger.info("◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦")
    logger.info("This function identifies the ten most different annotation sets between Affymetrix and Blast2GO for a Bovine array")
    logger.info("◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦")
    logger.info("name of the project : %s " % projectName)

    #Read bovine microarray probe set to define the set of gene products
    fileName = "%s/ReferenceSet/%s.fasta" % (projectDir, organism)
    refSet = RefSet(organism=organism, fileName=fileName, refType="Fasta")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    #Read the two functional annotations
    fileName = "%s/Annotation/Affy_%s.na31.annot.csv" % (projectDir, organism)
    FA1 = FuncAnnot("AFFY", refSet, G, organism=organism)
    FA1.read(fileName, fileType="AFFY")

    fileName = "%s/Annotation/B2G_%s.annot" % (projectDir, organism)
    FA2 = FuncAnnot("B2G", refSet, G, organism=organism)
    FA2.read(fileName, fileType="B2G")

    #Analyse Functional annotations
    analyseFA = AnalyseFA()
    batchExecute(["removeUnconnected"], analyseFA, [FA1, FA2])

    outDir = "%s/Graph/%s/WorseFuncSim" % (projectDir, organism)
    createDir(outDir)

    N = 10
    logger.info("=================================================")
    logger.info("Plotting the %d most dissimilar annotation sets" % N)
    logger.info("directory : %s" % outDir)

    for aspect in G.aspect:
        # A list, so that the positions returned by argsort can be mapped
        # back to gene products without rebuilding it for every plot.
        commonGene = list(set(FA1.GPtoGO[aspect].keys()).intersection(FA2.GPtoGO[aspect].keys()))

        logger.info("%s : processing %d annotation sets " % (aspect, len(commonGene)))

        allD = list()
        for g in commonGene:
            # Only the per-direction scores are used; the first result is ignored.
            _, l = GOSet_PWSimilarity(G, FA1.GPtoGO[aspect][g], FA2.GPtoGO[aspect][g])
            allD.append((array(l[0]) + array(l[1])) / 2.)

        idx = argsort(allD)

        # Slicing copes with aspects having fewer than N common gene products
        for i in idx[:N]:
            gp = commonGene[i]

            figName = "%s/%s_annotation_%s_from_%s_%s.png" % (outDir, aspect, gp, FA1.name, FA2.name)
            ttl = "%s annotations of %s from %s (green) and %s (red) : Functional similarity = %.2f" % (
                aspect.replace("_", " "), gp, FA1.name, FA2.name, allD[i])

            FA1.G.compare_InducedGraph(FA1.GPtoGO[aspect][gp], FA2.GPtoGO[aspect][gp],
                                       figName=figName, ttl=ttl)

    logger.info("◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦")
    logger.info("")
elif fileType=="AFFY": self.GPtoGO, self.GOtoGP = IO.extract_Affy(self.fileName, self.G, refSet=self.refSet) elif fileType=="AID": self.GPtoGO, self.GOtoGP = IO.extract_AID(self.fileName, self.G, refSet=self.refSet) elif fileType=="SCOP": self.GPtoGO, self.GOtoGP = IO.extract_SCOP(self.fileName, self.G, refSet=self.refSet) elif fileType=="GNISD": #gene networks in seed development format http://seedgenenetwork.net/annotate#arabidopsis self.GPtoGO, self.GOtoGP = IO.extract_Affy(self.fileName, self.G, refSet=self.refSet, GO_columns=[6, 7, 8], filetype="GNIS-Affy", delimiter=' ', quoting=csv.QUOTE_MINIMAL) elif fileType=="GP2GO": self.GPtoGO, self.GOtoGP = IO.extract_GP2GO(self.fileName, self.G, refSet=self.refSet) elif fileType=="GO2GP": self.GPtoGO, self.GOtoGP = IO.extract_GO2GP(self.fileName, self.G, refSet=self.refSet) except Exception, e: logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e))) else: #Find the set of annotated gene products self['GA']=set() for a in self.G.aspect: self['GA']=self['GA'] | set(self.GPtoGO[a].keys()) logger.info ("%d gene products are annotated" % (len(self['GA']))) self.status="Loaded" for a in self.G.aspect: logger.info ("%s : %.2f annotations per set" % (a, mean([len(self.GPtoGO[a][gp]) for gp in self.GPtoGO[a]])))
def compareRandomizePipelines(projectDir):
    """
    Compare the properties of 3 randomized functional annotations
    (AFFY, B2G and AID) for a Bovine array.

    The three annotations are loaded from <projectDir>/Annotation, passed
    to RandomizeFA "shuffleAnnotation" and analysed ("coherence",
    "redundancy"); the very same annotation objects are then passed to
    "sampleAnnotation" and analysed again. Statistics of both rounds are
    handed to a ReportFA writing under <projectDir>/Export/bovine.

    projectDir : root directory holding ReferenceSet/, OBO/, Annotation/
    """
    projectName = "randomizePipeline"
    organism = "bovine"
    banner = "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"

    logger.info(banner)
    logger.info(
        "This function compare the properties of 3 randomized functional annotations for a Bovine array."
    )
    logger.info(banner)
    logger.info("name of the project : %s " % projectName)

    # Read the bovine microarray target sequences defining the set of gene products
    refSet = RefSet(organism=organism,
                    fileName="%s/ReferenceSet/%s.fasta" % (projectDir, organism),
                    refType="Fasta")

    # Read the GO ontology
    G = readGOoboXML("%s/OBO/go_daily-termdb.obo-xml" % (projectDir),
                     force=False)

    # Read the functional annotations; the pipeline name doubles as file type
    allPipeName = ["AFFY", "B2G", "AID"]
    allFileType = allPipeName
    allFileName = [
        "%s/Annotation/Affy_%s.na31.annot.csv" % (projectDir, organism),
        "%s/Annotation/B2G_%s.annot" % (projectDir, organism),
        "%s/Annotation/AID_%s.txt" % (projectDir, organism),
    ]

    pipeline = dict()
    for idx, name in enumerate(allPipeName):
        annot = FuncAnnot(name, refSet, G, organism=organism)
        annot.read(allFileName[idx], fileType=allFileType[idx])
        pipeline[name] = annot

    def loadedFA():
        # A fresh list on every call, ordered like allPipeName
        return [pipeline[name] for name in allPipeName]

    # Randomize FA
    randomizeFA = RandomizeFA()

    #-----------------------------------------------
    # Shuffle functional annotation
    batchExecute(["shuffleAnnotation"], randomizeFA, loadedFA())

    # Analyse the shuffled functional annotations
    analyseFA = AnalyseFA()
    batchExecute(["coherence", "redundancy"], analyseFA, loadedFA())

    # Export statistics to Excel
    outDir = "%s/Export/%s" % (projectDir, organism)
    createDir(outDir)
    exportList = ["coherence", "redundancy"]
    report = ReportFA(name="Randomize shuffle",
                      outDir=outDir,
                      organism=organism)
    report.printStatistics(loadedFA(), exportList)
    report.saveStatistics(loadedFA(), exportList)

    #-----------------------------------------------
    # Resample functional annotation
    batchExecute(["sampleAnnotation"], randomizeFA, loadedFA())

    # Analyse the resampled functional annotations
    # ("coherence" and "compactness" are deliberately left out here)
    batchList = [
        "obsolete", "unconnected", "removeUnconnected", "coverage",
        "richness", "numberAnnot", "redundancy", "specificity",
        "informationContent"
    ]
    batchExecute(batchList, analyseFA, loadedFA())

    # Export statistics to Excel
    outDir = "%s/Export/%s" % (projectDir, organism)
    createDir(outDir)
    exportList = [
        "coverage", "numberAnnot", "richness", "specificity",
        "informationContent", "redundancy"
    ]
    report = ReportFA(name="Randomize sample",
                      outDir=outDir,
                      organism=organism)
    report.printStatistics(loadedFA(), exportList)
    report.saveStatistics(loadedFA(), exportList)

    logger.info(banner)
    logger.info("")
def compareBovinePipelines(projectDir):
    """
    Compare the properties of 3 functional annotations (AFFY, B2G and AID)
    for a Bovine array.

    Steps, in order: load the reference set, the GO graph and the three
    annotations; run the AnalyseFA statistics; log the size of the largest
    annotation set of each pipeline; plot the statistics with PlotFA under
    <projectDir>/Graph/bovine; run the CompareFA "venn" and "funcSim"
    comparisons and plot them; finally hand the statistics to a ReportFA
    writing under <projectDir>/Export/bovine.

    projectDir : root directory holding ReferenceSet/, OBO/, Annotation/
    """
    projectName = "bovinePipeline"
    organism = "bovine"
    banner = "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"

    logger.info(banner)
    logger.info(
        "This function compare the properties of 3 functional annotations for a Bovine array."
    )
    logger.info(banner)
    logger.info("name of the project : %s " % projectName)

    # Read bovine microarray probe set to define the set of gene products
    refSet = RefSet(organism=organism,
                    fileName="%s/ReferenceSet/%s.fasta" % (projectDir, organism),
                    refType="Fasta")

    # Read the GO ontology
    G = readGOoboXML("%s/OBO/go_daily-termdb.obo-xml" % (projectDir),
                     force=False)

    # Read the functional annotations; the pipeline name doubles as file type
    allPipeName = ["AFFY", "B2G", "AID"]
    allFileType = allPipeName
    allFileName = [
        "%s/Annotation/Affy_%s.na31.annot.csv" % (projectDir, organism),
        "%s/Annotation/B2G_%s.annot" % (projectDir, organism),
        "%s/Annotation/AID_%s.txt" % (projectDir, organism),
    ]

    pipeline = dict()
    for idx, name in enumerate(allPipeName):
        annot = FuncAnnot(name, refSet, G, organism=organism)
        annot.read(allFileName[idx], fileType=allFileType[idx])
        pipeline[name] = annot

    def loadedFA():
        # A fresh list on every call, ordered like allPipeName
        return [pipeline[name] for name in allPipeName]

    #-----------------------------------------------
    # Analyse Functional annotations
    # ("coherence" and "compactness" are deliberately left out here)
    analyseFA = AnalyseFA()
    statNames = [
        "obsolete", "unconnected", "removeUnconnected", "coverage",
        "richness", "numberAnnot", "redundancy", "specificity",
        "informationContent"
    ]
    batchExecute(statNames, analyseFA, loadedFA())

    # How big are the largest annotation sets ?
    analyseFA.largestSet(loadedFA())
    logger.info("The largest sets of annotations are :")
    for name in allPipeName:
        annot = pipeline[name]
        logger.info("\t%d for %s" %
                    (annot['largestSet']['All_aspects_of_GO'], annot.name))

    # Plot statistics of Functional annotations (same list as the analysis)
    outDir = "%s/Graph/%s" % (projectDir, organism)
    createDir(outDir)
    plotFA = PlotFA(xlabel="Annotation pipelines",
                    outDir=outDir,
                    name=projectName,
                    organism=organism,
                    ext="png")
    batchExecute(statNames, plotFA, loadedFA(), doGrid=True)

    # Only the annotation-number 2D histogram is drawn ("coherenceHisto2D" is off)
    batchExecute(["numberAnnotHisto2D"],
                 plotFA,
                 loadedFA(),
                 doGrid=True,
                 tit="")

    #-----------------------------------------------
    # Compare Functional annotations
    compareFA = CompareFA()
    batchExecute(["venn", "funcSim"], compareFA, loadedFA())

    # Plot statistics of the comparison between Functional annotations
    batchExecute(["venn", "funcSymSim"],
                 plotFA,
                 compareFA,
                 loadedFA(),
                 tit="")

    #-----------------------------------------------
    # Export statistics to Excel
    outDir = "%s/Export/%s" % (projectDir, organism)
    createDir(outDir)
    exportList = [
        "unconnected", "coverage", "richness", "numberAnnot", "specificity",
        "informationContent", "redundancy"
    ]
    reportFA = ReportFA(outDir=outDir, name=projectName, organism=organism)
    reportFA.printStatistics(loadedFA(), exportList)
    reportFA.saveStatistics(loadedFA(), exportList)

    logger.info(banner)
    logger.info("")
def compareSimilarity(projectDir):
    """
    Compare similarity measures (GS2, CzekanowskiDice, Resnik) on the
    bovine AFFY and B2G functional annotations.

    1. Load the reference set, the GO graph and both annotations, then run
       the AnalyseFA "removeUnconnected" step.
    2. Build, per pipeline and per aspect, an information-content table:
       every ancestor of an annotated term accumulates the number of gene
       products of that term, and each count c is turned into -log(c / max).
    3. Within the "biological_process" annotation sets of one pipeline,
       compute the mean GOSet_Similarity of every set holding more than one
       term, for each metric, and log two pairwise correlations.
    4. For gene products annotated in "molecular_function" by both
       pipelines, compute GOSet_PWSimilarity for each metric and log the
       same two correlations.

    projectDir : root directory holding ReferenceSet/, OBO/, Annotation/
    """
    projectName = "simPipeline"
    organism = "bovine"

    # Read bovine microarray probe set to define the set of gene products
    refSet = RefSet(organism=organism,
                    fileName="%s/ReferenceSet/%s.fasta" % (projectDir, organism),
                    refType="Fasta")

    # Read the GO ontology
    G = readGOoboXML("%s/OBO/go_daily-termdb.obo-xml" % (projectDir),
                     force=False)

    # Read the functional annotations; the pipeline name doubles as file type
    allPipeName = ["AFFY", "B2G"]
    allFileType = allPipeName
    allFileName = [
        "%s/Annotation/Affy_%s.na31.annot.csv" % (projectDir, organism),
        "%s/Annotation/B2G_%s.annot" % (projectDir, organism),
    ]

    pipeline = dict()
    for idx, name in enumerate(allPipeName):
        annot = FuncAnnot(name, refSet, G, organism=organism)
        annot.read(allFileName[idx], fileType=allFileType[idx])
        pipeline[name] = annot

    #-----------------------------------------------
    # Analyse Functional annotations
    analyseFA = AnalyseFA()
    batchExecute(["removeUnconnected"], analyseFA,
                 [pipeline[name] for name in allPipeName])

    # Compute information content
    logger.info("=================================================")
    logger.info("Computing Information Content")
    allIC = dict()
    for pipeName in allPipeName:
        FA = pipeline[pipeName]
        logger.info("\t%s" % FA.name)
        icOfPipe = allIC[pipeName] = dict()

        # Pass 1: propagate the number of gene products of each term to
        # every node returned by G.ancestors for that term
        for a in FA.G.aspect:
            counts = icOfPipe[a] = dict()
            for go in FA.GOtoGP[a]:
                n = len(FA.GOtoGP[a][go])
                for ans in FA.G.ancestors(FA.G.get_intid(go)):
                    counts[ans] = counts.get(ans, 0) + n

        # Pass 2: turn counts into -log(count / largest count of the aspect)
        for a in FA.G.aspect:
            counts = icOfPipe[a]
            if not counts:
                continue
            m = max(counts.values())
            for go in counts:
                counts[go] = -1. * log(1. * counts[go] / m)

    # Compare coherence of biological process annotation sets given by
    # three different similarity metrics.
    # NOTE(review): the log message below says AFFY but the pipeline used
    # is "B2G" -- confirm which one is intended.
    logger.info("=================================================")
    logger.info(
        "Computing functional coherence of biological process annotation sets in AFFY given by three different similarity metrics"
    )
    aspect = "biological_process"
    pipeName = "B2G"
    FA = pipeline[pipeName]

    # Only sets with at least two terms have an internal similarity
    multiTermSets = [
        terms
        for terms in (FA.GPtoGO[aspect][gp] for gp in FA.GPtoGO[aspect])
        if len(terms) > 1
    ]

    logger.info("\tGS2")
    allGS2 = [
        mean(GOSet_Similarity(G, terms, metric="GS2"))
        for terms in multiTermSets
    ]

    logger.info("\tCzekanowskiDice")
    allCD = [
        mean(GOSet_Similarity(G, terms, metric="CzekanowskiDice"))
        for terms in multiTermSets
    ]

    logger.info("\tResnik")
    allResnik = [
        mean(
            GOSet_Similarity(G, terms, metric="Resnik", IC=allIC[FA.name]))
        for terms in multiTermSets
    ]

    logger.info("\tCorrelation between GS2 and CzekanowskiDice : %.2f" %
                corrcoef(allGS2, allCD)[0][1])
    logger.info("\tCorrelation between CzekanowskiDice and Resnik: %.2f" %
                corrcoef(allCD, allResnik)[0][1])

    # Compare molecular function annotation sets in AFFY and B2G using
    # three similarity metrics
    logger.info("=================================================")
    logger.info(
        "Comparing molecular function annotation sets in AFFY and B2G using three different similarity metrics"
    )
    aspect = "molecular_function"
    affySets = pipeline["AFFY"].GPtoGO[aspect]
    b2gSets = pipeline["B2G"].GPtoGO[aspect]
    commonGene = set(affySets.keys()).intersection(b2gSets.keys())
    logger.info("\tProcessing %d genes" % len(commonGene))

    allGS2, allCD, allResnik = [], [], []
    for gp in commonGene:
        GO1, GO2 = affySets[gp], b2gSets[gp]
        allGS2.append(GOSet_PWSimilarity(G, GO1, GO2, metric="GS2")[0])
        allCD.append(
            GOSet_PWSimilarity(G, GO1, GO2, metric="CzekanowskiDice")[0])
        # FA is still the pipeline selected above, so its IC table is used
        allResnik.append(
            GOSet_PWSimilarity(G,
                               GO1,
                               GO2,
                               metric="Resnik",
                               IC=allIC[FA.name])[0])

    logger.info("\tCorrelation between GS2 and CzekanowskiDice : %.2f" %
                corrcoef(allGS2, allCD)[0][1])
    logger.info("\tCorrelation between CzekanowskiDice and Resnik: %.2f" %
                corrcoef(allCD, allResnik)[0][1])
def compareBovineAndRandom(projectDir):
    """
    Compare the properties of 3 functional annotations for a Bovine array
    plus a randomized version of the Affymetrix functional annotation.

    The Affymetrix file is loaded twice: once as "AFFY" and once as
    "resample"; only the latter is passed to RandomizeFA
    "sampleAnnotation". All four annotations are then analysed
    ("coherence", "redundancy", "numberAnnot") and plotted with PlotFA
    under <projectDir>/Graph/bovine.

    projectDir : root directory holding ReferenceSet/, OBO/, Annotation/
    """
    projectName = "BovineAndRandom"
    organism = "bovine"
    banner = "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"

    logger.info(banner)
    logger.info(
        "This function compare the properties of 3 functional annotations for a Bovine array + a randomize version of Affymetrix functional annotations."
    )
    logger.info(banner)
    logger.info("name of the project : %s " % projectName)

    # Read bovine microarray probe set to define the set of gene products
    refSet = RefSet(organism=organism,
                    fileName="%s/ReferenceSet/%s.fasta" % (projectDir, organism),
                    refType="Fasta")

    # Read the GO ontology
    G = readGOoboXML("%s/OBO/go_daily-termdb.obo-xml" % (projectDir),
                     force=False)

    # Read the functional annotations; "resample" starts as a second copy
    # of the Affymetrix annotation
    allPipeName = ["AFFY", "B2G", "AID", "resample"]
    allFileType = ["AFFY", "B2G", "AID", "AFFY"]
    allFileName = [
        "%s/Annotation/Affy_%s.na31.annot.csv" % (projectDir, organism),
        "%s/Annotation/B2G_%s.annot" % (projectDir, organism),
        "%s//Annotation/AID_%s.txt" % (projectDir, organism),
        "%s/Annotation/Affy_%s.na31.annot.csv" % (projectDir, organism),
    ]

    pipeline = dict()
    for idx, name in enumerate(allPipeName):
        annot = FuncAnnot(name, refSet, G, organism=organism)
        annot.read(allFileName[idx], fileType=allFileType[idx])
        pipeline[name] = annot

    def loadedFA():
        # A fresh list on every call, ordered like allPipeName
        return [pipeline[name] for name in allPipeName]

    # Randomize FA
    randomizeFA = RandomizeFA()
    analyseFA = AnalyseFA()

    #-----------------------------------------------
    # Resample the "resample" functional annotation only
    batchExecute(["sampleAnnotation"], randomizeFA, [pipeline["resample"]])

    # Analyse all four annotations
    statNames = ["coherence", "redundancy", "numberAnnot"]
    batchExecute(statNames, analyseFA, loadedFA())

    # Plot statistics of Functional annotations (same list as the analysis)
    outDir = "%s/Graph/%s" % (projectDir, organism)
    createDir(outDir)
    plotFA = PlotFA(xlabel="Annotation pipelines",
                    outDir=outDir,
                    name="Resample",
                    organism=organism,
                    ext="png")
    batchExecute(statNames, plotFA, loadedFA(), doGrid=True)

    batchExecute(["coherenceHisto2D", "numberAnnotHisto2D"],
                 plotFA,
                 loadedFA(),
                 doGrid=True,
                 tit="")

    logger.info(banner)
    logger.info("")
def meanRandPWSim(G, FA, drepli_lab, randsimdir, GOaspects='all_aspects', compute=False, silent=True):
    """
    Evaluate the average functional similarity shared by random pairs of genes.

    For every replicon of drepli_lab and every requested GO aspect, the
    pairwise similarities between all annotated gene products of the
    replicon are cached in text files named
    "<randsimdir>/<repli>.<aspect>.<metric>" (one per entry of the
    module-level `metrics`), then read back and averaged.

    G          : GO graph handed to GOSet_PWSimilarity
    FA         : functional annotation; FA.GPtoGO[aspect][gp] gives the
                 GO terms of gene product gp
    drepli_lab : mapping replicon name -> collection of gene product labels
    randsimdir : directory holding the cached similarity files
    GOaspects  : 'all_aspects' (the three GO aspects), a single aspect
                 name, or an iterable of aspect names
    compute    : if True, recompute every cache file; otherwise a
                 (replicon, aspect) pair is recomputed only when one of its
                 cache files is missing
    silent     : if False, print per-gene and per-file means to stdout

    Returns a dict such that result[repli][aspect][metric] is the mean of
    all values read from the matching cache file.

    The computation step writes to the "funSimMax" and "funSimAverage"
    files specifically, so `metrics` must contain both names whenever a
    computation is triggered.
    """
    if GOaspects == 'all_aspects':
        aspects = ['biological_process', 'molecular_function', 'cellular_component']
    elif isinstance(GOaspects, (str, type(u""))):
        # A single aspect given as a string: list() would split it into characters
        aspects = [GOaspects]
    else:
        aspects = list(GOaspects)

    dmeanrandsim = {}
    for repli in drepli_lab:
        llab = drepli_lab[repli]
        dmeanrandsim[repli] = {}
        for GOaspect in aspects:
            dmeanrandsim[repli][GOaspect] = {}
            dnfrand = dict(
                (metric, "%s/%s.%s.%s" % (randsimdir, repli, GOaspect, metric))
                for metric in metrics)

            # Decide per (replicon, aspect): a file missing for one pair must
            # not force the recomputation of every following pair
            needCompute = compute or not all(
                os.access(dnfrand[metric], os.F_OK) for metric in metrics)

            if needCompute:
                logger.info("Compute functional similarity on %s aspect between all random gene pair in %s"%(GOaspect, repli))
                dfout = {}
                try:
                    for metric in metrics:
                        dfout[metric] = open(dnfrand[metric], 'w')

                    # sorted list of genes covered by an annotation in this replicon
                    lGP = sorted(set(FA.GPtoGO[GOaspect].keys()) & set(llab))
                    for i, GP1 in enumerate(lGP):
                        lsmax = []
                        lsmean = []
                        # only explore the lower triangular matrix: lGP is
                        # sorted and duplicate-free, so the genes smaller
                        # than GP1 are exactly those preceding it
                        for GP2 in lGP[:i]:
                            GO1 = FA.GPtoGO[GOaspect][GP1]
                            GO2 = FA.GPtoGO[GOaspect][GP2]
                            # computing with "funSimMax" or "funSimAverage"
                            # yields the same list l, so one call serves both
                            maxsim, l = GOSet_PWSimilarity(G, GO1, GO2, FA=FA, metric="funSimMax")
                            lsmax.append(maxsim)
                            lsmean.append(mean(l))
                        if lsmax:
                            dfout["funSimMax"].write(' '.join(["%.3f"%s for s in lsmax])+'\n')
                        dfout["funSimAverage"].write(' '.join(["%.3f"%s for s in lsmean])+'\n')
                        if not silent:
                            print("%s %s %s" % (GP1, mean(lsmax), mean(lsmean)))
                finally:
                    # close whatever was opened, even if the computation failed
                    for fout in dfout.values():
                        fout.close()

            # read random similarity records
            for metric in metrics:
                nfrand = dnfrand[metric]
                logger.info("Read in file %s for random gene pair similarities"%nfrand)
                lsim = []
                with open(nfrand, 'r') as foutrand:
                    for line in foutrand:
                        lsim.extend([float(sim) for sim in line.split()])
                msim = mean(lsim)
                dmeanrandsim[repli][GOaspect][metric] = msim
                if not silent:
                    print("on %s for aspect %s with %s metric: %f"%(repli, GOaspect, metric, msim))
    return dmeanrandsim