Exemple #1
    def shuffleAnnotation(self, allFA, t=10):
        """
        This method randomly re-shuffles annotations across annotation sets.
        The coherence of the annotation sets should change.
        The size may be affected if two identical annotations end up assigned to the same set.
        The other statistics should remain unchanged or be only marginally affected.
        """

        for FA in allFA:

            logger.info("\t%s" % (FA.name))

            for aspect in allAspect:
                if aspect == 'All_aspects_of_GO':
                    continue

                iterKeys = flatten([
                    list(repeat(k, len(FA.GPtoGO[aspect][k])))
                    for k in FA.GPtoGO[aspect].keys()
                ])
                iterValues = list(flatten(FA.GPtoGO[aspect].values()))
                shuffle(iterValues)

                GPtoGO, GOtoGP = dict(), dict()
                for k, v in groupby(iterValues, lambda x: iterKeys.next()):
                    GPtoGO[k] = set(v)

                    for go in GPtoGO[k]:
                        if not GOtoGP.has_key(go):
                            GOtoGP[go] = set()
                        GOtoGP[go].add(k)

                FA.GPtoGO[aspect] = GPtoGO
                FA.GOtoGP[aspect] = GOtoGP
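A minimal usage sketch for this randomization step, assuming `RandomizeFA` is the class holding the method (its import path is not shown in these excerpts) and that `FA1` and `FA2` stand for `FuncAnnot` objects already loaded as in Exemple #18; `sampleAnnotation` in the next example follows the same calling pattern.

from AIGO.utils.Execute import batchExecute

# RandomizeFA is assumed importable from the AIGO package (path not shown here);
# FA1 and FA2 are placeholders for loaded FuncAnnot objects.
randomizeFA = RandomizeFA()

# Shuffle the annotations of both FAs in place, as done in compareRandomizePipelines below;
# coherence should drop while the other statistics stay roughly unchanged.
batchExecute(["shuffleAnnotation"], randomizeFA, [FA1, FA2])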
Exemple #2
    def sampleAnnotation(self, allFA):
        """
        This method randomly samples GO annotations while keeping the size of the annotation sets unchanged.
        """

        for FA in allFA:

            logger.info("\t%s" % (FA.name))

            for aspect in allAspect:
                if aspect == 'All_aspects_of_GO':
                    continue

                iterKeys = flatten([
                    list(repeat(k, len(FA.GPtoGO[aspect][k])))
                    for k in FA.GPtoGO[aspect].keys()
                ])
                nbValues = len(list(flatten(FA.GPtoGO[aspect].values())))
                allNodes = FA.G.get_NodesfromAspect(aspect)
                iterValues = take(allNodes, randint(0, len(allNodes),
                                                    nbValues))

                GPtoGO, GOtoGP = dict(), dict()
                for k, v in groupby(iterValues, lambda x: iterKeys.next()):
                    GPtoGO[k] = set(v)

                    for go in GPtoGO[k]:
                        if not GOtoGP.has_key(go):
                            GOtoGP[go] = set()
                        GOtoGP[go].add(k)

                FA.GPtoGO[aspect] = GPtoGO
                FA.GOtoGP[aspect] = GOtoGP
Exemple #3
    def funcSim(self, allFA):
        """
        Compute the semantic similarity between commonly annotated gene products (GP) for all possible pairs of FAs.
        """

        funcSim = dict()
        for aspect in allAspect:
            if aspect == "All_aspects_of_GO":
                continue

            funcSim[aspect] = dict()

            for twoFAs in combinations(allFA, 2):

                #Order the two FAs
                FA1, FA2 = take(twoFAs, argsort([FA.name for FA in twoFAs]))

                logger.info("\tbetween %s and %s for %s" %
                            (FA1.name, FA2.name, aspect))

                commonGene = self.getCommonGene(FA1, FA2, aspect)

                D = dict()
                for g in commonGene:
                    sim, l = GOSet_PWSimilarity(FA1.G, FA1.GPtoGO[aspect][g],
                                                FA2.GPtoGO[aspect][g])
                    D[g] = (l[0], l[1])

                funcSim[aspect][(FA1.name, FA2.name)] = D

        aspect = "All_aspects_of_GO"

        funcSim[aspect] = dict()
        for twoFAs in combinations(allFA, 2):

            #Order the two FAs
            FA1, FA2 = take(twoFAs, argsort([FA.name for FA in twoFAs]))

            logger.info("\tbetween %s and %s for %s" %
                        (FA1.name, FA2.name, aspect))

            commonGene = self.getCommonGene(FA1, FA2, aspect)

            D = dict()
            for g in commonGene:
                D1 = mean([
                    funcSim[a][(FA1.name, FA2.name)][g][0] for a in allAspect
                    if not a == "All_aspects_of_GO"
                    if funcSim[a][(FA1.name, FA2.name)].has_key(g)
                ])
                D2 = mean([
                    funcSim[a][(FA1.name, FA2.name)][g][1] for a in allAspect
                    if not a == "All_aspects_of_GO"
                    if funcSim[a][(FA1.name, FA2.name)].has_key(g)
                ])
                D[g] = (D1, D2)

            funcSim[aspect][(FA1.name, FA2.name)] = D

        self['funcSim'] = funcSim
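A short, illustrative sketch of reading the stored results back, assuming `compareFA` is the dict-like object whose `funcSim` method is shown above and that the two FAs are named "AFFY" and "B2G" as in the bovine examples; the meaning of the tuple components follows the `GOSet_PWSimilarity` return value used above.

aspect = "biological_process"
pair = ("AFFY", "B2G")                  # FA names, ordered alphabetically by funcSim()

# gene product -> the two similarity components returned by GOSet_PWSimilarity
D = compareFA['funcSim'][aspect][pair]
for g, (d1, d2) in D.items():
    print("%s\t%.3f\t%.3f" % (g, d1, d2))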
Exemple #4
    def add(self, statistics, plotType):
        logger.info("Registering plot function %s" % statistics)
        
        if not rS.isRegistered(statistics):
            logger.handleWarning("Caution, the statistics is unknown : %s" % statistics)
            return False

        self.all.append(statistics)
        
        if plotType=="Multiple":
            newPlotFunc = self.getMultiPlotFunction(statistics)
            newPlotFunc = new.instancemethod(newPlotFunc, None, self.cls)
            setattr(self.cls, newPlotFunc.__name__, newPlotFunc)
        elif plotType=="Histo3D":
            newPlotFunc = self.getHisto3DPlotFunction(statistics)
            newPlotFunc = new.instancemethod(newPlotFunc, None, self.cls)
            setattr(self.cls, "%sHisto3D" % newPlotFunc.__name__, newPlotFunc)
        else:
            logger.handleWarning("Caution, the plot type is unknown : %s" % plotType)

        self.types.setdefault(statistics, set()).add(plotType)
        self.allTypes.add(plotType)

        return True
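An illustrative registration sequence, assuming `rS` is the statistics registry of Exemple #6 and `plots` is an instance of the class that owns the `add` method above; the unit string is illustrative.

# Register the statistic itself first, otherwise plots.add() refuses it.
rS.add("coherence", name="Coherence", unit="(GS2)", types=["Multiple", "Histo3D"])

plots.add("coherence", "Multiple")   # attaches a Multiple() bar-plot method to plots.cls
plots.add("coherence", "Histo3D")    # attaches a "<name>Histo3D" plot method
plots.add("coherence", "Radial")     # unknown plot type: a warning is logged, but the
                                     # statistic is still recorded in plots.types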
Exemple #5
    def Multiple(self, allFA,  statistics=None, figName=None, lloc="upper right", doGrid=False):
        lLabel=[aspect.replace("_", " ") for aspect in allAspect]

        logger.info("\t%s" % statistics)

        lBar=list()
        for aspect in allAspect:
            data=[]
            for FA in allFA:
                if type(FA[statistics][aspect]) == list:
                    data.append(mean(FA[statistics][aspect]))
                else:
                    data.append(FA[statistics][aspect])
            lBar.append( data )
            
        multiBar(lBar, lLabel, [FA.name for FA in allFA],
                 self.xlabel, "%s %s" % (rS.getName(statistics), rS.getUnit(statistics)),
                 lloc=lloc, grid=doGrid)

        if figName is None:
            figName="%s/Multi_%s_%s.%s" % (self.outDir, statistics, self.name, self.ext)
            
        savefig(figName)

        return figName
Exemple #6
    def add(self, statistics, name, unit="", types=None):
        logger.info("Registering statistics function %s" % statistics)

        self.all.append(statistics)
        self.name[statistics] = name
        self.unit[statistics] = unit
        self.types[statistics] = types
Exemple #7
    def checkValidity(self):
        logger.info("Name :\t%s" % self.name)
        
        allValid=True
        for aspect in self.GPtoGO:
            valid=True
            for gp in self.GPtoGO[aspect]:
                for go in self.GPtoGO[aspect][gp]:
                    if not gp in self.GOtoGP[aspect][go]:
                        logger.handleWarning ("%s not found in GOtoGP[%s][%s]" % (gp,aspect,go))
                        valid=False

            for go in self.GOtoGP[aspect]:
                for gp in self.GOtoGP[aspect][go]:
                    if not go in self.GPtoGO[aspect][gp]:
                        logger.handleWarning ("%s not found in GPtoGO[%s][%s]" % (go,aspect,gp))
                        valid=False


            if valid:
                logger.info ("%s : is valid" % (aspect))
            
            allValid = allValid and valid
            
        return allValid
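A toy illustration of the invariant this check enforces: for every aspect, `GPtoGO` and `GOtoGP` must be exact inverses of each other. The gene product ids below are placeholders and the GO ids are arbitrary examples.

# Toy data: every (gp, go) pair stored in GPtoGO must also appear in GOtoGP, and vice versa.
GPtoGO = {"molecular_function": {"gp1": set(["GO:0003674"]),
                                 "gp2": set(["GO:0003674", "GO:0005515"])}}

# Rebuild the inverse mapping; checkValidity() succeeds when self.GOtoGP matches it.
GOtoGP = {"molecular_function": dict()}
for gp in GPtoGO["molecular_function"]:
    for go in GPtoGO["molecular_function"][gp]:
        GOtoGP["molecular_function"].setdefault(go, set()).add(gp)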
Exemple #8
    def save(self, fileName):
        import shelve
        try:
            logger.info("File :\t%s" % fileName)
            shelf = shelve.open(fileName, protocol=-1)
            shelf['fileName'] = fileName
            for k, v in self.items():
                shelf[k] = v

            shelf.close()
            self.status = "Saved"

        except Exception, e:
            logger.handleWarning("Unable to save project %s: %s" %
                                 (fileName, str(e)))
Exemple #9
    def load(self, fileName):

        import shelve
        try:
            logger.info("File :\t%s" % fileName)
            shelf = shelve.open(fileName, protocol=-1)
            for k, v in shelf.items():
                self[k] = v

            shelf.close()
            self.status = "Loaded"

        except Exception, e:
            logger.handleWarning("Unable to load project %s: %s" %
                                 (fileName, str(e)))
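A small usage sketch of the shelve-backed persistence above; the project object is dict-like, the file name is arbitrary, and `Project()` stands for whatever class actually owns these two methods (its name is not visible in these excerpts).

project = Project()                      # hypothetical constructor; the owning class is not shown here
project['GA'] = set(["gp1", "gp2"])      # any picklable entry of the dict-like object
project.save("bovinePipeline.shelve")    # writes fileName and every key to the shelf, status becomes "Saved"

restored = Project()
restored.load("bovinePipeline.shelve")   # copies every shelved key back, status becomes "Loaded"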
Exemple #10
    def read(self, fileName="", fileType=""):
        """
        This method reads functional annotations.
        Available file formats are: GO annotation file (GAF), Blast2GO (B2G), Affymetrix (AFFY), ArrayIDer (AID), and GP2GO or GO2GP for a simple GP id to GO ids mapping.
        """

        logger.info("Name :\t%s" % self.name)

        if not fileName=="":
            self.__dict__['fileName'] = fileName

        if not fileType=="":
            self.__dict__['fileType'] = fileType


        fileType=IO.IOType.get(self.fileType, self.fileType)
        
        if fileType not in IO.IOType.values():
            print "Sorry, unknown file type : %s" % fileType
            raise ValueError

        logger.info("%s file : \t%s " % (fileType, fileName ) )

        try:
            if fileType=="GAF":
                self.GPtoGO, self.GOtoGP = IO.extract_GAF(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="B2G":
                self.GPtoGO, self.GOtoGP = IO.extract_GP2GO(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="AFFY":
                self.GPtoGO, self.GOtoGP = IO.extract_Affy(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="AID":
                self.GPtoGO, self.GOtoGP = IO.extract_AID(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="SCOP":
                self.GPtoGO, self.GOtoGP = IO.extract_SCOP(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="GNISD": #gene networks in seed development format http://seedgenenetwork.net/annotate#arabidopsis
                self.GPtoGO, self.GOtoGP = IO.extract_Affy(self.fileName, self.G, refSet=self.refSet, GO_columns=[6, 7, 8], filetype="GNIS-Affy", delimiter='\t', quoting=csv.QUOTE_MINIMAL)
            elif fileType=="GP2GO":
                self.GPtoGO, self.GOtoGP = IO.extract_GP2GO(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="GO2GP":
                self.GPtoGO, self.GOtoGP = IO.extract_GO2GP(self.fileName, self.G, refSet=self.refSet)
                
        except Exception, e:
            logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))
Exemple #11
    def add(self, FA):
        logger.info("Name :\t%s" % self.name)
        
        for aspect in FA.GPtoGO:
            if not self.GPtoGO.has_key(aspect):
                self.GPtoGO[aspect]=dict()
                
            for gp in FA.GPtoGO[aspect]:                
                self.GPtoGO[aspect].setdefault(gp, set()).update(FA.GPtoGO[aspect][gp])

        for aspect in FA.GOtoGP:
            if not self.GOtoGP.has_key(aspect):
                self.GOtoGP[aspect]=dict()
                
            for go in FA.GOtoGP[aspect]:
                self.GOtoGP[aspect].setdefault(go, set()).update(FA.GOtoGP[aspect][go])

        self['GA']=set()
        for a in  self.G.aspect:
            self['GA']=self['GA']  | set(self.GPtoGO[a].keys())

        logger.info ("%d gene products are annotated" % (len(self['GA'])))

        self.status="Loaded"
            
        for  a in self.G.aspect:
            logger.info ("%s : %.2f annotations per set" % (a, mean([len(self.GPtoGO[a][gp]) for gp in self.GPtoGO[a]])))
Exemple #12
    def inter(self, FA):
        logger.info("Name :\t%s" % self.name)
        
        for aspect in self.GPtoGO:
            if not FA.GPtoGO.has_key(aspect):
                self.GPtoGO[aspect]=dict()
            else:
                for gp in set(self.GPtoGO[aspect].keys()).difference(FA.GPtoGO[aspect]):
                    del self.GPtoGO[aspect][gp]
                    
                for gp in set(self.GPtoGO[aspect].keys()).intersection(FA.GPtoGO[aspect]):
                    self.GPtoGO[aspect][gp].intersection_update(FA.GPtoGO[aspect][gp])

        for aspect in self.GOtoGP:
            if not FA.GOtoGP.has_key(aspect):
                self.GOtoGP[aspect]=dict()
            else:
                for go in set(self.GOtoGP[aspect].keys()).difference(FA.GOtoGP[aspect].keys()):
                    del self.GOtoGP[aspect][go]
                for go in set(self.GOtoGP[aspect].keys()).intersection(FA.GOtoGP[aspect].keys()):
                    self.GOtoGP[aspect][go].intersection_update(FA.GOtoGP[aspect][go])

        self['GA']=set()
        for a in  self.G.aspect:
            self['GA']=self['GA']  | set(self.GPtoGO[a].keys())

        logger.info ("%d gene products are annotated" % (len(self['GA'])))

        self.status="Loaded"
            
        for  a in self.G.aspect:
            logger.info ("%s : %.2f annotations per set" % (a, mean([len(self.GPtoGO[a][gp]) for gp in self.GPtoGO[a]])))
Exemple #13
    def removeGP(self, GP, myAspects=None):
        logger.info("Name :\t%s" % self.name)

        if myAspects==None:
            myAspects=self.GPtoGO
        
        for aspect in myAspects:
            for g in GP:
                if self.GPtoGO[aspect].has_key(g):
                    del self.GPtoGO[aspect][g]

        self.GOtoGP=dict()
        for aspect in self.G.aspect:
            self.GOtoGP[aspect]=dict()

            for gp in self.GPtoGO[aspect]:
                for go in self.GPtoGO[aspect][gp]:
                    self.GOtoGP[aspect].setdefault(go, set()).add(gp)

        self['GA']=set()
        for a in  self.G.aspect:
            self['GA']=self['GA']  | set(self.GPtoGO[a].keys())

        logger.info ("%d gene products are annotated" % (len(self['GA'])))

        self.status="Loaed"
            
        for  a in self.G.aspect:
            logger.info ("%s : %.2f annotations per set" % (a, mean([len(self.GPtoGO[a][gp]) for gp in self.GPtoGO[a]])))
Exemple #14
def loadFA(G, norganism, dbcur, drepli, drepli_lab, taxid, aspects=aspects, metrics=metrics, analysisList=analysisList):
	inrefset = set([])
	for repli in drepli_lab:
		inrefset |= set(drepli_lab[repli])
	refSet = RefSet(organism=norganism, inSet=inrefset, refType="DB")
	FA = FuncAnnot(norganism, refSet, G, organism=norganism)
	FA.read_from_db(dbcur, replicons=drepli.keys())
	analyseFA = AnalyseFA()
	#print FA.GPtoGO['biological_process'].keys()
	analyseFA.largestSet([FA])
	logger.info("Largest sets of annotations:")
	logger.info("\t%d for %s" % (FA['largestSet']['All_aspects_of_GO'], FA.name))
	batchExecute(analysisList, analyseFA, [FA])
	#~ drepli_lab = {}
	#~ genelabeldir = "%s/genelabels/%s"%(outdir, norganism)
	#~ nflabels = "%s/%s_all_gene_labels"%(genelabeldir, norganism)
	#~ flab = open(nflabels, 'r')
	#~ for line in flab:
		#~ lsp = line.rstrip('\n').split('\t')
		#~ drepli_lab[lsp[0]] = drepli_lab.setdefault(lsp[0], []) + ["%s.%s"%(str(taxid), lsp[1])]	#[lsp[1]]
	#~ flab.close()
	return FA #, drepli_lab
Exemple #15
def venn_NS(v, tit=None):
    logger.info(tit)

    idx = argsort([len(key) for key in v.keys()])
    for key in array(v.keys())[idx]:
        logger.info("%s \t: %.2f" % (key.replace('@', '\t ^ '), v[key]))

    logger.info("TOTAL\t: %.2f" % (100.0 - sum(v.values())))
Exemple #16
    def recall(self, allFA):
        """
        Compute the hierarchical recall of Verspoor et al. (2006), using the first FA as the gold standard.
        """

        #The first FA is used as a Gold Standard
        GS = allFA[0]

        recall = dict()
        for aspect in allAspect:
            if aspect == "All_aspects_of_GO":
                continue

            recall[aspect] = dict()
            for FA in allFA[1:]:
                recall[aspect][(FA.name, GS.name)] = dict()

                logger.info("\t%s vs %s for %s" % (FA.name, GS.name, aspect))

                commonGene = self.getCommonGene(FA, GS, aspect)
                for g in commonGene:
                    r = list()
                    for gs in GS.GPtoGO[aspect][g]:
                        aGS = set(FA.G.get_Ancestors(gs))
                        naGS = len(aGS)

                        maxSim = max([
                            (1.0 *
                             len(aGS.intersection(FA.G.get_Ancestors(go)))) /
                            naGS for go in FA.GPtoGO[aspect][g]
                        ])
                        r.append(maxSim)

                    recall[aspect][(FA.name, GS.name)][g] = mean(r)

        self['recall'] = recall
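A toy, self-contained computation of the hierarchical recall above for a single gene product, with the ancestor sets written out by hand instead of being queried through `FA.G.get_Ancestors`; the GO identifiers are made up.

# Hand-written ancestor sets (each term is considered its own ancestor here).
ancestors = {
    "GO:A": set(["GO:A", "GO:root"]),
    "GO:B": set(["GO:B", "GO:A", "GO:root"]),
    "GO:C": set(["GO:C", "GO:root"]),
}

gold = ["GO:B"]                # gold-standard annotation set (GS)
predicted = ["GO:A", "GO:C"]   # predicted annotation set (FA)

r = list()
for gs in gold:
    aGS = ancestors[gs]
    best = max([len(aGS & ancestors[go]) / float(len(aGS)) for go in predicted])
    r.append(best)

hierarchicalRecall = sum(r) / float(len(r))   # max(2/3, 1/3) = 0.67 for this toy example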
Exemple #17
def readGOoboXML(fileName, force=False, prefix="GO"):
    import cPickle as pickle

    picName = "%s.pic" % fileName
    if (not os.path.exists(picName)):
        force = True

    if not force:
        try:
            logger.info("Reading serialized OBO file : %s" % picName)
            with open(picName, "rb") as f:

                G = pickle.load(f)
                f.close()
        except IOError as inst:
            print str(type(inst)) + " for " + picName
            force = True
        except EOFError as inst:  ## empty file?
            print str(type(inst)) + " for " + picName
            force = True

    try:
        if force:
            fileName = checkForZip(fileName)
            if (not os.path.exists(fileName)):
                raise IOError(fileName + " does not exist and is required ")

            logger.info("Reading OBO file : %s" % fileName)

            G = get_GOGraph(readFile(fileName, mode="r"), prefix=prefix)
            G.fileName = fileName

            with open(picName, "wb") as f:
                logger.info("Saving serialized OBO file")
                pickle.dump(G, f, -1)
            f.close()
    except Exception, e:
        logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))
Exemple #18
from AIGO.ReferenceSet import RefSet
from AIGO.FunctionalAnnotation import FuncAnnot
from AIGO.go.OBO import readGOoboXML

from AIGO.Analyse import AnalyseFA
from AIGO.Report import ReportFA

from AIGO.utils.Execute import batchExecute

refSet = RefSet(organism="platypus",
                fileName="platypus.refSet",
                refType="Text")
G = readGOoboXML("go_daily-termdb.obo-xml")
FA = FuncAnnot("platypusProject", refSet, G, organism="platypus")
FA.read("platypus.gaf", "GAF")

analyseFA = AnalyseFA()

analyseFA.largestSet([FA])
logger.info("Largest sets of annotations:")
logger.info("\t%d for %s" % (FA['largestSet']['All_aspects_of_GO'], FA.name))

batchList = [
    "coverage", "richness", "numberAnnot", "redundancy", "specificity",
    "informationContent", "hPrecision"
]
batchExecute(batchList, analyseFA, [FA])

reportFA = ReportFA(outDir=None, name="platypusProject", organism="platypus")
reportFA.printStatistics([FA], batchList)
Exemple #19
    def add(self, fileName, refType="Fasta"):

        if self.fileName == '':
            self.fileName = fileName
            self.refType = refType
        else:
            if type(self.fileName) == list:
                self.fileName.append(fileName)
                self.refType.append(refType)
            else:
                self.fileName = [self.fileName, fileName]
                self.refType = [self.refType, refType]

        fileName = checkForZip(fileName)
        if (not os.path.exists(fileName)):
            logger.handleFatal(fileName + " does not exist and is required ")

        logger.info("Organism :\t%s" % self.organism)

        logger.info("%s file :\t%s " % (refType, fileName))

        try:

            #Use fasta file to define the reference set
            if refType == "Fasta":
                from Bio import SeqIO
                allID = set([
                    rec.name.split(";")[0].split(":")[-1]
                    for rec in SeqIO.parse(readFile(fileName), "fasta")
                ])
                self.update(allID)

            #Use a simple text file to define the reference set, first column is chosen by default
            elif refType == "Text":
                allID = set([
                    r[0] for r in csv.reader(readFile(fileName), delimiter=";")
                ])
                self.update(allID)

            #Use a GO annotation file to define the reference set
            elif refType == "GAF":
                from AIGO.IO import readGAF_2
                data, GAF_col = readGAF_2(fileName)

                allID = set([
                    ".".join([
                        row[GAF_col.index("Taxon(|taxon)")][6:],
                        row[GAF_col.index("DB Object Symbol")]
                    ]) for row in data
                ])
                self.update(allID)

            #Use a Affymetrix annotation file to define the reference set
            elif refType == "AFFY":
                f = readFile(fileName)
                row = f.readline()
                while row[0] == '#':
                    row = f.readline()

                header = row
                rd = csv.reader(f)
                allID = set()
                for row in rd:
                    #Read gene product id if not control sequence
                    if ("Control sequence".upper() != row[4].upper()):
                        allID.add(row[0])

                self.update(allID)
            else:
                print "Sorry, unknown file type !!"
                self.extend([])
                raise Exception

            if len(self) == 0:
                logger.handleWarning("No gene products loaded")

        except Exception, e:
            logger.handleFatal("Unable to read file %s: %s" %
                               (fileName, str(e)))
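A sketch of building a reference set from several sources with the method above, using the constructor shown in Exemple #18; all file names are placeholders.

from AIGO.ReferenceSet import RefSet

refSet = RefSet(organism="bovine")
refSet.add("bovine.fasta", refType="Fasta")                  # probe ids parsed from a FASTA file
refSet.add("extra_ids.txt", refType="Text")                  # first column of a ';'-separated file
refSet.add("two_experimental_evidence.goa", refType="GAF")   # "taxon.symbol" ids from a GAF file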
Exemple #20
def compareEvidence(projectDir):
    """
    This function compare electronically infered and manually curated annotations to experimental annotations
    """

    projectName = "EvidenceCode"
    organism = "allSpecies"

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info(
        "This function compares electronically inferred and manually curated annotations to experimental annotations."
    )
    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )

    logger.info("name of the project : %s " % projectName)

    refSet = RefSet(organism=organism)

    allOrg = [
        "Arabidopsis_thaliana", "Drosophila_melanogaster",
        "Mycobacterium_tuberculosis_ATCC_25618", "Schizosaccharomyces_pombe",
        "Bos_taurus", "Escherichia_coli_ATCC_27325",
        "Mycobacterium_tuberculosis_Oshkosh", "Caenorhabditis_elegans",
        "Escherichia_coli_MG1655", "Oryza_sativa", "Synechocystis_sp",
        "Candida_albicans_SC5314", "Gallus_gallus",
        "Pseudomonas_fluorescens_Pf-5", "Danio_rerio", "Homo_sapiens",
        "Rattus_norvegicus"
    ]

    for refOrg in allOrg:
        #Define the set of gene products
        fileName = "%s/EvidenceCode/%s/two_experimental_evidence.goa" % (
            projectDir, refOrg)
        refSet.add(fileName=fileName, refType="GAF")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    #Read all annotations
    fileType = "GAF"

    evidenceCodes = ["EXP2", "IC", "TAS", "ISS", "NAS", "IEA"]

    allFA = dict()

    #-----------------------------------------------
    #Read Functional annotations obtained by experiments
    pipeName = "EXP2"
    EXP2 = FuncAnnot(pipeName, refSet, G, organism=organism)
    for refOrg in allOrg:
        fileName = "%s/EvidenceCode/%s/two_experimental_evidence.goa" % (
            projectDir, refOrg)
        FA = FuncAnnot(pipeName, refSet, G, organism=refOrg)
        FA.read(fileName, fileType=fileType)
        EXP2.add(FA)
    allFA[pipeName] = EXP2

    #-----------------------------------------------
    #Read Functional annotations obtained by human curation
    for pipeName in ["IC", "TAS", "ISS", "NAS"]:
        EV = FuncAnnot(pipeName, refSet, G, organism=organism)
        for refOrg in allOrg:
            fileName = "%s/EvidenceCode/%s/%s.goa" % (projectDir, refOrg,
                                                      pipeName)
            if not os.path.exists(fileName):
                continue
            FA = FuncAnnot(pipeName, refSet, G, organism=refOrg)
            FA.read(fileName, fileType=fileType)
            EV.add(FA)
        allFA[pipeName] = EV

    #Merge FAs Assigned by Human Curator
    FA = FuncAnnot("AHC", refSet, G, organism=organism)
    #for evidence in ["IC", "TAS", "ISS", "NAS"]:
    for evidence in ["IC", "ISS", "NAS"]:
        FA.add(allFA[evidence])
    allFA["AHC"] = FA

    #-----------------------------------------------
    #Read Functional annotations obtained without human curation
    for pipeName in ["IEA"]:
        EV = FuncAnnot(pipeName, refSet, G, organism=organism)
        for refOrg in allOrg:
            fileName = "%s/EvidenceCode/%s/%s.goa" % (projectDir, refOrg,
                                                      pipeName)
            if not os.path.exists(fileName):
                continue
            FA = FuncAnnot(pipeName, refSet, G, organism=refOrg)
            FA.read(fileName, fileType=fileType)
            EV.add(FA)
        allFA[pipeName] = EV

    #-----------------------------------------------
    listFA = ["EXP2", "AHC", "IEA"]

    #Analyse Functional annotations
    analyseFA = AnalyseFA()
    batchList = [
        "obsolete", "unconnected", "removeUnconnected", "coverage", "richness",
        "numberAnnot", "coherence", "redundancy", "removeRedundancy",
        "compactness", "specificity", "informationContent"
    ]
    batchExecute(batchList, analyseFA,
                 [allFA[evidence] for evidence in listFA])

    #Plot statistics of Functional annotations
    outDir = "%s/Graph/%s" % (projectDir, organism)
    createDir(outDir)
    plotFA = PlotFA(xlabel="Evidence Codes",
                    outDir=outDir,
                    name=projectName,
                    organism=organism)
    batchExecute(batchList,
                 plotFA, [allFA[evidence] for evidence in listFA],
                 doGrid=True)

    batchList = ["coherenceHisto2D", "numberAnnotHisto2D"]
    batchExecute(batchList,
                 plotFA, [allFA[evidence] for evidence in listFA],
                 doGrid=True)

    #Compare  Functional annotations
    compareFA = CompareFA()
    batchList = ["venn", "funcSim"]
    batchExecute(batchList, compareFA,
                 [allFA[evidence] for evidence in listFA])
    batchList = ["recall", "precision"]
    batchExecute(batchList, compareFA,
                 [allFA[evidence] for evidence in listFA])

    #Plot statistics of the comparison between Functional annotations
    batchList = ["venn", "funcSymSim"]
    batchExecute(batchList, plotFA, compareFA,
                 [allFA[evidence] for evidence in listFA])
    batchList = ["recall", "precision"]
    batchExecute(batchList, plotFA, compareFA,
                 [allFA[evidence] for evidence in listFA])

    #-----------------------------------------------
    #Export statistics to Excel
    outDir = "%s/Export/%s" % (projectDir, organism)
    createDir(outDir)

    exportList = [
        "unconnected", "coverage", "richness", "numberAnnot", "coherence",
        "compactness", "specificity", "informationContent", "redundancy"
    ]
    reportFA = ReportFA(outDir=outDir, name=projectName, organism=organism)
    reportFA.printStatistics([allFA[evidence] for evidence in listFA],
                             exportList)
    reportFA.saveStatistics([allFA[evidence] for evidence in listFA],
                            exportList)

    #-----------------------------------------------
    # Individual contributions of evidence codes
    logger.info("=================================================")
    logger.info("Individual contributions of evidence codes")
    contribution = dict()
    for ec in ["IC", "TAS", "ISS", "NAS", "IEA"]:
        contribution[ec] = set([
            (gp, go) for aspect in
            ["cellular_component", "molecular_function", "biological_process"]
            for gp in allFA[ec].GPtoGO[aspect]
            for go in allFA[ec].GPtoGO[aspect][gp]
        ])

    total_Annotation = sum(
        [len(contribution[ec]) for ec in ["IC", "TAS", "ISS", "NAS"]])

    for ec in ["IC", "TAS", "ISS", "NAS"]:
        logger.info("\t%.02f %% of the annotations are supported by %s" %
                    (100. * len(contribution[ec]) / total_Annotation, ec))

    batchList = ["recall", "precision"]
    batchExecute(batchList, compareFA, [
        allFA[evidence]
        for evidence in ["EXP2", "ISS", "TAS", "NAS", "IC", "AHC", "IEA"]
    ])
    logger.info("Done")
    #-----------------------------------------------
    #Plotting precision and recall for each evidence code
    logger.info("=================================================")
    logger.info("Plotting precision and recall for each evidence code")
    reference = "EXP2"
    #plotEvidence=["AHC", "IEA"]
    plotEvidence = ["ISS", "TAS", "NAS", "IC", "AHC", "IEA"]

    evidenceMarker = dict(zip(plotEvidence, ['s', 'd', 'D', '*', 'p', 'h']))
    evidenceSize = dict(zip(plotEvidence, [8, 8, 8, 8, 15, 15]))
    aspectColor = dict(zip(allAspect, ["blue", "green", "red", "cyan"]))

    fig = figure(figsize=(8, 8))
    for evidence in plotEvidence:

        for aspect in allAspect:
            if aspect == "All_aspects_of_GO":
                continue

            allX = compareFA['recall'][aspect][(evidence, reference)].values()
            allY = compareFA['precision'][aspect][(evidence,
                                                   reference)].values()

            meanX = mean(allX)
            errX = std(allX) / sqrt(len(allX))
            meanY = mean(allY)
            errY = std(allY) / sqrt(len(allY))

            errorbar(meanX,
                     meanY,
                     xerr=errX,
                     yerr=errY,
                     alpha=0.9,
                     hold=True,
                     mfc=aspectColor[aspect],
                     ecolor=aspectColor[aspect],
                     marker=evidenceMarker[evidence],
                     ms=evidenceSize[evidence])

    xlabel("Verspoor Hierarchical Recall")
    ylabel("Verspoor Hierarchical Precision")

    allMarker = ['o', 'o', 'o', 's', 'd', 'D', '*', 'p', 'h']
    allColor = [
        "green", "red", "cyan", "white", "white", "white", "white", "white",
        "white"
    ]
    allLabel = [
        aspect.replace("_", " ") for aspect in allAspect
        if not aspect == "All_aspects_of_GO"
    ]
    allLabel.extend(plotEvidence)

    foo = [
        Line2D(arange(5), arange(5), ls='-', marker=m, color=c, label=l)
        for m, c, l in zip(allMarker, allColor, allLabel)
    ]
    leg = legend(foo, allLabel, loc="upper left", numpoints=1)
    leg.legendPatch.set_alpha(0.5)

    grid()

    outDir = "%s/Graph/%s" % (projectDir, organism)
    createDir(outDir)
    figName = "%s/PrecisionVSRecall.png" % outDir
    savefig(figName)
    logger.info("Done")
Exemple #21
def GOFrequencyBovinePipelines(projectDir):
    """
    This function plots the frequency of GO terms in three bovine functional annotations.
    """

    projectName = "bovinePipeline"
    organism = "bovine"

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info(
        "This function plots the frequency of GO terms from 3 functional annotations for a Bovine array"
    )
    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )

    logger.info("name of the project : %s " % projectName)

    #Read bovine microarray probe set to define the set of gene products
    fileName = "%s/ReferenceSet/%s.fasta" % (projectDir, organism)
    refSet = RefSet(organism=organism, fileName=fileName, refType="Fasta")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    #Read Functional annotations
    allFileName = list()
    allFileName.append("%s/Annotation/Affy_%s.na31.annot.csv" %
                       (projectDir, organism))
    allFileName.append("%s/Annotation/B2G_%s.annot" % (projectDir, organism))
    allFileName.append("%s/Annotation/AID_%s.txt" % (projectDir, organism))

    allPipeName = ["AFFY", "B2G", "AID"]
    allFileType = allPipeName

    pipeline = dict()
    for pipeName, fileName, fileType in zip(allPipeName, allFileName,
                                            allFileType):
        FA = FuncAnnot(pipeName, refSet, G, organism=organism)
        FA.read(fileName, fileType=fileType)
        pipeline[pipeName] = FA

    #----------------------------------------------
    #Plot frequency of GO terms in a radial graph
    outDir = "%s/Graph/%s" % (projectDir, organism)
    logger.info("=================================================")
    logger.info("Plotting frequency of GO terms")
    logger.info("directory : %s" % outDir)
    for aspect in G.aspect:
        logger.info("%s : " % aspect)

        A = None
        for pipeName in allPipeName:
            l = array([
                log(1 + len(pipeline[pipeName].GOtoGP[aspect].get(go, [])))
                for go in G.get_NodesfromAspect(aspect)
            ])
            l = l / max(l) * 256.
            l = [int(round(n)) for n in l]
            freq = dict([(n, c)
                         for n, c in zip(G.get_NodesfromAspect(aspect), l)])

            figName = "%s/Frequency_%s_%s.png" % (
                outDir, pipeline[pipeName].name, aspect)
            A = G.plot_FrequencyGraph(aspect,
                                      freq,
                                      figName=figName,
                                      ttl="",
                                      graphviz=A)

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info("")
Exemple #22
def worseFunctionalSimilarity(projectDir):
    """
    This function identifies the ten most different annotation sets between Affymetrix and Blast2GO for a Bovine array
    """

    from AIGO.Similarity import GOSet_PWSimilarity
    from itertools import izip

    projectName = "bovinePipeline"
    organism = "bovine"

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info(
        "This function identifies the ten most different annotation sets between Affymetrix and Blast2GO for a Bovine array"
    )
    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )

    logger.info("name of the project : %s " % projectName)

    #Read bovine microarray probe set to define the set of gene products
    fileName = "%s/ReferenceSet/%s.fasta" % (projectDir, organism)
    refSet = RefSet(organism=organism, fileName=fileName, refType="Fasta")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    fileName = "%s/Annotation/Affy_%s.na31.annot.csv" % (projectDir, organism)
    FA1 = FuncAnnot("AFFY", refSet, G, organism=organism)
    FA1.read(fileName, fileType="AFFY")

    fileName = "%s/Annotation/B2G_%s.annot" % (projectDir, organism)
    FA2 = FuncAnnot("B2G", refSet, G, organism=organism)
    FA2.read(fileName, fileType="B2G")

    #Analyse Functional annotations
    analyseFA = AnalyseFA()
    batchExecute(["removeUnconnected"], analyseFA, [FA1, FA2])

    outDir = "%s/Graph/%s/WorseFuncSim" % (projectDir, organism)
    createDir(outDir)

    N = 10
    logger.info("=================================================")
    logger.info("Plotting the %d most dissimilar annotation sets" % N)
    logger.info("directory : %s" % outDir)
    for aspect in G.aspect:

        commonGene = set(FA1.GPtoGO[aspect].keys()).intersection(
            FA2.GPtoGO[aspect].keys())

        logger.info("%s : processing %d annotation sets " %
                    (aspect, len(commonGene)))

        allD1, allD2 = list(), list()
        for i, g in enumerate(commonGene):
            sim, l = GOSet_PWSimilarity(G, FA1.GPtoGO[aspect][g],
                                        FA2.GPtoGO[aspect][g])

            allD1.append(l[0])
            allD2.append(l[1])

        allD = map(lambda D: ((array(D[0]) + array(D[1])) / 2.),
                   izip(allD1, allD2))

        idx = argsort(allD)

        for i in arange(0, N):
            gp = list(commonGene)[idx[i]]

            figName = "%s/%s_annotation_%s_from_%s_%s.png" % (
                outDir, aspect, gp, FA1.name, FA2.name)

            ttl = "%s annotations of %s from %s (green) and %s (red) : Functional similarity = %.2f" % (
                aspect.replace("_", " "), gp, FA1.name, FA2.name, allD[idx[i]])
            FA1.G.compare_InducedGraph(FA1.GPtoGO[aspect][gp],
                                       FA2.GPtoGO[aspect][gp],
                                       figName=figName,
                                       ttl=ttl)

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info("")
Exemple #23
            elif fileType=="AFFY":
                self.GPtoGO, self.GOtoGP = IO.extract_Affy(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="AID":
                self.GPtoGO, self.GOtoGP = IO.extract_AID(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="SCOP":
                self.GPtoGO, self.GOtoGP = IO.extract_SCOP(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="GNISD": #gene networks in seed development format http://seedgenenetwork.net/annotate#arabidopsis
                self.GPtoGO, self.GOtoGP = IO.extract_Affy(self.fileName, self.G, refSet=self.refSet, GO_columns=[6, 7, 8], filetype="GNIS-Affy", delimiter='\t', quoting=csv.QUOTE_MINIMAL)
            elif fileType=="GP2GO":
                self.GPtoGO, self.GOtoGP = IO.extract_GP2GO(self.fileName, self.G, refSet=self.refSet)
            elif fileType=="GO2GP":
                self.GPtoGO, self.GOtoGP = IO.extract_GO2GP(self.fileName, self.G, refSet=self.refSet)
                
        except Exception, e:
            logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))

        else:            
            #Find the set of annotated gene products
            self['GA']=set()
            for a in  self.G.aspect:
                self['GA']=self['GA']  | set(self.GPtoGO[a].keys())

            logger.info ("%d gene products are annotated" % (len(self['GA'])))

            self.status="Loaded"
            
            for  a in self.G.aspect:
                logger.info ("%s : %.2f annotations per set" % (a, mean([len(self.GPtoGO[a][gp]) for gp in self.GPtoGO[a]])))


Exemple #24
def compareRandomizePipelines(projectDir):
    """
    This function compares the properties of 3 randomized functional annotations for a Bovine array.
    """

    projectName = "randomizePipeline"
    organism = "bovine"

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info(
        "This function compares the properties of 3 randomized functional annotations for a Bovine array."
    )
    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )

    logger.info("name of the project : %s " % projectName)

    #Read bovine microarray probe set to define the set of gene products
    fileName = "%s/ReferenceSet/%s.fasta" % (projectDir, organism)
    refSet = RefSet(organism=organism, fileName=fileName, refType="Fasta")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    #Read Functional annotations
    allFileName = list()
    allFileName.append("%s/Annotation/Affy_%s.na31.annot.csv" %
                       (projectDir, organism))
    allFileName.append("%s/Annotation/B2G_%s.annot" % (projectDir, organism))
    allFileName.append("%s/Annotation/AID_%s.txt" % (projectDir, organism))

    allPipeName = ["AFFY", "B2G", "AID"]
    allFileType = allPipeName

    pipeline = dict()
    for pipeName, fileName, fileType in zip(allPipeName, allFileName,
                                            allFileType):
        FA = FuncAnnot(pipeName, refSet, G, organism=organism)
        FA.read(fileName, fileType=fileType)
        pipeline[pipeName] = FA

    # Randomize FA
    randomizeFA = RandomizeFA()

    #-----------------------------------------------
    # Shuffle functional annotation
    batchList = ["shuffleAnnotation"]
    batchExecute(batchList, randomizeFA,
                 [pipeline[pipeName] for pipeName in allPipeName])

    #Analyse Functional annotations
    analyseFA = AnalyseFA()
    batchList = ["coherence", "redundancy"]
    batchExecute(batchList, analyseFA,
                 [pipeline[pipeName] for pipeName in allPipeName])

    #Export statistics to Excel
    outDir = "%s/Export/%s" % (projectDir, organism)
    createDir(outDir)
    exportList = ["coherence", "redundancy"]
    report = ReportFA(name="Randomize shuffle",
                      outDir=outDir,
                      organism=organism)
    report.printStatistics([pipeline[pipeName] for pipeName in allPipeName],
                           exportList)
    report.saveStatistics([pipeline[pipeName] for pipeName in allPipeName],
                          exportList)

    #-----------------------------------------------
    # Resample functional annotation
    batchList = ["sampleAnnotation"]
    batchExecute(batchList, randomizeFA,
                 [pipeline[pipeName] for pipeName in allPipeName])

    #Analyse Functional annotations
    #batchList=["obsolete", "unconnected", "removeUnconnected", "coverage", "richness", "numberAnnot", "coherence", "redundancy", "compactness", "specificity", "informationContent"]
    batchList = [
        "obsolete", "unconnected", "removeUnconnected", "coverage", "richness",
        "numberAnnot", "redundancy", "specificity", "informationContent"
    ]
    batchExecute(batchList, analyseFA,
                 [pipeline[pipeName] for pipeName in allPipeName])

    #Export statistics to Excel
    outDir = "%s/Export/%s" % (projectDir, organism)
    createDir(outDir)
    #exportList=["coverage",  "numberAnnot",  "richness", "coherence",  "compactness", "specificity", "informationContent", "redundancy"]
    exportList = [
        "coverage", "numberAnnot", "richness", "specificity",
        "informationContent", "redundancy"
    ]
    report = ReportFA(name="Randomize sample",
                      outDir=outDir,
                      organism=organism)
    report.printStatistics([pipeline[pipeName] for pipeName in allPipeName],
                           exportList)
    report.saveStatistics([pipeline[pipeName] for pipeName in allPipeName],
                          exportList)

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info("")
Exemple #25
def compareBovinePipelines(projectDir):
    """
    This function compares the properties of 3 functional annotations for a Bovine array.
    """

    projectName = "bovinePipeline"
    organism = "bovine"

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info(
        "This function compares the properties of 3 functional annotations for a Bovine array."
    )
    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )

    logger.info("name of the project : %s " % projectName)

    #Read bovine microarray probe set to define the set of gene products
    fileName = "%s/ReferenceSet/%s.fasta" % (projectDir, organism)
    refSet = RefSet(organism=organism, fileName=fileName, refType="Fasta")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    #Read Functional annotations
    allFileName = list()
    allFileName.append("%s/Annotation/Affy_%s.na31.annot.csv" %
                       (projectDir, organism))
    allFileName.append("%s/Annotation/B2G_%s.annot" % (projectDir, organism))
    allFileName.append("%s/Annotation/AID_%s.txt" % (projectDir, organism))

    allPipeName = ["AFFY", "B2G", "AID"]
    allFileType = allPipeName

    pipeline = dict()
    for pipeName, fileName, fileType in zip(allPipeName, allFileName,
                                            allFileType):
        FA = FuncAnnot(pipeName, refSet, G, organism=organism)
        FA.read(fileName, fileType=fileType)
        pipeline[pipeName] = FA

    #-----------------------------------------------

    #Analyse Functional annotations
    analyseFA = AnalyseFA()
    #batchList=["obsolete", "unconnected", "removeUnconnected", "coverage",  "richness", "numberAnnot", "coherence", "redundancy", "compactness", "specificity", "informationContent"]
    batchList = [
        "obsolete", "unconnected", "removeUnconnected", "coverage", "richness",
        "numberAnnot", "redundancy", "specificity", "informationContent"
    ]
    batchExecute(batchList, analyseFA,
                 [pipeline[pipeName] for pipeName in allPipeName])

    #How big are the largest annotation sets ?
    analyseFA.largestSet([pipeline[pipeName] for pipeName in allPipeName])
    logger.info("The largest sets of annotations are :")
    for pipeName in allPipeName:
        FA = pipeline[pipeName]
        logger.info("\t%d for %s" %
                    (FA['largestSet']['All_aspects_of_GO'], FA.name))

    #Plot statistics of Functional annotations
    outDir = "%s/Graph/%s" % (projectDir, organism)
    createDir(outDir)
    plotFA = PlotFA(xlabel="Annotation pipelines",
                    outDir=outDir,
                    name=projectName,
                    organism=organism,
                    ext="png")
    batchExecute(batchList,
                 plotFA, [pipeline[pipeName] for pipeName in allPipeName],
                 doGrid=True)

    #batchList=["coherenceHisto2D", "numberAnnotHisto2D"]
    batchList = ["numberAnnotHisto2D"]
    batchExecute(batchList,
                 plotFA, [pipeline[pipeName] for pipeName in allPipeName],
                 doGrid=True,
                 tit="")

    #-----------------------------------------------

    #Compare  Functional annotations
    compareFA = CompareFA()
    batchList = ["venn", "funcSim"]
    batchExecute(batchList, compareFA,
                 [pipeline[pipeName] for pipeName in allPipeName])

    #Plot statistics of the comparison between Functional annotations
    batchList = ["venn", "funcSymSim"]
    batchExecute(batchList,
                 plotFA,
                 compareFA, [pipeline[pipeName] for pipeName in allPipeName],
                 tit="")

    #-----------------------------------------------
    #Export statistics to Excel
    outDir = "%s/Export/%s" % (projectDir, organism)
    createDir(outDir)

    #exportList=["unconnected", "coverage",  "richness", "numberAnnot",  "coherence",  "compactness", "specificity", "informationContent", "redundancy"]
    exportList = [
        "unconnected", "coverage", "richness", "numberAnnot", "specificity",
        "informationContent", "redundancy"
    ]
    reportFA = ReportFA(outDir=outDir, name=projectName, organism=organism)
    reportFA.printStatistics([pipeline[pipeName] for pipeName in allPipeName],
                             exportList)
    reportFA.saveStatistics([pipeline[pipeName] for pipeName in allPipeName],
                            exportList)

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info("")
Exemple #26
def compareSimilarity(projectDir):
    """
    This function compare Similarity measures.
    """

    projectName = "simPipeline"
    organism = "bovine"

    #Read bovine microarray probe set to define the set of gene products
    fileName = "%s/ReferenceSet/%s.fasta" % (projectDir, organism)
    refSet = RefSet(organism=organism, fileName=fileName, refType="Fasta")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    #Read Functional annotations
    allFileName = list()
    allFileName.append("%s/Annotation/Affy_%s.na31.annot.csv" %
                       (projectDir, organism))
    allFileName.append("%s/Annotation/B2G_%s.annot" % (projectDir, organism))

    allPipeName = ["AFFY", "B2G"]
    allFileType = allPipeName

    pipeline = dict()
    for pipeName, fileName, fileType in zip(allPipeName, allFileName,
                                            allFileType):
        FA = FuncAnnot(pipeName, refSet, G, organism=organism)
        FA.read(fileName, fileType=fileType)
        pipeline[pipeName] = FA

    #-----------------------------------------------

    #Analyse Functional annotations
    analyseFA = AnalyseFA()
    batchList = ["removeUnconnected"]
    batchExecute(batchList, analyseFA,
                 [pipeline[pipeName] for pipeName in allPipeName])

    #Compute information content
    logger.info("=================================================")
    logger.info("Computing Information Content")
    allIC = dict()
    for pipeName in allPipeName:
        FA = pipeline[pipeName]
        logger.info("\t%s" % FA.name)
        allIC[pipeName] = dict()
        for a in FA.G.aspect:
            allIC[pipeName][a] = dict()
            for go in FA.GOtoGP[a]:
                n = len(FA.GOtoGP[a][go])
                for ans in FA.G.ancestors(FA.G.get_intid(go)):
                    allIC[pipeName][a][ans] = allIC[pipeName][a].get(ans,
                                                                     0) + n

        for a in FA.G.aspect:
            if len(allIC[pipeName][a].values()) == 0:
                continue
            m = max(allIC[pipeName][a].values())
            for go in allIC[pipeName][a]:
                allIC[pipeName][a][go] = -1. * log(
                    1. * allIC[pipeName][a][go] / m)

    #Compare coherence of biological process annotation sets in AFFY given by three different similarity metrics
    logger.info("=================================================")
    logger.info(
        "Computing functional coherence of biological process annotation sets in AFFY given by three different similarity metrics"
    )
    aspect = "biological_process"
    pipeName = "B2G"
    FA = pipeline[pipeName]
    logger.info("\tGS2")
    allGS2 = [
        mean(GOSet_Similarity(G, FA.GPtoGO[aspect][gp], metric="GS2"))
        for gp in FA.GPtoGO[aspect] if len(FA.GPtoGO[aspect][gp]) > 1
    ]
    logger.info("\tCzekanowskiDice")
    allCD = [
        mean(
            GOSet_Similarity(G,
                             FA.GPtoGO[aspect][gp],
                             metric="CzekanowskiDice"))
        for gp in FA.GPtoGO[aspect] if len(FA.GPtoGO[aspect][gp]) > 1
    ]
    logger.info("\tResnik")
    allResnik = [
        mean(
            GOSet_Similarity(G,
                             FA.GPtoGO[aspect][gp],
                             metric="Resnik",
                             IC=allIC[FA.name])) for gp in FA.GPtoGO[aspect]
        if len(FA.GPtoGO[aspect][gp]) > 1
    ]
    logger.info("\tCorrelation between GS2 and CzekanowskiDice : %.2f" %
                corrcoef(allGS2, allCD)[0][1])
    logger.info("\tCorrelation between CzekanowskiDice and Resnik: %.2f" %
                corrcoef(allCD, allResnik)[0][1])

    #Compare molecular function  annotation sets in AFFY and B2G using three similarity metrics
    logger.info("=================================================")
    logger.info(
        "Comparing molecular function annotation sets in AFFY and B2G using three different similarity metrics"
    )
    aspect = "molecular_function"
    commonGene = set(pipeline["AFFY"].GPtoGO[aspect].keys()).intersection(
        pipeline["B2G"].GPtoGO[aspect].keys())
    logger.info("\tProcessing %d genes" % len(commonGene))
    allGS2, allCD, allResnik = list(), list(), list()
    for gp in commonGene:
        GO1 = pipeline["AFFY"].GPtoGO[aspect][gp]
        GO2 = pipeline["B2G"].GPtoGO[aspect][gp]

        allGS2.append(GOSet_PWSimilarity(G, GO1, GO2, metric="GS2")[0])
        allCD.append(
            GOSet_PWSimilarity(G, GO1, GO2, metric="CzekanowskiDice")[0])
        allResnik.append(
            GOSet_PWSimilarity(G, GO1, GO2, metric="Resnik",
                               IC=allIC[FA.name])[0])
    logger.info("\tCorrelation between GS2 and CzekanowskiDice : %.2f" %
                corrcoef(allGS2, allCD)[0][1])
    logger.info("\tCorrelation between CzekanowskiDice and Resnik: %.2f" %
                corrcoef(allCD, allResnik)[0][1])
Exemple #27
def compareBovineAndRandom(projectDir):
    """
    This function compares the properties of 3 functional annotations for a Bovine array plus a randomized version of the Affymetrix functional annotations.
    """

    projectName = "BovineAndRandom"
    organism = "bovine"

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info(
        "This function compares the properties of 3 functional annotations for a Bovine array plus a randomized version of the Affymetrix functional annotations."
    )
    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )

    logger.info("name of the project : %s " % projectName)

    #Read bovine microarray probe set to define the set of gene products
    fileName = "%s/ReferenceSet/%s.fasta" % (projectDir, organism)
    refSet = RefSet(organism=organism, fileName=fileName, refType="Fasta")

    #Read GO ontology
    fileName = "%s/OBO/go_daily-termdb.obo-xml" % (projectDir)
    G = readGOoboXML(fileName, force=False)

    #Read Functional annotations
    allFileName = list()
    allFileName.append("%s/Annotation/Affy_%s.na31.annot.csv" %
                       (projectDir, organism))
    allFileName.append("%s/Annotation/B2G_%s.annot" % (projectDir, organism))
    allFileName.append("%s//Annotation/AID_%s.txt" % (projectDir, organism))
    allFileName.append("%s/Annotation/Affy_%s.na31.annot.csv" %
                       (projectDir, organism))

    allPipeName = ["AFFY", "B2G", "AID", "resample"]
    allFileType = ["AFFY", "B2G", "AID", "AFFY"]

    pipeline = dict()
    for pipeName, fileName, fileType in zip(allPipeName, allFileName,
                                            allFileType):
        FA = FuncAnnot(pipeName, refSet, G, organism=organism)
        FA.read(fileName, fileType=fileType)
        pipeline[pipeName] = FA

    # Randomize FA
    randomizeFA = RandomizeFA()
    analyseFA = AnalyseFA()

    #-----------------------------------------------
    # Shuffle functional annotation
    batchList = ["sampleAnnotation"]
    batchExecute(batchList, randomizeFA,
                 [pipeline[pipeName] for pipeName in ["resample"]])

    batchList = ["coherence", "redundancy", "numberAnnot"]
    batchExecute(batchList, analyseFA,
                 [pipeline[pipeName] for pipeName in allPipeName])

    #Plot statistics of Functional annotations
    outDir = "%s/Graph/%s" % (projectDir, organism)
    createDir(outDir)
    plotFA = PlotFA(xlabel="Annotation pipelines",
                    outDir=outDir,
                    name="Resample",
                    organism=organism,
                    ext="png")
    batchExecute(batchList,
                 plotFA, [pipeline[pipeName] for pipeName in allPipeName],
                 doGrid=True)

    batchList = ["coherenceHisto2D", "numberAnnotHisto2D"]
    batchExecute(batchList,
                 plotFA, [pipeline[pipeName] for pipeName in allPipeName],
                 doGrid=True,
                 tit="")

    logger.info(
        "◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦"
    )
    logger.info("")
Exemple #28
def meanRandPWSim(G, FA, drepli_lab, randsimdir, GOaspects='all_aspects', compute=False, silent=True):
	"""Evaluate the average functional similarity shared by a random sample of gene pairs."""
	if GOaspects=='all_aspects':
		aspects = ['biological_process', 'molecular_function', 'cellular_component']
	else:
		aspects = list(GOaspects)
	dmeanrandsim = {}
	for repli in drepli_lab:
		llab = drepli_lab[repli]
		dmeanrandsim[repli] = {}
		#~ for GOaspect in FA.G.aspect:
		for GOaspect in aspects:
			dmeanrandsim[repli][GOaspect] = {}
			if not compute:
				for metric in metrics:
					nfrand = "%s/%s.%s.%s"%(randsimdir, repli, GOaspect, metric)
					if not os.access(nfrand, os.F_OK):
						compute = True
						break
			if compute:
				logger.info("Compute functional similarity on %s aspect between all random gene pair in %s"%(GOaspect, repli))
				dfout = {}
				for metric in metrics:
					nfrand = "%s/%s.%s.%s"%(randsimdir, repli, GOaspect, metric)
					dfout[metric] = open(nfrand, 'w')
				lsimmax = []
				lsimmean = []
				# list of genes covered by an annotation in this replicon
				lGP = list( set(FA.GPtoGO[GOaspect].keys()) & set(llab) )
				lGP.sort()
				for GP1 in lGP:
					lstrmax = []
					lsmax = []
					lstrmean = []
					lsmean = []
					for GP2 in lGP:
						if GP2 >= GP1:
							# only explore the lower triangular matrix
							continue
						else:
							GO1 = FA.GPtoGO[GOaspect][GP1]
							GO2 = FA.GPtoGO[GOaspect][GP2]
							maxsim, l = GOSet_PWSimilarity(G, GO1, GO2, FA=FA, metric="funSimMax")
							# take advantage of the fact that the "funSimMax" and "funSimAverage" metrics yield the same list l
							lsmax.append(maxsim)
							lstrmax.append("%.3f"%maxsim)
							meansim = mean(l)
							lsmean.append(meansim)
							lstrmean.append("%.3f"%meansim)
					msmax = mean(lsmax)
					lsimmax.append(msmax)
					if lstrmax:
						dfout["funSimMax"].write(' '.join(lstrmax)+'\n')
						msmean = mean(lsmean)
						lsimmean.append(msmean)
						dfout["funSimAverage"].write(' '.join(lstrmean)+'\n')
						if not silent: print GP1, msmax, msmean
				for metric in dfout:
					dfout[metric].close()
			# read random similarity records
			for metric in metrics:
				lsim = []
				nfrand = "%s/%s.%s.%s"%(randsimdir, repli, GOaspect, metric)
				logger.info("Read in file %s for random gene pair similarities"%nfrand)
				foutrand = open(nfrand, 'r')
				for line in foutrand:
					ls = line.rstrip('\n').split()
					lfs = []
					for sim in ls:
						lfs.append(float(sim))
					lsim += lfs
				msim = mean(lsim)
				dmeanrandsim[repli][GOaspect][metric] = msim
				if not silent: print "on %s for aspect %s with %s metric: %f"%(repli, GOaspect, metric, msim)
	return dmeanrandsim
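An illustrative call of `meanRandPWSim`, assuming `G` and `FA` were obtained as in `loadFA` above; the module-level `metrics` list the function relies on is defined explicitly here, the replicon name and gene labels are placeholders, and the output directory must already exist.

# metrics is read as a module-level name inside meanRandPWSim()
metrics = ["funSimMax", "funSimAverage"]

# replicon -> gene labels in the "taxid.symbol" form built by loadFA()
drepli_lab = {"chromosome": ["83333.thrA", "83333.thrB"]}

dmeanrandsim = meanRandPWSim(G, FA, drepli_lab,
                             randsimdir="randsim",   # must exist; per-metric similarity files are written there
                             GOaspects='all_aspects',
                             compute=True, silent=False)
print(dmeanrandsim["chromosome"]["biological_process"]["funSimMax"])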