def add(self, statistics, plotType):
    """
    Register the plot function of a statistics on the target class.

    The statistics must be known to the statistics registry (rS); otherwise
    a warning is issued and nothing is registered.

    statistics : name of the statistics to plot
    plotType   : "Multiple" or "Histo3D"; any other value triggers a warning
                 but the (statistics, plotType) pair is still recorded

    Returns False when the statistics is unknown, True otherwise.
    """
    logger.info("Registering plot function %s" % statistics)

    # Guard clause: refuse statistics the registry does not know about
    if not rS.isRegistered(statistics):
        logger.handleWarning("Caution, the statistics is unknown : %s" % statistics)
        return False

    self.all.append(statistics)

    if plotType == "Multiple":
        # Attach the generated function to the class under its own name
        boundFunc = new.instancemethod(
            self.getMultiPlotFunction(statistics), None, self.cls)
        setattr(self.cls, boundFunc.__name__, boundFunc)
    elif plotType == "Histo3D":
        # Same mechanism, but the attribute name gets a "Histo3D" suffix
        boundFunc = new.instancemethod(
            self.getHisto3DPlotFunction(statistics), None, self.cls)
        setattr(self.cls, "%sHisto3D" % boundFunc.__name__, boundFunc)
    else:
        logger.handleWarning("Caution, the statistics plot type is unknown : %s" % plotType)

    # Book-keeping of which plot types exist, per statistics and overall
    self.types.setdefault(statistics, set()).add(plotType)
    self.allTypes.add(plotType)
    return True
def checkValidity(self):
    """
    Check that the two annotation mappings mirror each other.

    For every aspect of self.GPtoGO, each (gene product, GO term) pair must
    also be present in self.GOtoGP, and each (GO term, gene product) pair of
    self.GOtoGP must be present in self.GPtoGO. Every missing pair is
    reported through logger.handleWarning.

    A key that is entirely absent from the reverse mapping is reported as an
    inconsistency instead of raising KeyError.

    Only the aspects listed in self.GPtoGO are examined.

    Returns True when every examined aspect is consistent, False otherwise.
    """
    logger.info("Name :\t%s" % self.name)

    allValid = True
    for aspect in self.GPtoGO:
        valid = True
        gpToGo = self.GPtoGO[aspect]
        # An aspect missing from GOtoGP behaves like an empty mapping
        goToGp = self.GOtoGP.get(aspect, {})

        # Forward direction: every annotation must have its reverse entry
        for gp in gpToGo:
            for go in gpToGo[gp]:
                if gp not in goToGp.get(go, ()):
                    logger.handleWarning("%s not found in GOtoGP[%s][%s]" % (gp, aspect, go))
                    valid = False

        # Reverse direction
        for go in goToGp:
            for gp in goToGp[go]:
                if go not in gpToGo.get(gp, ()):
                    logger.handleWarning("%s not found in GPtoGO[%s][%s]" % (go, aspect, gp))
                    valid = False

        if valid:
            logger.info("%s : is valid" % (aspect))

        allValid = allValid and valid

    return allValid
def extract_Affy(fileName, G, refSet=None, GO_columns=(30, 31, 32), filetype="Affy", delimiter=',', quoting=csv.QUOTE_ALL):
    """
    Read GO annotations from an Affymetrix annotation file.

    fileName   : path of the annotation file (passed through checkForZip first)
    G          : GO graph; provides G.aspect and G.get_GOAlternative
    refSet     : optional collection of gene product ids; when given and not
                 empty, rows whose id is not in it are skipped with a warning
    GO_columns : indices of the columns holding the biological_process,
                 cellular_component and molecular_function annotations
    filetype   : unused, kept for interface compatibility
    delimiter, quoting : csv dialect settings used to parse the rows

    Returns (GenetoGO, GOtoGene), two dictionaries indexed by aspect, mapping
    gene product -> set of GO terms and GO term -> set of gene products.

    Raises IOError when the file does not exist.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    csvfile = readFile(fileName)

    # A set is enough for membership tests against the reference set
    hasRef = None
    if refSet:
        hasRef = set(refSet)

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    # Skip leading comment lines; the first non-comment line is the header and
    # is consumed here. startswith() is safe on the empty string returned at
    # end of file, unlike row[0].
    row = csvfile.readline()
    while row.startswith('#'):
        row = csvfile.readline()

    csv.register_dialect('format', delimiter=delimiter, quoting=quoting)
    rd = csv.reader(csvfile, dialect='format')

    columnAspects = ['biological_process', 'cellular_component', 'molecular_function']
    for row in rd:
        # Gene product id is in the first column
        g = row[0]
        if hasRef and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        # zip() limits the scan to at most one column per aspect; the aspect
        # actually used is the one reported by the GO graph for each term.
        for _columnAspect, i in zip(columnAspects, GO_columns):
            for item in row[i].split('///'):
                # "---" marks a missing annotation
                if item == "---" or len(item.strip()) == 0:
                    continue

                go = "GO:%07d" % int(item.split('//')[0].replace('/', ''))
                go, aspect = G.get_GOAlternative(go, nameSpace=True)
                if not aspect:
                    logger.handleWarning(
                        "term %s is not in GO graph, skip it " % go)
                    continue

                GenetoGO[aspect].setdefault(g, set([])).add(go)
                GOtoGene[aspect].setdefault(go, set([])).add(g)

    return GenetoGO, GOtoGene
def extract_GO2GP(fileName, G, refSet=None, sep1='\t', sep2=',', comments='#', skiprows=0):
    """
    Read a functional annotation mapping file of the form

        GO_ID sep1 Gene_ID sep2 Gene_ID ...

    sep1 is \\t and sep2 is , by default.

    fileName : path of the mapping file (passed through checkForZip first)
    G        : GO graph; provides G.aspect and G.get_GOAlternative
    refSet   : optional collection of gene product ids; when given and not
               empty, gene products outside of it are skipped with a warning
    comments, skiprows : forwarded to loadtxt

    Returns (GenetoGO, GOtoGene), two dictionaries indexed by aspect, mapping
    gene product -> set of GO terms and GO term -> set of gene products.
    The stored GO term is the one returned by G.get_GOAlternative, as in the
    other annotation readers.

    Raises IOError when the file does not exist.
    """
    fileName = str(checkForZip(fileName))
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # A set is enough for membership tests against the reference set
    hasRef = None
    if refSet:
        hasRef = set(refSet)

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    # ndmin=2 keeps a single-row file as one (GO, gene products) row instead
    # of a squeezed 1-D array that cannot be unpacked pairwise.
    data = loadtxt(fileName,
                   dtype="S",
                   usecols=[0, 1],
                   delimiter=sep1,
                   comments=comments,
                   skiprows=skiprows,
                   ndmin=2)

    for go, GP in data:
        if not go.startswith('GO:'):
            continue

        # Get the alternative term if any and its GO aspect
        term, aspect = G.get_GOAlternative(go, nameSpace=True)
        if not aspect:
            logger.handleWarning("term %s is not in GO graph, skip it " % (term))
            continue

        for gp in GP.split(sep2):
            gp = gp.strip()
            if hasRef is not None and gp not in hasRef:
                logger.handleWarning(
                    "gene product %s is not in the reference set, skip it " % gp)
                continue

            # Store the resolved term, not the raw identifier from the file
            GenetoGO[aspect].setdefault(gp, set([])).add(term)
            GOtoGene[aspect].setdefault(term, set([])).add(gp)

    return GenetoGO, GOtoGene
def save(self, fileName):
    """
    Save the project into a shelve file.

    The file name is stored under the key 'fileName', then every
    (key, value) pair of self.items() is written. self.status is set to
    "Saved" only when everything was written and the shelf closed.

    Any failure is reported with logger.handleWarning instead of being
    raised; the shelf is closed in all cases.
    """
    import shelve
    try:
        logger.info("File :\t%s" % fileName)
        shelf = shelve.open(fileName, protocol=-1)
        try:
            shelf['fileName'] = fileName
            for k, v in self.items():
                shelf[k] = v
        finally:
            # Release the underlying database even if a write failed
            shelf.close()
        self.status = "Saved"
    except Exception as e:
        logger.handleWarning("Unable to save project %s: %s" % (fileName, str(e)))
def load(self, fileName):
    """
    Load a project from a shelve file.

    Every (key, value) pair found in the shelf is copied into self.
    self.status is set to "Loaded" only when everything was read and the
    shelf closed.

    Any failure is reported with logger.handleWarning instead of being
    raised; the shelf is closed in all cases.
    """
    import shelve
    try:
        logger.info("File :\t%s" % fileName)
        shelf = shelve.open(fileName, protocol=-1)
        try:
            for k, v in shelf.items():
                self[k] = v
        finally:
            # Release the underlying database even if reading failed
            shelf.close()
        self.status = "Loaded"
    except Exception as e:
        logger.handleWarning("Unable to load project %s: %s" % (fileName, str(e)))
def GOSet_PWSimilarity(G, GO1, GO2, metric="GS2", **kargs):
    """
    Calculates pairwise semantic similarity scores between two given annotation sets

    G        : GO graph providing GOtoInt and the similarity measures
    GO1, GO2 : the two annotation sets to compare
    metric   : "GS2", "CzekanowskiDice" or "Resnik"
    kargs    : 'IC' is forwarded to G.Resnik (an empty dictionary by default)

    Returns (sim, l). For "CzekanowskiDice", l is [sim, sim]. An unknown
    metric is reported with a warning and (None, None) is returned.
    """
    if metric == "GS2":
        score, detail = G.GS2([G.GOtoInt(GO1), G.GOtoInt(GO2)])
        return score, detail

    if metric == "CzekanowskiDice":
        score = G.CzekanowskiDice(G.GOtoInt(GO1), G.GOtoInt(GO2))
        return score, [score, score]

    if metric == "Resnik":
        score, detail = G.Resnik(G.GOtoInt(GO1), G.GOtoInt(GO2), kargs.get('IC', dict()))
        return score, detail

    logger.handleWarning("Sorry, unknown semnatic similarity %s " % metric)
    return None, None
def GOSet_Similarity(G, GO, metric="GS2", **kargs):
    """
    Calculates pairwise semantic similarity scores between GO terms in a given annotation set

    G      : GO graph providing GOtoInt and the similarity measures
    GO     : annotation set (collection of GO terms)
    metric : "GS2", "CzekanowskiDice" or "Resnik"
    kargs  : 'IC' is forwarded to G.Resnik (an empty dictionary by default)

    Returns the list of scores of every unordered pair of terms, [1.0] when
    the set holds fewer than two terms, or None (after a warning) when the
    metric is unknown.
    """
    if len(GO) < 2:
        return [1.]

    # Pick the scoring function for one pair of terms
    if metric == "GS2":
        def score(a, b):
            return G.GS2([G.GOtoInt([a]), G.GOtoInt([b])])[0]
    elif metric == "CzekanowskiDice":
        def score(a, b):
            return G.CzekanowskiDice(G.GOtoInt([a]), G.GOtoInt([b]))
    elif metric == "Resnik":
        def score(a, b):
            return G.Resnik(G.GOtoInt([a]), G.GOtoInt([b]), kargs.get('IC', dict()))[0]
    else:
        logger.handleWarning("Sorry, unknown semnatic similarity %s " % metric)
        return None

    # Each unordered pair is visited once (first position < second position)
    return [score(a, b)
            for p, a in enumerate(GO)
            for q, b in enumerate(GO)
            if p < q]
def extract_SCOP(fileName, G, refSet=None):
    """
    Read GO annotations of SCOP domains from a ';' separated file.

    The first row is the header; it must contain the columns 'domScop'
    (gene product / domain id) and 'termGo' (GO term).

    fileName : path of the annotation file (passed through checkForZip first)
    G        : GO graph; provides G.aspect and G.get_GOAlternative
    refSet   : optional collection of gene product ids; when given and not
               empty, rows whose id is not in it are skipped with a warning

    Returns (GenetoGO, GOtoGene), two dictionaries indexed by aspect, mapping
    gene product -> set of GO terms and GO term -> set of gene products.

    Raises IOError when the file does not exist and ValueError when one of
    the two required columns is missing from the header.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # A set is enough for membership tests against the reference set
    hasRef = None
    if refSet:
        hasRef = set(refSet)

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    rd = csv.reader(readFile(fileName), delimiter=";")
    header = next(rd)

    # Column positions do not change from row to row: look them up once
    domainCol = header.index('domScop')
    termCol = header.index('termGo')

    for row in rd:
        g = row[domainCol]
        go = row[termCol]

        if hasRef and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        if go.startswith('GO:'):
            # Get the alternative term if any and its GO aspect
            go, aspect = G.get_GOAlternative(go, nameSpace=True)
            if not aspect:
                logger.handleWarning("term %s is not in GO graph, skip it " % go)
                continue

            GenetoGO[aspect].setdefault(g, set([])).add(go)
            GOtoGene[aspect].setdefault(go, set([])).add(g)

    return GenetoGO, GOtoGene
def GO_Similarity(G, allGO, metric="GS2", **kargs):
    """
    Calculates pairwise semantic similarity scores in a list of annotation sets

    G      : GO graph providing GOtoInt and the similarity measures
    allGO  : list of annotation sets
    metric : "GS2", "CzekanowskiDice" or "Resnik"
    kargs  : 'IC' is forwarded to G.Resnik (an empty dictionary by default)

    Returns (sim, l). With fewer than two annotation sets the result is
    (1.0, [1.0]). For "GS2" the pair returned by G.GS2 is passed through.
    For "CzekanowskiDice" and "Resnik", l[i] is the mean similarity between
    set i and every other set and sim is the mean of l; each unordered pair
    is scored once, as metric(set with lower index, set with higher index).
    For "Resnik" the score is the first element of the pair returned by
    G.Resnik, as in GOSet_PWSimilarity and GOSet_Similarity.
    An unknown metric is reported with a warning and (None, None) is returned.
    """
    if len(allGO) < 2:
        return 1.0, [1.0]

    if metric == "GS2":
        sim, l = G.GS2([G.GOtoInt(GO) for GO in allGO])
        return sim, l

    # Select the function scoring one pair of (integer encoded) sets
    if metric == "CzekanowskiDice":
        pairScore = G.CzekanowskiDice
    elif metric == "Resnik":
        IC = kargs.get('IC', dict())

        def pairScore(GO1, GO2):
            # G.Resnik returns (similarity, details); only the score is averaged
            return G.Resnik(GO1, GO2, IC)[0]
    else:
        logger.handleWarning("Sorry, unknown semnatic similarity %s " % metric)
        return None, None

    intSets = [G.GOtoInt(GO) for GO in allGO]

    # Scores are cached per unordered pair so each one is computed only once
    allD = dict()
    l = list()
    for i, GO1 in enumerate(intSets):
        scores = []
        for j, GO2 in enumerate(intSets):
            if j == i:
                continue
            key = (i, j) if i < j else (j, i)
            if key not in allD:
                allD[key] = pairScore(intSets[key[0]], intSets[key[1]])
            scores.append(allD[key])
        l.append(mean(scores))

    sim = mean(l)
    return sim, l
def extract_GAF(fileName, G, refSet=None):
    """
    Read GO annotations from a GAF (GO Annotation File).

    Gene products are identified by "<taxon>.<DB Object Symbol>", where
    <taxon> is the "Taxon(|taxon)" column without its first six characters.
    Annotations whose qualifier contains NOT are ignored with a warning.

    fileName : path of the annotation file (passed through checkForZip first)
    G        : GO graph; provides G.aspect and G.get_GOAlternative
    refSet   : optional collection of gene product ids; when given and not
               empty, gene products outside of it are skipped with a warning

    Returns (GenetoGO, GOtoGene), two dictionaries indexed by aspect, mapping
    gene product -> set of GO terms and GO term -> set of gene products.

    Raises IOError when the file does not exist and ValueError when a
    required column is missing from the column list returned by readGAF_2.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # The reference set must be stored in hasRef, the name tested below,
    # otherwise the filter is never applied.
    hasRef = None
    if refSet:
        hasRef = set(refSet)

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    data, GAF_col = readGAF_2(fileName)

    # Column positions do not change from row to row: look them up once
    taxonCol = GAF_col.index("Taxon(|taxon)")
    symbolCol = GAF_col.index("DB Object Symbol")
    goCol = GAF_col.index('GO ID')
    qualifierCol = GAF_col.index('Qualifier')

    for row in data:
        g = ".".join([row[taxonCol][6:], row[symbolCol]])
        go = row[goCol]

        if 'NOT' in row[qualifierCol]:
            logger.handleWarning(
                "go term %s for gene product %s is qualified as NOT: ignored"
                % (go, g))
            continue

        if hasRef and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        if go.startswith('GO:'):
            # Get the alternative term if any and its GO aspect
            go, aspect = G.get_GOAlternative(go, nameSpace=True)
            if not aspect:
                logger.handleWarning("term %s is not in GO graph, skip it " % go)
                continue

            GenetoGO[aspect].setdefault(g, set([])).add(go)
            GOtoGene[aspect].setdefault(go, set([])).add(g)

    return GenetoGO, GOtoGene
def add(self, fileName, refType="Fasta"):
    """
    Add the gene products listed in a file to this reference set.

    fileName : path of the file (passed through checkForZip before reading)
    refType  : format of the file
               "Fasta" - ids are taken from the record names (Bio.SeqIO)
               "Text"  - first column of a ';' separated text file
               "GAF"   - "<taxon>.<DB Object Symbol>" of a GO annotation file
               "AFFY"  - first column of an Affymetrix annotation file,
                         rows whose fifth column is "Control sequence"
                         (case-insensitive) being ignored

    self.fileName and self.refType keep track of every file added: a single
    value after the first call, lists afterwards.

    Errors are not raised directly: a missing file, an unknown refType or
    any failure while reading is reported through logger.handleFatal. A
    warning is issued when the reference set is still empty afterwards.
    """
    # Remember every file (and its type) that contributed to the set
    if self.fileName == '':
        self.fileName = fileName
        self.refType = refType
    elif isinstance(self.fileName, list):
        self.fileName.append(fileName)
        self.refType.append(refType)
    else:
        self.fileName = [self.fileName, fileName]
        self.refType = [self.refType, refType]

    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        logger.handleFatal(fileName + " does not exist and is required ")

    logger.info("Organism :\t%s" % self.organism)
    logger.info("%s file :\t%s " % (refType, fileName))

    try:
        # Use fasta file to define the reference set
        if refType == "Fasta":
            from Bio import SeqIO
            allID = set(rec.name.split(";")[0].split(":")[-1]
                        for rec in SeqIO.parse(readFile(fileName), "fasta"))

        # Use a simple text file to define the reference set, first column is chosen by default
        elif refType == "Text":
            allID = set(r[0] for r in csv.reader(readFile(fileName), delimiter=";"))

        # Use a GO annotation file to define the reference set
        elif refType == "GAF":
            from AIGO.IO import readGAF_2
            data, GAF_col = readGAF_2(fileName)
            taxonCol = GAF_col.index("Taxon(|taxon)")
            symbolCol = GAF_col.index("DB Object Symbol")
            allID = set(".".join([row[taxonCol][6:], row[symbolCol]])
                        for row in data)

        # Use a Affymetrix annotation file to define the reference set
        elif refType == "AFFY":
            f = readFile(fileName)

            # Skip leading comment lines; the first non-comment line is the
            # header. startswith() is safe on the empty string returned at
            # end of file, unlike row[0].
            row = f.readline()
            while row.startswith('#'):
                row = f.readline()

            allID = set()
            for row in csv.reader(f):
                # Read gene product id if not control sequence
                if "Control sequence".upper() != row[4].upper():
                    allID.add(row[0])

        else:
            # Reported by the handler below together with the file name
            raise ValueError("Sorry, unknown file type !!")

        self.update(allID)

        if len(self) == 0:
            logger.handleWarning("No gene products loaded")

    except Exception as e:
        logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))