def extract_Affy(fileName, G, refSet=None, GO_columns=[30, 31, 32], filetype="Affy", delimiter=',', quoting=csv.QUOTE_ALL):
    """Build gene-product <-> GO-term maps from an Affymetrix annotation file.

    Leading lines starting with '#' are skipped, the next line is treated as
    the header and discarded, and every remaining line is parsed as CSV.
    The first column of a row is the gene product id. Each GO cell holds
    entries separated by '///'; inside an entry the fields are separated by
    '//' and the first field is the numeric GO id. A cell equal to '---' (or
    blank) carries no annotation.

    Args:
        fileName: Path of the annotation file. It is passed through
            ``checkForZip`` and opened with ``readFile``.
        G: GO graph object. It must expose ``aspect`` (iterable of aspect
            keys) and ``get_GOAlternative(term, nameSpace=True)`` returning a
            ``(term, aspect)`` pair whose aspect is falsy for unknown terms.
        refSet: Optional collection of gene product ids; when non-empty,
            rows whose id is not in it are skipped with a warning.
        GO_columns: Zero-based indexes of the columns holding GO
            annotations; at most the first three are used. The list default
            is never mutated here.
        filetype: Unused; kept so existing callers keep working.
        delimiter: Field delimiter handed to the csv reader.
        quoting: Quoting mode handed to the csv reader.

    Returns:
        ``(GenetoGO, GOtoGene)``: two dicts keyed by aspect. ``GenetoGO``
        maps gene product id -> set of GO terms, ``GOtoGene`` maps GO term
        -> set of gene product ids.

    Raises:
        IOError: If the file does not exist.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    csvfile = readFile(fileName)

    # Only membership is needed, so a plain set replaces the id -> id dict.
    hasRef = set(refSet) if refSet else None

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    # Skip the comment lines; the first non-comment line is the header and is
    # simply consumed. startswith() is used instead of row[0] because
    # readline() returns '' at end of file and ''[0] raises IndexError.
    row = csvfile.readline()
    while row.startswith('#'):
        row = csvfile.readline()

    csv.register_dialect('format', delimiter=delimiter, quoting=quoting)
    rd = csv.reader(csvfile, dialect='format')

    namespaces = ['biological_process', 'cellular_component', 'molecular_function']
    for row in rd:
        # csv.reader yields [] for blank lines; there is no id to read.
        if not row:
            continue

        g = row[0]
        if hasRef and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        # zip() caps the number of GO columns at three; the namespace label
        # itself is not needed because the aspect is taken from the graph.
        for _namespace, column in zip(namespaces, GO_columns):
            for item in row[column].split('///'):
                item = item.strip()
                if not item or item == "---":
                    continue

                term = "GO:%07d" % int(item.split('//')[0].replace('/', ''))
                # Resolve to the alternative term (if any) and its aspect.
                go, aspect = G.get_GOAlternative(term, nameSpace=True)
                if not aspect:
                    # Report the id that was looked up, not the lookup result.
                    logger.handleWarning(
                        "term %s is not in GO graph, skip it " % term)
                    continue

                GenetoGO[aspect].setdefault(g, set()).add(go)
                GOtoGene[aspect].setdefault(go, set()).add(g)

    return GenetoGO, GOtoGene
def extract_SCOP(fileName, G, refSet=None):
    """Build gene-product <-> GO-term maps from a ';'-separated SCOP file.

    The first row is the header. The gene product id is read from the
    'domScop' column and the GO term from the 'termGo' column. Only values
    starting with 'GO:' are considered.

    Args:
        fileName: Path of the file. It is passed through ``checkForZip`` and
            opened with ``readFile``.
        G: GO graph object. It must expose ``aspect`` (iterable of aspect
            keys) and ``get_GOAlternative(term, nameSpace=True)`` returning a
            ``(term, aspect)`` pair whose aspect is falsy for unknown terms.
        refSet: Optional collection of gene product ids; when non-empty,
            rows whose id is not in it are skipped with a warning.

    Returns:
        ``(GenetoGO, GOtoGene)``: two dicts keyed by aspect. ``GenetoGO``
        maps gene product id -> set of GO terms, ``GOtoGene`` maps GO term
        -> set of gene product ids. Both are empty when the file is empty.

    Raises:
        IOError: If the file does not exist.
        ValueError: If the header lacks the 'domScop' or 'termGo' column.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # Only membership is needed, so a plain set replaces the id -> id dict.
    hasRef = set(refSet) if refSet else None

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    rd = csv.reader(readFile(fileName), delimiter=";")

    # next(rd, None) avoids a StopIteration escaping on an empty file.
    header = next(rd, None)
    if header is None:
        return GenetoGO, GOtoGene

    # Column positions are fixed by the header; look them up once.
    geneColumn = header.index('domScop')
    termColumn = header.index('termGo')

    for row in rd:
        # csv.reader yields [] for blank lines; nothing to read from them.
        if not row:
            continue

        g = row[geneColumn]
        term = row[termColumn]

        if hasRef and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        if not term.startswith('GO:'):
            continue

        # Get the alternative term if any and its GO aspect.
        go, aspect = G.get_GOAlternative(term, nameSpace=True)
        if not aspect:
            logger.handleWarning(
                "term %s is not in GO graph, skip it " % term)
            continue

        GenetoGO[aspect].setdefault(g, set()).add(go)
        GOtoGene[aspect].setdefault(go, set()).add(g)

    return GenetoGO, GOtoGene
def readGOoboXML(fileName, force=False, prefix="GO"):
    """Load a GO graph from an OBO file, using a pickle cache beside it.

    The cache lives at ``"<fileName>.pic"``. It is used when it exists and
    ``force`` is false. If the cache is missing, unreadable, empty or
    corrupt, the OBO file is parsed with ``get_GOGraph`` and the cache is
    rewritten.

    Args:
        fileName: Path of the OBO file. It is passed through ``checkForZip``
            and opened with ``readFile`` when it has to be parsed.
        force: When true, ignore any existing cache and parse the OBO file.
        prefix: Forwarded to ``get_GOGraph`` as ``prefix``.

    Returns:
        The graph object, either unpickled from the cache or freshly built.
        ``None`` if loading failed and ``logger.handleFatal`` returned
        instead of aborting.
    """
    try:
        import cPickle as pickle  # C implementation on Python 2
    except ImportError:
        import pickle

    picName = "%s.pic" % fileName
    # Bound up front so the final return is safe even if handleFatal returns.
    G = None

    if not os.path.exists(picName):
        force = True

    if not force:
        try:
            logger.info("Reading serialized OBO file : %s" % picName)
            # NOTE: unpickling can execute arbitrary code. This is acceptable
            # only because the cache is written by this function; do not
            # point it at a .pic file from an untrusted source.
            with open(picName, "rb") as f:
                G = pickle.load(f)
        except (IOError, EOFError, pickle.UnpicklingError) as inst:
            # Unreadable, empty or corrupt cache: fall back to the OBO file.
            print(str(type(inst)) + " for " + picName)
            force = True

    try:
        if force:
            fileName = checkForZip(fileName)
            if not os.path.exists(fileName):
                raise IOError(fileName + " does not exist and is required ")

            logger.info("Reading OBO file : %s" % fileName)
            G = get_GOGraph(readFile(fileName, mode="r"), prefix=prefix)
            G.fileName = fileName

            with open(picName, "wb") as f:
                logger.info("Saving serialized OBO file")
                # -1 selects the highest pickle protocol available.
                pickle.dump(G, f, -1)
    except Exception as e:
        logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))

    return G
def readGAF_2(fileName):
    """Read a tab-separated GAF 2 annotation file.

    The whole file is loaded in memory. Leading rows whose first field
    starts with '!' form the header; one of them must match the
    ``gaf-version`` pattern for version 2. Blank lines are dropped.

    Args:
        fileName: Path handed to ``readFile``.

    Returns:
        ``(rows, GAF_col)``: an iterator over the data rows (each a list of
        fields) that follow the header, and the list of GAF column names
        whose positions index into those rows.

    Raises:
        Exception: If no header line declares GAF version 2. This includes
            empty files.
    """
    GAF_col = [
        "DB", "DB Object ID", "DB Object Symbol", "Qualifier", "GO ID",
        "DB:Reference", "Evidence Code", "With (or) From", "Aspect",
        "DB Object Name", "DB Object Synonym", "DB Object Type",
        "Taxon(|taxon)", "Date", "Assigned By", "Annotation Extension",
        "Gene Product Form ID"
    ]

    # Read the entire file. csv.reader yields [] for blank lines; they carry
    # no fields and would break row[0] here and column lookups downstream.
    data = [row for row in csv.reader(readFile(fileName), delimiter="\t") if row]

    # Walk the header. The bound check stops a comment-only or empty file
    # from running past the end, and startswith() tolerates an empty field.
    seek = 0
    GAF_OK = False
    while seek < len(data) and data[seek][0].startswith("!"):
        if re.search("!.*gaf-version.*:.*2", data[seek][0]):
            GAF_OK = True
        seek += 1

    if not GAF_OK:
        raise Exception("Sorry, GAF format version 2.0 expected.")

    return iter(data[seek:]), GAF_col
def add(self, fileName, refType="Fasta"):
    """Add the gene product ids defined by a reference file to this set.

    The file name and type are recorded on ``self.fileName`` and
    ``self.refType``; both become lists once a second file is added. The
    ids are then read according to ``refType`` and merged in with
    ``self.update``.

    Args:
        fileName: Path of the reference file. It is passed through
            ``checkForZip`` before being read.
        refType: One of
            "Fasta" - id is taken from each record name (text before the
                first ';', then after the last ':');
            "Text"  - ';'-separated file, id is the first column;
            "GAF"   - id is "<taxon>.<DB Object Symbol>";
            "AFFY"  - Affymetrix CSV, id is the first column of every row
                whose fifth column is not "Control sequence".

    Any error raised while reading, including an unknown ``refType``, is
    reported through ``logger.handleFatal``.
    """
    if self.fileName == '':
        self.fileName = fileName
        self.refType = refType
    elif isinstance(self.fileName, list):
        self.fileName.append(fileName)
        self.refType.append(refType)
    else:
        self.fileName = [self.fileName, fileName]
        self.refType = [self.refType, refType]

    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        logger.handleFatal(fileName + " does not exist and is required ")

    logger.info("Organism :\t%s" % self.organism)
    logger.info("%s file :\t%s " % (refType, fileName))

    try:
        if refType == "Fasta":
            # Use a fasta file to define the reference set.
            from Bio import SeqIO
            allID = set(
                rec.name.split(";")[0].split(":")[-1]
                for rec in SeqIO.parse(readFile(fileName), "fasta"))

        elif refType == "Text":
            # Use a simple text file; the first column is the id. Blank
            # lines come back from csv.reader as [] and are skipped.
            allID = set(
                r[0]
                for r in csv.reader(readFile(fileName), delimiter=";")
                if r)

        elif refType == "GAF":
            # Use a GO annotation file to define the reference set.
            from AIGO.IO import readGAF_2
            data, GAF_col = readGAF_2(fileName)
            taxonColumn = GAF_col.index("Taxon(|taxon)")
            symbolColumn = GAF_col.index("DB Object Symbol")
            # [6:] drops the first six characters of the taxon field,
            # presumably the "taxon:" prefix - confirm against the data.
            allID = set(
                ".".join([row[taxonColumn][6:], row[symbolColumn]])
                for row in data)

        elif refType == "AFFY":
            # Use an Affymetrix annotation file to define the reference set.
            f = readFile(fileName)
            # Skip '#' comment lines; the first non-comment line (the
            # header) is consumed here too. startswith() is safe at end of
            # file, where readline() returns ''.
            row = f.readline()
            while row.startswith('#'):
                row = f.readline()

            control = "Control sequence".upper()
            # Keep the gene product id unless the row is a control sequence.
            allID = set(
                row[0]
                for row in csv.reader(f)
                if row and row[4].upper() != control)

        else:
            print("Sorry, unknown file type !!")
            raise Exception("unknown reference type %s" % refType)

        self.update(allID)

        if len(self) == 0:
            logger.handleWarning("No gene products loaded")

    except Exception as e:
        logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))