def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
    """
    Copy to an output file every sequence whose header does NOT match a regular expression.

    Records are read one after the other with Bioseq.read() until the
    'sequence' attribute of the record is None.

    @param pattern: regular expression searched (re.search) in each header; if empty, the function returns immediately without touching any file
    @type pattern: string
    @param inFileName: name of the input file
    @type inFileName: string
    @param outFileName: name of the output file (default = inFileName + '.cleaned')
    @type outFileName: string
    @param verbose: if > 0, print the number of saved sequences; if > 1, also print one line per saved sequence
    @type verbose: integer
    """
    if pattern == "":
        return
    patternToSearch = re.compile( pattern )
    if outFileName == "":
        outFileName = inFileName + '.cleaned'

    bioseq = Bioseq()
    bioseqNb = 0
    savedBioseqNb = 0

    # The input is opened first so that a missing input file does not leave
    # an empty output file behind; 'with' closes both handles even if
    # reading or writing a record raises.
    with open( inFileName ) as inFile:
        with open( outFileName, 'w' ) as outFile:
            while True:
                bioseq.read( inFile )
                if bioseq.sequence is None:
                    break
                bioseqNb += 1
                if patternToSearch.search( bioseq.header ):
                    continue
                bioseq.write( outFile )
                if verbose > 1:
                    print( "sequence num %s [ %s ...] saved !!" % ( bioseqNb, bioseq.header[0:40] ) )
                savedBioseqNb += 1

    if verbose > 0:
        print( "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) )
def spliceFromCoords( genomeFile, coordFile, obsFile ):
    """
    Write a copy of the sequences in which the regions listed in a coordinate file are removed.

    For each record read from 'genomeFile' with Bioseq.read(), the maps
    registered under its header (as returned by
    MapUtils.getDictPerSeqNameFromMapFile) are sorted by increasing min
    then max, and the bases from getMin() to getMax() (both used as
    1-based, inclusive positions by the slicing below) are cut out.
    Records whose header has no map are written unchanged.

    @param genomeFile: name of the input sequence file
    @type genomeFile: string
    @param coordFile: name of the map file giving the regions to remove
    @type coordFile: string
    @param obsFile: name of the output file
    @type obsFile: string
    """
    dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

    # 'with' guarantees both handles are closed even if a record fails to parse.
    with open( genomeFile, "r" ) as genomeFileHandler:
        with open( obsFile, "w" ) as obsFileHandler:
            while True:
                bs = Bioseq()
                bs.read( genomeFileHandler )
                if bs.sequence is None:
                    break
                if bs.header in dChr2Maps:
                    lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                    lKeptParts = []
                    currentSite = 0
                    for iMap in lCoords:
                        minSplice = iMap.getMin() - 1
                        if minSplice > currentSite:
                            lKeptParts.append( bs.sequence[ currentSite : minSplice ] )
                        # Never move backwards: a map nested inside a previous
                        # one has a smaller max, and resetting to it would
                        # re-emit bases that were already spliced out.
                        currentSite = max( currentSite, iMap.getMax() )
                    lKeptParts.append( bs.sequence[ currentSite : ] )
                    bs.sequence = "".join( lKeptParts )
                bs.write( obsFileHandler )
def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
    """
    Copy to an output file every sequence whose header matches a regular expression.

    Records are read one after the other with Bioseq.read() until the
    'sequence' attribute of the record is None.

    @param pattern: regular expression searched (re.search) in each header; if empty, the function returns immediately without touching any file
    @type pattern: string
    @param inFileName: name of the input file
    @type inFileName: string
    @param outFileName: name of the output file (default = inFileName + '.extracted')
    @type outFileName: string
    @param verbose: if > 0, print the number of saved sequences; if > 1, also print one line per saved sequence with the matched text
    @type verbose: integer
    """
    if pattern == "":
        return
    if outFileName == "":
        outFileName = inFileName + '.extracted'
    patternToSearch = re.compile( pattern )

    bioseq = Bioseq()
    bioseqNb = 0
    savedBioseqNb = 0

    # The input is opened first so that a missing input file does not leave
    # an empty output file behind; 'with' closes both handles even if
    # reading or writing a record raises.
    with open( inFileName, "r" ) as inFile:
        with open( outFileName, 'w' ) as outFile:
            while True:
                bioseq.read( inFile )
                if bioseq.sequence is None:
                    break
                bioseqNb += 1
                m = patternToSearch.search( bioseq.header )
                if not m:
                    continue
                bioseq.write( outFile )
                if verbose > 1:
                    print( "sequence num %s matched on %s [ %s ...] saved !!" % ( bioseqNb, m.group(), bioseq.header[0:40] ) )
                savedBioseqNb += 1

    if verbose > 0:
        print( "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) )
def dbLengthFilter( len_min, inFileName, verbose=0 ):
    """
    Split a sequence file in two according to sequence length.

    Sequences whose getLength() is >= len_min are written to
    '<inFileName>.Sup<len_min>', the others to '<inFileName>.Inf<len_min>'.
    Records are read with Bioseq.read() until their 'sequence' attribute is None.

    @param len_min: length threshold (a sequence of exactly this length goes to the 'Sup' file)
    @param inFileName: name of the input file
    @type inFileName: string
    @param verbose: if > 0, print one line per sequence and a final summary
    @type verbose: integer
    """
    infFileName = inFileName + ".Inf" + str(len_min)
    supFileName = inFileName + ".Sup" + str(len_min)

    seq = Bioseq()
    numseq = 0
    nbsave = 0

    # 'with' closes the three handles even if reading or writing a record raises.
    with open( inFileName, "r" ) as file_db:
        with open( infFileName, "w" ) as file_dbInf:
            with open( supFileName, "w" ) as file_dbSup:
                while True:
                    seq.read( file_db )
                    if seq.sequence is None:
                        break
                    seqLength = seq.getLength()
                    numseq += 1
                    if seqLength >= len_min:
                        seq.write( file_dbSup )
                        tag = "Sup"
                    else:
                        seq.write( file_dbInf )
                        tag = "Inf"
                    if verbose > 0:
                        print( "sequence # %s = %s [ %s ...] %s !!" % ( numseq, seqLength, seq.header[0:40], tag ) )
                    # every sequence ends up in one of the two files
                    nbsave += 1

    if verbose > 0:
        # the double spaces reproduce the historical output of this message
        print( "%s saved sequences in  %s  and  %s" % ( nbsave, infFileName, supFileName ) )
def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
    """
    Copy to an output file every sequence whose header matches NONE of the patterns listed in a file.

    The pattern file holds one regular expression per line. The input file
    is read twice with Bioseq.read(): once to collect all headers, once to
    write the sequences that are kept.

    @param patternFileName: name of the file of patterns; if empty, an error is printed and sys.exit(1) is called
    @type patternFileName: string
    @param inFileName: name of the input file
    @type inFileName: string
    @param outFileName: name of the output file (default = inFileName + '.cleaned')
    @type outFileName: string
    @param verbose: if > 0, print each pattern and the final count; if > 1, also print one line per saved sequence
    @type verbose: integer
    """
    if patternFileName == "":
        print( "ERROR: no file of pattern" )
        sys.exit(1)

    bioseq = Bioseq()

    # First pass: collect every header so each pattern is compiled only once.
    lHeaders = []
    with open( inFileName, "r" ) as inFile:
        while True:
            bioseq.read( inFile )
            if bioseq.sequence is None:
                break
            lHeaders.append( bioseq.header )
    bioseqNb = len( lHeaders )

    # A set gives constant-time membership tests during the second pass.
    sHeadersToRemove = set()
    with open( patternFileName, "r" ) as patternFile:
        for line in patternFile:
            # Strip only the end-of-line characters: blindly dropping the last
            # character truncates the final pattern when the file does not
            # end with a newline.
            pattern = line.rstrip( "\r\n" )
            if pattern == "":
                # An empty regular expression matches every header; a blank
                # line must not discard the whole bank.
                continue
            if verbose > 0:
                print( "pattern:  %s" % pattern )
                sys.stdout.flush()
            patternToSearch = re.compile( pattern )
            for h in lHeaders:
                if patternToSearch.search( h ):
                    sHeadersToRemove.add( h )

    if outFileName == "":
        outFileName = inFileName + '.cleaned'

    # Second pass: write the sequences whose header was not flagged.
    bioseqNum = 0
    savedBioseqNb = 0
    with open( inFileName ) as inFile:
        with open( outFileName, 'w' ) as outFile:
            while True:
                bioseq.read( inFile )
                if bioseq.sequence is None:
                    break
                bioseqNum += 1
                if bioseq.header in sHeadersToRemove:
                    continue
                bioseq.write( outFile )
                if verbose > 1:
                    print( "sequence num %s / %s [ %s ...] saved !!" % ( bioseqNum, bioseqNb, bioseq.header[0:40] ) )
                    sys.stdout.flush()
                savedBioseqNb += 1

    if verbose > 0:
        print( "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) )
def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
    """
    Copy to an output file every sequence whose header matches at least one of the patterns listed in a file.

    The pattern file holds one regular expression per line. The input file
    is read twice with Bioseq.read(): once to collect all headers, once to
    write the sequences that are kept.

    @param patternFileName: name of the file of patterns; if empty, an error is printed and sys.exit(1) is called
    @type patternFileName: string
    @param inFileName: name of the input file
    @type inFileName: string
    @param outFileName: name of the output file (default = inFileName + '.extracted')
    @type outFileName: string
    @param verbose: if > 0, print each pattern and the final count; if > 1, also print one line per saved sequence
    @type verbose: integer
    """
    if patternFileName == "":
        print( "ERROR: no file of pattern" )
        sys.exit(1)

    bioseq = Bioseq()

    # First pass: collect every header so each pattern is compiled only once.
    lHeaders = []
    with open( inFileName, "r" ) as inFile:
        while True:
            bioseq.read( inFile )
            if bioseq.sequence is None:
                break
            lHeaders.append( bioseq.header )

    # A set gives constant-time membership tests during the second pass.
    sHeadersToKeep = set()
    with open( patternFileName, "r" ) as patternFile:
        for line in patternFile:
            # Strip only the end-of-line characters: blindly dropping the last
            # character truncates the final pattern when the file does not
            # end with a newline.
            pattern = line.rstrip( "\r\n" )
            if pattern == "":
                # An empty regular expression matches every header; a blank
                # line must not select the whole bank.
                continue
            if verbose > 0:
                print( "pattern:  %s" % pattern )
                sys.stdout.flush()
            patternToSearch = re.compile( pattern )
            for h in lHeaders:
                if patternToSearch.search( h ):
                    sHeadersToKeep.add( h )

    if outFileName == "":
        outFileName = inFileName + ".extracted"

    # Second pass: write the sequences whose header was selected.
    bioseqNb = 0
    savedBioseqNb = 0
    with open( inFileName, "r" ) as inFile:
        with open( outFileName, "w" ) as outFile:
            while True:
                bioseq.read( inFile )
                if bioseq.sequence is None:
                    break
                bioseqNb += 1
                if bioseq.header not in sHeadersToKeep:
                    continue
                bioseq.write( outFile )
                if verbose > 1:
                    print( "sequence num %s [ %s ...] saved !!" % ( bioseqNb, bioseq.header[0:40] ) )
                    sys.stdout.flush()
                savedBioseqNb += 1

    if verbose > 0:
        print( "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) )
def filterClassifiedConsensus( self ):
    """
    Copy the sequences of self._inFaFile to self._outFaFile, dropping those whose header triggers an enabled filter.

    Filters are tested in this order, the first one that applies wins:
      - self._filterSSRs: header contains "SSR" and (self._maxLengthToFilterSSRs == 0 or length <= that maximum)
      - self._filterHostGenes: header contains "HostGene"
      - self._filterConfused: header contains "confused" but not "confusedness=no"
      - self._filterNoCat != "0": header contains "NoCat"; the sequence is
        nevertheless kept if
          * "2" is in self._filterNoCat and the number following the MSA
            program tag in the header is > self._nbAlignSeqNoCat, or
          * "3" is in self._filterNoCat and a key of the classification
            dictionary occurs in the header with a 7th field that does not
            contain "no structural features"
      - self._filterIncomplete: header contains "completeness=incomp"
    Any other sequence is written unchanged.

    If self._verbose > 0 the input and removed counts are printed; if > 1
    every header and every filtering decision is printed as well.
    """
    # Stays empty when no classification file is given, so that option "3"
    # finds no header to rescue instead of failing on an unbound name.
    dHeader2Classif = {}
    lMsaTags = ["Map","MAP","Malign","Mafft","Prank","Clustalw","Muscle","Tcoffee"]

    bs = Bioseq()
    nbInSeq = 0
    nbRmv = 0

    # 'with' closes both handles even if a record fails to parse.
    with open( self._inFaFile, "r" ) as inFile:
        with open( self._outFaFile, "w" ) as outFile:
            if self._classifFile != "":
                dHeader2Classif = self.getClassifPerHeaderOfUnclassifiedConsensus()

            while True:
                bs.read( inFile )
                if bs.header is None:
                    break
                nbInSeq += 1
                if self._verbose > 1:
                    print( bs.header )

                if self._filterSSRs and "SSR" in bs.header and ( self._maxLengthToFilterSSRs == 0 or bs.getLength() <= self._maxLengthToFilterSSRs ):
                    nbRmv += 1
                    if self._verbose > 1:
                        print( "filtered SSR !" )

                elif self._filterHostGenes and "HostGene" in bs.header:
                    nbRmv += 1
                    if self._verbose > 1:
                        print( "filtered HostGene !" )

                elif self._filterConfused and "confused" in bs.header and "confusedness=no" not in bs.header:
                    nbRmv += 1
                    if self._verbose > 1:
                        print( "filtered confused !" )

                elif self._filterNoCat != "0" and "NoCat" in bs.header:
                    keep = False
                    if "2" in self._filterNoCat:
                        # the last tag of the list found in the header wins
                        algoMSA = ""
                        for tag in lMsaTags:
                            if tag in bs.header:
                                algoMSA = tag
                        # NOTE(review): when no tag is found, algoMSA is "" and
                        # the header is split on "_" alone; this presumably
                        # never happens for NoCat consensus - confirm.
                        nbAlignSeq = int( bs.header.split( algoMSA + "_" )[1].split( "|" )[0] )
                        if nbAlignSeq > self._nbAlignSeqNoCat:
                            keep = True
                    if "3" in self._filterNoCat:
                        for header, classif in dHeader2Classif.items():
                            if header in bs.header and "no structural features" not in classif[6]:
                                keep = True
                    if keep:
                        bs.write( outFile )
                    else:
                        nbRmv += 1
                        if self._verbose > 1:
                            print( "filtered NoCat !" )

                elif self._filterIncomplete and "completeness=incomp" in bs.header:
                    nbRmv += 1
                    if self._verbose > 1:
                        print( "filtered incomplete !" )

                else:
                    bs.write( outFile )

    if self._verbose > 0:
        print( "nb of input seq: %i" % ( nbInSeq ) )
        print( "nb of filtered seq: %i" % ( nbRmv ) )
        sys.stdout.flush()