def __init__( self, name="" ):
    """
    Initialise the object through BioseqDB.__init__, then check that the
    sequences it holds all have the same length.

    The reference length is the value returned by self.getLength(). Every
    sequence of self.db after the first one is compared to it, and an error
    message is printed on stdout for each sequence that differs. No exception
    is raised.

    @param name: forwarded unchanged to BioseqDB.__init__ (empty string by default)
    """
    BioseqDB.__init__( self, name )
    expectedLength = self.getLength()

    # with zero or one sequence there is nothing to compare
    if self.getSize() <= 1:
        return

    for otherSeq in self.db[1:]:
        if otherSeq.getLength() == expectedLength:
            continue
        print( "ERROR: aligned sequences have different length" )
def getClusteringResultsInFasta( self, inFile ):
    """
    Write a fasta file whose sequence headers contain the clustering IDs.

    The clusters come from self.getClustersFromTxtFile() and, when
    self._filterUnclusteredSeq is true, are first passed through
    self.filterUnclusteredSequences(). Each member sequence is fetched from
    'inFile', its header is rewritten as
    "BlastclustCluster<clusterId>Mb<memberId>_<original header>"
    (members are numbered from 1 within each cluster), and it is written to
    the file "<inFile>_blastclust.fa".

    @param inFile: name of the input fasta file
    @type inFile: string
    """
    dClusterId2SeqHeaders = self.getClustersFromTxtFile()
    if self._filterUnclusteredSeq:
        dClusterId2SeqHeaders = self.filterUnclusteredSequences( dClusterId2SeqHeaders )

    inDB = BioseqDB( inFile )
    outFileName = "%s_blastclust.fa" % ( inFile )

    # 'with' guarantees the output handle is closed even if fetch() or
    # write() raises in the middle of the loop
    with open( outFileName, "w" ) as outF:
        for clusterId, lSeqHeaders in dClusterId2SeqHeaders.items():
            for memberIndex, seqHeader in enumerate( lSeqHeaders ):
                bs = inDB.fetch( seqHeader )
                # member IDs are 1-based
                bs.header = "BlastclustCluster%iMb%i_%s" % ( clusterId, memberIndex + 1, seqHeader )
                bs.write( outF )
def orientInputSequences( self, lSequenceHeadersToReverse, tmpFileName="" ):
    """
    Save input sequences while re-orienting those needing it.

    Every sequence loaded from the input file is copied to the output bank,
    which is saved in self._outFileName. Sequences whose header is listed in
    'lSequenceHeadersToReverse' are reverse-complemented and get the suffix
    " re-oriented" appended to their header.

    @param lSequenceHeadersToReverse: list of headers corresponding to sequences that need to be re-oriented
    @type lSequenceHeadersToReverse: list of strings

    @param tmpFileName: name of a fasta file (inFileName by default)
    @type tmpFileName: string
    """
    if self._verbose > 0:
        print( "saving oriented sequences..." )
        sys.stdout.flush()

    if tmpFileName == "":
        tmpFileName = self._inFileName

    # build the lookup once: testing membership in a list is a linear scan
    # repeated for every input sequence, whereas a set lookup is constant-time
    sHeadersToReverse = frozenset( lSequenceHeadersToReverse )

    inDB = BioseqDB( tmpFileName )
    outDB = BioseqDB()
    for bs in inDB.db:
        if bs.header in sHeadersToReverse:
            bs.reverseComplement()
            bs.header += " re-oriented"
        outDB.add( bs )
    outDB.save( self._outFileName )
def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
    """
    Save the longest sequences of a sequence file into another file.

    The lengths of all input sequences are sorted in decreasing order; the
    'num' first ones that are at least 'minThresh' long are selected. The
    input sequences are then scanned in their original order and every
    sequence at least as long as the shortest selected length is written,
    until 'num' sequences have been saved.

    The output file is always created. It is left empty when the input has
    no sequence or when no sequence is selected (num <= 0, or all sequences
    shorter than 'minThresh').

    @param num: maximum number of sequences to keep (converted with int())
    @type num: integer

    @param inFileName: name of the input file, loaded with BioseqDB
    @type inFileName: string

    @param outFileName: name of the output file (inFileName + ".best" + str(num) by default)
    @type outFileName: string

    @param verbose: verbosity level (0: silent, 1: summary, 2: one line per sequence)
    @type verbose: integer

    @param minThresh: minimum length (inclusive) a sequence must have to be kept
    @type minThresh: integer

    @return: 0 in all cases
    @rtype: integer
    """
    bsDB = BioseqDB( inFileName )
    if verbose > 0:
        print( "nb of input sequences: %i" % ( bsDB.getSize() ) )

    if outFileName == "":
        outFileName = inFileName + ".best" + str(num)

    nbSave = 0
    # 'with' closes the output on every path, including the early return
    # below and any exception raised while writing
    with open( outFileName, "w" ) as outFile:
        if bsDB.getSize() == 0:
            return 0

        num = int(num)
        if verbose > 0:
            print( "keep the %i longest sequences" % ( num ) )
            if minThresh > 0:
                print( "with length > %i bp" % ( minThresh ) )
            sys.stdout.flush()

        # retrieve the length of each input sequence
        lAllLengths = []
        for seqIndex, bs in enumerate( bsDB.db ):
            seqLength = bs.getLength()
            lAllLengths.append( seqLength )
            if verbose > 1:
                print( "%d seq %s : %d bp" % ( seqIndex + 1, bs.header[0:40], seqLength ) )
                sys.stdout.flush()

        # select the 'num' longest lengths, then drop those below the threshold;
        # max(num, 0) keeps a negative 'num' from being read as a slice counted
        # from the end of the list
        lLongest = sorted( lAllLengths, reverse=True )[ :max( num, 0 ) ]
        lSelected = [ lgth for lgth in lLongest if lgth >= minThresh ]

        # an empty selection means there is nothing to save (and nothing to
        # take the min/max of)
        if lSelected:
            # the list is in decreasing order: first is the max, last is the min
            maxSelected = lSelected[0]
            minSelected = lSelected[-1]
            if verbose > 0:
                print( "selected max length: %i" % ( maxSelected ) )
                print( "selected min length: %i" % ( minSelected ) )
                sys.stdout.flush()

            # save the longest, in input order; minSelected >= minThresh by
            # construction, so a single comparison enforces both limits
            for seqIndex, bs in enumerate( bsDB.db ):
                if bs.getLength() < minSelected:
                    continue
                bs.write( outFile )
                if verbose > 1:
                    print( "%d seq %s : saved !" % ( seqIndex + 1, bs.header[0:40] ) )
                    sys.stdout.flush()
                nbSave += 1
                if nbSave == num:
                    break

    if verbose > 0:
        print( "%i saved sequences in  %s" % ( nbSave, outFileName ) )
        sys.stdout.flush()

    return 0