コード例 #1
0
 def __init__( self, name="" ):
     """
     Build the database through BioseqDB.__init__, then check sequence lengths.

     The value returned by self.getLength() is taken as the reference. When
     the database holds more than one sequence, every sequence after the
     first one is compared with that reference, and an error message is
     printed for each sequence whose length differs. No exception is raised.

     @param name: forwarded unchanged to BioseqDB.__init__ ("" by default)
     @type name: string
     """
     BioseqDB.__init__( self, name )
     expectedLength = self.getLength()
     # nothing to compare when there is at most one sequence
     if self.getSize() <= 1:
         return
     for alignedSeq in self.db[1:]:
         if alignedSeq.getLength() == expectedLength:
             continue
         print("ERROR: aligned sequences have different length")
コード例 #2
0
 def getClusteringResultsInFasta( self, inFile ):
     """
     Write a fasta file whose sequence headers contain the clustering IDs.

     Clusters are obtained from self.getClustersFromTxtFile() and, when
     self._filterUnclusteredSeq is set, passed through
     self.filterUnclusteredSequences(). Each member is fetched from 'inFile'
     by its header and written to "<inFile>_blastclust.fa" with the header
     "BlastclustCluster<clusterId>Mb<memberId>_<original header>", where
     memberId starts at 1 within each cluster.

     @param inFile: name of the input file, loaded through BioseqDB
     @type inFile: string
     """
     dClusterId2SeqHeaders = self.getClustersFromTxtFile()
     if self._filterUnclusteredSeq:
         dClusterId2SeqHeaders = self.filterUnclusteredSequences( dClusterId2SeqHeaders )
     inDB = BioseqDB( inFile )
     outFileName = "%s_blastclust.fa" % ( inFile )
     outF = open( outFileName, "w" )
     # try/finally guarantees that the output handle is released even when
     # fetching, renaming or writing a sequence fails part-way through
     try:
         for clusterId, lSeqHeaders in dClusterId2SeqHeaders.items():
             for memberIndex, seqHeader in enumerate( lSeqHeaders ):
                 bs = inDB.fetch( seqHeader )
                 # member IDs are 1-based
                 bs.header = "BlastclustCluster%iMb%i_%s" % ( clusterId, memberIndex + 1, seqHeader )
                 bs.write( outF )
     finally:
         outF.close()
コード例 #3
0
 def orientInputSequences( self, lSequenceHeadersToReverse, tmpFileName="" ):
     """
     Save input sequences while re-orienting those needing it.

     All sequences are loaded from 'tmpFileName'. For each sequence whose
     header is listed in 'lSequenceHeadersToReverse', reverseComplement() is
     called and " re-oriented" is appended to its header. Every sequence,
     re-oriented or not, is then saved into self._outFileName.

     @param lSequenceHeadersToReverse: list of headers corresponding to sequences that need to be re-oriented
     @type lSequenceHeadersToReverse: list of strings
     @param tmpFileName: name of a fasta file (self._inFileName by default)
     @type tmpFileName: string
     """
     if self._verbose > 0:
         print("saving oriented sequences...")
         sys.stdout.flush()
     if tmpFileName == "":
         tmpFileName = self._inFileName
     # hash the headers once so that each membership test below is a constant-time
     # lookup instead of a scan of the whole list for every input sequence
     sHeadersToReverse = frozenset( lSequenceHeadersToReverse )
     inDB = BioseqDB( tmpFileName )
     outDB = BioseqDB()
     for bs in inDB.db:
         if bs.header in sHeadersToReverse:
             bs.reverseComplement()
             bs.header += " re-oriented"
         outDB.add( bs )
     outDB.save( self._outFileName )
コード例 #4
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
     """
     Save the longest sequences of a file into another file.

     At most 'num' sequences are kept: the longest ones whose length is at
     least 'minThresh'. They are written in the order in which they appear
     in the input. When several sequences share the shortest selected
     length, the first ones in input order are kept, and strictly longer
     sequences are never displaced by them. The output file is always
     created, even when nothing is selected.

     @param num: maximum number of sequences to keep
     @type num: integer (or any value accepted by int())
     @param inFileName: name of the input file, loaded through BioseqDB
     @type inFileName: string
     @param outFileName: name of the output file (inFileName + ".best" + str(num) by default)
     @type outFileName: string
     @param verbose: verbosity level (>0: summary, >1: one line per sequence)
     @type verbose: integer
     @param minThresh: minimum length (inclusive) required to keep a sequence
     @type minThresh: integer
     @return: 0 in all cases
     @rtype: integer
     """
     bsDB = BioseqDB( inFileName )
     if verbose > 0:
         print("nb of input sequences: %i" % ( bsDB.getSize() ))

     if outFileName == "":
         outFileName = inFileName + ".best" + str(num)
     outFile = open( outFileName, "w" )

     # from here on the output handle is always released, including on the
     # early return for an empty input and on any error while writing
     try:
         if bsDB.getSize() == 0:
             return 0

         num = int(num)
         if verbose > 0:
             print("keep the %i longest sequences" % ( num ))
             if minThresh > 0:
                 print("with length > %i bp" % ( minThresh ))
             sys.stdout.flush()

         # retrieve the length of each input sequence
         lAllLengths = []
         seqNum = 0
         for bs in bsDB.db:
             seqNum += 1
             lAllLengths.append( bs.getLength() )
             if verbose > 1:
                 print("%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() ))
                 sys.stdout.flush()

         # select the 'num' largest lengths that also reach the threshold
         # (max(num, 0) keeps a negative 'num' from slicing from the end)
         lAllLengths.sort()
         lAllLengths.reverse()
         lSeqLgth = [ lgth for lgth in lAllLengths[:max( num, 0 )] if lgth >= minThresh ]

         nbSave = 0
         # an empty selection (num <= 0, or no sequence long enough) saves nothing
         if lSeqLgth:
             if verbose > 0:
                 print("selected max length: %i" % ( lSeqLgth[0] ))
                 print("selected min length: %i" % ( lSeqLgth[-1] ))
                 sys.stdout.flush()

             # lSeqLgth is sorted in decreasing order: its last item is the cutoff
             cutoffLength = lSeqLgth[-1]
             # only this many sequences of exactly the cutoff length may be saved,
             # so that ties met early in the file cannot use up the quota of
             # strictly longer sequences located further down
             nbTiesLeft = lSeqLgth.count( cutoffLength )
             nbToSave = len( lSeqLgth )

             # save the longest, in input order
             seqNum = 0
             for bs in bsDB.db:
                 seqNum += 1
                 seqLength = bs.getLength()
                 if seqLength < cutoffLength:
                     continue
                 if seqLength == cutoffLength:
                     if nbTiesLeft == 0:
                         continue
                     nbTiesLeft -= 1
                 bs.write( outFile )
                 if verbose > 1:
                     print("%d seq %s : saved !" % ( seqNum, bs.header[0:40] ))
                     sys.stdout.flush()
                 nbSave += 1
                 if nbSave == nbToSave:
                     break
     finally:
         outFile.close()

     if verbose > 0:
         print("%i saved sequences in  %s" % ( nbSave, outFileName ))
         sys.stdout.flush()

     return 0