コード例 #1
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
     if outFileName == "":
         outFileName = inFileName + ".orf.map"
     outFile = open( outFileName, "w" )
 
     bioseq = Bioseq()
     bioseqNb = 0
 
     inFile = open( inFileName )
     while True:
         bioseq.read( inFile )
         if bioseq.sequence == None:
             break
         bioseq.upCase() 
         bioseqNb += 1
         if verbose > 0:
             print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'
             
         orf = bioseq.findORF()
         bestOrf = []
         for i in orf.keys():
             orfLen = len(orf[i])
             for j in xrange(1, orfLen):
                 start = orf[i][j-1] + 4
                 end = orf[i][j] + 3
                 if end - start >= orfMinLength:
                     bestOrf.append( ( end-start, i+1, start, end ) )
 
         bioseq.complement()
         
         orf = bioseq.findORF()
         seqLen = bioseq.getLength()
         for i in orf.keys():
             orfLen = len(orf[i])
             for j in xrange(1, orfLen):
                 start = seqLen - orf[i][j-1] - 3
                 end = seqLen - orf[i][j] - 2
                 if start - end >= orfMinLength:
                     bestOrf.append( ( start-end, (i+1)*-1, start, end ) )
 
         bestOrf.sort()
         bestOrf.reverse()
         bestOrfNb = len(bestOrf)
         if orfMaxNb > bestOrfNb or orfMaxNb == 0 :
             orfMaxNb = bestOrfNb
         for i in xrange(0, orfMaxNb):
             if verbose > 0:
                 print bestOrf[i]
             outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
                                "|"+str(bestOrf[i][0]),bioseq.header,
                                bestOrf[i][2],bestOrf[i][3]))
 
     inFile.close()
     outFile.close()
 
     return 0
コード例 #2
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def dbLengthFilter( len_min, inFileName, verbose=0 ):
     file_db = open( inFileName, "r" )
     file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
     file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
     seq = Bioseq()
     numseq = 0
     nbsave = 0
     
     while True:
         seq.read( file_db )
         if seq.sequence == None:
             break
         l = seq.getLength()
         numseq = numseq + 1
         if l >= len_min:
             seq.write( file_dbSup )
             if verbose > 0:
                     print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
                     nbsave=nbsave+1
         else:
             seq.write( file_dbInf )
             if verbose > 0:
                     print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
                     nbsave=nbsave+1
                     
     file_db.close()
     file_dbInf.close()
     file_dbSup.close()
     if verbose > 0:
         print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
コード例 #3
0
ファイル: DbMySql.py プロジェクト: chungtseng/HCPU_midterm
 def createSeqTable( self, tableName, fileName = "" ):
     """
     Create a sequence table and, optionally, fill it from a sequence file.

     The table has the columns (accession, sequence, description, length).
     After creating it, createSeqIndex() and updateInfoTable() are called.
     If 'fileName' is given, its sequences are dumped into a temporary
     tab-separated file in the current directory, which is then loaded with
     'LOAD DATA LOCAL INFILE' and removed.

     NOTE(review): 'tableName' and the temporary file name are interpolated
     directly into the SQL statements; callers must pass trusted values.

     @param tableName: name of the table to create
     @param fileName: name of the input sequence file ("" creates an empty table)
     """
     sqlCmd = "CREATE TABLE %s (accession varchar(255), sequence longtext, description varchar(255), length int unsigned )" % (tableName)
     self.execute( sqlCmd )
     self.createSeqIndex( tableName )
     self.updateInfoTable( tableName, fileName )

     if fileName == "":
         return

     # the pid keeps concurrent processes from sharing the same temporary file
     tmpFileName = fileName.split("/")[-1] + ".tmp" + str(os.getpid())
     try:
         inFile = open( fileName )
         try:
             tmpFile = open( tmpFileName, "w" )
             try:
                 bioseq = Bioseq()
                 while True:
                     bioseq.read( inFile )
                     if bioseq.sequence is None:
                         break
                     # accession is the first word of the header; a tab inside
                     # the header would shift the columns of the row
                     tmpFile.write( "%s\t%s\t%s\t%d\n" % ( bioseq.header.split()[0],
                                                           bioseq.sequence,
                                                           bioseq.header,
                                                           bioseq.getLength() ) )
             finally:
                 tmpFile.close()
         finally:
             inFile.close()

         sqlCmd = "LOAD DATA LOCAL INFILE '%s' IGNORE INTO TABLE %s FIELDS ESCAPED BY ''" % \
                  (tmpFileName, tableName)
         self.execute( sqlCmd )
     finally:
         # do not leave the temporary file behind, even when reading, writing
         # or loading failed
         if os.path.exists( tmpFileName ):
             os.remove( tmpFileName )
コード例 #4
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
     if verbose > 0:
         print "sort sequences by increasing length"
         sys.stdout.flush()
     if not os.path.exists( inFileName ):
         print "ERROR: file '%s' doesn't exist" % ( inFileName )
         sys.exit(1)
         
     # read each seq one by one
     # save them in distinct temporary files
     # with their length in the name
     inFileHandler = open( inFileName, "r" )
     bs = Bioseq()
     countSeq = 0
     while True:
         bs.read( inFileHandler )
         if bs.header == None:
             break
         countSeq += 1
         tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
         bs.appendBioseqInFile( tmpFile )
         if verbose > 1:
             print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
         bs.header = ""
         bs.sequence = ""
     inFileHandler.close()
     
     # sort temporary file names
     # concatenate them into the output file
     if os.path.exists( outFileName ):
         os.remove( outFileName )
     lFiles = glob.glob( "*bp_*nb" )
     lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
     for fileName in lFiles:
         cmd = "cat %s >> %s" % ( fileName, outFileName )
         returnValue = os.system( cmd )
         if returnValue != 0:
             print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
             sys.exit(1)
         os.remove( fileName )
         
     return 0
コード例 #5
0
 def getLengthPerSeqFromFile( inFile ):
     """
     Return the length of each sequence of a file.

     If several sequences share the same header, only the length of the
     last one is kept.

     @param inFile: name of the input sequence file
     @return: dictionary whose keys are sequence headers and values sequence lengths
     """
     dHeader2Length = {}
     inFileHandler = open( inFile, "r" )
     try:
         while True:
             iBs = Bioseq()
             iBs.read( inFileHandler )
             if iBs.sequence is None:
                 break
             dHeader2Length[ iBs.header ] = iBs.getLength()
     finally:
         # close the file even if reading a sequence fails
         inFileHandler.close()
     return dHeader2Length
コード例 #6
0
 def filterClassifiedConsensus( self ):
     inFile = open( self._inFaFile, "r" )
     outFile = open( self._outFaFile, "w" )
     bs = Bioseq()
     nbInSeq = 0
     nbRmv = 0
     
     if self._classifFile != "":
         dHeader2Classif = self.getClassifPerHeaderOfUnclassifiedConsensus()
         
     while True:
         bs.read( inFile )
         if bs.header == None:
             break
         nbInSeq += 1
         if self._verbose > 1:
             print bs.header
             
         if self._filterSSRs and "SSR" in bs.header and ( self._maxLengthToFilterSSRs == 0 or bs.getLength() <= self._maxLengthToFilterSSRs ):
             nbRmv += 1
             if self._verbose > 1: print "filtered SSR !"
             
         elif self._filterHostGenes and "HostGene" in bs.header:
             nbRmv += 1
             if self._verbose > 1: print "filtered HostGene !"
             
         elif self._filterConfused and "confused" in bs.header and "confusedness=no" not in bs.header:
             nbRmv += 1
             if self._verbose > 1: print "filtered confused !"
             
         elif self._filterNoCat != "0" and "NoCat" in bs.header:
             keep = False
             if "2" in self._filterNoCat:
                 algoMSA = ""
                 for i in ["Map","MAP","Malign","Mafft","Prank","Clustalw","Muscle","Tcoffee"]:
                     if i in bs.header:
                         algoMSA = i
                 nbAlignSeq = int( bs.header.split(algoMSA+"_")[1].split("|")[0] )
                 if nbAlignSeq > self._nbAlignSeqNoCat:
                     keep = True
             if "3" in self._filterNoCat:
                 for header in dHeader2Classif.keys():
                     if header in bs.header:
                         if "no structural features" not in dHeader2Classif[header][6]:
                             keep = True
             if keep:
                 bs.write( outFile )
             else:
                 nbRmv += 1
                 if self._verbose > 1: print "filtered NoCat !"
                 
         elif self._filterIncomplete and "completeness=incomp" in bs.header:
             nbRmv += 1
             if self._verbose > 1: print "filtered incomplete !"
             
         else:
             bs.write( outFile )
             
     inFile.close()
     outFile.close()
     
     if self._verbose > 0:
         print "nb of input seq: %i" % ( nbInSeq )
         print "nb of filtered seq: %i" % ( nbRmv )
         sys.stdout.flush()