コード例 #1
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
     """
     Write to a new file every sequence whose header does NOT match a pattern.

     Sequences are read one at a time with Bioseq.read() until it leaves
     'sequence' set to None; the kept ones are written with Bioseq.write().

     Args:
         pattern: regular expression searched (re.search) in each header.
             If it is the empty string, the function returns without doing
             anything.
         inFileName: path of the input sequence file.
         outFileName: path of the output file; defaults to
             inFileName + '.cleaned'.
         verbose: if > 0, print the number of saved sequences; if > 1, also
             print one line per saved sequence.
     """
     if pattern == "":
         return

     patternToSearch = re.compile( pattern )

     if outFileName == "":
         outFileName = inFileName + '.cleaned'

     bioseq = Bioseq()
     bioseqNb = 0
     savedBioseqNb = 0
     # 'with' closes both files even if reading, matching or writing raises.
     # The input is opened first so that a missing input file does not leave
     # an empty output file behind.
     with open( inFileName ) as inFile:
         with open( outFileName, 'w' ) as outFile:
             while True:
                 bioseq.read( inFile )
                 if bioseq.sequence is None:
                     break
                 bioseqNb += 1
                 if patternToSearch.search( bioseq.header ):
                     # header matches: this sequence is dropped
                     continue
                 bioseq.write( outFile )
                 if verbose > 1:
                     # single-argument print(...) emits the same text under Python 2 and 3
                     print( "sequence num %s [ %s ...] saved !!" % ( bioseqNb, bioseq.header[0:40] ) )
                 savedBioseqNb += 1

     if verbose > 0:
         print( "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) )
コード例 #2
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def spliceFromCoords( genomeFile, coordFile, obsFile ):
     """
     Copy the sequences of a file, removing from each one the regions listed
     in a coordinate file.

     For each sequence whose header is a key of the dictionary returned by
     MapUtils.getDictPerSeqNameFromMapFile( coordFile ), every map is cut out:
     positions getMin()..getMax() (1-based, both included) are removed and the
     remaining pieces are concatenated. Sequences without any map are written
     unchanged.

     Args:
         genomeFile: path of the input sequence file.
         coordFile: path of the map file giving the regions to remove.
         obsFile: path of the output file.
     """
     # Load the coordinates before opening anything, so that a failure here
     # neither creates the output file nor leaves a handle open.
     dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

     with open( genomeFile, "r" ) as genomeFileHandler:
         with open( obsFile, "w" ) as obsFileHandler:
             while True:
                 bs = Bioseq()
                 bs.read( genomeFileHandler )
                 if bs.sequence is None:
                     break
                 if bs.header in dChr2Maps:
                     lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                     lKeptParts = []
                     currentSite = 0    # 0-based index of the first base not yet handled
                     for iMap in lCoords:
                         minSplice = iMap.getMin() - 1
                         if minSplice > currentSite:
                             lKeptParts.append( bs.sequence[ currentSite : minSplice ] )
                         # Never move backwards: a map nested in a previous one
                         # (smaller max) must not re-expose bases already removed.
                         currentSite = max( currentSite, iMap.getMax() )
                     lKeptParts.append( bs.sequence[ currentSite : ] )
                     bs.sequence = "".join( lKeptParts )
                 bs.write( obsFileHandler )
コード例 #3
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
     """
     Write to a new file every sequence whose header matches a pattern.

     Sequences are read one at a time with Bioseq.read() until it leaves
     'sequence' set to None; the matching ones are written with Bioseq.write().

     Args:
         pattern: regular expression searched (re.search) in each header.
             If it is the empty string, the function returns without doing
             anything.
         inFileName: path of the input sequence file.
         outFileName: path of the output file; defaults to
             inFileName + '.extracted'.
         verbose: if > 0, print the number of saved sequences; if > 1, also
             print one line per saved sequence, with the matched text.
     """
     if pattern == "":
         return

     if outFileName == "":
         outFileName = inFileName + '.extracted'

     patternToSearch = re.compile( pattern )
     bioseq = Bioseq()
     bioseqNb = 0
     savedBioseqNb = 0
     # 'with' closes both files even if reading, matching or writing raises.
     # The input is opened first so that a missing input file does not leave
     # an empty output file behind.
     with open( inFileName, "r" ) as inFile:
         with open( outFileName, 'w' ) as outFile:
             while True:
                 bioseq.read( inFile )
                 if bioseq.sequence is None:
                     break
                 bioseqNb += 1
                 m = patternToSearch.search( bioseq.header )
                 if not m:
                     continue
                 bioseq.write( outFile )
                 if verbose > 1:
                     # single-argument print(...) emits the same text under Python 2 and 3
                     print( "sequence num %s matched on %s [ %s ...] saved !!" % ( bioseqNb, m.group(), bioseq.header[0:40] ) )
                 savedBioseqNb += 1

     if verbose > 0:
         print( "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) )
コード例 #4
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def dbLengthFilter( len_min, inFileName, verbose=0 ):
     """
     Split a sequence file in two according to sequence length.

     Sequences whose Bioseq.getLength() is >= len_min are written to
     inFileName + ".Sup" + str(len_min); the others are written to
     inFileName + ".Inf" + str(len_min).

     Args:
         len_min: length threshold (a sequence of exactly len_min goes to
             the ".Sup" file).
         inFileName: path of the input sequence file.
         verbose: if > 0, print one line per sequence and a final summary.
     """
     infFileName = inFileName + ".Inf" + str( len_min )
     supFileName = inFileName + ".Sup" + str( len_min )
     seq = Bioseq()
     numseq = 0
     nbsave = 0

     # 'with' closes the three files even if reading or writing raises
     with open( inFileName, "r" ) as file_db:
         with open( infFileName, "w" ) as file_dbInf:
             with open( supFileName, "w" ) as file_dbSup:
                 while True:
                     seq.read( file_db )
                     if seq.sequence is None:
                         break
                     seqLength = seq.getLength()
                     numseq += 1
                     if seqLength >= len_min:
                         seq.write( file_dbSup )
                         tag = "Sup"
                     else:
                         seq.write( file_dbInf )
                         tag = "Inf"
                     # every sequence is written to one of the two files, so it is
                     # counted whatever the verbosity (the count is only displayed
                     # when verbose > 0, hence the printed value is unchanged)
                     nbsave += 1
                     if verbose > 0:
                         # single-argument print(...) emits the same text under Python 2 and 3
                         print( "sequence # %s = %s [ %s ...] %s !!" % ( numseq, seqLength, seq.header[0:40], tag ) )

     if verbose > 0:
         print( "%s saved sequences in  %s  and  %s" % ( nbsave, infFileName, supFileName ) )
コード例 #5
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
 def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
     """
     Write to a new file every sequence whose header matches NONE of the
     patterns listed in a file.

     The input file is read twice: once to collect the headers and find
     those matching at least one pattern, once to write the sequences whose
     header was not selected for removal.

     Args:
         patternFileName: path of a file with one regular expression per
             line. Blank lines are ignored (an empty regular expression
             would match, and thus remove, every sequence).
         inFileName: path of the input sequence file.
         outFileName: path of the output file; defaults to
             inFileName + '.cleaned'.
         verbose: if > 0, print each pattern and the number of saved
             sequences; if > 1, also print one line per saved sequence.

     Raises:
         SystemExit: with status 1 if patternFileName is the empty string.
     """
     if patternFileName == "":
         print( "ERROR: no file of pattern" )
         sys.exit(1)

     bioseq = Bioseq()
     savedBioseqNb = 0

     # first pass: collect all the headers
     lHeaders = []
     with open( inFileName, "r" ) as inFile:
         while True:
             bioseq.read( inFile )
             if bioseq.sequence is None:
                 break
             lHeaders.append( bioseq.header )
     bioseqNb = len( lHeaders )

     # a set gives constant-time membership tests in the second pass
     sHeadersToRemove = set()
     with open( patternFileName, "r" ) as patternFile:
         for line in patternFile:
             # strip only the end-of-line characters, so that a last line
             # without trailing newline does not lose its final character
             pattern = line.rstrip( "\r\n" )
             if pattern == "":
                 continue
             if verbose > 0:
                 # single-argument print(...) emits the same text under Python 2 and 3
                 print( "pattern:  %s" % pattern )
                 sys.stdout.flush()
             patternToSearch = re.compile( pattern )
             for h in lHeaders:
                 if patternToSearch.search( h ):
                     sHeadersToRemove.add( h )

     if outFileName == "":
         outFileName = inFileName + '.cleaned'

     # second pass: copy the sequences that no pattern matched
     bioseqNum = 0
     with open( inFileName ) as inFile:
         with open( outFileName, 'w' ) as outFile:
             while True:
                 bioseq.read( inFile )
                 if bioseq.sequence is None:
                     break
                 bioseqNum += 1
                 if bioseq.header in sHeadersToRemove:
                     continue
                 bioseq.write( outFile )
                 if verbose > 1:
                     print( "sequence num %s / %s [ %s ...] saved !!" % ( bioseqNum, bioseqNb, bioseq.header[0:40] ) )
                     sys.stdout.flush()
                 savedBioseqNb += 1

     if verbose > 0:
         print( "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) )
コード例 #6
0
ファイル: FastaUtils.py プロジェクト: chungtseng/HCPU_midterm
    def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
        """
        Write to a new file every sequence whose header matches at least one
        of the patterns listed in a file.

        The input file is read twice: once to collect the headers and find
        those matching a pattern, once to write the selected sequences.

        Args:
            patternFileName: path of a file with one regular expression per
                line. Blank lines are ignored (an empty regular expression
                would match, and thus keep, every sequence).
            inFileName: path of the input sequence file.
            outFileName: path of the output file; defaults to
                inFileName + ".extracted".
            verbose: if > 0, print each pattern and the number of saved
                sequences; if > 1, also print one line per saved sequence.

        Raises:
            SystemExit: with status 1 if patternFileName is the empty string.
        """
        if patternFileName == "":
            print( "ERROR: no file of pattern" )
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0

        # first pass: collect all the headers
        lHeaders = []
        with open( inFileName, "r" ) as inFile:
            while True:
                bioseq.read( inFile )
                if bioseq.sequence is None:
                    break
                lHeaders.append( bioseq.header )

        # a set gives constant-time membership tests in the second pass
        sHeadersToKeep = set()
        with open( patternFileName, "r" ) as patternFile:
            for line in patternFile:
                # strip only the end-of-line characters, so that a last line
                # without trailing newline does not lose its final character
                pattern = line.rstrip( "\r\n" )
                if pattern == "":
                    continue
                if verbose > 0:
                    # single-argument print(...) emits the same text under Python 2 and 3
                    print( "pattern:  %s" % pattern )
                    sys.stdout.flush()
                patternToSearch = re.compile( pattern )
                for h in lHeaders:
                    if patternToSearch.search( h ):
                        sHeadersToKeep.add( h )

        if outFileName == "":
            outFileName = inFileName + ".extracted"

        # second pass: copy the sequences that at least one pattern matched
        with open( inFileName, "r" ) as inFile:
            with open( outFileName, "w" ) as outFile:
                while True:
                    bioseq.read( inFile )
                    if bioseq.sequence is None:
                        break
                    bioseqNb += 1
                    if bioseq.header not in sHeadersToKeep:
                        continue
                    bioseq.write( outFile )
                    if verbose > 1:
                        print( "sequence num %s [ %s ...] saved !!" % ( bioseqNb, bioseq.header[0:40] ) )
                        sys.stdout.flush()
                    savedBioseqNb += 1

        if verbose > 0:
            print( "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) )
コード例 #7
0
 def filterClassifiedConsensus( self ):
     """
     Copy the sequences of self._inFaFile into self._outFaFile, leaving out
     those rejected by the enabled filters.

     The filters are tested in this order on the header of each sequence,
     and the first one that applies decides:
       - self._filterSSRs: header contains "SSR" (and, when
         self._maxLengthToFilterSSRs is not 0, the length is <= that value);
       - self._filterHostGenes: header contains "HostGene";
       - self._filterConfused: header contains "confused" but not
         "confusedness=no";
       - self._filterNoCat (string, "0" disables it): header contains
         "NoCat"; the sequence is nevertheless kept if "2" is in the option
         and the number parsed from the header after the alignment program
         name exceeds self._nbAlignSeqNoCat, or if "3" is in the option and
         the classification of a matching header does not mention
         "no structural features";
       - self._filterIncomplete: header contains "completeness=incomp".

     If self._verbose > 0 the numbers of input and filtered sequences are
     printed; if > 1 each header and each filtering decision as well.
     """
     nbInSeq = 0
     nbRmv = 0

     # Always bound, so that option "3" of _filterNoCat cannot raise a
     # NameError when no classification file was given: it then simply has
     # no classification to rely on.
     dHeader2Classif = {}
     if self._classifFile != "":
         dHeader2Classif = self.getClassifPerHeaderOfUnclassifiedConsensus()

     bs = Bioseq()
     # 'with' closes both files even if reading, parsing or writing raises
     with open( self._inFaFile, "r" ) as inFile:
         with open( self._outFaFile, "w" ) as outFile:
             while True:
                 bs.read( inFile )
                 if bs.header is None:
                     break
                 nbInSeq += 1
                 if self._verbose > 1:
                     print( bs.header )

                 if self._filterSSRs and "SSR" in bs.header and ( self._maxLengthToFilterSSRs == 0 or bs.getLength() <= self._maxLengthToFilterSSRs ):
                     nbRmv += 1
                     if self._verbose > 1:
                         print( "filtered SSR !" )

                 elif self._filterHostGenes and "HostGene" in bs.header:
                     nbRmv += 1
                     if self._verbose > 1:
                         print( "filtered HostGene !" )

                 elif self._filterConfused and "confused" in bs.header and "confusedness=no" not in bs.header:
                     nbRmv += 1
                     if self._verbose > 1:
                         print( "filtered confused !" )

                 elif self._filterNoCat != "0" and "NoCat" in bs.header:
                     keep = False
                     if "2" in self._filterNoCat:
                         # the last name of the list found in the header wins
                         # NOTE(review): if none is found, algoMSA stays "" and the
                         # header is split on "_" alone, which most likely raises
                         # or parses an unrelated field -- confirm the header format.
                         algoMSA = ""
                         for i in ["Map","MAP","Malign","Mafft","Prank","Clustalw","Muscle","Tcoffee"]:
                             if i in bs.header:
                                 algoMSA = i
                         nbAlignSeq = int( bs.header.split(algoMSA+"_")[1].split("|")[0] )
                         if nbAlignSeq > self._nbAlignSeqNoCat:
                             keep = True
                     if "3" in self._filterNoCat:
                         for header, classif in dHeader2Classif.items():
                             if header in bs.header and "no structural features" not in classif[6]:
                                 keep = True
                                 break
                     if keep:
                         bs.write( outFile )
                     else:
                         nbRmv += 1
                         if self._verbose > 1:
                             print( "filtered NoCat !" )

                 elif self._filterIncomplete and "completeness=incomp" in bs.header:
                     nbRmv += 1
                     if self._verbose > 1:
                         print( "filtered incomplete !" )

                 else:
                     bs.write( outFile )

     if self._verbose > 0:
         print( "nb of input seq: %i" % ( nbInSeq ) )
         print( "nb of filtered seq: %i" % ( nbRmv ) )
         sys.stdout.flush()