def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
 
     if pattern == "":
         return
     
     if outFileName == "":
         outFileName = inFileName + '.extracted'
     outFile = open( outFileName, 'w' )
     
     patternTosearch = re.compile( pattern )
     bioseq = Bioseq()
     bioseqNb = 0
     savedBioseqNb = 0
     inFile = open( inFileName, "r" )
     while True:
         bioseq.read( inFile )
         if bioseq.sequence == None:
             break
         bioseqNb = bioseqNb + 1
         m = patternTosearch.search( bioseq.header )
         if m:
             bioseq.write( outFile )
             if verbose > 1:
                 print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
             savedBioseqNb = savedBioseqNb + 1
     inFile.close()
     
     outFile.close()
     
     if verbose > 0:
         print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
Exemple #2
0
 def read( self, faFileHandler ):
     """
     Add to this object every sequence that can be read from an open file handler.

     A fresh Bioseq is created for each record, so the added objects are independent.

     @param faFileHandler: open file handler given to Bioseq.read()
     """
     while True:
         seq = Bioseq()
         seq.read( faFileHandler )
         # Bioseq.read() leaving 'sequence' at None is used as the end-of-input marker
         if seq.sequence is None:
             break
         self.add( seq )
Exemple #3
0
 def createSeqTable( self, tableName, fileName = "" ):
     """
     Create a table of sequences and, when a file is given, load its sequences into it.

     The table has four columns: accession, sequence, description and length.
     After creation, self.createSeqIndex() and self.updateInfoTable() are called.
     When 'fileName' is not empty, its sequences are dumped into a tab-delimited
     temporary file (in the current directory) which is loaded with
     'LOAD DATA LOCAL INFILE' and then removed.

     @param tableName: name of the table to create
     @param fileName: name of the sequence file to load (nothing is loaded if empty)
     """
     sqlCmd = "CREATE TABLE %s (accession varchar(255), sequence longtext, description varchar(255), length int unsigned )" % (tableName)
     self.execute( sqlCmd )
     self.createSeqIndex( tableName )
     self.updateInfoTable( tableName, fileName )

     if fileName == "":
         return

     # the process id makes the temporary name specific to this process
     tmpFileName = fileName.split("/")[-1] + ".tmp" + str(os.getpid())
     try:
         with open( fileName ) as inFile:
             with open( tmpFileName, "w" ) as tmpFile:
                 bioseq = Bioseq()
                 while True:
                     bioseq.read( inFile )
                     if bioseq.sequence is None:
                         break
                     # accession = first whitespace-delimited word of the header
                     tmpFile.write( "%s\t%s\t%s\t%d\n" % ( bioseq.header.split()[0],
                                                           bioseq.sequence,
                                                           bioseq.header,
                                                           bioseq.getLength() ) )
         sqlCmd = "LOAD DATA LOCAL INFILE '%s' IGNORE INTO TABLE %s FIELDS ESCAPED BY ''" % \
                  (tmpFileName, tableName)
         self.execute( sqlCmd )
     finally:
         # never leave the temporary dump behind, even if reading or loading failed
         if os.path.exists( tmpFileName ):
             os.remove( tmpFileName )
 def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
     if pattern == "":
         return
     
     patternToSearch = re.compile(pattern)
     
     if outFileName == "":
         outFileName = inFileName + '.cleaned'
     outFile = open(outFileName,'w')
     
     bioseq = Bioseq()
     bioseqNb = 0
     savedBioseqNb = 0
     inFile = open(inFileName)
     while True:
         bioseq.read(inFile)
         if bioseq.sequence == None:
             break
         bioseqNb += 1
         if not patternToSearch.search(bioseq.header):
             bioseq.write(outFile)
             if verbose > 1:
                 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
             savedBioseqNb += 1
     inFile.close()
     
     outFile.close()
     
     if verbose > 0:
         print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
 def extractBioseqListFromFastaFile( fileName ):
     """
     Return the list of all Bioseq instances read from a file.

     Reading stops as soon as Bioseq.read() leaves the 'header' attribute set to None.

     @param fileName: name of the input file
     @return: list of Bioseq instances, in file order
     """
     lBioseq = []
     # 'with' closes the handle (the previous version never closed it);
     # the local name no longer shadows the builtin 'file'
     with open( fileName ) as fastaHandler:
         while True:
             bioseq = Bioseq()
             bioseq.read( fastaHandler )
             if bioseq.header is None:
                 break
             lBioseq.append( bioseq )
     return lBioseq
 def getLengthPerSeqFromFile( inFile ):
     """
     Return a dictionary giving, for each header of a sequence file, the sequence length.

     If several sequences share the same header, the length of the last one is kept.

     @param inFile: name of the input sequence file
     @return: dictionary whose keys are headers and values are the results of Bioseq.getLength()
     """
     dHeader2Length = {}
     # 'with' closes the handle even if reading raises
     with open( inFile, "r" ) as inFileHandler:
         while True:
             iBs = Bioseq()
             iBs.read( inFileHandler )
             if iBs.sequence is None:
                 break
             dHeader2Length[ iBs.header ] = iBs.getLength()
     return dHeader2Length
 def dbLengthFilter( len_min, inFileName, verbose=0 ):
     file_db = open( inFileName, "r" )
     file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
     file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
     seq = Bioseq()
     numseq = 0
     nbsave = 0
     
     while True:
         seq.read( file_db )
         if seq.sequence == None:
             break
         l = seq.getLength()
         numseq = numseq + 1
         if l >= len_min:
             seq.write( file_dbSup )
             if verbose > 0:
                     print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
                     nbsave=nbsave+1
         else:
             seq.write( file_dbInf )
             if verbose > 0:
                     print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
                     nbsave=nbsave+1
                     
     file_db.close()
     file_dbInf.close()
     file_dbSup.close()
     if verbose > 0:
         print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
Exemple #8
0
 def extractPatternOfFile(self, pattern, inFileName):
     """
     Add to this object the sequences of a file whose header matches a regular expression.

     @param pattern: regular expression searched in each header (nothing is done if empty)
     @param inFileName: name of the input sequence file
     """
     if pattern == "":
         return
     srch = re.compile(pattern)
     # 'with' closes the handle even if reading raises
     with open(inFileName) as file_db:
         while True:
             seq = Bioseq()
             seq.read(file_db)
             if seq.sequence is None:
                 break
             if srch.search(seq.header):
                 self.add(seq)
 def spliceFromCoords( genomeFile, coordFile, obsFile ):
     """
     Write a copy of a sequence file in which the regions listed in a map file are removed.

     For each sequence whose header is a key of the dictionary returned by
     MapUtils.getDictPerSeqNameFromMapFile(), the regions [getMin(), getMax()]
     of its maps are cut out. Sequences without any map are written unchanged.

     @param genomeFile: name of the input sequence file
     @param coordFile: name of the map file with the regions to splice out
     @param obsFile: name of the output sequence file
     """
     # parse the coordinates first: a problem with the map file no longer leaves an empty output
     dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

     with open( genomeFile, "r" ) as genomeFileHandler:
         with open( obsFile, "w" ) as obsFileHandler:
             while True:
                 bs = Bioseq()
                 bs.read( genomeFileHandler )
                 if bs.sequence is None:
                     break
                 if bs.header in dChr2Maps:
                     lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                     lKeptParts = []
                     currentSite = 0
                     for iMap in lCoords:
                         # assumes map coordinates are 1-based and inclusive -- TODO confirm
                         minSplice = iMap.getMin() - 1
                         if minSplice > currentSite:
                             lKeptParts.append( bs.sequence[ currentSite : minSplice ] )
                         # never move backwards: a map nested in (or overlapping) a previous one
                         # must not re-open a region that was already spliced out
                         currentSite = max( currentSite, iMap.getMax() )
                     lKeptParts.append( bs.sequence[ currentSite : ] )
                     bs.sequence = "".join( lKeptParts )
                 bs.write( obsFileHandler )
 def getSubSequence( self, accession, start, end ):
     """
     Return the part of a stored sequence located between two positions.

     The process exits with status 1 if a position is not strictly positive,
     if the accession is not in self.getAccessionsList(), or if a position is
     larger than the length returned by self.getSeqLengthFromAccession().

     @param accession: accession of the sequence in the table self._table
     @param start: first position
     @param end: last position; when start > end, Bioseq.reverseComplement() is applied to the result
     @return: the sub-sequence as stored in the 'sequence' attribute of a Bioseq
     """
     if not ( start > 0 and end > 0 ):
         print "ERROR with coordinates start=%i or end=%i" % ( start, end )
         sys.exit(1)

     if not accession in self.getAccessionsList():
         print "ERROR: accession '%s' absent from table '%s'" % ( accession, self._table )
         sys.exit(1)

     seqLength = self.getSeqLengthFromAccession( accession )
     if max( start, end ) > seqLength:
         print "ERROR: coordinates start=%i end=%i out of sequence '%s' range (%i bp)" % ( start, end, accession, seqLength )
         sys.exit(1)

     # SUBSTRING takes the leftmost position and the number of characters to return
     firstPos = min( start, end )
     nbChars = abs( end - start ) + 1
     sqlCmd = "SELECT SUBSTRING(sequence,%i,%i) FROM %s WHERE accession='%s'" % ( firstPos, nbChars, self._table, accession )
     self._iDb.execute( sqlCmd )
     lRows = self._iDb.fetchall()

     subSeq = Bioseq()
     subSeq.setSequence( lRows[0][0] )
     if start > end:
         subSeq.reverseComplement()
     return subSeq.sequence
Exemple #11
0
 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
     if verbose > 0:
         print "sort sequences by increasing length"
         sys.stdout.flush()
     if not os.path.exists( inFileName ):
         print "ERROR: file '%s' doesn't exist" % ( inFileName )
         sys.exit(1)
         
     # read each seq one by one
     # save them in distinct temporary files
     # with their length in the name
     inFileHandler = open( inFileName, "r" )
     bs = Bioseq()
     countSeq = 0
     while True:
         bs.read( inFileHandler )
         if bs.header == None:
             break
         countSeq += 1
         tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
         bs.appendBioseqInFile( tmpFile )
         if verbose > 1:
             print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
         bs.header = ""
         bs.sequence = ""
     inFileHandler.close()
     
     # sort temporary file names
     # concatenate them into the output file
     if os.path.exists( outFileName ):
         os.remove( outFileName )
     lFiles = glob.glob( "*bp_*nb" )
     lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
     for fileName in lFiles:
         cmd = "cat %s >> %s" % ( fileName, outFileName )
         returnValue = os.system( cmd )
         if returnValue != 0:
             print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
             sys.exit(1)
         os.remove( fileName )
         
     return 0
Exemple #12
0
 def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
     """
     Copy into a new file the sequences whose header matches none of the patterns of a file.

     The pattern file contains one regular expression per line. Empty lines are
     ignored (an empty pattern would match, hence discard, every sequence).
     The process exits with status 1 if 'patternFileName' is empty.

     @param patternFileName: name of the file with one regular expression per line
     @param inFileName: name of the input sequence file
     @param outFileName: name of the output file (default: inFileName + '.cleaned')
     @param verbose: > 0 prints the patterns and the number of saved sequences, > 1 also prints each saved sequence
     """
     if patternFileName == "":
         print "ERROR: no file of pattern"
         sys.exit(1)

     # first pass: collect all headers (the total is also used in the verbose messages)
     bioseq = Bioseq()
     bioseqNb = 0
     savedBioseqNb = 0
     lHeaders = []
     with open( inFileName, "r" ) as inFile:
         while True:
             bioseq.read( inFile )
             if bioseq.sequence is None:
                 break
             bioseqNb += 1
             lHeaders.append( bioseq.header )

     # a set gives constant-time membership tests in the second pass
     sHeadersToRemove = set()
     with open( patternFileName, "r" ) as patternFile:
         for line in patternFile:
             # strip only the end-of-line: slicing with [:-1] would cut the last
             # character of a final line that has no newline
             pattern = line.rstrip( "\r\n" )
             if verbose > 0:
                 print "pattern: ",pattern; sys.stdout.flush()
             if pattern == "":
                 continue
             patternToSearch = re.compile( pattern )
             for h in lHeaders:
                 if patternToSearch.search(h):
                     sHeadersToRemove.add(h)

     if outFileName == "":
         outFileName = inFileName + '.cleaned'

     # second pass: write the sequences whose header was not flagged
     bioseqNum = 0
     with open( inFileName ) as inFile:
         with open( outFileName, 'w' ) as outFile:
             while True:
                 bioseq.read( inFile )
                 if bioseq.sequence is None:
                     break
                 bioseqNum += 1
                 if bioseq.header not in sHeadersToRemove:
                     bioseq.write( outFile )
                     if verbose > 1:
                         print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
                     savedBioseqNb += 1

     if verbose > 0:
         print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
Exemple #13
0
    def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
        """
        Copy into a new file the sequences whose header matches at least one pattern of a file.

        The pattern file contains one regular expression per line. Empty lines are
        ignored (an empty pattern would match, hence keep, every sequence).
        The process exits with status 1 if 'patternFileName' is empty.

        @param patternFileName: name of the file with one regular expression per line
        @param inFileName: name of the input sequence file
        @param outFileName: name of the output file (default: inFileName + '.extracted')
        @param verbose: > 0 prints the patterns and the number of saved sequences, > 1 also prints each saved sequence
        """
        if patternFileName == "":
            print "ERROR: no file of pattern"
            sys.exit(1)

        # first pass: collect all headers
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []
        with open( inFileName, "r" ) as inFile:
            while True:
                bioseq.read( inFile )
                if bioseq.sequence is None:
                    break
                lHeaders.append( bioseq.header )

        # a set gives constant-time membership tests in the second pass
        sHeadersToKeep = set()
        with open( patternFileName, "r" ) as patternFile:
            for line in patternFile:
                # strip only the end-of-line: slicing with [:-1] would cut the last
                # character of a final line that has no newline
                pattern = line.rstrip( "\r\n" )
                if verbose > 0:
                    print "pattern: ",pattern; sys.stdout.flush()
                if pattern == "":
                    continue
                patternToSearch = re.compile( pattern )
                for h in lHeaders:
                    if patternToSearch.search(h):
                        sHeadersToKeep.add(h)

        if outFileName == "":
            outFileName = inFileName + ".extracted"

        # second pass: write the sequences whose header was flagged
        with open( inFileName, "r" ) as inFile:
            with open( outFileName, "w" ) as outFile:
                while True:
                    bioseq.read(inFile)
                    if bioseq.sequence is None:
                        break
                    bioseqNb += 1
                    if bioseq.header in sHeadersToKeep:
                        bioseq.write(outFile)
                        if verbose > 1:
                            print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
                        savedBioseqNb += 1

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
Exemple #14
0
 def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
     if outFileName == "":
         outFileName = inFileName + ".orf.map"
     outFile = open( outFileName, "w" )
 
     bioseq = Bioseq()
     bioseqNb = 0
 
     inFile = open( inFileName )
     while True:
         bioseq.read( inFile )
         if bioseq.sequence == None:
             break
         bioseq.upCase() 
         bioseqNb += 1
         if verbose > 0:
             print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'
             
         orf = bioseq.findORF()
         bestOrf = []
         for i in orf.keys():
             orfLen = len(orf[i])
             for j in xrange(1, orfLen):
                 start = orf[i][j-1] + 4
                 end = orf[i][j] + 3
                 if end - start >= orfMinLength:
                     bestOrf.append( ( end-start, i+1, start, end ) )
 
         bioseq.complement()
         
         orf = bioseq.findORF()
         seqLen = bioseq.getLength()
         for i in orf.keys():
             orfLen = len(orf[i])
             for j in xrange(1, orfLen):
                 start = seqLen - orf[i][j-1] - 3
                 end = seqLen - orf[i][j] - 2
                 if start - end >= orfMinLength:
                     bestOrf.append( ( start-end, (i+1)*-1, start, end ) )
 
         bestOrf.sort()
         bestOrf.reverse()
         bestOrfNb = len(bestOrf)
         if orfMaxNb > bestOrfNb or orfMaxNb == 0 :
             orfMaxNb = bestOrfNb
         for i in xrange(0, orfMaxNb):
             if verbose > 0:
                 print bestOrf[i]
             outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
                                "|"+str(bestOrf[i][0]),bioseq.header,
                                bestOrf[i][2],bestOrf[i][3]))
 
     inFile.close()
     outFile.close()
 
     return 0
    def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 ):
        """
        Build a consensus sequence from the occurrences of each symbol at each site.

        For each site given by self.getListOccPerSite(), the most frequent non-gap
        symbol is kept (the first one met in case of tie) provided that at least
        'minNbNt' non-gap symbols are present. It is replaced by 'N' when its
        proportion among the non-gap symbols is below 'minPropNt'.
        The process exits with status 1 if there are less than 2 sequences or if
        'minPropNt' is not below 1.0.

        @param minNbNt: minimum number of non-gap symbols for a site to be kept (lowered to size - 1 if too high)
        @param minPropNt: minimum proportion of the predominant symbol, otherwise 'N' is used (0.0: no check)
        @param verbose: > 0 prints summary information and entropy, > 1 also prints details per site
        @return: a Bioseq instance, or None if no consensus can be built (empty, or 40% of N's or more)
        """
        # NOTE: print is always given one parenthesised argument so that the output
        # is the same whether it is parsed as a statement or as a function call

        maxPropN = 0.40  # discard consensus if more than 40% of N's

        nbInSeq = self.getSize()
        if verbose > 0:
            print( "nb of aligned sequences: %i" % ( nbInSeq ) ); sys.stdout.flush()
        if nbInSeq < 2:
            print( "ERROR: can't make a consensus with less than 2 sequences" )
            sys.exit(1)
        if minNbNt >= nbInSeq:
            minNbNt = nbInSeq - 1
            print( "minNbNt=%i" % ( minNbNt ) )
        if minPropNt >= 1.0:
            print( "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt ) )
            sys.exit(1)

        lOccPerSite = self.getListOccPerSite()
        nbSites = len(lOccPerSite)
        if verbose > 0:
            print( "nb of sites: %i" % ( nbSites ) ); sys.stdout.flush()

        lConsensusNt = []

        # for each site (i.e. each column of the MSA)
        nbRmvColumns = 0
        countSites = 0
        for dNt2Occ in lOccPerSite:
            countSites += 1
            if verbose > 1:
                print( "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
                                          nbSites ) )
                sys.stdout.flush()
            occMaxNt = 0   # occurrences of the predominant symbol at this site
            lBestNt = []
            nbNt = 0   # total nb of non-gap symbols at this site

            # for each distinct symbol at this site (A, T, G, C, N, -,...)
            for nt, occ in dNt2Occ.items():
                if nt == "-":
                    continue
                nbNt += occ
                if verbose > 1:
                    print( "%s: %i" % ( nt, occ ) )
                if occ > occMaxNt:
                    occMaxNt = occ
                    lBestNt = [ nt ]
                elif occ == occMaxNt:
                    lBestNt.append( nt )

            if nbNt == 0:   # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
                nbRmvColumns += 1
                # nothing to add for such a site: going on would reuse the symbol
                # of the previous site (or fail on the very first one)
                continue

            bestNt = lBestNt[0]

            # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
            if minPropNt > 0.0 and float(occMaxNt)/float(nbNt) < minPropNt:
                bestNt = "N"

            if int(nbNt) >= int(minNbNt):
                lConsensusNt.append( bestNt )
                if verbose > 1:
                    print( "-> %s" % ( bestNt ) )

        seqConsensus = "".join( lConsensusNt )

        if nbRmvColumns:
            print( "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 ) )
            sys.stdout.flush()

        # checked whatever the number of removed sites: an empty consensus would
        # otherwise cause a division by zero below
        if seqConsensus == "":
            print( "WARNING: no consensus can be built (no sequence left)" )
            return

        propN = seqConsensus.count("N") / float(len(seqConsensus))
        if propN >= maxPropN:
            print( "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 ) )
            return
        elif propN >= maxPropN * 0.5:
            print( "WARNING: %i%% of N's" % ( propN * 100 ) )

        consensus = Bioseq()
        consensus.sequence = seqConsensus
        consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )

        if verbose > 0:
            statEntropy = self.getEntropy( verbose - 1 )
            print( "entropy: %s" % ( statEntropy.stringQuantiles() ) )
            sys.stdout.flush()

        return consensus
 def getATGCNFromIUPAC( self, nt ):
     """
     Return the result of Bioseq.getATGCNFromIUPAC() called on a new, empty Bioseq.

     @param nt: symbol forwarded unchanged to Bioseq.getATGCNFromIUPAC()
     """
     return Bioseq().getATGCNFromIUPAC( nt )
Exemple #17
0
 def __init__(self, name = "", sequence = ""):
     """
     Initialise the Bioseq part of the object, then the attributes specific to this class.

     @param name: forwarded to Bioseq.__init__() as its first argument
     @param sequence: forwarded to Bioseq.__init__() as its second argument
     """
     Bioseq.__init__(self, name, sequence)
     # quality and chunked data start unset
     self.quality = self.chunkedSequence = self.chunkedQuality = None
     # 'name' mirrors the header set by Bioseq.__init__()
     self.name = self.header
 def filterClassifiedConsensus( self ):
     inFile = open( self._inFaFile, "r" )
     outFile = open( self._outFaFile, "w" )
     bs = Bioseq()
     nbInSeq = 0
     nbRmv = 0
     
     if self._classifFile != "":
         dHeader2Classif = self.getClassifPerHeaderOfUnclassifiedConsensus()
         
     while True:
         bs.read( inFile )
         if bs.header == None:
             break
         nbInSeq += 1
         if self._verbose > 1:
             print bs.header
             
         if self._filterSSRs and "SSR" in bs.header and ( self._maxLengthToFilterSSRs == 0 or bs.getLength() <= self._maxLengthToFilterSSRs ):
             nbRmv += 1
             if self._verbose > 1: print "filtered SSR !"
             
         elif self._filterHostGenes and "HostGene" in bs.header:
             nbRmv += 1
             if self._verbose > 1: print "filtered HostGene !"
             
         elif self._filterConfused and "confused" in bs.header and "confusedness=no" not in bs.header:
             nbRmv += 1
             if self._verbose > 1: print "filtered confused !"
             
         elif self._filterNoCat != "0" and "NoCat" in bs.header:
             keep = False
             if "2" in self._filterNoCat:
                 algoMSA = ""
                 for i in ["Map","MAP","Malign","Mafft","Prank","Clustalw","Muscle","Tcoffee"]:
                     if i in bs.header:
                         algoMSA = i
                 nbAlignSeq = int( bs.header.split(algoMSA+"_")[1].split("|")[0] )
                 if nbAlignSeq > self._nbAlignSeqNoCat:
                     keep = True
             if "3" in self._filterNoCat:
                 for header in dHeader2Classif.keys():
                     if header in bs.header:
                         if "no structural features" not in dHeader2Classif[header][6]:
                             keep = True
             if keep:
                 bs.write( outFile )
             else:
                 nbRmv += 1
                 if self._verbose > 1: print "filtered NoCat !"
                 
         elif self._filterIncomplete and "completeness=incomp" in bs.header:
             nbRmv += 1
             if self._verbose > 1: print "filtered incomplete !"
             
         else:
             bs.write( outFile )
             
     inFile.close()
     outFile.close()
     
     if self._verbose > 0:
         print "nb of input seq: %i" % ( nbInSeq )
         print "nb of filtered seq: %i" % ( nbRmv )
         sys.stdout.flush()