Example #1
0
 def spliceFromCoords( genomeFile, coordFile, obsFile ):
     """
     Copy the sequences of 'genomeFile' into 'obsFile', cutting out the
     regions listed in 'coordFile'.

     Sequences are read one at a time with Bioseq.read() until a read yields
     no sequence. For a sequence whose header has entries in the coordinate
     file, each region [min, max] is removed; coordinates are used as
     1-based and inclusive (the part kept before a region stops at index
     min-1 and copying resumes at index max). Overlapping or nested regions
     are removed only once. Sequences without any entry are written as is.

     @param genomeFile: name of the input sequence file (read via Bioseq.read)
     @param coordFile: name of the coordinate file (parsed by
                       MapUtils.getDictPerSeqNameFromMapFile)
     @param obsFile: name of the output file (opened in "w" mode, hence
                     overwritten)
     """
     genomeFileHandler = open( genomeFile, "r" )
     try:
         obsFileHandler = open( obsFile, "w" )
         try:
             dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

             while True:
                 bs = Bioseq()
                 bs.read( genomeFileHandler )
                 if bs.sequence is None:
                     break
                 if bs.header in dChr2Maps:
                     lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                     lKeptParts = []
                     currentSite = 0   # 0-based index of the next base that may be kept
                     for iMap in lCoords:
                         minSplice = iMap.getMin() - 1
                         if minSplice > currentSite:
                             lKeptParts.append( bs.sequence[ currentSite : minSplice ] )
                         # never move backwards: a region nested in (or overlapping)
                         # a previous one must not re-expose bases already removed
                         currentSite = max( currentSite, iMap.getMax() )
                     lKeptParts.append( bs.sequence[ currentSite : ] )
                     bs.sequence = "".join( lKeptParts )
                 bs.write( obsFileHandler )
         finally:
             # close the handlers even if reading, splicing or writing failed
             obsFileHandler.close()
     finally:
         genomeFileHandler.close()
Example #2
0
 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
     """
     Write the sequences of 'inFileName' into 'outFileName', sorted by
     increasing length.

     Each sequence is first saved in its own temporary file, named
     '<length>bp_<rank>nb' and created in the current working directory.
     The temporary files are then appended to the output file from the
     shortest to the longest sequence (sequences of equal length keep their
     input order) and removed.

     @param inFileName: name of the input sequence file (read via Bioseq.read)
     @param outFileName: name of the output file (removed first if it exists)
     @param verbose: verbosity level (>0: progress message, >1: one line per
                     sequence)
     @return: 0 on success; the program exits with status 1 if the input
              file doesn't exist or if the concatenation fails
     """
     import shutil   # local import: only needed for the concatenation step

     if verbose > 0:
         print( "sort sequences by increasing length" )
         sys.stdout.flush()
     if not os.path.exists( inFileName ):
         print( "ERROR: file '%s' doesn't exist" % ( inFileName ) )
         sys.exit(1)

     # read each seq one by one
     # save them in distinct temporary files
     # with their length in the name
     lTmpFiles = []   # tuples ( length, temporary file name ), in input order
     inFileHandler = open( inFileName, "r" )
     try:
         bs = Bioseq()
         countSeq = 0
         while True:
             bs.read( inFileHandler )
             if bs.header is None:
                 break
             countSeq += 1
             seqLength = bs.getLength()
             tmpFile = "%ibp_%inb" % ( seqLength, countSeq )
             # the sequence is appended to the file: drop any leftover of a
             # previous run so that it doesn't end up in the output
             if os.path.exists( tmpFile ):
                 os.remove( tmpFile )
             bs.appendBioseqInFile( tmpFile )
             lTmpFiles.append( ( seqLength, tmpFile ) )
             if verbose > 1:
                 print( "%s (%i bp) saved in '%s'" % ( bs.header, seqLength, tmpFile ) )
             bs.header = ""
             bs.sequence = ""
     finally:
         inFileHandler.close()

     # sort the temporary files by sequence length
     # (only the files created above, not whatever matches in the directory)
     # concatenate them into the output file
     if os.path.exists( outFileName ):
         os.remove( outFileName )
     lTmpFiles.sort( key=lambda t: t[0] )   # stable: ties keep input order
     for seqLength, fileName in lTmpFiles:
         try:
             tmpFileHandler = open( fileName, "rb" )
             try:
                 # append mode, as 'cat file >> output' would do
                 outFileHandler = open( outFileName, "ab" )
                 try:
                     shutil.copyfileobj( tmpFileHandler, outFileHandler )
                 finally:
                     outFileHandler.close()
             finally:
                 tmpFileHandler.close()
         except ( IOError, OSError ):
             print( "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName ) )
             sys.exit(1)
         os.remove( fileName )

     return 0
    def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 ):
        """
        Build a consensus sequence from the per-site symbol counts of the
        alignment.

        For each site (column), the non-gap symbol with the highest count is
        chosen (the first one met in case of a tie). It is replaced by "N"
        when 'minPropNt' is positive and the symbol accounts for less than
        that proportion of the non-gap symbols. The site contributes to the
        consensus only if it has at least 'minNbNt' non-gap symbols. Sites
        without any non-gap symbol are skipped and reported as removed.

        @param minNbNt: minimum number of non-gap symbols required at a site;
                        lowered to (number of sequences - 1) when too high
        @param minPropNt: minimum proportion of the predominant symbol among
                          the non-gap symbols of a site (0.0 disables it)
        @param verbose: verbosity level
        @return: a Bioseq instance holding the consensus, or None when no
                 consensus can be built (no site kept, or at least 40% of
                 N's); the program exits with status 1 if there are fewer
                 than 2 sequences or if minPropNt >= 1.0
        """

        maxPropN = 0.40  # discard consensus if at least 40% of N's

        nbInSeq = self.getSize()
        if verbose > 0:
            print( "nb of aligned sequences: %i" % ( nbInSeq ) ); sys.stdout.flush()
        if nbInSeq < 2:
            print( "ERROR: can't make a consensus with less than 2 sequences" )
            sys.exit(1)
        if minNbNt >= nbInSeq:
            minNbNt = nbInSeq - 1
            print( "minNbNt=%i" % ( minNbNt ) )
        if minPropNt >= 1.0:
            print( "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt ) )
            sys.exit(1)

        lOccPerSite = self.getListOccPerSite()
        nbSites = len(lOccPerSite)
        if verbose > 0:
            print( "nb of sites: %i" % ( nbSites ) ); sys.stdout.flush()

        lConsensusNt = []   # one symbol per retained site, joined at the end

        # for each site (i.e. each column of the MSA)
        nbRmvColumns = 0
        countSites = 0
        for dNt2Occ in lOccPerSite:
            countSites += 1
            if verbose > 1:
                print( "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
                                          nbSites ) )
                sys.stdout.flush()
            occMaxNt = 0   # occurrences of the predominant symbol at this site
            lBestNt = []
            nbNt = 0   # total nb of non-gap symbols at this site

            # for each distinct symbol at this site (A, T, G, C, N, -,...)
            for symbol, occ in dNt2Occ.items():
                if symbol != "-":
                    nbNt += occ
                    if verbose > 1:
                        print( "%s: %i" % ( symbol, occ ) )
                    if occ > occMaxNt:
                        occMaxNt = occ
                        lBestNt = [ symbol ]
                    elif occ == occMaxNt:
                        lBestNt.append( symbol )

            if nbNt == 0:   # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
                # nothing but gaps here: there is no symbol to choose, so the
                # site must not contribute to the consensus
                nbRmvColumns += 1
                continue

            bestNt = lBestNt[0]

            # if the predominant symbol occurs in less than x% of the non-gap symbols, put a "N"
            if minPropNt > 0.0 and float(occMaxNt)/float(nbNt) < minPropNt:
                bestNt = "N"

            if int(nbNt) >= int(minNbNt):
                lConsensusNt.append( bestNt )
                if verbose > 1:
                    print( "-> %s" % ( bestNt ) )

        seqConsensus = "".join( lConsensusNt )

        if nbRmvColumns:
            print( "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 ) )
            sys.stdout.flush()

        # checked whether or not columns were removed: the consensus is also
        # empty when every site has fewer than minNbNt symbols, and the
        # proportion of N's below would then divide by zero
        if seqConsensus == "":
            print( "WARNING: no consensus can be built (no sequence left)" )
            return None

        propN = seqConsensus.count("N") / float(len(seqConsensus))
        if propN >= maxPropN:
            print( "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 ) )
            return None
        elif propN >= maxPropN * 0.5:
            print( "WARNING: %i%% of N's" % ( propN * 100 ) )

        consensus = Bioseq()
        consensus.sequence = seqConsensus
        consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )

        if verbose > 0:
            statEntropy = self.getEntropy( verbose - 1 )
            print( "entropy: %s" % ( statEntropy.stringQuantiles() ) )
            sys.stdout.flush()

        return consensus