Python Bioseq.sequence Examples

Programming Language: Python

Namespace/Package Name: pyRepetUnit.commons.seq.Bioseq

Class/Type: Bioseq

Method/Function: sequence

Examples at hotexamples.com: 3

Python Bioseq.sequence - 3 examples found. These are the top-rated real-world Python examples of pyRepetUnit.commons.seq.Bioseq.Bioseq.sequence, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

read(14)

write(7)

getLength(6)

sequence(3)

header(2)

__init__(1)

appendBioseqInFile(1)

complement(1)

findORF(1)

getATGCNFromIUPAC(1)

reverseComplement(1)

setSequence(1)

upCase(1)

Example #1

Show file

File: FastaUtils.py Project: chungtseng/HCPU_midterm

 def spliceFromCoords( genomeFile, coordFile, obsFile ):
     """
     Copy the sequences of 'genomeFile' into 'obsFile', removing from each
     sequence the regions listed for it in 'coordFile'.

     Sequences are read one at a time with Bioseq.read(). A sequence whose
     header has no entry in the coordinate dictionary is written unchanged.
     Map coordinates are treated as 1-based and inclusive: for each map the
     slice kept before it ends at getMin() - 1 and the next kept slice starts
     at getMax().

     @param genomeFile: path of the input sequence file (opened for reading)
     @param coordFile: path of the map file, parsed by
                       MapUtils.getDictPerSeqNameFromMapFile()
     @param obsFile: path of the output file (created or overwritten)
     """
     # 'with' guarantees both handles are closed even if reading the maps,
     # parsing a sequence or writing fails half-way
     with open( genomeFile, "r" ) as genomeFileHandler:
         with open( obsFile, "w" ) as obsFileHandler:
             dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

             while True:
                 bs = Bioseq()
                 bs.read( genomeFileHandler )
                 if bs.sequence is None:
                     break
                 if bs.header in dChr2Maps:
                     lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                     lKeptParts = []   # pieces of the sequence that survive the splicing
                     currentSite = 0   # 0-based index of the first base not yet handled
                     for iMap in lCoords:
                         minSplice = iMap.getMin() - 1
                         if minSplice > currentSite:
                             lKeptParts.append( bs.sequence[ currentSite : minSplice ] )
                         # never move backwards: a map nested in (or overlapping)
                         # a previous one must not re-expose bases already removed
                         currentSite = max( currentSite, iMap.getMax() )
                     lKeptParts.append( bs.sequence[ currentSite : ] )
                     bs.sequence = "".join( lKeptParts )
                 bs.write( obsFileHandler )

Example #2

Show file

File: FastaUtils.py Project: chungtseng/HCPU_midterm

 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
     """
     Write the sequences of 'inFileName' into 'outFileName', sorted by
     increasing length.

     Each sequence is first saved in its own temporary file in the current
     working directory (named '<length>bp_<rank>nb'), so that only one
     sequence is handled at a time. The temporary files are then
     concatenated into the output file in increasing order of length and
     removed. Sequences of equal length keep their order of appearance in
     the input file.

     @param inFileName: path of the input sequence file
     @param outFileName: path of the output file (removed first if it exists)
     @param verbose: verbosity level (0: silent, 1: main steps, 2: one line
                     per sequence)
     @return: 0 on success; the process exits with status 1 if the input
              file is missing or if the concatenation fails
     """
     # local import so that the function does not rely on a file-level import
     import shutil

     if verbose > 0:
         print( "sort sequences by increasing length" )
         sys.stdout.flush()
     if not os.path.exists( inFileName ):
         print( "ERROR: file '%s' doesn't exist" % ( inFileName ) )
         sys.exit(1)

     # read each seq one by one
     # save them in distinct temporary files
     # with their length in the name
     lTmpFiles = []   # list of ( length, temporary file name ), in input order
     bs = Bioseq()
     countSeq = 0
     with open( inFileName, "r" ) as inFileHandler:
         while True:
             bs.read( inFileHandler )
             if bs.header is None:
                 break
             countSeq += 1
             seqLength = bs.getLength()
             tmpFile = "%ibp_%inb" % ( seqLength, countSeq )
             # a left-over file from an interrupted run would otherwise be
             # appended to, duplicating or corrupting the record
             if os.path.exists( tmpFile ):
                 os.remove( tmpFile )
             bs.appendBioseqInFile( tmpFile )
             lTmpFiles.append( ( seqLength, tmpFile ) )
             if verbose > 1:
                 print( "%s (%i bp) saved in '%s'" % ( bs.header, seqLength, tmpFile ) )
             bs.header = ""
             bs.sequence = ""

     # sort temporary file names
     # concatenate them into the output file
     if os.path.exists( outFileName ):
         os.remove( outFileName )
     # only the files created above are used (no glob on the working
     # directory); list.sort() is stable, so ties keep the input order
     lTmpFiles.sort( key=lambda lengthAndName: lengthAndName[0] )
     for seqLength, fileName in lTmpFiles:
         try:
             # byte-for-byte append, without going through a shell
             with open( fileName, "rb" ) as tmpFileHandler:
                 with open( outFileName, "ab" ) as outFileHandler:
                     shutil.copyfileobj( tmpFileHandler, outFileHandler )
         except ( IOError, OSError ):
             print( "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName ) )
             sys.exit(1)
         os.remove( fileName )

     return 0

Example #3

Show file

File: AlignedBioseqDB.py Project: chungtseng/HCPU_midterm

    def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 ):
        """
        Build a consensus sequence from the aligned sequences, one site
        (column of the alignment) at a time.

        For each site, the symbol other than the gap '-' with the most
        occurrences is chosen (the first one found in case of a tie). The
        site contributes to the consensus only if it has at least 'minNbNt'
        non-gap symbols. Sites with no non-gap symbol at all are skipped and
        counted as removed.

        @param minNbNt: minimum number of non-gap symbols required at a site;
                        lowered to (number of sequences - 1) if it is not
                        below the number of sequences
        @param minPropNt: if > 0, a site whose predominant symbol represents
                          less than this proportion of the non-gap symbols is
                          reported as 'N'; must be below 1.0
        @param verbose: verbosity level (0: warnings only, 1: summary,
                        2: details for each site)
        @return: a Bioseq instance holding the consensus, or None if no
                 consensus can be built (empty consensus, or proportion of
                 N's >= 40%); the process exits with status 1 if there are
                 fewer than 2 sequences or if minPropNt >= 1.0
        """

        maxPropN = 0.40  # discard consensus if at least 40% of N's

        nbInSeq = self.getSize()
        if verbose > 0:
            print( "nb of aligned sequences: %i" % ( nbInSeq ) )
            sys.stdout.flush()
        if nbInSeq < 2:
            print( "ERROR: can't make a consensus with less than 2 sequences" )
            sys.exit(1)
        if minNbNt >= nbInSeq:
            minNbNt = nbInSeq - 1
            print( "minNbNt=%i" % ( minNbNt ) )
        if minPropNt >= 1.0:
            print( "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt ) )
            sys.exit(1)

        lOccPerSite = self.getListOccPerSite()
        nbSites = len(lOccPerSite)
        if verbose > 0:
            print( "nb of sites: %i" % ( nbSites ) )
            sys.stdout.flush()

        lConsensusNt = []   # one symbol per retained site, joined at the end

        # for each site (i.e. each column of the MSA)
        nbRmvColumns = 0
        for countSites, dNt2Occ in enumerate( lOccPerSite, 1 ):
            if verbose > 1:
                print( "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
                                          nbSites ) )
                sys.stdout.flush()
            occMaxNt = 0   # occurrences of the predominant nucleotide at this site
            lBestNt = []
            nbNt = 0   # total nb of non-gap symbols at this site

            # for each distinct symbol at this site (A, T, G, C, N, -,...)
            for nt, occ in dNt2Occ.items():
                if nt == "-":
                    continue
                nbNt += occ
                if verbose > 1:
                    print( "%s: %i" % ( nt, occ ) )
                if occ > occMaxNt:
                    occMaxNt = occ
                    lBestNt = [ nt ]
                elif occ == occMaxNt:
                    lBestNt.append( nt )

            if nbNt == 0:   # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
                # nothing to vote on: skip the site instead of re-using the
                # symbol chosen for the previous site (or an unbound name)
                nbRmvColumns += 1
                continue

            # nbNt > 0 guarantees that at least one symbol was recorded
            bestNt = lBestNt[0]

            # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
            if minPropNt > 0.0 and float(occMaxNt)/float(nbNt) < minPropNt:
                bestNt = "N"

            if int(nbNt) >= int(minNbNt):
                lConsensusNt.append( bestNt )
                if verbose > 1:
                    print( "-> %s" % ( bestNt ) )

        seqConsensus = "".join( lConsensusNt )

        if nbRmvColumns:
            print( "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 ) )
            sys.stdout.flush()

        # checked whether or not columns were removed: the consensus is also
        # empty when every site is below minNbNt or when there is no site,
        # and the proportion of N's below would divide by zero
        if seqConsensus == "":
            print( "WARNING: no consensus can be built (no sequence left)" )
            return None

        propN = seqConsensus.count("N") / float(len(seqConsensus))
        if propN >= maxPropN:
            print( "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 ) )
            return None
        elif propN >= maxPropN * 0.5:
            print( "WARNING: %i%% of N's" % ( propN * 100 ) )

        consensus = Bioseq()
        consensus.sequence = seqConsensus
        consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )

        if verbose > 0:
            statEntropy = self.getEntropy( verbose - 1 )
            print( "entropy: %s" % ( statEntropy.stringQuantiles() ) )
            sys.stdout.flush()

        return consensus