Exemple #1
0
def getParms ():                       # use default input sys.argv[1:]
    '''Parse the command line and return the tuple (opt, args).

    opt is the optparse values object; args is the list of remaining
    positional arguments, whose first element is the bas file named in the
    usage string.

    --linehelp and --filehelp print their help text (through lineHelp() and
    fileHelp()) and then exit. For backward compatibility, when more than one
    positional argument is given the last one is taken to be the alignments
    cmp.h5 file: it is removed from args and stored in opt.aln (replacing any
    --aln value), and a warning is logged.

    Exits with a usage error when no bas file is given.
    '''

    parser = optparse.OptionParser(usage='%prog [options] <bas_file>')

    parser.add_option ('--linehelp', action='store_true', help='show line-specific help and exit')
    parser.add_option ('--filehelp', action='store_true', help='show help for file-related parameters, and exit')
    parser.add_option ('--ccs',                  help='directory containing ccs.h5 files for CCS reads, post-2.1.0')
    parser.add_option ('--aln',                  help='cmp.h5 file for subread alignments')
    parser.add_option ('--alnccs',               help='cmp.h5 file for CCS alignments')
    parser.add_option ('--score',    type='int', help='minimum HQ region score (def: %default)')
    parser.add_option ('--length',   type='int', help='minimum HQ region length (def: %default)')
    parser.add_option ('--adapter',  type='int', help='expected adapter length (def: %default)')
    parser.add_option ('--noadapt',  action='store_true', help='do not print adapter lines (for brevity)')
    parser.add_option ('--nocons',   action='store_true', help='do not print consensus passes lines')

    parser.set_defaults (score=DEF_SCORE_THRESHOLD,
                         length=DEF_HQ_LENGTH,
                         adapter=DEF_ADAPTER_LENGTH)

    opt, args = parser.parse_args()

    # The two help switches may be combined; print whatever was asked for, then stop.
    if opt.linehelp:
        lineHelp()

    if opt.filehelp:
        fileHelp()

    if opt.linehelp or opt.filehelp:
        sys.exit()

    # Callers index args[0] unconditionally, so fail here with a usage
    # message rather than letting them crash with an IndexError.
    if not args:
        parser.error ('no bas file specified')

    if len(args) > 1:
        logger.warning ('WARNING: alignments cmp.h5 file should now be specified with --aln keyword')
        opt.aln = args.pop()      # put it where it belongs

    return opt, args
Exemple #2
0
    def findCCSFile(self):
        """Attach consensus (CCS) data for this bax file, if any can be found.

        The bax file itself is checked first (older files contain their own
        PulseData/ConsensusBaseCalls group). Failing that, and if a CCS
        directory was supplied, a file with the same base name but with
        "bax" replaced by "ccs" is looked for in that directory.

        On success _consBasecalls, _consZMW and _consPasses are set and
        _hasConsensus becomes True; otherwise _hasConsensus is left False
        and a message is logged.
        """

        self._hasConsensus = False  # pessimistic default

        ccsSource = None     # HDF5 object the consensus datasets are read from
        externalPath = None  # path of a separate ccs.h5 file, when one is used

        if "PulseData/ConsensusBaseCalls" in self._top:
            # older bax file: the consensus data lives in the file itself
            ccsSource = self._top

        elif self._CCSDir is None:
            logger.info("BaxFile %s does not contain CCS data (rel 2.1.0 and later). Use --ccs" % self._shortName)

        else:
            candidate = os.path.join(self._CCSDir,
                                     os.path.basename(self._filename).replace("bax", "ccs"))

            if os.path.exists(candidate):
                self._CCSFile = h5py.File(candidate, "r")
                ccsSource = self._CCSFile
                externalPath = candidate
            else:
                logger.warning("%s: no CCS file found corresponding to %s" % (self._shortName, self._filename))

        if ccsSource is None:
            return

        self._consBasecalls = ccsSource["PulseData/ConsensusBaseCalls"]
        self._consZMW = ccsSource["PulseData/ConsensusBaseCalls/ZMW"]
        self._consPasses = ccsSource["PulseData/ConsensusBaseCalls/Passes"]
        self._hasConsensus = True

        if externalPath is not None:
            logger.debug("BaxFile %s found CCS file %s" % (self._shortName, externalPath))
Exemple #3
0
def makeTempDir (dir):
    '''Create the temp directory dir, including any missing parent directories.

    If the directory already exists a warning is logged and nothing else is
    done. Any other failure (e.g. the path exists but is not a directory, or
    permission is denied) is raised as OSError.
    '''

    import errno                 # local import: only needed to classify the OSError

    # Attempt the creation directly instead of testing first: a separate
    # isdir() check leaves a window in which another process can create the
    # directory and make makedirs() fail.
    try:
        os.makedirs (dir)
    except OSError as err:
        if err.errno != errno.EEXIST or not os.path.isdir (dir):
            raise
        logger.warning('WARNING: temp directory %s already exists' % dir)
Exemple #4
0
def makeTempDir(dir):
    """Create the temp directory dir, including any missing parents.

    An already existing directory is not an error: a warning is logged and
    the directory is left as it is. Every other failure (the path exists but
    is not a directory, permission denied, ...) propagates as OSError.
    """
    import errno  # local import: only needed to classify the OSError

    # Create first and inspect the failure afterwards; testing with isdir()
    # beforehand would race with other processes creating the same path.
    try:
        os.makedirs(dir)
    except OSError as err:
        already_there = err.errno == errno.EEXIST and os.path.isdir(dir)
        if not already_there:
            raise
        logger.warning('WARNING: temp directory %s already exists' % dir)
Exemple #5
0
def getParms():  # use default input sys.argv[1:]
    """Parse the command line and return the tuple (opt, args).

    opt is the optparse values object; args is the list of remaining
    positional arguments, whose first element is the bas file named in the
    usage string.

    --linehelp and --filehelp print their help text (through lineHelp() and
    fileHelp()) and then exit. For backward compatibility, when more than one
    positional argument is given the last one is taken to be the alignments
    cmp.h5 file: it is removed from args and stored in opt.aln (replacing any
    --aln value), and a warning is logged.

    Exits with a usage error when no bas file is given.
    """

    parser = optparse.OptionParser(usage='%prog [options] <bas_file>')

    parser.add_option('--linehelp',
                      action='store_true',
                      help='show line-specific help and exit')
    parser.add_option('--filehelp',
                      action='store_true',
                      help='show help for file-related parameters, and exit')
    parser.add_option(
        '--ccs',
        help='directory containing ccs.h5 files for CCS reads, post-2.1.0')
    parser.add_option('--aln', help='cmp.h5 file for subread alignments')
    parser.add_option('--alnccs', help='cmp.h5 file for CCS alignments')
    parser.add_option('--score',
                      type='int',
                      help='minimum HQ region score (def: %default)')
    parser.add_option('--length',
                      type='int',
                      help='minimum HQ region length (def: %default)')
    parser.add_option('--adapter',
                      type='int',
                      help='expected adapter length (def: %default)')
    parser.add_option('--noadapt',
                      action='store_true',
                      help='do not print adapter lines (for brevity)')
    parser.add_option('--nocons',
                      action='store_true',
                      help='do not print consensus passes lines')

    parser.set_defaults(score=DEF_SCORE_THRESHOLD,
                        length=DEF_HQ_LENGTH,
                        adapter=DEF_ADAPTER_LENGTH)

    opt, args = parser.parse_args()

    # The two help switches may be combined; print what was asked for, then stop.
    if opt.linehelp:
        lineHelp()

    if opt.filehelp:
        fileHelp()

    if opt.linehelp or opt.filehelp:
        sys.exit()

    # Callers index args[0] unconditionally, so fail here with a usage
    # message rather than letting them crash with an IndexError.
    if not args:
        parser.error('no bas file specified')

    if len(args) > 1:
        logger.warning(
            'WARNING: alignments cmp.h5 file should now be specified with --aln keyword'
        )
        opt.aln = args.pop()  # put it where it belongs

    return opt, args
Exemple #6
0
def getGeneFromAnnotation (opt, tranList, exonList):
    '''Add to lists of transcripts and exons: annotations for gene of interest.

    opt supplies gtf (annotation file name, or None for "no annotations"),
    omit (comma-separated transcript names to leave out, or None),
    annotations (an already loaded annotation list, if any), format
    ('pickle', 'alt' or anything else for the standard format) and gene
    (the name of the gene to extract).

    One Transcript is appended to tranList for every transcript of the gene
    that is not in the omit list, and one Exon is appended to exonList (and
    to the transcript's own exons list) for each of its exons. When the gene
    occurs more than once in the annotations, only the first occurrence is
    used and a warning is logged.

    Returns the tuple (tranList, exonList); both lists are modified in place.
    Raises RuntimeError if the gene is not in the annotations.
    '''

    if opt.gtf is None:
        return tranList, exonList

    omits = [] if opt.omit is None else opt.omit.split(',')            # transcripts which must not be included

    if opt.annotations:                 # caller already has the annotations loaded
        annotList = opt.annotations
    elif opt.format == 'pickle':
        annotList   = anno.AnnotationList.fromPickle (opt.gtf)
    elif opt.format == 'alt':
        annotList   = anno.AnnotationList (opt.gtf, altFormat=True)
    else:     # standard format
        annotList   = anno.AnnotationList (opt.gtf)

    allGenes = annotList.getGeneDict()
    if opt.gene not in allGenes:
        raise RuntimeError ('gene %s is not in the annotation file' % opt.gene)
    geneList = allGenes[opt.gene]       # a list of Annotation objects
    if len(geneList) > 1:
        logger.warning('gene %s appears %d times in annotations, first occurrence plotted' \
                           % (opt.gene, len(geneList)))
    myGene = geneList[0]

    for tran in myGene.getChildren():                       # tran is an Annotation object

        if tran.name in omits:                              # on the ignore list
            continue

        myTran = Transcript(tran.name, start=tran.start, end=tran.end, annot=True, ID=tran.ID)

        # Codon positions are optional attributes; copy them only when present.
        if hasattr(tran, 'startcodon'):
            myTran.startcodon = tran.startcodon
        if hasattr(tran, 'stopcodon'):
            myTran.stopcodon = tran.stopcodon

        for exon in tran.getChildren():                     # exon is an Annotation object
            myExon = Exon(myTran, exon.name, exon.start, exon.end, exon.strand)     # no Q score
            if hasattr (exon, 'polyAs'):
                myExon.polyAs = exon.polyAs
            exonList.append (myExon)
            myTran.exons.append(myExon)

        tranList.append (myTran)

    return tranList, exonList
Exemple #7
0
def getGeneFromAnnotation (opt, tranList, exonList):
    '''Add to lists of transcripts and exons: annotations for gene of interest.

    opt supplies gtf (annotation file name, or None for "no annotations"),
    omit (comma-separated transcript names to leave out, or None), format
    ('pickle', 'alt' or anything else for the standard format) and gene
    (the name of the gene to extract).

    One Transcript is appended to tranList for every transcript of the gene
    that is not in the omit list, and one Exon is appended to exonList (and
    to the transcript's own exons list) for each of its exons. When the gene
    occurs more than once in the annotations, only the first occurrence is
    used and a warning is logged.

    Returns the tuple (tranList, exonList); both lists are modified in place.
    Raises RuntimeError if the gene is not in the annotations.
    '''

    if opt.gtf is None:
        return tranList, exonList

    omits = [] if opt.omit is None else opt.omit.split(',')            # transcripts which must not be included

    if opt.format == 'pickle':
        annotList   = anno.AnnotationList.fromPickle (opt.gtf)
    elif opt.format == 'alt':
        annotList   = anno.AnnotationList (opt.gtf, altFormat=True)
    else:     # standard format
        annotList   = anno.AnnotationList (opt.gtf)

    allGenes = annotList.getGeneDict()
    if opt.gene not in allGenes:
        raise RuntimeError ('gene %s is not in the annotation file' % opt.gene)
    geneList = allGenes[opt.gene]       # a list of Annotation objects
    if len(geneList) > 1:
        logger.warning('gene %s appears %d times in annotations, first occurrence plotted' \
                           % (opt.gene, len(geneList)))
    myGene = geneList[0]

    for tran in myGene.getChildren():                       # tran is an Annotation object

        if tran.name in omits:                              # on the ignore list
            continue

        myTran = Transcript(tran.name, annot=True)

        # Codon positions are optional attributes; copy them only when present.
        if hasattr(tran, 'startcodon'):
            myTran.startcodon = tran.startcodon
        if hasattr(tran, 'stopcodon'):
            myTran.stopcodon = tran.stopcodon

        for exon in tran.getChildren():                     # exon is an Annotation object
            myExon = Exon(myTran, exon.name, exon.start, exon.end, exon.strand)     # no Q score
            if hasattr (exon, 'polyAs'):
                myExon.polyAs = exon.polyAs
            exonList.append (myExon)
            myTran.exons.append(myExon)

        tranList.append (myTran)

    return tranList, exonList
Exemple #8
0
def getGeneFromAnnotation(opt, tranList, exonList):
    """Add to lists of transcripts and exons: annotations for gene of interest.

    opt supplies gtf (annotation file name, or None for "no annotations"),
    annotations (an already loaded annotation list, if any), format
    ('pickle', 'alt' or anything else for the standard format) and gene
    (the name of the gene to extract).

    The gene is looked up by its exact name first; failing that, opt.gene is
    compared with the upper-cased gene names, so an all-capitals spelling of
    a mixed-case gene name is accepted. The gene dictionary obtained from
    the annotation list is not modified.

    One Transcript is appended to tranList per transcript of the gene, and
    one Exon is appended to exonList (and to the transcript's own exons
    list) for each of its exons. When the gene occurs more than once in the
    annotations, only the first occurrence is used and a warning is logged.

    Returns the tuple (tranList, exonList); both lists are modified in place.
    Raises RuntimeError if the gene is not in the annotations.
    """
    if opt.gtf is None:
        return tranList, exonList
    if opt.annotations:  # caller already has the annotations loaded
        annotList = opt.annotations
    elif opt.format == 'pickle':
        annotList = anno.AnnotationList.fromPickle(opt.gtf)
    elif opt.format == 'alt':
        annotList = anno.AnnotationList(opt.gtf, altFormat=True)
    else:  # standard format
        annotList = anno.AnnotationList(opt.gtf)
    allGenes = annotList.getGeneDict()
    if opt.gene in allGenes:
        geneList = allGenes[opt.gene]  # a list of Annotation objects
    else:
        # Fall back to upper-cased names. The aliases live in a separate
        # dictionary so that they can neither leak into the annotation
        # list's own data nor replace the entry of an exactly named gene.
        upperNames = {k.upper(): v for k, v in allGenes.items()}
        if opt.gene not in upperNames:
            raise RuntimeError('gene %s is not in the annotation file' %
                               opt.gene)
        geneList = upperNames[opt.gene]
    if len(geneList) > 1:
        logger.warning(
            'gene %s appears %d times in annotations, first occurrence plotted'
            % (opt.gene, len(geneList)))
    myGene = geneList[0]
    for tran in myGene.getChildren():  # tran is an Annotation object
        myTran = Transcript(tran.name,
                            start=tran.start,
                            end=tran.end,
                            annot=True,
                            ID=tran.ID,
                            source=(0, opt.gtf))
        # Codon positions are optional attributes; copy them only when present.
        if hasattr(tran, 'startcodon'):
            myTran.startcodon = tran.startcodon
        if hasattr(tran, 'stopcodon'):
            myTran.stopcodon = tran.stopcodon
        for exon in tran.getChildren():  # exon is an Annotation object
            myExon = Exon(myTran, exon.name, exon.start, exon.end,
                          exon.strand)  # no Q score
            if hasattr(exon, 'polyAs'):
                myExon.polyAs = exon.polyAs
            exonList.append(myExon)
            myTran.exons.append(myExon)
        tranList.append(myTran)
    return tranList, exonList
Exemple #9
0
    def findCCSFile(self):
        '''Attach consensus (CCS) data for this bax file, if any can be found.

        The bax file itself is checked first (older files contain their own
        PulseData/ConsensusBaseCalls group). Failing that, and if a CCS
        directory was supplied, a file with the same base name but with
        "bax" replaced by "ccs" is looked for in that directory.

        On success _consBasecalls, _consZMW and _consPasses are set and
        _hasConsensus becomes True; otherwise _hasConsensus is left False
        and a message is logged.
        '''

        self._hasConsensus = False  # pessimistic default

        container = None  # HDF5 object holding the consensus datasets
        foundPath = None  # set only when a separate ccs.h5 file is opened

        if "PulseData/ConsensusBaseCalls" in self._top:
            # older bax file: the consensus data lives in the file itself
            container = self._top

        elif self._CCSDir is None:
            logger.info(
                'BaxFile %s does not contain CCS data (rel 2.1.0 and later). Use --ccs'
                % self._shortName)

        else:
            baseName = os.path.basename(self._filename)
            ccsPath = os.path.join(self._CCSDir, baseName.replace('bax', 'ccs'))

            if not os.path.exists(ccsPath):
                logger.warning('%s: no CCS file found corresponding to %s' %
                               (self._shortName, self._filename))
            else:
                self._CCSFile = h5py.File(ccsPath, 'r')
                container = self._CCSFile
                foundPath = ccsPath

        if container is not None:
            self._consBasecalls = container["PulseData/ConsensusBaseCalls"]
            self._consZMW = container["PulseData/ConsensusBaseCalls/ZMW"]
            self._consPasses = container["PulseData/ConsensusBaseCalls/Passes"]
            self._hasConsensus = True

            if foundPath is not None:
                logger.debug('BaxFile %s found CCS file %s' %
                             (self._shortName, foundPath))
Exemple #10
0
def getParms ():                       # use default input sys.argv[1:]
    '''Parse the command line and return the tuple (opt, args).

    opt is the optparse values object; args is the list of remaining
    positional arguments, whose first element is the bas file named in the
    usage string.

    For backward compatibility, when more than one positional argument is
    given the last one is taken to be the alignments cmp.h5 file: it is
    removed from args and stored in opt.aln (replacing any --aln value), and
    a warning is logged.

    Exits with a usage error when no bas file is given.
    '''

    parser = optparse.OptionParser(usage='%prog [options] <bas_file> [<cmp_file>]',
                                   description='Print (to stdout) summary information about the contents of a bas.h5 file.')

    parser.add_option ('--ccs',                help='directory containing ccs.h5 files for CCS reads, post-2.1.0')
    parser.add_option ('--aln',                help='cmp.h5 file for subread alignments')
    parser.add_option ('--score',  type='int', help='Minimum HQ region score (def: %default)')
    parser.add_option ('--length', type='int', help='Minimum HQ region length (def: %default)')
    parser.add_option ('--insert', type='int', help='Minimum average insert length (def: %default)')

    parser.set_defaults (score=DEF_SCORE_THRESHOLD,
                         length=DEF_HQ_LENGTH,
                         insert=DEF_INSERT_THRESHOLD)

    opt, args = parser.parse_args()

    # Callers index args[0] unconditionally, so fail here with a usage
    # message rather than letting them crash with an IndexError.
    if not args:
        parser.error ('no bas file specified')

    if len(args) > 1:
        logger.warning ('WARNING: alignments cmp.h5 file should now be specified with --aln keyword')
        opt.aln = args.pop()      # put it where it belongs

    return opt, args
Exemple #11
0
def getGeneFromAnnotation(opt, tranList, exonList):
    """Add to lists of transcripts and exons: annotations for gene of interest.

    opt supplies gtf (annotation file name, or None for "no annotations"),
    annotations (an already loaded annotation list, if any), format
    ('pickle', 'alt' or anything else for the standard format) and gene
    (the name of the gene to extract).

    The gene is looked up by its exact name first; failing that, opt.gene is
    compared with the upper-cased gene names, so an all-capitals spelling of
    a mixed-case gene name is accepted. The gene dictionary obtained from
    the annotation list is not modified.

    One Transcript is appended to tranList per transcript of the gene, and
    one Exon is appended to exonList (and to the transcript's own exons
    list) for each of its exons. When the gene occurs more than once in the
    annotations, only the first occurrence is used and a warning is logged.

    Returns the tuple (tranList, exonList); both lists are modified in place.
    Raises RuntimeError if the gene is not in the annotations.
    """
    if opt.gtf is None:
        return tranList, exonList
    if opt.annotations:       # caller already has the annotations loaded
        annotList = opt.annotations
    elif opt.format == 'pickle':
        annotList = anno.AnnotationList.fromPickle(opt.gtf)
    elif opt.format == 'alt':
        annotList = anno.AnnotationList(opt.gtf, altFormat=True)
    else:     # standard format
        annotList = anno.AnnotationList(opt.gtf)
    allGenes = annotList.getGeneDict()
    if opt.gene in allGenes:
        geneList = allGenes[opt.gene]       # a list of Annotation objects
    else:
        # Fall back to upper-cased names. The aliases live in a separate
        # dictionary so that they can neither leak into the annotation
        # list's own data nor replace the entry of an exactly named gene.
        upperNames = {k.upper(): v for k, v in allGenes.items()}
        if opt.gene not in upperNames:
            raise RuntimeError('gene %s is not in the annotation file' % opt.gene)
        geneList = upperNames[opt.gene]
    if len(geneList) > 1:
        logger.warning('gene %s appears %d times in annotations, first occurrence plotted'
                       % (opt.gene, len(geneList)))
    myGene = geneList[0]
    for tran in myGene.getChildren():            # tran is an Annotation object
        myTran = Transcript(tran.name, start=tran.start, end=tran.end,
                            annot=True, ID=tran.ID, source=(0, opt.gtf))
        # Codon positions are optional attributes; copy them only when present.
        if hasattr(tran, 'startcodon'):
            myTran.startcodon = tran.startcodon
        if hasattr(tran, 'stopcodon'):
            myTran.stopcodon = tran.stopcodon
        for exon in tran.getChildren():      # exon is an Annotation object
            myExon = Exon(myTran, exon.name, exon.start,
                          exon.end, exon.strand)     # no Q score
            if hasattr(exon, 'polyAs'):
                myExon.polyAs = exon.polyAs
            exonList.append(myExon)
            myTran.exons.append(myExon)
        tranList.append(myTran)
    return tranList, exonList
Exemple #12
0
def main ():
    '''Print (to stdout) one line per region of every ZMW in a bas.h5 file.

    The bas file is the first positional argument returned by getParms();
    optional cmp.h5 files for subread (--aln) and CCS (--alnccs) alignments
    add alignment columns. Every output line starts with the hole number,
    the region's bases per second, the hole status string and the
    productivity, followed by a two-character flag and the region's
    start/end:

        A / a    adapter region overlapping / not overlapping the HQ region
        H+ / H   HQ region with score >= / < --score
        h        HQ region of a ZMW with productivity != 1, a non-sequencing
                 ZMW, or an HQ region shorter than --length
        I+       subread with an alignment in the cmp.h5 file
        I / i    unaligned subread longer than twice --adapter, overlapping /
                 not overlapping the HQ region
        Is / is  subread shorter than --adapter (in / outside HQ), or - 'is'
                 only - an adapter-sized subread that scores too low against
                 the adapter sequence
        ia       adapter-sized subread that aligns to the adapter sequence

    Unless consensus output is turned off, printCCSDataForHole() is called
    after the regions of each productivity-1 sequencing ZMW.

    Raises ValueError on a region type other than 0, 1 or 2.
    '''

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    bf = H5BasFile.BasFile (basFilename, CCSDir=opt.ccs)

    # Consensus lines cannot be printed without consensus data: force --nocons.
    if not opt.nocons:
        if not bf.hasConsensus():
            logger.warning('no ccs data found: turning on --nocons')
            opt.nocons = True

    cmp = None                      # no cmp file?

    if opt.aln is not None:         # was a subread cmp.h5 file specified?

        cmpFilename = opt.aln
        logger.debug("cmp file: %s" % cmpFilename)
        cf  = H5CmpFile.CmpFile (fileName=cmpFilename)
        cmp = H5CmpFile.CmpMovie (cmpObject=cf,
                                  movieName=bf.movieName(),
                                  maxHole=bf.maxZMW())

    cmpCCS = None

    if opt.alnccs is not None:      # was a CCS cmp.h5 file specified?

        cmpCCSFilename = opt.alnccs
        logger.debug("CCS cmp file: %s" % cmpCCSFilename)
        cfCCS  = H5CmpFile.CmpFile (fileName=cmpCCSFilename)
        cmpCCS = H5CmpFile.CmpMovie (cmpObject=cfCCS,
                                     movieName=bf.movieName(),
                                     maxHole=bf.maxZMW())

    aln = SWAligner.Aligner()           # we'll use this in the loop below for finding adapters
    aln.setRead (H5BasFile.ADAPTER)     # adapter sequence is query
    # Threshold for calling an uncalled adapter: half of (adapter length x first penalty value).
    minAdapterScore = opt.adapter * aln.getPenalties()[0] / 2

    # Column headings. A trailing comma on a print statement keeps the output on the same line.
    print "   ZMW     b/s stat prod tp  start end+1    len  aln  chr st",
    print "     from         to   off  astart  aend+1   mm  ins del    Q"
    print

    for hole in bf.holeNumbers():          # main loop!

        numBases = bf.readLen(hole)
        zStat    = bf.holeStatusStr(hole)  # this is a string, not a number
        zProd    = bf.productivity(hole)

        numGoodInserts = 0                 # aligned or long-enough subreads; passed to printCCSDataForHole

        HQStart, HQEnd, HQScore = bf.HQregion(hole)[2:5]

        for region in bf.holeRegions(hole):

            regionHole, regionType, start, end, score = region

            inHQ = end > HQStart and start < HQEnd             # does region overlap HQ?

            # Sequencing rate for this region: bases per second of elapsed time.
            regionDuration = float(bf.elapsedFrames(hole, start, end)) / H5BasFile.frameRate
            regionBps = (end-start) / regionDuration if regionDuration > 0 else 0

            if regionType == 0:                                # an adapter region?

                if not opt.noadapt:                            # if we are printing adapter lines
                    print "%6d  %6.3f  %-5s  %d"  % (hole, regionBps, zStat, zProd),    # these appear in every line
                    flag = 'A ' if inHQ else 'a '
                    print "%-2s  %5d %5d"  % (flag, start, end)

            elif regionType == 2:                              # a HQ region?

                print "%6d  %6.3f  %-5s  %d"  % (hole, regionBps, zStat, zProd),        # these appear in every line

                # Lower-case 'h' marks an HQ region that fails the basic acceptance tests.
                if zProd != 1 or not bf.isSequencingZMW(hole) or (HQEnd-HQStart) < opt.length:
                    flag = 'h '
                else:
                    flag = 'H+' if score >= opt.score else 'H '

                print "%-2s  %5d %5d"  % (flag, start, end),

                # The HQ line also carries whole-read statistics.
                readDuration = float(bf.elapsedFrames(hole)) / H5BasFile.frameRate
                readBps = numBases/readDuration if readDuration > 0 else 0
                print "            score: %3d  HQ: %5d  read: %5d  dur: %8.3f  b/sec: %6.3f" \
                    % (score, HQEnd-HQStart, numBases, readDuration, readBps)

            elif regionType == 1:                              # a subread?

                print "%6d  %6.3f  %-5s  %d"  % (hole, regionBps, zStat, zProd),        # these appear in every line

                insSize = end - start

                align = None
                if cmp is not None:                                       # if a cmp.h5 was supplied
                    align = cmp.getAlignmentByPosition (hole, start, end) # alignment record for this region

                if align is not None:                                     # if the region aligned

                    numGoodInserts += 1

                    flag = 'I+'
                    print "%-2s  %5d %5d  %5d"  % (flag, start, end, insSize),

                    rStart, rEnd = align['rStart'], align['rEnd']   # fetch once, used many times
                    alnLen = rEnd-rStart
                    nMM, nIns, nDel = align['nMM'], align['nIns'], align['nDel']

                    print "%5d  %2d %1s  %9d  %9d  %4d  %6d  %6d  %3d %4d %3d %4.1f" % \
                        (alnLen,                               # length of aligned portion of read
                         align['contig'],                      # chr/contig id (see H5CmpFile)
                         '-' if align['RCRefStrand'] else '+', # strand
                         align['tStart'], align['tEnd'],       # reference offset of start/end of alignment
                         rStart-start,                         # offset of alignment start into insert
                         rStart, rEnd,
                         nMM, nIns, nDel,                      # # of mismatches, insertions, deletions
                         getQ (align)),                        # read quality Q score for insert

                elif insSize > opt.adapter * 2:                # if it's a non-descript, non-aligned region
                                                               # TODO: Make the '2' a parameter
                    numGoodInserts += 1

                    flag = 'I ' if inHQ else 'i '
                    print "%-2s  %5d %5d  %5d"  % (flag, start, end, insSize),

                elif insSize < opt.adapter:                    # if it's too short to be an adapter

                    flag = 'Is' if inHQ else 'is'
                    print "%-2s  %5d %5d  %5d"  % (flag, start, end, insSize),

                else:                                          # see if it's really an adapter that wasn't called

                    sequence = bf.getSequence(hole, start, end)
                    aln.setRef (sequence)
                    alnScore = aln.fillMatrix()                # align it to adapter

                    flag = 'ia' if alnScore >= minAdapterScore else 'is'
                    print "%-2s  %5d %5d  %5d  %2d"  % (flag, start, end, insSize, alnScore),

                    # For a probable adapter, show the alignment itself on this line and the next.
                    if alnScore >= minAdapterScore:
                        peaks = aln.peakPosits()
                        print " %2d" % len(peaks),
                        refString, readString = aln.alignmentStrings()
                        print ' ', refString                                                 # EOL here
                        print "                         i2                              ",   # new line
                        print readString,

                print                                          # terminates the subread line in every branch above

            else:
                raise ValueError ("unrecognised region type %d in ZMW %d" % (regionType, hole))

        if not opt.nocons:      # note that opt.nocons gets turned on if no CCS data is found
            if zProd == 1 and bf.isSequencingZMW(hole):
                printCCSDataForHole (bf, hole, numGoodInserts, cmpCCS)       # process consensus read passes

    logger.debug("complete")
Exemple #13
0
def main ():

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    bf = H5BasFile.BasFile (basFilename, CCSDir=opt.ccs)

    if bf.hasConsensus():           # don't go looking for CCS data if it's not there
        nocons = False
    else:
        logger.warning('no ccs data found: point to it with --ccs if desired')
        nocons = True

    cmp = None                      # no cmp file?

    if opt.aln is not None:         # was a subread cmp.h5 file specified?
        
        cmpFilename = opt.aln
        logger.debug("cmp file: %s" % cmpFilename)
        cf  = H5CmpFile.CmpFile (fileName=cmpFilename)
        cmp = H5CmpFile.CmpMovie (cmpObject=cf,
                                  movieName=bf.movieName(),
                                  maxHole=bf.maxZMW())

    totalC   = Counter('Total')
    seqC     = Counter('--Sequencing')
    prod0C   = Counter('----Productivity-0')
    prod1C   = Counter('----Productivity-1')
    HQLenC   = Counter('------HQ Len >= %s' % opt.length)
    HQScoreC = Counter('--------HQ Score >= %s' % opt.score)
    adaptC   = Counter('----------Avg Insert >= %s' % opt.insert)
    HQBasesC = Counter('----------HQ Bases')
    alignC   = Counter('------------Aligned')
    consC    = Counter('------------Consensus Reads')
    prod2C   = Counter('----Productivity-2')

    longest    = 0
    longestZMW = None

    for hole in bf.holeNumbers():

        numBases = bf.readLen(hole)
        zProd    = bf.productivity(hole)

        if numBases > longest:
            longest    = numBases
            longestZMW = hole

        HQStart, HQEnd, HQScore = bf.HQregion(hole)[2:5]
        HQLen = HQEnd - HQStart

        numSubreads     = 0
        numHQSubreads   = 0
        maxSubreadLen   = 0
        cumSubreadLen   = 0
        alignedSubreads = 0
        alignedTotBases = 0                           # total aligned bases in all inserts
        alignedMaxBases = 0                           # longest alignment in single insert

        for region in bf.holeRegions(hole):

            regionHole, regionType, start, end, score = region
            inHQ = end > HQStart and start < HQEnd    # does region overlap HQ?

            if regionType == 1:                       # if insert
                numSubreads += 1
                maxSubreadLen  = max (end-start, maxSubreadLen)
                cumSubreadLen += max (end-start, 0)   # clip negative lengths to zero
                if inHQ:
                    numHQSubreads += 1

                if cmp is not None:
                    align = cmp.getAlignmentByPosition (hole, start, end) # alignment record for this region

                    if align is not None:                            # if the region aligned
                        alignedSubreads += 1
                        alignedBases     = align['rEnd'] - align ['rStart']
                        alignedTotBases += alignedBases
                        alignedMaxBases  = max (alignedBases, alignedMaxBases)

        # What follows is a series of increasingly restrictive
        # criteria for a useful subread. Keep track of the number of
        # ZMWs, subreads, and bases which pass the successive tests,
        # and the length and ZMW provenance of the longest accepted
        # subread.

        totalC.incr (1, numSubreads, numBases)
        totalC.longest (hole, maxSubreadLen)

        if bf.isSequencingZMW(hole):              # sequencing ZMW?
            seqC.incr (1, numSubreads, numBases)
            seqC.longest (hole, maxSubreadLen)

            if zProd == 0:
                prod0C.incr (1, numSubreads, numBases)
                prod0C.longest (hole, maxSubreadLen)

            elif zProd == 2:
                prod2C.incr (1, numSubreads, numBases)
                prod2C.longest (hole, maxSubreadLen)

            elif zProd == 1:                      # productivity 1 gets broken down further
                prod1C.incr (1, numSubreads, numBases)
                prod1C.longest (hole, maxSubreadLen)

                if HQLen >= opt.length:
                    HQLenC.incr (1, numSubreads, numBases)
                    HQLenC.longest (hole, maxSubreadLen)

                    if HQScore >= opt.score:
                        HQScoreC.incr (1, numSubreads, numBases)
                        HQScoreC.longest (hole, maxSubreadLen)

                        # A very short average insert size probably indicates an adapter dimer.

                        if cumSubreadLen >= numSubreads * opt.insert:
                            adaptC.incr (1, numSubreads, numBases)
                            adaptC.longest (hole, maxSubreadLen)

                            HQBasesC.incr (1, numHQSubreads, HQLen)               # 
                            HQBasesC.longest (hole, HQLen)

                            if alignedSubreads > 0:
                                alignC.incr (1, alignedSubreads, alignedTotBases)     # total aligned bases
                                alignC.longest (hole, alignedMaxBases)                # longest single alignment

                        if not nocons:
                            consLen = bf.consReadLen(hole)
                            if consLen > 0:
                                consC.incr (1, bf.numConsensusPasses(hole), consLen)
                                consC.longest (hole, consLen)

    print
    print "file: ", basFilename
    print
    print "longest read was ZMW %d at %d bases" % (longestZMW, longest)
    print
    print "statistics for subreads:"
    print

    Counter.title();

    if cmp is not None:                 # if we processed a .cmp.h5 file
        for cntr in (totalC, seqC, prod0C, prod2C, prod1C, HQLenC, HQScoreC, adaptC, HQBasesC, consC, alignC):
            cntr.longPrint()
    else:
        for cntr in (totalC, seqC, prod0C, prod2C, prod1C, HQLenC, HQScoreC, adaptC, HQBasesC, consC):
            cntr.longPrint()
    print

    logger.debug("complete")
Exemple #14
0
def main():

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    bf = H5BasFile.BasFile(basFilename, CCSDir=opt.ccs)

    if not opt.nocons:
        if not bf.hasConsensus():
            logger.warning('no ccs data found: turning on --nocons')
            opt.nocons = True

    cmp = None  # no cmp file?

    if opt.aln is not None:  # was a subread cmp.h5 file specified?

        cmpFilename = opt.aln
        logger.debug("cmp file: %s" % cmpFilename)
        cf = H5CmpFile.CmpFile(fileName=cmpFilename)
        cmp = H5CmpFile.CmpMovie(cmpObject=cf,
                                 movieName=bf.movieName(),
                                 maxHole=bf.maxZMW())

    cmpCCS = None

    if opt.alnccs is not None:  # was a CCS cmp.h5 file specified?

        cmpCCSFilename = opt.alnccs
        logger.debug("CCS cmp file: %s" % cmpCCSFilename)
        cfCCS = H5CmpFile.CmpFile(fileName=cmpCCSFilename)
        cmpCCS = H5CmpFile.CmpMovie(cmpObject=cfCCS,
                                    movieName=bf.movieName(),
                                    maxHole=bf.maxZMW())

    aln = SWAligner.Aligner(
    )  # we'll use this in the loop below for finding adapters
    aln.setRead(H5BasFile.ADAPTER)  # adapter sequence is query
    minAdapterScore = opt.adapter * aln.getPenalties()[0] / 2

    print "   ZMW     b/s stat prod tp  start end+1    len  aln  chr st",
    print "     from         to   off  astart  aend+1   mm  ins del    Q"
    print

    for hole in bf.holeNumbers():  # main loop!

        numBases = bf.readLen(hole)
        zStat = bf.holeStatusStr(hole)  # this is a string, not a number
        zProd = bf.productivity(hole)

        numGoodInserts = 0

        HQStart, HQEnd, HQScore = bf.HQregion(hole)[2:5]

        for region in bf.holeRegions(hole):

            regionHole, regionType, start, end, score = region

            inHQ = end > HQStart and start < HQEnd  # does region overlap HQ?

            regionDuration = float(bf.elapsedFrames(hole, start,
                                                    end)) / H5BasFile.frameRate
            regionBps = (end -
                         start) / regionDuration if regionDuration > 0 else 0

            if regionType == 0:  # an adapter region?

                if not opt.noadapt:  # if we are printing adapter lines
                    print "%6d  %6.3f  %-5s  %d" % (
                        hole, regionBps, zStat,
                        zProd),  # these appear in every line
                    flag = 'A ' if inHQ else 'a '
                    print "%-2s  %5d %5d" % (flag, start, end)

            elif regionType == 2:  # a HQ region?

                print "%6d  %6.3f  %-5s  %d" % (
                    hole, regionBps, zStat,
                    zProd),  # these appear in every line

                if zProd != 1 or not bf.isSequencingZMW(hole) or (
                        HQEnd - HQStart) < opt.length:
                    flag = 'h '
                else:
                    flag = 'H+' if score >= opt.score else 'H '

                print "%-2s  %5d %5d" % (flag, start, end),

                readDuration = float(
                    bf.elapsedFrames(hole)) / H5BasFile.frameRate
                readBps = numBases / readDuration if readDuration > 0 else 0
                print "            score: %3d  HQ: %5d  read: %5d  dur: %8.3f  b/sec: %6.3f" \
                    % (score, HQEnd-HQStart, numBases, readDuration, readBps)

            elif regionType == 1:  # a subread?

                print "%6d  %6.3f  %-5s  %d" % (
                    hole, regionBps, zStat,
                    zProd),  # these appear in every line

                insSize = end - start

                align = None
                if cmp is not None:  # if a cmp.h5 was supplied
                    align = cmp.getAlignmentByPosition(
                        hole, start, end)  # alignment record for this region

                if align is not None:  # if the region aligned

                    numGoodInserts += 1

                    flag = 'I+'
                    print "%-2s  %5d %5d  %5d" % (flag, start, end, insSize),

                    rStart, rEnd = align['rStart'], align[
                        'rEnd']  # fetch once, used many times
                    alnLen = rEnd - rStart
                    nMM, nIns, nDel = align['nMM'], align['nIns'], align[
                        'nDel']

                    print "%5d  %2d %1s  %9d  %9d  %4d  %6d  %6d  %3d %4d %3d %4.1f" % \
                        (alnLen,                               # length of aligned portion of read
                         align['contig'],                      # chr/contig id (see H5CmpFile)
                         '-' if align['RCRefStrand'] else '+', # strand
                         align['tStart'], align['tEnd'],       # reference offset of start/end of alignment
                         rStart-start,                         # offset of alignment start into insert
                         rStart, rEnd,
                         nMM, nIns, nDel,                      # # of mismatches, insertions, deletions
                         getQ (align)),                        # read quality Q score for insert

                elif insSize > opt.adapter * 2:  # if it's a non-descript, non-aligned region
                    # TODO: Make the '2' a parameter
                    numGoodInserts += 1

                    flag = 'I ' if inHQ else 'i '
                    print "%-2s  %5d %5d  %5d" % (flag, start, end, insSize),

                elif insSize < opt.adapter:  # if it's too short to be an adapter

                    flag = 'Is' if inHQ else 'is'
                    print "%-2s  %5d %5d  %5d" % (flag, start, end, insSize),

                else:  # see if it's really an adapter that wasn't called

                    sequence = bf.getSequence(hole, start, end)
                    aln.setRef(sequence)
                    alnScore = aln.fillMatrix()  # align it to adapter

                    flag = 'ia' if alnScore >= minAdapterScore else 'is'
                    print "%-2s  %5d %5d  %5d  %2d" % (flag, start, end,
                                                       insSize, alnScore),

                    if alnScore >= minAdapterScore:
                        peaks = aln.peakPosits()
                        print " %2d" % len(peaks),
                        refString, readString = aln.alignmentStrings()
                        print ' ', refString  # EOL here
                        print "                         i2                              ",  # new line
                        print readString,

                print

            else:
                raise ValueError("unrecognised region type %d in ZMW %d" %
                                 (regionType, hole))

        if not opt.nocons:  # note that opt.nocons gets turned on if no CCS data is found
            if zProd == 1 and bf.isSequencingZMW(hole):
                printCCSDataForHole(bf, hole, numGoodInserts,
                                    cmpCCS)  # process consensus read passes

    logger.debug("complete")