Example #1
0
    def basecallIndex (self):
        '''Create and cache a table by hole number of starting indexes into PulseData/BaseCalls.

        Entry ix of the returned list is the sum of NumEvent over all ZMW
        entries before ix, i.e. the offset of the first basecall of hole ix.
        The table is computed on the first call and cached afterwards.

        Raises RuntimeError if HoleNumber[ix] != ix for any ZMW entry.
        '''

        if self._basecallIndex is None:

            logger.debug("creating basecall index")

            numEvent   = self._ZMW["NumEvent"]
            holeNumber = self._ZMW["HoleNumber"]
            numZ       = self.numZMWs()
            index      = 0

            # Build the table in a local and publish it only once it is
            # complete: if the sanity check below raises, no partially
            # filled table is left behind in the cache for later callers.
            table = [0] * numZ

            # The loop below includes a check that the HoleNumbers run
            # from 0 to N-1. I.e., HoleNumber[ix] == ix. Otherwise,
            # there is no way to get from the hole number of a region
            # back to the ZMW entry (other than searching for it).

            for ix in xrange(numZ):

                if holeNumber[ix] != ix:
                    raise RuntimeError("Hole number != index at %d" % ix)

                table[ix] = index
                index += numEvent[ix]

            self._basecallIndex = table

            logger.debug("processed %d basecalls" % index)

        return self._basecallIndex
Example #2
0
    def __init__ (self, name):
        '''Open the file called name for writing and start with an empty lastPos table.'''

        self.name = name
        self.handle = open (name, 'w')
        self.lastPos = {}                  # nothing recorded yet

        openedMsg = 'opened %s' % name
        logger.debug(openedMsg)
Example #3
0
    def __init__(self, name):
        '''Open the file called name for writing and start with an empty lastPos table.'''

        self.name = name
        self.handle = open(name, 'w')
        self.lastPos = {}  # nothing recorded yet

        openedMsg = 'opened %s' % name
        logger.debug(openedMsg)
Example #4
0
    def cellCoords (self):
        '''Return (minX, maxX, minY, maxY) over the HoleXY entries, computing and caching it on first use.'''

        if self._coords is not None:
            return self._coords                  # already computed

        logger.debug("finding SMRTcell coordinates")

        count  = self.numZMWs()
        holeXY = self._ZMW["HoleXY"]

        # Seed all four extremes from the first entry, then widen them.
        minX = maxX = holeXY[0,0]
        minY = maxY = holeXY[0,1]

        for row in xrange(count):

            x,y = holeXY[row,:]

            minX = min(minX, x)
            maxX = max(maxX, x)
            minY = min(minY, y)
            maxY = max(maxY, y)

        self._coords = (minX, maxX, minY, maxY)

        logger.debug("SMRTcell is (%d,%d,%d,%d)" % (minX, maxX, minY, maxY))

        return self._coords
Example #5
0
    def cellCoords(self):
        """Find and cache the minimum and maximum X/Y coordinates on the SMRTcell.

        Returns a tuple (minX, maxX, minY, maxY) taken over the HoleXY
        dataset of every bax file. When there are no bax files, or none of
        them has any HoleXY rows, the result is (0, 0, 0, 0).
        """

        if self._coords is None:

            logger.debug("finding SMRTcell coordinates")

            # Start from "nothing seen yet" rather than from 0: seeding the
            # extremes with 0 reports a bogus minimum (or maximum) whenever
            # every coordinate lies on the same side of the origin.
            extremes = None

            for bf in self._baxfile:

                holeXY = bf._ZMW["HoleXY"]

                # Read each column once instead of once per min/max call.
                xs = holeXY[:, 0]
                ys = holeXY[:, 1]

                if len(xs) == 0:  # nothing to contribute
                    continue

                found = (min(xs), max(xs), min(ys), max(ys))

                if extremes is None:
                    extremes = found
                else:
                    extremes = (min(extremes[0], found[0]),
                                max(extremes[1], found[1]),
                                min(extremes[2], found[2]),
                                max(extremes[3], found[3]))

            if extremes is None:  # no data at all: keep the historical default
                extremes = (0, 0, 0, 0)

            minX, maxX, minY, maxY = extremes
            self._coords = (minX, maxX, minY, maxY)

            logger.debug("SMRTcell is (%d,%d,%d,%d)" % (minX, maxX, minY, maxY))

        return self._coords
Example #6
0
    def _fillRegionTables (self):
        '''Create and cache tables by hole number of starting and HQ indexes into PulseData/Regions.

        Fills self._regionIndex (row of the first region entry seen for each
        hole) and self._HQIndex (row of the entry whose region type is 2) and
        returns self._regionIndex. Holes with no matching entry keep the
        initial value 0. The tables are computed on the first call only.
        '''

        if self._regionIndex is None:

            logger.debug("creating region index")

            # Fill local tables and publish them together at the end, so an
            # exception part-way through cannot leave half-built tables cached.
            numZ        = self.numZMWs()
            regionIndex = [0] * numZ
            HQIndex     = [0] * numZ

            index    = 0
            lastHole = -1        # init to non-matching value

            for line in self._regions:

                hole, regionType = line[0:2]

                if hole != lastHole:                    # start of new hole?
                    regionIndex[hole] = index
                    lastHole = hole

                if regionType == 2:                     # HQ region for this hole?
                    HQIndex[hole] = index

                index += 1

            self._regionIndex = regionIndex
            self._HQIndex     = HQIndex

            logger.debug("processed %d regions" % index)

        return self._regionIndex
Example #7
0
    def __init__(self, filename, CCSDir=None):
        """Open one bax.h5 file read-only and cache handles to the datasets used later.

        filename -- path of the HDF5 file to open
        CCSDir   -- optional directory in which findCCSFile() looks for a
                    separate CCS file

        Each instance gets a short name "bax-N" from the class-wide counter
        BaxFile.fileNum, which is incremented here.
        """

        BaxFile.fileNum += 1
        self._shortName = "bax-%d" % BaxFile.fileNum

        logger.debug("creating BaxFile object for %s = %s" % (self._shortName, filename))

        self._filename = filename
        self._CCSDir = CCSDir
        self._infile = h5py.File(filename, "r")
        self._top = self._infile  # h5py 2.0.1 change!

        self._pulsedata = self._top["PulseData"]
        self._basecalls = self._top["PulseData/BaseCalls"]
        self._ZMW = self._top["PulseData/BaseCalls/ZMW"]
        self._regions = self._top["PulseData/Regions"]
        self._productivity = self._top["PulseData/BaseCalls/ZMWMetrics/Productivity"]
        self._movieName = self._top["ScanData/RunInfo"].attrs["MovieName"]
        self._holeStatus = self._ZMW["HoleStatus"]
        self._numRegions = self._regions.shape[0]

        self._PreBaseFrames = self._basecalls["PreBaseFrames"]
        self._WidthInFrames = self._basecalls["WidthInFrames"]

        # Read the whole dataset into memory once and let the array compute
        # its own maximum; the builtin max() walks the h5py dataset one
        # element at a time, which is what made this step so slow.
        self._maxZMW = self._ZMW["HoleNumber"][...].max()

        self._sanityChecked = False

        self.findCCSFile()
Example #8
0
def findRegions(tranList):
    """Number the regions covered by exons and record them on each transcript.

    Every exon contributes a start and an end breakpoint. Walking the
    breakpoints in position order, whenever the position has advanced more
    than MIN_REGION_SIZE past the current region start and at least one
    transcript is open, the current region number is added to the .regions
    set of every open transcript and the region counter advances.

    Returns None; the result is the side effect on tranList[i].regions.
    A tranList without any exons is a no-op.
    """
    # Find breakpoints where coverage by exons changes.

    # Why are we doing this? See the note in the Transcript class
    # definition below.

    breaks = list()
    for tranIx, tran in enumerate(tranList):
        for exon in tran.exons:
            breaks.append([exon.start, 0, tranIx, tran.name, exon.name])
            breaks.append([exon.end, 1, tranIx, tran.name, exon.name])

    region = 0

    if breaks:  # breaks[0] below would raise IndexError with no exons at all

        breaks.sort(key=lambda x: x[0])
        curPos = breaks[0][0]
        curTranSet = set()

        for brk in breaks:
            posit, flag, tranIx = brk[:3]
            if posit > curPos + MIN_REGION_SIZE:  # this is a new region
                if len(curTranSet) > 0:
                    # separate name: do not reuse the outer loop's variables
                    for openIx in curTranSet:
                        tranList[openIx].regions.add(
                            region)  # update set of regions hit by this transcript
                    region += 1
                curPos = posit
            if flag == 0:  # exon start
                curTranSet.add(tranIx)
            else:  # exon end
                curTranSet.remove(tranIx)

    logger.debug('found %d regions' % region)
    return
Example #9
0
    def fillConsensusIndexes(self):
        """Build _consPassIndex and _consensusIndex, one slot per possible hole number.

        For each bax file, the slot of a hole receives the running totals of
        NumPasses and NumEvent over the holes that precede it in that file.
        Called only when those arrays are needed.
        """

        tableSize = self._maxZMW + 1
        self._consPassIndex = [None] * tableSize
        self._consensusIndex = [None] * tableSize

        for bf in self._baxfile:

            bf.ZMWSanityClause()  # sanity check the consensus datasets

            # running totals restart for every file
            passTotal = 0
            eventTotal = 0
            numPasses = bf._consPasses["NumPasses"]
            numEvent = bf._consZMW["NumEvent"]

            for row, hole in enumerate(bf.holeNumbers()):
                self._consPassIndex[hole] = passTotal
                passTotal += numPasses[row]
                self._consensusIndex[hole] = eventTotal
                eventTotal += numEvent[row]

            logger.debug("%s processed %d consensus passes" % (bf.shortName(), passTotal))
            logger.debug("%s processed %d consensus basecalls" % (bf.shortName(), eventTotal))
Example #10
0
    def __init__(self, filename, CCSDir=None):
        '''Open one bax.h5 file read-only and cache handles to the datasets used later.

        filename -- path of the HDF5 file to open
        CCSDir   -- optional directory in which findCCSFile() looks for a
                    separate CCS file

        Each instance gets a short name 'bax-N' from the class-wide counter
        BaxFile.fileNum, which is incremented here.
        '''

        BaxFile.fileNum += 1
        self._shortName = 'bax-%d' % BaxFile.fileNum

        logger.debug("creating BaxFile object for %s = %s" %
                     (self._shortName, filename))

        self._filename = filename
        self._CCSDir = CCSDir
        self._infile = h5py.File(filename, 'r')
        self._top = self._infile  # h5py 2.0.1 change!

        self._pulsedata = self._top["PulseData"]
        self._basecalls = self._top["PulseData/BaseCalls"]
        self._ZMW = self._top["PulseData/BaseCalls/ZMW"]
        self._regions = self._top["PulseData/Regions"]
        self._productivity = self._top[
            "PulseData/BaseCalls/ZMWMetrics/Productivity"]
        self._movieName = self._top["ScanData/RunInfo"].attrs["MovieName"]
        self._holeStatus = self._ZMW["HoleStatus"]
        self._numRegions = self._regions.shape[0]

        self._PreBaseFrames = self._basecalls["PreBaseFrames"]
        self._WidthInFrames = self._basecalls["WidthInFrames"]

        # Read the whole dataset into memory once and let the array compute
        # its own maximum; the builtin max() walks the h5py dataset one
        # element at a time, which is what made this step so slow.
        self._maxZMW = self._ZMW["HoleNumber"][...].max()

        self._sanityChecked = False

        self.findCCSFile()
Example #11
0
    def __init__(self, filename, CCSDir=None):
        """Open a bas.h5 file and create a BaxFile object for each part it refers to.

        A file without a "MultiPart" entry is treated as a single bax file;
        otherwise every name in "MultiPart/Parts" is opened from the
        directory that holds this file.
        """

        logger.debug("creating BasFile object")

        self._filename = filename
        self._CCSDir = CCSDir
        self._infile = h5py.File(filename, "r")
        self._top = self._infile
        self._baxfile = list()
        self._coords = None

        self._consensusIndex = None
        self._consPassIndex = None

        if "MultiPart" in self._top:
            # This file is an index to a set of bax files that live beside it.
            partDir = os.path.dirname(os.path.abspath(self._filename))

            for partName in self._top["MultiPart/Parts"]:
                partPath = os.path.join(partDir, partName)  # fully qualified name
                self._baxfile.append(BaxFile(partPath, CCSDir=CCSDir))

        else:
            # Old-style bas file: it is the only part and will contain its own CCS data.
            self._baxfile.append(BaxFile(filename))

        # fillCombinedFields must run first; the later steps need its results
        self.fillCombinedFields()
        self.fillZMWIndexes()
        self.fillMovieName()
        self.fillRegionIndexes()
Example #12
0
    def cellCoords(self):
        '''Find and cache the minimum and maximum X/Y coordinates on the SMRTcell.

        Returns a tuple (minX, maxX, minY, maxY) taken over the HoleXY
        dataset of every bax file. When there are no bax files, or none of
        them has any HoleXY rows, the result is (0, 0, 0, 0).
        '''

        if self._coords is None:

            logger.debug("finding SMRTcell coordinates")

            # Start from "nothing seen yet" rather than from 0: seeding the
            # extremes with 0 reports a bogus minimum (or maximum) whenever
            # every coordinate lies on the same side of the origin.
            extremes = None

            for bf in self._baxfile:

                holeXY = bf._ZMW["HoleXY"]

                # Read each column once instead of once per min/max call.
                xs = holeXY[:, 0]
                ys = holeXY[:, 1]

                if len(xs) == 0:  # nothing to contribute
                    continue

                found = (min(xs), max(xs), min(ys), max(ys))

                if extremes is None:
                    extremes = found
                else:
                    extremes = (min(extremes[0], found[0]),
                                max(extremes[1], found[1]),
                                min(extremes[2], found[2]),
                                max(extremes[3], found[3]))

            if extremes is None:  # no data at all: keep the historical default
                extremes = (0, 0, 0, 0)

            minX, maxX, minY, maxY = extremes
            self._coords = (minX, maxX, minY, maxY)

            logger.debug("SMRTcell is (%d,%d,%d,%d)" %
                         (minX, maxX, minY, maxY))

        return self._coords
Example #13
0
    def fillConsensusIndexes(self):
        '''Build _consPassIndex and _consensusIndex, one slot per possible hole number.

        For each bax file, the slot of a hole receives the running totals of
        NumPasses and NumEvent over the holes that precede it in that file.
        Called only when those arrays are needed.
        '''

        tableSize = self._maxZMW + 1
        self._consPassIndex = [None] * tableSize
        self._consensusIndex = [None] * tableSize

        for bf in self._baxfile:

            bf.ZMWSanityClause()  # sanity check the consensus datasets

            # running totals restart for every file
            passTotal = 0
            eventTotal = 0
            numPasses = bf._consPasses["NumPasses"]
            numEvent = bf._consZMW["NumEvent"]

            for row, hole in enumerate(bf.holeNumbers()):
                self._consPassIndex[hole] = passTotal
                passTotal += numPasses[row]
                self._consensusIndex[hole] = eventTotal
                eventTotal += numEvent[row]

            logger.debug("%s processed %d consensus passes" %
                         (bf.shortName(), passTotal))
            logger.debug("%s processed %d consensus basecalls" %
                         (bf.shortName(), eventTotal))
Example #14
0
def main():
    """Print a per-base table of the basecall fields of one hole.

    sys.argv[1] is the bas file name and sys.argv[2] the hole number. Each
    output row shows the base index, the call, the quality and tag fields,
    the two frame counts and the frame counts divided by H5BasFile.frameRate.
    """

    logger.debug("%s starting" % sys.argv[0])

    basfileName = sys.argv[1]
    hole = int(sys.argv[2])

    bf = H5BasFile.BasFile(basfileName)

    def field(name):
        # all fields are fetched for the same hole
        return bf.getBasecallField(name, hole)

    call = field("Basecall")
    delete = field("DeletionQV")
    wuzzit = field("DeletionTag")
    insert = field("InsertionQV")
    prebase = field("PreBaseFrames")
    subst = field("SubstitutionQV")
    couldbe = field("SubstitutionTag")
    qual = field("QualityValue")
    duration = field("WidthInFrames")

    emit = sys.stdout.write

    emit("Index  Call   Qd  Del   Qi   Qs  Sub    Q               DelT    Dur" + "\n")
    emit("\n")

    rowFormat = "%5d  %4s  %3d  %3s  %3d  %3d  %3s  %3d  %4d  %3d  %6.3f  %5.3f"

    for ix in xrange(len(call)):

        row = (ix, chr(call[ix]), delete[ix], chr(wuzzit[ix]), insert[ix], subst[ix], chr(couldbe[ix]), qual[ix],
               prebase[ix], duration[ix],
               float(prebase[ix])/H5BasFile.frameRate, float(duration[ix])/H5BasFile.frameRate)

        emit(rowFormat % row + "\n")

    logger.debug("complete")
Example #15
0
    def findCCSFile(self):
        """Locate the consensus (CCS) datasets for this bax file and record whether they exist.

        Sets self._hasConsensus. When the datasets are found, either inside
        this file or in a matching file under self._CCSDir, it also sets
        _consBasecalls, _consZMW and _consPasses.
        """

        self._hasConsensus = False  # until proven otherwise

        # An older bax file contains its own CCS data.
        if "PulseData/ConsensusBaseCalls" in self._top:
            self._consBasecalls = self._top["PulseData/ConsensusBaseCalls"]
            self._consZMW = self._top["PulseData/ConsensusBaseCalls/ZMW"]
            self._consPasses = self._top["PulseData/ConsensusBaseCalls/Passes"]
            self._hasConsensus = True
            return

        # No directory to search: nothing more can be done.
        if self._CCSDir is None:
            logger.info("BaxFile %s does not contain CCS data (rel 2.1.0 and later). Use --ccs" % self._shortName)
            return

        # The CCS file has this file's name with "bax" replaced by "ccs".
        CCSFilename = os.path.basename(self._filename).replace("bax", "ccs")
        fqCCSFilename = os.path.join(self._CCSDir, CCSFilename)

        if not os.path.exists(fqCCSFilename):
            logger.warning("%s: no CCS file found corresponding to %s" % (self._shortName, self._filename))
            return

        self._CCSFile = h5py.File(fqCCSFilename, "r")
        self._consBasecalls = self._CCSFile["PulseData/ConsensusBaseCalls"]
        self._consZMW = self._CCSFile["PulseData/ConsensusBaseCalls/ZMW"]
        self._consPasses = self._CCSFile["PulseData/ConsensusBaseCalls/Passes"]
        self._hasConsensus = True
        logger.debug("BaxFile %s found CCS file %s" % (self._shortName, fqCCSFilename))
Example #16
0
def findRegions(tranList):
    '''Number the regions covered by exons and record them on each transcript.

    Every exon contributes a start and an end breakpoint. Walking the
    breakpoints in position order, whenever the position has advanced more
    than MIN_REGION_SIZE past the current region start and at least one
    transcript is open, the current region number is added to the .regions
    set of every open transcript and the region counter advances.

    Returns None; the result is the side effect on tranList[i].regions.
    A tranList without any exons is a no-op.
    '''
    # Find breakpoints where coverage by exons changes.

    # Why are we doing this? See the note in the Transcript class
    # definition below.

    breaks = list()
    for tranIx, tran in enumerate(tranList):
        for exon in tran.exons:
            breaks.append([exon.start, 0, tranIx, tran.name, exon.name])
            breaks.append([exon.end, 1, tranIx, tran.name, exon.name])

    region = 0

    if breaks:       # breaks[0] below would raise IndexError with no exons at all

        breaks.sort(key=lambda x: x[0])
        curPos = breaks[0][0]
        curTranSet = set()

        for brk in breaks:
            posit, flag, tranIx = brk[:3]
            if posit > curPos + MIN_REGION_SIZE:             # this is a new region
                if len(curTranSet) > 0:
                    # separate name: do not reuse the outer loop's variables
                    for openIx in curTranSet:
                        tranList[openIx].regions.add(region) # update set of regions hit by this transcript
                    region += 1
                curPos = posit
            if flag == 0:                                    # exon start
                curTranSet.add(tranIx)
            else:                                            # exon end
                curTranSet.remove(tranIx)

    logger.debug('found %d regions' % region)
    return
Example #17
0
    def __init__(self, filename, CCSDir=None):
        '''Open a bas.h5 file and create a BaxFile object for each part it refers to.

        A file without a 'MultiPart' entry is treated as a single bax file;
        otherwise every name in 'MultiPart/Parts' is opened from the
        directory that holds this file.
        '''

        logger.debug('creating BasFile object')

        self._filename = filename
        self._CCSDir = CCSDir
        self._infile = h5py.File(filename, 'r')
        self._top = self._infile
        self._baxfile = list()
        self._coords = None

        self._consensusIndex = None
        self._consPassIndex = None

        if 'MultiPart' in self._top:
            # This file is an index to a set of bax files that live beside it.
            partDir = os.path.dirname(os.path.abspath(self._filename))

            for partName in self._top['MultiPart/Parts']:
                partPath = os.path.join(partDir, partName)  # fully qualified name
                self._baxfile.append(BaxFile(partPath, CCSDir=CCSDir))

        else:
            # Old-style bas file: it is the only part and will contain its own CCS data.
            self._baxfile.append(BaxFile(filename))

        # fillCombinedFields must run first; the later steps need its results
        self.fillCombinedFields()
        self.fillZMWIndexes()
        self.fillMovieName()
        self.fillRegionIndexes()
Example #18
0
    def toPickle (self, filename):
        '''Write this object to filename as a pickle, using the highest available protocol.'''

        # HIGHEST_PROTOCOL is a binary format, so the file has to be opened
        # in binary mode. The with block closes it even if dump() raises.
        with open (filename, 'wb') as pickHandle:
            pk = pickle.Pickler (pickHandle, pickle.HIGHEST_PROTOCOL)
            pk.dump (self)

        logger.debug('wrote annotation data to pickle file %s' % filename)

        return
Example #19
0
    def toPickle(self, filename):
        '''Write this object to filename as a pickle, using the highest available protocol.'''

        # HIGHEST_PROTOCOL is a binary format, so the file has to be opened
        # in binary mode. The with block closes it even if dump() raises.
        with open(filename, 'wb') as pickHandle:
            pk = pickle.Pickler(pickHandle, pickle.HIGHEST_PROTOCOL)
            pk.dump(self)

        logger.debug('wrote reference data to pickle file %s' % filename)

        return
Example #20
0
    def __init__ (self, fileName):
        '''Open the HDF5 file fileName read-only; the file object doubles as the top-level group.'''

        logger.debug("creating CmpFile object for %s" % (fileName))

        h5file = h5py.File (fileName, 'r')

        self._fileName = fileName
        self._infile   = h5file
        self._top      = h5file         # h5py 2.0.1 change!
Example #21
0
    def fromPickle (filename):
        '''Create a ClusterDict object from a pickle file (alternative to __init__).

        NOTE: unpickling can execute arbitrary code, so only load pickle
        files that come from a trusted source.
        '''

        # Pickles are binary data: read them in binary mode. The with block
        # closes the file even if load() raises.
        with open (filename, 'rb') as handle:
            pk = pickle.Unpickler (handle)
            clusterDict = pk.load()

        logger.debug('read %d clusters in pickle format from %s' % (len(clusterDict), filename))

        return clusterDict
Example #22
0
    def fromPickle (filename):
        '''Create a Reference object from a pickle file (alternative to __init__).

        NOTE: unpickling can execute arbitrary code, so only load pickle
        files that come from a trusted source.
        '''

        logger.debug('reading reference in pickle format from %s' % filename)

        # Pickles are binary data: read them in binary mode. The with block
        # closes the file even if load() raises.
        with open (filename, 'rb') as handle:
            pk = pickle.Unpickler (handle)
            ref = pk.load()

        return ref
Example #23
0
    def fromPickle (filename):
        '''Create an AnnotationList object from a pickle file (alternative to __init__).

        NOTE: unpickling can execute arbitrary code, so only load pickle
        files that come from a trusted source.
        '''

        logger.debug('reading annotations in pickle format from %s' % filename)

        # Pickles are binary data: read them in binary mode. The with block
        # closes the file even if load() raises.
        with open (filename, 'rb') as handle:
            pk = pickle.Unpickler (handle)
            annotList = pk.load()

        return annotList
Example #24
0
    def fromPickle(filename):
        '''Create a Reference object from a pickle file (alternative to __init__).

        NOTE: unpickling can execute arbitrary code, so only load pickle
        files that come from a trusted source.
        '''

        logger.debug('reading reference in pickle format from %s' % filename)

        # Pickles are binary data: read them in binary mode. The with block
        # closes the file even if load() raises.
        with open(filename, 'rb') as handle:
            pk = pickle.Unpickler(handle)
            ref = pk.load()

        return ref
Example #25
0
    def fromPickle(filename):
        '''Create an AnnotationList object from a pickle file (alternative to __init__).

        NOTE: unpickling can execute arbitrary code, so only load pickle
        files that come from a trusted source.
        '''

        logger.debug('reading annotations in pickle format from %s' % filename)

        # Pickles are binary data: read them in binary mode. The with block
        # closes the file even if load() raises.
        with open(filename, 'rb') as handle:
            pk = pickle.Unpickler(handle)
            annotList = pk.load()

        return annotList
Example #26
0
def plotPolyAs (tranList, blocks):
    '''Call plotA at the start of every polyA entry found on the exons of annotation transcripts.'''

    for tran in tranList:

        if not tran.annot:                        # only annotations know about polyAs
            continue

        for exon in tran.exons:

            if not hasattr (exon, 'polyAs'):      # exon carries no polyA list
                continue

            for start, _end, _howmany in exon.polyAs:
                plotA (tran, start, blocks)
                logger.debug ('%s: %9d' % (exon.name, start))
Example #27
0
    def fromPickle(filename):
        '''Create a ClusterDict object from a pickle file (alternative to __init__).

        NOTE: unpickling can execute arbitrary code, so only load pickle
        files that come from a trusted source.
        '''

        # Pickles are binary data: read them in binary mode. The with block
        # closes the file even if load() raises.
        with open(filename, 'rb') as handle:
            pk = pickle.Unpickler(handle)
            clusterDict = pk.load()

        logger.debug('read %d clusters in pickle format from %s' %
                     (len(clusterDict), filename))

        return clusterDict
Example #28
0
def submitFinalJobs(opt, chunkList):
    '''Write and submit the final job that concatenates the per-chunk outputs.

    The generated script <tmpdir>/trim_final.sh cats every trimmed chunk into
    opt.output and, when opt.report is set, every report chunk into
    opt.report. It is submitted with qsub so that it runs only after all the
    chunk jobs (chk.jobno) finish OK.

    Returns the stripped text qsub printed. Raises RuntimeError if closing
    the command pipe reports a status other than None.
    '''

    script = [
        '#!/bin/bash\n\n',
        'set -o errexit\n',
        'set -o nounset\n\n',
        'cat \\\n',
    ]
    script.extend(['%s \\\n' % chk.trimmedChunkName for chk in chunkList])
    script.append(' > %s\n' % opt.output)

    if opt.report is not None:
        reportLines = ['%s \\\n' % chk.reportChunkName for chk in chunkList]
        script.append('\ncat \\\n')
        script.extend(reportLines)
        script.append(' > %s\n' % opt.report)

    finalScriptName = '%s/trim_final.sh' % opt.tmpdir
    scriptFile = open(finalScriptName, 'w')
    scriptFile.writelines(script)
    scriptFile.close()

    deps = ':'.join([chk.jobno for chk in chunkList])

    qsubArgs = [
        'qsub',
        '-N trim_final',  # job name
        '-o trim_final.out',  # output file
        '-j oe',  # combine stdout and stderr
        '-l nodes=1:ppn=1,walltime=4:00:00',  # resources required
        '-d . ',  # working directory (strangely, ./ is not the default)
        '-r n',  # do NOT attempt to restart on failure
        '-V',  # export all environment variables to job
        '-W umask=0002',  # make logs rw-rw-r--
        '-m n',  # don't send any mail
        '-W depend=afterok:%s' % deps,  # wait for every chunk job
        finalScriptName,  # script to run
    ]

    command = ' '.join(qsubArgs)
    logger.debug('running %s' % command)

    pipe = os.popen(command)
    response = pipe.read().strip()
    status = pipe.close()
    if status is not None:
        logger.error('command failed, rc=%d' % status)
        raise RuntimeError

    logger.debug('jobno is %s' % response)

    return response
Example #29
0
def submitFinalJobs (opt, chunkList):
    '''Write and submit the final job that concatenates the per-chunk outputs.

    The generated script <tmpdir>/trim_final.sh cats every trimmed chunk into
    opt.output and, when opt.report is set, every report chunk into
    opt.report. It is submitted with qsub so that it runs only after all the
    chunk jobs (chk.jobno) finish OK.

    Returns the stripped text qsub printed. Raises RuntimeError if closing
    the command pipe reports a status other than None.
    '''

    script = [
        '#!/bin/bash\n\n',
        'set -o errexit\n',
        'set -o nounset\n\n',
        'cat \\\n',
    ]
    script.extend (['%s \\\n' % chk.trimmedChunkName for chk in chunkList])
    script.append (' > %s\n' % opt.output)

    if opt.report is not None:
        reportLines = ['%s \\\n' % chk.reportChunkName for chk in chunkList]
        script.append ('\ncat \\\n')
        script.extend (reportLines)
        script.append (' > %s\n' % opt.report)

    finalScriptName = '%s/trim_final.sh' % opt.tmpdir
    scriptFile = open (finalScriptName, 'w')
    scriptFile.writelines (script)
    scriptFile.close()

    deps = ':'.join ([chk.jobno for chk in chunkList])

    qsubArgs = [
        'qsub',
        '-N trim_final',                       # job name
        '-o trim_final.out',                   # output file
        '-j oe',                               # combine stdout and stderr
        '-l nodes=1:ppn=1,walltime=4:00:00',   # resources required
        '-d . ',                               # working directory (strangely, ./ is not the default)
        '-r n',                                # do NOT attempt to restart on failure
        '-V',                                  # export all environment variables to job
        '-W umask=0002',                       # make logs rw-rw-r--
        '-m n',                                # don't send any mail
        '-W depend=afterok:%s' % deps,         # wait for every chunk job
        finalScriptName,                       # script to run
    ]

    command = ' '.join(qsubArgs)
    logger.debug ('running %s' % command)

    pipe = os.popen(command)
    response = pipe.read().strip()
    status = pipe.close()
    if status is not None:
        logger.error('command failed, rc=%d' % status)
        raise RuntimeError

    logger.debug ('jobno is %s' % response)

    return response
Example #30
0
def plotPolyAs (tranList, blocks):
    '''Call plotA at the start of every polyA entry found on the exons of annotation transcripts.'''

    for tran in tranList:

        if not tran.annot:                        # only annotations know about polyAs
            continue

        for exon in tran.exons:

            if not hasattr (exon, 'polyAs'):      # exon carries no polyA list
                continue

            for start, _end, _howmany in exon.polyAs:
                plotA (tran, start, blocks)
                logger.debug ('%s: %9d' % (exon.name, start))
Example #31
0
    def toPickle (self, filename):
        '''Write this object to filename as a pickle, using the highest available protocol.

        Side effect: self.geneDict is reset to None before pickling.
        '''

        self.geneDict =  None         # no need to pickle this, it can be recreated

        # HIGHEST_PROTOCOL is a binary format, so the file has to be opened
        # in binary mode. The with block closes it even if dump() raises.
        with open (filename, 'wb') as pickHandle:
            pk = pickle.Pickler (pickHandle, pickle.HIGHEST_PROTOCOL)
            pk.dump (self)

        logger.debug('wrote %d clusters to pickle file %s' % (len(self.clusterDict), filename))

        return
Example #32
0
    def toPickle(self, filename):
        '''Write this object to filename as a pickle, using the highest available protocol.

        Side effect: self.geneDict is reset to None before pickling.
        '''

        self.geneDict = None  # no need to pickle this, it can be recreated

        # HIGHEST_PROTOCOL is a binary format, so the file has to be opened
        # in binary mode. The with block closes it even if dump() raises.
        with open(filename, 'wb') as pickHandle:
            pk = pickle.Pickler(pickHandle, pickle.HIGHEST_PROTOCOL)
            pk.dump(self)

        logger.debug('wrote %d clusters to pickle file %s' %
                     (len(self.clusterDict), filename))

        return
Example #33
0
    def makeRef (self):
        '''Run BOWTIE_BUILD on the fasta file, redirecting its output to <name>.out.

        The fasta file is closed first if it is still open. Raises
        RuntimeError when closing the command pipe reports a status other
        than None.
        '''

        if not self.handle.closed:
            self.close()

        fasta = self.name
        command = '%s %s %s > %s.out 2>&1' % (BOWTIE_BUILD, fasta, fasta, fasta)
        logger.debug(command)

        # Output is redirected, so there is nothing to read from the pipe;
        # only the status reported by close() matters.
        status = os.popen (command).close()
        if status is None:
            return

        raise RuntimeError ('bowtie2-build failed: %d' % status)
Example #34
0
def main():
    '''Split the input into chunks, submit one job per chunk, then submit the final merge job.

    The number of sequences per job is nSeqs / opt.njobs rounded up.
    '''

    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    makeTempDir(opt.tmpdir)

    nSeqs = countSeqs(opt.input)
    logger.debug('%s contains %d sequences' % (opt.input, nSeqs))

    # Ceiling division. '//' keeps this an integer division even where '/'
    # means true division (Python 3, or "from __future__ import division").
    seqsPerJob = (nSeqs + opt.njobs - 1) // opt.njobs
    logger.debug('each of %d jobs will process %d sequences' %
                 (opt.njobs, seqsPerJob))

    chunkList = makeFastaChunks(opt, nSeqs, seqsPerJob)

    for chunk in chunkList:
        chunk.makeScript()
        chunk.submitScript()

    submitFinalJobs(opt, chunkList)

    logger.debug('finished')

    return
Example #35
0
    def fillCombinedFields(self):
        '''Called from __init__ to compute the fields aggregated across all bax files.

        Sets _maxZMW (largest bf._maxZMW, at least 0), _numZMWs (sum of
        bf.numZMWs()) and _hasConsensus (True only if every file reports
        consensus data).
        '''

        self._maxZMW = 0
        self._numZMWs = 0
        self._hasConsensus = True

        for bf in self._baxfile:

            if bf._maxZMW > self._maxZMW:  # largest ZMW# so far
                self._maxZMW = bf._maxZMW

            self._numZMWs = self._numZMWs + bf.numZMWs()  # count of ZMWs

            fileHasConsensus = bf.hasConsensus()
            if not fileHasConsensus:  # one file without it spoils the set
                self._hasConsensus = False

        logger.debug("largest ZMW# is %d" % self._maxZMW)
Example #36
0
    def fillCombinedFields(self):
        """Called from __init__ to compute the fields aggregated across all bax files.

        Sets _maxZMW (largest bf._maxZMW, at least 0), _numZMWs (sum of
        bf.numZMWs()) and _hasConsensus (True only if every file reports
        consensus data).
        """

        self._maxZMW = 0
        self._numZMWs = 0
        self._hasConsensus = True

        for bf in self._baxfile:

            if bf._maxZMW > self._maxZMW:  # largest ZMW# so far
                self._maxZMW = bf._maxZMW

            self._numZMWs = self._numZMWs + bf.numZMWs()  # count of ZMWs

            fileHasConsensus = bf.hasConsensus()
            if not fileHasConsensus:  # one file without it spoils the set
                self._hasConsensus = False

        logger.debug("largest ZMW# is %d" % self._maxZMW)
Example #37
0
    def getGeneDict (self):
        '''Return the gene lookup table, building and caching it on first use.

        key=gene name, value=list of the gene objects carrying that name
        (the children of self.annot[chromosome] for every chromosome).
        '''

        if self.geneDict is not None:
            return self.geneDict              # already built

        logger.debug('creating gene name lookup table')

        self.geneDict = dict()

        for chrom in self.chromosomes():
            genes = self.annot[chrom].getChildren()
            for gene in genes:
                self.geneDict.setdefault(gene.name, []).append(gene)

        return self.geneDict
Example #38
0
def main ():
    '''Split the input into chunks, submit one job per chunk, then submit the final merge job.

    The number of sequences per job is nSeqs / opt.njobs rounded up.
    '''

    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    makeTempDir (opt.tmpdir)

    nSeqs = countSeqs (opt.input)
    logger.debug('%s contains %d sequences' % (opt.input, nSeqs))

    # Ceiling division. '//' keeps this an integer division even where '/'
    # means true division (Python 3, or "from __future__ import division").
    seqsPerJob = (nSeqs + opt.njobs - 1) // opt.njobs
    logger.debug('each of %d jobs will process %d sequences' % (opt.njobs, seqsPerJob))

    chunkList = makeFastaChunks (opt, nSeqs, seqsPerJob)

    for chunk in chunkList:
        chunk.makeScript()
        chunk.submitScript()

    submitFinalJobs (opt, chunkList)

    logger.debug('finished')

    return
Example #39
0
    def getGeneDict(self):
        '''Return the gene lookup table, building and caching it on first use.

        key=gene name, value=list of the gene objects carrying that name
        (the children of self.annot[chromosome] for every chromosome).
        '''

        if self.geneDict is not None:
            return self.geneDict  # already built

        logger.debug('creating gene name lookup table')

        self.geneDict = dict()

        for chrom in self.chromosomes():
            genes = self.annot[chrom].getChildren()
            for gene in genes:
                self.geneDict.setdefault(gene.name, []).append(gene)

        return self.geneDict
Example #40
0
    def makeRef(self):
        '''Run BOWTIE_BUILD on the fasta file, redirecting its output to <name>.out.

        The fasta file is closed first if it is still open. Raises
        RuntimeError when closing the command pipe reports a status other
        than None.
        '''

        if not self.handle.closed:
            self.close()

        fasta = self.name
        command = '%s %s %s > %s.out 2>&1' % (BOWTIE_BUILD, fasta, fasta, fasta)
        logger.debug(command)

        # Output is redirected, so there is nothing to read from the pipe;
        # only the status reported by close() matters.
        status = os.popen(command).close()
        if status is None:
            return

        raise RuntimeError('bowtie2-build failed: %d' % status)
Example #41
0
    def printDetails (self):
        '''Debug aid: log every MovieInfo name/ID pair and every AlnGroup path/ID pair. Not production grade.'''

        movieNames = self._top['MovieInfo/Name']
        movieIDs   = self._top['MovieInfo/ID']
        for row in xrange(len(movieNames)):
            logger.debug("movie %d (%d): %s" % (row, movieIDs[row], movieNames[row]))

        groupPaths = self._top['AlnGroup/Path']
        groupIDs   = self._top['AlnGroup/ID']
        for row in xrange(len(groupPaths)):
            logger.debug("path %d (%d): %s" % (row, groupIDs[row], groupPaths[row]))
Example #42
0
    def __init__ (self, filename, set=1, strobe=0, maxHole=None):
        '''Open a cmp.h5 file read-only and remember which set/strobe to work on.

        filename -- path of the HDF5 file
        set      -- set number, stored as _setNumber
        strobe   -- strobe number, stored as _strobeNumber
        maxHole  -- largest hole number; when None it is taken from column 7
                    of AlnInfo/AlnIndex
        '''

        logger.debug("creating CmpFile object for set %d strobe %d" % (set, strobe))

        self._filename     = filename
        self._setNumber    = set
        self._strobeNumber = strobe

        self._infile       = h5py.File (filename, 'r')
        # Use the File object itself as the root group. The older form
        # h5py.Group(file, '/') is not accepted by h5py 2.x, while indexing
        # the File object by path works with both old and new h5py.
        self._top          = self._infile
        self._index        = self._top['AlnInfo/AlnIndex']
        self._subreadMap   = None

        if maxHole is None:
            self._maxHole = max(self._index[:,7])     # largest *mapped* hole (for any set), may not be max hole!
        else:
            self._maxHole = maxHole
Example #43
0
def main ():
    '''Copy selected entries of a matchAnnot report to stdout.

    Input is the file named by the first positional argument, or
    stdin when there is none. Lines are collected into an entry until
    a whitespace-only line is seen. The entry is then written if no
    gene filter was given (opt.genes is None), if the most recent
    'gene:' line names a gene in the comma-separated opt.genes list,
    or if the entry's first line starts with 'summary:'.

    With opt.flip set, each run of consecutive 'exon:' lines is passed
    through reverseExonList when the most recent 'isoform:' line gave
    the strand as '-'.
    '''

    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    geneList = list()
    if opt.genes is not None:
        geneList = opt.genes.split(',')
    gene = None
    strand = None                                # no 'isoform:' line seen yet; without this, exon
                                                 # lines ahead of the first isoform crash under --flip

    if len(args) > 0:
        logger.debug('reading matchAnnot file %s' % args[0])
        handle = open (args[0], 'r')
    else:
        logger.debug('reading matchAnnot data from stdin')
        handle = sys.stdin

    exonLines  = list()
    entryLines = list()

    for line in handle:

        if line.startswith ('exon:'):
            exonLines.append(line)                   # save a batch of exon lines
        else:

            if len(exonLines) > 0:                   # if there is a batch pending
                if opt.flip and strand == '-':       # reverse order if requested
                    exonLines = reverseExonList (exonLines)
                entryLines.extend(exonLines)         # add them to the output list
                exonLines = list()

            entryLines.append (line)                 # add non-exon line directly to output list

            if line.startswith ('isoform:'):
                strand = line.split()[-2]            # strand is the next-to-last field

            elif line.startswith ('gene:'):
                gene = line.split()[1]               # gene name is the second field

            elif line.isspace():                     # last line of an entry?
                if len(entryLines) > 0:
                    if opt.genes is None or gene in geneList or entryLines[0].startswith('summary:'):
                        sys.stdout.writelines (entryLines)
                    entryLines =  list()

    # Whatever is left had no terminating blank line; it is only
    # written when its last line is a summary line.
    if len(entryLines) > 0:
        if entryLines[-1].startswith('summary:'):
            sys.stdout.writelines (entryLines)

    handle.close()

    logger.debug('finished')

    return
Example #44
0
    def __init__(self, filename):
        '''Read a cluster report and index its reads by cluster, read type and cell.

        The first line of the file is a header. If it is exactly
        'cluster_id,read_id,read_type' the data lines are split on
        commas, otherwise on whitespace. Every data line must yield
        three fields (cluster ID, read name, read type), and the read
        name must have the form cell/ZMW/coords.

        filename: path of the cluster report to read
        '''

        logger.debug('reading clusters from %s' % filename)

        self.filename = filename
        self.clusters = dict()  # clusters[clusterID][FL][cellNo] = list of 'ZMW|coords' names
        self.cells = dict()  # key=cell long name  value=cell number
        self.numCells = 0
        self.numClusters = 0  # bumped once per data line, i.e. once per read

        regexCCS = re.compile('_CCS$')

        # 'with' closes the file even when a malformed line raises
        # part-way through the loop.
        with open(filename, 'r') as handle:

            header = handle.readline().strip()  # get header line

            # is field separator space (old) or comma (new)?
            newStyle = header == 'cluster_id,read_id,read_type'

            for line in handle:

                self.numClusters += 1

                if newStyle:
                    clusterID, readName, FL = line.strip().split(',')
                else:
                    clusterID, readName, FL = line.strip().split()

                cell, ZMW, coords = readName.split('/')
                coords = re.sub(regexCCS, '',
                                coords)  # get rid of '_CCS' at end of read range
                shortName = ZMW + '|' + coords

                if cell not in self.cells:  # have we seen this cell before?
                    self.numCells += 1
                    self.cells[cell] = self.numCells  # if not, give it a number
                cellNo = self.cells[cell]

                clusterEnt = self.clusters.setdefault(clusterID, {}).setdefault(
                    FL, {}).setdefault(cellNo, [])
                clusterEnt.append(shortName)

        logger.debug('read %d reads in %d clusters from %d cells' %
                     (self.numClusters, len(self.clusters), self.numCells))
Example #45
0
def main ():
    '''Load a reference (from a pickle or from its native file) and write it back out as a pickle.'''

    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    # Reading a pickled file and repickling it doesn't make sense. But one day it will...

    fromPickle = (opt.format == 'pickle')
    refObj = ref.Reference.fromPickle (opt.ref) if fromPickle else ref.Reference (opt.ref)

    refObj.toPickle(opt.output)

    logger.debug('finished')
Example #46
0
def main():
    '''Build a Reference object from opt.ref and pickle it to opt.output.'''

    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    # Reading a pickled file and repickling it doesn't make sense. But one day it will...

    # Pick the constructor first, then call it on the input path.
    if opt.format != 'pickle':
        loader = ref.Reference
    else:
        loader = ref.Reference.fromPickle

    loader(opt.ref).toPickle(opt.output)

    logger.debug('finished')
Example #47
0
    def __init__ (self, filename):
        '''Read a cluster report and index its reads by cluster, read type and cell.

        The first line of the file is a header. If it is exactly
        'cluster_id,read_id,read_type' the data lines are split on
        commas, otherwise on whitespace. Every data line must yield
        three fields (cluster ID, read name, read type), and the read
        name must have the form cell/ZMW/coords.

        filename: path of the cluster report to read
        '''

        logger.debug('reading clusters from %s' % filename)

        self.filename = filename
        self.clusters = dict()       # clusters[clusterID][FL][cellNo] = list of 'ZMW|coords' names
        self.cells    = dict()       # key=cell long name  value=cell number
        self.numCells = 0
        self.numClusters = 0         # bumped once per data line, i.e. once per read

        regexCCS = re.compile ('_CCS$')

        # The with-block makes sure the file is closed even if a
        # malformed line raises in the middle of the loop.
        with open (filename, 'r') as handle:

            header = handle.readline().strip()            # get header line

            newStyle = header == 'cluster_id,read_id,read_type'       # is field separator space (old) or comma (new)?

            for line in handle:

                self.numClusters += 1

                if newStyle:
                    clusterID, readName, FL = line.strip().split(',')
                else:
                    clusterID, readName, FL = line.strip().split()

                cell, ZMW, coords = readName.split('/')
                coords = re.sub (regexCCS, '', coords)          # get rid of '_CCS' at end of read range
                shortName = ZMW + '|' + coords

                if cell not in self.cells:                      # have we seen this cell before?
                    self.numCells += 1
                    self.cells[cell] = self.numCells            # if not, give it a number
                cellNo = self.cells[cell]

                clusterEnt = self.clusters.setdefault(clusterID, {}).setdefault(FL, {}).setdefault(cellNo, [])
                clusterEnt.append (shortName)

        logger.debug('read %d reads in %d clusters from %d cells' % (self.numClusters, len(self.clusters), self.numCells))
Example #48
0
    def fillZMWIndexes(self):
        '''Build the per-hole lookup tables _baxByHole, _ZMWIndex and _basecallIndex.

        All three lists are indexed by hole number and have
        _maxZMW + 1 slots, initially None:

          _baxByHole[hole]     the bax object whose file contains the hole
          _ZMWIndex[hole]      position of the hole within that bax
                               file's PulseData/BaseCalls/ZMW datasets
          _basecallIndex[hole] offset of the hole's first entry in that
                               file's PulseData/BaseCalls datasets (each
                               hole owns NumEvent consecutive entries)

        The MultiPart/HoleLookup dataset of the bas.h5 file is never
        consulted; the hole-to-bax mapping comes from walking the hole
        numbers each bax file reports.
        '''

        tableSize = self._maxZMW + 1
        self._baxByHole = [None] * tableSize
        self._ZMWIndex = [None] * tableSize
        self._basecallIndex = [None] * tableSize

        for bf in self._baxfile:

            numEvent = bf._ZMW["NumEvent"]
            firstEvent = 0  # running offset into the BaseCalls datasets
            numHoles = 0  # stays 0 if the file reports no holes

            for numHoles, hole in enumerate(bf.holeNumbers(), 1):

                slot = numHoles - 1  # zero-based position within this file

                self._baxByHole[hole] = bf
                self._ZMWIndex[hole] = slot
                self._basecallIndex[hole] = firstEvent
                firstEvent += numEvent[slot]

            logger.debug("%s processed %d ZMWs" % (bf.shortName(), numHoles))
            logger.debug("%s processed %d basecalls" %
                         (bf.shortName(), firstEvent))
Example #49
0
    def submitScript(self):
        '''Submit self.scriptName with qsub and return the job number.

        The job number is group 1 of Chunk.JOBNO_PATTERN matched
        against qsub's output; it is also saved in self.jobno.

        Raises RuntimeError if the qsub pipe closes with a status, or
        if qsub's output does not match Chunk.JOBNO_PATTERN.
        '''

        # Dependent job submission will fail if parent has already
        # completed. So delay all job startups by a short amount of time.

        startAt = datetime.datetime.now() + datetime.timedelta(0, STARTWAIT)
        startAtStr = startAt.strftime('%Y%m%d%H%M.%S')

        cmd = [
            'qsub',
            '-N %s' % self.jobName,  # job name
            '-o %s' % self.scriptOutput,  # output file
            '-j oe',  # combine stdout and stderr
            '-l nodes=1:ppn=1,walltime=4:00:00',  # resources required
            '-a %s' % startAtStr,  # delay start, see above
            '-d . ',  # working directory (strangely, ./ is not the default)
            '-r n',  # do NOT attempt to restart on failure
            '-V',  # export all environment variables to job
            '-W umask=0002',  # make logs rw-rw-r--
            '-m n',  # don't send any mail
            self.scriptName,  # script to run
        ]

        command = ' '.join(cmd)
        logger.debug('running %s' % command)

        popen_file = os.popen(command)
        response = popen_file.read().strip()
        rc = popen_file.close()
        if rc is not None:
            logger.error('command failed, rc=%d' % rc)
            raise RuntimeError('command failed, rc=%d' % rc)

        match = re.match(Chunk.JOBNO_PATTERN, response)
        if match is None:
            # Report what qsub actually printed. The old code formatted
            # an undefined name here and died with NameError instead.
            logger.error("invalid job sequence number: %s" % response)
            raise RuntimeError("invalid job sequence number: %s" % response)

        response = match.group(1)
        logger.debug('jobno is %s' % response)
        self.jobno = response
        return response
Example #50
0
    def submitScript (self):
        '''Submit self.scriptName with qsub and return the job number.

        The job number is group 1 of Chunk.JOBNO_PATTERN matched
        against qsub's output; it is also saved in self.jobno.

        Raises RuntimeError if the qsub pipe closes with a status, or
        if qsub's output does not match Chunk.JOBNO_PATTERN.
        '''

        # Dependent job submission will fail if parent has already
        # completed. So delay all job startups by a short amount of time.

        startAt = datetime.datetime.now() + datetime.timedelta(0, STARTWAIT)
        startAtStr = startAt.strftime('%Y%m%d%H%M.%S')

        cmd = list()
        cmd.append('qsub')
        cmd.append('-N %s' % self.jobName)           # job name
        cmd.append('-o %s' % self.scriptOutput)      # output file
        cmd.append('-j oe')               # combine stdout and stderr
        cmd.append('-l nodes=1:ppn=1,walltime=4:00:00')    # resources required
        cmd.append('-a %s' % startAtStr)  # delay start, see above
        cmd.append('-d . ')               # working directory (strangely, ./ is not the default)
        cmd.append('-r n')                # do NOT attempt to restart on failure
        cmd.append('-V')                  # export all environment variables to job
        cmd.append('-W umask=0002')       # make logs rw-rw-r--
        cmd.append('-m n')                # don't send any mail
        cmd.append(self.scriptName)       # script to run

        command = ' '.join(cmd)
        logger.debug ('running %s' % command)

        popen_file = os.popen(command)
        response = popen_file.read().strip()
        rc = popen_file.close()           # None means the command reported no failure status
        if rc is not None:
            logger.error('command failed, rc=%d' % rc)
            raise RuntimeError('command failed, rc=%d' % rc)

        match = re.match (Chunk.JOBNO_PATTERN, response)
        if match is None:
            # Log the text qsub returned; the name previously used in
            # this message was never defined, so this path raised
            # NameError rather than the intended RuntimeError.
            logger.error("invalid job sequence number: %s" % response)
            raise RuntimeError("invalid job sequence number: %s" % response)

        response = match.group(1)
        logger.debug ('jobno is %s' % response)
        self.jobno = response
        return response
Example #51
0
    def fillRegionIndexes(self):
        '''Build the per-hole lookup tables _regionIndex and _HQIndex.

        Both lists are indexed by hole number and have _maxZMW + 1
        slots, initially None:

          _regionIndex[hole]  row of the hole's first entry in the
                              regions table of the bax file it lives in
          _HQIndex[hole]      row of the hole's HQ-region entry (type 2)
                              in that same table; if a hole had several,
                              the last one seen wins

        Raises RuntimeError if the regions table names a hole outside
        0.._maxZMW.
        '''

        tableSize = self._maxZMW + 1
        self._regionIndex = [None] * tableSize
        self._HQIndex = [None] * tableSize

        for bf in self._baxfile:

            numRegions = 0  # stays 0 for an empty regions table
            prevHole = -1  # init to non-matching value

            # Slice the hole and type columns out once and walk that;
            # iterating the table row by row and slicing each row was
            # *very* slow.
            holeAndType = bf._regions[:, 0:2]

            for row, (hole, regionType) in enumerate(holeAndType):

                if hole != prevHole:  # start of new hole?

                    # sanity check hole# from region table
                    if hole < 0 or hole > self._maxZMW:
                        raise RuntimeError("hole number %d out of range" %
                                           (hole))

                    self._regionIndex[hole] = row
                    prevHole = hole

                if regionType == 2:  # HQ region for this hole?
                    self._HQIndex[hole] = row

                numRegions = row + 1

            logger.debug("%s processed %d regions" %
                         (bf.shortName(), numRegions))
Example #52
0
    def fillZMWIndexes(self):
        """Fill in _baxByHole, _ZMWIndex and _basecallIndex, one slot per hole number.

        For every hole reported by a bax file:

          _baxByHole[hole]     is that bax object
          _ZMWIndex[hole]      is the hole's position in the file's
                               PulseData/BaseCalls/ZMW datasets
          _basecallIndex[hole] is the offset of the hole's first entry
                               in the file's PulseData/BaseCalls
                               datasets, where each hole occupies
                               NumEvent consecutive entries

        Slots for holes that no bax file reports stay None. The
        MultiPart/HoleLookup dataset in the bas.h5 file is deliberately
        not used; the mapping is derived from the hole numbers found in
        each bax file.
        """

        def emptyTable():
            # one slot per possible hole number, 0.._maxZMW
            return [None] * (self._maxZMW + 1)

        self._baxByHole = emptyTable()
        self._ZMWIndex = emptyTable()
        self._basecallIndex = emptyTable()

        for bf in self._baxfile:

            holesDone = 0
            nextEvent = 0
            numEvent = bf._ZMW["NumEvent"]

            for zmwIx, hole in enumerate(bf.holeNumbers()):

                self._baxByHole[hole] = bf
                self._ZMWIndex[hole] = zmwIx
                self._basecallIndex[hole] = nextEvent

                nextEvent += numEvent[zmwIx]  # skip over this hole's basecalls
                holesDone = zmwIx + 1

            logger.debug("%s processed %d ZMWs" % (bf.shortName(), holesDone))
            logger.debug("%s processed %d basecalls" % (bf.shortName(), nextEvent))
Example #53
0
def main ():
    '''Print the sequence of one ZMW from a bas file in fasta format.

    Positional arguments: the bas file name, then the ZMW (hole)
    number. opt.start and opt.end bound the part of the read to print,
    opt.reverse selects the reverse complement, and opt.flen is the
    number of bases per output line.

    Exits with status 1 if the hole number is not an integer.
    '''

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    basfile = H5BasFile.BasFile (basFilename)

    try:
        hole = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)                      # non-zero, so callers can see that we failed

    if not opt.reverse:
        sequence = basfile.getSequence(hole, opt.start, opt.end)    # end==None gets the whole read
    else:
        sequence = basfile.getRevCompSequence(hole, opt.start, opt.end)

    movie = basfile.movieName()
    length = len(sequence)

    # Single-argument print in parentheses parses and behaves the same
    # as a Python 2 statement and as a Python 3 function call.
    print (">%s/%d/%d_%d" % (movie, hole, opt.start, opt.start+length))

    for ix in xrange(0,length,opt.flen):
        print (sequence[ix:ix+opt.flen])

    logger.debug("complete")
Example #54
0
    def getReadGroups (self):
        '''Return the read groups listed in /AlnGroup/Path, building the table on first use.

        The result is a dict cached in self._readGroups. Each key is an
        entry of AlnGroup/ID and each value is the h5 object found at
        the matching AlnGroup/Path entry (the group holding the
        AlnArray and other datasets for that read group). See the
        comments in getAlignmentAsDict for more on how it is used.
        '''

        if self._readGroups is not None:
            return self._readGroups

        logger.debug("creating ReadGroup list")

        # Publish the dict before filling it, so the cache attribute
        # and the local name always refer to the same object.
        groups = self._readGroups = dict()

        movie = self._movieName          # only of interest to the disabled filter noted below
        paths = self._top['AlnGroup/Path']
        ids   = self._top['AlnGroup/ID']

        # Every path is kept. An earlier version kept only the paths
        # ending with the movie name; that filter is switched off.
        for ix in xrange(len(paths)):
            logger.debug("path %d, ID %d: %s" % (ix, ids[ix], paths[ix]))
            groups[ids[ix]] = self._top[paths[ix]]

        logger.debug("kept %d of %d ReadGroups for this movie" % (len(groups), len(paths)))

        return groups
Example #55
0
    def __init__(self, filename):
        '''Create a dict, keyed by chr, from a fasta file.

        self.ref maps each chromosome name (group 1 of regexChr
        matched against the '>' header line) to its sequence, and
        self.chrList holds the names in file order.

        filename: path of the fasta file to read

        Raises RuntimeError if a chromosome name is already present in
        self.ref when its header is read.
        '''

        self.filename = filename
        self.ref = dict()  # this is the stuff! key=chr value=sequence
        self.chrList = list()  # preserve original order of chromosomes

        logger.debug('reading reference fasta file %s' % self.filename)

        # Growing a string by appending fasta lines one at a time is
        # very slow, because the string built so far is copied over
        # and over. Instead, accumulate the individual lines in a list
        # and join them once the whole record has been read. That
        # turns out to be *much* faster.

        accum = list()

        # 'with' closes the file even if a bad header or a duplicate
        # chromosome raises part-way through.
        with open(self.filename, 'r') as handle:

            for line in handle:

                line = line.strip()

                if line.startswith('>'):

                    # NOTE(review): a record with a header but no
                    # sequence lines is never stored in self.ref,
                    # although its name is in self.chrList.
                    if len(accum) > 0:
                        self.ref[chrom] = ''.join(accum)  # see comment block above
                        accum = list()

                    # named chrom, not chr, to leave the builtin alone
                    chrom = re.match(regexChr, line).group(1)
                    if chrom in self.ref:
                        raise RuntimeError('duplicate chromsome %s' % chrom)
                    self.chrList.append(chrom)

                else:
                    accum.append(line)

        if len(accum) > 0:  # last one?
            self.ref[chrom] = ''.join(accum)
Example #56
0
def main():
    '''Print ranges of the read for one ZMW from a bas file.

    Positional arguments: the bas file name, then the ZMW (hole)
    number. With opt.subreads set, printRange is called once for each
    of the hole's regions whose type is 1; otherwise it is called once
    with opt.start and opt.end.

    Exits with status 1 if the hole number is not an integer.
    '''

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    bf = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)  # non-zero, so callers can see that we failed

    if opt.subreads:

        for region in bf.holeRegions(hole):
            # every region entry must unpack into exactly these five fields
            regionHole, regionType, start, end, score = region
            if regionType == 1:  # a subread?
                printRange(bf, hole, opt, start, end)

    else:
        printRange(bf, hole, opt, opt.start, opt.end)

    logger.debug("complete")
Example #57
0
    def findCCSFile(self):
        '''Locate the consensus (CCS) datasets for this bax file.

        Sets _hasConsensus, and when consensus data is found also
        _consBasecalls, _consZMW and _consPasses. The data is taken
        from the bax file itself if it has a
        PulseData/ConsensusBaseCalls group; otherwise, if _CCSDir is
        set, from the file in that directory whose name is this file's
        base name with 'bax' replaced by 'ccs' (opened as _CCSFile).
        '''

        def adopt(container):
            # Point the consensus attributes at the datasets inside container.
            self._consBasecalls = container["PulseData/ConsensusBaseCalls"]
            self._consZMW = container["PulseData/ConsensusBaseCalls/ZMW"]
            self._consPasses = container["PulseData/ConsensusBaseCalls/Passes"]
            self._hasConsensus = True

        self._hasConsensus = False  # until proven otherwise

        if "PulseData/ConsensusBaseCalls" in self._top:

            # an older bax file: it contains its own CCS data
            adopt(self._top)

        elif self._CCSDir is not None:

            baxName = os.path.basename(self._filename)
            fqCCSFilename = os.path.join(self._CCSDir,
                                         baxName.replace('bax', 'ccs'))

            if not os.path.exists(fqCCSFilename):
                logger.warning('%s: no CCS file found corresponding to %s' %
                               (self._shortName, self._filename))

            else:
                self._CCSFile = h5py.File(fqCCSFilename, 'r')
                adopt(self._CCSFile)
                logger.debug('BaxFile %s found CCS file %s' %
                             (self._shortName, fqCCSFilename))

        else:
            logger.info(
                'BaxFile %s does not contain CCS data (rel 2.1.0 and later). Use --ccs'
                % self._shortName)