def basecallIndex(self):
    '''Create and cache a table by hole number of starting indexes into PulseData/BaseCalls.

    Entry ix of the returned list is the running sum of NumEvent over
    the ZMWs that precede ix, i.e. the offset of the first basecall of
    hole ix.

    Raises RuntimeError if HoleNumber[ix] != ix for some ix. The table
    is cached only once it has been built completely, so a failed call
    cannot leave a half-filled table behind for later callers.
    '''
    if self._basecallIndex is None:

        logger.debug("creating basecall index")

        numEvent = self._ZMW["NumEvent"]
        holeNumber = self._ZMW["HoleNumber"]
        numZ = self.numZMWs()

        index = 0
        table = [0] * numZ

        # The loop below includes a check that the HoleNumbers run
        # from 0 to N-1. I.e., HoleNumber[ix] == ix. Otherwise,
        # there is no way to get from the hole number of a region
        # back to the ZMW entry (other than searching for it).
        for ix in range(numZ):
            if holeNumber[ix] != ix:
                raise RuntimeError("Hole number != index at %d" % ix)
            table[ix] = index
            index += numEvent[ix]

        self._basecallIndex = table      # publish only the finished table
        logger.debug("processed %d basecalls" % index)

    return self._basecallIndex
def __init__(self, name):
    '''Open the file called name for writing and start with an empty lastPos table.'''
    self.name = name
    self.handle = open(name, 'w')     # opened in write mode; not closed here
    self.lastPos = {}
    logger.debug('opened %s' % name)
def __init__(self, name):
    '''Open the file called name for writing and start with an empty lastPos table.'''
    self.name = name
    self.handle = open(name, 'w')     # opened in write mode; not closed here
    self.lastPos = {}
    logger.debug('opened %s' % name)
def cellCoords(self):
    '''Find and cache the minimum and maximum X/Y coordinates on the SMRTcell.

    Returns the tuple (minX, maxX, minY, maxY) taken over the first
    numZMWs() rows of the HoleXY dataset (column 0 is X, column 1 is Y).
    The result is computed on the first call and reused afterwards.
    With no rows to examine, min() raises ValueError.
    '''
    if self._coords is None:

        logger.debug("finding SMRTcell coordinates")

        numZ = self.numZMWs()

        # Fetch the rows with a single slice rather than indexing the
        # dataset once per ZMW.
        holeXY = self._ZMW["HoleXY"][:numZ, :]
        xs = holeXY[:, 0]
        ys = holeXY[:, 1]

        self._coords = (min(xs), max(xs), min(ys), max(ys))

        logger.debug("SMRTcell is (%d,%d,%d,%d)" % self._coords)

    return self._coords
def cellCoords(self):
    """Find and cache the minimum and maximum X/Y coordinates on the SMRTcell.

    Returns the tuple (minX, maxX, minY, maxY) over the HoleXY datasets
    of every bax file (column 0 is X, column 1 is Y). The extremes are
    taken from the data only; they are not seeded with 0, so a cell
    whose coordinates are all positive (or all negative) is reported
    correctly. If there are no bax files the result is (0, 0, 0, 0).
    """
    if self._coords is None:

        logger.debug("finding SMRTcell coordinates")

        xLows, xHighs, yLows, yHighs = [], [], [], []

        for bf in self._baxfile:
            holeXY = bf._ZMW["HoleXY"]
            xs = holeXY[:, 0]           # each column is sliced once and reused
            ys = holeXY[:, 1]
            xLows.append(min(xs))
            xHighs.append(max(xs))
            yLows.append(min(ys))
            yHighs.append(max(ys))

        if xLows:
            self._coords = (min(xLows), max(xHighs), min(yLows), max(yHighs))
        else:
            self._coords = (0, 0, 0, 0)

        logger.debug("SMRTcell is (%d,%d,%d,%d)" % self._coords)

    return self._coords
def _fillRegionTables(self):
    '''Create and cache tables by hole number of starting and HQ indexes into PulseData/Regions.

    _regionIndex[hole] becomes the row of the first region entry seen
    for that hole, and _HQIndex[hole] the row of its entry with region
    type 2 (HQ). Both tables have numZMWs() entries, default to 0, and
    are stored only after the whole region table has been scanned.

    Returns the region index table.
    '''
    if self._regionIndex is None:

        logger.debug("creating region index")

        numZ = self.numZMWs()
        regionIndex = [0] * numZ
        hqIndex = [0] * numZ

        count = 0
        lastHole = -1                    # init to non-matching value

        for index, line in enumerate(self._regions):
            hole, regionType = line[0:2]
            if hole != lastHole:         # start of new hole?
                regionIndex[hole] = index
                lastHole = hole
            if regionType == 2:          # HQ region for this hole?
                hqIndex[hole] = index
            count = index + 1

        # Publish both tables together, only when complete.
        self._regionIndex = regionIndex
        self._HQIndex = hqIndex

        logger.debug("processed %d regions" % count)

    return self._regionIndex
def __init__(self, filename, CCSDir=None):
    """Open one bax.h5 file read-only and keep handles to its main groups.

    filename -- path of the HDF5 file to open.
    CCSDir   -- optional directory that findCCSFile() searches for the
                matching ccs.h5 file.

    Each new object bumps the class-wide BaxFile.fileNum counter and
    uses it to build a short name ("bax-N") for log messages.
    """
    BaxFile.fileNum += 1
    self._shortName = "bax-%d" % BaxFile.fileNum
    logger.debug("creating BaxFile object for %s = %s" % (self._shortName, filename))

    self._filename = filename
    self._CCSDir = CCSDir
    self._infile = h5py.File(filename, "r")
    self._top = self._infile                     # h5py 2.0.1 change!

    self._pulsedata = self._top["PulseData"]
    self._basecalls = self._top["PulseData/BaseCalls"]
    self._ZMW = self._top["PulseData/BaseCalls/ZMW"]
    self._regions = self._top["PulseData/Regions"]
    self._productivity = self._top["PulseData/BaseCalls/ZMWMetrics/Productivity"]

    self._movieName = self._top["ScanData/RunInfo"].attrs["MovieName"]
    self._holeStatus = self._ZMW["HoleStatus"]
    self._numRegions = self._regions.shape[0]
    self._PreBaseFrames = self._basecalls["PreBaseFrames"]
    self._WidthInFrames = self._basecalls["WidthInFrames"]

    # Builtin max() over the dataset object walks it one element at a
    # time, which is what made this step slow. Read the hole numbers
    # into memory with one slice and reduce the in-memory array.
    self._maxZMW = self._ZMW["HoleNumber"][...].max()

    self._sanityChecked = False

    self.findCCSFile()
def findRegions(tranList):
    '''Number the covered regions between exon breakpoints and tag each transcript with them.

    Every exon contributes a start and an end breakpoint. Walking the
    breakpoints in position order, whenever a breakpoint lies more than
    MIN_REGION_SIZE past the current position and at least one
    transcript is open, the current region number is added to the
    .regions set of every open transcript and the number is advanced.

    An empty tranList (or transcripts without exons) is logged as zero
    regions. Returns None; the results are the updated .regions sets.
    '''
    # Find breakpoints where coverage by exons changes.
    # Why are we doing this? See the note in the Transcript class
    # definition below.

    EXON_START, EXON_END = 0, 1

    breaks = []
    for tranIx, tran in enumerate(tranList):
        for exon in tran.exons:
            breaks.append((exon.start, EXON_START, tranIx))
            breaks.append((exon.end, EXON_END, tranIx))

    region = 0

    if breaks:

        # Sort on position only; the sort is stable, so breakpoints that
        # share a position keep the order in which they were collected.
        breaks.sort(key=lambda brk: brk[0])

        curPos = breaks[0][0]
        curTranSet = set()

        for posit, flag, tranIx in breaks:

            if posit > curPos + MIN_REGION_SIZE:         # this is a new region
                if curTranSet:
                    for openIx in curTranSet:
                        tranList[openIx].regions.add(region)   # update set of regions hit by this transcript
                    region += 1
                curPos = posit

            if flag == EXON_START:
                curTranSet.add(tranIx)
            else:
                curTranSet.remove(tranIx)

    logger.debug('found %d regions' % region)

    return
def fillConsensusIndexes(self):
    """Compute _consPassIndex and _consensusIndex. Called only when those arrays are needed.

    Both tables have _maxZMW + 1 slots, start out as None, and are
    indexed by hole number. For every bax file the holes are visited in
    holeNumbers() order; each hole receives the running totals of
    NumPasses and NumEvent accumulated before it within that file.
    """
    tableSize = self._maxZMW + 1
    self._consPassIndex = [None] * tableSize
    self._consensusIndex = [None] * tableSize

    for bf in self._baxfile:

        bf.ZMWSanityClause()             # sanity check the consensus datasets

        passCounts = bf._consPasses["NumPasses"]
        eventCounts = bf._consZMW["NumEvent"]
        passOffset = 0
        consOffset = 0

        for row, hole in enumerate(bf.holeNumbers()):
            self._consPassIndex[hole] = passOffset
            passOffset += passCounts[row]
            self._consensusIndex[hole] = consOffset
            consOffset += eventCounts[row]

        logger.debug("%s processed %d consensus passes" % (bf.shortName(), passOffset))
        logger.debug("%s processed %d consensus basecalls" % (bf.shortName(), consOffset))
def __init__(self, filename, CCSDir=None):
    '''Open one bax.h5 file read-only and keep handles to its main groups.

    filename -- path of the HDF5 file to open.
    CCSDir   -- optional directory that findCCSFile() searches for the
                matching ccs.h5 file.

    Each new object bumps the class-wide BaxFile.fileNum counter and
    uses it to build a short name ('bax-N') for log messages.
    '''
    BaxFile.fileNum += 1
    self._shortName = 'bax-%d' % BaxFile.fileNum
    logger.debug("creating BaxFile object for %s = %s" % (self._shortName, filename))

    self._filename = filename
    self._CCSDir = CCSDir
    self._infile = h5py.File(filename, 'r')
    self._top = self._infile                     # h5py 2.0.1 change!

    self._pulsedata = self._top["PulseData"]
    self._basecalls = self._top["PulseData/BaseCalls"]
    self._ZMW = self._top["PulseData/BaseCalls/ZMW"]
    self._regions = self._top["PulseData/Regions"]
    self._productivity = self._top["PulseData/BaseCalls/ZMWMetrics/Productivity"]

    self._movieName = self._top["ScanData/RunInfo"].attrs["MovieName"]
    self._holeStatus = self._ZMW["HoleStatus"]
    self._numRegions = self._regions.shape[0]
    self._PreBaseFrames = self._basecalls["PreBaseFrames"]
    self._WidthInFrames = self._basecalls["WidthInFrames"]

    # Builtin max() over the dataset object walks it one element at a
    # time, which is what made this step slow. Read the hole numbers
    # into memory with one slice and reduce the in-memory array.
    self._maxZMW = self._ZMW["HoleNumber"][...].max()

    self._sanityChecked = False

    self.findCCSFile()
def __init__(self, filename, CCSDir=None):
    """Open a bas.h5 file and build the lookup tables that span its bax files.

    A file without a "MultiPart" entry is treated as an old-style bas
    file and wrapped in a single BaxFile. Otherwise each name listed in
    MultiPart/Parts is resolved relative to the directory of the bas
    file and opened as its own BaxFile, with CCSDir passed along.
    """
    logger.debug("creating BasFile object")

    self._filename = filename
    self._CCSDir = CCSDir
    self._infile = h5py.File(filename, "r")
    self._top = self._infile

    self._baxfile = []
    self._coords = None
    self._consensusIndex = None
    self._consPassIndex = None

    if "MultiPart" in self._top:
        # The bas file is an index to a set of bax files.
        h5Dir = os.path.dirname(os.path.abspath(self._filename))
        for baxfileName in self._top["MultiPart/Parts"]:
            fqBaxfileName = os.path.join(h5Dir, baxfileName)     # fq = fully qualified
            self._baxfile.append(BaxFile(fqBaxfileName, CCSDir=CCSDir))
    else:
        # Old-style bas file: the only part is this file, which will
        # contain its own CCS data.
        self._baxfile.append(BaxFile(filename))

    self.fillCombinedFields()        # need to compute this first, we'll need it later
    self.fillZMWIndexes()
    self.fillMovieName()
    self.fillRegionIndexes()
def cellCoords(self):
    '''Find and cache the minimum and maximum X/Y coordinates on the SMRTcell.

    Returns the tuple (minX, maxX, minY, maxY) over the HoleXY datasets
    of every bax file (column 0 is X, column 1 is Y). The extremes are
    taken from the data only; they are not seeded with 0, so a cell
    whose coordinates are all positive (or all negative) is reported
    correctly. If there are no bax files the result is (0, 0, 0, 0).
    '''
    if self._coords is None:

        logger.debug("finding SMRTcell coordinates")

        xLows, xHighs, yLows, yHighs = [], [], [], []

        for bf in self._baxfile:
            holeXY = bf._ZMW["HoleXY"]
            xs = holeXY[:, 0]           # each column is sliced once and reused
            ys = holeXY[:, 1]
            xLows.append(min(xs))
            xHighs.append(max(xs))
            yLows.append(min(ys))
            yHighs.append(max(ys))

        if xLows:
            self._coords = (min(xLows), max(xHighs), min(yLows), max(yHighs))
        else:
            self._coords = (0, 0, 0, 0)

        logger.debug("SMRTcell is (%d,%d,%d,%d)" % self._coords)

    return self._coords
def fillConsensusIndexes(self):
    '''Compute _consPassIndex and _consensusIndex. Called only when those arrays are needed.

    Both tables have _maxZMW + 1 slots, start out as None, and are
    indexed by hole number. For every bax file the holes are visited in
    holeNumbers() order; each hole receives the running totals of
    NumPasses and NumEvent accumulated before it within that file.
    '''
    tableSize = self._maxZMW + 1
    self._consPassIndex = [None] * tableSize
    self._consensusIndex = [None] * tableSize

    for bf in self._baxfile:

        bf.ZMWSanityClause()             # sanity check the consensus datasets

        passCounts = bf._consPasses["NumPasses"]
        eventCounts = bf._consZMW["NumEvent"]
        passOffset = 0
        consOffset = 0

        for row, hole in enumerate(bf.holeNumbers()):
            self._consPassIndex[hole] = passOffset
            passOffset += passCounts[row]
            self._consensusIndex[hole] = consOffset
            consOffset += eventCounts[row]

        logger.debug("%s processed %d consensus passes" % (bf.shortName(), passOffset))
        logger.debug("%s processed %d consensus basecalls" % (bf.shortName(), consOffset))
def main():
    '''Print a per-base table of basecall fields for one ZMW.

    sys.argv[1] is the bas file name and sys.argv[2] the hole number.
    One row is printed per basecall; the last two columns are
    PreBaseFrames and WidthInFrames divided by H5BasFile.frameRate.
    '''
    logger.debug("%s starting" % sys.argv[0])

    basfileName = sys.argv[1]
    hole = int(sys.argv[2])

    bf = H5BasFile.BasFile(basfileName)

    def field(name):
        # Fetch one basecall field for the requested hole.
        return bf.getBasecallField(name, hole)

    call = field("Basecall")
    delete = field("DeletionQV")
    wuzzit = field("DeletionTag")
    insert = field("InsertionQV")
    prebase = field("PreBaseFrames")
    subst = field("SubstitutionQV")
    couldbe = field("SubstitutionTag")
    qual = field("QualityValue")
    duration = field("WidthInFrames")

    print("Index Call Qd Del Qi Qs Sub Q DelT Dur")
    print("")

    rowFormat = "%5d %4s %3d %3s %3d %3d %3s %3d %4d %3d %6.3f %5.3f"

    for ix in range(len(call)):
        row = (ix,
               chr(call[ix]),
               delete[ix],
               chr(wuzzit[ix]),
               insert[ix],
               subst[ix],
               chr(couldbe[ix]),
               qual[ix],
               prebase[ix],
               duration[ix],
               float(prebase[ix])/H5BasFile.frameRate,
               float(duration[ix])/H5BasFile.frameRate)
        print(rowFormat % row)

    logger.debug("complete")
def findCCSFile(self):
    """Locate the consensus (CCS) datasets that belong to this bax file.

    The bax file itself is used when it contains
    PulseData/ConsensusBaseCalls. Otherwise, if a CCS directory was
    given, a file named like this one with "bax" replaced by "ccs" is
    looked up there. _hasConsensus is set to True only when the
    consensus groups were found; a warning or info message is logged
    when they were not.
    """
    self._hasConsensus = False           # until proven otherwise

    CCSPath = None                       # set only when a separate ccs file is used

    if "PulseData/ConsensusBaseCalls" in self._top:
        # An older bax file, it contains its own CCS data.
        source = self._top
    elif self._CCSDir is None:
        logger.info("BaxFile %s does not contain CCS data (rel 2.1.0 and later). Use --ccs" % self._shortName)
        return
    else:
        CCSFilename = os.path.basename(self._filename).replace("bax", "ccs")
        CCSPath = os.path.join(self._CCSDir, CCSFilename)
        if not os.path.exists(CCSPath):
            logger.warning("%s: no CCS file found corresponding to %s" % (self._shortName, self._filename))
            return
        self._CCSFile = h5py.File(CCSPath, "r")
        source = self._CCSFile

    self._consBasecalls = source["PulseData/ConsensusBaseCalls"]
    self._consZMW = source["PulseData/ConsensusBaseCalls/ZMW"]
    self._consPasses = source["PulseData/ConsensusBaseCalls/Passes"]
    self._hasConsensus = True

    if CCSPath is not None:
        logger.debug("BaxFile %s found CCS file %s" % (self._shortName, CCSPath))
def findRegions(tranList):
    '''Number the covered regions between exon breakpoints and tag each transcript with them.

    Every exon contributes a start and an end breakpoint. Walking the
    breakpoints in position order, whenever a breakpoint lies more than
    MIN_REGION_SIZE past the current position and at least one
    transcript is open, the current region number is added to the
    .regions set of every open transcript and the number is advanced.

    An empty tranList (or transcripts without exons) is logged as zero
    regions. Returns None; the results are the updated .regions sets.
    '''
    # Find breakpoints where coverage by exons changes.
    # Why are we doing this? See the note in the Transcript class
    # definition below.

    EXON_START, EXON_END = 0, 1

    breaks = []
    for tranIx, tran in enumerate(tranList):
        for exon in tran.exons:
            breaks.append((exon.start, EXON_START, tranIx))
            breaks.append((exon.end, EXON_END, tranIx))

    region = 0

    if breaks:

        # Sort on position only; the sort is stable, so breakpoints that
        # share a position keep the order in which they were collected.
        breaks.sort(key=lambda brk: brk[0])

        curPos = breaks[0][0]
        curTranSet = set()

        for posit, flag, tranIx in breaks:

            if posit > curPos + MIN_REGION_SIZE:         # this is a new region
                if curTranSet:
                    for openIx in curTranSet:
                        tranList[openIx].regions.add(region)   # update set of regions hit by this transcript
                    region += 1
                curPos = posit

            if flag == EXON_START:
                curTranSet.add(tranIx)
            else:
                curTranSet.remove(tranIx)

    logger.debug('found %d regions' % region)

    return
def __init__(self, filename, CCSDir=None):
    '''Open a bas.h5 file and build the lookup tables that span its bax files.

    A file without a 'MultiPart' entry is treated as an old-style bas
    file and wrapped in a single BaxFile. Otherwise each name listed in
    MultiPart/Parts is resolved relative to the directory of the bas
    file and opened as its own BaxFile, with CCSDir passed along.
    '''
    logger.debug('creating BasFile object')

    self._filename = filename
    self._CCSDir = CCSDir
    self._infile = h5py.File(filename, 'r')
    self._top = self._infile

    self._baxfile = []
    self._coords = None
    self._consensusIndex = None
    self._consPassIndex = None

    if 'MultiPart' in self._top:
        # The bas file is an index to a set of bax files.
        h5Dir = os.path.dirname(os.path.abspath(self._filename))
        for baxfileName in self._top['MultiPart/Parts']:
            fqBaxfileName = os.path.join(h5Dir, baxfileName)     # fq = fully qualified
            self._baxfile.append(BaxFile(fqBaxfileName, CCSDir=CCSDir))
    else:
        # Old-style bas file: the only part is this file, which will
        # contain its own CCS data.
        self._baxfile.append(BaxFile(filename))

    self.fillCombinedFields()        # need to compute this first, we'll need it later
    self.fillZMWIndexes()
    self.fillMovieName()
    self.fillRegionIndexes()
def toPickle(self, filename):
    '''Write this object to filename as a pickle, using the highest protocol.

    HIGHEST_PROTOCOL is a binary format, so the file is opened in
    binary mode. The with-block closes it even if pickling fails.
    '''
    with open(filename, 'wb') as pickHandle:
        pickle.Pickler(pickHandle, pickle.HIGHEST_PROTOCOL).dump(self)

    logger.debug('wrote annotation data to pickle file %s' % filename)

    return
def toPickle(self, filename):
    '''Write this object to filename as a pickle, using the highest protocol.

    HIGHEST_PROTOCOL is a binary format, so the file is opened in
    binary mode. The with-block closes it even if pickling fails.
    '''
    with open(filename, 'wb') as pickHandle:
        pickle.Pickler(pickHandle, pickle.HIGHEST_PROTOCOL).dump(self)

    logger.debug('wrote reference data to pickle file %s' % filename)

    return
def __init__(self, fileName):
    '''Open the HDF5 file fileName read-only and keep it as the top-level group.'''
    logger.debug("creating CmpFile object for %s" % (fileName))

    self._fileName = fileName
    self._infile = h5py.File(fileName, 'r')

    # The File object itself is used as the root group (h5py 2.0.1
    # change); an explicit h5py.Group on '/' was used here previously.
    self._top = self._infile
def fromPickle(filename):
    '''Create a ClusterDict object from a pickle file (alternative to __init__).

    The file is read in binary mode, as pickle data requires, and is
    closed even if unpickling fails. Returns the unpickled object.
    '''
    # NOTE: unpickling can run arbitrary code embedded in the file;
    # only load pickle files that come from a trusted source.
    with open(filename, 'rb') as handle:
        clusterDict = pickle.Unpickler(handle).load()

    logger.debug('read %d clusters in pickle format from %s' % (len(clusterDict), filename))

    return clusterDict
def fromPickle(filename):
    '''Create a Reference object from a pickle file (alternative to __init__).

    The file is read in binary mode, as pickle data requires, and is
    closed even if unpickling fails. Returns the unpickled object.
    '''
    logger.debug('reading reference in pickle format from %s' % filename)

    # NOTE: unpickling can run arbitrary code embedded in the file;
    # only load pickle files that come from a trusted source.
    with open(filename, 'rb') as handle:
        ref = pickle.Unpickler(handle).load()

    return ref
def fromPickle(filename):
    '''Create an AnnotationList object from a pickle file (alternative to __init__).

    The file is read in binary mode, as pickle data requires, and is
    closed even if unpickling fails. Returns the unpickled object.
    '''
    logger.debug('reading annotations in pickle format from %s' % filename)

    # NOTE: unpickling can run arbitrary code embedded in the file;
    # only load pickle files that come from a trusted source.
    with open(filename, 'rb') as handle:
        annotList = pickle.Unpickler(handle).load()

    return annotList
def fromPickle(filename):
    '''Create a Reference object from a pickle file (alternative to __init__).

    The file is read in binary mode, as pickle data requires, and is
    closed even if unpickling fails. Returns the unpickled object.
    '''
    logger.debug('reading reference in pickle format from %s' % filename)

    # NOTE: unpickling can run arbitrary code embedded in the file;
    # only load pickle files that come from a trusted source.
    with open(filename, 'rb') as handle:
        ref = pickle.Unpickler(handle).load()

    return ref
def fromPickle(filename):
    '''Create an AnnotationList object from a pickle file (alternative to __init__).

    The file is read in binary mode, as pickle data requires, and is
    closed even if unpickling fails. Returns the unpickled object.
    '''
    logger.debug('reading annotations in pickle format from %s' % filename)

    # NOTE: unpickling can run arbitrary code embedded in the file;
    # only load pickle files that come from a trusted source.
    with open(filename, 'rb') as handle:
        annotList = pickle.Unpickler(handle).load()

    return annotList
def plotPolyAs(tranList, blocks):
    '''Call plotA for the start of every polyAs entry on the exons of annotation transcripts.

    Transcripts whose .annot is false are skipped, as are exons that
    have no polyAs attribute. Each entry is a (start, end, howmany)
    triple; only start is passed on to plotA and logged.
    '''
    for tran in tranList:

        if not tran.annot:               # only annotations know about polyAs
            continue

        for exon in tran.exons:

            if not hasattr(exon, 'polyAs'):
                continue

            for tract in exon.polyAs:
                start, _end, _howmany = tract
                plotA(tran, start, blocks)
                logger.debug('%s: %9d' % (exon.name, start))

    return
def fromPickle(filename):
    '''Create a ClusterDict object from a pickle file (alternative to __init__).

    The file is read in binary mode, as pickle data requires, and is
    closed even if unpickling fails. Returns the unpickled object.
    '''
    # NOTE: unpickling can run arbitrary code embedded in the file;
    # only load pickle files that come from a trusted source.
    with open(filename, 'rb') as handle:
        clusterDict = pickle.Unpickler(handle).load()

    logger.debug('read %d clusters in pickle format from %s' % (len(clusterDict), filename))

    return clusterDict
def submitFinalJobs(opt, chunkList):
    '''Write and submit the final job that concatenates the per-chunk outputs.

    A bash script <opt.tmpdir>/trim_final.sh is written which cats the
    trimmed chunk files into opt.output and, when opt.report is set,
    the chunk reports into opt.report. The script is submitted with
    qsub, dependent (afterok) on the job numbers of all chunks.

    Returns the stripped output of qsub. Raises RuntimeError if the
    qsub command reports a non-zero status.
    '''
    scriptLines = [
        '#!/bin/bash\n\n',
        'set -o errexit\n',
        'set -o nounset\n\n',
        'cat \\\n',
    ]
    scriptLines.extend(['%s \\\n' % chk.trimmedChunkName for chk in chunkList])
    scriptLines.append(' > %s\n' % opt.output)

    if opt.report is not None:
        scriptLines.append('\ncat \\\n')
        scriptLines.extend(['%s \\\n' % chk.reportChunkName for chk in chunkList])
        scriptLines.append(' > %s\n' % opt.report)

    finalScriptName = '%s/trim_final.sh' % opt.tmpdir

    handle = open(finalScriptName, 'w')
    handle.writelines(scriptLines)
    handle.close()

    deps = ':'.join([chk.jobno for chk in chunkList])

    command = ' '.join([
        'qsub',
        '-N trim_final',                         # job name
        '-o trim_final.out',                     # output file
        '-j oe',                                 # combine stdout and stderr
        '-l nodes=1:ppn=1,walltime=4:00:00',     # resources required
        '-d . ',                                 # working directory (strangely, ./ is not the default)
        '-r n',                                  # do NOT attempt to restart on failure
        '-V',                                    # export all environment variables to job
        '-W umask=0002',                         # make logs rw-rw-r--
        '-m n',                                  # don't send any mail
        '-W depend=afterok:%s' % deps,           # wait for all chunk jobs
        finalScriptName,                         # script to run
    ])

    logger.debug('running %s' % command)

    pipe = os.popen(command)
    response = pipe.read().strip()
    rc = pipe.close()                            # None means the command succeeded

    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError

    logger.debug('jobno is %s' % response)

    return response
def submitFinalJobs(opt, chunkList):
    '''Write and submit the final job that concatenates the per-chunk outputs.

    A bash script <opt.tmpdir>/trim_final.sh is written which cats the
    trimmed chunk files into opt.output and, when opt.report is set,
    the chunk reports into opt.report. The script is submitted with
    qsub, dependent (afterok) on the job numbers of all chunks.

    Returns the stripped output of qsub. Raises RuntimeError if the
    qsub command reports a non-zero status.
    '''
    scriptLines = [
        '#!/bin/bash\n\n',
        'set -o errexit\n',
        'set -o nounset\n\n',
        'cat \\\n',
    ]
    scriptLines.extend(['%s \\\n' % chk.trimmedChunkName for chk in chunkList])
    scriptLines.append(' > %s\n' % opt.output)

    if opt.report is not None:
        scriptLines.append('\ncat \\\n')
        scriptLines.extend(['%s \\\n' % chk.reportChunkName for chk in chunkList])
        scriptLines.append(' > %s\n' % opt.report)

    finalScriptName = '%s/trim_final.sh' % opt.tmpdir

    handle = open(finalScriptName, 'w')
    handle.writelines(scriptLines)
    handle.close()

    deps = ':'.join([chk.jobno for chk in chunkList])

    command = ' '.join([
        'qsub',
        '-N trim_final',                         # job name
        '-o trim_final.out',                     # output file
        '-j oe',                                 # combine stdout and stderr
        '-l nodes=1:ppn=1,walltime=4:00:00',     # resources required
        '-d . ',                                 # working directory (strangely, ./ is not the default)
        '-r n',                                  # do NOT attempt to restart on failure
        '-V',                                    # export all environment variables to job
        '-W umask=0002',                         # make logs rw-rw-r--
        '-m n',                                  # don't send any mail
        '-W depend=afterok:%s' % deps,           # wait for all chunk jobs
        finalScriptName,                         # script to run
    ])

    logger.debug('running %s' % command)

    pipe = os.popen(command)
    response = pipe.read().strip()
    rc = pipe.close()                            # None means the command succeeded

    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError

    logger.debug('jobno is %s' % response)

    return response
def toPickle(self, filename):
    '''Write this object to filename as a pickle, using the highest protocol.

    self.geneDict is reset to None before pickling, on this object as
    well as in the file. HIGHEST_PROTOCOL is a binary format, so the
    file is opened in binary mode. The with-block closes it even if
    pickling fails.
    '''
    self.geneDict = None             # no need to pickle this, it can be recreated

    with open(filename, 'wb') as pickHandle:
        pickle.Pickler(pickHandle, pickle.HIGHEST_PROTOCOL).dump(self)

    logger.debug('wrote %d clusters to pickle file %s' % (len(self.clusterDict), filename))

    return
def toPickle(self, filename):
    '''Write this object to filename as a pickle, using the highest protocol.

    self.geneDict is reset to None before pickling, on this object as
    well as in the file. HIGHEST_PROTOCOL is a binary format, so the
    file is opened in binary mode. The with-block closes it even if
    pickling fails.
    '''
    self.geneDict = None             # no need to pickle this, it can be recreated

    with open(filename, 'wb') as pickHandle:
        pickle.Pickler(pickHandle, pickle.HIGHEST_PROTOCOL).dump(self)

    logger.debug('wrote %d clusters to pickle file %s' % (len(self.clusterDict), filename))

    return
def makeRef(self):
    '''Invoke bowtie-build on the fasta file.

    The fasta handle is closed first if it is still open. The build
    command uses self.name as both input and index name, and redirects
    its output to <name>.out. Raises RuntimeError if the command
    reports a non-zero status.
    '''
    if not self.handle.closed:
        self.close()

    fasta = self.name
    command = '%s %s %s > %s.out 2>&1' % (BOWTIE_BUILD, fasta, fasta, fasta)
    logger.debug(command)

    # Nothing is read from the pipe, since we've redirected the output;
    # it is opened only to run the command and collect its status.
    rc = os.popen(command).close()

    if rc is not None:
        raise RuntimeError('bowtie2-build failed: %d' % rc)
def main():
    '''Split the input into chunks, submit one job per chunk, then submit the final merge job.'''
    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    makeTempDir(opt.tmpdir)

    nSeqs = countSeqs(opt.input)
    logger.debug('%s contains %d sequences' % (opt.input, nSeqs))

    # Ceiling division: enough sequences per job that njobs jobs cover
    # them all. '//' keeps the result an integer even where '/' means
    # true division.
    seqsPerJob = (nSeqs + opt.njobs - 1) // opt.njobs
    logger.debug('each of %d jobs will process %d sequences' % (opt.njobs, seqsPerJob))

    chunkList = makeFastaChunks(opt, nSeqs, seqsPerJob)

    for chunk in chunkList:
        chunk.makeScript()
        chunk.submitScript()

    submitFinalJobs(opt, chunkList)

    logger.debug('finished')

    return
def fillCombinedFields(self):
    '''Called from __init__ to compute aggregated fields across all bax files.

    _maxZMW       -- largest ZMW number in any bax file (0 if there are none)
    _numZMWs      -- total count of ZMWs over all bax files
    _hasConsensus -- True only if every bax file has consensus data
    '''
    parts = self._baxfile

    self._maxZMW = max([0] + [bf._maxZMW for bf in parts])          # largest ZMW#
    self._numZMWs = sum([bf.numZMWs() for bf in parts])             # count of ZMWs
    self._hasConsensus = all([bf.hasConsensus() for bf in parts])   # one file without CCS spoils it

    logger.debug("largest ZMW# is %d" % self._maxZMW)
def fillCombinedFields(self):
    """Called from __init__ to compute aggregated fields across all bax files.

    _maxZMW       -- largest ZMW number in any bax file (0 if there are none)
    _numZMWs      -- total count of ZMWs over all bax files
    _hasConsensus -- True only if every bax file has consensus data
    """
    parts = self._baxfile

    self._maxZMW = max([0] + [bf._maxZMW for bf in parts])          # largest ZMW#
    self._numZMWs = sum([bf.numZMWs() for bf in parts])             # count of ZMWs
    self._hasConsensus = all([bf.hasConsensus() for bf in parts])   # one file without CCS spoils it

    logger.debug("largest ZMW# is %d" % self._maxZMW)
def getGeneDict(self):
    '''Create and cache dict: key=gene name, value=list of Annotation objects for that name.

    The table is built on the first call from the children of every
    chromosome's annotation and reused afterwards. Genes sharing a name
    are collected in one list, in the order they are encountered.
    '''
    if self.geneDict is not None:
        return self.geneDict

    logger.debug('creating gene name lookup table')

    lookup = self.geneDict = {}

    for chrom in self.chromosomes():
        for gene in self.annot[chrom].getChildren():
            if gene.name in lookup:
                lookup[gene.name].append(gene)
            else:
                lookup[gene.name] = [gene]

    return lookup
def main():
    '''Split the input into chunks, submit one job per chunk, then submit the final merge job.'''
    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    makeTempDir(opt.tmpdir)

    nSeqs = countSeqs(opt.input)
    logger.debug('%s contains %d sequences' % (opt.input, nSeqs))

    # Ceiling division: enough sequences per job that njobs jobs cover
    # them all. '//' keeps the result an integer even where '/' means
    # true division.
    seqsPerJob = (nSeqs + opt.njobs - 1) // opt.njobs
    logger.debug('each of %d jobs will process %d sequences' % (opt.njobs, seqsPerJob))

    chunkList = makeFastaChunks(opt, nSeqs, seqsPerJob)

    for chunk in chunkList:
        chunk.makeScript()
        chunk.submitScript()

    submitFinalJobs(opt, chunkList)

    logger.debug('finished')

    return
def getGeneDict(self):
    '''Create and cache dict: key=gene name, value=list of Annotation objects for that name.

    The table is built on the first call from the children of every
    chromosome's annotation and reused afterwards. Genes sharing a name
    are collected in one list, in the order they are encountered.
    '''
    if self.geneDict is not None:
        return self.geneDict

    logger.debug('creating gene name lookup table')

    lookup = self.geneDict = {}

    for chrom in self.chromosomes():
        for gene in self.annot[chrom].getChildren():
            if gene.name in lookup:
                lookup[gene.name].append(gene)
            else:
                lookup[gene.name] = [gene]

    return lookup
def makeRef(self):
    '''Invoke bowtie-build on the fasta file.

    The fasta handle is closed first if it is still open. The build
    command uses self.name as both input and index name, and redirects
    its output to <name>.out. Raises RuntimeError if the command
    reports a non-zero status.
    '''
    if not self.handle.closed:
        self.close()

    fasta = self.name
    command = '%s %s %s > %s.out 2>&1' % (BOWTIE_BUILD, fasta, fasta, fasta)
    logger.debug(command)

    # Nothing is read from the pipe, since we've redirected the output;
    # it is opened only to run the command and collect its status.
    rc = os.popen(command).close()

    if rc is not None:
        raise RuntimeError('bowtie2-build failed: %d' % rc)
def printDetails(self):
    '''Debug routine to print a bunch of stuff from the file to the log. Not production grade.

    Logs every MovieInfo name with its ID, then every AlnGroup path
    with its ID, at debug level.
    '''
    movieNames = self._top['MovieInfo/Name']
    movieIDs = self._top['MovieInfo/ID']
    for ix, movieName in enumerate(movieNames):
        logger.debug("movie %d (%d): %s" % (ix, movieIDs[ix], movieName))

    groupPaths = self._top['AlnGroup/Path']
    groupIDs = self._top['AlnGroup/ID']
    for ix, groupPath in enumerate(groupPaths):
        logger.debug("path %d (%d): %s" % (ix, groupIDs[ix], groupPath))

    return
def __init__(self, filename, set=1, strobe=0, maxHole=None):
    '''Open the HDF5 file filename read-only and locate its alignment index.

    set, strobe -- stored as _setNumber and _strobeNumber.
    maxHole     -- largest hole number to assume; when None it is taken
                   from column 7 of AlnInfo/AlnIndex.
    '''
    logger.debug("creating CmpFile object for set %d strobe %d" % (set, strobe))

    self._filename = filename
    self._setNumber = set
    self._strobeNumber = strobe
    self._infile = h5py.File(filename, 'r')

    # Use the File object itself as the top-level group instead of
    # constructing an h5py.Group on '/' by hand.
    self._top = self._infile

    self._index = self._top['AlnInfo/AlnIndex']
    self._subreadMap = None

    if maxHole is None:
        self._maxHole = max(self._index[:, 7])   # largest *mapped* hole (for any set), may not be max hole!
    else:
        self._maxHole = maxHole
def main():
    '''Filter matchAnnot output by gene, optionally reversing exon order on the minus strand.

    Input comes from the file named by the first argument, or from
    stdin. Lines are collected into entries that end at a blank line.
    An entry is written to stdout if no gene filter was given, if its
    gene is in the filter list, or if its first line is a summary line.
    Runs of 'exon:' lines are reversed when opt.flip is set and the
    most recent isoform line reported strand '-'.
    '''
    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    geneList = list()
    if opt.genes is not None:
        geneList = opt.genes.split(',')

    gene = None
    strand = None        # no isoform line seen yet; avoids reading an unbound name below

    if len(args) > 0:
        logger.debug('reading matchAnnot file %s' % args[0])
        handle = open(args[0], 'r')
    else:
        logger.debug('reading matchAnnot data from stdin')
        handle = sys.stdin

    exonLines = list()
    entryLines = list()

    for line in handle:

        if line.startswith('exon:'):
            exonLines.append(line)                       # save a batch of exon lines
            continue

        if exonLines:                                    # if there is a batch pending
            if opt.flip and strand == '-':               # reverse order if requested
                exonLines = reverseExonList(exonLines)
            entryLines.extend(exonLines)                 # add them to the output list
            exonLines = list()

        entryLines.append(line)                          # add non-exon line directly to output list

        if line.startswith('isoform:'):
            strand = line.split()[-2]
        elif line.startswith('gene:'):
            gene = line.split()[1]
        elif line.isspace():                             # last line of an entry?
            if entryLines:
                if opt.genes is None or gene in geneList or entryLines[0].startswith('summary:'):
                    sys.stdout.writelines(entryLines)
                entryLines = list()

    # Whatever is left over is written only if it ends in a summary line.
    if entryLines:
        if entryLines[-1].startswith('summary:'):
            sys.stdout.writelines(entryLines)

    handle.close()

    logger.debug('finished')

    return
def __init__(self, filename):
    '''Read a cluster report and index its reads by cluster, read type and cell.

    The first line is always consumed as a header. If it is exactly
    'cluster_id,read_id,read_type' the fields are comma separated (new
    style), otherwise whitespace separated (old style). Blank lines are
    skipped. Every other line gives a cluster ID, a read name of the
    form cell/ZMW/coords, and a read type.

    self.clusters[clusterID][readType][cellNumber] is a list of
    'ZMW|coords' strings with a trailing '_CCS' removed from coords.
    self.cells maps each cell name to a number assigned in order of
    first appearance, starting at 1. self.numClusters counts the reads
    that were read.
    '''
    logger.debug('reading clusters from %s' % filename)

    self.filename = filename
    self.clusters = dict()           # this is the stuff
    self.cells = dict()              # key=cell long name  value=cell number
    self.numCells = 0
    self.numClusters = 0

    regexCCS = re.compile('_CCS$')

    # The with-block closes the file even if a malformed line raises.
    with open(filename, 'r') as handle:

        header = handle.readline().strip()               # get header line
        newStyle = header == 'cluster_id,read_id,read_type'
        separator = ',' if newStyle else None            # None splits on runs of whitespace

        for line in handle:

            line = line.strip()
            if not line:                                 # tolerate blank lines
                continue

            self.numClusters += 1

            clusterID, readName, FL = line.split(separator)
            cell, ZMW, coords = readName.split('/')
            coords = regexCCS.sub('', coords)            # get rid of '_CCS' at end of read range
            shortName = ZMW + '|' + coords

            if cell not in self.cells:                   # have we seen this cell before?
                self.numCells += 1
                self.cells[cell] = self.numCells         # if not, give it a number
            cellNo = self.cells[cell]

            clusterEnt = self.clusters.setdefault(clusterID, {}).setdefault(FL, {}).setdefault(cellNo, [])
            clusterEnt.append(shortName)

    logger.debug('read %d reads in %d clusters from %d cells'
                 % (self.numClusters, len(self.clusters), self.numCells))
def main():
    '''Load a reference, from a pickle or from its source file, and write it out as a pickle.'''
    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    # Reading a pickled file and repickling it doesn't make sense. But one day it will...
    loader = ref.Reference.fromPickle if opt.format == 'pickle' else ref.Reference
    refObj = loader(opt.ref)

    refObj.toPickle(opt.output)

    logger.debug('finished')
def main():
    '''Load a reference, from a pickle or from its source file, and write it out as a pickle.'''
    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    # Reading a pickled file and repickling it doesn't make sense. But one day it will...
    loader = ref.Reference.fromPickle if opt.format == 'pickle' else ref.Reference
    refObj = loader(opt.ref)

    refObj.toPickle(opt.output)

    logger.debug('finished')
def __init__(self, filename):
    '''Read a cluster report and index its reads by cluster, read type and cell.

    The first line is always consumed as a header. If it is exactly
    'cluster_id,read_id,read_type' the fields are comma separated (new
    style), otherwise whitespace separated (old style). Blank lines are
    skipped. Every other line gives a cluster ID, a read name of the
    form cell/ZMW/coords, and a read type.

    self.clusters[clusterID][readType][cellNumber] is a list of
    'ZMW|coords' strings with a trailing '_CCS' removed from coords.
    self.cells maps each cell name to a number assigned in order of
    first appearance, starting at 1. self.numClusters counts the reads
    that were read.
    '''
    logger.debug('reading clusters from %s' % filename)

    self.filename = filename
    self.clusters = dict()           # this is the stuff
    self.cells = dict()              # key=cell long name  value=cell number
    self.numCells = 0
    self.numClusters = 0

    regexCCS = re.compile('_CCS$')

    # The with-block closes the file even if a malformed line raises.
    with open(filename, 'r') as handle:

        header = handle.readline().strip()               # get header line
        newStyle = header == 'cluster_id,read_id,read_type'
        separator = ',' if newStyle else None            # None splits on runs of whitespace

        for line in handle:

            line = line.strip()
            if not line:                                 # tolerate blank lines
                continue

            self.numClusters += 1

            clusterID, readName, FL = line.split(separator)
            cell, ZMW, coords = readName.split('/')
            coords = regexCCS.sub('', coords)            # get rid of '_CCS' at end of read range
            shortName = ZMW + '|' + coords

            if cell not in self.cells:                   # have we seen this cell before?
                self.numCells += 1
                self.cells[cell] = self.numCells         # if not, give it a number
            cellNo = self.cells[cell]

            clusterEnt = self.clusters.setdefault(clusterID, {}).setdefault(FL, {}).setdefault(cellNo, [])
            clusterEnt.append(shortName)

    logger.debug('read %d reads in %d clusters from %d cells'
                 % (self.numClusters, len(self.clusters), self.numCells))
def fillZMWIndexes(self):
    '''Called from __init__ to compute _baxByHole, _ZMWIndex and _basecallIndex.

    All three tables have _maxZMW + 1 slots, are indexed by hole number
    and hold None for holes that no bax file contains.
    '''
    # _baxByHole points to the bax object for the bax file which
    # contains a given ZMW.

    # _ZMWIndex is the offset for a given ZMW into the datasets of
    # the PulseData/BaseCalls/ZMW group in the bax file pointed to
    # by _baxByHole.

    # Likewise, _basecallIndex is the offset to the first entry
    # for a given ZMW in the datasets of the PulseData/BaseCalls
    # group. Each of those datasets contains numEvent entries for
    # a given ZMW.

    # The alert reader will probably notice that we never examine
    # the MultiPart/HoleLookup dataset in the bas.h5
    # file. Instead, we derive the ZMW#-to-baxfile mapping by
    # looking at the ZMW#s contained in each bax file, in the loop
    # below.

    tableSize = self._maxZMW + 1
    self._baxByHole = [None] * tableSize
    self._ZMWIndex = [None] * tableSize
    self._basecallIndex = [None] * tableSize

    for bf in self._baxfile:

        eventCounts = bf._ZMW["NumEvent"]
        numHoles = 0
        eventOffset = 0

        for row, hole in enumerate(bf.holeNumbers()):
            self._baxByHole[hole] = bf
            self._ZMWIndex[hole] = row
            self._basecallIndex[hole] = eventOffset
            eventOffset += eventCounts[row]
            numHoles = row + 1

        logger.debug("%s processed %d ZMWs" % (bf.shortName(), numHoles))
        logger.debug("%s processed %d basecalls" % (bf.shortName(), eventOffset))
def submitScript(self):
    '''Submit this chunk's script with qsub and record the resulting job number.

    The job start is delayed by STARTWAIT seconds (see below). The
    output of qsub is matched against Chunk.JOBNO_PATTERN; group 1 of
    the match is stored in self.jobno and returned.

    Raises RuntimeError if qsub reports a non-zero status or if its
    output does not match the job number pattern.
    '''
    # Dependent job submission will fail if parent has already
    # completed. So delay all job startups by a short amount of time.
    startAt = datetime.datetime.now() + datetime.timedelta(0, STARTWAIT)
    startAtStr = startAt.strftime('%Y%m%d%H%M.%S')

    cmd = list()
    cmd.append('qsub')
    cmd.append('-N %s' % self.jobName)                   # job name
    cmd.append('-o %s' % self.scriptOutput)              # output file
    cmd.append('-j oe')                                  # combine stdout and stderr
    cmd.append('-l nodes=1:ppn=1,walltime=4:00:00')      # resources required
    cmd.append('-a %s' % startAtStr)                     # delay start, see above
    cmd.append('-d . ')                                  # working directory (strangely, ./ is not the default)
    cmd.append('-r n')                                   # do NOT attempt to restart on failure
    cmd.append('-V')                                     # export all environment variables to job
    cmd.append('-W umask=0002')                          # make logs rw-rw-r--
    cmd.append('-m n')                                   # don't send any mail
    cmd.append(self.scriptName)                          # script to run

    command = ' '.join(cmd)
    logger.debug('running %s' % command)

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()                              # None means the command succeeded

    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError('command failed, rc=%d' % rc)

    match = re.match(Chunk.JOBNO_PATTERN, response)
    if match is None:
        # Report the text qsub actually returned.
        logger.error("invalid job sequence number: %s" % response)
        raise RuntimeError("invalid job sequence number: %s" % response)

    response = match.group(1)
    logger.debug('jobno is %s' % response)

    self.jobno = response

    return response
def submitScript(self):
    '''Submit this chunk's script with qsub and record the resulting job number.

    The job start is delayed by STARTWAIT seconds (see below). The
    output of qsub is matched against Chunk.JOBNO_PATTERN; group 1 of
    the match is stored in self.jobno and returned.

    Raises RuntimeError if qsub reports a non-zero status or if its
    output does not match the job number pattern.
    '''
    # Dependent job submission will fail if parent has already
    # completed. So delay all job startups by a short amount of time.
    startAt = datetime.datetime.now() + datetime.timedelta(0, STARTWAIT)
    startAtStr = startAt.strftime('%Y%m%d%H%M.%S')

    cmd = list()
    cmd.append('qsub')
    cmd.append('-N %s' % self.jobName)                   # job name
    cmd.append('-o %s' % self.scriptOutput)              # output file
    cmd.append('-j oe')                                  # combine stdout and stderr
    cmd.append('-l nodes=1:ppn=1,walltime=4:00:00')      # resources required
    cmd.append('-a %s' % startAtStr)                     # delay start, see above
    cmd.append('-d . ')                                  # working directory (strangely, ./ is not the default)
    cmd.append('-r n')                                   # do NOT attempt to restart on failure
    cmd.append('-V')                                     # export all environment variables to job
    cmd.append('-W umask=0002')                          # make logs rw-rw-r--
    cmd.append('-m n')                                   # don't send any mail
    cmd.append(self.scriptName)                          # script to run

    command = ' '.join(cmd)
    logger.debug('running %s' % command)

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()                              # None means the command succeeded

    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError('command failed, rc=%d' % rc)

    match = re.match(Chunk.JOBNO_PATTERN, response)
    if match is None:
        # Report the text qsub actually returned.
        logger.error("invalid job sequence number: %s" % response)
        raise RuntimeError("invalid job sequence number: %s" % response)

    response = match.group(1)
    logger.debug('jobno is %s' % response)

    self.jobno = response

    return response
def fillRegionIndexes(self):
    '''Called from __init__ to compute _regionIndex and _HQIndex.

    Both tables have _maxZMW + 1 slots, are indexed by hole number and
    hold None for holes without a region entry. Raises RuntimeError if
    the region table names a hole outside 0.._maxZMW.
    '''
    # _regionIndex is the offset to the first entry for a given
    # ZMW in the PulseData/Regions dataset in the bas file where
    # the hole resides.

    # _HQIndex is the offset to the HQ-region entry in
    # PulseData/Regions for a given ZMW. (There is only one
    # ... for now.)

    tableSize = self._maxZMW + 1
    self._regionIndex = [None] * tableSize
    self._HQIndex = [None] * tableSize

    for bf in self._baxfile:

        numRegions = 0
        prevHole = -1                    # init to non-matching value

        # The first two columns are sliced out up front; walking the
        # dataset line by line was *very* slow.
        for row, (hole, regionType) in enumerate(bf._regions[:, 0:2]):

            if hole != prevHole:         # start of new hole?
                if not 0 <= hole <= self._maxZMW:    # sanity check hole# from region table
                    raise RuntimeError("hole number %d out of range" % (hole))
                self._regionIndex[hole] = row
                prevHole = hole

            if regionType == 2:          # HQ region for this hole?
                self._HQIndex[hole] = row

            numRegions = row + 1

        logger.debug("%s processed %d regions" % (bf.shortName(), numRegions))
def fillZMWIndexes(self):
    """Build the per-hole tables _baxByHole, _ZMWIndex and _basecallIndex.

    Called from __init__.  Each table is a list of length
    _maxZMW + 1 indexed by hole number; holes not present in any
    bax file keep the value None.

      _baxByHole      the bax object whose file contains the ZMW
      _ZMWIndex       the ZMW's offset into the datasets of the
                      PulseData/BaseCalls/ZMW group of that bax file
      _basecallIndex  the offset of the ZMW's first entry in the
                      datasets of the PulseData/BaseCalls group (each
                      ZMW owns NumEvent consecutive entries there)

    Note that the MultiPart/HoleLookup dataset of the bas.h5 file is
    never consulted: the hole-to-bax-file mapping is derived purely
    from the hole numbers each bax file reports.
    """

    tableSize = self._maxZMW + 1

    self._baxByHole = [None] * tableSize
    self._ZMWIndex = [None] * tableSize
    self._basecallIndex = [None] * tableSize

    for bf in self._baxfile:

        eventCounts = bf._ZMW["NumEvent"]
        zmwCount = 0                # ZMWs seen so far in this file
        basecallOffset = 0          # running total of NumEvent

        for zmwOffset, hole in enumerate(bf.holeNumbers()):
            self._baxByHole[hole] = bf
            self._ZMWIndex[hole] = zmwOffset
            self._basecallIndex[hole] = basecallOffset

            basecallOffset += eventCounts[zmwOffset]
            zmwCount = zmwOffset + 1

        logger.debug("%s processed %d ZMWs" % (bf.shortName(), zmwCount))
        logger.debug("%s processed %d basecalls" % (bf.shortName(), basecallOffset))
def main ():
    '''Print the sequence of one ZMW from a bas file in fasta format.

    Positional arguments (as returned by getParms): the bas file
    name and the integer ZMW (hole) number.  opt.start/opt.end select
    the range, opt.reverse selects the reverse complement and
    opt.flen is the number of bases printed per output line.

    Exits with a non-zero status if the ZMW number is not an integer.
    '''

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    basfile = H5BasFile.BasFile (basFilename)

    try:
        hole = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)     # a bare sys.exit() would report success (status 0)

    if opt.reverse:
        sequence = basfile.getRevCompSequence(hole, opt.start, opt.end)
    else:
        sequence = basfile.getSequence(hole, opt.start, opt.end)   # end==None gets the whole read

    movie = basfile.movieName()
    length = len(sequence)

    # Single-argument, parenthesised print behaves identically as a
    # Python 2 print statement and as a Python 3 function call.
    print(">%s/%d/%d_%d" % (movie, hole, opt.start, opt.start+length))

    for ix in range(0, length, opt.flen):
        print(sequence[ix:ix+opt.flen])

    logger.debug("complete")
def getReadGroups (self):
    '''Return a dict of the read groups listed in /AlnGroup/Path.

    The dict is created on first use and cached in self._readGroups.
    Its key is the ReadGroupId (from AlnGroup/ID) and its value is
    the h5 group, looked up in self._top by the matching
    AlnGroup/Path entry, containing the AlnArray and other datasets
    for that read group.  See the comments in getAlignmentAsDict for
    further confusion on the issue.

    No per-movie filtering is applied: every listed read group is
    kept.
    '''

    if self._readGroups is None:

        logger.debug("creating ReadGroup list")

        path = self._top['AlnGroup/Path']
        ID = self._top['AlnGroup/ID']

        # Build into a local and publish it only when complete, so
        # that a failure part-way through cannot leave a partially
        # filled cache behind for later calls to return.
        readGroups = dict()

        for ix in range(len(path)):

            # Fetch each entry once; it is needed for both the log
            # message and the table.
            groupID = ID[ix]
            groupPath = path[ix]

            logger.debug("path %d, ID %d: %s" % (ix, groupID, groupPath))
            readGroups[groupID] = self._top[groupPath]

        logger.debug("kept %d of %d ReadGroups for this movie"
                     % (len(readGroups), len(path)))

        self._readGroups = readGroups

    return self._readGroups
def __init__(self, filename):
    '''Create a dict, keyed by chr, from a fasta file.

    After construction:
      self.ref      maps chromosome name to its full sequence string
      self.chrList  lists the chromosome names in file order

    The chromosome name is group 1 of the module-level pattern
    regexChr applied to each '>' header line.  Blank lines are
    ignored, and a header with no sequence lines yields an empty
    sequence so that every name in chrList is also a key of ref.

    Raises RuntimeError for a duplicate chromosome name, for a
    header line that regexChr does not match, and for sequence data
    that appears before the first header.
    '''

    self.filename = filename
    self.ref = dict()             # this is the stuff! key=chr value=sequence
    self.chrList = list()         # preserve original order of chromosomes

    logger.debug('reading reference fasta file %s' % self.filename)

    # Reading fasta lines and appending them one-by-one is very
    # slow (the paint bucket keeps getting farther away). Instead,
    # accumulate individual lines in a list, and join them when
    # we've got them all. That turns out to be *much* faster.

    chrom = None                  # chromosome currently being accumulated
    accum = list()

    # "with" guarantees the file is closed even if parsing fails.
    with open(self.filename, 'r') as handle:

        for line in handle:

            line = line.strip()
            if not line:
                continue

            if line.startswith('>'):

                # Flush the previous chromosome, even if it had no
                # sequence lines, so duplicate detection below and
                # lookups via chrList both stay reliable.
                if chrom is not None:
                    self.ref[chrom] = ''.join(accum)    # see comment block above
                    accum = list()

                match = re.match(regexChr, line)
                if match is None:
                    raise RuntimeError('unrecognized fasta header: %s' % line)

                chrom = match.group(1)
                if chrom in self.ref:
                    raise RuntimeError('duplicate chromsome %s' % chrom)

                self.chrList.append(chrom)

            elif chrom is None:
                raise RuntimeError('sequence data before first header in %s'
                                   % self.filename)

            else:
                accum.append(line)

    if chrom is not None:         # last one?
        self.ref[chrom] = ''.join(accum)
def main():
    '''Print one ZMW of a bas file, either by subread or by range.

    Positional arguments (as returned by getParms): the bas file
    name and the integer ZMW (hole) number.  With opt.subreads set,
    printRange is called once for every region of the hole whose
    type is 1 (a subread); otherwise it is called once for the range
    opt.start..opt.end.

    Exits with a non-zero status if the ZMW number is not an integer.
    '''

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    bf = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)     # a bare sys.exit() would report success (status 0)

    if opt.subreads:
        # Each region unpacks to (hole, type, start, end, score);
        # only the type and the start/end offsets are needed here.
        for _regionHole, regionType, start, end, _score in bf.holeRegions(hole):
            if regionType == 1:       # a subread?
                printRange(bf, hole, opt, start, end)
    else:
        printRange(bf, hole, opt, opt.start, opt.end)

    logger.debug("complete")
def findCCSFile(self):
    '''Locate the consensus (CCS) basecall datasets for this bax file.

    Sets self._hasConsensus, and on success self._consBasecalls,
    self._consZMW and self._consPasses.  The sources are tried in
    this order:

      1. the bax file itself (self._top), when it has a
         PulseData/ConsensusBaseCalls group (older bax files carry
         their own CCS data);
      2. a separate file in self._CCSDir, named like this bax file
         with 'bax' replaced by 'ccs'; it is opened read-only and
         kept in self._CCSFile.

    If neither source is available, self._hasConsensus stays False
    and a message is logged.
    '''

    self._hasConsensus = False    # until proven otherwise

    # Case 1: the CCS data lives inside the bax file.
    if "PulseData/ConsensusBaseCalls" in self._top:
        self._consBasecalls = self._top["PulseData/ConsensusBaseCalls"]
        self._consZMW = self._top["PulseData/ConsensusBaseCalls/ZMW"]
        self._consPasses = self._top["PulseData/ConsensusBaseCalls/Passes"]
        self._hasConsensus = True
        return

    # No embedded data and nowhere else to look.
    if self._CCSDir is None:
        logger.info(
            'BaxFile %s does not contain CCS data (rel 2.1.0 and later). Use --ccs'
            % self._shortName)
        return

    # Case 2: look for the companion ccs file in the CCS directory.
    ccsName = os.path.basename(self._filename).replace('bax', 'ccs')
    ccsPath = os.path.join(self._CCSDir, ccsName)

    if not os.path.exists(ccsPath):
        logger.warning('%s: no CCS file found corresponding to %s'
                       % (self._shortName, self._filename))
        return

    self._CCSFile = h5py.File(ccsPath, 'r')
    self._consBasecalls = self._CCSFile["PulseData/ConsensusBaseCalls"]
    self._consZMW = self._CCSFile["PulseData/ConsensusBaseCalls/ZMW"]
    self._consPasses = self._CCSFile["PulseData/ConsensusBaseCalls/Passes"]
    self._hasConsensus = True

    logger.debug('BaxFile %s found CCS file %s'
                 % (self._shortName, ccsPath))