def main():
    '''Hand one ZMW's base range(s) from a bas.h5 file to printRange.

    The bas file name and the ZMW number are the first two positional
    arguments returned by getParms().  With opt.subreads set, printRange is
    called once for every region of type 1 reported for the hole; otherwise
    it is called once for opt.start..opt.end.  Exits via sys.exit() when
    the ZMW number is not an integer.
    '''
    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()
    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    basReader = H5BasFile.BasFile(basFilename)

    try:
        zmw = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit()

    if not opt.subreads:
        # Single user-specified range.
        printRange(basReader, zmw, opt, opt.start, opt.end)
    else:
        # Each region is a 5-tuple; only the type and the bounds matter here.
        for _regionHole, kind, first, last, _score in basReader.holeRegions(zmw):
            if kind == 1:  # a subread?
                printRange(basReader, zmw, opt, first, last)

    logger.debug("complete")
def main():
    '''Write one ZMW's sequence from a bas.h5 file to stdout as FASTA.

    The bas file name and the ZMW number are the first two positional
    arguments returned by getParms().  opt.reverse selects the
    reverse-complement accessor, opt.start/opt.end bound the sequence and
    opt.flen is the number of bases per output line.  Exits via sys.exit()
    when the ZMW number is not an integer.
    '''
    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()
    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    reader = H5BasFile.BasFile(basFilename)

    try:
        zmw = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit()

    # end==None gets the whole read
    fetch = reader.getRevCompSequence if opt.reverse else reader.getSequence
    bases = fetch(zmw, opt.start, opt.end)

    movie = reader.movieName()
    nBases = len(bases)

    # Header line: movie/hole/start_end
    header = ">%s/%d/%d_%d" % (movie, zmw, opt.start, opt.start + nBases)
    sys.stdout.write(header + "\n")

    # Body: the sequence folded into lines of opt.flen bases.
    for offset in xrange(0, nBases, opt.flen):
        sys.stdout.write(str(bases[offset:offset + opt.flen]) + "\n")

    logger.debug("complete")
def main():
    '''Pass the requested base range(s) of one ZMW to printRange.

    Reads the bas.h5 file name and the ZMW number from the positional
    arguments supplied by getParms().  When opt.subreads is set, each region
    of type 1 listed for the hole gets its own printRange call; otherwise a
    single call covers opt.start..opt.end.  A non-integer ZMW number is
    logged and the program exits via sys.exit().
    '''
    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()
    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    source = H5BasFile.BasFile(basFilename)

    try:
        holeNumber = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit()

    if opt.subreads:
        for entry in source.holeRegions(holeNumber):
            # Regions unpack as (hole, type, start, end, score).
            _entryHole, entryType, lo, hi, _entryScore = entry
            if entryType != 1:  # not a subread?
                continue
            printRange(source, holeNumber, opt, lo, hi)
    else:
        printRange(source, holeNumber, opt, opt.start, opt.end)

    logger.debug("complete")
def submitFinalJobs(opt, chunkList):
    '''Write and submit the final concatenation job.

    Builds a bash script (<opt.tmpdir>/trim_final.sh) that cats every
    chunk's trimmedChunkName into opt.output and, when opt.report is not
    None, every chunk's reportChunkName into opt.report.  The script is
    submitted with qsub, with a dependency on the job numbers (chk.jobno)
    of all chunks in chunkList.

    Returns the stripped text qsub printed on stdout.
    Raises RuntimeError if the qsub command reports a non-zero status.
    '''
    sh = [
        '#!/bin/bash\n\n',
        'set -o errexit\n',
        'set -o nounset\n\n',
        'cat \\\n',
    ]
    sh.extend('%s \\\n' % chk.trimmedChunkName for chk in chunkList)
    sh.append(' > %s\n' % opt.output)

    if opt.report is not None:
        sh.append('\ncat \\\n')
        sh.extend('%s \\\n' % chk.reportChunkName for chk in chunkList)
        sh.append(' > %s\n' % opt.report)

    finalScriptName = '%s/trim_final.sh' % opt.tmpdir

    # The with-block closes the script even when the write fails, so a
    # half-written file never keeps a handle open.
    with open(finalScriptName, 'w') as handle:
        handle.writelines(sh)

    deps = ':'.join([chk.jobno for chk in chunkList])

    cmd = [
        'qsub',
        '-N trim_final',                        # job name
        '-o trim_final.out',                    # output file
        '-j oe',                                # combine stdout and stderr
        '-l nodes=1:ppn=1,walltime=4:00:00',    # resources required
        '-d . ',                                # working directory (strangely, ./ is not the default)
        '-r n',                                 # do NOT attempt to restart on failure
        '-V',                                   # export all environment variables to job
        '-W umask=0002',                        # make logs rw-rw-r--
        '-m n',                                 # don't send any mail
        '-W depend=afterok:%s' % deps,          # wait for the chunk jobs
        finalScriptName,                        # script to run
    ]
    command = ' '.join(cmd)
    logger.debug('running %s' % command)

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()     # None means the command exited with status 0

    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError('command failed, rc=%d' % rc)

    logger.debug('jobno is %s' % response)
    return response
def submitFinalJobs(opt, chunkList):
    '''Generate trim_final.sh and queue it behind all chunk jobs.

    The generated bash script concatenates the trimmedChunkName file of
    every chunk into opt.output; if opt.report is not None it also
    concatenates every reportChunkName file into opt.report.  The script is
    written to opt.tmpdir and handed to qsub with a dependency built from
    each chunk's jobno.

    Returns qsub's stdout with surrounding whitespace stripped.
    Raises RuntimeError when the qsub command reports a non-zero status.
    '''
    sh = ['#!/bin/bash\n\n', 'set -o errexit\n', 'set -o nounset\n\n']

    # First cat: trimmed sequence chunks -> final output.
    sh.append('cat \\\n')
    sh.extend(['%s \\\n' % chk.trimmedChunkName for chk in chunkList])
    sh.append(' > %s\n' % opt.output)

    # Optional second cat: per-chunk reports -> final report.
    if opt.report is not None:
        sh.append('\ncat \\\n')
        sh.extend(['%s \\\n' % chk.reportChunkName for chk in chunkList])
        sh.append(' > %s\n' % opt.report)

    finalScriptName = '%s/trim_final.sh' % opt.tmpdir

    # Use a context manager so the handle is released even if the write
    # raises (the explicit open/close pair leaked it in that case).
    with open(finalScriptName, 'w') as handle:
        handle.writelines(sh)

    deps = ':'.join([chk.jobno for chk in chunkList])

    cmd = list()
    cmd.append('qsub')
    cmd.append('-N trim_final')                         # job name
    cmd.append('-o trim_final.out')                     # output file
    cmd.append('-j oe')                                 # combine stdout and stderr
    cmd.append('-l nodes=1:ppn=1,walltime=4:00:00')     # resources required
    cmd.append('-d . ')                                 # working directory (strangely, ./ is not the default)
    cmd.append('-r n')                                  # do NOT attempt to restart on failure
    cmd.append('-V')                                    # export all environment variables to job
    cmd.append('-W umask=0002')                         # make logs rw-rw-r--
    cmd.append('-m n')                                  # don't send any mail
    cmd.append('-W depend=afterok:%s' % deps)           # wait for the chunk jobs
    cmd.append(finalScriptName)                         # script to run

    command = ' '.join(cmd)
    logger.debug('running %s' % command)

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()     # None means the command exited with status 0

    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError('command failed, rc=%d' % rc)

    logger.debug('jobno is %s' % response)
    return response
def countSeqs(filename):
    '''Run grep -c ">" on a fasta file to count the sequences it contains.

    filename is interpolated into a shell command as-is, so it must not
    contain whitespace or shell metacharacters.

    Returns the count as an int.
    Raises RuntimeError if the command reports a non-zero status or if its
    output is not a plain non-negative integer.

    NOTE: grep exits with status 1 when no line matches, so a file without
    any ">" line is reported as a failed command rather than as a count
    of 0.
    '''
    command = 'grep -c ">" %s' % filename

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()     # None means the command exited with status 0

    if rc is not None:
        logger.error("command failed, rc=%d" % rc)
        raise RuntimeError("command failed, rc=%d" % rc)

    if not response.isdigit():
        # The message used to lack a %s, so formatting it raised TypeError
        # before the intended RuntimeError could be raised.
        logger.error("grep -c returned: %s" % response)
        raise RuntimeError("grep -c returned: %s" % response)

    return int(response)
def countSeqs(filename):
    '''Run grep -c ">" on a fasta file to count the sequences it contains.

    The file name is pasted unquoted into the shell command line, so names
    with spaces or shell metacharacters are not supported.

    Returns the number printed by grep, as an int.
    Raises RuntimeError when the command reports a non-zero status, or when
    what it printed is not a plain non-negative integer.

    NOTE: grep's exit status is 1 when nothing matches, so a file with no
    ">" line ends up on the "command failed" path instead of returning 0.
    '''
    command = 'grep -c ">" %s' % filename
    popen_file = os.popen(command)
    response = popen_file.read().strip()

    # close() yields None on success, otherwise the command's status.
    rc = popen_file.close()
    if rc is not None:
        logger.error("command failed, rc=%d" % rc)
        raise RuntimeError("command failed, rc=%d" % rc)

    if response.isdigit():
        return int(response)

    # Fixed: the format string had no conversion specifier, which made the
    # "%" operator raise TypeError instead of logging the bad output.
    logger.error("grep -c returned: %s" % response)
    raise RuntimeError("grep -c returned: %s" % response)
def main():
    '''Pass alignments from a cmp.h5 file, for one movie, to printAlignment.

    Expects exactly two positional arguments from getParms(): a bas.h5 file
    (used for its movie name and maximum ZMW number) and a cmp.h5 file.
    With opt.ZMW set only that hole's alignments are handled; otherwise all
    alignments are handled, hole by hole.  opt.flen is forwarded to
    printAlignment.  Exits via sys.exit() on a wrong argument count.
    '''
    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()
    if len(args) != 2:
        logger.error("please specify bas.h5 and cmp.h5 files as parameters. See --help")
        sys.exit()

    # TODO: Actually, all we need from the bas file is the movie name
    # (maxHole will default to something clever). We don't need to
    # open the bas file to determine the movie name: It's part of the
    # filename. The only real reason we specify a bas file as the
    # first parameter is to match the command line interface of other
    # scripts.
    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    basReader = H5BasFile.BasFile(basFilename)
    movie = basReader.movieName()

    cmpFilename = args[1]
    logger.debug("cmp file: %s" % cmpFilename)
    cmpReader = H5CmpFile.CmpFile(fileName=cmpFilename)
    cmpMovie = H5CmpFile.CmpMovie(cmpObject=cmpReader,
                                  movieName=movie,
                                  maxHole=basReader.maxZMW())

    if opt.ZMW is None:
        # no specific ZMW requested: walk all of them
        alignments = cmpMovie.getAlignmentsByHole()
    else:
        # a specific ZMW was asked for
        alignments = cmpMovie.getAlignmentsForHole(opt.ZMW)

    for align in alignments:
        printAlignment(align, cmpMovie, opt.flen)

    logger.debug("complete")
def submitScript(self):
    '''Submit this object's script with qsub and record the job number.

    The qsub command line is built from self.jobName, self.scriptOutput
    and self.scriptName, with the job start delayed by STARTWAIT seconds.
    The text qsub prints is matched against Chunk.JOBNO_PATTERN and the
    first capture group becomes the job number.

    Returns the job number, which is also stored in self.jobno.
    Raises RuntimeError if the qsub command reports a non-zero status or
    if its output does not match Chunk.JOBNO_PATTERN.
    '''
    # Dependent job submission will fail if parent has already
    # completed. So delay all job startups by a short amount of time.
    startAt = datetime.datetime.now() + datetime.timedelta(0, STARTWAIT)
    startAtStr = startAt.strftime('%Y%m%d%H%M.%S')

    cmd = [
        'qsub',
        '-N %s' % self.jobName,                 # job name
        '-o %s' % self.scriptOutput,            # output file
        '-j oe',                                # combine stdout and stderr
        '-l nodes=1:ppn=1,walltime=4:00:00',    # resources required
        '-a %s' % startAtStr,                   # delay start, see above
        '-d . ',                                # working directory (strangely, ./ is not the default)
        '-r n',                                 # do NOT attempt to restart on failure
        '-V',                                   # export all environment variables to job
        '-W umask=0002',                        # make logs rw-rw-r--
        '-m n',                                 # don't send any mail
        self.scriptName,                        # script to run
    ]
    command = ' '.join(cmd)
    logger.debug('running %s' % command)

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()     # None means the command exited with status 0

    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError('command failed, rc=%d' % rc)

    match = re.match(Chunk.JOBNO_PATTERN, response)
    if match is None:
        # Report what qsub actually printed.  This branch used to reference
        # an undefined name (jobSeqStr) and died with NameError instead.
        logger.error("invalid job sequence number: %s" % response)
        raise RuntimeError("invalid job sequence number: %s" % response)

    response = match.group(1)
    logger.debug('jobno is %s' % response)

    self.jobno = response
    return response
def submitScript(self):
    '''Queue self.scriptName with qsub and remember the resulting job number.

    Uses self.jobName as the job name and self.scriptOutput as the job's
    output file, and asks qsub to hold the start until STARTWAIT seconds
    from now.  qsub's stdout is parsed with Chunk.JOBNO_PATTERN; capture
    group 1 is taken as the job number.

    Returns the job number (also saved as self.jobno).
    Raises RuntimeError when the qsub command reports a non-zero status or
    when its output does not match Chunk.JOBNO_PATTERN.
    '''
    # Dependent job submission will fail if parent has already
    # completed. So delay all job startups by a short amount of time.
    startAt = datetime.datetime.now() + datetime.timedelta(0, STARTWAIT)
    startAtStr = startAt.strftime('%Y%m%d%H%M.%S')

    cmd = list()
    cmd.append('qsub')
    cmd.append('-N %s' % self.jobName)                  # job name
    cmd.append('-o %s' % self.scriptOutput)             # output file
    cmd.append('-j oe')                                 # combine stdout and stderr
    cmd.append('-l nodes=1:ppn=1,walltime=4:00:00')     # resources required
    cmd.append('-a %s' % startAtStr)                    # delay start, see above
    cmd.append('-d . ')                                 # working directory (strangely, ./ is not the default)
    cmd.append('-r n')                                  # do NOT attempt to restart on failure
    cmd.append('-V')                                    # export all environment variables to job
    cmd.append('-W umask=0002')                         # make logs rw-rw-r--
    cmd.append('-m n')                                  # don't send any mail
    cmd.append(self.scriptName)                         # script to run

    command = ' '.join(cmd)
    logger.debug('running %s' % command)

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()     # None means the command exited with status 0

    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError('command failed, rc=%d' % rc)

    match = re.match(Chunk.JOBNO_PATTERN, response)
    if match is None:
        # Fixed: the message referenced jobSeqStr, a name that does not
        # exist in this function, so the error path raised NameError.
        logger.error("invalid job sequence number: %s" % response)
        raise RuntimeError("invalid job sequence number: %s" % response)

    jobno = match.group(1)
    logger.debug('jobno is %s' % jobno)

    self.jobno = jobno
    return jobno
def main():
    '''Print the cmp.h5 file details and a table of its alignments.

    Expects exactly two positional arguments from getParms(): a bas.h5 file
    (used for its movie name and maximum ZMW number) and a cmp.h5 file.
    After the file details and a column header line, every alignment is
    passed to printAlign, ordered by hole when opt.sort is 'hole' and in
    the file's own order otherwise.  Exits via sys.exit() on a wrong
    argument count.
    '''
    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()
    if len(args) != 2:
        logger.error("please specify bas.h5 and cmp.h5 files as parameters. See --help")
        sys.exit()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    basReader = H5BasFile.BasFile(basFilename)

    cmpFilename = args[1]
    logger.debug("cmp file: %s" % cmpFilename)
    cmpReader = H5CmpFile.CmpFile(fileName=cmpFilename)
    cmpMovie = H5CmpFile.CmpMovie(cmpObject=cmpReader,
                                  movieName=basReader.movieName(),
                                  maxHole=basReader.maxZMW())

    cmpReader.printDetails()

    # Column headings, followed by one blank line.
    heading = " AlnID RG Hole Set Stb SubRd Seq Ref St Start End RefStrt RefEnd OffStrt OffEnd"
    sys.stdout.write(heading + "\n")
    sys.stdout.write("\n")

    if opt.sort == 'hole':
        alignments = cmpMovie.getAlignmentsByHole()
    else:
        # else, must be 'none'
        alignments = cmpMovie.getAllAlignments()    # generator function, returns a dict

    for align in alignments:
        printAlign(align)

    logger.debug("complete")
def _clipToWindow(regionStart, regionEnd, start, end):
    '''Clip the half-open region [regionStart, regionEnd) to [start, end).

    Assumes start <= end.  Returns (lo, hi) with start <= lo <= hi <= end;
    lo == hi means the region does not overlap the window.
    '''
    lo = max(start, min(regionStart, end))
    hi = max(lo, min(regionEnd, end))
    return lo, hi


def main():
    '''Plot adapter alignment scores along one ZMW read and save the figure.

    The bas.h5 file name and the ZMW number are the first two positional
    arguments returned by getParms().  The read between opt.start and
    opt.end (the read length when opt.end is None) is used as the aligner's
    reference with H5BasFile.ADAPTER as its read, and the aligner's scores
    are plotted against read position.  Unless opt.nocol is set, the HQ
    region and the adapter regions are overlaid in their own colours.  The
    figure goes to opt.output, or to ZMW-<hole>.png when that is None.
    Exits via sys.exit() when the ZMW number is not an integer.
    '''
    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()
    basFilename = args[0]
    basfile = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit()

    start = opt.start
    end = basfile.readLen(hole) if opt.end is None else opt.end
    sequence = basfile.getSequence(hole, start, end)

    aln = SWAligner.Aligner()
    aln.setRef(sequence)
    aln.setRead(H5BasFile.ADAPTER)
    aln.fillMatrix()

    # allScores is plotted against positions start..end-1 below, so
    # allScores[i] belongs to read position start + i.
    allScores = aln.allScores()
    positions = xrange(start, end)

    title = "ZMW %d (%d to %d)" % (hole, start, end)
    plt.suptitle(title, fontsize=14, fontweight='bold')
    plt.plot(positions, allScores, COL_NOT_HQ, zorder=1, label='non-HQ')

    if not opt.nocol:   # finding HQ region takes a long time, so optionally turn it off

        # There doesn't seem to be a way to separately specify a
        # colour for each point in a plot. So we'll plot in one
        # colour, then overlay subregions of that with another
        # colour. Plot commands are rendered in increasing zorder.
        #
        # Region bounds are absolute read coordinates, so each one is
        # clipped to the plotted window and shifted by start before it is
        # used to slice allScores. Slicing with the absolute coordinates
        # picked the wrong scores whenever start > 0 and produced x/y
        # length mismatches for regions reaching past end.

        HQStart, HQEnd = basfile.HQregion(hole)[2:4]
        lo, hi = _clipToWindow(HQStart, HQEnd, start, end)
        plt.plot(xrange(lo, hi), allScores[lo - start:hi - start],
                 COL_HQ, zorder=2, label='HQ')

        label = 'adapter'   # I will only say this once...

        for region in basfile.holeRegions(hole):    # loop through the regions looking for adapters
            regionType, regionStart, regionEnd = region[1:4]

            if regionType == 0:     # an adapter?
                lo, hi = _clipToWindow(regionStart, regionEnd, start, end)
                plt.plot(xrange(lo, hi), allScores[lo - start:hi - start],
                         COL_ADAPT, zorder=3, label=label)
                label = '_nolegend_'    # don't generate multiple legend entries

    plt.legend(loc='best', prop={'size': 10})   # add a legend box to figure
    plt.ylim(0, len(H5BasFile.ADAPTER))

    if opt.output is not None:
        outfile = opt.output
    else:
        outfile = "ZMW-%05d.png" % hole

    plt.savefig(outfile)

    logger.debug("complete")
def _clipToWindow(regionStart, regionEnd, start, end):
    '''Restrict the half-open region [regionStart, regionEnd) to [start, end).

    Assumes start <= end.  The returned (lo, hi) pair always satisfies
    start <= lo <= hi <= end; an empty result (lo == hi) means the region
    lies entirely outside the window.
    '''
    lo = max(start, min(regionStart, end))
    hi = max(lo, min(regionEnd, end))
    return lo, hi


def main():
    '''Save a plot of adapter alignment scores along one ZMW read.

    Positional arguments (from getParms()) are the bas.h5 file name and the
    ZMW number.  The read from opt.start to opt.end (or to the read length
    when opt.end is None) becomes the aligner's reference and
    H5BasFile.ADAPTER its read; the resulting scores are plotted against
    read position.  Unless opt.nocol is set, the HQ region and every
    adapter region are drawn over the base plot in their own colours.  The
    image is written to opt.output, defaulting to ZMW-<hole>.png.
    Exits via sys.exit() when the ZMW number is not an integer.
    '''
    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()
    basFilename = args[0]
    basfile = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except ValueError:
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit()

    start = opt.start
    end = basfile.readLen(hole) if opt.end is None else opt.end
    sequence = basfile.getSequence(hole, start, end)

    aln = SWAligner.Aligner()
    aln.setRef(sequence)
    aln.setRead(H5BasFile.ADAPTER)
    aln.fillMatrix()
    allScores = aln.allScores()

    # x axis of the base plot; allScores[i] is the score at position
    # start + i, since the two are plotted against each other.
    window = xrange(start, end)

    title = "ZMW %d (%d to %d)" % (hole, start, end)
    plt.suptitle(title, fontsize=14, fontweight='bold')
    plt.plot(window, allScores, COL_NOT_HQ, zorder=1, label='non-HQ')

    def overlay(regionStart, regionEnd, colour, zorder, label):
        # Regions come in absolute read coordinates: clip them to the
        # plotted window and shift by start before slicing allScores.
        # Slicing with the raw coordinates used the wrong scores whenever
        # start > 0 and broke the plot for regions reaching past end.
        lo, hi = _clipToWindow(regionStart, regionEnd, start, end)
        plt.plot(xrange(lo, hi), allScores[lo - start:hi - start],
                 colour, zorder=zorder, label=label)

    if not opt.nocol:   # finding HQ region takes a long time, so optionally turn it off

        # There doesn't seem to be a way to separately specify a
        # colour for each point in a plot. So we'll plot in one
        # colour, then overlay subregions of that with another
        # colour. Plot commands are rendered in increasing zorder.

        HQStart, HQEnd = basfile.HQregion(hole)[2:4]
        overlay(HQStart, HQEnd, COL_HQ, 2, 'HQ')

        label = 'adapter'   # I will only say this once...

        for region in basfile.holeRegions(hole):    # loop through the regions looking for adapters
            regionType, regionStart, regionEnd = region[1:4]

            if regionType == 0:     # an adapter?
                overlay(regionStart, regionEnd, COL_ADAPT, 3, label)
                label = '_nolegend_'    # don't generate multiple legend entries

    plt.legend(loc='best', prop={'size': 10})   # add a legend box to figure
    plt.ylim(0, len(H5BasFile.ADAPTER))

    if opt.output is not None:
        outfile = opt.output
    else:
        outfile = "ZMW-%05d.png" % hole

    plt.savefig(outfile)

    logger.debug("complete")