def test_loadmetadata_from_dataset_create_cli(self):
    fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    fn2 = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    log.debug(fn)

    aln = AlignmentSet(data.getXml(8))
    aln.metadata.collections = None
    aln.copyTo(fn)
    aln.close()
    del aln
    self.assertTrue(os.path.exists(fn))

    aln = AlignmentSet(fn)
    self.assertFalse(aln.metadata.collections)

    cmd = "dataset create --metadata {m} {o} {i}".format(
        o=fn2,
        i=fn,
        m=("/pbi/dept/secondary/siv/testdata/"
           "SA3-Sequel/lambda/roche_SAT/"
           "m54013_151205_032353.subreadset.xml"))
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0, m)

    aln = AlignmentSet(fn2)
    self.assertTrue(aln.metadata.collections)
def test_membership_filter_with_equal_operator(self):
    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)[:1]
    aln.filters.addRequirement(zm=[('=', hns)])
    self.assertEqual(len(list(aln)), 5)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)
    aln.filters.addRequirement(zm=[('==', hns)])
    self.assertEqual(len(list(aln)), 177)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)
    hns = [n for _ in range(10000) for n in hns]
    hns = np.array(hns)
    aln.filters.addRequirement(zm=[('==', hns)])
    self.assertEqual(len(list(aln)), 177)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)[:1]
    hns = list(hns)
    aln.filters.addRequirement(zm=[('==', hns)])
    self.assertEqual(len(list(aln)), 5)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)[:1]
    hns = set(hns)
    aln.filters.addRequirement(zm=[('==', hns)])
    self.assertEqual(len(list(aln)), 5)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    qnames = [r.qName for r in aln[:10]]
    aln.filters.addRequirement(qname=[('==', qnames)])
    self.assertEqual(len(list(aln)), 10)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    qnames = [r.qName for r in aln[:1]]
    aln.filters.addRequirement(qname=[('==', qnames)])
    self.assertEqual(len(list(aln)), 1)

    fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)[:1]
    aln.filters.addRequirement(zm=[('==', hns)])
    aln.write(fn)
    aln.close()
    aln2 = AlignmentSet(fn)
    self.assertEqual(len(list(aln2)), 5)
def test_membership_filter(self):
    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)[:1]
    aln.filters.addRequirement(zm=[('in', hns)])
    self.assertEqual(len(list(aln)), 5)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)
    aln.filters.addRequirement(zm=[('in', hns)])
    self.assertEqual(len(list(aln)), 177)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)
    hns = [n for _ in range(10000) for n in hns]
    hns = np.array(hns)
    aln.filters.addRequirement(zm=[('in', hns)])
    self.assertEqual(len(list(aln)), 177)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)[:1]
    hns = list(hns)
    aln.filters.addRequirement(zm=[('in', hns)])
    self.assertEqual(len(list(aln)), 5)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)[:1]
    hns = set(hns)
    aln.filters.addRequirement(zm=[('in', hns)])
    self.assertEqual(len(list(aln)), 5)

    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    qnames = [r.qName for r in aln[:10]]
    aln.filters.addRequirement(qname=[('in', qnames)])
    self.assertEqual(len(list(aln)), 10)

    fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(list(aln)), 177)
    hns = np.unique(aln.index.holeNumber)[:1]
    aln.filters.addRequirement(zm=[('in', hns)])
    aln.write(fn)
    aln.close()
    aln2 = AlignmentSet(fn)
    self.assertEqual(len(list(aln2)), 5)
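
# Illustrative sketch (not part of the test suite above): the membership
# ('in') requirement selects records whose hole number appears in a given
# collection.  Using only numpy and the AlignmentSet index already shown,
# an equivalent mask can be built by hand with np.isin.  The helper name
# below is hypothetical and assumes iteration order matches the .pbi index
# order.
def _holenumber_subset_sketch(aln, hole_numbers):
    """Return the records of `aln` whose holeNumber is in `hole_numbers`."""
    import numpy as np
    wanted = np.asarray(list(hole_numbers))
    mask = np.isin(aln.index.holeNumber, wanted)
    return [record for record, keep in zip(aln, mask) if keep]
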
def test_loadmetadata_from_dataset_cli(self):
    fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    log.debug(fn)

    aln = AlignmentSet(data.getXml(7))
    aln.metadata.collections = None
    aln.copyTo(fn)
    aln.close()
    del aln
    assert os.path.exists(fn)

    aln = AlignmentSet(fn)
    assert not aln.metadata.collections

    cmd = "dataset loadmetadata {i} {m}".format(
        i=fn,
        m=("/pbi/dept/secondary/siv/testdata/"
           "SA3-Sequel/lambda/roche_SAT/"
           "m54013_151205_032353.subreadset.xml"))
    self._check_cmd(cmd)

    aln = AlignmentSet(fn)
    assert aln.metadata.collections
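
# Illustrative sketch: `backticks` and `self._check_cmd` in the tests above
# are project helpers that run a shell command and assert a zero exit
# status.  A minimal standard-library (Python 3) stand-in -- the name is
# hypothetical, not the real helper -- could look like:
def _run_and_check_sketch(cmd):
    """Run a shell command and fail loudly on a non-zero exit status."""
    import shlex
    import subprocess
    result = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
    assert result.returncode == 0, result.stderr
    return result.stdout
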
class ToolRunner(object): """ The main driver class for the GenomicConsensus tool. """ def __init__(self): self._inCmpH5 = None self._resultsQueue = None self._workQueue = None self._slaves = None self._algorithm = None self._algorithmConfiguration = None self._aborting = False def _setupLogging(self): if options.quiet: logLevel = logging.ERROR elif options.verbosity >= 2: logLevel = logging.DEBUG elif options.verbosity == 1: logLevel = logging.INFO else: logLevel = logging.WARNING logFormat = '[%(levelname)s] %(message)s' logging.basicConfig(level=logLevel, format=logFormat) def _makeTemporaryDirectory(self): """ Make a temp dir where we can stash things if necessary. """ options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp") logging.info("Created temporary directory %s" % (options.temporaryDirectory,) ) def _algorithmByName(self, name): if name=="plurality": algo = plurality elif name=="quiver": algo = quiver else: die("Failure: unrecognized algorithm %s" % name) isOK, msg = algo.availability if not isOK: die("Failure: %s" % msg) return algo def _launchSlaves(self): """ Launch a group of worker processes (self._slaves), the queue (self._workQueue) that will be used to send them chunks of work, and the queue that will be used to receive back the results (self._resultsQueue). Additionally, launch the result collector process. """ availableCpus = multiprocessing.cpu_count() logging.info("Available CPUs: %d" % (availableCpus,)) logging.info("Requested workers: %d" % (options.numWorkers,)) logging.info("Parallel Mode: %s" % ("Threaded" if options.threaded else "Process",)) if (options.numWorkers > availableCpus): logging.warn("More workers requested (%d) than CPUs available (%d);" " may result in suboptimal performance." % (options.numWorkers, availableCpus)) self._initQueues() WorkerType, ResultCollectorType = self._algorithm.slaveFactories(options.threaded) self._slaves = [] for i in xrange(options.numWorkers): p = WorkerType(self._workQueue, self._resultsQueue, self._algorithmConfiguration) self._slaves.append(p) p.start() logging.info("Launched compute slaves.") rcp = ResultCollectorType(self._resultsQueue, self._algorithmConfiguration) rcp.start() self._slaves.append(rcp) logging.info("Launched collector slave.") def _initQueues(self): if options.threaded: self._workQueue = Queue.Queue(options.queueSize) self._resultsQueue = Queue.Queue(options.queueSize) else: self._workQueue = multiprocessing.Queue(options.queueSize) self._resultsQueue = multiprocessing.Queue(options.queueSize) def _readCmpH5Input(self): """ Read the CmpH5 input file into a CmpH5 object and store it as self._inCmpH5. """ fname = options.inputFilename self._inCmpH5 = AlignmentSet(fname) def _loadReference(self, cmpH5): logging.info("Loading reference") err = reference.loadFromFile(options.referenceFilename, cmpH5) if err: die("Error loading reference") # Grok the referenceWindow spec, if any. if options.referenceWindowsAsString is None: options.referenceWindows = () elif options.skipUnrecognizedContigs: # This is a workaround for smrtpipe scatter/gather. 
options.referenceWindows = [] for s in options.referenceWindowsAsString.split(","): try: win = reference.stringToWindow(s) options.referenceWindows.append(win) except: pass else: options.referenceWindows = map(reference.stringToWindow, options.referenceWindowsAsString.split(",")) if options.referenceWindowsFromAlignment: options.referenceWindows = cmpH5.refWindows def _checkFileCompatibility(self, cmpH5): if not cmpH5.isSorted: die("Input CmpH5 file must be sorted.") if cmpH5.isEmpty: die("Input CmpH5 file must be nonempty.") def _shouldDisableChunkCache(self, cmpH5): #if isinstance(cmpH5, CmpH5Reader): #if cmpH5.isCmpH5: # threshold = options.autoDisableHdf5ChunkCache # return datasetCountExceedsThreshold(cmpH5, threshold) #else: # return False return True def _configureAlgorithm(self, options, cmpH5): assert self._algorithm != None try: self._algorithmConfiguration = self._algorithm.configure(options, cmpH5) except IncompatibleDataException as e: die("Failure: %s" % e.message) def _mainLoop(self): # Split up reference genome into chunks and farm out the # a chunk as a unit of work. logging.debug("Starting main loop.") ids = reference.enumerateIds(options.referenceWindows) for _id in ids: if options.fancyChunking: chunks = reference.fancyEnumerateChunks(self._inCmpH5, _id, options.referenceChunkSize, options.minCoverage, options.minMapQV, options.referenceWindows) else: chunks = reference.enumerateChunks(_id, options.referenceChunkSize, options.referenceWindows) for chunk in chunks: if self._aborting: return self._workQueue.put(chunk) # Write sentinels ("end-of-work-stream") for i in xrange(options.numWorkers): self._workQueue.put(None) def _printProfiles(self): for profile in glob.glob(os.path.join(options.temporaryDirectory, "*")): pstats.Stats(profile).sort_stats("time").print_stats(20) def _cleanup(self): if options.doProfiling: logging.info("Removing %s" % options.temporaryDirectory) shutil.rmtree(options.temporaryDirectory, ignore_errors=True) def _setupEvidenceDumpDirectory(self, directoryName): if os.path.exists(directoryName): shutil.rmtree(directoryName) os.makedirs(directoryName) @property def aborting(self): return self._aborting def abortWork(self, why): """ Performs a shutdown of all the slave processes. Called by the monitoring thread when a child process exits with a non-zero, or when a keyboard interrupt (Ctrl-C) is given. Not called during normal shutdown. """ logging.error(why) self._aborting = True self._resultsQueue.close() self._workQueue.close() @property def slaves(self): return self._slaves def main(self): # This looks scary but it's not. Python uses reference # counting and has a secondary, optional garbage collector for # collecting garbage cycles. Unfortunately when a cyclic GC # happens when a thread is calling cPickle.dumps, the # interpreter crashes sometimes. See Bug 19704. Since we # don't leak garbage cycles, disabling the cyclic GC is # essentially harmless. 
gc.disable() parseOptions() self._algorithm = self._algorithmByName(options.algorithm) self._setupLogging() random.seed(42) logging.info("h5py version: %s" % h5py.version.version) logging.info("hdf5 version: %s" % h5py.version.hdf5_version) logging.info("ConsensusCore version: %s" % (consensusCoreVersion() or "ConsensusCore unavailable")) logging.info("Starting.") atexit.register(self._cleanup) if options.doProfiling: self._makeTemporaryDirectory() with AlignmentSet(options.inputFilename) as peekFile: if not peekFile.isCmpH5 and not peekFile.hasPbi: logging.warn("'fancyChunking' not yet available for BAM " "files without accompanying .pbi files, " "disabling") options.fancyChunking = False logging.info("Peeking at file %s" % options.inputFilename) logging.info("Input data: numAlnHits=%d" % len(peekFile)) resolveOptions(peekFile) self._loadReference(peekFile) self._checkFileCompatibility(peekFile) self._configureAlgorithm(options, peekFile) options.disableHdf5ChunkCache = True #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile) #if options.disableHdf5ChunkCache: # logging.info("Will disable HDF5 chunk cache (large number of datasets)") #logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count()) if options.dumpEvidence: self._setupEvidenceDumpDirectory(options.evidenceDirectory) self._launchSlaves() self._readCmpH5Input() monitoringThread = threading.Thread(target=monitorSlaves, args=(self,)) monitoringThread.start() try: if options.doProfiling: cProfile.runctx("self._mainLoop()", globals=globals(), locals=locals(), filename=os.path.join(options.temporaryDirectory, "profile-main.out")) elif options.doDebugging: if not options.threaded: die("Debugging only works with -T (threaded) mode") logging.info("PID: %d", os.getpid()) import ipdb with ipdb.launch_ipdb_on_exception(): self._mainLoop() else: self._mainLoop() except: why = traceback.format_exc() self.abortWork(why) monitoringThread.join() if self._aborting: logging.error("Aborting") return -1 else: logging.info("Finished.") if options.doProfiling: self._printProfiles() # close h5 file. self._inCmpH5.close() return 0
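
# Illustrative sketch: _mainLoop/_launchSlaves above use a common pattern --
# the producer enqueues one None sentinel per worker to mark the end of the
# work stream.  A self-contained, simplified version of that pattern (not
# GenomicConsensus code; assumes the 'fork' start method so the nested
# worker function can be used as a Process target):
def _sentinel_queue_sketch(chunks, num_workers=4):
    import multiprocessing

    def worker(work_queue, results_queue):
        while True:
            chunk = work_queue.get()
            if chunk is None:            # sentinel: no more work
                return
            results_queue.put(chunk)     # stand-in for real processing

    chunks = list(chunks)
    work_queue = multiprocessing.Queue()
    results_queue = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=worker,
                                       args=(work_queue, results_queue))
               for _ in range(num_workers)]
    for p in workers:
        p.start()
    for chunk in chunks:
        work_queue.put(chunk)
    for _ in range(num_workers):         # one sentinel per worker
        work_queue.put(None)
    # Drain results before joining, so no child blocks on a full pipe.
    results = [results_queue.get() for _ in range(len(chunks))]
    for p in workers:
        p.join()
    return results
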
class KineticsToolsRunner(object): def __init__(self, args): self.args = args self.alignments = None def start(self): self.validateArgs() return self.run() def getVersion(self): return __version__ def validateArgs(self): parser = get_parser() if not os.path.exists(self.args.alignment_set): parser.error('Input AlignmentSet file provided does not exist') if self.args.identify and self.args.control: parser.error('--control and --identify are mutally exclusive. Please choose one or the other') if self.args.useLDA: if self.args.m5Cclassifier is None: parser.error('Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.') if self.args.m5Cgff: if not self.args.useLDA: parser.error('m5Cgff file can only be generated in --useLDA mode.') # if self.args.methylFraction and not self.args.identify: # parser.error('Currently, --methylFraction only works when the --identify option is specified.') def run(self): # Figure out what modifications to identify mods = self.args.identify modsToCall = [] if mods: items = mods.split(",") if 'm6A' in items: modsToCall.append('H') if 'm4C' in items: modsToCall.append('J') if 'm5C_TET' in items: modsToCall.append('K') self.args.identify = True self.args.modsToCall = modsToCall self.options = self.args self.options.cmdLine = " ".join(sys.argv) self._workers = [] # set random seed # XXX note that this is *not* guaranteed to yield reproducible results # indepenently of the number of processing cores used! if self.options.randomSeed is not None: np.random.seed(self.options.randomSeed) if self.args.doProfiling: cProfile.runctx("self._mainLoop()", globals=globals(), locals=locals(), filename="profile.out") else: try: ret = self._mainLoop() finally: # Be sure to shutdown child processes if we get an exception on the main thread if not self.args.threaded: for w in self._workers: if w.is_alive(): w.terminate() return ret def _initQueues(self): if self.options.threaded: # Work chunks are created by the main thread and put on this queue # They will be consumed by KineticWorker threads, stored in self._workers self._workQueue = Queue.Queue(self.options.maxQueueSize) # Completed chunks are put on this queue by KineticWorker threads # They are consumed by the KineticsWriter process self._resultsQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize) else: # Work chunks are created by the main thread and put on this queue # They will be consumed by KineticWorker threads, stored in self._workers self._workQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize) # Completed chunks are put on this queue by KineticWorker threads # They are consumed by the KineticsWriter process self._resultsQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize) def _launchSlaveProcesses(self): """ Launch a group of worker processes (self._workers), the queue (self._workQueue) that will be used to send them chunks of work, and the queue that will be used to receive back the results (self._resultsQueue). Additionally, launch the result collector process. 
""" availableCpus = multiprocessing.cpu_count() logging.info("Available CPUs: %d" % (availableCpus,)) logging.info("Requested worker processes: %d" % (self.options.numWorkers,)) # Use all CPUs if numWorkers < 1 if self.options.numWorkers < 1: self.options.numWorkers = availableCpus # Warn if we make a bad numWorker argument is used if self.options.numWorkers > availableCpus: logging.warn("More worker processes requested (%d) than CPUs available (%d);" " may result in suboptimal performance." % (self.options.numWorkers, availableCpus)) self._initQueues() if self.options.threaded: self.options.numWorkers = 1 WorkerType = KineticWorkerThread else: WorkerType = KineticWorkerProcess # Launch the worker processes self._workers = [] for i in xrange(self.options.numWorkers): p = WorkerType(self.options, self._workQueue, self._resultsQueue, self.ipdModel, sharedAlignmentSet=self.alignments) self._workers.append(p) p.start() logging.info("Launched worker processes.") # Launch result collector self._resultCollectorProcess = KineticsWriter(self.options, self._resultsQueue, self.refInfo, self.ipdModel) self._resultCollectorProcess.start() logging.info("Launched result collector process.") # Spawn a thread that monitors worker threads for crashes self.monitoringThread = threading.Thread(target=monitorChildProcesses, args=(self._workers + [self._resultCollectorProcess],)) self.monitoringThread.start() def _queueChunksForWindow(self, refWindow): """ Compute the chunk extents and queue up the work for a single reference """ winId = refWindow.refId winStart = refWindow.start winEnd = refWindow.end pass def loadReferenceAndModel(self, referencePath): assert self.alignments is not None and self.referenceWindows is not None # Load the reference contigs - annotated with their refID from the cmp.h5 logging.info("Loading reference contigs %s" % referencePath) contigs = ReferenceUtils.loadReferenceContigs(referencePath, alignmentSet=self.alignments, windows=self.referenceWindows) # There are three different ways the ipdModel can be loaded. # In order of precedence they are: # 1. Explicit path passed to --ipdModel # 2. Path to parameter bundle, model selected using the cmp.h5's sequencingChemistry tags # 3. Fall back to built-in model. # By default, use built-in model ipdModel = None if self.args.ipdModel: ipdModel = self.args.ipdModel logging.info("Using passed in ipd model: %s" % self.args.ipdModel) if not os.path.exists(self.args.ipdModel): logging.error("Couldn't find model file: %s" % self.args.ipdModel) sys.exit(1) elif self.args.paramsPath: if not os.path.exists(self.args.paramsPath): logging.error("Params path doesn't exist: %s" % self.args.paramsPath) sys.exit(1) majorityChem = ReferenceUtils.loadAlignmentChemistry(self.alignments) # Temporary solution for Sequel chemistries: we do not # have trained kinetics models in hand yet for Sequel # chemistries. However we have observed that the P5-C3 # training seems to yield fairly good results on Sequel # chemistries to date. So for the moment, we will use # that model for Sequel data. 
if majorityChem.startswith("S/"): logging.info("No trained model available yet for Sequel chemistries; modeling as P5-C3") majorityChem = "P5-C3" ipdModel = os.path.join(self.args.paramsPath, majorityChem + ".h5") if majorityChem == 'unknown': logging.error("Chemistry cannot be identified---cannot perform kinetic analysis") sys.exit(1) elif not os.path.exists(ipdModel): logging.error("Aborting, no kinetics model available for this chemistry: %s" % ipdModel) sys.exit(1) else: logging.info("Using Chemistry matched IPD model: %s" % ipdModel) self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters) def loadSharedAlignmentSet(self, cmpH5Filename): """ Read the input AlignmentSet so the indices can be shared with the slaves. This is also used to pass to ReferenceUtils for setting up the ipdModel object. """ logging.info("Reading AlignmentSet: %s" % cmpH5Filename) logging.info(" reference: %s" % self.args.reference) self.alignments = AlignmentSet(cmpH5Filename, referenceFastaFname=self.args.reference) # XXX this should ensure that the file(s) get opened, including any # .pbi indices - but need to confirm this self.refInfo = self.alignments.referenceInfoTable def _mainLoop(self): """ Main loop First launch the worker and writer processes Then we loop over ReferenceGroups in the cmp.h5. For each contig we will: 1. Load the sequence into the main memory of the parent process 3. Chunk up the contig and submit the chunk descriptions to the work queue Finally, wait for the writer process to finish. """ # This looks scary but it's not. Python uses reference # counting and has a secondary, optional garbage collector for # collecting garbage cycles. Unfortunately when a cyclic GC # happens when a thread is calling cPickle.dumps, the # interpreter crashes sometimes. See Bug 19704. Since we # don't leak garbage cycles, disabling the cyclic GC is # essentially harmless. #gc.disable() self.loadSharedAlignmentSet(self.args.alignment_set) # Resolve the windows that will be visited. if self.args.referenceWindowsAsString is not None: self.referenceWindows = [] for s in self.args.referenceWindowsAsString.split(","): try: win = ReferenceUtils.parseReferenceWindow(s, self.alignments.referenceInfo) self.referenceWindows.append(win) except: if self.args.skipUnrecognizedContigs: continue else: raise Exception, "Unrecognized contig!" elif self.args.referenceWindowsFromAlignment: self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment(self.alignments, self.alignments.referenceInfo) refNames = set([rw.refName for rw in self.referenceWindows]) # limit output to contigs that overlap with reference windows self.refInfo = [r for r in self.refInfo if r.Name in refNames] else: self.referenceWindows = ReferenceUtils.createReferenceWindows( self.refInfo) # Load reference and IpdModel self.loadReferenceAndModel(self.args.reference) # Spawn workers self._launchSlaveProcesses() logging.info('Generating kinetics summary for [%s]' % self.args.alignment_set) #self.referenceMap = self.alignments['/RefGroup'].asDict('RefInfoID', 'ID') #self.alnInfo = self.alignments['/AlnInfo'].asRecArray() # Main loop -- we loop over ReferenceGroups in the cmp.h5. For each contig we will: # 1. Load the sequence into the main memory of the parent process # 2. Fork the workers # 3. 
chunk up the contig and self.workChunkCounter = 0 # Iterate over references for window in self.referenceWindows: logging.info('Processing window/contig: %s' % (window,)) for chunk in ReferenceUtils.enumerateChunks(self.args.referenceStride, window): self._workQueue.put((self.workChunkCounter, chunk)) self.workChunkCounter += 1 # Shutdown worker threads with None sentinels for i in xrange(self.args.numWorkers): self._workQueue.put(None) for w in self._workers: w.join() # Join on the result queue and the resultsCollector process. # This ensures all the results are written before shutdown. self.monitoringThread.join() self._resultsQueue.join() self._resultCollectorProcess.join() logging.info("ipdSummary.py finished. Exiting.") self.alignments.close() return 0
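
# Illustrative sketch: the _mainLoop above relies on JoinableQueue
# semantics -- the producer calls .join() and every consumer must call
# .task_done() once per item it takes, or the join never returns.  A
# minimal standalone version of that handshake (not KineticsTools code;
# assumes the 'fork' start method for the nested consumer function):
def _joinable_queue_sketch(items, num_workers=2):
    import multiprocessing

    def consumer(queue):
        while True:
            item = queue.get()
            try:
                if item is None:         # sentinel: stop consuming
                    return
                # ... real work on `item` would happen here ...
            finally:
                queue.task_done()        # required for queue.join() to return

    queue = multiprocessing.JoinableQueue()
    workers = [multiprocessing.Process(target=consumer, args=(queue,))
               for _ in range(num_workers)]
    for p in workers:
        p.start()
    for item in items:
        queue.put(item)
    for _ in range(num_workers):
        queue.put(None)
    queue.join()                         # all items (and sentinels) handled
    for p in workers:
        p.join()
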
class ToolRunner(object): """ The main driver class for the GenomicConsensus tool. It is assumed that arguments have already been parsed and used to populate the global 'options' namespace before instantiating this class. """ def __init__(self): self._inAlnFile = None self._resultsQueue = None self._workQueue = None self._slaves = None self._algorithm = None self._algorithmConfiguration = None self._aborting = False def _setupLogging(self): if options.quiet: logLevel = logging.ERROR elif options.verbosity >= 2: logLevel = logging.DEBUG elif options.verbosity == 1: logLevel = logging.INFO else: logLevel = logging.WARNING log = logging.getLogger() log.setLevel(logLevel) def _makeTemporaryDirectory(self): """ Make a temp dir where we can stash things if necessary. """ options.temporaryDirectory = tempfile.mkdtemp( prefix="GenomicConsensus-", dir="/tmp") logging.info("Created temporary directory %s" % (options.temporaryDirectory, )) def _algorithmByName(self, name): if name == "plurality": from GenomicConsensus.plurality import plurality algo = plurality elif name == "quiver": from GenomicConsensus.quiver import quiver algo = quiver elif name == "arrow": from GenomicConsensus.arrow import arrow algo = arrow else: die("Failure: unrecognized algorithm %s" % name) isOK, msg = algo.availability if not isOK: die("Failure: %s" % msg) return algo def _launchSlaves(self): """ Launch a group of worker processes (self._slaves), the queue (self._workQueue) that will be used to send them chunks of work, and the queue that will be used to receive back the results (self._resultsQueue). Additionally, launch the result collector process. """ availableCpus = multiprocessing.cpu_count() logging.info("Available CPUs: %d" % (availableCpus, )) logging.info("Requested workers: %d" % (options.numWorkers, )) logging.info("Parallel Mode: %s" % ("Threaded" if options.threaded else "Process", )) if (options.numWorkers > availableCpus): logging.warn( "More workers requested (%d) than CPUs available (%d);" " may result in suboptimal performance." % (options.numWorkers, availableCpus)) self._initQueues() WorkerType, ResultCollectorType = self._algorithm.slaveFactories( options.threaded) self._slaves = [] for i in xrange(options.numWorkers): p = WorkerType(self._workQueue, self._resultsQueue, self._algorithmConfiguration) self._slaves.append(p) p.start() logging.info("Launched compute slaves.") rcp = ResultCollectorType(self._resultsQueue, self._algorithmConfiguration) rcp.start() self._slaves.append(rcp) logging.info("Launched collector slave.") def _initQueues(self): if options.threaded: self._workQueue = Queue.Queue(options.queueSize) self._resultsQueue = Queue.Queue(options.queueSize) else: self._workQueue = multiprocessing.Queue(options.queueSize) self._resultsQueue = multiprocessing.Queue(options.queueSize) def _readAlignmentInput(self): """ Read the AlignmentSet input file and store it as self._inAlnFile. """ fname = options.inputFilename self._inAlnFile = AlignmentSet(fname) def _loadReference(self, alnFile): logging.info("Loading reference") err = reference.loadFromFile(options.referenceFilename, alnFile) if err: die("Error loading reference") # Grok the referenceWindow spec, if any. if options.referenceWindowsAsString is None: options.referenceWindows = () elif options.skipUnrecognizedContigs: # This is a workaround for smrtpipe scatter/gather. 
options.referenceWindows = [] for s in options.referenceWindowsAsString.split(","): try: win = reference.stringToWindow(s) options.referenceWindows.append(win) except: pass else: options.referenceWindows = map( reference.stringToWindow, options.referenceWindowsAsString.split(",")) if options.referenceWindowsFromAlignment: options.referenceWindows = alnFile.refWindows def _checkFileCompatibility(self, alnFile): if not alnFile.isSorted: die("Input Alignment file must be sorted.") if alnFile.isEmpty: die("Input Alignment file must be nonempty.") def _shouldDisableChunkCache(self, alnFile): #if isinstance(alnFile, CmpH5Reader): #if alnFile.isCmpH5: # threshold = options.autoDisableHdf5ChunkCache # return datasetCountExceedsThreshold(alnFile, threshold) #else: # return False return True def _configureAlgorithm(self, options, alnFile): assert self._algorithm != None try: self._algorithmConfiguration = self._algorithm.configure( options, alnFile) except IncompatibleDataException as e: die("Failure: %s" % e.message) def _mainLoop(self): # Split up reference genome into chunks and farm out the # a chunk as a unit of work. logging.debug("Starting main loop.") ids = reference.enumerateIds(options.referenceWindows) for _id in ids: if options.fancyChunking: chunks = reference.fancyEnumerateChunks( self._inAlnFile, _id, options.referenceChunkSize, options.minCoverage, options.minMapQV, options.referenceWindows) else: chunks = reference.enumerateChunks(_id, options.referenceChunkSize, options.referenceWindows) for chunk in chunks: if self._aborting: return self._workQueue.put(chunk) # Write sentinels ("end-of-work-stream") for i in xrange(options.numWorkers): self._workQueue.put(None) def _printProfiles(self): for profile in glob.glob(os.path.join(options.temporaryDirectory, "*")): pstats.Stats(profile).sort_stats("time").print_stats(20) def _cleanup(self): if options.doProfiling: logging.info("Removing %s" % options.temporaryDirectory) shutil.rmtree(options.temporaryDirectory, ignore_errors=True) def _setupEvidenceDumpDirectory(self, directoryName): if os.path.exists(directoryName): shutil.rmtree(directoryName) os.makedirs(directoryName) @property def aborting(self): return self._aborting def abortWork(self, why): """ Performs a shutdown of all the slave processes. Called by the monitoring thread when a child process exits with a non-zero, or when a keyboard interrupt (Ctrl-C) is given. Not called during normal shutdown. """ logging.error(why) self._aborting = True self._resultsQueue.close() self._workQueue.close() @property def slaves(self): return self._slaves def main(self): # This looks scary but it's not. Python uses reference # counting and has a secondary, optional garbage collector for # collecting garbage cycles. Unfortunately when a cyclic GC # happens when a thread is calling cPickle.dumps, the # interpreter crashes sometimes. See Bug 19704. Since we # don't leak garbage cycles, disabling the cyclic GC is # essentially harmless. 
gc.disable() self._algorithm = self._algorithmByName(options.algorithm) self._setupLogging() random.seed(42) logging.info("h5py version: %s" % h5py.version.version) logging.info("hdf5 version: %s" % h5py.version.hdf5_version) logging.info("ConsensusCore version: %s" % (consensusCoreVersion() or "ConsensusCore unavailable")) logging.info("ConsensusCore2 version: %s" % (consensusCore2Version() or "ConsensusCore2 unavailable")) logging.info("Starting.") atexit.register(self._cleanup) if options.doProfiling: self._makeTemporaryDirectory() with AlignmentSet(options.inputFilename) as peekFile: if options.algorithm == "arrow" and peekFile.isCmpH5: die("Arrow does not support CmpH5 files") if not peekFile.isCmpH5 and not peekFile.hasPbi: die("Genomic Consensus only works with cmp.h5 files and BAM " "files with accompanying .pbi files") logging.info("Peeking at file %s" % options.inputFilename) logging.info("Input data: numAlnHits=%d" % len(peekFile)) resolveOptions(peekFile) self._loadReference(peekFile) self._checkFileCompatibility(peekFile) self._configureAlgorithm(options, peekFile) options.disableHdf5ChunkCache = True #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile) #if options.disableHdf5ChunkCache: # logging.info("Will disable HDF5 chunk cache (large number of datasets)") #logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count()) if options.dumpEvidence: self._setupEvidenceDumpDirectory(options.evidenceDirectory) self._launchSlaves() self._readAlignmentInput() monitoringThread = threading.Thread(target=monitorSlaves, args=(self, )) monitoringThread.start() try: if options.doProfiling: cProfile.runctx("self._mainLoop()", globals=globals(), locals=locals(), filename=os.path.join( options.temporaryDirectory, "profile-main.out")) elif options.debug: if not options.threaded: die("Debugging only works with -T (threaded) mode") logging.info("PID: %d", os.getpid()) import ipdb with ipdb.launch_ipdb_on_exception(): self._mainLoop() else: self._mainLoop() except: why = traceback.format_exc() self.abortWork(why) monitoringThread.join() if self._aborting: logging.error("Aborting") return -1 else: logging.info("Finished.") if options.doProfiling: self._printProfiles() # close h5 file. self._inAlnFile.close() return 0
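
# Illustrative sketch: main() above "peeks" at the input -- it opens the
# AlignmentSet briefly in a context manager to resolve options, check file
# compatibility and configure the algorithm, then re-opens the file for the
# actual run.  Reduced to its skeleton (the callback names are placeholders,
# not the real GenomicConsensus API):
def _peek_then_run_sketch(input_filename, resolve, configure, run):
    from pbcore.io import AlignmentSet

    # Cheap pass: inspect the header/index and finish setting up options.
    with AlignmentSet(input_filename) as peek:
        resolve(peek)
        configure(peek)

    # Full pass: re-open the file and keep it open for the main loop.
    aln = AlignmentSet(input_filename)
    try:
        return run(aln)
    finally:
        aln.close()
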
def test_membership_filter(self):
    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    hns = np.unique(aln.index.holeNumber)[:1]
    aln.filters.addRequirement(zm=[('in', hns)])
    assert len(list(aln)) == 5

    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    hns = np.unique(aln.index.holeNumber)
    aln.filters.addRequirement(zm=[('in', hns)])
    assert len(list(aln)) == 177

    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    hns = np.unique(aln.index.holeNumber)
    hns = [n for _ in range(10000) for n in hns]
    hns = np.array(hns)
    aln.filters.addRequirement(zm=[('in', hns)])
    assert len(list(aln)) == 177

    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    hns = np.unique(aln.index.holeNumber)[:1]
    hns = list(hns)
    aln.filters.addRequirement(zm=[('in', hns)])
    assert len(list(aln)) == 5

    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    hns = np.unique(aln.index.holeNumber)[:1]
    hns = set(hns)
    aln.filters.addRequirement(zm=[('in', hns)])
    assert len(list(aln)) == 5

    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    qnames = [r.qName for r in aln[:10]]
    aln.filters.addRequirement(qname=[('in', qnames)])
    assert len(list(aln)) == 10

    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    qnames = [r.qName for r in aln[:1]]
    aln.filters.addRequirement(qname=[('in', qnames)])
    assert len(list(aln)) == 1

    # test partial qnames:
    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    qnames = ['/'.join(r.qName.split('/')[:2]) for r in aln[:1]]
    assert qnames == ['pbalchemy1GbRSIIsim0/6']
    aln.filters.addRequirement(qname=[('in', qnames)])
    assert len(list(aln)) == 7

    fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
    aln = AlignmentSet(data.getXml(11))
    assert len(list(aln)) == 177
    hns = np.unique(aln.index.holeNumber)[:1]
    aln.filters.addRequirement(zm=[('in', hns)])
    aln.write(fn)
    aln.close()
    aln2 = AlignmentSet(fn)
    assert len(list(aln2)) == 5
class KineticsToolsRunner(object): def __init__(self, args): self.args = args self.alignments = None def start(self): self.validateArgs() return self.run() def getVersion(self): return __version__ def validateArgs(self): parser = get_parser() if not os.path.exists(self.args.alignment_set): parser.error('Input AlignmentSet file provided does not exist') # Over-ride --identify if --control was specified if self.args.control: self.args.identify = "" if self.args.useLDA: if self.args.m5Cclassifier is None: parser.error( 'Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.' ) if self.args.m5Cgff: if not self.args.useLDA: parser.error( 'm5Cgff file can only be generated in --useLDA mode.') # if self.args.methylFraction and not self.args.identify: # parser.error('Currently, --methylFraction only works when the --identify option is specified.') def run(self): # Figure out what modifications to identify mods = self.args.identify modsToCall = [] if mods: items = mods.split(",") if 'm6A' in items: modsToCall.append('H') if 'm4C' in items: modsToCall.append('J') if 'm5C_TET' in items: modsToCall.append('K') self.args.identify = True self.args.modsToCall = modsToCall self.options = self.args self.options.cmdLine = " ".join(sys.argv) self._workers = [] # set random seed # XXX note that this is *not* guaranteed to yield reproducible results # indepenently of the number of processing cores used! if self.options.randomSeed is not None: np.random.seed(self.options.randomSeed) if self.args.doProfiling: cProfile.runctx("self._mainLoop()", globals=globals(), locals=locals(), filename="profile.out") else: try: ret = self._mainLoop() finally: # Be sure to shutdown child processes if we get an exception on # the main thread for w in self._workers: if w.is_alive(): w.terminate() return ret def _initQueues(self): # Work chunks are created by the main thread and put on this queue # They will be consumed by KineticWorker threads, stored in # self._workers self._workQueue = multiprocessing.JoinableQueue( self.options.maxQueueSize) # Completed chunks are put on this queue by KineticWorker threads # They are consumed by the KineticsWriter process self._resultsQueue = multiprocessing.JoinableQueue( self.options.maxQueueSize) def _launchSlaveProcesses(self): """ Launch a group of worker processes (self._workers), the queue (self._workQueue) that will be used to send them chunks of work, and the queue that will be used to receive back the results (self._resultsQueue). Additionally, launch the result collector process. """ availableCpus = multiprocessing.cpu_count() logging.info("Available CPUs: %d" % (availableCpus, )) logging.info("Requested worker processes: %d" % (self.options.numWorkers, )) # Use all CPUs if numWorkers < 1 if self.options.numWorkers < 1: self.options.numWorkers = availableCpus # Warn if we make a bad numWorker argument is used if self.options.numWorkers > availableCpus: logging.warn( "More worker processes requested (%d) than CPUs available (%d);" " may result in suboptimal performance." 
% (self.options.numWorkers, availableCpus)) self._initQueues() # Launch the worker processes self._workers = [] for i in range(self.options.numWorkers): p = KineticWorkerProcess(self.options, self._workQueue, self._resultsQueue, self.ipdModel, sharedAlignmentSet=self.alignments) self._workers.append(p) p.start() logging.info("Launched worker processes.") # Launch result collector self._resultCollectorProcess = KineticsWriter(self.options, self._resultsQueue, self.refInfo, self.ipdModel) self._resultCollectorProcess.start() logging.info("Launched result collector process.") # Spawn a thread that monitors worker threads for crashes self.monitoringThread = threading.Thread( target=monitorChildProcesses, args=(self._workers + [self._resultCollectorProcess], )) self.monitoringThread.start() def _queueChunksForWindow(self, refWindow): """ Compute the chunk extents and queue up the work for a single reference """ winId = refWindow.refId winStart = refWindow.start winEnd = refWindow.end pass def loadReferenceAndModel(self, referencePath, ipdModelFilename): assert self.alignments is not None and self.referenceWindows is not None # Load the reference contigs - annotated with their refID from the # alignments logging.info("Loading reference contigs {!r}".format(referencePath)) contigs = ReferenceUtils.loadReferenceContigs( referencePath, alignmentSet=self.alignments, windows=self.referenceWindows) self.ipdModel = IpdModel(contigs, ipdModelFilename, self.args.modelIters) def loadSharedAlignmentSet(self, alignmentFilename): """ Read the input AlignmentSet so the indices can be shared with the slaves. This is also used to pass to ReferenceUtils for setting up the ipdModel object. """ logging.info("Reading AlignmentSet: %s" % alignmentFilename) logging.info(" reference: %s" % self.args.reference) self.alignments = AlignmentSet(alignmentFilename, referenceFastaFname=self.args.reference) # XXX this should ensure that the file(s) get opened, including any # .pbi indices - but need to confirm this self.refInfo = self.alignments.referenceInfoTable def _mainLoop(self): """ Main loop First launch the worker and writer processes Then we loop over ReferenceGroups in the alignments. For each contig we will: 1. Load the sequence into the main memory of the parent process 3. Chunk up the contig and submit the chunk descriptions to the work queue Finally, wait for the writer process to finish. """ # This looks scary but it's not. Python uses reference # counting and has a secondary, optional garbage collector for # collecting garbage cycles. Unfortunately when a cyclic GC # happens when a thread is calling cPickle.dumps, the # interpreter crashes sometimes. See Bug 19704. Since we # don't leak garbage cycles, disabling the cyclic GC is # essentially harmless. # gc.disable() self.loadSharedAlignmentSet(self.args.alignment_set) # Resolve the windows that will be visited. 
if self.args.referenceWindowsAsString is not None: self.referenceWindows = [] for s in self.args.referenceWindowsAsString.split(","): try: win = ReferenceUtils.parseReferenceWindow( s, self.alignments.referenceInfo) self.referenceWindows.append(win) except BaseException: if self.args.skipUnrecognizedContigs: continue else: raise Exception("Unrecognized contig!") elif self.args.referenceWindowsFromAlignment: self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment( self.alignments, self.alignments.referenceInfo) refNames = set([rw.refName for rw in self.referenceWindows]) # limit output to contigs that overlap with reference windows self.refInfo = [r for r in self.refInfo if r.Name in refNames] else: self.referenceWindows = ReferenceUtils.createReferenceWindows( self.refInfo) # Load reference and IpdModel chemName = ReferenceUtils.loadAlignmentChemistry(self.alignments) if self.args.useChemistry is not None: chemName = self.args.useChemistry ipdModelFilename = loader.getIpdModelFilename( ipdModel=self.args.ipdModel, majorityChem=chemName, paramsPath=self.args.paramsPath) self.loadReferenceAndModel(self.args.reference, ipdModelFilename) # Spawn workers self._launchSlaveProcesses() logging.info('Generating kinetics summary for [%s]' % self.args.alignment_set) #self.referenceMap = self.alignments['/RefGroup'].asDict('RefInfoID', 'ID') #self.alnInfo = self.alignments['/AlnInfo'].asRecArray() # Main loop -- we loop over ReferenceGroups in the alignments. For each contig we will: # 1. Load the sequence into the main memory of the parent process # 2. Fork the workers # 3. chunk up the contig and self.workChunkCounter = 0 # Iterate over references for window in self.referenceWindows: logging.info('Processing window/contig: %s' % (window, )) for chunk in ReferenceUtils.enumerateChunks( self.args.referenceStride, window): self._workQueue.put((self.workChunkCounter, chunk)) self.workChunkCounter += 1 # Shutdown worker threads with None sentinels for i in range(self.args.numWorkers): self._workQueue.put(None) for w in self._workers: w.join() # Join on the result queue and the resultsCollector process. # This ensures all the results are written before shutdown. self.monitoringThread.join() self._resultsQueue.join() self._resultCollectorProcess.join() logging.info("ipdSummary.py finished. Exiting.") self.alignments.close() return 0
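
# Illustrative sketch: monitorChildProcesses (used above via
# self.monitoringThread) is not shown in this excerpt.  A hypothetical
# minimal version would poll the child processes and tear everything down
# if any of them exits abnormally:
def _monitor_children_sketch(children, poll_interval=1.0):
    """Watch child processes; terminate the rest if one exits abnormally."""
    import logging
    import time

    while True:
        if all(not child.is_alive() for child in children):
            return                       # normal shutdown: everyone finished
        for child in children:
            if not child.is_alive() and child.exitcode not in (0, None):
                logging.error("Child %s exited with code %s; aborting",
                              child.name, child.exitcode)
                for other in children:
                    if other.is_alive():
                        other.terminate()
                return
        time.sleep(poll_interval)
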
class ToolRunner(object): """ The main driver class for the GenomicConsensus tool. It is assumed that arguments have already been parsed and used to populate the global 'options' namespace before instantiating this class. """ def __init__(self): self._inAlnFile = None self._resultsQueue = None self._workQueue = None self._slaves = None self._algorithm = None self._algorithmConfiguration = None self._aborting = False def _makeTemporaryDirectory(self): """ Make a temp dir where we can stash things if necessary. """ options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp") logging.info("Created temporary directory %s" % (options.temporaryDirectory,) ) def _algorithmByName(self, name, peekFile): if name == "plurality": from GenomicConsensus.plurality import plurality algo = plurality elif name == "quiver": from GenomicConsensus.quiver import quiver algo = quiver elif name == "arrow": from GenomicConsensus.arrow import arrow algo = arrow elif name == "poa": from GenomicConsensus.poa import poa algo = poa elif name == "best": logging.info("Identifying best algorithm based on input data") from GenomicConsensus import algorithmSelection algoName = algorithmSelection.bestAlgorithm(peekFile.sequencingChemistry) return self._algorithmByName(algoName, peekFile) else: die("Failure: unrecognized algorithm %s" % name) isOK, msg = algo.availability if not isOK: die("Failure: %s" % msg) logging.info("Will use {a} algorithm".format(a=name)) return algo def _launchSlaves(self): """ Launch a group of worker processes (self._slaves), the queue (self._workQueue) that will be used to send them chunks of work, and the queue that will be used to receive back the results (self._resultsQueue). Additionally, launch the result collector process. """ availableCpus = multiprocessing.cpu_count() logging.info("Available CPUs: %d" % (availableCpus,)) logging.info("Requested workers: %d" % (options.numWorkers,)) logging.info("Parallel Mode: %s" % ("Threaded" if options.threaded else "Process",)) if (options.numWorkers > availableCpus): logging.warn("More workers requested (%d) than CPUs available (%d);" " may result in suboptimal performance." % (options.numWorkers, availableCpus)) self._initQueues() WorkerType, ResultCollectorType = self._algorithm.slaveFactories(options.threaded) self._slaves = [] for i in xrange(options.numWorkers): p = WorkerType(self._workQueue, self._resultsQueue, self._algorithmConfiguration) self._slaves.append(p) p.start() logging.info("Launched compute slaves.") rcp = ResultCollectorType(self._resultsQueue, self._algorithm.name, self._algorithmConfiguration) rcp.start() self._slaves.append(rcp) logging.info("Launched collector slave.") def _initQueues(self): if options.threaded: self._workQueue = Queue.Queue(options.queueSize) self._resultsQueue = Queue.Queue(options.queueSize) else: self._workQueue = multiprocessing.Queue(options.queueSize) self._resultsQueue = multiprocessing.Queue(options.queueSize) def _readAlignmentInput(self): """ Read the AlignmentSet input file and store it as self._inAlnFile. """ fname = options.inputFilename self._inAlnFile = AlignmentSet(fname) def _loadReference(self, alnFile): logging.info("Loading reference") reference.loadFromFile(options.referenceFilename, alnFile) # Grok the referenceWindow spec, if any. if options.referenceWindowsAsString is None: options.referenceWindows = () elif options.skipUnrecognizedContigs: # This is a workaround for smrtpipe scatter/gather. 
options.referenceWindows = [] for s in options.referenceWindowsAsString.split(","): try: win = reference.stringToWindow(s) options.referenceWindows.append(win) except Exception: msg = traceback.format_exc() logging.debug(msg) pass else: options.referenceWindows = map(reference.stringToWindow, options.referenceWindowsAsString.split(",")) if options.referenceWindowsFromAlignment: options.referenceWindows = alnFile.refWindows def _checkFileCompatibility(self, alnFile): if not alnFile.isSorted: die("Input Alignment file must be sorted.") if alnFile.isCmpH5 and alnFile.isEmpty: die("Input Alignment file must be nonempty.") def _shouldDisableChunkCache(self, alnFile): #if isinstance(alnFile, CmpH5Reader): #if alnFile.isCmpH5: # threshold = options.autoDisableHdf5ChunkCache # return datasetCountExceedsThreshold(alnFile, threshold) #else: # return False return True def _configureAlgorithm(self, options, alnFile): assert self._algorithm != None try: self._algorithmConfiguration = self._algorithm.configure(options, alnFile) except IncompatibleDataException as e: die("Failure: %s" % e.message) def _mainLoop(self): # Split up reference genome into chunks and farm out the # a chunk as a unit of work. logging.debug("Starting main loop.") ids = reference.enumerateIds(options.referenceWindows) for _id in ids: if options.fancyChunking: chunks = reference.fancyEnumerateChunks(self._inAlnFile, _id, options.referenceChunkSize, options.minCoverage, options.minMapQV, options.referenceWindows) else: chunks = reference.enumerateChunks(_id, options.referenceChunkSize, options.referenceWindows) for chunk in chunks: if self._aborting: return self._workQueue.put(chunk) # Write sentinels ("end-of-work-stream") for i in xrange(options.numWorkers): self._workQueue.put(None) def _printProfiles(self): for profile in glob.glob(os.path.join(options.temporaryDirectory, "*")): pstats.Stats(profile).sort_stats("time").print_stats(20) def _cleanup(self): if options.doProfiling: logging.info("Removing %s" % options.temporaryDirectory) shutil.rmtree(options.temporaryDirectory, ignore_errors=True) @property def aborting(self): return self._aborting def abortWork(self, why): """ Performs a shutdown of all the slave processes. Called by the monitoring thread when a child process exits with a non-zero, or when a keyboard interrupt (Ctrl-C) is given. Not called during normal shutdown. """ logging.error(why) self._aborting = True self._resultsQueue.close() self._workQueue.close() @property def slaves(self): return self._slaves def main(self): # This looks scary but it's not. Python uses reference # counting and has a secondary, optional garbage collector for # collecting garbage cycles. Unfortunately when a cyclic GC # happens when a thread is calling cPickle.dumps, the # interpreter crashes sometimes. See Bug 19704. Since we # don't leak garbage cycles, disabling the cyclic GC is # essentially harmless. 
gc.disable() random.seed(42) if options.pdb or options.pdbAtStartup: print("Process ID: %d" % os.getpid(), file=sys.stderr) try: import ipdb except ImportError: die("Debugging options require 'ipdb' package installed.") if not options.threaded: die("Debugging only works with -T (threaded) mode") if options.pdbAtStartup: ipdb.set_trace() logging.info("ConsensusCore version: %s" % (consensusCoreVersion() or "ConsensusCore unavailable")) logging.info("ConsensusCore2 version: %s" % (consensusCore2Version() or "ConsensusCore2 unavailable")) logging.info("Starting.") atexit.register(self._cleanup) if options.doProfiling: self._makeTemporaryDirectory() with AlignmentSet(options.inputFilename) as peekFile: if options.algorithm == "arrow" and peekFile.isCmpH5: die("Arrow does not support CmpH5 files") if not peekFile.isCmpH5 and not peekFile.hasPbi: die("Genomic Consensus only works with cmp.h5 files and BAM " "files with accompanying .pbi files") logging.info("Peeking at file %s" % options.inputFilename) logging.info("Input data: numAlnHits=%d" % len(peekFile)) resolveOptions(peekFile) self._loadReference(peekFile) self._checkFileCompatibility(peekFile) self._algorithm = self._algorithmByName(options.algorithm, peekFile) self._configureAlgorithm(options, peekFile) options.disableHdf5ChunkCache = True #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile) #if options.disableHdf5ChunkCache: # logging.info("Will disable HDF5 chunk cache (large number of datasets)") self._launchSlaves() self._readAlignmentInput() monitoringThread = threading.Thread(target=monitorSlaves, args=(self,)) monitoringThread.start() try: if options.doProfiling: cProfile.runctx("self._mainLoop()", globals=globals(), locals=locals(), filename=os.path.join(options.temporaryDirectory, "profile-main.out")) elif options.pdb: with ipdb.launch_ipdb_on_exception(): self._mainLoop() else: self._mainLoop() except BaseException as exc: msg = 'options={}'.format(pprint.pformat(vars(options))) logging.exception(msg) self.abortWork(repr(exc)) monitoringThread.join() if self._aborting: logging.error("Aborting") return -1 else: logging.info("Finished.") if options.doProfiling: self._printProfiles() # close h5 file. self._inAlnFile.close() return 0
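
# Illustrative sketch: _algorithmByName above dispatches on a name, imports
# the matching algorithm module lazily, checks its `availability` tuple, and
# for "best" recurses after asking algorithmSelection for a concrete name.
# A simplified, generic version of that dispatch (hypothetical names, not
# the real GenomicConsensus modules):
def _algorithm_by_name_sketch(name, chemistry, registry, pick_best):
    """Resolve an algorithm name to an implementation object.

    `registry` maps names to zero-argument loader callables; `pick_best`
    maps a sequencing-chemistry string to a concrete algorithm name.
    """
    if name == "best":
        return _algorithm_by_name_sketch(pick_best(chemistry), chemistry,
                                         registry, pick_best)
    try:
        algo = registry[name]()
    except KeyError:
        raise ValueError("unrecognized algorithm %s" % name)
    is_ok, msg = algo.availability
    if not is_ok:
        raise RuntimeError("algorithm %s unavailable: %s" % (name, msg))
    return algo
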