Ejemplo n.º 1
0
    def test_loadmetadata_from_dataset_create_cli(self):
        """Check that ``dataset create --metadata`` restores collections.

        A copy of the dataset returned by ``data.getXml(8)`` is written out
        with its collection metadata cleared, then the ``dataset create``
        command line is run with ``--metadata`` pointing at a hard-coded
        subreadset path.  The dataset created by the command must carry
        collection metadata again.
        """
        # Only the generated .name values are kept; the NamedTemporaryFile
        # objects themselves are discarded straight away.
        stripped_path = tempfile.NamedTemporaryFile(
            suffix=".alignmentset.xml").name
        created_path = tempfile.NamedTemporaryFile(
            suffix=".alignmentset.xml").name
        log.debug(stripped_path)

        # Write a copy of the input without any collection metadata.
        dset = AlignmentSet(data.getXml(8))
        dset.metadata.collections = None
        dset.copyTo(stripped_path)
        dset.close()
        del dset
        self.assertTrue(os.path.exists(stripped_path))

        # Sanity check: the copy on disk really has no collections.
        dset = AlignmentSet(stripped_path)
        self.assertFalse(dset.metadata.collections)

        metadata_source = ("/pbi/dept/secondary/siv/testdata/"
                           "SA3-Sequel/lambda/roche_SAT/"
                           "m54013_151205_032353.subreadset.xml")
        cmd = "dataset create --metadata {m} {o} {i}".format(
            m=metadata_source,
            o=created_path,
            i=stripped_path)
        log.debug(cmd)
        _output, status, message = backticks(cmd)
        self.assertEqual(status, 0, message)

        # The newly created dataset must have picked up the collections.
        dset = AlignmentSet(created_path)
        self.assertTrue(dset.metadata.collections)
Ejemplo n.º 2
0
    def test_loadmetadata_from_dataset_create_cli(self):
        """Check that ``dataset create --metadata`` restores collections.

        A copy of the dataset returned by ``data.getXml(8)`` is written out
        with its collection metadata cleared, then the ``dataset create``
        command line is run with ``--metadata`` pointing at a hard-coded
        subreadset path.  The dataset created by the command must carry
        collection metadata again.
        """
        # Only the generated .name values are kept; the NamedTemporaryFile
        # objects themselves are discarded straight away.
        stripped_path = tempfile.NamedTemporaryFile(
            suffix=".alignmentset.xml").name
        created_path = tempfile.NamedTemporaryFile(
            suffix=".alignmentset.xml").name
        log.debug(stripped_path)

        # Write a copy of the input without any collection metadata.
        dset = AlignmentSet(data.getXml(8))
        dset.metadata.collections = None
        dset.copyTo(stripped_path)
        dset.close()
        del dset
        self.assertTrue(os.path.exists(stripped_path))

        # Sanity check: the copy on disk really has no collections.
        dset = AlignmentSet(stripped_path)
        self.assertFalse(dset.metadata.collections)

        metadata_source = ("/pbi/dept/secondary/siv/testdata/"
                           "SA3-Sequel/lambda/roche_SAT/"
                           "m54013_151205_032353.subreadset.xml")
        cmd = "dataset create --metadata {m} {o} {i}".format(
            m=metadata_source,
            o=created_path,
            i=stripped_path)
        log.debug(cmd)
        _output, status, message = backticks(cmd)
        self.assertEqual(status, 0, message)

        # The newly created dataset must have picked up the collections.
        dset = AlignmentSet(created_path)
        self.assertTrue(dset.metadata.collections)
Ejemplo n.º 3
0
    def test_membership_filter_with_equal_operator(self):
        """Exercise membership filters spelled with '=' and '=='.

        Every scenario opens ``data.getXml(12)`` afresh, checks that the
        unfiltered dataset yields 177 records, adds one requirement and
        checks the number of records that remain.
        """
        # (operator, builder turning the unique hole numbers into the
        #  membership collection, expected number of surviving records)
        zm_cases = [
            ('=', lambda holes: holes[:1], 5),
            ('==', lambda holes: holes, 177),
            # the same hole numbers repeated 10000 times
            ('==',
             lambda holes: np.array(
                 [n for _ in range(10000) for n in holes]),
             177),
            ('==', lambda holes: list(holes[:1]), 5),
            ('==', lambda holes: set(holes[:1]), 5),
        ]
        for op, build_members, expected in zm_cases:
            dset = AlignmentSet(data.getXml(12))
            self.assertEqual(len(list(dset)), 177)
            members = build_members(np.unique(dset.index.holeNumber))
            dset.filters.addRequirement(zm=[(op, members)])
            self.assertEqual(len(list(dset)), expected)

        # Filtering on the query names of the first N records keeps N records.
        for count in (10, 1):
            dset = AlignmentSet(data.getXml(12))
            self.assertEqual(len(list(dset)), 177)
            names = [rec.qName for rec in dset[:count]]
            dset.filters.addRequirement(qname=[('==', names)])
            self.assertEqual(len(list(dset)), count)

        # The filter must still apply after writing the dataset out and
        # reading it back in.
        out_path = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
        dset = AlignmentSet(data.getXml(12))
        self.assertEqual(len(list(dset)), 177)
        first_hole = np.unique(dset.index.holeNumber)[:1]
        dset.filters.addRequirement(zm=[('==', first_hole)])
        dset.write(out_path)
        dset.close()
        reloaded = AlignmentSet(out_path)
        self.assertEqual(len(list(reloaded)), 5)
Ejemplo n.º 4
0
    def test_membership_filter(self):
        """Exercise membership filters spelled with the 'in' operator.

        Every scenario opens ``data.getXml(12)`` afresh, checks that the
        unfiltered dataset yields 177 records, adds one requirement and
        checks the number of records that remain.
        """
        # (builder turning the unique hole numbers into the membership
        #  collection, expected number of surviving records)
        zm_cases = [
            (lambda holes: holes[:1], 5),
            (lambda holes: holes, 177),
            # the same hole numbers repeated 10000 times
            (lambda holes: np.array(
                [n for _ in range(10000) for n in holes]),
             177),
            (lambda holes: list(holes[:1]), 5),
            (lambda holes: set(holes[:1]), 5),
        ]
        for build_members, expected in zm_cases:
            dset = AlignmentSet(data.getXml(12))
            self.assertEqual(len(list(dset)), 177)
            members = build_members(np.unique(dset.index.holeNumber))
            dset.filters.addRequirement(zm=[('in', members)])
            self.assertEqual(len(list(dset)), expected)

        # Filtering on the query names of the first ten records keeps ten.
        dset = AlignmentSet(data.getXml(12))
        self.assertEqual(len(list(dset)), 177)
        names = [rec.qName for rec in dset[:10]]
        dset.filters.addRequirement(qname=[('in', names)])
        self.assertEqual(len(list(dset)), 10)

        # The filter must still apply after writing the dataset out and
        # reading it back in.
        out_path = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
        dset = AlignmentSet(data.getXml(12))
        self.assertEqual(len(list(dset)), 177)
        first_hole = np.unique(dset.index.holeNumber)[:1]
        dset.filters.addRequirement(zm=[('in', first_hole)])
        dset.write(out_path)
        dset.close()
        reloaded = AlignmentSet(out_path)
        self.assertEqual(len(list(reloaded)), 5)
Ejemplo n.º 5
0
    def test_loadmetadata_from_dataset_cli(self):
        """Check that ``dataset loadmetadata`` restores collections in place.

        A copy of the dataset returned by ``data.getXml(7)`` is written out
        with its collection metadata cleared; the ``dataset loadmetadata``
        command line is then run against that copy with a hard-coded
        subreadset path, after which the copy must carry collection metadata.
        """
        # Only the generated .name is kept; the NamedTemporaryFile object
        # itself is discarded straight away.
        target_path = tempfile.NamedTemporaryFile(
            suffix=".alignmentset.xml").name
        log.debug(target_path)

        # Write a copy of the input without any collection metadata.
        dset = AlignmentSet(data.getXml(7))
        dset.metadata.collections = None
        dset.copyTo(target_path)
        dset.close()
        del dset
        assert os.path.exists(target_path)

        # Sanity check: the copy on disk really has no collections.
        dset = AlignmentSet(target_path)
        assert not dset.metadata.collections

        metadata_source = ("/pbi/dept/secondary/siv/testdata/"
                           "SA3-Sequel/lambda/roche_SAT/"
                           "m54013_151205_032353.subreadset.xml")
        cmd = "dataset loadmetadata {i} {m}".format(
            i=target_path,
            m=metadata_source)
        self._check_cmd(cmd)

        # Re-open the same file: the command is expected to have updated it.
        dset = AlignmentSet(target_path)
        assert dset.metadata.collections
Ejemplo n.º 6
0
class ToolRunner(object):
    """
    The main driver class for the GenomicConsensus tool.

    Configuration is read from (and in places written back to) the
    module-level ``options`` namespace.  ``main`` is the entry point: it
    parses the options, peeks at the input, launches the workers and the
    result collector, feeds reference chunks to the work queue and returns
    an integer status (0 on success, -1 when the run was aborted).
    """
    def __init__(self):
        # Alignment input opened by _readCmpH5Input and closed at the end of
        # a successful main().
        self._inCmpH5 = None
        # Queues created by _initQueues (thread or process flavour).
        self._resultsQueue = None
        self._workQueue = None
        # Workers plus the result collector, filled in by _launchSlaves.
        self._slaves = None
        # Algorithm selected by name and its configuration object.
        self._algorithm = None
        self._algorithmConfiguration = None
        # Set by abortWork; checked by _mainLoop and main.
        self._aborting = False

    def _setupLogging(self):
        """
        Configure the root logger from options.quiet / options.verbosity.
        """
        if options.quiet:
            logLevel = logging.ERROR
        elif options.verbosity >= 2:
            logLevel = logging.DEBUG
        elif options.verbosity == 1:
            logLevel = logging.INFO
        else:
            logLevel = logging.WARNING
        logFormat = '[%(levelname)s] %(message)s'
        logging.basicConfig(level=logLevel, format=logFormat)

    def _makeTemporaryDirectory(self):
        """
        Make a temp dir where we can stash things if necessary.

        The path is stored in options.temporaryDirectory.
        """
        options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp")
        logging.info("Created temporary directory %s" % (options.temporaryDirectory,) )

    def _algorithmByName(self, name):
        """
        Return the algorithm registered under `name` ("plurality" or
        "quiver").

        die() is called for an unknown name or when the algorithm's
        `availability` pair reports that it cannot be used.
        NOTE(review): this relies on die() not returning -- confirm.
        """
        if name=="plurality":
            algo = plurality
        elif name=="quiver":
            algo = quiver
        else:
            die("Failure: unrecognized algorithm %s" % name)
        isOK, msg = algo.availability
        if not isOK:
            die("Failure: %s" % msg)
        return algo

    def _launchSlaves(self):
        """
        Launch a group of worker processes (self._slaves), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus,))
        logging.info("Requested workers: %d" % (options.numWorkers,))
        logging.info("Parallel Mode: %s" % ("Threaded" if options.threaded else "Process",))
        if (options.numWorkers > availableCpus):
            # logging.warning is the documented spelling of the deprecated
            # logging.warn alias.
            logging.warning("More workers requested (%d) than CPUs available (%d);"
                            " may result in suboptimal performance."
                            % (options.numWorkers, availableCpus))
        self._initQueues()

        WorkerType, ResultCollectorType = self._algorithm.slaveFactories(options.threaded)
        self._slaves = []
        for i in xrange(options.numWorkers):
            p = WorkerType(self._workQueue, self._resultsQueue, self._algorithmConfiguration)
            self._slaves.append(p)
            p.start()
        logging.info("Launched compute slaves.")

        # The collector is tracked in the same list as the workers.
        rcp = ResultCollectorType(self._resultsQueue, self._algorithmConfiguration)
        rcp.start()
        self._slaves.append(rcp)
        logging.info("Launched collector slave.")

    def _initQueues(self):
        """
        Create the work and results queues, bounded by options.queueSize.

        Thread queues are used when options.threaded is set, multiprocessing
        queues otherwise.
        """
        if options.threaded:
            self._workQueue = Queue.Queue(options.queueSize)
            self._resultsQueue = Queue.Queue(options.queueSize)
        else:
            self._workQueue = multiprocessing.Queue(options.queueSize)
            self._resultsQueue = multiprocessing.Queue(options.queueSize)

    def _readCmpH5Input(self):
        """
        Open options.inputFilename as an AlignmentSet and
        store it as self._inCmpH5.
        """
        fname = options.inputFilename
        self._inCmpH5 = AlignmentSet(fname)

    def _loadReference(self, cmpH5):
        """
        Load the reference and resolve options.referenceWindows.

        The windows come from options.referenceWindowsAsString (a
        comma-separated list) unless options.referenceWindowsFromAlignment
        is set, in which case cmpH5.refWindows wins.
        """
        logging.info("Loading reference")
        err = reference.loadFromFile(options.referenceFilename, cmpH5)
        if err:
            die("Error loading reference")
        # Grok the referenceWindow spec, if any.
        if options.referenceWindowsAsString is None:
            options.referenceWindows = ()
        elif options.skipUnrecognizedContigs:
            # This is a workaround for smrtpipe scatter/gather.
            options.referenceWindows = []
            for s in options.referenceWindowsAsString.split(","):
                try:
                    win = reference.stringToWindow(s)
                except Exception:
                    # Deliberate best effort: windows that cannot be parsed
                    # are skipped.  Only ordinary exceptions are swallowed
                    # so Ctrl-C / SystemExit still propagate.
                    logging.debug("Skipping unrecognized reference window %s" % s)
                    continue
                options.referenceWindows.append(win)
        else:
            # Build a real list so the windows can be iterated repeatedly.
            options.referenceWindows = [
                reference.stringToWindow(s)
                for s in options.referenceWindowsAsString.split(",")]
        if options.referenceWindowsFromAlignment:
            options.referenceWindows = cmpH5.refWindows

    def _checkFileCompatibility(self, cmpH5):
        """
        Call die() unless the input is sorted and nonempty.
        """
        if not cmpH5.isSorted:
            die("Input CmpH5 file must be sorted.")
        if cmpH5.isEmpty:
            die("Input CmpH5 file must be nonempty.")

    def _shouldDisableChunkCache(self, cmpH5):
        """
        Always returns True; the threshold-based check below is disabled.
        """
        #if isinstance(cmpH5, CmpH5Reader):
        #if cmpH5.isCmpH5:
        #    threshold = options.autoDisableHdf5ChunkCache
        #    return datasetCountExceedsThreshold(cmpH5, threshold)
        #else:
        #    return False
        return True

    def _configureAlgorithm(self, options, cmpH5):
        """
        Store the selected algorithm's configuration for `options` and
        `cmpH5`; die() if the algorithm reports incompatible data.
        """
        assert self._algorithm is not None
        try:
            self._algorithmConfiguration = self._algorithm.configure(options, cmpH5)
        except IncompatibleDataException as e:
            die("Failure: %s" % e.message)

    def _mainLoop(self):
        """
        Enqueue every reference chunk, then one None sentinel per worker.

        Returns early, without writing the sentinels, once an abort has
        been requested.
        """
        # Split up reference genome into chunks and farm out the
        # a chunk as a unit of work.
        logging.debug("Starting main loop.")
        ids = reference.enumerateIds(options.referenceWindows)
        for _id in ids:
            if options.fancyChunking:
                chunks = reference.fancyEnumerateChunks(self._inCmpH5,
                                                        _id,
                                                        options.referenceChunkSize,
                                                        options.minCoverage,
                                                        options.minMapQV,
                                                        options.referenceWindows)
            else:
                chunks = reference.enumerateChunks(_id,
                                                   options.referenceChunkSize,
                                                   options.referenceWindows)
            for chunk in chunks:
                if self._aborting: return
                self._workQueue.put(chunk)

        # Write sentinels ("end-of-work-stream")
        for i in xrange(options.numWorkers):
            self._workQueue.put(None)

    def _printProfiles(self):
        """
        Print the top 20 entries, sorted by time, of every profile found
        in options.temporaryDirectory.
        """
        for profile in glob.glob(os.path.join(options.temporaryDirectory, "*")):
            pstats.Stats(profile).sort_stats("time").print_stats(20)

    def _cleanup(self):
        """
        Remove the temporary directory; it only exists when profiling.
        Registered with atexit by main().
        """
        if options.doProfiling:
            logging.info("Removing %s" % options.temporaryDirectory)
            shutil.rmtree(options.temporaryDirectory, ignore_errors=True)

    def _setupEvidenceDumpDirectory(self, directoryName):
        """
        (Re)create `directoryName` empty, deleting any previous contents.
        """
        if os.path.exists(directoryName):
            shutil.rmtree(directoryName)
        os.makedirs(directoryName)

    @property
    def aborting(self):
        """True once abortWork has been called."""
        return self._aborting

    def abortWork(self, why):
        """
        Performs a shutdown of all the slave processes.  Called by the
        monitoring thread when a child process exits with a non-zero,
        or when a keyboard interrupt (Ctrl-C) is given. Not called
        during normal shutdown.

        `why` is logged as an error.  Queues are closed only when they
        offer a close() method: the thread queues used in threaded mode
        do not, and the queues may still be None if the abort happens
        before _initQueues ran.
        """
        logging.error(why)
        self._aborting = True
        for q in (self._resultsQueue, self._workQueue):
            closeQueue = getattr(q, "close", None)
            if closeQueue is not None:
                closeQueue()

    @property
    def slaves(self):
        """Workers plus result collector, or None before _launchSlaves."""
        return self._slaves

    def main(self):
        """
        Run the whole tool; returns 0 on success and -1 after an abort.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        parseOptions()
        self._algorithm = self._algorithmByName(options.algorithm)
        self._setupLogging()
        random.seed(42)

        logging.info("h5py version: %s" % h5py.version.version)
        logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
        logging.info("ConsensusCore version: %s" %
                     (consensusCoreVersion() or "ConsensusCore unavailable"))
        logging.info("Starting.")

        atexit.register(self._cleanup)
        if options.doProfiling:
            self._makeTemporaryDirectory()

        # Peek at the input once to resolve options, load the reference and
        # configure the algorithm before any worker is started.
        with AlignmentSet(options.inputFilename) as peekFile:
            if not peekFile.isCmpH5 and not peekFile.hasPbi:
                logging.warning("'fancyChunking' not yet available for BAM "
                                "files without accompanying .pbi files, "
                                "disabling")
                options.fancyChunking = False
            logging.info("Peeking at file %s" % options.inputFilename)
            logging.info("Input data: numAlnHits=%d" % len(peekFile))
            resolveOptions(peekFile)
            self._loadReference(peekFile)
            self._checkFileCompatibility(peekFile)
            self._configureAlgorithm(options, peekFile)
            options.disableHdf5ChunkCache = True
            #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile)
            #if options.disableHdf5ChunkCache:
            #    logging.info("Will disable HDF5 chunk cache (large number of datasets)")
            #logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())

        if options.dumpEvidence:
            self._setupEvidenceDumpDirectory(options.evidenceDirectory)

        self._launchSlaves()
        self._readCmpH5Input()

        monitoringThread = threading.Thread(target=monitorSlaves, args=(self,))
        monitoringThread.start()

        try:
            if options.doProfiling:
                cProfile.runctx("self._mainLoop()",
                                globals=globals(),
                                locals=locals(),
                                filename=os.path.join(options.temporaryDirectory,
                                                      "profile-main.out"))

            elif options.doDebugging:
                if not options.threaded:
                    die("Debugging only works with -T (threaded) mode")
                logging.info("PID: %d", os.getpid())
                import ipdb
                with ipdb.launch_ipdb_on_exception():
                    self._mainLoop()

            else:
                self._mainLoop()
        except:
            # Intentionally bare: a keyboard interrupt must also end up in
            # abortWork (see its docstring).
            why = traceback.format_exc()
            self.abortWork(why)

        monitoringThread.join()

        if self._aborting:
            logging.error("Aborting")
            return -1
        else:
            logging.info("Finished.")

        if options.doProfiling:
            self._printProfiles()

        # close h5 file.
        self._inCmpH5.close()
        return 0
Ejemplo n.º 7
0
class KineticsToolsRunner(object):
    """
    Driver for the kinetics summary run.

    `args` is the parsed command-line namespace.  ``start`` validates it
    and then runs the analysis: the alignment set and reference are loaded,
    worker and writer children are launched, and one work item per
    reference chunk is queued.
    """
    def __init__(self, args):
        self.args = args
        # Set by loadSharedAlignmentSet.
        self.alignments = None
        # Resolved at the top of _mainLoop; loadReferenceAndModel asserts it
        # is no longer None.
        self.referenceWindows = None

    def start(self):
        """Validate the arguments, then return the result of run()."""
        self.validateArgs()
        return self.run()

    def getVersion(self):
        """Return the module-level __version__."""
        return __version__

    def validateArgs(self):
        """
        Report inconsistent argument combinations through parser.error().
        """
        parser = get_parser()
        if not os.path.exists(self.args.alignment_set):
            parser.error('Input AlignmentSet file provided does not exist')

        if self.args.identify and self.args.control:
            parser.error('--control and --identify are mutally exclusive. Please choose one or the other')

        if self.args.useLDA:
            if self.args.m5Cclassifier is None:
                parser.error('Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.')

        if self.args.m5Cgff:
            if not self.args.useLDA:
                parser.error('m5Cgff file can only be generated in --useLDA mode.')

        # if self.args.methylFraction and not self.args.identify:
        #    parser.error('Currently, --methylFraction only works when the --identify option is specified.')

    def run(self):
        """
        Translate --identify into modification codes, seed numpy's RNG if
        requested and execute _mainLoop (under cProfile when
        args.doProfiling is set).

        Returns whatever _mainLoop returned.  In process mode any child
        that is still alive afterwards is terminated, also when _mainLoop
        raised.
        """

        # Figure out what modifications to identify
        mods = self.args.identify
        modsToCall = []
        if mods:
            items = mods.split(",")

            if 'm6A' in items:
                modsToCall.append('H')

            if 'm4C' in items:
                modsToCall.append('J')

            if 'm5C_TET' in items:
                modsToCall.append('K')

            # From here on `identify` is a flag; the codes live in modsToCall.
            self.args.identify = True
            self.args.modsToCall = modsToCall

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # set random seed
        # XXX note that this is *not* guaranteed to yield reproducible results
        # indepenently of the number of processing cores used!
        if self.options.randomSeed is not None:
            np.random.seed(self.options.randomSeed)

        try:
            if self.args.doProfiling:
                # A private namespace lets the profiled statement hand the
                # status back.  "ret" may be missing when the call did not
                # complete normally (cProfile's runctx intercepts
                # SystemExit), hence .get().
                namespace = {"self": self}
                cProfile.runctx("ret = self._mainLoop()",
                                globals=globals(),
                                locals=namespace,
                                filename="profile.out")
                ret = namespace.get("ret")
            else:
                ret = self._mainLoop()
        finally:
            # Be sure to shutdown child processes if we get an exception on the main thread
            if not self.args.threaded:
                for w in self._workers:
                    if w.is_alive():
                        w.terminate()

        return ret

    def _initQueues(self):
        """
        Create the work and results queues, bounded by options.maxQueueSize.
        """
        if self.options.threaded:
            # Work chunks are created by the main thread and put on this queue
            # They will be consumed by KineticWorker threads, stored in self._workers
            self._workQueue = Queue.Queue(self.options.maxQueueSize)

            # Completed chunks are put on this queue by KineticWorker threads
            # They are consumed by the KineticsWriter process
            self._resultsQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)
        else:
            # Work chunks are created by the main thread and put on this queue
            # They will be consumed by KineticWorker threads, stored in self._workers
            self._workQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)

            # Completed chunks are put on this queue by KineticWorker threads
            # They are consumed by the KineticsWriter process
            self._resultsQueue = multiprocessing.JoinableQueue(self.options.maxQueueSize)

    def _launchSlaveProcesses(self):
        """
        Launch a group of worker processes (self._workers), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process and the thread
        that monitors all of these children.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus,))
        logging.info("Requested worker processes: %d" % (self.options.numWorkers,))

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Warn if more workers were requested than there are CPUs
        # (logging.warning is the documented spelling of logging.warn).
        if self.options.numWorkers > availableCpus:
            logging.warning("More worker processes requested (%d) than CPUs available (%d);"
                            " may result in suboptimal performance."
                            % (self.options.numWorkers, availableCpus))

        self._initQueues()

        if self.options.threaded:
            # Threaded mode always runs a single worker.
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Launch the worker processes
        self._workers = []
        for i in xrange(self.options.numWorkers):
            p = WorkerType(self.options, self._workQueue, self._resultsQueue,
                self.ipdModel,
                sharedAlignmentSet=self.alignments)
            self._workers.append(p)
            p.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options, self._resultsQueue, self.refInfo, self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors worker threads for crashes
        self.monitoringThread = threading.Thread(target=monitorChildProcesses, args=(self._workers + [self._resultCollectorProcess],))
        self.monitoringThread.start()

    def _queueChunksForWindow(self, refWindow):
        """
        Placeholder: reads the window's id and extents but currently
        queues nothing.
        """
        winId = refWindow.refId
        winStart = refWindow.start
        winEnd = refWindow.end
        pass

    def loadReferenceAndModel(self, referencePath):
        """
        Load the reference contigs from `referencePath` and build
        self.ipdModel.

        Requires self.alignments and self.referenceWindows to be set.
        Exits the process with status 1 when a requested model or
        parameter path is missing or the chemistry is unknown.
        """
        assert self.alignments is not None and self.referenceWindows is not None
        # Load the reference contigs - annotated with their refID from the cmp.h5
        logging.info("Loading reference contigs %s" % referencePath)
        contigs = ReferenceUtils.loadReferenceContigs(referencePath,
            alignmentSet=self.alignments, windows=self.referenceWindows)

        # There are three different ways the ipdModel can be loaded.
        # In order of precedence they are:
        # 1. Explicit path passed to --ipdModel
        # 2. Path to parameter bundle, model selected using the cmp.h5's sequencingChemistry tags
        # 3. Fall back to built-in model.

        # By default, use built-in model
        ipdModel = None

        if self.args.ipdModel:
            ipdModel = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % self.args.ipdModel)
            if not os.path.exists(self.args.ipdModel):
                logging.error("Couldn't find model file: %s" % self.args.ipdModel)
                sys.exit(1)
        elif self.args.paramsPath:
            if not os.path.exists(self.args.paramsPath):
                logging.error("Params path doesn't exist: %s" % self.args.paramsPath)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadAlignmentChemistry(self.alignments)

            # Temporary solution for Sequel chemistries: we do not
            # have trained kinetics models in hand yet for Sequel
            # chemistries.  However we have observed that the P5-C3
            # training seems to yield fairly good results on Sequel
            # chemistries to date.  So for the moment, we will use
            # that model for Sequel data.
            if majorityChem.startswith("S/"):
                logging.info("No trained model available yet for Sequel chemistries; modeling as P5-C3")
                majorityChem = "P5-C3"

            ipdModel = os.path.join(self.args.paramsPath, majorityChem + ".h5")
            if majorityChem == 'unknown':
                logging.error("Chemistry cannot be identified---cannot perform kinetic analysis")
                sys.exit(1)
            elif not os.path.exists(ipdModel):
                logging.error("Aborting, no kinetics model available for this chemistry: %s" % ipdModel)
                sys.exit(1)
            else:
                logging.info("Using Chemistry matched IPD model: %s" % ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)

    def loadSharedAlignmentSet(self, cmpH5Filename):
        """
        Read the input AlignmentSet so the indices can be shared with the
        slaves.  This is also used to pass to ReferenceUtils for setting up
        the ipdModel object.
        """
        logging.info("Reading AlignmentSet: %s" % cmpH5Filename)
        logging.info("           reference: %s" % self.args.reference)
        self.alignments = AlignmentSet(cmpH5Filename,
                                       referenceFastaFname=self.args.reference)
        # XXX this should ensure that the file(s) get opened, including any
        # .pbi indices - but need to confirm this
        self.refInfo = self.alignments.referenceInfoTable

    def _mainLoop(self):
        """
        Main loop

        1. Load the shared alignment set and resolve the reference windows
        2. Load the reference and the IPD model
        3. Launch the worker and writer processes
        4. Chunk up every window and submit the chunks to the work queue
        5. Send one None sentinel per worker, then wait for the workers,
           the monitoring thread and the writer to finish

        Returns 0.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        #gc.disable()

        self.loadSharedAlignmentSet(self.args.alignment_set)

        # Resolve the windows that will be visited.
        if self.args.referenceWindowsAsString is not None:
            self.referenceWindows = []
            for s in self.args.referenceWindowsAsString.split(","):
                try:
                    win = ReferenceUtils.parseReferenceWindow(s, self.alignments.referenceInfo)
                    self.referenceWindows.append(win)
                except Exception:
                    # Only ordinary errors count as "unrecognized contig";
                    # Ctrl-C / SystemExit propagate untouched.
                    if self.args.skipUnrecognizedContigs:
                        continue
                    else:
                        raise Exception("Unrecognized contig!")
        elif self.args.referenceWindowsFromAlignment:
            self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment(self.alignments, self.alignments.referenceInfo)
            refNames = set([rw.refName for rw in self.referenceWindows])
            # limit output to contigs that overlap with reference windows
            self.refInfo = [r for r in self.refInfo if r.Name in refNames]
        else:
            self.referenceWindows = ReferenceUtils.createReferenceWindows(
                self.refInfo)

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference)

        # Spawn workers
        self._launchSlaveProcesses()

        logging.info('Generating kinetics summary for [%s]' % self.args.alignment_set)

        #self.referenceMap = self.alignments['/RefGroup'].asDict('RefInfoID', 'ID')
        #self.alnInfo = self.alignments['/AlnInfo'].asRecArray()

        # Each work item is (running chunk index, chunk).
        self.workChunkCounter = 0

        # Iterate over references
        for window in self.referenceWindows:
            logging.info('Processing window/contig: %s' % (window,))
            for chunk in ReferenceUtils.enumerateChunks(self.args.referenceStride, window):
                self._workQueue.put((self.workChunkCounter, chunk))
                self.workChunkCounter += 1

        # Shutdown worker threads with None sentinels
        for i in xrange(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        self.alignments.close()
        return 0
Ejemplo n.º 8
0
class ToolRunner(object):
    """
    The main driver class for the GenomicConsensus tool.  It is assumed that
    arguments have already been parsed and used to populate the global
    'options' namespace before instantiating this class.

    The entry point is main(), which peeks at the input, configures the
    selected algorithm, launches the worker/collector "slaves", feeds them
    reference chunks through a work queue and waits for them to finish.
    """
    def __init__(self):
        self._inAlnFile = None
        self._resultsQueue = None
        self._workQueue = None
        self._slaves = None
        self._algorithm = None
        self._algorithmConfiguration = None
        self._aborting = False

    def _setupLogging(self):
        """
        Set the root logger level from options.quiet / options.verbosity.

        quiet wins over verbosity; otherwise verbosity >= 2 maps to DEBUG,
        1 to INFO and anything else to WARNING.
        """
        if options.quiet:
            logLevel = logging.ERROR
        elif options.verbosity >= 2:
            logLevel = logging.DEBUG
        elif options.verbosity == 1:
            logLevel = logging.INFO
        else:
            logLevel = logging.WARNING
        log = logging.getLogger()
        log.setLevel(logLevel)

    def _makeTemporaryDirectory(self):
        """
        Make a temp dir where we can stash things if necessary.

        The path is stored in options.temporaryDirectory.
        """
        options.temporaryDirectory = tempfile.mkdtemp(
            prefix="GenomicConsensus-", dir="/tmp")
        logging.info("Created temporary directory %s" %
                     (options.temporaryDirectory, ))

    def _algorithmByName(self, name):
        """
        Return the algorithm module registered under `name`
        ("plurality", "quiver" or "arrow").

        Modules are imported lazily so that only the requested algorithm
        is loaded.  Calls die() for an unknown name or when the module
        reports (via its `availability` pair) that it cannot be used.
        """
        if name == "plurality":
            from GenomicConsensus.plurality import plurality
            algo = plurality
        elif name == "quiver":
            from GenomicConsensus.quiver import quiver
            algo = quiver
        elif name == "arrow":
            from GenomicConsensus.arrow import arrow
            algo = arrow
        else:
            die("Failure: unrecognized algorithm %s" % name)
        isOK, msg = algo.availability
        if not isOK:
            die("Failure: %s" % msg)
        return algo

    def _launchSlaves(self):
        """
        Launch a group of worker processes (self._slaves), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus, ))
        logging.info("Requested workers: %d" % (options.numWorkers, ))
        logging.info("Parallel Mode: %s" %
                     ("Threaded" if options.threaded else "Process", ))
        if options.numWorkers > availableCpus:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                "More workers requested (%d) than CPUs available (%d);"
                " may result in suboptimal performance." %
                (options.numWorkers, availableCpus))
        self._initQueues()

        WorkerType, ResultCollectorType = self._algorithm.slaveFactories(
            options.threaded)
        self._slaves = []
        for _ in range(options.numWorkers):
            p = WorkerType(self._workQueue, self._resultsQueue,
                           self._algorithmConfiguration)
            self._slaves.append(p)
            p.start()
        logging.info("Launched compute slaves.")

        # The collector is tracked in the same list as the workers.
        rcp = ResultCollectorType(self._resultsQueue,
                                  self._algorithmConfiguration)
        rcp.start()
        self._slaves.append(rcp)
        logging.info("Launched collector slave.")

    def _initQueues(self):
        """
        Create the work and results queues, bounded by options.queueSize.

        Threaded mode uses Queue.Queue; process mode uses
        multiprocessing.Queue.
        """
        if options.threaded:
            self._workQueue = Queue.Queue(options.queueSize)
            self._resultsQueue = Queue.Queue(options.queueSize)
        else:
            self._workQueue = multiprocessing.Queue(options.queueSize)
            self._resultsQueue = multiprocessing.Queue(options.queueSize)

    def _readAlignmentInput(self):
        """
        Read the AlignmentSet input file and
        store it as self._inAlnFile.
        """
        fname = options.inputFilename
        self._inAlnFile = AlignmentSet(fname)

    def _loadReference(self, alnFile):
        """
        Load the reference named by options.referenceFilename and resolve
        options.referenceWindows.

        Windows come from options.referenceWindowsAsString (comma
        separated) when given, otherwise an empty tuple; they are then
        replaced by alnFile.refWindows when
        options.referenceWindowsFromAlignment is set.
        """
        logging.info("Loading reference")
        err = reference.loadFromFile(options.referenceFilename, alnFile)
        if err:
            die("Error loading reference")
        # Grok the referenceWindow spec, if any.
        if options.referenceWindowsAsString is None:
            options.referenceWindows = ()
        elif options.skipUnrecognizedContigs:
            # This is a workaround for smrtpipe scatter/gather.
            options.referenceWindows = []
            for s in options.referenceWindowsAsString.split(","):
                try:
                    win = reference.stringToWindow(s)
                    options.referenceWindows.append(win)
                except:
                    # Deliberate best-effort: any window that cannot be
                    # parsed is skipped.
                    # NOTE(review): kept as a bare except because it is
                    # not visible here what stringToWindow raises;
                    # narrow to the concrete exception type once
                    # confirmed.
                    pass
        else:
            # A real list (not a lazy map object) so the windows can be
            # iterated more than once.
            options.referenceWindows = [
                reference.stringToWindow(s)
                for s in options.referenceWindowsAsString.split(",")]
        if options.referenceWindowsFromAlignment:
            options.referenceWindows = alnFile.refWindows

    def _checkFileCompatibility(self, alnFile):
        """Call die() unless the alignment file is sorted and nonempty."""
        if not alnFile.isSorted:
            die("Input Alignment file must be sorted.")
        if alnFile.isEmpty:
            die("Input Alignment file must be nonempty.")

    def _shouldDisableChunkCache(self, alnFile):
        """
        Always return True; the per-file heuristic below is disabled and
        `alnFile` is currently unused.
        """
        #if isinstance(alnFile, CmpH5Reader):
        #if alnFile.isCmpH5:
        #    threshold = options.autoDisableHdf5ChunkCache
        #    return datasetCountExceedsThreshold(alnFile, threshold)
        #else:
        #    return False
        return True

    def _configureAlgorithm(self, options, alnFile):
        """
        Ask the selected algorithm for its configuration and store it in
        self._algorithmConfiguration.

        Calls die() when the algorithm raises IncompatibleDataException.
        """
        assert self._algorithm is not None
        try:
            self._algorithmConfiguration = self._algorithm.configure(
                options, alnFile)
        except IncompatibleDataException as e:
            # Fall back to the exception itself when it carries no
            # `.message` attribute.
            die("Failure: %s" % getattr(e, "message", e))

    def _mainLoop(self):
        """
        Enumerate reference chunks and put them on the work queue,
        followed by one None sentinel per worker.

        Returns early, without writing sentinels, once an abort has been
        requested.
        """
        # Split up reference genome into chunks and farm out the
        # a chunk as a unit of work.
        logging.debug("Starting main loop.")
        ids = reference.enumerateIds(options.referenceWindows)
        for _id in ids:
            if options.fancyChunking:
                chunks = reference.fancyEnumerateChunks(
                    self._inAlnFile, _id, options.referenceChunkSize,
                    options.minCoverage, options.minMapQV,
                    options.referenceWindows)
            else:
                chunks = reference.enumerateChunks(_id,
                                                   options.referenceChunkSize,
                                                   options.referenceWindows)
            for chunk in chunks:
                if self._aborting: return
                self._workQueue.put(chunk)

        # Write sentinels ("end-of-work-stream")
        for _ in range(options.numWorkers):
            self._workQueue.put(None)

    def _printProfiles(self):
        """Print the top 20 entries (by time) of every profile file found
        in options.temporaryDirectory."""
        for profile in glob.glob(os.path.join(options.temporaryDirectory,
                                              "*")):
            pstats.Stats(profile).sort_stats("time").print_stats(20)

    def _cleanup(self):
        """Remove the temporary directory; only done when profiling, which
        is the only case in which main() creates it."""
        if options.doProfiling:
            logging.info("Removing %s" % options.temporaryDirectory)
            shutil.rmtree(options.temporaryDirectory, ignore_errors=True)

    def _setupEvidenceDumpDirectory(self, directoryName):
        """Create `directoryName` empty, deleting any existing tree at
        that path first."""
        if os.path.exists(directoryName):
            shutil.rmtree(directoryName)
        os.makedirs(directoryName)

    @property
    def aborting(self):
        """True once abortWork() has been called."""
        return self._aborting

    def abortWork(self, why):
        """
        Performs a shutdown of all the slave processes.  Called by the
        monitoring thread when a child process exits with a non-zero,
        or when a keyboard interrupt (Ctrl-C) is given. Not called
        during normal shutdown.
        """
        logging.error(why)
        self._aborting = True
        # Only multiprocessing queues expose close(); the Queue.Queue
        # objects used in threaded mode do not, and the queues are still
        # None if the abort happens before _initQueues().  Close what can
        # be closed instead of failing with AttributeError mid-abort.
        for queue_ in (self._resultsQueue, self._workQueue):
            close = getattr(queue_, "close", None)
            if close is not None:
                close()

    @property
    def slaves(self):
        """The launched worker and collector objects (None before
        _launchSlaves() has run)."""
        return self._slaves

    def main(self):
        """
        Run the tool end to end.

        Returns 0 on success and -1 when the run was aborted.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        self._algorithm = self._algorithmByName(options.algorithm)
        self._setupLogging()
        random.seed(42)

        logging.info("h5py version: %s" % h5py.version.version)
        logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
        logging.info("ConsensusCore version: %s" %
                     (consensusCoreVersion() or "ConsensusCore unavailable"))
        logging.info("ConsensusCore2 version: %s" %
                     (consensusCore2Version() or "ConsensusCore2 unavailable"))
        logging.info("Starting.")

        atexit.register(self._cleanup)
        if options.doProfiling:
            self._makeTemporaryDirectory()

        # First pass over the input: validate it and use it to resolve
        # options, the reference and the algorithm configuration.  The
        # file is reopened for the actual run by _readAlignmentInput().
        with AlignmentSet(options.inputFilename) as peekFile:
            if options.algorithm == "arrow" and peekFile.isCmpH5:
                die("Arrow does not support CmpH5 files")
            if not peekFile.isCmpH5 and not peekFile.hasPbi:
                die("Genomic Consensus only works with cmp.h5 files and BAM "
                    "files with accompanying .pbi files")
            logging.info("Peeking at file %s" % options.inputFilename)
            logging.info("Input data: numAlnHits=%d" % len(peekFile))
            resolveOptions(peekFile)
            self._loadReference(peekFile)
            self._checkFileCompatibility(peekFile)
            self._configureAlgorithm(options, peekFile)
            options.disableHdf5ChunkCache = True
            #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile)
            #if options.disableHdf5ChunkCache:
            #    logging.info("Will disable HDF5 chunk cache (large number of datasets)")
            #logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())

        if options.dumpEvidence:
            self._setupEvidenceDumpDirectory(options.evidenceDirectory)

        self._launchSlaves()
        self._readAlignmentInput()

        monitoringThread = threading.Thread(target=monitorSlaves,
                                            args=(self, ))
        monitoringThread.start()

        try:
            if options.doProfiling:
                cProfile.runctx("self._mainLoop()",
                                globals=globals(),
                                locals=locals(),
                                filename=os.path.join(
                                    options.temporaryDirectory,
                                    "profile-main.out"))

            elif options.debug:
                if not options.threaded:
                    die("Debugging only works with -T (threaded) mode")
                logging.info("PID: %d", os.getpid())
                import ipdb
                with ipdb.launch_ipdb_on_exception():
                    self._mainLoop()

            else:
                self._mainLoop()
        except:
            # Intentionally catches everything, including Ctrl-C (see
            # abortWork), so that the failure is turned into an abort.
            why = traceback.format_exc()
            self.abortWork(why)

        monitoringThread.join()

        if self._aborting:
            logging.error("Aborting")
            return -1
        else:
            logging.info("Finished.")

        if options.doProfiling:
            self._printProfiles()

        # close h5 file.
        self._inAlnFile.close()
        return 0
Ejemplo n.º 9
0
    def test_membership_filter(self):
        """
        Check 'in' membership filters on hole number (zm) and on qname,
        including a write/reload round trip of a filtered dataset.
        """

        def load_unfiltered():
            # Every scenario starts from a fresh dataset with 177 records.
            dset = AlignmentSet(data.getXml(11))
            assert len(list(dset)) == 177
            return dset

        def first_hole(dset):
            return np.unique(dset.index.holeNumber)[:1]

        def all_holes(dset):
            return np.unique(dset.index.holeNumber)

        def repeated_holes(dset):
            # The same hole numbers repeated 10000 times, as an array.
            uniq = np.unique(dset.index.holeNumber)
            return np.array([n for _ in range(10000) for n in uniq])

        # (builder of the zm whitelist, expected record count afterwards)
        zm_scenarios = [
            (first_hole, 5),
            (all_holes, 177),
            (repeated_holes, 177),
            (lambda dset: list(first_hole(dset)), 5),
            (lambda dset: set(first_hole(dset)), 5),
        ]
        for make_members, expected in zm_scenarios:
            aln = load_unfiltered()
            members = make_members(aln)
            aln.filters.addRequirement(zm=[('in', members)])
            assert len(list(aln)) == expected

        # Whitelisting the qnames of the first N records keeps N records.
        for count in (10, 1):
            aln = load_unfiltered()
            names = [rec.qName for rec in aln[:count]]
            aln.filters.addRequirement(qname=[('in', names)])
            assert len(list(aln)) == count

        # test partial qnames:
        aln = load_unfiltered()
        partial = ['/'.join(rec.qName.split('/')[:2]) for rec in aln[:1]]
        assert partial == ['pbalchemy1GbRSIIsim0/6']
        aln.filters.addRequirement(qname=[('in', partial)])
        assert len(list(aln)) == 7

        # The filter must survive being written out and read back.
        fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
        aln = load_unfiltered()
        members = first_hole(aln)
        aln.filters.addRequirement(zm=[('in', members)])
        aln.write(fn)
        aln.close()
        reloaded = AlignmentSet(fn)
        assert len(list(reloaded)) == 5
Ejemplo n.º 10
0
class KineticsToolsRunner(object):
    """
    Driver for the kinetics summary run.

    start() validates the parsed arguments and then run() launches worker
    processes and a result collector, feeds them reference chunks through
    a work queue and waits for them to finish.
    """

    def __init__(self, args):
        self.args = args
        self.alignments = None

    def start(self):
        """Validate the arguments, then run; returns whatever run() returns."""
        self.validateArgs()
        return self.run()

    def getVersion(self):
        """Return the module-level __version__."""
        return __version__

    def validateArgs(self):
        """
        Check the parsed arguments for consistency, reporting problems
        through parser.error().

        As a side effect, --identify is cleared when --control is given.
        """
        parser = get_parser()
        if not os.path.exists(self.args.alignment_set):
            parser.error('Input AlignmentSet file provided does not exist')

        # Over-ride --identify if --control was specified
        if self.args.control:
            self.args.identify = ""

        if self.args.useLDA and self.args.m5Cclassifier is None:
            parser.error(
                'Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.'
            )

        if self.args.m5Cgff and not self.args.useLDA:
            parser.error(
                'm5Cgff file can only be generated in --useLDA mode.')

        # if self.args.methylFraction and not self.args.identify:
        #    parser.error('Currently, --methylFraction only works when the --identify option is specified.')

    def run(self):
        """
        Resolve the modifications to call, seed the RNG and execute
        _mainLoop(), optionally under cProfile.

        Returns the value of _mainLoop() in the normal case and None when
        profiling (cProfile.runctx does not hand the result back).  Any
        worker process still alive afterwards is terminated, whether the
        loop finished or raised.
        """

        # Figure out what modifications to identify
        mods = self.args.identify
        if mods:
            items = mods.split(",")
            # Modification name -> single-letter code, in output order.
            codeByName = (('m6A', 'H'), ('m4C', 'J'), ('m5C_TET', 'K'))
            modsToCall = [code for name, code in codeByName if name in items]

            self.args.identify = True
            self.args.modsToCall = modsToCall

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # set random seed
        # XXX note that this is *not* guaranteed to yield reproducible results
        # independently of the number of processing cores used!
        if self.options.randomSeed is not None:
            np.random.seed(self.options.randomSeed)

        ret = None
        try:
            if self.args.doProfiling:
                cProfile.runctx("self._mainLoop()",
                                globals=globals(),
                                locals=locals(),
                                filename="profile.out")
            else:
                ret = self._mainLoop()
        finally:
            # Be sure to shutdown child processes if we get an exception on
            # the main thread.  This now also covers the profiling path.
            for w in self._workers:
                if w.is_alive():
                    w.terminate()

        return ret

    def _initQueues(self):
        """Create the work and results queues, both bounded by
        options.maxQueueSize."""
        # Work chunks are created by the main thread and put on this queue
        # They will be consumed by KineticWorker threads, stored in
        # self._workers
        self._workQueue = multiprocessing.JoinableQueue(
            self.options.maxQueueSize)

        # Completed chunks are put on this queue by KineticWorker threads
        # They are consumed by the KineticsWriter process
        self._resultsQueue = multiprocessing.JoinableQueue(
            self.options.maxQueueSize)

    def _launchSlaveProcesses(self):
        """
        Launch a group of worker processes (self._workers), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process and a thread
        that monitors all of these children.

        Requires self.ipdModel, self.refInfo and self.alignments to have
        been set beforehand.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus, ))
        logging.info("Requested worker processes: %d" %
                     (self.options.numWorkers, ))

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Warn if more workers were requested than there are CPUs.
        if self.options.numWorkers > availableCpus:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                "More worker processes requested (%d) than CPUs available (%d);"
                " may result in suboptimal performance." %
                (self.options.numWorkers, availableCpus))

        self._initQueues()

        # Launch the worker processes
        self._workers = []
        for _ in range(self.options.numWorkers):
            p = KineticWorkerProcess(self.options,
                                     self._workQueue,
                                     self._resultsQueue,
                                     self.ipdModel,
                                     sharedAlignmentSet=self.alignments)
            self._workers.append(p)
            p.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options,
                                                      self._resultsQueue,
                                                      self.refInfo,
                                                      self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors worker threads for crashes
        self.monitoringThread = threading.Thread(
            target=monitorChildProcesses,
            args=(self._workers + [self._resultCollectorProcess], ))
        self.monitoringThread.start()

    def _queueChunksForWindow(self, refWindow):
        """
        Compute the chunk extents and queue up the work for a single reference

        Currently an unimplemented placeholder: it does nothing and
        returns None.  Chunks are queued directly in _mainLoop().
        """
        pass

    def loadReferenceAndModel(self, referencePath, ipdModelFilename):
        """
        Load the reference contigs for the selected windows and build
        self.ipdModel from them.

        self.alignments and self.referenceWindows must already be set.
        """
        assert self.alignments is not None and self.referenceWindows is not None
        # Load the reference contigs - annotated with their refID from the
        # alignments
        logging.info("Loading reference contigs {!r}".format(referencePath))
        contigs = ReferenceUtils.loadReferenceContigs(
            referencePath,
            alignmentSet=self.alignments,
            windows=self.referenceWindows)
        self.ipdModel = IpdModel(contigs, ipdModelFilename,
                                 self.args.modelIters)

    def loadSharedAlignmentSet(self, alignmentFilename):
        """
        Read the input AlignmentSet so the indices can be shared with the
        slaves.  This is also used to pass to ReferenceUtils for setting up
        the ipdModel object.
        """
        logging.info("Reading AlignmentSet: %s" % alignmentFilename)
        logging.info("           reference: %s" % self.args.reference)
        self.alignments = AlignmentSet(alignmentFilename,
                                       referenceFastaFname=self.args.reference)
        # XXX this should ensure that the file(s) get opened, including any
        # .pbi indices - but need to confirm this
        self.refInfo = self.alignments.referenceInfoTable

    def _mainLoop(self):
        """
        Main loop
        1. Load the shared AlignmentSet and resolve the reference windows
        2. Load the reference contigs and the IPD model
        3. Launch the worker and writer processes
        4. Chunk up each window and submit the chunk descriptions to the
           work queue, followed by one None sentinel per worker
        5. Wait for the workers and the writer process to finish

        Returns 0.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        # gc.disable()

        self.loadSharedAlignmentSet(self.args.alignment_set)

        # Resolve the windows that will be visited.
        if self.args.referenceWindowsAsString is not None:
            self.referenceWindows = []
            for s in self.args.referenceWindowsAsString.split(","):
                try:
                    win = ReferenceUtils.parseReferenceWindow(
                        s, self.alignments.referenceInfo)
                    self.referenceWindows.append(win)
                except Exception:
                    # Exception rather than BaseException, so that Ctrl-C
                    # and SystemExit are neither skipped silently nor
                    # reported as a bad contig.
                    if self.args.skipUnrecognizedContigs:
                        continue
                    else:
                        raise Exception("Unrecognized contig!")
        elif self.args.referenceWindowsFromAlignment:
            self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment(
                self.alignments, self.alignments.referenceInfo)
            refNames = {rw.refName for rw in self.referenceWindows}
            # limit output to contigs that overlap with reference windows
            self.refInfo = [r for r in self.refInfo if r.Name in refNames]
        else:
            self.referenceWindows = ReferenceUtils.createReferenceWindows(
                self.refInfo)

        # Load reference and IpdModel
        chemName = ReferenceUtils.loadAlignmentChemistry(self.alignments)
        if self.args.useChemistry is not None:
            # An explicitly requested chemistry overrides the detected one.
            chemName = self.args.useChemistry
        ipdModelFilename = loader.getIpdModelFilename(
            ipdModel=self.args.ipdModel,
            majorityChem=chemName,
            paramsPath=self.args.paramsPath)
        self.loadReferenceAndModel(self.args.reference, ipdModelFilename)

        # Spawn workers
        self._launchSlaveProcesses()

        logging.info('Generating kinetics summary for [%s]' %
                     self.args.alignment_set)

        #self.referenceMap = self.alignments['/RefGroup'].asDict('RefInfoID', 'ID')
        #self.alnInfo = self.alignments['/AlnInfo'].asRecArray()

        # Each queued work item is a (sequence number, chunk) pair.
        self.workChunkCounter = 0

        # Iterate over references
        for window in self.referenceWindows:
            logging.info('Processing window/contig: %s' % (window, ))
            for chunk in ReferenceUtils.enumerateChunks(
                    self.args.referenceStride, window):
                self._workQueue.put((self.workChunkCounter, chunk))
                self.workChunkCounter += 1

        # Shutdown worker threads with None sentinels
        for _ in range(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        self.alignments.close()
        return 0
Ejemplo n.º 11
0
class ToolRunner(object):
    """
    The main driver class for the GenomicConsensus tool.  It is assumed that
    arguments have already been parsed and used to populate the global
    'options' namespace before instantiating this class.

    Typical use is to instantiate the class and call `main()`, which
    returns 0 on success and -1 when the run was aborted.
    """
    def __init__(self):
        # AlignmentSet opened by _readAlignmentInput() for the main loop.
        self._inAlnFile = None
        # Queues created by _initQueues(); their concrete type depends on
        # options.threaded.
        self._resultsQueue = None
        self._workQueue = None
        # Workers plus the result collector, filled in by _launchSlaves().
        self._slaves = None
        # Algorithm module and the configuration object it produced.
        self._algorithm = None
        self._algorithmConfiguration = None
        # Set by abortWork(); polled by _mainLoop() and main().
        self._aborting = False

    def _makeTemporaryDirectory(self):
        """
        Make a temp dir where we can stash things if necessary.

        The path is stored in options.temporaryDirectory.
        """
        options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp")
        logging.info("Created temporary directory %s" % (options.temporaryDirectory,) )

    def _algorithmByName(self, name, peekFile):
        """
        Resolve an algorithm name to its implementation module.

        `name` is one of "plurality", "quiver", "arrow", "poa" or "best".
        For "best" the concrete name is chosen from
        `peekFile.sequencingChemistry` and resolved recursively.  The
        modules are imported lazily so only the selected one is loaded.

        Calls die() when the name is unrecognized or when the selected
        algorithm reports (via its `availability` pair) that it cannot be
        used.  Returns the algorithm module otherwise.
        """
        if name == "plurality":
            from GenomicConsensus.plurality import plurality
            algo = plurality
        elif name == "quiver":
            from GenomicConsensus.quiver import quiver
            algo = quiver
        elif name == "arrow":
            from GenomicConsensus.arrow import arrow
            algo = arrow
        elif name == "poa":
            from GenomicConsensus.poa import poa
            algo = poa
        elif name == "best":
            logging.info("Identifying best algorithm based on input data")
            from GenomicConsensus import algorithmSelection
            algoName = algorithmSelection.bestAlgorithm(peekFile.sequencingChemistry)
            return self._algorithmByName(algoName, peekFile)
        else:
            # NOTE(review): die() is presumably non-returning; otherwise
            # `algo` below would be unbound -- confirm.
            die("Failure: unrecognized algorithm %s" % name)
        isOK, msg = algo.availability
        if not isOK:
            die("Failure: %s" % msg)
        logging.info("Will use {a} algorithm".format(a=name))
        return algo

    def _launchSlaves(self):
        """
        Launch a group of worker processes (self._slaves), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process.  The collector
        is appended to self._slaves after the workers.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus,))
        logging.info("Requested workers: %d" % (options.numWorkers,))
        logging.info("Parallel Mode: %s" % ("Threaded" if options.threaded else "Process",))
        if options.numWorkers > availableCpus:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning("More workers requested (%d) than CPUs available (%d);"
                            " may result in suboptimal performance."
                            % (options.numWorkers, availableCpus))
        self._initQueues()

        WorkerType, ResultCollectorType = self._algorithm.slaveFactories(options.threaded)
        self._slaves = []
        for i in xrange(options.numWorkers):
            p = WorkerType(self._workQueue, self._resultsQueue, self._algorithmConfiguration)
            self._slaves.append(p)
            p.start()
        logging.info("Launched compute slaves.")

        rcp = ResultCollectorType(self._resultsQueue, self._algorithm.name, self._algorithmConfiguration)
        rcp.start()
        self._slaves.append(rcp)
        logging.info("Launched collector slave.")

    def _initQueues(self):
        """
        Create the work and results queues, each bounded by
        options.queueSize.  Thread queues are used in threaded mode and
        multiprocessing queues otherwise.
        """
        if options.threaded:
            self._workQueue = Queue.Queue(options.queueSize)
            self._resultsQueue = Queue.Queue(options.queueSize)
        else:
            self._workQueue = multiprocessing.Queue(options.queueSize)
            self._resultsQueue = multiprocessing.Queue(options.queueSize)

    def _readAlignmentInput(self):
        """
        Read the AlignmentSet input file and
        store it as self._inAlnFile.
        """
        fname = options.inputFilename
        self._inAlnFile = AlignmentSet(fname)

    def _loadReference(self, alnFile):
        """
        Load the reference and populate options.referenceWindows.

        The windows come from options.referenceWindowsAsString (a
        comma-separated list) unless options.referenceWindowsFromAlignment
        is set, in which case `alnFile.refWindows` takes precedence.
        """
        logging.info("Loading reference")
        reference.loadFromFile(options.referenceFilename, alnFile)
        # Grok the referenceWindow spec, if any.
        if options.referenceWindowsAsString is None:
            options.referenceWindows = ()
        elif options.skipUnrecognizedContigs:
            # This is a workaround for smrtpipe scatter/gather.
            options.referenceWindows = []
            for s in options.referenceWindowsAsString.split(","):
                try:
                    win = reference.stringToWindow(s)
                    options.referenceWindows.append(win)
                except Exception:
                    # Deliberate best-effort: windows that cannot be parsed
                    # are skipped, leaving only a debug-level trace.
                    logging.debug(traceback.format_exc())
        else:
            # Build a real list (not a lazy map object) because the windows
            # are consumed more than once in _mainLoop().
            options.referenceWindows = [
                reference.stringToWindow(s)
                for s in options.referenceWindowsAsString.split(",")]
        if options.referenceWindowsFromAlignment:
            options.referenceWindows = alnFile.refWindows

    def _checkFileCompatibility(self, alnFile):
        """
        Call die() unless the alignment file is sorted and, for cmp.h5
        input, nonempty.
        """
        if not alnFile.isSorted:
            die("Input Alignment file must be sorted.")
        if alnFile.isCmpH5 and alnFile.isEmpty:
            die("Input Alignment file must be nonempty.")

    def _shouldDisableChunkCache(self, alnFile):
        """
        Report whether the HDF5 chunk cache should be disabled.

        Currently always True; the former threshold-based logic is kept
        below, commented out.
        """
        #if isinstance(alnFile, CmpH5Reader):
        #if alnFile.isCmpH5:
        #    threshold = options.autoDisableHdf5ChunkCache
        #    return datasetCountExceedsThreshold(alnFile, threshold)
        #else:
        #    return False
        return True

    def _configureAlgorithm(self, options, alnFile):
        """
        Ask the selected algorithm to build its configuration from
        `options` and `alnFile`, storing it in
        self._algorithmConfiguration.  Calls die() when the algorithm
        raises IncompatibleDataException.
        """
        assert self._algorithm is not None
        try:
            self._algorithmConfiguration = self._algorithm.configure(options, alnFile)
        except IncompatibleDataException as e:
            # `e.message` only exists on Python 2 exceptions (or when the
            # class defines it); fall back to the exception's own text.
            die("Failure: %s" % getattr(e, "message", e))

    def _mainLoop(self):
        """
        Enqueue every reference chunk on the work queue, followed by one
        None sentinel per worker.  Returns early, without writing the
        sentinels, as soon as an abort is flagged.
        """
        # Split up the reference genome into chunks and farm out
        # each chunk as a unit of work.
        logging.debug("Starting main loop.")
        ids = reference.enumerateIds(options.referenceWindows)
        for _id in ids:
            if options.fancyChunking:
                chunks = reference.fancyEnumerateChunks(self._inAlnFile,
                                                        _id,
                                                        options.referenceChunkSize,
                                                        options.minCoverage,
                                                        options.minMapQV,
                                                        options.referenceWindows)
            else:
                chunks = reference.enumerateChunks(_id,
                                                   options.referenceChunkSize,
                                                   options.referenceWindows)
            for chunk in chunks:
                if self._aborting:
                    return
                self._workQueue.put(chunk)

        # Write sentinels ("end-of-work-stream")
        for i in xrange(options.numWorkers):
            self._workQueue.put(None)

    def _printProfiles(self):
        """
        Print the top 20 entries, sorted by time, of every profile file
        found in options.temporaryDirectory.
        """
        for profile in glob.glob(os.path.join(options.temporaryDirectory, "*")):
            pstats.Stats(profile).sort_stats("time").print_stats(20)

    def _cleanup(self):
        """
        Remove the temporary directory.  It only exists when profiling was
        requested, so nothing is done otherwise.
        """
        if options.doProfiling:
            logging.info("Removing %s" % options.temporaryDirectory)
            shutil.rmtree(options.temporaryDirectory, ignore_errors=True)

    @property
    def aborting(self):
        """True once abortWork() has been called."""
        return self._aborting

    def abortWork(self, why):
        """
        Performs a shutdown of all the slave processes.  Called by the
        monitoring thread when a child process exits with a non-zero,
        or when a keyboard interrupt (Ctrl-C) is given. Not called
        during normal shutdown.

        `why` is logged at error level.  Both queues are closed when they
        support it.
        """
        logging.error(why)
        self._aborting = True
        # Only multiprocessing queues have close(); the thread queues used
        # in threaded mode do not, and calling it unconditionally would
        # raise AttributeError in the middle of an abort.
        for q in (self._resultsQueue, self._workQueue):
            close = getattr(q, "close", None)
            if close is not None:
                close()

    @property
    def slaves(self):
        """The launched workers and result collector (None before launch)."""
        return self._slaves

    def main(self):
        """
        Run the whole tool: inspect the input, select and configure the
        algorithm, launch the workers, feed them chunks, and wait for
        completion.

        Returns 0 on success and -1 when the run was aborted.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        random.seed(42)

        if options.pdb or options.pdbAtStartup:
            print("Process ID: %d" % os.getpid(), file=sys.stderr)
            try:
                import ipdb
            except ImportError:
                die("Debugging options require 'ipdb' package installed.")

            if not options.threaded:
                die("Debugging only works with -T (threaded) mode")

        if options.pdbAtStartup:
            ipdb.set_trace()

        logging.info("ConsensusCore version: %s" %
                     (consensusCoreVersion() or "ConsensusCore unavailable"))
        logging.info("ConsensusCore2 version: %s" %
                     (consensusCore2Version() or "ConsensusCore2 unavailable"))
        logging.info("Starting.")

        atexit.register(self._cleanup)
        if options.doProfiling:
            self._makeTemporaryDirectory()

        # Peek at the input once to validate it and configure the run; the
        # file is reopened for the main loop after the workers are started.
        with AlignmentSet(options.inputFilename) as peekFile:
            if options.algorithm == "arrow" and peekFile.isCmpH5:
                die("Arrow does not support CmpH5 files")
            if not peekFile.isCmpH5 and not peekFile.hasPbi:
                die("Genomic Consensus only works with cmp.h5 files and BAM "
                    "files with accompanying .pbi files")
            logging.info("Peeking at file %s" % options.inputFilename)
            logging.info("Input data: numAlnHits=%d" % len(peekFile))
            resolveOptions(peekFile)
            self._loadReference(peekFile)
            self._checkFileCompatibility(peekFile)
            self._algorithm = self._algorithmByName(options.algorithm, peekFile)
            self._configureAlgorithm(options, peekFile)
            options.disableHdf5ChunkCache = True
            #options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile)
            #if options.disableHdf5ChunkCache:
            #    logging.info("Will disable HDF5 chunk cache (large number of datasets)")

        self._launchSlaves()
        self._readAlignmentInput()

        monitoringThread = threading.Thread(target=monitorSlaves, args=(self,))
        monitoringThread.start()

        try:
            if options.doProfiling:
                cProfile.runctx("self._mainLoop()",
                                globals=globals(),
                                locals=locals(),
                                filename=os.path.join(options.temporaryDirectory,
                                                      "profile-main.out"))

            elif options.pdb:
                with ipdb.launch_ipdb_on_exception():
                    self._mainLoop()

            else:
                self._mainLoop()
        except BaseException as exc:
            # Includes KeyboardInterrupt: log the options for diagnosis and
            # flag the abort so the monitoring thread can wind down.
            msg = 'options={}'.format(pprint.pformat(vars(options)))
            logging.exception(msg)
            self.abortWork(repr(exc))

        monitoringThread.join()

        try:
            if self._aborting:
                logging.error("Aborting")
                return -1

            logging.info("Finished.")
            if options.doProfiling:
                self._printProfiles()
            return 0
        finally:
            # Close the alignment file on every exit path, including abort.
            self._inAlnFile.close()