Beispiel #1
0
 def test_bam(self):
     # Open the Hpyl_1_5000 BAM alignments together with the
     # Helicobacter_pylori_J99 reference FASTA, then verify the contig
     # annotation and the chemistry reported for the alignments.
     bamFile = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
     refFile = os.path.join(ref_dir, "Helicobacter_pylori_J99", "sequence",
                            "Helicobacter_pylori_J99.fasta")
     ds = AlignmentSet(bamFile, referenceFastaFname=refFile)
     contigs = ReferenceUtils.loadReferenceContigs(refFile, ds)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which newer unittest releases no longer provide.
     self.assertEqual(len(contigs), 1)
     self.assertEqual(contigs[0].cmph5ID, 0)
     chemistry = ReferenceUtils.loadAlignmentChemistry(ds)
     self.assertEqual(chemistry, "P6-C4")
 def test_bam(self):
     # Open the Hpyl_1_5000 BAM alignments together with the
     # Helicobacter_pylori_J99 reference FASTA, then verify the contig
     # annotation and the chemistry reported for the alignments.
     bamFile = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
     refFile = os.path.join(ref_dir, "Helicobacter_pylori_J99", "sequence",
                            "Helicobacter_pylori_J99.fasta")
     ds = AlignmentSet(bamFile, referenceFastaFname=refFile)
     contigs = ReferenceUtils.loadReferenceContigs(refFile, ds)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which newer unittest releases no longer provide.
     self.assertEqual(len(contigs), 1)
     self.assertEqual(contigs[0].cmph5ID, 0)
     chemistry = ReferenceUtils.loadAlignmentChemistry(ds)
     self.assertEqual(chemistry, "P6-C4")
 def test_cmph5(self):
     # Open the lambda cmp.h5 sample that lives in the "data" directory
     # next to this test module, then verify the contig annotation and
     # the chemistry reported for the alignments.
     base_dir = os.path.dirname(os.path.abspath(__file__))
     dataDir = os.path.join(base_dir, 'data')
     refFile = os.path.join(dataDir, 'lambda', 'sequence', 'lambda.fasta')
     cmpFile = os.path.join(dataDir, "p4-c2-lambda-mod-decode.cmp.h5")
     ds = AlignmentSet(cmpFile, referenceFastaFname=refFile)
     contigs = ReferenceUtils.loadReferenceContigs(refFile, ds)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which newer unittest releases no longer provide.
     self.assertEqual(len(contigs), 1)
     self.assertEqual(contigs[0].cmph5ID, 1)
     chemistry = ReferenceUtils.loadAlignmentChemistry(ds)
     self.assertEqual(chemistry, "P4-C2")
Beispiel #4
0
 def test_cmph5(self):
     # Open the lambda cmp.h5 sample that lives in the "data" directory
     # next to this test module, then verify the contig annotation and
     # the chemistry reported for the alignments.
     base_dir = os.path.dirname(os.path.abspath(__file__))
     dataDir = os.path.join(base_dir, 'data')
     refFile = os.path.join(dataDir, 'lambda', 'sequence', 'lambda.fasta')
     cmpFile = os.path.join(dataDir, "p4-c2-lambda-mod-decode.cmp.h5")
     ds = AlignmentSet(cmpFile, referenceFastaFname=refFile)
     contigs = ReferenceUtils.loadReferenceContigs(refFile, ds)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which newer unittest releases no longer provide.
     self.assertEqual(len(contigs), 1)
     self.assertEqual(contigs[0].cmph5ID, 1)
     chemistry = ReferenceUtils.loadAlignmentChemistry(ds)
     self.assertEqual(chemistry, "P4-C2")
Beispiel #5
0
    def loadReferenceAndModel(self, referencePath):
        """
        Load the reference contigs and build the IpdModel kept on self.ipdModel.

        The model file handed to IpdModel is chosen in this order:
        1. the explicit path in self.args.ipdModel;
        2. "<chemistry>.h5" inside self.args.paramsPath, where the chemistry
           comes from ReferenceUtils.loadAlignmentChemistry (chemistries
           starting with "S/" are mapped to "P5-C3");
        3. None, which is passed through to IpdModel unchanged.

        A missing model file, a missing params path or an 'unknown' chemistry
        is logged as an error and followed by sys.exit(1).
        """
        assert self.alignments is not None
        assert self.referenceWindows is not None

        # The contigs are annotated with their refID from the alignments.
        logging.info("Loading reference contigs %s" % referencePath)
        referenceContigs = ReferenceUtils.loadReferenceContigs(
            referencePath,
            alignmentSet=self.alignments,
            windows=self.referenceWindows)

        # None is what reaches IpdModel when neither option below is set.
        modelPath = None

        if self.args.ipdModel:
            modelPath = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % modelPath)
            if not os.path.exists(modelPath):
                logging.error("Couldn't find model file: %s" % modelPath)
                sys.exit(1)
        elif self.args.paramsPath:
            bundleDir = self.args.paramsPath
            if not os.path.exists(bundleDir):
                logging.error("Params path doesn't exist: %s" % bundleDir)
                sys.exit(1)

            chemistry = ReferenceUtils.loadAlignmentChemistry(self.alignments)

            # Stop-gap for Sequel chemistries: no trained kinetics model
            # exists for them yet, and the P5-C3 training has been observed
            # to give fairly good results on Sequel data, so use that model.
            if chemistry.startswith("S/"):
                logging.info("No trained model available yet for Sequel chemistries; modeling as P5-C3")
                chemistry = "P5-C3"

            modelPath = os.path.join(bundleDir, chemistry + ".h5")

            # Work out whether the chemistry-matched model is usable.
            if chemistry == 'unknown':
                failure = "Chemistry cannot be identified---cannot perform kinetic analysis"
            elif not os.path.exists(modelPath):
                failure = "Aborting, no kinetics model available for this chemistry: %s" % modelPath
            else:
                failure = None

            if failure is not None:
                logging.error(failure)
                sys.exit(1)
            else:
                logging.info("Using Chemistry matched IPD model: %s" % modelPath)

        self.ipdModel = IpdModel(referenceContigs, modelPath, self.args.modelIters)
    def loadReferenceAndModel(self, referencePath, cmpH5Path):
        """
        Load the reference contigs and reference info table, then build the
        IpdModel kept on self.ipdModel.

        Sets self.refInfo to the reference info table returned by
        ReferenceUtils.loadCmpH5Tables(cmpH5Path).

        The model file handed to IpdModel is chosen in this order:
        1. the explicit path in self.args.ipdModel;
        2. "<chemistry>.h5" inside self.args.paramsPath, where the chemistry
           comes from ReferenceUtils.loadCmpH5Chemistry(cmpH5Path);
        3. None (the built-in model, per the log messages below), also used
           when the chemistry is 'unknown' or its model file is absent.

        A missing explicit model file or a missing params path is logged as
        an error and followed by sys.exit(1).
        """
        # Load the reference contigs - annotated with their refID from the cmp.h5
        contigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpH5Path)

        # Read reference info table from cmp.h5
        (refInfoTable, _) = ReferenceUtils.loadCmpH5Tables(cmpH5Path)
        self.refInfo = refInfoTable

        # By default, use built-in model
        ipdModel = None

        if self.args.ipdModel:
            ipdModel = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % self.args.ipdModel)
            if not os.path.exists(self.args.ipdModel):
                logging.error("Couldn't find model file: %s" %
                              self.args.ipdModel)
                # Abort just like the missing-params-path case below rather
                # than handing a nonexistent path to IpdModel.
                sys.exit(1)

        elif self.args.paramsPath:
            if not os.path.exists(self.args.paramsPath):
                logging.error("Params path doesn't exist: %s" %
                              self.args.paramsPath)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadCmpH5Chemistry(cmpH5Path)
            ipdModel = os.path.join(self.args.paramsPath, majorityChem + ".h5")

            if majorityChem == 'unknown':
                logging.warning(
                    "Chemistry is unknown. Falling back to built-in model")
                ipdModel = None
            elif not os.path.exists(ipdModel):
                logging.warning("Model not found: %s" % ipdModel)
                logging.warning("Falling back to built-in model")
                ipdModel = None
            else:
                logging.info("Using Chemistry matched IPD model: %s" %
                             ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)
Beispiel #7
0
 def test_parseReferenceWindow(self):
     # Parse a "<contig name>:<start>-<end>" window string against the
     # reference info of the Hpyl_1_5000 alignments and check the
     # resulting refId/start/end.
     window = "gi|12057207|gb|AE001439.1|:1-5000"
     bamFile = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
     refFile = os.path.join(ref_dir, "Helicobacter_pylori_J99", "sequence",
                            "Helicobacter_pylori_J99.fasta")
     alnFile = AlignmentSet(bamFile, referenceFastaFname=refFile)
     win = ReferenceUtils.parseReferenceWindow(window,
                                               alnFile.referenceInfo)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which newer unittest releases no longer provide.
     self.assertEqual([win.refId, win.start, win.end], [0, 1, 5000])
 def test_parseReferenceWindow(self):
     # Parse a "<contig name>:<start>-<end>" window string against the
     # reference info of the Hpyl_1_5000 alignments and check the
     # resulting refId/start/end.
     window = "gi|12057207|gb|AE001439.1|:1-5000"
     bamFile = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
     refFile = os.path.join(ref_dir, "Helicobacter_pylori_J99", "sequence",
                            "Helicobacter_pylori_J99.fasta")
     alnFile = AlignmentSet(bamFile, referenceFastaFname=refFile)
     win = ReferenceUtils.parseReferenceWindow(window,
                                               alnFile.referenceInfo)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which newer unittest releases no longer provide.
     self.assertEqual([win.refId, win.start, win.end], [0, 1, 5000])
Beispiel #9
0
 def loadReferenceAndModel(self, referencePath, ipdModelFilename):
     """
     Load the reference contigs and build the IpdModel kept on self.ipdModel.

     The contigs are loaded from referencePath against self.alignments and
     self.referenceWindows, both of which must already be set. The model is
     constructed from those contigs, ipdModelFilename and
     self.args.modelIters.
     """
     assert self.alignments is not None
     assert self.referenceWindows is not None

     logging.info("Loading reference contigs {!r}".format(referencePath))
     referenceContigs = ReferenceUtils.loadReferenceContigs(
         referencePath,
         alignmentSet=self.alignments,
         windows=self.referenceWindows)

     modelIterations = self.args.modelIters
     self.ipdModel = IpdModel(referenceContigs, ipdModelFilename,
                              modelIterations)
    def loadReferenceAndModel(self, referencePath, cmpH5Path):
        """
        Load the reference contigs and reference info table, then build the
        IpdModel kept on self.ipdModel.

        Sets self.refInfo to the reference info table returned by
        ReferenceUtils.loadCmpH5Tables(cmpH5Path).

        The model file handed to IpdModel is chosen in this order:
        1. the explicit path in self.args.ipdModel;
        2. "<chemistry>.h5" inside self.args.paramsPath, where the chemistry
           comes from ReferenceUtils.loadCmpH5Chemistry(cmpH5Path);
        3. None (the built-in model, per the log messages below), also used
           when the chemistry is "unknown" or its model file is absent.

        A missing explicit model file or a missing params path is logged as
        an error and followed by sys.exit(1).
        """
        # Load the reference contigs - annotated with their refID from the cmp.h5
        contigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpH5Path)

        # Read reference info table from cmp.h5
        (refInfoTable, _) = ReferenceUtils.loadCmpH5Tables(cmpH5Path)
        self.refInfo = refInfoTable

        # By default, use built-in model
        ipdModel = None

        if self.args.ipdModel:
            ipdModel = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % self.args.ipdModel)
            if not os.path.exists(self.args.ipdModel):
                logging.error("Couldn't find model file: %s" % self.args.ipdModel)
                # Abort just like the missing-params-path case below rather
                # than handing a nonexistent path to IpdModel.
                sys.exit(1)

        elif self.args.paramsPath:
            if not os.path.exists(self.args.paramsPath):
                logging.error("Params path doesn't exist: %s" % self.args.paramsPath)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadCmpH5Chemistry(cmpH5Path)
            ipdModel = os.path.join(self.args.paramsPath, majorityChem + ".h5")

            if majorityChem == "unknown":
                logging.warning("Chemistry is unknown. Falling back to built-in model")
                ipdModel = None
            elif not os.path.exists(ipdModel):
                logging.warning("Model not found: %s" % ipdModel)
                logging.warning("Falling back to built-in model")
                ipdModel = None
            else:
                logging.info("Using Chemistry matched IPD model: %s" % ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)
 def test_createReferenceWindows(self):
     # Build reference windows from the reference info table of the
     # Hpyl_1_5000 alignments (opened without a reference FASTA) and
     # check that exactly one window with the expected fields comes back.
     alignmentPath = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
     alignments = AlignmentSet(alignmentPath, referenceFastaFname=None)
     windows = ReferenceUtils.createReferenceWindows(
         alignments.referenceInfoTable)
     self.assertEqual(len(windows), 1)

     window = windows[0]
     # Checked in this order: refId, refName, start, end.
     expectedFields = (
         ("refId", 0),
         ("refName", 'gi|12057207|gb|AE001439.1|'),
         ("start", 0),
         ("end", 1643831),
     )
     for fieldName, expectedValue in expectedFields:
         self.assertEqual(getattr(window, fieldName), expectedValue)
Beispiel #12
0
 def test_createReferenceWindows(self):
     # Build reference windows from the reference info table of the
     # Hpyl_1_5000 alignments (opened without a reference FASTA) and
     # check that exactly one window with the expected fields comes back.
     alignmentPath = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
     alignments = AlignmentSet(alignmentPath, referenceFastaFname=None)
     infoTable = alignments.referenceInfoTable
     allWindows = ReferenceUtils.createReferenceWindows(infoTable)
     self.assertEqual(len(allWindows), 1)

     onlyWindow = allWindows[0]
     self.assertEqual(onlyWindow.refId, 0)
     self.assertEqual(onlyWindow.refName, 'gi|12057207|gb|AE001439.1|')
     self.assertEqual(onlyWindow.start, 0)
     self.assertEqual(onlyWindow.end, 1643831)
Beispiel #13
0
    def loadReferenceAndModel(self, referencePath):
        """
        Load the reference contigs and build the IpdModel kept on self.ipdModel.

        The model file handed to IpdModel is chosen in this order:
        1. the explicit path in self.args.ipdModel;
        2. "<chemistry>.h5" inside self.args.paramsPath, where the chemistry
           comes from ReferenceUtils.loadAlignmentChemistry;
        3. None, which is passed through to IpdModel unchanged.

        A missing model file, a missing params path or an 'unknown' chemistry
        is logged as an error and followed by sys.exit(1).
        """
        assert self.alignments is not None
        assert self.referenceWindows is not None

        # The contigs are annotated with their refID from the alignments.
        logging.info("Loading reference contigs %s" % referencePath)
        referenceContigs = ReferenceUtils.loadReferenceContigs(
            referencePath,
            alignmentSet=self.alignments,
            windows=self.referenceWindows)

        # None is what reaches IpdModel when neither option below is set.
        modelPath = None

        if self.args.ipdModel:
            modelPath = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % modelPath)
            if not os.path.exists(modelPath):
                logging.error("Couldn't find model file: %s" % modelPath)
                sys.exit(1)
        elif self.args.paramsPath:
            bundleDir = self.args.paramsPath
            if not os.path.exists(bundleDir):
                logging.error("Params path doesn't exist: %s" % bundleDir)
                sys.exit(1)

            chemistry = ReferenceUtils.loadAlignmentChemistry(self.alignments)
            modelPath = os.path.join(bundleDir, chemistry + ".h5")

            # Work out whether the chemistry-matched model is usable.
            if chemistry == 'unknown':
                failure = "Chemistry cannot be identified---cannot perform kinetic analysis"
            elif not os.path.exists(modelPath):
                failure = "Aborting, no kinetics model available for this chemistry: %s" % modelPath
            else:
                failure = None

            if failure is not None:
                logging.error(failure)
                sys.exit(1)
            else:
                logging.info("Using Chemistry matched IPD model: %s" % modelPath)

        self.ipdModel = IpdModel(referenceContigs, modelPath, self.args.modelIters)
    def setUp(self):
        """
        Build the fixtures shared by the tests: an AlignmentSet, the
        reference contigs, a P6-C4 IpdModel and a KineticWorker wired to
        them. The input paths come from self.getReference() and
        self.getAlignments(); the worker options from self.getOpts().
        """
        self.cmpH5 = None
        testDir = os.path.dirname(os.path.abspath(__file__))
        resourcesDir = os.path.join(testDir, '../kineticsTools/resources')
        referencePath = self.getReference()
        alignmentPath = self.getAlignments()
        assert os.path.exists(alignmentPath) and os.path.exists(referencePath)

        self.ds = AlignmentSet(alignmentPath, referenceFastaFname=referencePath)
        self.contigs = ReferenceUtils.loadReferenceContigs(referencePath, self.ds)
        modelPath = os.path.join(resourcesDir, "P6-C4.h5")
        self.ipdModel = IpdModel(self.contigs, modelPath)

        # A working KineticWorker that the tests can poke at directly.
        self.kw = KineticWorker(self.ipdModel)
        # The worker normally receives its alignments from the Worker; here
        # they are supplied by hand, with no control data set.
        self.kw.caseCmpH5 = self.ds
        self.kw.controlCmpH5 = None

        self.kw.options = self.getOpts()
Beispiel #15
0
    def setUp(self):
        """
        Build the fixtures shared by the tests: an AlignmentSet, the
        reference contigs, a P6-C4 IpdModel and a KineticWorker wired to
        them. The input paths come from self.getReference() and
        self.getAlignments(); the worker options from self.getOpts().
        """
        self.cmpH5 = None
        here = os.path.dirname(os.path.abspath(__file__))
        resourcesDir = os.path.join(here, '../kineticsTools/resources')
        fastaPath = self.getReference()
        alnPath = self.getAlignments()
        assert os.path.exists(alnPath) and os.path.exists(fastaPath)

        self.ds = AlignmentSet(alnPath, referenceFastaFname=fastaPath)
        self.contigs = ReferenceUtils.loadReferenceContigs(fastaPath, self.ds)
        self.ipdModel = IpdModel(
            self.contigs,
            os.path.join(resourcesDir, "P6-C4.h5"))

        # A working KineticWorker that the tests can poke at directly.
        self.kw = KineticWorker(self.ipdModel)
        # The worker normally receives its alignments from the Worker; here
        # they are supplied by hand, with no control data set.
        self.kw.caseCmpH5 = self.ds
        self.kw.controlCmpH5 = None

        self.kw.options = self.getOpts()
Beispiel #16
0
    def setUp(self):
        """
        Build the fixtures shared by the tests from the lambda sample data
        in the "data" directory next to this module: reference contigs, an
        IpdModel built from the contigs alone, a CmpH5Reader and a
        KineticWorker wired to them.
        """
        testDir = os.path.dirname(os.path.abspath(__file__))
        dataDir = os.path.join(testDir, 'data')
        referencePath = os.path.join(dataDir, 'lambda', 'sequence', 'lambda.fasta')
        cmpPath = os.path.join(dataDir, "p4-c2-lambda-mod-decode.cmp.h5")

        self.contigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpPath)
        self.ipdModel = IpdModel(self.contigs)

        # A working KineticWorker that the tests can poke at by hand.
        self.kw = KineticWorker(self.ipdModel)
        self.cmpH5 = CmpH5Reader(cmpPath)

        # The worker normally receives its cmp.h5 from the Worker
        # superclass; here it is supplied by hand, with no control data set.
        self.kw.caseCmpH5 = self.cmpH5
        self.kw.controlCmpH5 = None

        self.kw.options = self.getOpts()
Beispiel #17
0
    def setUp(self):
        """
        Build the fixtures shared by the tests from the lambda sample data
        in the "data" directory next to this module: an AlignmentSet, the
        reference contigs, a P4-C2 IpdModel and a KineticWorker wired to
        them.
        """
        testDir = os.path.dirname(os.path.abspath(__file__))
        dataDir = os.path.join(testDir, 'data')
        resourcesDir = os.path.join(testDir, '../kineticsTools/resources')
        referencePath = os.path.join(dataDir, 'lambda', 'sequence', 'lambda.fasta')
        cmpPath = os.path.join(dataDir, "p4-c2-lambda-mod-decode.cmp.h5")

        self.cmpH5 = AlignmentSet(cmpPath, referenceFastaFname=referencePath)
        self.contigs = ReferenceUtils.loadReferenceContigs(referencePath, self.cmpH5)
        modelPath = os.path.join(resourcesDir, "P4-C2.h5")
        self.ipdModel = IpdModel(self.contigs, modelPath)

        # A working KineticWorker that the tests can poke at by hand.
        self.kw = KineticWorker(self.ipdModel)

        # The worker normally receives its cmp.h5 from the Worker
        # superclass; here it is supplied by hand, with no control data set.
        self.kw.caseCmpH5 = self.cmpH5
        self.kw.controlCmpH5 = None

        self.kw.options = self.getOpts()
Beispiel #18
0
    def _mainLoop(self):
        """
        Main loop

        Load the alignment set, resolve the reference windows to visit, load
        the reference and the IpdModel and spawn the workers. Then, for each
        reference window:
        1. Chunk up the window using self.args.referenceStride
        2. Submit the chunk descriptions to the work queue
        Finally, wait for the workers and the result collector to finish.

        Returns 0 on completion.

        Raises Exception("Unrecognized contig!") when a requested reference
        window cannot be parsed and self.args.skipUnrecognizedContigs is not
        set.
        """

        # NOTE: the gc.disable() call below is currently commented out.
        # Rationale kept for reference: Python uses reference counting and
        # has a secondary, optional garbage collector for collecting garbage
        # cycles.  When a cyclic GC happens while a thread is calling
        # cPickle.dumps, the interpreter crashes sometimes.  See Bug 19704.
        # Since we don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        #gc.disable()

        self.loadSharedAlignmentSet(self.args.alignment_set)

        # Resolve the windows that will be visited.
        if self.args.referenceWindowsAsString is not None:
            self.referenceWindows = []
            for s in self.args.referenceWindowsAsString.split(","):
                try:
                    win = ReferenceUtils.parseReferenceWindow(
                        s, self.alignments.referenceInfo)
                    self.referenceWindows.append(win)
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt and SystemExit are neither skipped over
                # nor rewrapped as an "Unrecognized contig!" error.
                except Exception:
                    if self.args.skipUnrecognizedContigs:
                        continue
                    else:
                        raise Exception("Unrecognized contig!")
        elif self.args.referenceWindowsFromAlignment:
            self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment(
                self.alignments, self.alignments.referenceInfo)
            refNames = set([rw.refName for rw in self.referenceWindows])
            # limit output to contigs that overlap with reference windows
            self.refInfo = [r for r in self.refInfo if r.Name in refNames]
        else:
            self.referenceWindows = ReferenceUtils.createReferenceWindows(
                self.refInfo)

        # Load reference and IpdModel
        ipdModelFilename = basic.getIpdModelFilename(
            self.args.ipdModel,
            ReferenceUtils.loadAlignmentChemistry(self.alignments),
            self.args.paramsPath)
        self.loadReferenceAndModel(self.args.reference, ipdModelFilename)

        # Spawn workers
        self._launchSlaveProcesses()

        logging.info('Generating kinetics summary for [%s]' %
                     self.args.alignment_set)

        #self.referenceMap = self.alignments['/RefGroup'].asDict('RefInfoID', 'ID')
        #self.alnInfo = self.alignments['/AlnInfo'].asRecArray()

        # Every chunk is tagged with a running counter when it is queued.
        self.workChunkCounter = 0

        # Iterate over references
        for window in self.referenceWindows:
            logging.info('Processing window/contig: %s' % (window, ))
            for chunk in ReferenceUtils.enumerateChunks(
                    self.args.referenceStride, window):
                self._workQueue.put((self.workChunkCounter, chunk))
                self.workChunkCounter += 1

        # Shutdown worker threads with None sentinels, one per worker.
        # range() is used because xrange() only exists on Python 2.
        for _ in range(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        self.alignments.close()
        return 0
Beispiel #19
0
    def loadReferenceAndModel(self, referencePath, cmpH5Path):
        """
        Load the reference contigs, select the reference info rows to report
        on, and build the IpdModel kept on self.ipdModel.

        self.refInfo becomes the full reference info table from the cmp.h5,
        or only the rows whose FullName, Name or RefInfoID was requested via
        self.options.refContigs (comma-separated) and/or
        self.options.refContigIndex (-1 meaning "not given").

        The model file handed to IpdModel is chosen in this order:
        1. the explicit path in self.args.ipdModel;
        2. "<chemistry>.h5" inside self.args.paramsPath, where the chemistry
           comes from ReferenceUtils.loadCmpH5Chemistry(cmpH5Path);
        3. None, which is passed through to IpdModel unchanged.

        A missing model file, a missing params path or an 'unknown' chemistry
        is logged as an error and followed by sys.exit(1).
        """
        # The contigs are annotated with their refID from the cmp.h5.
        referenceContigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpH5Path)

        # Reference info table from the cmp.h5; the second table is unused.
        (refInfoTable, _) = ReferenceUtils.loadCmpH5Tables(cmpH5Path)

        byName = self.options.refContigs is not None
        byIndex = self.options.refContigIndex != -1

        if byName or byIndex:
            # Collect every identifier the caller asked for.
            requestedIds = set()
            if byName:
                requestedIds.update(self.options.refContigs.split(','))
            if byIndex:
                requestedIds.add(self.options.refContigIndex)

            relevantContigs = []
            for (row, rec) in enumerate(refInfoTable):
                if (rec.FullName in requestedIds
                        or rec.Name in requestedIds
                        or rec.RefInfoID in requestedIds):
                    relevantContigs.append(row)
            self.refInfo = refInfoTable[relevantContigs]
        else:
            self.refInfo = refInfoTable

        # None is what reaches IpdModel when neither option below is set.
        modelPath = None

        if self.args.ipdModel:
            modelPath = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % modelPath)
            if not os.path.exists(modelPath):
                logging.error("Couldn't find model file: %s" % modelPath)
                sys.exit(1)
        elif self.args.paramsPath:
            bundleDir = self.args.paramsPath
            if not os.path.exists(bundleDir):
                logging.error("Params path doesn't exist: %s" % bundleDir)
                sys.exit(1)

            chemistry = ReferenceUtils.loadCmpH5Chemistry(cmpH5Path)
            modelPath = os.path.join(bundleDir, chemistry + ".h5")

            # Work out whether the chemistry-matched model is usable.
            if chemistry == 'unknown':
                failure = "Chemistry cannot be identified---cannot perform kinetic analysis"
            elif not os.path.exists(modelPath):
                failure = "Aborting, no kinetics model available for this chemistry: %s" % modelPath
            else:
                failure = None

            if failure is not None:
                logging.error(failure)
                sys.exit(1)
            else:
                logging.info("Using Chemistry matched IPD model: %s" % modelPath)

        self.ipdModel = IpdModel(referenceContigs, modelPath, self.args.modelIters)
Beispiel #20
0
    def _mainLoop(self):
        """
        Main loop

        Load the alignment set, resolve the reference windows to visit, load
        the reference and the IpdModel and spawn the workers. Then, for each
        reference window:
        1. Chunk up the window using self.args.referenceStride
        2. Submit the chunk descriptions to the work queue
        Finally, wait for the workers and the result collector to finish.

        Returns 0 on completion.

        Raises Exception("Unrecognized contig!") when a requested reference
        window cannot be parsed and self.args.skipUnrecognizedContigs is not
        set.
        """

        # NOTE: the gc.disable() call below is currently commented out.
        # Rationale kept for reference: Python uses reference counting and
        # has a secondary, optional garbage collector for collecting garbage
        # cycles.  When a cyclic GC happens while a thread is calling
        # cPickle.dumps, the interpreter crashes sometimes.  See Bug 19704.
        # Since we don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        #gc.disable()

        self.loadSharedAlignmentSet(self.args.alignment_set)

        # Resolve the windows that will be visited.
        if self.args.referenceWindowsAsString is not None:
            self.referenceWindows = []
            for s in self.args.referenceWindowsAsString.split(","):
                try:
                    win = ReferenceUtils.parseReferenceWindow(s, self.alignments.referenceInfo)
                    self.referenceWindows.append(win)
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt and SystemExit are neither skipped over
                # nor rewrapped as an "Unrecognized contig!" error.
                except Exception:
                    if self.args.skipUnrecognizedContigs:
                        continue
                    else:
                        # Call form of raise: valid on Python 2 and 3,
                        # unlike the old "raise Exception, msg" statement.
                        raise Exception("Unrecognized contig!")
        elif self.args.referenceWindowsFromAlignment:
            self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment(self.alignments, self.alignments.referenceInfo)
            refNames = set([rw.refName for rw in self.referenceWindows])
            # limit output to contigs that overlap with reference windows
            self.refInfo = [r for r in self.refInfo if r.Name in refNames]
        else:
            self.referenceWindows = ReferenceUtils.createReferenceWindows(
                self.refInfo)

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference)

        # Spawn workers
        self._launchSlaveProcesses()

        logging.info('Generating kinetics summary for [%s]' % self.args.alignment_set)

        #self.referenceMap = self.alignments['/RefGroup'].asDict('RefInfoID', 'ID')
        #self.alnInfo = self.alignments['/AlnInfo'].asRecArray()

        # Every chunk is tagged with a running counter when it is queued.
        self.workChunkCounter = 0

        # Iterate over references
        for window in self.referenceWindows:
            logging.info('Processing window/contig: %s' % (window,))
            for chunk in ReferenceUtils.enumerateChunks(self.args.referenceStride, window):
                self._workQueue.put((self.workChunkCounter, chunk))
                self.workChunkCounter += 1

        # Shutdown worker threads with None sentinels, one per worker.
        # range() is used because xrange() only exists on Python 2.
        for _ in range(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        self.alignments.close()
        return 0
Beispiel #21
0
    def loadReferenceAndModel(self, referencePath, cmpH5Path):
        """
        Load the reference contigs, select the reference info rows to report
        on, and build the IpdModel kept on self.ipdModel.

        self.refInfo becomes the full reference info table from the cmp.h5,
        or only the rows whose FullName, Name or RefInfoID was requested via
        self.options.refContigs (comma-separated) and/or
        self.options.refContigIndex (-1 meaning "not given").

        The model file handed to IpdModel is chosen in this order:
        1. the explicit path in self.args.ipdModel;
        2. "<chemistry>.h5" inside self.args.paramsPath, where the chemistry
           comes from ReferenceUtils.loadCmpH5Chemistry(cmpH5Path);
        3. None (the built-in model, per the log messages below), also used
           when the chemistry is 'unknown' or its model file is absent.

        A missing explicit model file or a missing params path is logged as
        an error and followed by sys.exit(1).
        """
        # The contigs are annotated with their refID from the cmp.h5.
        referenceContigs = ReferenceUtils.loadReferenceContigs(referencePath, cmpH5Path)

        # Reference info table from the cmp.h5; the second table is unused.
        (refInfoTable, _) = ReferenceUtils.loadCmpH5Tables(cmpH5Path)

        byName = self.options.refContigs is not None
        byIndex = self.options.refContigIndex != -1

        if byName or byIndex:
            # Collect every identifier the caller asked for.
            requestedIds = set()
            if byName:
                requestedIds.update(self.options.refContigs.split(','))
            if byIndex:
                requestedIds.add(self.options.refContigIndex)

            relevantContigs = []
            for (row, rec) in enumerate(refInfoTable):
                if (rec.FullName in requestedIds
                        or rec.Name in requestedIds
                        or rec.RefInfoID in requestedIds):
                    relevantContigs.append(row)
            self.refInfo = refInfoTable[relevantContigs]
        else:
            self.refInfo = refInfoTable

        # Stays None (built-in model) unless one of the options below
        # yields a usable model file.
        modelPath = None

        if self.args.ipdModel:
            modelPath = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % modelPath)
            if not os.path.exists(modelPath):
                logging.error("Couldn't find model file: %s" % modelPath)
                sys.exit(1)
        elif self.args.paramsPath:
            bundleDir = self.args.paramsPath
            if not os.path.exists(bundleDir):
                logging.error("Params path doesn't exist: %s" % bundleDir)
                sys.exit(1)

            chemistry = ReferenceUtils.loadCmpH5Chemistry(cmpH5Path)
            candidate = os.path.join(bundleDir, chemistry + ".h5")

            if chemistry != 'unknown' and os.path.exists(candidate):
                logging.info("Using Chemistry matched IPD model: %s" % candidate)
                modelPath = candidate
            elif chemistry == 'unknown':
                logging.warning("Chemistry is unknown. Falling back to built-in model")
            else:
                logging.warning("Model not found: %s" % candidate)
                logging.warning("Falling back to built-in model")

        self.ipdModel = IpdModel(referenceContigs, modelPath, self.args.modelIters)