def _computePositionTraditionalControl(self, caseObservations, controlObservations, methylFractionFlag, identifyFlag, testProcedure=_tTest):
        """Summarize the IPDs observed at one template position/strand by comparing case against control.

        Parameters:
            caseObservations: record for the case sample; read here for
                'strand', 'tpl' and the IPD array at ['data']['ipd'].
            controlObservations: record for the control sample; only its
                ['data']['ipd'] array is read.
            methylFractionFlag: when true, sites passing the p-value threshold
                also get the FRAC / FRAClow / FRACup entries.
            identifyFlag: when true, the fraction entries are never added here.
            testProcedure: callable taking (caseIpds, controlIpds) and returning
                a mapping with 'testStatistic' and 'pvalue'.

        Returns:
            dict with the reference ids, flipped strand, template position,
            base, coverage figures, mean/median/std of both samples, the
            trimmed-mean 'ipdRatio', 'testStatistic', 'pvalue', 'score', and
            optionally the FRAC / FRAClow / FRACup entries.
        """
        caseIpds = caseObservations['data']['ipd']
        controlIpds = controlObservations['data']['ipd']

        res = dict()
        res['refId'] = self.refId
        res['refName'] = self.refName  # FASTA header name

        # The reported strand is the flip of the strand stored on the observation.
        strand = 1 - caseObservations['strand']
        tpl = caseObservations['tpl']
        res['strand'] = strand
        res['tpl'] = tpl
        res['base'] = self.cognateBaseFunc(tpl, strand)

        # A single coverage annotation is needed: use the rounded average of the two samples.
        nCase = caseIpds.size
        nControl = controlIpds.size
        res['coverage'] = int(round((nCase + nControl) / 2.0))
        res['caseCoverage'] = nCase
        res['controlCoverage'] = nControl

        # Plain (untrimmed) summary statistics, case first, then control.
        res['caseMean'] = caseIpds.mean().item()
        res['caseMedian'] = np.median(caseIpds).item()
        res['caseStd'] = np.std(caseIpds).item()

        res['controlMean'] = controlIpds.mean().item()
        res['controlMedian'] = np.median(controlIpds).item()
        res['controlStd'] = np.std(controlIpds).item()

        # IPD ratio from trimmed means; a (near-)zero control mean falls back to a neutral 1.0.
        trimLimits = (0.001, 0.03)
        trimmedControl = mstats.trimmed_mean(controlIpds, trimLimits).item()
        res['ipdRatio'] = (mstats.trimmed_mean(caseIpds, trimLimits).item() / trimmedControl
                           if abs(trimmedControl) > 1e-3 else 1.0)

        outcome = testProcedure(caseIpds, controlIpds)
        res['testStatistic'] = outcome['testStatistic']
        res['pvalue'] = outcome['pvalue']

        # Floor the p-value at the smallest positive float so log10 is always defined.
        pvalue = max(sys.float_info.min, res['pvalue'])
        res['score'] = round(-10.0 * math.log10(pvalue))

        # Fraction estimation only happens in detection mode, for sites under the p-value threshold.
        if not (methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag):
            return res

        enoughCoverage = (res['controlCoverage'] > self.options.methylMinCov and
                          res['caseCoverage'] > self.options.methylMinCov)
        if not enoughCoverage:
            res[FRAC] = np.nan
            res[FRACup] = np.nan
            res[FRAClow] = np.nan
            return res

        # Instantiate mixture estimation methods; the control mean plays the role of the unmodified IPD.
        mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post, self.ipdModel.gbmModel.pre, res, self.options.methylMinCov)
        estimate = mixture.detectionMixModelBootstrap(res['controlMean'], caseIpds)

        res[FRAC] = estimate[0]
        res[FRAClow] = estimate[1]
        res[FRACup] = estimate[2]
        return res
    def _computePositionSyntheticControl(self, caseObservations, capValue, methylFractionFlag, identifyFlag, modelPrediction=None):
        """Summarize the observed ipds at one template position/strand, using the synthetic ipd model.

        Parameters:
            caseObservations: record providing 'strand', 'tpl' and the IPD
                array at ['data']['ipd'].
            capValue: global IPD cap; the effective cap is the largest of this
                value, 4x the model prediction and a local percentile.
            methylFractionFlag: when true, sites passing the p-value threshold
                also get the FRAC / FRAClow / FRACup entries.
            identifyFlag: when true, the fraction entries are never added here.
            modelPrediction: predicted IPD for the site; looked up through
                self.meanIpdFunc when None.

        Returns:
            dict with 'refId', 'refName', 'strand', 'tpl', 'coverage',
            'modelPrediction', 'base', 'rawData' (the uncapped IPD array),
            'tMean', 'tErr', 'ipdRatio', 'modification', 'tStatistic',
            'pvalue', 'score', and optionally FRAC / FRAClow / FRACup.
        """
        # Compute stats on the observed ipds
        d = caseObservations['data']['ipd']
        res = dict()

        # ref00000x name
        res['refId'] = self.refId

        # FASTA header name
        res['refName'] = self.refName

        # NOTE -- this is where the strand flipping occurs -- make sure to reproduce this in all the calling methods
        strand = res['strand'] = 1 - caseObservations['strand']
        tpl = res['tpl'] = caseObservations['tpl']
        res['coverage'] = d.size

        # Untrimmed mean / median / std are deliberately not computed here.

        # Compute the predicted IPD from the model unless the caller supplied one.
        # NOTE! The ipd model is in the observed read strand
        if modelPrediction is None:
            modelPrediction = self.meanIpdFunc(tpl, strand).item()
        res['modelPrediction'] = modelPrediction

        res['base'] = self.cognateBaseFunc(tpl, strand)

        # Store the uncapped IPDs in case of methylated fraction estimation:
        res['rawData'] = d

        # Hybrid capping approach -- cap at the highest of
        #  - 4x the model prediction
        #  - 90th percentile of the local data (at low coverage we pick a lower percentile to ensure
        #    we trim the highest datapoint)
        #  - global cap value
        # The denominator is clamped to 1 so that a site with a single observation gets percentile 0
        # instead of raising ZeroDivisionError; sites with two or more observations are unaffected.
        percentile = min(90, (1.0 - 1.0 / max(d.size - 1, 1)) * 100)
        localPercentile = np.percentile(d, percentile)
        capValue = max(capValue, 4.0 * modelPrediction, localPercentile)

        # np.minimum without out= builds a new array, so res['rawData'] keeps the uncapped values.
        d = np.minimum(d, capValue)

        # Trimmed stats
        res['tMean'] = d.mean().item()
        res['tErr'] = np.std(d).item() / sqrt(d.size)

        res['ipdRatio'] = res['tMean'] / res['modelPrediction']

        # Don't know the modification yet
        res["modification"] = "."

        # use ttest-based pvalue
        res['tStatistic'] = self.computeObservationTstatistic(res)
        res['pvalue'] = self.computeObservationPValueTTest(res)

        # Floor the p-value at the smallest positive float so log10 is always defined.
        pvalue = max(sys.float_info.min, res['pvalue'])
        score = round(-10.0 * math.log10(pvalue))
        res['score'] = score

        # If the methylFractionFlag is set, then estimate fraction using just modelPrediction in the detection case.
        if methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag:
            if res['coverage'] > self.options.methylMinCov:
                # NOTE(review): the prediction is looked up again here, so a caller-supplied
                # modelPrediction is not the one handed to the mixture estimator -- confirm this is intended.
                modelPrediction = self.meanIpdFunc(tpl, strand).item()

                # Instantiate mixture estimation methods:
                mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post, self.ipdModel.gbmModel.pre, res, self.options.methylMinCov)
                # d is the capped IPD array at this point.
                x = mixture.detectionMixModelBootstrap(modelPrediction, d)

                res[FRAC] = x[0]
                res[FRAClow] = x[1]
                res[FRACup] = x[2]
            else:
                res[FRAC] = np.nan
                res[FRACup] = np.nan
                res[FRAClow] = np.nan

        return res
Example #3
0
    def _computePositionTraditionalControl(self,
                                           caseObservations,
                                           controlObservations,
                                           capValue,
                                           controlCapValue,
                                           methylFractionFlag,
                                           identifyFlag,
                                           testProcedure=_tTest):
        """Summarize the observed ipds at one template position/strand, using a case-control analysis.

        Both IPD arrays are capped before any statistic is computed, more or
        less as it is done in _computePositionSyntheticControl, except that
        4x the sample median is used in place of 4x the model prediction.

        Parameters:
            caseObservations: record for the case sample; read here for
                'strand', 'tpl' and the IPD array at ['data']['ipd'].
            controlObservations: record for the control sample; only its
                ['data']['ipd'] array is read.
            capValue: global IPD cap applied to the case data.
            controlCapValue: global IPD cap applied to the control data.
            methylFractionFlag: when true, sites passing the p-value threshold
                also get the FRAC / FRAClow / FRACup entries.
            identifyFlag: when true, the fraction entries are never added here.
            testProcedure: callable taking (caseData, controlData) and returning
                a mapping with 'testStatistic' and 'pvalue'.

        Returns:
            dict with the reference ids, flipped strand, template position,
            base, coverage figures, mean/median/std of both (capped) samples,
            the trimmed-mean 'ipdRatio', 'testStatistic', 'pvalue', 'score',
            and optionally the FRAC / FRAClow / FRACup entries.
        """

        def capIpds(ipds, globalCap):
            # Cap at the highest of the global cap, 4x the median, and a local percentile
            # (90th, or lower at low coverage).  The denominator is clamped to 1 so a
            # single observation gives percentile 0 instead of raising ZeroDivisionError;
            # samples with two or more observations are unaffected.
            percentile = min(90, (1.0 - 1.0 / max(ipds.size - 1, 1)) * 100)
            localPercentile = np.percentile(ipds, percentile)
            cap = max(globalCap, 4.0 * np.median(ipds).item(), localPercentile)
            return np.minimum(ipds, cap)

        # Cap both the native and control data before computing any stats
        caseData = capIpds(caseObservations['data']['ipd'], capValue)
        controlData = capIpds(controlObservations['data']['ipd'],
                              controlCapValue)

        res = dict()
        res['refId'] = self.refId

        # FASTA header name
        res['refName'] = self.refName

        # The reported strand is the flip of the strand stored on the observation.
        strand = res['strand'] = 1 - caseObservations['strand']
        tpl = res['tpl'] = caseObservations['tpl']
        res['base'] = self.cognateBaseFunc(tpl, strand)

        # need a coverage annotation: use the rounded average of the two samples
        res['coverage'] = int(round((caseData.size + controlData.size) / 2.0))

        res['caseCoverage'] = caseData.size
        res['controlCoverage'] = controlData.size

        res['caseMean'] = caseData.mean().item()
        res['caseMedian'] = np.median(caseData).item()
        res['caseStd'] = np.std(caseData).item()

        res['controlMean'] = controlData.mean().item()
        res['controlMedian'] = np.median(controlData).item()
        res['controlStd'] = np.std(controlData).item()

        # IPD ratio from trimmed means; a (near-)zero control mean falls back to a neutral 1.0.
        trim = (0.001, 0.03)
        ctrlMean = mstats.trimmed_mean(controlData, trim).item()
        if abs(ctrlMean) > 1e-3:
            res['ipdRatio'] = (mstats.trimmed_mean(caseData, trim).item() /
                               ctrlMean)
        else:
            res['ipdRatio'] = 1.0

        testResults = testProcedure(caseData, controlData)
        res['testStatistic'] = testResults['testStatistic']
        res['pvalue'] = testResults['pvalue']

        # Floor the p-value at the smallest positive float so log10 is always defined.
        pvalue = max(sys.float_info.min, res['pvalue'])
        res['score'] = round(-10.0 * math.log10(pvalue))

        # If the methylFractionFlag is set, then estimate fraction using the control mean in the detection case.
        if methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag:
            if res['controlCoverage'] > self.options.methylMinCov and res[
                    'caseCoverage'] > self.options.methylMinCov:
                # Instantiate mixture estimation methods:
                mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post,
                                                   self.ipdModel.gbmModel.pre,
                                                   res,
                                                   self.options.methylMinCov)
                x = mixture.detectionMixModelBootstrap(res['controlMean'],
                                                       caseData)
                res[FRAC] = x[0]
                res[FRAClow] = x[1]
                res[FRACup] = x[2]
            else:
                res[FRAC] = np.nan
                res[FRACup] = np.nan
                res[FRAClow] = np.nan

        return res
Example #4
0
    def _computePositionSyntheticControl(self,
                                         caseObservations,
                                         capValue,
                                         methylFractionFlag,
                                         identifyFlag,
                                         modelPrediction=None):
        """Summarize the observed ipds at one template position/strand, using the synthetic ipd model.

        Parameters:
            caseObservations: record providing 'strand', 'tpl' and the IPD
                array at ['data']['ipd'].
            capValue: global IPD cap; the effective cap is the largest of this
                value, 4x the model prediction and a local percentile.
            methylFractionFlag: when true, sites passing the p-value threshold
                also get the FRAC / FRAClow / FRACup entries.
            identifyFlag: when true, the fraction entries are never added here.
            modelPrediction: predicted IPD for the site; looked up through
                self.meanIpdFunc when None.

        Returns:
            dict with 'refId', 'refName', 'strand', 'tpl', 'coverage',
            'modelPrediction', 'base', 'rawData' (the uncapped IPD array),
            'tMean', 'tErr', 'ipdRatio', 'modification', 'tStatistic',
            'pvalue', 'score', and optionally FRAC / FRAClow / FRACup.
        """

        # Compute stats on the observed ipds
        d = caseObservations['data']['ipd']
        res = dict()

        # ref00000x name
        res['refId'] = self.refId

        # FASTA header name
        res['refName'] = self.refName

        # NOTE -- this is where the strand flipping occurs -- make sure to reproduce this in all the calling methods
        strand = res['strand'] = 1 - caseObservations['strand']
        tpl = res['tpl'] = caseObservations['tpl']
        res['coverage'] = d.size

        # Untrimmed mean / median / std are deliberately not computed here.

        # Compute the predicted IPD from the model unless the caller supplied one.
        # NOTE! The ipd model is in the observed read strand
        if modelPrediction is None:
            modelPrediction = self.meanIpdFunc(tpl, strand).item()
        res['modelPrediction'] = modelPrediction

        res['base'] = self.cognateBaseFunc(tpl, strand)

        # Store the uncapped IPDs in case of methylated fraction estimation:
        res['rawData'] = d

        # Hybrid capping approach -- cap at the highest of
        #  - 4x the model prediction
        #  - 90th percentile of the local data (at low coverage we pick a lower percentile to ensure
        #    we trim the highest datapoint)
        #  - global cap value
        # The denominator is clamped to 1 so that a site with a single observation gets percentile 0
        # instead of raising ZeroDivisionError; sites with two or more observations are unaffected.
        denominator = max(d.size - 1, 1)
        percentile = min(90, (1.0 - 1.0 / denominator) * 100)
        localPercentile = np.percentile(d, percentile)
        capValue = max(capValue, 4.0 * modelPrediction, localPercentile)

        # np.minimum without out= builds a new array, so res['rawData'] keeps the uncapped values.
        d = np.minimum(d, capValue)

        # Trimmed stats
        res['tMean'] = d.mean().item()
        res['tErr'] = np.std(d).item() / sqrt(d.size)

        res['ipdRatio'] = res['tMean'] / res['modelPrediction']

        # Don't know the modification yet
        res["modification"] = "."

        # use ttest-based pvalue
        res['tStatistic'] = self.computeObservationTstatistic(res)
        res['pvalue'] = self.computeObservationPValueTTest(res)

        # Floor the p-value at the smallest positive float so log10 is always defined.
        pvalue = max(sys.float_info.min, res['pvalue'])
        score = round(-10.0 * math.log10(pvalue))
        res['score'] = score

        # If the methylFractionFlag is set, then estimate fraction using just modelPrediction in the detection case.
        if methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag:
            if res['coverage'] > self.options.methylMinCov:
                # NOTE(review): the prediction is looked up again here, so a caller-supplied
                # modelPrediction is not the one handed to the mixture estimator -- confirm this is intended.
                modelPrediction = self.meanIpdFunc(tpl, strand).item()

                # Instantiate mixture estimation methods:
                mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post,
                                                   self.ipdModel.gbmModel.pre,
                                                   res,
                                                   self.options.methylMinCov)
                # d is the capped IPD array at this point.
                x = mixture.detectionMixModelBootstrap(modelPrediction, d)

                res[FRAC] = x[0]
                res[FRAClow] = x[1]
                res[FRACup] = x[2]
            else:
                res[FRAC] = np.nan
                res[FRACup] = np.nan
                res[FRAClow] = np.nan

        return res
Example #5
0
    def scoreMods(self, modCalls):
        """
        For each modification in the best scoring configuration, score a config excluding the current mod
        against the winning config, and use this value as the Qmod for the deleted modification.

        Parameters:
            modCalls: mapping from sequence position to the modified-base
                character called at that position.

        Returns:
            dict mapping each position to a dict with 'modification', 'QMod',
            'LLR' and 'Mask'.  When self.methylFractionFlag is set and the
            position is present in self.rawKinetics with coverage above
            self.methylMinCov, the FRAC / FRAClow / FRACup entries are
            included as well.
        """

        qvModCalls = dict()

        modSeq = a.array('c')
        modSeq.fromstring(self.sequence)

        # Apply the found modifications to the raw sequence
        for (pos, call) in modCalls.items():
            modSeq[pos] = call

        for (pos, call) in modCalls.items():
            # Window of positions affected by this mod
            start = pos - self.post
            end = pos + self.pre

            # Score the modified template at all positions affected by this mod
            modScore = self.scoreRegion(start, end, modSeq)
            modScores = self.getRegionScores(start, end, modSeq)

            # Decide once whether this site gets a methylated-fraction estimate.
            # Assumes the scoring helpers below do not alter self.rawKinetics or
            # self.methylMinCov, so the answer is the same for the whole iteration.
            estimateFraction = (
                self.methylFractionFlag and pos in self.rawKinetics and
                self.rawKinetics[pos]["coverage"] > self.methylMinCov)

            if estimateFraction:
                modifiedMeanVectors = self.getContextMeans(start, end, modSeq)

            # Switch back to the unmodified base and re-score
            modSeq[pos] = canonicalBaseMap[call]
            noModScore = self.scoreRegion(start, end, modSeq)
            noModScores = self.getRegionScores(start, end, modSeq)

            if estimateFraction:
                unModifiedMeanVectors = self.getContextMeans(
                    start, end, modSeq)

            # Put back the modified base
            modSeq[pos] = call

            # Compute score difference
            llr = modScore - noModScore

            # Convert from LLR to phred-scaled probability of modification
            qModScore = 10 * llr * log10e + 10 * log1p(exp(-llr)) * log10e

            # Figure out which secondary peaks were likely generated by this modification
            # What is the posterior that the peak was generated by this mod?
            maskPos = self.findMaskPositions(pos, modScores, noModScores)

            # FIXME:  Without this, currently, the identificationQv score is too low for many Ca5C sites
            # if self.useLDA:
            #     if pos in self.rawKinetics:
            #         if 'Ca5C' in self.rawKinetics[pos]:
            #             llr = -self.rawKinetics[pos]['Ca5C']
            #             qModScore = 100 * llr * log10e + 100*log1p(exp(-llr))*log10e

            modName = modNames[call]

            # Store the full results
            siteResult = {
                'modification': modName,
                'QMod': qModScore,
                'LLR': llr,
                'Mask': maskPos
            }

            if estimateFraction:
                # Instantiate mixture estimation methods:
                mixture = MixtureEstimationMethods(self.gbmModel.post,
                                                   self.gbmModel.pre,
                                                   self.rawKinetics,
                                                   self.methylMinCov)

                # Use modifiedMeanVectors and unModifiedMeanVectors to calculate mixing proportion, and 95% CI limits.
                methylFracEst, methylFracLow, methylFracUpp = mixture.estimateMethylatedFractions(
                    pos, unModifiedMeanVectors, modifiedMeanVectors,
                    ModificationPeakMask[modName])

                siteResult[FRAC] = methylFracEst
                siteResult[FRAClow] = methylFracLow
                siteResult[FRACup] = methylFracUpp

            qvModCalls[pos] = siteResult

        return qvModCalls
    def scoreMods(self, modCalls):
        """
        For each modification in the best scoring configuration, score a config excluding the current mod
        against the winning config, and use this value as the Qmod for the deleted modification.

        Parameters:
            modCalls: mapping from sequence position to the modified-base
                character called at that position.

        Returns:
            dict mapping each position to a dict with 'modification', 'QMod',
            'LLR' and 'Mask'.  When self.methylFractionFlag is set and the
            position is present in self.rawKinetics with coverage above
            self.methylMinCov, the FRAC / FRAClow / FRACup entries are
            included as well.
        """

        qvModCalls = dict()

        modSeq = a.array('c')
        modSeq.fromstring(self.sequence)

        # Apply the found modifications to the raw sequence
        for (pos, call) in modCalls.items():
            modSeq[pos] = call

        for (pos, call) in modCalls.items():
            # Window of positions affected by this mod
            regionStart = pos - self.post
            regionEnd = pos + self.pre

            # Score the modified template at all positions affected by this mod
            modScore = self.scoreRegion(regionStart, regionEnd, modSeq)
            modScores = self.getRegionScores(regionStart, regionEnd, modSeq)

            # Decide once whether this site gets a methylated-fraction estimate.
            # Assumes the scoring helpers below do not alter self.rawKinetics or
            # self.methylMinCov, so the answer is the same for the whole iteration.
            wantFraction = (self.methylFractionFlag and pos in self.rawKinetics and
                            self.rawKinetics[pos]["coverage"] > self.methylMinCov)

            if wantFraction:
                modifiedMeanVectors = self.getContextMeans(regionStart, regionEnd, modSeq)

            # Switch back to the unmodified base and re-score
            modSeq[pos] = canonicalBaseMap[call]
            noModScore = self.scoreRegion(regionStart, regionEnd, modSeq)
            noModScores = self.getRegionScores(regionStart, regionEnd, modSeq)

            if wantFraction:
                unModifiedMeanVectors = self.getContextMeans(regionStart, regionEnd, modSeq)

            # Put back the modified base
            modSeq[pos] = call

            # Compute score difference
            llr = modScore - noModScore

            # Convert from LLR to phred-scaled probability of modification
            qModScore = 10 * llr * log10e + 10 * log1p(exp(-llr)) * log10e

            # Figure out which secondary peaks were likely generated by this modification
            # What is the posterior that the peak was generated by this mod?
            maskPos = self.findMaskPositions(pos, modScores, noModScores)

            # FIXME:  Without this, currently, the identificationQv score is too low for many Ca5C sites
            # if self.useLDA:
            #     if pos in self.rawKinetics:
            #         if 'Ca5C' in self.rawKinetics[pos]:
            #             llr = -self.rawKinetics[pos]['Ca5C']
            #             qModScore = 100 * llr * log10e + 100*log1p(exp(-llr))*log10e

            modName = modNames[call]

            # Store the full results
            record = {'modification': modName, 'QMod': qModScore, 'LLR': llr, 'Mask': maskPos}

            if wantFraction:
                # Instantiate mixture estimation methods:
                mixture = MixtureEstimationMethods(self.gbmModel.post, self.gbmModel.pre, self.rawKinetics, self.methylMinCov)

                # Use modifiedMeanVectors and unModifiedMeanVectors to calculate mixing proportion, and 95% CI limits.
                methylFracEst, methylFracLow, methylFracUpp = mixture.estimateMethylatedFractions(
                    pos, unModifiedMeanVectors, modifiedMeanVectors, ModificationPeakMask[modName])

                record[FRAC] = methylFracEst
                record[FRAClow] = methylFracLow
                record[FRACup] = methylFracUpp

            qvModCalls[pos] = record

        return qvModCalls