예제 #1
0
    def _computePositionTraditionalControl(self, caseObservations, controlObservations, methylFractionFlag, identifyFlag, testProcedure=_tTest):
        """Compare the case and control IPDs observed at one template position/strand.

        Parameters:
            caseObservations: mapping providing ['data']['ipd'] (array of case
                IPDs) plus 'strand' and 'tpl' entries.
            controlObservations: mapping providing ['data']['ipd'] (array of
                control IPDs).
            methylFractionFlag: when truthy, a modified-fraction estimate is
                attempted for sites whose p-value is below self.options.pvalue.
            identifyFlag: when truthy, the modified-fraction estimate is skipped.
            testProcedure: callable invoked as testProcedure(caseIpds, controlIpds);
                must return a mapping with 'testStatistic' and 'pvalue'.

        Returns:
            dict with reference/site identifiers, coverage figures, per-sample
            mean/median/std, 'ipdRatio', 'testStatistic', 'pvalue', 'score' and,
            when the fraction estimate is requested, the FRAC / FRAClow / FRACup
            entries (NaN when either sample's coverage does not exceed
            self.options.methylMinCov).
        """
        caseIpds = caseObservations['data']['ipd']
        controlIpds = controlObservations['data']['ipd']

        # The reported strand is the flip of the strand stored on the observation.
        strand = 1 - caseObservations['strand']
        tpl = caseObservations['tpl']

        res = {
            'refId': self.refId,
            'refName': self.refName,  # FASTA header name
            'strand': strand,
            'tpl': tpl,
            'base': self.cognateBaseFunc(tpl, strand),
            # A single coverage figure is needed downstream: use the rounded
            # average of the two sample sizes.
            'coverage': int(round((caseIpds.size + controlIpds.size) / 2.0)),
            'caseCoverage': caseIpds.size,
            'controlCoverage': controlIpds.size,
        }

        # Plain (untrimmed) descriptive statistics for each sample.
        res.update({
            'caseMean': caseIpds.mean().item(),
            'caseMedian': np.median(caseIpds).item(),
            'caseStd': np.std(caseIpds).item(),
        })
        res.update({
            'controlMean': controlIpds.mean().item(),
            'controlMedian': np.median(controlIpds).item(),
            'controlStd': np.std(controlIpds).item(),
        })

        # IPD ratio of trimmed means; fall back to 1.0 when the control's
        # trimmed mean is too close to zero to divide by.
        trimLimits = (0.001, 0.03)
        controlTrimmedMean = mstats.trimmed_mean(controlIpds, trimLimits).item()
        res['ipdRatio'] = (
            mstats.trimmed_mean(caseIpds, trimLimits).item() / controlTrimmedMean
            if abs(controlTrimmedMean) > 1e-3
            else 1.0
        )

        outcome = testProcedure(caseIpds, controlIpds)
        res['testStatistic'] = outcome['testStatistic']
        res['pvalue'] = outcome['pvalue']

        # Floor the p-value at the smallest positive float so log10 stays finite.
        # NOTE(review): a NaN p-value also ends up at this floor (max() keeps its
        # first argument when the comparison is False) -- confirm that is intended.
        pvalue = max(sys.float_info.min, res['pvalue'])
        res['score'] = round(-10.0 * math.log10(pvalue))

        # The fraction estimate only applies in the detection case: flag set,
        # site significant, and not running identification.
        wantFraction = methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag
        if not wantFraction:
            return res

        minCov = self.options.methylMinCov
        if res['controlCoverage'] > minCov and res['caseCoverage'] > minCov:
            # Bootstrap mixture estimate using the control mean as the reference IPD.
            mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post, self.ipdModel.gbmModel.pre, res, self.options.methylMinCov)
            estimate = mixture.detectionMixModelBootstrap(res['controlMean'], caseIpds)
            res[FRAC], res[FRAClow], res[FRACup] = estimate[0], estimate[1], estimate[2]
        else:
            # Not enough coverage in one of the samples: report NaN placeholders.
            res[FRAC] = res[FRACup] = res[FRAClow] = np.nan

        return res
예제 #2
0
    def _computePositionTraditionalControl(self,
                                           caseObservations,
                                           controlObservations,
                                           capValue,
                                           controlCapValue,
                                           methylFractionFlag,
                                           identifyFlag,
                                           testProcedure=_tTest):
        """Summarize the observed ipds at one template position/strand, using a case-control analysis.

        Both samples are capped before any statistic is computed, so every
        reported value (means, medians, test result) refers to capped IPDs.

        Parameters:
            caseObservations: mapping providing ['data']['ipd'] (array of case
                IPDs) plus 'strand' and 'tpl' entries.
            controlObservations: mapping providing ['data']['ipd'] (array of
                control IPDs).
            capValue: global lower bound for the cap applied to the case IPDs.
            controlCapValue: global lower bound for the cap applied to the
                control IPDs.
            methylFractionFlag: when truthy, a modified-fraction estimate is
                attempted for sites whose p-value is below self.options.pvalue.
            identifyFlag: when truthy, the modified-fraction estimate is skipped.
            testProcedure: callable invoked as testProcedure(caseIpds, controlIpds);
                must return a mapping with 'testStatistic' and 'pvalue'.

        Returns:
            dict with reference/site identifiers, coverage figures, per-sample
            mean/median/std, 'ipdRatio', 'testStatistic', 'pvalue', 'score' and,
            when the fraction estimate is requested, the FRAC / FRAClow / FRACup
            entries (NaN when either sample's coverage does not exceed
            self.options.methylMinCov).
        """

        def capIpds(ipds, globalCap):
            # Hybrid cap: the largest of the global cap, 4x the sample median and
            # a local percentile (90th, or lower when there are few observations
            # so that the highest datapoint is still trimmed).
            # NOTE(review): a sample of size 1 makes the percentile expression
            # divide by zero; callers are presumably expected to filter such
            # sites out beforehand -- confirm.
            percentile = min(90, (1.0 - 1.0 / (ipds.size - 1)) * 100)
            localPercentile = np.percentile(ipds, percentile)
            cap = max(globalCap, 4.0 * np.median(ipds).item(), localPercentile)
            return np.minimum(ipds, cap)

        caseData = caseObservations['data']['ipd']
        controlData = controlObservations['data']['ipd']

        # Cap both the native and control data, more or less as it is done in
        # computePositionSyntheticControl. np.minimum returns new arrays, so the
        # arrays held by the observation mappings are left untouched.
        caseData = capIpds(caseData, capValue)
        controlData = capIpds(controlData, controlCapValue)

        res = dict()
        res['refId'] = self.refId

        # FASTA header name
        res['refName'] = self.refName

        # The reported strand is the flip of the strand stored on the observation.
        strand = res['strand'] = 1 - caseObservations['strand']
        tpl = res['tpl'] = caseObservations['tpl']
        res['base'] = self.cognateBaseFunc(tpl, strand)

        # A single coverage annotation is needed: rounded average of both samples.
        res['coverage'] = int(round((caseData.size + controlData.size) / 2.0))

        res['caseCoverage'] = caseData.size
        res['controlCoverage'] = controlData.size

        res['caseMean'] = caseData.mean().item()
        res['caseMedian'] = np.median(caseData).item()
        res['caseStd'] = np.std(caseData).item()

        res['controlMean'] = controlData.mean().item()
        res['controlMedian'] = np.median(controlData).item()
        res['controlStd'] = np.std(controlData).item()

        # IPD ratio of trimmed means; fall back to 1.0 when the control's
        # trimmed mean is too close to zero to divide by.
        trim = (0.001, 0.03)
        ctrlMean = mstats.trimmed_mean(controlData, trim).item()
        if abs(ctrlMean) > 1e-3:
            res['ipdRatio'] = (mstats.trimmed_mean(caseData, trim).item() /
                               ctrlMean)
        else:
            res['ipdRatio'] = 1.0

        testResults = testProcedure(caseData, controlData)
        res['testStatistic'] = testResults['testStatistic']
        res['pvalue'] = testResults['pvalue']

        # Floor the p-value at the smallest positive float so log10 stays finite.
        pvalue = max(sys.float_info.min, res['pvalue'])
        res['score'] = round(-10.0 * math.log10(pvalue))

        # If the methylFractionFlag is set, then estimate fraction using just modelPrediction in the detection case.
        if methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag:
            if res['controlCoverage'] > self.options.methylMinCov and res[
                    'caseCoverage'] > self.options.methylMinCov:
                # Instantiate mixture estimation methods:
                mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post,
                                                   self.ipdModel.gbmModel.pre,
                                                   res,
                                                   self.options.methylMinCov)
                x = mixture.detectionMixModelBootstrap(res['controlMean'],
                                                       caseData)
                res[FRAC] = x[0]
                res[FRAClow] = x[1]
                res[FRACup] = x[2]
            else:
                # Not enough coverage in one of the samples: NaN placeholders.
                res[FRAC] = np.nan
                res[FRACup] = np.nan
                res[FRAClow] = np.nan

        return res
예제 #3
0
    def _computePositionSyntheticControl(self, caseObservations, capValue, methylFractionFlag, identifyFlag, modelPrediction=None):
        """Score the IPDs observed at one template position/strand against the synthetic IPD model.

        Parameters:
            caseObservations: mapping providing ['data']['ipd'] (array of
                observed IPDs) plus 'strand' and 'tpl' entries.
            capValue: global lower bound for the cap applied to the IPDs.
            methylFractionFlag: when truthy, a modified-fraction estimate is
                attempted for sites whose p-value is below self.options.pvalue.
            identifyFlag: when truthy, the modified-fraction estimate is skipped.
            modelPrediction: predicted IPD for this site; when None it is
                obtained from self.meanIpdFunc(tpl, strand).

        Returns:
            dict with reference/site identifiers, 'coverage', 'modelPrediction',
            'base', 'rawData' (the uncapped IPD array), capped-data statistics
            ('tMean', 'tErr', 'ipdRatio'), 'modification', 'tStatistic',
            'pvalue', 'score' and, when the fraction estimate is requested, the
            FRAC / FRAClow / FRACup entries (NaN when coverage does not exceed
            self.options.methylMinCov).
        """
        ipds = caseObservations['data']['ipd']

        # NOTE: the strand is flipped here -- every calling method has to
        # reproduce this flip.
        strand = 1 - caseObservations['strand']
        tpl = caseObservations['tpl']

        res = {
            'refId': self.refId,      # ref00000x name
            'refName': self.refName,  # FASTA header name
            'strand': strand,
            'tpl': tpl,
            'coverage': ipds.size,
        }

        # Plain mean/median/std of the raw data are deliberately not reported.
        # The ipd model works on the observed read strand; it is only queried
        # when the caller did not supply a prediction.
        if modelPrediction is None:
            modelPrediction = self.meanIpdFunc(tpl, strand).item()
        res['modelPrediction'] = modelPrediction
        res['base'] = self.cognateBaseFunc(tpl, strand)

        # Keep the uncapped observations around for methylated-fraction estimation.
        res['rawData'] = ipds

        # Hybrid cap -- the largest of:
        #  - the global cap value
        #  - 4x the model prediction
        #  - a local percentile (90th, or lower at low coverage so that the
        #    highest datapoint is still trimmed)
        localPercentile = np.percentile(ipds, min(90, (1.0 - 1.0 / (ipds.size - 1)) * 100))
        capValue = max(capValue, 4.0 * modelPrediction, localPercentile)

        # Capping produces a new array, so res['rawData'] stays uncapped.
        cappedIpds = np.minimum(ipds, capValue)

        # Statistics of the capped data
        trimmedMean = cappedIpds.mean().item()
        res['tMean'] = trimmedMean
        res['tErr'] = np.std(cappedIpds).item() / sqrt(cappedIpds.size)
        res['ipdRatio'] = trimmedMean / modelPrediction

        # The modification type is not known at this point
        res["modification"] = "."

        # t-test based statistic and p-value, both derived from the fields above
        res['tStatistic'] = self.computeObservationTstatistic(res)
        res['pvalue'] = self.computeObservationPValueTTest(res)

        # Floor the p-value at the smallest positive float so log10 stays finite.
        pvalue = max(sys.float_info.min, res['pvalue'])
        res['score'] = round(-10.0 * math.log10(pvalue))

        # The fraction estimate only applies in the detection case: flag set,
        # site significant, and not running identification.
        wantFraction = methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag
        if not wantFraction:
            return res

        if res['coverage'] > self.options.methylMinCov:
            # The reference IPD for the mixture is always taken from the model
            # here, even when the caller supplied modelPrediction.
            modelPrediction = self.meanIpdFunc(tpl, strand).item()

            # Bootstrap mixture estimate on the capped IPDs
            mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post, self.ipdModel.gbmModel.pre, res, self.options.methylMinCov)
            estimate = mixture.detectionMixModelBootstrap(modelPrediction, cappedIpds)
            res[FRAC], res[FRAClow], res[FRACup] = estimate[0], estimate[1], estimate[2]
        else:
            # Not enough coverage: report NaN placeholders.
            res[FRAC] = res[FRACup] = res[FRAClow] = np.nan

        return res
예제 #4
0
    def _computePositionSyntheticControl(self,
                                         caseObservations,
                                         capValue,
                                         methylFractionFlag,
                                         identifyFlag,
                                         modelPrediction=None):
        """Summarize one template position/strand by comparing its IPDs with the synthetic model.

        Parameters:
            caseObservations: mapping providing ['data']['ipd'] (array of
                observed IPDs) plus 'strand' and 'tpl' entries.
            capValue: global lower bound for the cap applied to the IPDs.
            methylFractionFlag: when truthy, a modified-fraction estimate is
                attempted for sites whose p-value is below self.options.pvalue.
            identifyFlag: when truthy, the modified-fraction estimate is skipped.
            modelPrediction: predicted IPD for this site; when None it is
                obtained from self.meanIpdFunc(tpl, strand).

        Returns:
            dict with reference/site identifiers, 'coverage', 'modelPrediction',
            'base', 'rawData' (the uncapped IPD array), capped-data statistics
            ('tMean', 'tErr', 'ipdRatio'), 'modification', 'tStatistic',
            'pvalue', 'score' and, when the fraction estimate is requested, the
            FRAC / FRAClow / FRACup entries (NaN when coverage does not exceed
            self.options.methylMinCov).
        """
        observed = caseObservations['data']['ipd']

        # NOTE: this is where the strand gets flipped; all calling methods
        # must apply the same flip.
        flippedStrand = 1 - caseObservations['strand']
        position = caseObservations['tpl']

        res = {
            'refId': self.refId,      # ref00000x name
            'refName': self.refName,  # FASTA header name
            'strand': flippedStrand,
            'tpl': position,
            'coverage': observed.size,
        }

        # Raw mean/median/std are intentionally left out of the summary.
        # The ipd model is expressed on the observed read strand; ask it for a
        # prediction only if the caller did not pass one in.
        if modelPrediction is None:
            modelPrediction = self.meanIpdFunc(position, flippedStrand).item()
        res['modelPrediction'] = modelPrediction
        res['base'] = self.cognateBaseFunc(position, flippedStrand)

        # Uncapped observations, stored for methylated-fraction estimation.
        res['rawData'] = observed

        # Hybrid capping: use the highest of
        #  - the global cap value
        #  - 4x the model prediction
        #  - a percentile of the local data (90th, lowered at low coverage to
        #    make sure the highest datapoint is trimmed)
        pct = min(90, (1.0 - 1.0 / (observed.size - 1)) * 100)
        capValue = max(capValue,
                       4.0 * modelPrediction,
                       np.percentile(observed, pct))

        # np.minimum returns a fresh array, leaving res['rawData'] uncapped.
        capped = np.minimum(observed, capValue)

        # Statistics over the capped observations
        cappedMean = capped.mean().item()
        cappedStdErr = np.std(capped).item() / sqrt(capped.size)
        res['tMean'] = cappedMean
        res['tErr'] = cappedStdErr
        res['ipdRatio'] = cappedMean / modelPrediction

        # Modification type is still unknown at this stage
        res["modification"] = "."

        # t-test based statistic and p-value, computed from the summary so far
        res['tStatistic'] = self.computeObservationTstatistic(res)
        res['pvalue'] = self.computeObservationPValueTTest(res)

        # Keep the p-value strictly positive so that log10 is defined.
        pvalue = max(sys.float_info.min, res['pvalue'])
        res['score'] = round(-10.0 * math.log10(pvalue))

        # Fraction estimation is limited to the detection case: requested,
        # significant, and not in identification mode.
        if not (methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag):
            return res

        if res['coverage'] > self.options.methylMinCov:
            # The model is queried again here, so the mixture always uses the
            # model's own prediction as its reference IPD.
            modelPrediction = self.meanIpdFunc(position, flippedStrand).item()

            # Bootstrap mixture estimate on the capped observations
            mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post,
                                               self.ipdModel.gbmModel.pre,
                                               res,
                                               self.options.methylMinCov)
            bounds = mixture.detectionMixModelBootstrap(modelPrediction, capped)

            res[FRAC] = bounds[0]
            res[FRAClow] = bounds[1]
            res[FRACup] = bounds[2]
        else:
            # Coverage too low for an estimate: NaN placeholders.
            res[FRAC] = res[FRACup] = res[FRAClow] = np.nan

        return res