Example #1
0
    def onChunk(self, referenceWindow):
        """Compute the per-site results for one reference window.

        Args:
            referenceWindow: a ``(reference, start, end)`` tuple. ``start`` and
                ``end`` delimit the positions this call reports on; ``end`` is
                clipped to ``self.ipdModel.refLength(reference)``.

        Returns:
            A list of per-site result dicts sorted by their ``'tpl'`` entry.
            When ``self.options.identify`` is set, only sites with
            ``start <= tpl < end`` are returned, annotated with the properties
            of any decoded modification at that site and with
            ``'offTargetPeak'`` on positions covered by a modification's mask.

        Raises:
            Exception: any error raised by ``self._decodePositiveControl`` is
                re-raised after its traceback has been printed.
        """
        # Setup the object for a new window.
        self._prepForReferenceWindow(referenceWindow)

        # start and end are the windows of the reference that we are responsible for reporting data from.
        # We may elect to pull data from a wider window for use with positive control
        (reference, start, end) = referenceWindow

        # Trim end coordinate to length of current template
        end = min(end, self.ipdModel.refLength(reference))

        if not self.options.identify:
            result = self._summarizeReferenceRegion((start, end), self.options.methylFraction, self.options.identify)

            if self.options.useLDA and self.controlCmpH5 is None:
                # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
                lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, result)
                # Rebind `result` (the original assigned to a misspelled
                # `results`, discarding the enricher's return value).
                result = lda.callEnricherFunction(result)

            result.sort(key=lambda x: x['tpl'])
            return result

        # If we are attempting to identify modifications, get the raw data for a slightly expanded window
        # then do the decoding, then weave the modification results back into the main results
        padStart = start - self.pad
        padEnd = end + self.pad
        perSiteResults = self._summarizeReferenceRegion((padStart, padEnd), self.options.methylFraction, self.options.identify)

        if self.options.useLDA:
            # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
            # PositiveControlEnricher currently uses a logistic regression model trained using SMRTportal job 65203 (native E. coli)
            lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, perSiteResults)
            perSiteResults = lda.callEnricherFunction(perSiteResults)

        try:
            # Handle different modes of 'extra analysis' here
            mods = self._decodePositiveControl(perSiteResults, (start, end))
        except Exception:
            # Report the failure and propagate it. Continuing would only hit
            # a NameError on the unbound `mods` below and hide the real cause.
            traceback.print_exc()
            raise

        finalCalls = []

        # Weave together results
        for strand in (0, 1):
            strandSign = 1 if strand == 0 else -1

            siteDict = dict((x['tpl'], x) for x in perSiteResults if start <= x['tpl'] < end and x['strand'] == strand)
            modDict = dict((x['tpl'], x) for x in mods if start <= x['tpl'] < end and x['strand'] == strand)

            # Go through the modifications - add tags for identified mods to per-site stats
            # add a 'offTarget' tag to the off target peaks.
            for mod in modDict.values():
                tpl = mod['tpl']
                site = siteDict.get(tpl)

                # Only convert to positive control call if we actually have enough
                # coverage on the cognate base!
                if site is not None:
                    if self.options.methylFraction and FRAC in mod:
                        site[FRAC] = mod[FRAC]
                        site[FRAClow] = mod[FRAClow]
                        site[FRACup] = mod[FRACup]

                    # Copy any extra properties that were added
                    for key in mod:
                        if key not in site:
                            site[key] = mod[key]

                if 'Mask' in mod:
                    # The decoder should supply the off-target peak mask
                    mask = mod['Mask']
                    mask.append(0)  # make sure we always mask the cognate position
                else:
                    # If the decoder doesn't supply a mask - use a hard-coded version
                    # FIXME - this branch is deprecated
                    mask = ModificationPeakMask[mod['modification']]

                # Mask out neighbor peaks that may have been caused by this mod
                for offset in mask:
                    shadowSite = siteDict.get(tpl + strandSign * offset)
                    if shadowSite is not None:
                        shadowSite['offTargetPeak'] = True

            finalCalls.extend(siteDict.values())

        # Sort by template position
        finalCalls.sort(key=lambda x: x['tpl'])
        return finalCalls
    def onChunk(self, referenceWindow):
        """Compute the per-site results for one reference window.

        Sets ``self.meanIpdFunc``, ``self.cognateBaseFunc``, ``self.refId``
        and ``self.sequence`` for the window's reference before computing.

        Args:
            referenceWindow: ``(reference, smId, start, end)`` when
                ``self.options.smBaseMod`` is set, otherwise
                ``(reference, start, end)``. ``start`` and ``end`` delimit the
                positions this call reports on.

        Returns:
            A list of per-site result dicts. With ``self.options.identify``
            set, only sites with ``start <= tpl < end`` are returned, sorted
            by ``'tpl'`` and annotated with ``'modificationScore'``,
            ``'modification'`` and ``'offTargetPeak'`` where applicable.
            Otherwise the summary is returned sorted by ``'tpl'``, except in
            ``smBaseMod`` mode where it is returned in the order produced by
            ``self._summarizeMolecule``.
        """
        # start and end are the windows of the reference that we are responsible for reporting data from.
        # We may elect to pull data from a wider window for use with positive control
        if self.options.smBaseMod:
            (reference, smId, start, end) = referenceWindow
        else:
            (reference, start, end) = referenceWindow

        # NOTE(review): targetBounds is captured BEFORE `end` is trimmed, so
        # the non-identify path below summarizes the untrimmed window. Kept
        # as-is; confirm whether the trimmed end was intended here.
        targetBounds = (start, end)

        # Trim end coordinate to length of current template
        end = min(end, self.ipdModel.refLength(reference))

        # Each chunk is from a single reference -- fire up meanIpd func on the current reference
        self.meanIpdFunc = self.ipdModel.predictIpdFunc(reference)

        # Get the cognate base at a given position
        self.cognateBaseFunc = self.ipdModel.cognateBaseFunc(reference)

        self.refId = reference

        self.sequence = self.ipdModel.getReferenceWindow(self.refId, 0, start, end)

        # Compute the data for this chunk

        if not self.options.identify:
            if self.options.smBaseMod:
                result = self._summarizeMolecule(smId, targetBounds, self.options.methylFraction, self.options.identify)
            else:
                result = self._summarizeReferenceRegion(targetBounds, self.options.methylFraction, self.options.identify)

            if self.options.useLDA and self.controlCmpH5 is None:
                # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
                lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, result)
                # Rebind `result` (the original assigned to a misspelled
                # `results`, discarding the enricher's return value).
                result = lda.callEnricherFunction(result)

            # In smBaseMod mode the summary is returned without sorting.
            if not self.options.smBaseMod:
                result.sort(key=lambda x: x['tpl'])

            return result

        # If we are attempting to identify modifications, get the raw data for a slightly expanded window
        # then do the decoding, then weave the modification results back into the main results
        pad = 8  # number of extra positions fetched on each side of the window
        padStart = start - pad
        padEnd = end + pad
        perSiteResults = self._summarizeReferenceRegion((padStart, padEnd), self.options.methylFraction, self.options.identify)

        if self.options.useLDA:
            # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
            # PositiveControlEnricher currently uses a logistic regression model trained using SMRTportal job 65203 (native E. coli)
            lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, perSiteResults)
            perSiteResults = lda.callEnricherFunction(perSiteResults)

        mods = self._decodePositiveControl(perSiteResults, (start, end))

        finalCalls = []

        # Weave together results
        for strand in (0, 1):
            strandSign = 1 if strand == 0 else -1

            siteDict = dict((x['tpl'], x) for x in perSiteResults if start <= x['tpl'] < end and x['strand'] == strand)
            modDict = dict((x['tpl'], x) for x in mods if start <= x['tpl'] < end and x['strand'] == strand)

            # Go through the modifications - add tags for identified mods to per-site stats
            # add a 'offTarget' tag to the off target peaks.
            for mod in modDict.values():
                tpl = mod['tpl']
                site = siteDict.get(tpl)

                # Only convert to positive control call if we actually have enough
                # coverage on the cognate base!
                if site is not None:
                    # Copy mod identification data
                    site['modificationScore'] = mod['QMod']
                    site['modification'] = mod['modification']

                    if self.options.methylFraction and FRAC in mod:
                        site[FRAC] = mod[FRAC]
                        site[FRAClow] = mod[FRAClow]
                        site[FRACup] = mod[FRACup]

                if 'Mask' in mod:
                    # The decoder should supply the off-target peak mask
                    mask = mod['Mask']
                    mask.append(0)  # make sure we always mask the cognate position
                else:
                    # If the decoder doesn't supply a mask - use a hard-coded version
                    # FIXME - this branch is deprecated
                    mask = ModificationPeakMask[mod['modification']]

                # Mask out neighbor peaks that may have been caused by this mod
                for offset in mask:
                    shadowSite = siteDict.get(tpl + strandSign * offset)
                    if shadowSite is not None:
                        shadowSite['offTargetPeak'] = True

            finalCalls.extend(siteDict.values())

        # Sort by template position
        finalCalls.sort(key=lambda x: x['tpl'])
        return finalCalls