def findDiffFP(self):
        """
        Score the footprints of this data set against the second data set.

        Footprints are called with ``self.footprints`` at the smaller of
        ``self.FDR_value`` and ``self.MIN_SCORE``; the cut arrays of both
        data sets (``self.reads`` and ``self.reads2``) are then handed to
        ``WellingtonC.diff_calculate`` together with the footprint entries.

        Returns a list of ``pyDNase.GenomicInterval`` objects, one per
        footprint entry, centred on ``self.interval.startbp + entry[0]``
        and carrying the probability computed for that entry as its score.
        """
        primary = self.reads
        fwd_a, rev_a = primary["+"], primary["-"]

        secondary = self.reads2
        fwd_b, rev_b = secondary["+"], secondary["-"]

        # The calling threshold is the lower of the FDR value and MIN_SCORE
        cutoff = min(self.FDR_value, self.MIN_SCORE)

        # Footprint entries found at that threshold
        # (presumably (offset, size) pairs - confirm against self.footprints)
        candidates = self.footprints(cutoff)

        # Score those entries using the cut counts of both data sets
        diff_calculate = pyDNase.footprinting.WellingtonC.diff_calculate
        second_fields = [entry[1] for entry in candidates]
        first_fields = [entry[0] for entry in candidates]
        probabilities, sizes = diff_calculate(fwd_a, rev_a, fwd_b, rev_b,
                                              second_fields, first_fields,
                                              cutoff)

        def to_interval(entry):
            # entry[0] indexes both result arrays and is the offset of the
            # footprint centre relative to the start of self.interval
            offset = entry[0]
            centre = self.interval.startbp + offset
            half = sizes[offset] // 2
            score = probabilities[offset]
            return pyDNase.GenomicInterval(self.interval.chromosome,
                                           centre - half,
                                           centre + half,
                                           score=score)

        return [to_interval(entry) for entry in candidates]
Example #2
0
    def footprints(self, withCutoff=-30, merge=1):
        """
        Return a GenomicIntervalSet of the footprints scoring below a cutoff.

        Summits are picked greedily from a copy of ``self.scores``: the
        lowest remaining score seeds a footprint whose width is the matching
        entry of ``self.lengths``, then that footprint and one footprint
        length of flank on either side are masked out before the next summit
        is searched for.

        Args:
            withCutoff: only positions scoring strictly below this value seed
                a footprint. Processed positions are masked with the value 1,
                so the search only terminates for cutoffs not greater than 1.
            merge: when truthy, candidates are visited from best (lowest)
                score to worst and any remaining candidate that contains the
                right edge of an accepted footprint is discarded.

        Returns:
            A ``pyDNase.GenomicIntervalSet`` holding one "+" stranded
            ``pyDNase.GenomicInterval`` per footprint, scored with the summit
            score. The set is empty when nothing is below the cutoff.
        """
        # np.array copies, so masking below never touches self.scores
        lengths, scores = np.array(self.lengths), np.array(self.scores)
        n_positions = len(scores)

        ranges = []
        # The size guard keeps .min() from raising on an empty score array
        while n_positions and scores.min() < withCutoff:
            summit = scores.argmin()
            fp_len = lengths[summit]
            span = int(fp_len)
            # Floor division: the bounds are used as slice indices and must
            # stay integral under Python 3
            half_len = span // 2
            lbound = max(summit - half_len, 0)
            rbound = min(summit + half_len, n_positions)
            ranges.append((lbound, rbound, scores[summit], fp_len))
            # Mask the footprint plus one footprint length of flank per side
            scores[max(lbound - span, 0):min(rbound + span, n_positions)] = 1
            # Always retire the summit itself so a zero-length footprint
            # cannot make the loop spin forever
            scores[summit] = 1

        if merge:
            # Scores are "more negative is better": negating the key puts the
            # best candidate last, where pop() takes it. One stable sort is
            # enough because the filtering below preserves relative order.
            pending = sorted(ranges, key=lambda r: -r[2])
            merged_ranges = []
            while pending:
                best = pending.pop()
                merged_ranges.append(best)
                # NOTE(review): only candidates containing the right edge of
                # `best` are dropped; a candidate overlapping only its left
                # edge survives. Kept as-is - confirm whether a symmetric
                # overlap test is wanted.
                pending = [r for r in pending
                           if not r[0] <= best[1] <= r[1]]
        else:
            merged_ranges = ranges

        returnSet = pyDNase.GenomicIntervalSet()
        for lbound, rbound, score, _ in merged_ranges:
            rstartbp = self.interval.startbp + lbound
            # We must add one to the end base of the footprint to account
            # for the BED file format
            rendbp = self.interval.startbp + rbound + 1
            returnSet += pyDNase.GenomicInterval(self.interval.chromosome,
                                                 rstartbp,
                                                 rendbp,
                                                 strand="+",
                                                 score=score)
        return returnSet