def checkExactOverlap(bed1, bed2):
    """ make sure two bed files cover same region exactly: a requirement for
    all code based on the comparisons in this module.

    Both inputs must be non-empty, sorted, free of self-overlaps, and
    neither may contain anything that subtractBed reports as lying outside
    the other.

    bed1, bed2: paths of the two BED files to validate.

    Raises RuntimeError describing the first violated condition.  Returns
    nothing when the two files can be compared."""
    errorMessage = (
        "Bed files %s and %s cannot be compared. xxx. "
        " Input files must be both sorted, cover the exact same region,"
        " and contain no self-overlaps.") % (bed1, bed2)

    def fail(reason):
        # substitute the specific reason into the generic message
        raise RuntimeError(errorMessage.replace("xxx", reason))

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        fail("one or both inputs empty")

    # test self-overlap and sorting of each input in turn.  (the original
    # code read bed1 twice, so input2 was never actually validated)
    for label, bedPath in (("input1", bed1), ("input2", bed2)):
        intervals = readBedIntervals(bedPath, sort=False)
        for i in xrange(1, len(intervals)):
            prev = intervals[i - 1]
            cur = intervals[i]
            if intersectSize(prev, cur) != 0:
                fail("Overlapping intervals %s and %s found in %s" %
                     (prev, cur, label))
            if prev > cur:
                fail("Out of order intervals %s and %s found in %s" %
                     (prev, cur, label))

    # test intersection size: subtracting one file from the other must
    # leave nothing behind, in both directions
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        runShellCommand("subtractBed -a %s -b %s > %s" %
                        (bed1, bed2, tempFile))
        if os.path.getsize(tempFile) != 0:
            fail("Input1 covers regions outside input2")
        runShellCommand("subtractBed -a %s -b %s > %s" %
                        (bed2, bed1, tempFile))
        if os.path.getsize(tempFile) != 0:
            fail("Input2 covers regions outside input1")
    finally:
        # remove the scratch file on every exit path, including failures
        # of the shell commands themselves
        runShellCommand("rm -f %s" % tempFile)
def checkExactOverlap(bed1, bed2):
    """ make sure two bed files cover same region exactly: a requirement for
    all code based on the comparisons in this module.

    Both inputs must be non-empty, sorted, free of self-overlaps, and
    neither may contain anything that subtractBed reports as lying outside
    the other.

    bed1, bed2: paths of the two BED files to validate.

    Raises RuntimeError describing the first violated condition.  Returns
    nothing when the two files can be compared."""
    errorMessage = (
        "Bed files %s and %s cannot be compared. xxx. "
        " Input files must be both sorted, cover the exact same region,"
        " and contain no self-overlaps.") % (bed1, bed2)

    def fail(reason):
        # substitute the specific reason into the generic message
        raise RuntimeError(errorMessage.replace("xxx", reason))

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        fail("one or both inputs empty")

    # test self-overlap and sorting of each input in turn.  (the original
    # code read bed1 twice, so input2 was never actually validated)
    for label, bedPath in (("input1", bed1), ("input2", bed2)):
        intervals = readBedIntervals(bedPath, sort=False)
        for i in xrange(1, len(intervals)):
            prev = intervals[i - 1]
            cur = intervals[i]
            if intersectSize(prev, cur) != 0:
                fail("Overlapping intervals %s and %s found in %s" %
                     (prev, cur, label))
            if prev > cur:
                fail("Out of order intervals %s and %s found in %s" %
                     (prev, cur, label))

    # test intersection size: subtracting one file from the other must
    # leave nothing behind, in both directions
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        runShellCommand("subtractBed -a %s -b %s > %s" %
                        (bed1, bed2, tempFile))
        if os.path.getsize(tempFile) != 0:
            fail("Input1 covers regions outside input2")
        runShellCommand("subtractBed -a %s -b %s > %s" %
                        (bed2, bed1, tempFile))
        if os.path.getsize(tempFile) != 0:
            fail("Input2 covers regions outside input1")
    finally:
        # remove the scratch file on every exit path, including failures
        # of the shell commands themselves
        runShellCommand("rm -f %s" % tempFile)
def testIntersect(self):
    """ Check intersectSize on a handful of hand-made intervals: it must be
    symmetric in its arguments, give 0 when the first fields differ, and
    give the expected overlap sizes otherwise. """
    otherSeq = ("b", 10, 100)
    reference = ("a", 10, 100)
    # (interval on the same sequence as reference, expected overlap with it)
    cases = [
        (("a", 5, 15), 5),
        (("a", 11, 12), 1),
        (("a", 95, 105), 5),
        (("a", 0, 1000), 90),
    ]

    # nothing named "a" may intersect the interval named "b"
    for interval in [reference] + [case[0] for case in cases]:
        assert intersectSize(otherSeq, interval) == 0
        assert intersectSize(interval, otherSeq) == 0

    # overlaps against the reference interval, in both argument orders
    for interval, expected in cases:
        assert intersectSize(reference, interval) == expected
        assert intersectSize(interval, reference) == expected
def main(argv=None):
    """ Command-line entry point: fill the masked gaps of an HMM prediction
    BED with a state derived from the intervals flanking each gap.

    argv: full argument vector with the program name at index 0.  Defaults
          to sys.argv.

    Mask intervals (merged from all mask tracks of the track list) no longer
    than --maxLen are each given a state, appended to the input BED, sorted,
    and clipped to the allBed scope to produce outBed.

    Returns 0 when there are no mask tracks or the input is empty (the input
    is then copied to the output unchanged), None otherwise.  Raises
    AssertionError if --tgts and --oneSidedTgts share a state, if a mask
    track file is missing, or if a flanking input interval overlaps a mask
    interval."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file.  State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps.  If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                        default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    # parse the vector we were handed (minus program name) so that callers
    # passing their own argv are honoured; identical to the old behaviour
    # when argv defaulted to sys.argv
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    try:
        # make sets
        tgtSet = set()
        if args.tgts is not None:
            tgtSet = set(args.tgts.split(","))
        oneSidedTgtSet = set()
        if args.oneSidedTgts is not None:
            oneSidedTgtSet = set(args.oneSidedTgts.split(","))
        assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

        # read the track list
        trackList = TrackList(args.tracksXML)
        maskTracks = trackList.getMaskTracks()

        # read the input bed
        inBed = args.inBed
        if args.cut is True:
            inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1,
                                        args.tracksXML)
        inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
        if args.cut is True:
            runShellCommand("rm -f %s" % inBed)
        if len(maskTracks) == 0 or len(inputIntervals) == 0:
            runShellCommand("cp %s %s" % (args.inBed, args.outBed))
            logger.warning("No mask tracks located in %s or"
                           " %s empty" % (args.tracksXML, args.inBed))
            return 0

        # make a temporary, combined, merged masking bed file
        tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
        for maskTrack in maskTracks:
            assert os.path.isfile(maskTrack.getPath())
            runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
                maskTrack.getPath(), tempMaskBed))
        maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

        # inputIntervals is known to be non-empty here (see early return)
        inputIdx = 0
        rightFlank = inputIntervals[inputIdx]

        tempOutMask = getLocalTempPath("Temp_om", ".bed")
        with open(tempOutMask, "w") as tempOutMaskFile:
            for maskInterval in maskedIntervals:
                if maskInterval[2] - maskInterval[1] > args.maxLen:
                    continue

                # find candidate right flank: first input interval that does
                # not sort before the mask.  None once the input is
                # exhausted (and it then stays None for all later masks)
                while rightFlank is not None and rightFlank < maskInterval:
                    if inputIdx == len(inputIntervals) - 1:
                        rightFlank = None
                    else:
                        inputIdx += 1
                        rightFlank = inputIntervals[inputIdx]

                # candidate left flank: the input interval just before the
                # right flank.  When the input is exhausted that is the
                # *last* interval (inputIdx still points at it), not the one
                # before it.
                leftFlank = None
                if rightFlank is None:
                    leftFlank = inputIntervals[-1]
                elif inputIdx > 0:
                    leftFlank = inputIntervals[inputIdx - 1]

                # identify flanking states if the intervals perfectly abut
                leftState = None
                if leftFlank is not None:
                    if (leftFlank[0] == maskInterval[0] and
                            leftFlank[2] == maskInterval[1]):
                        leftState = str(leftFlank[3])
                    else:
                        assert intersectSize(leftFlank, maskInterval) == 0
                rightState = None
                if rightFlank is not None:
                    if (rightFlank[0] == maskInterval[0] and
                            rightFlank[1] == maskInterval[2]):
                        rightState = str(rightFlank[3])
                    else:
                        assert intersectSize(rightFlank, maskInterval) == 0

                # choose a state for the mask interval
                maskState = str(args.default)
                if args.onlyDefault is True:
                    pass
                elif leftState is not None and leftState == rightState:
                    if len(tgtSet) == 0 or leftState in tgtSet:
                        maskState = leftState
                elif leftState in oneSidedTgtSet:
                    maskState = leftState
                elif rightState in oneSidedTgtSet:
                    maskState = rightState

                # write our mask interval
                tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (
                    maskInterval[0], maskInterval[1], maskInterval[2],
                    maskState))

        # append the labelled masks to the input, sort, and clip to scope.
        # NOTE(review): this merges the original args.inBed even when --cut
        # was given (the cut copy was deleted above) -- confirm intended.
        tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
        tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
        runShellCommand("cp %s %s ; cat %s >> %s" % (
            args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
        runShellCommand("cat %s | sortBed > %s" % (tempMergePath1,
                                                   tempMergePath2))
        tempScopePath = getLocalTempPath("temp_all", ".bed")
        runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed,
                                                          tempScopePath))
        runShellCommand("intersectBed -a %s -b %s > %s" % (
            tempMergePath2, tempScopePath, args.outBed))

        runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask,
                                               tempMergePath1,
                                               tempMergePath2,
                                               tempScopePath]))
    finally:
        # pair every initBedTool() with cleanBedTool(), including on the
        # early "nothing to do" return which previously skipped it
        cleanBedTool(tempBedToolPath)
def compareIntervalsOneSided(trueIntervals, predIntervals, col, threshold,
                             usePredLenForThreshold, allowMultipleMatches):
    """ Interval-level, one-directional comparison of two annotations.

    Every true interval is treated as a single unit and asked: is it covered
    by predicted interval(s) carrying the same value in column col, by at
    least threshold (a fraction)?  Called as-is this is a recall-style
    measure; calling again with the two interval lists swapped gives the
    precision-style counterpart.

    usePredLenForThreshold: when True, each overlap is expressed as a
        fraction of the predicted interval's length rather than the true
        interval's length.
    allowMultipleMatches: when True, the fractions of all same-state
        overlapping predictions are summed; otherwise only the largest
        single one counts.

    Returns (stats, confMat).  stats maps each true state to a list
    [numHit, totalHitLen, numMissed, totalMissedLen].  confMat is filled via
    updateConfMatrix(confMat, predState, trueState) for every overlapping
    prediction whose fraction reaches threshold, whatever its state.

    NOTE: a huge predicted interval overlapping a tiny true interval counts
    as a hit here; that inaccuracy shows up when the roles are swapped. """
    # both interval sets are expected to span exactly the same region in the
    # same order; only the outer boundaries are actually verified here
    assert trueIntervals[0][0] == predIntervals[0][0]
    assert trueIntervals[0][1] == predIntervals[0][1]
    assert trueIntervals[-1][2] == predIntervals[-1][2]

    numPred = len(predIntervals)
    stats = {}
    confMat = {}
    firstPred = 0

    for truth in trueIntervals:
        truthState = truth[col]
        truthLen = float(truth[2] - truth[1])

        # skip predictions that do not touch this true interval
        while (firstPred < numPred and
               intersectSize(truth, predIntervals[firstPred]) == 0):
            firstPred += 1

        # walk the run of predictions that do intersect it
        largestFrac = 0.0
        summedFrac = 0.0
        cursor = firstPred
        while cursor < numPred:
            pred = predIntervals[cursor]
            overlap = intersectSize(truth, pred)
            if overlap <= 0:
                break
            if usePredLenForThreshold is True:
                denominator = float(pred[2] - pred[1])
            else:
                denominator = truthLen
            fraction = float(overlap) / denominator
            if pred[col] == truthState:
                # same-state overlap: feeds both the best-single-match and
                # the multiple-match scores
                largestFrac = max(largestFrac, fraction)
                summedFrac += fraction
            if fraction >= threshold:
                updateConfMatrix(confMat, pred[col], truthState)
            cursor += 1

        if allowMultipleMatches is True:
            score = summedFrac
        else:
            score = largestFrac

        # slots 0/1 count hits, slots 2/3 count misses
        counts = stats.setdefault(truthState, [0, 0, 0, 0])
        slot = 0 if score >= threshold else 2
        counts[slot] += 1
        counts[slot + 1] += truthLen

    return stats, confMat
def removeOverlaps(inBed, outBed, args):
    """ Little hack to get this script working with different settings of
    ltr_finder where annotations can overlap.  To resolve overlaps, we choose
    the best element (by score, then length), and delete anything it touches.

    inBed: path of the BED file to read (sorted through BedTool).
    outBed: path of the BED file to write; receives every interval whose
            element survived, with strand "?" rewritten to ".".
    args: passed through untouched to applyWeak for each written interval.

    An element is identified by chromosome plus the part of the interval
    name after its last "|"; its length is the summed length of all its
    intervals.

    TODO: incorporate Dougs script for his lastz stuff?
    """
    bedIntervals = list(BedTool(inBed).sort())

    def getLtrID(interval):
        return interval.chrom + interval.name[interval.name.rfind("|") + 1:]

    def coords(interval):
        return (interval.chrom, interval.start, interval.end)

    # pass 1: element sizes
    sizes = dict()
    for interval in bedIntervals:
        ltrId = getLtrID(interval)
        length = int(interval.end) - int(interval.start)
        sizes[ltrId] = sizes.get(ltrId, 0) + length

    # pass 2: greedy kill (not optimal for all transitive cases)
    # strategy: any pairwise overlap will be detected in either
    # the left or right scan of at least one of the overlapping
    # elements.
    dead = set()

    def scan(i, neighbourIndexes):
        """ Walk outward from interval i while neighbours overlap it,
        killing whichever element of each overlapping pair is worse.  Stops
        at the first non-overlapping neighbour or once i's own element is
        killed. """
        interval = bedIntervals[i]
        ltrId = getLtrID(interval)
        size = sizes[ltrId]
        for j in neighbourIndexes:
            other = bedIntervals[j]
            if intersectSize(coords(interval), coords(other)) <= 0:
                break
            otherId = getLtrID(other)
            if otherId in dead:
                continue
            # NOTE(review): scores are compared exactly as the interval
            # objects expose them; if they are strings this ordering is
            # lexicographic rather than numeric -- confirm.
            if (other.score > interval.score or
                    (other.score == interval.score and
                     sizes[otherId] > size)):
                dead.add(ltrId)
                break
            dead.add(otherId)

    for i, interval in enumerate(bedIntervals):
        ltrId = getLtrID(interval)
        if ltrId in dead:
            continue
        scan(i, xrange(i - 1, -1, -1))
        if ltrId in dead:
            continue
        scan(i, xrange(i + 1, len(bedIntervals)))

    # pass 3: write non-killed.  the with-block guarantees the output is
    # flushed and closed (the handle used to be left open)
    with open(outBed, "w") as outFile:
        for interval in bedIntervals:
            if getLtrID(interval) in dead:
                continue
            if interval.strand == "?":
                interval.strand = "."
            applyWeak(interval, args)
            outFile.write(str(interval))
def main(argv=None):
    """ Command-line entry point: fill the masked gaps of an HMM prediction
    BED with a state derived from the intervals flanking each gap.

    argv: full argument vector with the program name at index 0.  Defaults
          to sys.argv.

    Mask intervals (merged from all mask tracks of the track list) no longer
    than --maxLen are each given a state, appended to the input BED, sorted,
    and clipped to the allBed scope to produce outBed.

    Returns 0 when there are no mask tracks or the input is empty (the input
    is then copied to the output unchanged), None otherwise.  Raises
    AssertionError if --tgts and --oneSidedTgts share a state, if a mask
    track file is missing, or if a flanking input interval overlaps a mask
    interval."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument(
        "inBed", help="TE prediction BED file.  State labels"
        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument(
        "--maxLen", help="Maximum length of a masked interval"
        " to fill (inclusive). Use --delMask option with same value"
        "if running compareBedStates.py after.",
        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument(
        "--tgts", help="Only relabel gaps that "
        "are flanked on both sides by the same state, and this state"
        " is in this comma- separated list. --default used for other"
        " gaps.  If not targetst specified then all states checked.",
        default=None)
    parser.add_argument(
        "--oneSidedTgts", help="Only relabel gaps that "
        "are flanked on at least one side by a state in this comma-"
        "separated list --default used for other gaps",
        default=None)
    parser.add_argument(
        "--onlyDefault", help="Add the default state (--default) no"
        " no all masked gaps no matter what. ie ignoring all other "
        "logic", action="store_true", default=False)
    parser.add_argument(
        "--cut", help="Cut out gaps for masked tracks from the input."
        " By default, the input is expected to come from the HMM "
        "with mask intervals already absent, and will crash on with"
        " an assertion error if an overlap is detected.",
        action="store_true", default=False)

    addLoggingOptions(parser)
    # parse the vector we were handed (minus program name) so that callers
    # passing their own argv are honoured; identical to the old behaviour
    # when argv defaulted to sys.argv
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    try:
        # make sets
        tgtSet = set()
        if args.tgts is not None:
            tgtSet = set(args.tgts.split(","))
        oneSidedTgtSet = set()
        if args.oneSidedTgts is not None:
            oneSidedTgtSet = set(args.oneSidedTgts.split(","))
        assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

        # read the track list
        trackList = TrackList(args.tracksXML)
        maskTracks = trackList.getMaskTracks()

        # read the input bed
        inBed = args.inBed
        if args.cut is True:
            inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1,
                                        args.tracksXML)
        inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
        if args.cut is True:
            runShellCommand("rm -f %s" % inBed)
        if len(maskTracks) == 0 or len(inputIntervals) == 0:
            runShellCommand("cp %s %s" % (args.inBed, args.outBed))
            logger.warning("No mask tracks located in %s or"
                           " %s empty" % (args.tracksXML, args.inBed))
            return 0

        # make a temporary, combined, merged masking bed file
        tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
        for maskTrack in maskTracks:
            assert os.path.isfile(maskTrack.getPath())
            runShellCommand(
                "cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s"
                % (maskTrack.getPath(), tempMaskBed))
        maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

        # inputIntervals is known to be non-empty here (see early return)
        inputIdx = 0
        rightFlank = inputIntervals[inputIdx]

        tempOutMask = getLocalTempPath("Temp_om", ".bed")
        with open(tempOutMask, "w") as tempOutMaskFile:
            for maskInterval in maskedIntervals:
                if maskInterval[2] - maskInterval[1] > args.maxLen:
                    continue

                # find candidate right flank: first input interval that does
                # not sort before the mask.  None once the input is
                # exhausted (and it then stays None for all later masks)
                while rightFlank is not None and rightFlank < maskInterval:
                    if inputIdx == len(inputIntervals) - 1:
                        rightFlank = None
                    else:
                        inputIdx += 1
                        rightFlank = inputIntervals[inputIdx]

                # candidate left flank: the input interval just before the
                # right flank.  When the input is exhausted that is the
                # *last* interval (inputIdx still points at it), not the one
                # before it.
                leftFlank = None
                if rightFlank is None:
                    leftFlank = inputIntervals[-1]
                elif inputIdx > 0:
                    leftFlank = inputIntervals[inputIdx - 1]

                # identify flanking states if the intervals perfectly abut
                leftState = None
                if leftFlank is not None:
                    if (leftFlank[0] == maskInterval[0] and
                            leftFlank[2] == maskInterval[1]):
                        leftState = str(leftFlank[3])
                    else:
                        assert intersectSize(leftFlank, maskInterval) == 0
                rightState = None
                if rightFlank is not None:
                    if (rightFlank[0] == maskInterval[0] and
                            rightFlank[1] == maskInterval[2]):
                        rightState = str(rightFlank[3])
                    else:
                        assert intersectSize(rightFlank, maskInterval) == 0

                # choose a state for the mask interval
                maskState = str(args.default)
                if args.onlyDefault is True:
                    pass
                elif leftState is not None and leftState == rightState:
                    if len(tgtSet) == 0 or leftState in tgtSet:
                        maskState = leftState
                elif leftState in oneSidedTgtSet:
                    maskState = leftState
                elif rightState in oneSidedTgtSet:
                    maskState = rightState

                # write our mask interval
                tempOutMaskFile.write(
                    "%s\t%d\t%d\t%s\n" %
                    (maskInterval[0], maskInterval[1], maskInterval[2],
                     maskState))

        # append the labelled masks to the input, sort, and clip to scope.
        # NOTE(review): this merges the original args.inBed even when --cut
        # was given (the cut copy was deleted above) -- confirm intended.
        tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
        tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
        runShellCommand("cp %s %s ; cat %s >> %s" %
                        (args.inBed, tempMergePath1, tempOutMask,
                         tempMergePath1))
        runShellCommand("cat %s | sortBed > %s" %
                        (tempMergePath1, tempMergePath2))
        tempScopePath = getLocalTempPath("temp_all", ".bed")
        runShellCommand("mergeBed -i %s |sortBed > %s" %
                        (args.allBed, tempScopePath))
        runShellCommand("intersectBed -a %s -b %s > %s" %
                        (tempMergePath2, tempScopePath, args.outBed))

        runShellCommand("rm -f %s" % " ".join([
            tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2,
            tempScopePath
        ]))
    finally:
        # pair every initBedTool() with cleanBedTool(), including on the
        # early "nothing to do" return which previously skipped it
        cleanBedTool(tempBedToolPath)
def removeOverlaps(inBed, outBed, args):
    """ Little hack to get this script working with different settings of
    ltr_finder where annotations can overlap.  To resolve overlaps, we choose
    the best element (by score, then length), and delete anything it touches.

    inBed: path of the BED file to read (sorted through BedTool).
    outBed: path of the BED file to write; receives every interval whose
            element survived, with strand "?" rewritten to ".".
    args: passed through untouched to applyWeak for each written interval.

    An element is identified by chromosome plus the part of the interval
    name after its last "|"; its length is the summed length of all its
    intervals.

    TODO: incorporate Dougs script for his lastz stuff?
    """
    bedIntervals = list(BedTool(inBed).sort())

    def getLtrID(interval):
        return interval.chrom + interval.name[interval.name.rfind("|") + 1:]

    def coords(interval):
        return (interval.chrom, interval.start, interval.end)

    # pass 1: element sizes
    sizes = dict()
    for interval in bedIntervals:
        ltrId = getLtrID(interval)
        length = int(interval.end) - int(interval.start)
        sizes[ltrId] = sizes.get(ltrId, 0) + length

    # pass 2: greedy kill (not optimal for all transitive cases)
    # strategy: any pairwise overlap will be detected in either
    # the left or right scan of at least one of the overlapping
    # elements.
    dead = set()

    def scan(i, neighbourIndexes):
        """ Walk outward from interval i while neighbours overlap it,
        killing whichever element of each overlapping pair is worse.  Stops
        at the first non-overlapping neighbour or once i's own element is
        killed. """
        interval = bedIntervals[i]
        ltrId = getLtrID(interval)
        size = sizes[ltrId]
        for j in neighbourIndexes:
            other = bedIntervals[j]
            if intersectSize(coords(interval), coords(other)) <= 0:
                break
            otherId = getLtrID(other)
            if otherId in dead:
                continue
            # NOTE(review): scores are compared exactly as the interval
            # objects expose them; if they are strings this ordering is
            # lexicographic rather than numeric -- confirm.
            if (other.score > interval.score or
                    (other.score == interval.score and
                     sizes[otherId] > size)):
                dead.add(ltrId)
                break
            dead.add(otherId)

    for i, interval in enumerate(bedIntervals):
        ltrId = getLtrID(interval)
        if ltrId in dead:
            continue
        scan(i, xrange(i - 1, -1, -1))
        if ltrId in dead:
            continue
        scan(i, xrange(i + 1, len(bedIntervals)))

    # pass 3: write non-killed.  the with-block guarantees the output is
    # flushed and closed (the handle used to be left open)
    with open(outBed, "w") as outFile:
        for interval in bedIntervals:
            if getLtrID(interval) in dead:
                continue
            if interval.strand == "?":
                interval.strand = "."
            applyWeak(interval, args)
            outFile.write(str(interval))
def testMask(self):
    """ Compare track data loaded from a masked track list against the same
    data loaded from an unmasked one.

    Part 1 (base level): every row of the masked table must equal the row
    of the unmasked table found by adding the masked table's running mask
    offset to the index.

    Part 2 (segmented): every unmasked-table segment that is not covered by
    a mask interval must reappear, in order, in the masked table with the
    same coordinates and the same mapped-back track values. """
    region = [("scaffold_1", 0, 50)]
    trackData1 = TrackData()
    trackData1.loadTrackData(getTracksInfoPath(8), region)
    tableList1 = trackData1.getTrackTableList()

    trackData2 = TrackData()
    trackData2.loadTrackData(getTracksInfoPath(7), region)
    tableList2 = trackData2.getTrackTableList()

    assert len(tableList1) == 1
    assert len(tableList2) == len(tableList1)

    table1 = tableList1[0]
    table2 = tableList2[0]

    assert len(table2) == 50
    assert len(table1) == 39

    maskOffsets1 = table1.getMaskRunningOffsets()
    maskOffsets2 = table2.getMaskRunningOffsets()
    # identity test, consistent with the other None checks below
    assert maskOffsets2 is None

    for i in xrange(len(table1)):
        v1 = table1[i]
        v2 = table2[i + maskOffsets1[i]]
        assert_array_equal(v1, v2)

    # now test segmentation
    statesPath = getStatesPath()
    segPath = getSegmentsPath()
    bedIntervals = getMergedBedIntervals(statesPath, sort=True)
    segIntervals = readBedIntervals(segPath, sort=True)

    segTrackData2 = TrackData()
    segTrackData2.loadTrackData(getTracksInfoPath(7), bedIntervals,
                                segmentIntervals=segIntervals,
                                interpolateSegments=True)

    segTrackData3 = TrackData()
    segTrackData3.loadTrackData(getTracksInfoPath(9), bedIntervals,
                                segmentIntervals=segIntervals,
                                interpolateSegments=True)

    tlist3 = segTrackData2.getTrackTableList()
    tlist4 = segTrackData3.getTrackTableList()
    assert len(tlist4) == 3
    assert len(tlist3) == 4
    maskIntervals = getMergedBedIntervals(getMaskPath(), sort=True)

    tracks2 = segTrackData2.getTrackList()
    tracks3 = segTrackData3.getTrackList()

    for i in xrange(len(tlist4)):
        t3 = tlist3[i]
        t4 = tlist4[i]
        maskOffsets = t4.getMaskRunningOffsets()
        assert maskOffsets is not None
        assert t3.getMaskRunningOffsets() is None
        # k indexes the masked table; it only advances on unmasked segments
        k = 0
        for j in xrange(len(t3)):
            start3 = t3.start + t3.segOffsets[j]
            len3 = t3.getSegmentLength(j)
            end3 = start3 + len3
            i3 = (t3.chrom, start3, end3)
            masked = False
            for maskInterval in maskIntervals:
                if intersectSize(i3, maskInterval) > 0:
                    masked = True
                    # a segment touched by a mask must be entirely inside it
                    assert intersectSize(i3, maskInterval) == i3[2] - i3[1]
            if masked is False:
                start4 = t4.start + t4.segOffsets[k] + maskOffsets[k]
                len4 = t4.getSegmentLength(k)
                end4 = start4 + len4
                i4 = (t4.chrom, start4, end4)
                assert_array_equal(i3, i4)
                v3 = [tracks2.getTrackByNumber(x).getValueMap().getMapBack(
                    t3[j][x]) for x in xrange(len(t3[j]))]
                v4 = [tracks3.getTrackByNumber(x).getValueMap().getMapBack(
                    t4[k][x]) for x in xrange(len(t4[k]))]
                assert_array_equal(v3, v4)
                k += 1