Esempio n. 1
0
def checkExactOverlap(bed1, bed2):
    """ Make sure two bed files cover the same region exactly: a requirement
    for all code based on the comparisons in this module.

    bed1, bed2: paths of the two BED files to validate.

    Raises RuntimeError if either file is empty, if either file contains
    overlapping or out-of-order consecutive intervals, or if one file covers
    a region that the other does not (tested with subtractBed in both
    directions).  Returns None when every check passes."""

    def fail(reason):
        # the reason is formatted in directly (rather than substituted for a
        # placeholder afterwards) so file names can never be mistaken for
        # the placeholder
        raise RuntimeError(
            ("Bed files %s and %s cannot be compared. %s. "
             " Input files must be both sorted, cover the exact same region,"
             " and contain no self-overlaps.") % (bed1, bed2, reason))

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        fail("one or both inputs empty")

    # test self-overlap and sorting of each input in turn (input1 first)
    for label, bedPath in (("input1", bed1), ("input2", bed2)):
        intervals = readBedIntervals(bedPath, sort=False)
        for i in xrange(1, len(intervals)):
            prev, cur = intervals[i - 1], intervals[i]
            if intersectSize(prev, cur) != 0:
                fail("Overlapping intervals %s and %s found in %s" %
                     (prev, cur, label))
            if prev > cur:
                fail("Out of order intervals %s and %s found in %s" %
                     (prev, cur, label))

    # test intersection size: subtracting either file from the other must
    # leave nothing behind
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        for bedA, bedB, reason in (
                (bed1, bed2, "Input1 covers regions outside input2"),
                (bed2, bed1, "Input2 covers regions outside input1")):
            runShellCommand("subtractBed -a %s -b %s > %s" %
                            (bedA, bedB, tempFile))
            if os.path.getsize(tempFile) != 0:
                fail(reason)
    finally:
        # always remove the scratch file, whether a check failed or not
        runShellCommand("rm -f %s" % tempFile)
Esempio n. 2
0
def checkExactOverlap(bed1, bed2):
    """ Make sure two bed files cover the same region exactly: a requirement
    for all code based on the comparisons in this module.

    bed1, bed2: paths of the two BED files to validate.

    Raises RuntimeError if either file is empty, if either file contains
    overlapping or out-of-order consecutive intervals, or if one file covers
    a region that the other does not (tested with subtractBed in both
    directions).  Returns None when every check passes."""

    def fail(reason):
        # the reason is formatted in directly (rather than substituted for a
        # placeholder afterwards) so file names can never be mistaken for
        # the placeholder
        raise RuntimeError(
            ("Bed files %s and %s cannot be compared. %s. "
             " Input files must be both sorted, cover the exact same region,"
             " and contain no self-overlaps.") % (bed1, bed2, reason))

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        fail("one or both inputs empty")

    # test self-overlap and sorting of each input in turn (input1 first)
    for label, bedPath in (("input1", bed1), ("input2", bed2)):
        intervals = readBedIntervals(bedPath, sort=False)
        for i in xrange(1, len(intervals)):
            prev, cur = intervals[i - 1], intervals[i]
            if intersectSize(prev, cur) != 0:
                fail("Overlapping intervals %s and %s found in %s" %
                     (prev, cur, label))
            if prev > cur:
                fail("Out of order intervals %s and %s found in %s" %
                     (prev, cur, label))

    # test intersection size: subtracting either file from the other must
    # leave nothing behind
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        for bedA, bedB, reason in (
                (bed1, bed2, "Input1 covers regions outside input2"),
                (bed2, bed1, "Input2 covers regions outside input1")):
            runShellCommand("subtractBed -a %s -b %s > %s" %
                            (bedA, bedB, tempFile))
            if os.path.getsize(tempFile) != 0:
                fail(reason)
    finally:
        # always remove the scratch file, whether a check failed or not
        runShellCommand("rm -f %s" % tempFile)
Esempio n. 3
0
    def testIntersect(self):
        """ Check intersectSize on a reference interval against a set of
        probes, calling it with the arguments in both orders each time. """
        otherChrom = ("b", 10, 100)
        reference = ("a", 10, 100)
        # probes on the same chromosome as the reference, paired with the
        # overlap size expected against it
        probes = [
            (("a", 5, 15), 5),
            (("a", 11, 12), 1),
            (("a", 95, 105), 5),
            (("a", 0, 1000), 90),
        ]

        # nothing on chromosome "a" may intersect the chromosome "b" interval
        for interval in [reference] + [probe for probe, _ in probes]:
            assert intersectSize(otherChrom, interval) == 0
            assert intersectSize(interval, otherChrom) == 0

        for probe, expected in probes:
            assert intersectSize(reference, probe) == expected
            assert intersectSize(probe, reference) == expected
Esempio n. 4
0
    def testIntersect(self):
        """ Check intersectSize on a reference interval against a set of
        probes, calling it with the arguments in both orders each time. """
        otherChrom = ("b", 10, 100)
        reference = ("a", 10, 100)
        # probes on the same chromosome as the reference, paired with the
        # overlap size expected against it
        probes = [
            (("a", 5, 15), 5),
            (("a", 11, 12), 1),
            (("a", 95, 105), 5),
            (("a", 0, 1000), 90),
        ]

        # nothing on chromosome "a" may intersect the chromosome "b" interval
        for interval in [reference] + [probe for probe, _ in probes]:
            assert intersectSize(otherChrom, interval) == 0
            assert intersectSize(interval, otherChrom) == 0

        for probe, expected in probes:
            assert intersectSize(reference, probe) == expected
            assert intersectSize(probe, reference) == expected
def main(argv=None):
    """ Command-line entry point: fill masked gaps of an HMM prediction BED.

    Mask intervals (merged from the mask tracks listed in the tracks XML)
    that are no longer than --maxLen are each assigned a state label,
    appended to the input BED, sorted, clipped to the allBed scope and
    written to outBed.  A mask interval takes the state of a flanking input
    interval only when that interval abuts it exactly (subject to --tgts /
    --oneSidedTgts); otherwise it gets --default.

    NOTE: argv is accepted but is not forwarded to the parser; parse_args()
    is called without arguments.

    Returns 0 when the input is copied through unchanged (no mask tracks or
    no input intervals), None otherwise.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file.  State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        " if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps.  If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                        default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed (optionally from a temporary copy with the mask
    # intervals cut out, which is deleted as soon as it has been read)
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        # nothing to fill: pass the input through unchanged
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        # release the bed tool temp path on this exit path as well
        cleanBedTool(tempBedToolPath)
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

    # inputIntervals is known to be non-empty here (checked above)
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    with open(tempOutMask, "w") as tempOutMaskFile:
        for maskInterval in maskedIntervals:
            if maskInterval[2] - maskInterval[1] > args.maxLen:
                continue
            # find candidate right flank: the first input interval that does
            # not sort before the mask interval (None once input is exhausted)
            while rightFlank is not None and rightFlank < maskInterval:
                if inputIdx == len(inputIntervals) - 1:
                    rightFlank = None
                else:
                    inputIdx += 1
                    rightFlank = inputIntervals[inputIdx]

            # candidate left flank: the input interval just before the right
            # flank.  When the input is exhausted, inputIdx still points at
            # the last input interval, and that interval is the left flank.
            leftFlank = None
            if rightFlank is None:
                leftFlank = inputIntervals[inputIdx]
            elif inputIdx > 0:
                leftFlank = inputIntervals[inputIdx - 1]

            # identify flanking states if the intervals perfectly abut
            leftState = None
            if leftFlank is not None:
                if (leftFlank[0] == maskInterval[0] and
                        leftFlank[2] == maskInterval[1]):
                    leftState = str(leftFlank[3])
                else:
                    assert intersectSize(leftFlank, maskInterval) == 0
            rightState = None
            if rightFlank is not None:
                if (rightFlank[0] == maskInterval[0] and
                        rightFlank[1] == maskInterval[2]):
                    rightState = str(rightFlank[3])
                else:
                    assert intersectSize(rightFlank, maskInterval) == 0

            # choose a state for the mask interval
            maskState = str(args.default)
            if args.onlyDefault is True:
                pass
            elif leftState is not None and leftState == rightState:
                if len(tgtSet) == 0 or leftState in tgtSet:
                    maskState = leftState
            elif leftState in oneSidedTgtSet:
                maskState = leftState
            elif rightState in oneSidedTgtSet:
                maskState = rightState

            # write our mask interval
            tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (
                maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    # append the labelled mask intervals to the original input, sort, and
    # clip to the merged target scope
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2, tempScopePath,
                                                       args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask, tempMergePath1,
                                           tempMergePath2, tempScopePath]))
    cleanBedTool(tempBedToolPath)
Esempio n. 6
0
def compareIntervalsOneSided(trueIntervals, predIntervals, col, threshold,
                             usePredLenForThreshold, allowMultipleMatches):
    """ Interval-level, one-directional comparison of two interval lists.

    Every true interval is treated as a single unit and classified as
    correctly predicted when the predicted intervals carrying the same value
    in column col overlap it by a fraction of at least threshold.  The
    fraction is the overlap size divided by the true interval's length, or
    by the predicted interval's length when usePredLenForThreshold is True.
    With allowMultipleMatches set to True the fractions of all same-state
    overlapping predictions are summed; otherwise only the largest counts.

    Returns (stats, confMat):
      stats maps each true state to the list
          [numTrue, totTrueLen, numFalse, totFalseLen]
      confMat is filled through updateConfMatrix for every overlapping
          predicted interval (whatever its state) whose fraction reaches
          threshold.

    This measures recall; calling again with the two lists swapped gives
    precision.  A large predicted interval covering a small true interval
    counts as a hit here.
    """

    # both lists are expected to span exactly the same region in the same
    # order; only the end points are verified here
    assert trueIntervals[0][0] == predIntervals[0][0]
    assert trueIntervals[0][1] == predIntervals[0][1]
    assert trueIntervals[-1][2] == predIntervals[-1][2]

    numPred = len(predIntervals)
    stats = {}
    confMat = {}

    # index of the first predicted interval that may still overlap the
    # current true interval; it only ever moves forward
    firstHit = 0
    for truth in trueIntervals:
        truthState = truth[col]
        truthLen = float(truth[2] - truth[1])

        # skip predicted intervals that do not touch this true interval
        while (firstHit < numPred and
               intersectSize(truth, predIntervals[firstHit]) == 0):
            firstHit += 1

        # walk the run of predicted intervals overlapping this true interval
        largestFrac = 0.0
        summedFrac = 0.0
        cursor = firstHit
        while cursor < numPred:
            pred = predIntervals[cursor]
            overlap = intersectSize(truth, pred)
            if not overlap > 0:
                break
            if usePredLenForThreshold is True:
                denom = float(pred[2] - pred[1])
            else:
                denom = truthLen
            frac = float(overlap) / denom
            if pred[col] == truthState:
                # same state: contributes to the accuracy decision
                largestFrac = max(largestFrac, frac)
                summedFrac += frac
            if frac >= threshold:
                # any state: contributes to the confusion matrix
                updateConfMatrix(confMat, pred[col], truthState)
            cursor += 1

        score = summedFrac if allowMultipleMatches is True else largestFrac

        # record the outcome for this true interval
        counts = stats.setdefault(truthState, [0, 0, 0, 0])
        if score >= threshold:
            counts[0] += 1
            counts[1] += truthLen
        else:
            # the misses could be derived from the totals, but are kept
            # explicitly
            counts[2] += 1
            counts[3] += truthLen

    return stats, confMat
Esempio n. 7
0
def compareIntervalsOneSided(trueIntervals, predIntervals, col, threshold,
                             usePredLenForThreshold, allowMultipleMatches):
    """ Interval-level, one-directional comparison of two interval lists.

    Every true interval is treated as a single unit and classified as
    correctly predicted when the predicted intervals carrying the same value
    in column col overlap it by a fraction of at least threshold.  The
    fraction is the overlap size divided by the true interval's length, or
    by the predicted interval's length when usePredLenForThreshold is True.
    With allowMultipleMatches set to True the fractions of all same-state
    overlapping predictions are summed; otherwise only the largest counts.

    Returns (stats, confMat):
      stats maps each true state to the list
          [numTrue, totTrueLen, numFalse, totFalseLen]
      confMat is filled through updateConfMatrix for every overlapping
          predicted interval (whatever its state) whose fraction reaches
          threshold.

    This measures recall; calling again with the two lists swapped gives
    precision.  A large predicted interval covering a small true interval
    counts as a hit here.
    """

    # both lists are expected to span exactly the same region in the same
    # order; only the end points are verified here
    assert trueIntervals[0][0] == predIntervals[0][0]
    assert trueIntervals[0][1] == predIntervals[0][1]
    assert trueIntervals[-1][2] == predIntervals[-1][2]

    numPred = len(predIntervals)
    stats = {}
    confMat = {}

    # index of the first predicted interval that may still overlap the
    # current true interval; it only ever moves forward
    firstHit = 0
    for truth in trueIntervals:
        truthState = truth[col]
        truthLen = float(truth[2] - truth[1])

        # skip predicted intervals that do not touch this true interval
        while (firstHit < numPred and
               intersectSize(truth, predIntervals[firstHit]) == 0):
            firstHit += 1

        # walk the run of predicted intervals overlapping this true interval
        largestFrac = 0.0
        summedFrac = 0.0
        cursor = firstHit
        while cursor < numPred:
            pred = predIntervals[cursor]
            overlap = intersectSize(truth, pred)
            if not overlap > 0:
                break
            if usePredLenForThreshold is True:
                denom = float(pred[2] - pred[1])
            else:
                denom = truthLen
            frac = float(overlap) / denom
            if pred[col] == truthState:
                # same state: contributes to the accuracy decision
                largestFrac = max(largestFrac, frac)
                summedFrac += frac
            if frac >= threshold:
                # any state: contributes to the confusion matrix
                updateConfMatrix(confMat, pred[col], truthState)
            cursor += 1

        score = summedFrac if allowMultipleMatches is True else largestFrac

        # record the outcome for this true interval
        counts = stats.setdefault(truthState, [0, 0, 0, 0])
        if score >= threshold:
            counts[0] += 1
            counts[1] += truthLen
        else:
            # the misses could be derived from the totals, but are kept
            # explicitly
            counts[2] += 1
            counts[3] += truthLen

    return stats, confMat
Esempio n. 8
0
def removeOverlaps(inBed, outBed, args):
    """ Little hack to get this script working with different settings of
    ltr_finder where annotations can overlap.  To resolve overlaps, we choose
    the best element (by score, then total length), and delete anything it
    touches.  TODO: incorporate Dougs script for his lastz stuff?

    inBed:  path of the input BED file (sorted through BedTool before use)
    outBed: path the surviving intervals are written to
    args:   passed through unchanged to applyWeak for each written interval
    """

    bedIntervals = [x for x in BedTool(inBed).sort()]

    def getLtrID(interval):
        # element id: chromosome plus whatever follows the last "|" in the
        # name (the whole name when it contains no "|")
        return interval.chrom + interval.name[interval.name.rfind("|") + 1:]

    # pass 1: element sizes (summed over every interval sharing an id)
    sizes = dict()
    for interval in bedIntervals:
        ltrId = getLtrID(interval)
        length = int(interval.end) - int(interval.start)
        sizes[ltrId] = sizes.get(ltrId, 0) + length

    # pass 2: greedy kill (not optimal for all transitive cases)
    # strategy: any pairwise overlap will be detected in either
    # the left or right scan of at least one of the overlapping
    # elements.
    dead = set()

    def scanNeighbours(interval, ltrId, indices):
        # Walk outwards from interval over the given indices until the first
        # non-overlapping neighbour, killing the loser of each live overlap.
        # Stops early as soon as interval's own element is killed.
        size = sizes[ltrId]
        for j in indices:
            other = bedIntervals[j]
            if intersectSize((interval.chrom, interval.start, interval.end),
                             (other.chrom, other.start, other.end)) <= 0:
                break
            otherId = getLtrID(other)
            if otherId in dead:
                continue
            # NOTE(review): scores are compared exactly as the interval
            # objects expose them; if score is a string this comparison is
            # lexicographic -- confirm against the BedTool interval type
            if (other.score > interval.score
                    or (other.score == interval.score
                        and sizes[otherId] > size)):
                dead.add(ltrId)
                break
            else:
                dead.add(otherId)

    for i, interval in enumerate(bedIntervals):
        ltrId = getLtrID(interval)
        if ltrId in dead:
            continue
        # left scan
        scanNeighbours(interval, ltrId, xrange(i - 1, -1, -1))
        if ltrId in dead:
            continue
        # right scan
        scanNeighbours(interval, ltrId, xrange(i + 1, len(bedIntervals), 1))

    # pass 3: write non-killed (the file is closed even if writing fails)
    with open(outBed, "w") as outFile:
        for interval in bedIntervals:
            if getLtrID(interval) not in dead:
                if interval.strand == "?":
                    interval.strand = "."
                applyWeak(interval, args)
                outFile.write(str(interval))
Esempio n. 9
0
def main(argv=None):
    """ Command-line entry point: fill masked gaps of an HMM prediction BED.

    Mask intervals (merged from the mask tracks listed in the tracks XML)
    that are no longer than --maxLen are each assigned a state label,
    appended to the input BED, sorted, clipped to the allBed scope and
    written to outBed.  A mask interval takes the state of a flanking input
    interval only when that interval abuts it exactly (subject to --tgts /
    --oneSidedTgts); otherwise it gets --default.

    NOTE: argv is accepted but is not forwarded to the parser; parse_args()
    is called without arguments.

    Returns 0 when the input is copied through unchanged (no mask tracks or
    no input intervals), None otherwise.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML",
                        help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed",
                        help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument(
        "inBed",
        help="TE prediction BED file.  State labels"
        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed",
                        help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument(
        "--maxLen",
        help="Maximum length of a masked interval"
        " to fill (inclusive). Use --delMask option with same value"
        " if running compareBedStates.py after.",
        type=int,
        default=sys.maxint)
    parser.add_argument("--default",
                        help="Default label to give to masked "
                        "region if no label can be determined",
                        default="0")
    parser.add_argument(
        "--tgts",
        help="Only relabel gaps that "
        "are flanked on both sides by the same state, and this state"
        " is in this comma- separated list. --default used for other"
        " gaps.  If not targetst specified then all states checked.",
        default=None)
    parser.add_argument(
        "--oneSidedTgts",
        help="Only relabel gaps that "
        "are flanked on at least one side by a state in this comma-"
        "separated list --default used for other gaps",
        default=None)
    parser.add_argument(
        "--onlyDefault",
        help="Add the default state (--default) no"
        " no all masked gaps no matter what. ie ignoring all other "
        "logic",
        action="store_true",
        default=False)
    parser.add_argument(
        "--cut",
        help="Cut out gaps for masked tracks from the input."
        " By default, the input is expected to come from the HMM "
        "with mask intervals already absent, and will crash on with"
        " an assertion error if an overlap is detected.",
        action="store_true",
        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed (optionally from a temporary copy with the mask
    # intervals cut out, which is deleted as soon as it has been read)
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        # nothing to fill: pass the input through unchanged
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        # release the bed tool temp path on this exit path as well
        cleanBedTool(tempBedToolPath)
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand(
            "cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s"
            % (maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

    # inputIntervals is known to be non-empty here (checked above)
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    with open(tempOutMask, "w") as tempOutMaskFile:
        for maskInterval in maskedIntervals:
            if maskInterval[2] - maskInterval[1] > args.maxLen:
                continue
            # find candidate right flank: the first input interval that does
            # not sort before the mask interval (None once input is exhausted)
            while rightFlank is not None and rightFlank < maskInterval:
                if inputIdx == len(inputIntervals) - 1:
                    rightFlank = None
                else:
                    inputIdx += 1
                    rightFlank = inputIntervals[inputIdx]

            # candidate left flank: the input interval just before the right
            # flank.  When the input is exhausted, inputIdx still points at
            # the last input interval, and that interval is the left flank.
            leftFlank = None
            if rightFlank is None:
                leftFlank = inputIntervals[inputIdx]
            elif inputIdx > 0:
                leftFlank = inputIntervals[inputIdx - 1]

            # identify flanking states if the intervals perfectly abut
            leftState = None
            if leftFlank is not None:
                if (leftFlank[0] == maskInterval[0]
                        and leftFlank[2] == maskInterval[1]):
                    leftState = str(leftFlank[3])
                else:
                    assert intersectSize(leftFlank, maskInterval) == 0
            rightState = None
            if rightFlank is not None:
                if (rightFlank[0] == maskInterval[0]
                        and rightFlank[1] == maskInterval[2]):
                    rightState = str(rightFlank[3])
                else:
                    assert intersectSize(rightFlank, maskInterval) == 0

            # choose a state for the mask interval
            maskState = str(args.default)
            if args.onlyDefault is True:
                pass
            elif leftState is not None and leftState == rightState:
                if len(tgtSet) == 0 or leftState in tgtSet:
                    maskState = leftState
            elif leftState in oneSidedTgtSet:
                maskState = leftState
            elif rightState in oneSidedTgtSet:
                maskState = rightState

            # write our mask interval
            tempOutMaskFile.write(
                "%s\t%d\t%d\t%s\n" %
                (maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    # append the labelled mask intervals to the original input, sort, and
    # clip to the merged target scope
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" %
                    (args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" %
                    (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" %
                    (tempMergePath2, tempScopePath, args.outBed))

    runShellCommand("rm -f %s" % " ".join([
        tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2, tempScopePath
    ]))
    cleanBedTool(tempBedToolPath)
Esempio n. 10
0
def removeOverlaps(inBed, outBed, args):
    """ Little hack to get this script working with different settings of
    ltr_finder where annotations can overlap.  To resolve overlaps, we choose
    the best element (by score, then total length), and delete anything it
    touches.  Intervals are grouped into elements by chromosome plus the part
    of the name after the last "|".

    inBed:  path of the input bed file (read through BedTool and sorted)
    outBed: path the surviving intervals are written to, in sorted order
    args:   passed through untouched to applyWeak() for every written interval

    TODO: incorporate Dougs script for his lastz stuff? """

    bedIntervals = [x for x in BedTool(inBed).sort()]

    def getLtrID(interval):
        return interval.chrom + interval.name[interval.name.rfind("|") + 1:]

    def compareScores(scoreA, scoreB):
        """ cmp()-style comparison: negative, zero or positive.  pybedtools
        hands scores back as strings, so compare them as numbers whenever
        both parse ("9" must not beat "10"); otherwise fall back to comparing
        the raw values as-is. """
        try:
            scoreA, scoreB = float(scoreA), float(scoreB)
        except (TypeError, ValueError):
            pass
        return (scoreA > scoreB) - (scoreA < scoreB)

    def overlapSize(a, b):
        return intersectSize((a.chrom, a.start, a.end),
                             (b.chrom, b.start, b.end))

    # pass 1: element sizes (summed over every interval of the element)
    ltrIds = [getLtrID(interval) for interval in bedIntervals]
    sizes = dict()
    for interval, ltrId in zip(bedIntervals, ltrIds):
        length = int(interval.end) - int(interval.start)
        sizes[ltrId] = sizes.get(ltrId, 0) + length

    # pass 2: greedy kill (not optimal for all transitive cases)
    # strategy: any pairwise overlap will be detected in either
    # the left or right scan of at least one of the overlapping
    # elements.
    dead = set()

    def scan(i, neighbours):
        """ Walk outward from interval i over the given indices until the
        first neighbour that does not overlap it.  Each live overlapping
        element either kills interval i's element (and stops the scan) or
        is killed by it. """
        interval = bedIntervals[i]
        ltrId = ltrIds[i]
        for j in neighbours:
            other = bedIntervals[j]
            if overlapSize(interval, other) <= 0:
                break
            otherId = ltrIds[j]
            if otherId in dead:
                continue
            # NOTE(review): if two intervals of the *same* element overlap,
            # otherId == ltrId and the element ends up killing itself --
            # confirm this cannot occur in the input.
            order = compareScores(other.score, interval.score)
            if order > 0 or (order == 0 and sizes[otherId] > sizes[ltrId]):
                dead.add(ltrId)
                break
            dead.add(otherId)

    for i in xrange(len(bedIntervals)):
        if ltrIds[i] in dead:
            continue
        scan(i, xrange(i - 1, -1, -1))
        if ltrIds[i] in dead:
            continue
        scan(i, xrange(i + 1, len(bedIntervals)))

    # pass 3: write non-killed
    # (with-block guarantees the output is flushed and closed on return)
    with open(outBed, "w") as outFile:
        for interval, ltrId in zip(bedIntervals, ltrIds):
            if ltrId in dead:
                continue
            if interval.strand == "?":
                interval.strand = "."
            applyWeak(interval, args)
            outFile.write(str(interval))
Esempio n. 11
0
    def testMask(self):
        """ Check that track tables loaded with masking line up with their
        unmasked counterparts once the mask running offsets are applied,
        first position-by-position and then segment-by-segment. """
        # presumably tracks-info 8 includes a mask and tracks-info 7 does
        # not (39 vs 50 rows over the same 50bp region) -- confirm against
        # the fixtures
        trackData1 = TrackData()
        trackData1.loadTrackData(getTracksInfoPath(8),
                                 [("scaffold_1", 0, 50)])        
        tableList1 = trackData1.getTrackTableList()

        trackData2 = TrackData()
        trackData2.loadTrackData(getTracksInfoPath(7),
                                 [("scaffold_1", 0, 50)])        
        tableList2 = trackData2.getTrackTableList()

        assert len(tableList1) == 1
        assert len(tableList2) == len(tableList1)
        table1 = tableList1[0]
        table2 = tableList2[0]

        assert len(table2) == 50
        assert len(table1) == 39

        maskOffsets1 = table1.getMaskRunningOffsets()
        maskOffsets2 = table2.getMaskRunningOffsets()

        # the 50-row table reports no offsets at all
        assert maskOffsets2 == None
        # row i of the shorter table must equal the row of the full table
        # found by shifting i by its running offset
        for i in xrange(len(table1)):
            v1 = table1[i]
            v2 = table2[i + maskOffsets1[i]]
            assert_array_equal(v1, v2)

        # now test segmentation
        # NOTE(review): statesPath and segPath are not referenced in the
        # visible part of this test; the paths are fetched again inline below
        statesPath = getStatesPath()
        segPath = getSegmentsPath()

        bedIntervals = getMergedBedIntervals(getStatesPath(), sort=True)
        segIntervals = readBedIntervals(getSegmentsPath(), sort=True)
        
        # same regions and segments loaded twice: tracks-info 7 and 9
        segTrackData2 = TrackData()
        segTrackData2.loadTrackData(getTracksInfoPath(7), bedIntervals,
                                   segmentIntervals=segIntervals,
                                   interpolateSegments=True)
        segTrackData3 = TrackData()
        segTrackData3.loadTrackData(getTracksInfoPath(9), bedIntervals,
                                   segmentIntervals=segIntervals,
                                   interpolateSegments=True)

        tlist3 = segTrackData2.getTrackTableList()
        tlist4 = segTrackData3.getTrackTableList()
        assert len(tlist4) == 3
        assert len(tlist3) == 4

        maskIntervals = getMergedBedIntervals(getMaskPath(), sort=True)

        tracks2 = segTrackData2.getTrackList()
        tracks3 = segTrackData3.getTrackList()

        # NOTE(review): the lists have different lengths (4 vs 3) yet are
        # paired by index here, so only the first three tables of tlist3 are
        # checked -- confirm that they correspond one-to-one with tlist4
        for i in xrange(len(tlist4)):
            t3 = tlist3[i]
            t4 = tlist4[i]
            maskOffsets = t4.getMaskRunningOffsets()
            assert maskOffsets is not None
            assert t3.getMaskRunningOffsets() is None
            # k walks the segments of t4; it only advances when the current
            # t3 segment is not covered by the mask
            k = 0
            for j in xrange(len(t3)):
                # interval covered by segment j of t3, as (chrom, start, end)
                start3 = t3.start + t3.segOffsets[j]
                len3 = t3.getSegmentLength(j)
                end3 = start3 + len3
                i3 = (t3.chrom, start3, end3)
                masked = False
                for x in maskIntervals:
                    if intersectSize(i3, x) > 0:
                        masked = True
                        # a segment touching the mask must lie entirely
                        # inside that mask interval
                        assert intersectSize(i3, x) == i3[2] - i3[1]
                if masked is False:
                    # t4's coordinates additionally need its running mask
                    # offset to land on the same interval as t3's segment
                    start4 = t4.start + t4.segOffsets[k] + maskOffsets[k]
                    len4 = t4.getSegmentLength(k)
                    end4 = start4 + len4
                    i4 = (t4.chrom, start4, end4)
                    assert_array_equal(i3, i4)
                    # compare values after mapping each column back through
                    # its track's value map
                    v3 = [tracks2.getTrackByNumber(x).getValueMap().getMapBack(t3[j][x]) for x in xrange(len(t3[j]))]
                    v4 = [tracks3.getTrackByNumber(x).getValueMap().getMapBack(t4[k][x]) for x in xrange(len(t4[k]))]
                    assert_array_equal(v3, v4)
                    k += 1