Esempio n. 1
0
def checkExactOverlap(bed1, bed2):
    """ Make sure two bed files cover the same region exactly: a requirement
    for all code based on the comparisons in this module.

    Raises RuntimeError with a diagnostic message if either input is empty,
    contains self-overlapping or out-of-order intervals, or covers regions
    the other input does not.
    """

    errorMessage = (
        "Bed files %s and %s cannot be compared. xxx. "
        " Input files must be both sorted, cover the exact same region,"
        " and contain no self-overlaps.") % (bed1, bed2)

    # empty file may break downstream comparisons
    if os.path.getsize(bed1) == 0 or os.path.getsize(bed2) == 0:
        raise RuntimeError(
            errorMessage.replace("xxx", "one or both inputs empty"))

    # test self-overlap and sorting of each input.
    # BUGFIX: the second input was previously validated by re-reading bed1,
    # so errors in bed2 went undetected; it now reads bed2 as intended.
    for inputName, bedPath in (("input1", bed1), ("input2", bed2)):
        intervals = readBedIntervals(bedPath, sort=False)
        for i in xrange(1, len(intervals)):
            if intersectSize(intervals[i - 1], intervals[i]) != 0:
                raise RuntimeError(
                    errorMessage.replace(
                        "xxx", "Overlapping intervals %s and %s found in %s" %
                        (intervals[i - 1], intervals[i], inputName)))
            if intervals[i - 1] > intervals[i]:
                raise RuntimeError(
                    errorMessage.replace(
                        "xxx", "Out of order intervals %s and %s found in %s" %
                        (intervals[i - 1], intervals[i], inputName)))

    # test symmetric difference: each file must be entirely contained in the
    # other.  try/finally guarantees the temp file is removed on every path
    # (previously it leaked if subtractBed itself failed).
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        runShellCommand("subtractBed -a %s -b %s > %s" % (bed1, bed2, tempFile))
        if os.path.getsize(tempFile) != 0:
            raise RuntimeError(
                errorMessage.replace("xxx",
                                     "Input1 covers regions outside input2"))
        runShellCommand("subtractBed -a %s -b %s > %s" % (bed2, bed1, tempFile))
        if os.path.getsize(tempFile) != 0:
            raise RuntimeError(
                errorMessage.replace("xxx",
                                     "Input2 covers regions outside input1"))
    finally:
        runShellCommand("rm -f %s" % tempFile)
Esempio n. 2
0
def checkExactOverlap(bed1, bed2):
    """ make sure two bed files cover same region exactly: a requirement for all
    code based on the comparisons in this module.

    Raises RuntimeError if either file is empty, has self-overlapping or
    unsorted intervals, or covers regions outside the other file.
    """

    errorMessage = ("Bed files %s and %s cannot be compared. xxx. "
    " Input files must be both sorted, cover the exact same region,"
    " and contain no self-overlaps.") % (bed1, bed2)

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        raise RuntimeError(errorMessage.replace("xxx", "one or both inputs empty"))

    # test self-overlap and sorting of input1
    intervals1 = readBedIntervals(bed1, sort=False)
    for i in xrange(1, len(intervals1)):
        if intersectSize(intervals1[i-1], intervals1[i]) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Overlapping intervals %s and %s found in input1" % (
                    intervals1[i-1], intervals1[i])))
        if intervals1[i-1] > intervals1[i]:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Out of order intervals %s and %s found in input1" % (
                    intervals1[i-1], intervals1[i])))

    # test self-overlap and sorting of input2
    # BUGFIX: previously read bed1 here, so input2 was never validated
    intervals2 = readBedIntervals(bed2, sort=False)
    for i in xrange(1, len(intervals2)):
        if intersectSize(intervals2[i-1], intervals2[i]) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Overlapping intervals %s and %s found in input2" % (
                    intervals2[i-1], intervals2[i])))
        if intervals2[i-1] > intervals2[i]:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Out of order intervals %s and %s found in input2" % (
                    intervals2[i-1], intervals2[i])))

    # test intersection size: neither file may cover bases outside the other
    tempFile = getLocalTempPath("Temp_test", ".bed")
    runShellCommand("subtractBed -a %s -b %s > %s" % (bed1, bed2, tempFile))
    if os.path.getsize(tempFile) != 0:
        runShellCommand("rm -f %s" % tempFile)
        raise RuntimeError(errorMessage.replace(
            "xxx", "Input1 covers regions outside input2"))
    runShellCommand("subtractBed -a %s -b %s > %s" % (bed2, bed1, tempFile))
    if os.path.getsize(tempFile) != 0:
        runShellCommand("rm -f %s" % tempFile)
        raise RuntimeError(errorMessage.replace(
            "xxx", "Input2 covers regions outside input1"))
    runShellCommand("rm -f %s" % tempFile)
Esempio n. 3
0
def main(argv=None):
    """ Randomly subselect intervals from a bed file until a target number of
    bases has been sampled, writing the sorted sample to stdout. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Subselect some output of chunkBedRegions.py")
    parser.add_argument("inBed", help="Input bed file (generated with chunkBedRegions.py)")
    parser.add_argument("sampleSize", help="Desired sample size (in bases).", type=int)

    args = parser.parse_args()
    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)

    bedIntervals = readBedIntervals(args.inBed)
    outIntervals = []

    curSize = 0

    # dumb n^2 alg should be enough for our current purposes
    while curSize < args.sampleSize and len(bedIntervals) > 0:
        idx = random.randint(0, len(bedIntervals)-1)
        interval = bedIntervals[idx]
        sampleLen = interval[2] - interval[1]
        if sampleLen + curSize > args.sampleSize:
            # truncate the interval to exactly fill the remaining quota.
            # BUGFIX: previously this assigned the *overshoot*
            # ((sampleLen + curSize) - sampleSize), so the total sample
            # could exceed the requested size.
            sampleLen = args.sampleSize - curSize
            interval = (interval[0], interval[1], interval[1] + sampleLen)
        outIntervals.append(interval)
        curSize += sampleLen
        del bedIntervals[idx]

    for interval in sorted(outIntervals):
        sys.stdout.write("%s\t%d\t%d\n" % interval)

    cleanBedTool(tempBedToolPath)
Esempio n. 4
0
def baserize(inBed, outBed):
    """ Write a copy of inBed to outBed in which every interval is split into
    consecutive single-base intervals. """
    outFile = open(outBed, "w")
    for iv in readBedIntervals(inBed):
        for pos in xrange(iv[1], iv[2]):
            outFile.write("%s\t%d\t%d\n" % (iv[0], pos, pos + 1))
    outFile.close()
Esempio n. 5
0
def fillGaps(inBed):
    """ Build two interval sets from a given bed file:
      filledIntervals: the input intervals plus filler intervals (labelled
                       with the global filTok) inserted between consecutive
                       intervals on the same seq (ala addBedGaps.py)
      mergedIntervals: one spanning interval per continuous region of the
                       above (ala getMergedIntervals)
    probably reimplementing stuff but oh well """
    intervals = readBedIntervals(inBed, ncol=4, sort=True)
    if not intervals:
        return [], []

    filledIntervals = []
    mergedIntervals = []
    prev = None
    for cur in intervals:
        sameSeq = prev is not None and prev[0] == cur[0]
        if sameSeq and prev[2] != cur[1]:
            # discontinuity on the same sequence: insert a filler interval
            assert prev[2] < cur[1]
            filledIntervals.append((cur[0], prev[2], cur[1], filTok))
        if sameSeq:
            # same sequence: stretch the open merged interval to cur's end
            head = mergedIntervals[-1]
            mergedIntervals[-1] = (head[0], head[1], cur[2], head[3])
        else:
            # new sequence: open a fresh merged interval
            mergedIntervals.append(cur)

        # the current interval always goes into the filled set
        filledIntervals.append(cur)
        prev = cur

    return filledIntervals, mergedIntervals
Esempio n. 6
0
def combineTrack(track, outPath, tempRegionPath, iter, args):
    """ merge track with outPath

    Pipeline: normalize the track's bed to chrom/start/end/state form,
    intersect it with the target region, fill uncovered gaps with the
    args.outside state, reconcile state names against previous iterations'
    output (skipped on iteration 0), then merge the result into outPath and
    remove overlaps.  NOTE(review): 'iter' shadows the builtin; kept for
    interface compatibility.  All temp files are removed on the normal path,
    but leak if a shell command raises.
    """

    # make sure track is of form chrom start end state
    tempColPath = getLocalTempPath("Temp", "_col.bed")
    tempColFile = open(tempColPath, "w")
    vc = track.getValCol() + 1
    if track.getDist() == "binary":
        assert track.getName() != args.outside
        # binary tracks carry no usable value column; only chrom/start/end
        # are read and the state name is appended below
        vc = 3
    bedIntervals = readBedIntervals(track.getPath(), vc,
                                    sort = True)
    for bedInterval in bedIntervals:
        outStr = "\t".join([str(x) for x in bedInterval])
        if track.getDist() == "binary":
            # state name = track name for binary track
            outStr += "\t%s" % track.getName()
        outStr += "\n"
        tempColFile.write(outStr)
    tempColFile.close()

    # intersect the target region
    tempIntersectPath = getLocalTempPath("Temp", "_int.bed")
    runShellCommand("intersectBed -a %s -b %s > %s" % (
        tempColPath, tempRegionPath, tempIntersectPath))

    # add the outside states
    tempGappedPath = getLocalTempPath("Temp", "_gap.bed")
    runShellCommand("addBedGaps.py --state %s %s %s %s" % (
        args.outside, tempRegionPath, tempIntersectPath, tempGappedPath))

    # fit the names with previous iterations' result
    tempFitPath = getLocalTempPath("Temp", "_fit.bed")
    if iter == 0:
        # nothing to fit against yet on the first iteration
        runShellCommand("cp %s %s" % (tempGappedPath, tempFitPath))
    else:
        runShellCommand("fitStateNames.py %s %s %s --qualThresh %f --ignoreTgt %s" % (
            outPath, tempGappedPath, tempFitPath, args.fitThresh, args.outside))

    # now merge into outPath (tempColPath is reused as scratch here)
    runShellCommand("cat %s >> %s" % (tempFitPath, outPath))
    runShellCommand("removeBedOverlaps.py %s > %s" % (outPath, tempColPath))
    runShellCommand("mv %s %s" % (tempColPath, outPath))

    # clean crap (note tempCol should already be gone)
    runShellCommand("rm -f %s" % tempColPath)
    runShellCommand("rm -f %s" % tempIntersectPath)
    runShellCommand("rm -f %s" % tempGappedPath)
    runShellCommand("rm -f %s" % tempFitPath)
Esempio n. 7
0
def makeNearnessBED(intervals, args):
    """ for each interval, measure distance to nearest interval in
    args.nearness and write it as the score column """
    compIntervals = readBedIntervals(args.nearness, ncol=4, sort=True)
    if len(intervals) == 0:
        return ""

    # only correct if sorted non-overlapping
    lines = []
    cursor = 0
    for iv in intervals:
        best = sys.maxint
        j = cursor
        while j < len(compIntervals):
            d = distance(iv, compIntervals[j])
            if d > best:
                # distances start growing again: nearest already found
                break
            cursor, best = j, d
            j += 1
        lines.append("%s\t%d\t%d\t%s\t%d\n" % (
            iv[0], iv[1], iv[2], iv[3], best))
    return "".join(lines)
Esempio n. 8
0
def makeNearnessBED(intervals, args):
    """ for each interval, measure distance to the nearest interval in
    args.nearness and write it as the score column """
    refIntervals = readBedIntervals(args.nearness, ncol = 4, sort = True)
    if not intervals:
        return ""

    # assumes both interval lists are sorted and non-overlapping
    outBedString = ""
    start = 0
    for query in intervals:
        bestDist = sys.maxint
        for idx in xrange(start, len(refIntervals)):
            curDist = distance(query, refIntervals[idx])
            if curDist > bestDist:
                # past the nearest reference interval; stop scanning
                break
            start, bestDist = idx, curDist
        outBedString += "%s\t%d\t%d\t%s\t%d\n" % (query[0], query[1],
                                                  query[2], query[3], bestDist)
    return outBedString
Esempio n. 9
0
def main(argv=None):
    """ Randomly subselect intervals from a bed file until a target number of
    bases has been sampled, writing the sorted sample to stdout. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Subselect some output of chunkBedRegions.py")
    parser.add_argument(
        "inBed", help="Input bed file (generated with chunkBedRegions.py)")
    parser.add_argument("sampleSize",
                        help="Desired sample size (in bases).",
                        type=int)

    args = parser.parse_args()
    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)

    bedIntervals = readBedIntervals(args.inBed)
    outIntervals = []

    curSize = 0

    # dumb n^2 alg should be enough for our current purposes
    while curSize < args.sampleSize and len(bedIntervals) > 0:
        idx = random.randint(0, len(bedIntervals) - 1)
        interval = bedIntervals[idx]
        sampleLen = interval[2] - interval[1]
        if sampleLen + curSize > args.sampleSize:
            # truncate the interval to exactly fill the remaining quota.
            # BUGFIX: previously this assigned the *overshoot*
            # ((sampleLen + curSize) - sampleSize), so the total sample
            # could exceed the requested size.
            sampleLen = args.sampleSize - curSize
            interval = (interval[0], interval[1], interval[1] + sampleLen)
        outIntervals.append(interval)
        curSize += sampleLen
        del bedIntervals[idx]

    for interval in sorted(outIntervals):
        sys.stdout.write("%s\t%d\t%d\n" % interval)

    cleanBedTool(tempBedToolPath)
def fillGaps(inBed):
    """ Make two interval sets from a given bed file:
      filledIntervals: the input intervals plus filler intervals (labelled
                       with the global filTok) added between consecutive
                       intervals on the same seq (ala addBedGaps.py)
      mergedIntervals: intervals spanning each continuous region from the
                       above (ala getMergedIntervals)
    probably reimplementing stuff but oh well """
    intervals = readBedIntervals(inBed, ncol=4, sort=True)
    if len(intervals) == 0:
        return [], []

    filledIntervals = []
    mergedIntervals = []
    prevInterval = None
    for interval in intervals:
        onSameSeq = (prevInterval is not None
                     and prevInterval[0] == interval[0])
        if onSameSeq and prevInterval[2] != interval[1]:
            # gap between consecutive intervals: add a filler record
            assert prevInterval[2] < interval[1]
            filledIntervals.append(
                (interval[0], prevInterval[2], interval[1], filTok))
        if onSameSeq:
            # extend the current merged interval through this one
            last = mergedIntervals[-1]
            mergedIntervals[-1] = (last[0], last[1], interval[2], last[3])
        else:
            # first interval on a new sequence starts a merged interval
            mergedIntervals.append(interval)

        filledIntervals.append(interval)
        prevInterval = interval

    return filledIntervals, mergedIntervals
Esempio n. 11
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compare two bed files where Model states are represented"
        " in a column.  Used to determine sensitivity and specificity.  NOTE"
        " that both bed files must be sorted and cover the exact same regions"
        " of the same genome.")

    parser.add_argument("bed1", help="Bed file (TRUTH)")
    parser.add_argument("bed2", help="Bed file covering same regions in same"
                        " order as bed1")
    parser.add_argument("--col", help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default = 4, type = int)
    parser.add_argument("--thresh", help="Threshold to consider interval from"
                        " bed1 covered by bed2.",
                        type=float, default=0.8)
    parser.add_argument("--plot", help="Path of file to write Precision/Recall"
                        " graphs to in PDF format", default=None)
    parser.add_argument("--ignore", help="Comma-separated list of stateNames to"
                        " ignore", default=None)
    parser.add_argument("--strictPrec", help="By default, precision is computed"
                        " in a manner strictly symmetric to recall.  So calling"
                        " compareBedStates.py A.bed B.bed would give the exact"
                        " same output as compareBedStates.py B.bed A.bed except"
                        " precision and recall values would be swapped.  With "
                        " this option, a predicted element only counts toward"
                        " precision if it overlaps with 80pct of the true"
                        " element, as opposed to only needing 80pct of itself"
                        " overlapping with the true element. ",
                        action="store_true", default = False)
    parser.add_argument("--noBase", help="Skip base-level stats (and only show"
                        " interval stats).  Runs faster", action="store_true",
                        default=False)
    parser.add_argument("--noFrag", help="Do not allow fragmented matches in"
                        "interval predictions.  ie if a single truth interval"
                        " is covered by a series of predicted intervals, only "
                        "the best match will be counted if this flag is used", 
                        action="store_true", default=False)
    parser.add_argument("--tl", help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)", default=None)
    parser.add_argument("--delMask", help="Entirely remove intervals from "
                        "mask tracks that are > given length.  Probably "
                        "only want to set to non-zero value K if using"
                        " with a prediction that was processed with "
                        "interpolateMaskedRegions.py --max K",
                        type=int, default=0)
    parser.add_argument("--window", help="A comma-delimited 5-tuple of "
                        "windowSize,stateName,compType,score,outBed.  "
                        "Where windowSize  is the sliding window size "
                        "(overlap .5), stateName is target stateName,"
                        " compType is in {base,interval,weighted}, sore is"
                        " in {f1,precision,recall} and "
                        "outBed is the path of a bedFile to write positional"
                        " accuracy to.  For example, --window 1000000,TE,base,f1"
                        ",acc.bed will write base-level f1 for 1MB sliding windows"
                        " to acc.bed.  These can be viewed on the browser by first"
                        " converting to BigWig.", default=None)

    args = parser.parse_args()
    tempBedToolPath = initBedTool()

    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()

    assert args.col == 4 or args.col == 5
    print "Commandline %s" % " ".join(sys.argv)
    origArgs = copy.deepcopy(args)
    
    tempFiles = []
    if args.tl is not None:
        cutBed1 = cutOutMaskIntervals(args.bed1, args.delMask,
                                      sys.maxint, args.tl)
        cutBed2 = cutOutMaskIntervals(args.bed2, args.delMask,
                                      sys.maxint, args.tl)
        if cutBed1 is not None:
            assert cutBed2 is not None
            tempFiles += [cutBed1, cutBed2]
            args.bed1 = cutBed1
            args.bed2 = cutBed2

    checkExactOverlap(args.bed1, args.bed2)

    if args.window is not None:
        runPositionalComparison(argv, origArgs)

    intervals1 = readBedIntervals(args.bed1, ncol = args.col)
    intervals2 = readBedIntervals(args.bed2, ncol = args.col)

    if args.noBase is False:
        stats = compareBaseLevel(intervals1, intervals2, args.col - 1)[0]

        totalRight, totalWrong, accMap = summarizeBaseComparision(stats, args.ignore)
        print "Base counts [False Negatives, False Positives, True Positives]:"
        print stats
        totalBoth = totalRight + totalWrong
        accuracy = float(totalRight) / float(totalBoth)
        print "Accuaracy: %d / %d = %f" % (totalRight, totalBoth, accuracy)
        print "State-by-state (Precision, Recall):"
        print "Base-by-base Accuracy"    
        print accMap

    trueStats = compareIntervalsOneSided(intervals1, intervals2, args.col -1,
                                         args.thresh, False, not args.noFrag)[0]
    predStats = compareIntervalsOneSided(intervals2, intervals1, args.col -1,
                                         args.thresh, args.strictPrec,
                                         not args.noFrag)[0]
    intAccMap = summarizeIntervalComparison(trueStats, predStats, False,
                                            args.ignore)
    intAccMapWeighted = summarizeIntervalComparison(trueStats, predStats, True,
                                                     args.ignore)
    print "\nInterval Accuracy"
    print intAccMap
    print ""

    print "\nWeighted Interval Accuracy"
    print intAccMapWeighted
    print ""


    # print some row data to be picked up by scrapeBenchmarkRow.py
    if args.noBase is False:
        header, row = summaryRow(accuracy, stats, accMap)
        print " ".join(header)
        print " ".join(row)

    # make graph
    if args.plot is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write plots.  Maybe matplotlib is "
                               "not installed?")
        writeAccPlots(accuracy, accMap, intAccMap, intAccMapWeighted,
                      args.thresh, args.plot)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
Esempio n. 12
0
def main(argv=None):
    """ Fill masked gaps of an HMM prediction bed with states inferred from
    the flanking intervals.

    Masked intervals (from the mask tracks in tracksXML) no longer than
    --maxLen are assigned a state: the shared flanking state when both sides
    agree (subject to --tgts), a one-sided flank state (--oneSidedTgts), or
    --default otherwise.  The filled intervals are merged with the input
    prediction, sorted, and clipped to the scope of allBed.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML",
                        help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed",
                        help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument(
        "inBed",
        help="TE prediction BED file.  State labels"
        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed",
                        help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument(
        "--maxLen",
        help="Maximum length of a masked interval"
        " to fill (inclusive). Use --delMask option with same value"
        "if running compareBedStates.py after.",
        type=int,
        default=sys.maxint)
    parser.add_argument("--default",
                        help="Default label to give to masked "
                        "region if no label can be determined",
                        default="0")
    parser.add_argument(
        "--tgts",
        help="Only relabel gaps that "
        "are flanked on both sides by the same state, and this state"
        " is in this comma- separated list. --default used for other"
        " gaps.  If not targetst specified then all states checked.",
        default=None)
    parser.add_argument(
        "--oneSidedTgts",
        help="Only relabel gaps that "
        "are flanked on at least one side by a state in this comma-"
        "separated list --default used for other gaps",
        default=None)
    parser.add_argument(
        "--onlyDefault",
        help="Add the default state (--default) no"
        " no all masked gaps no matter what. ie ignoring all other "
        "logic",
        action="store_true",
        default=False)
    parser.add_argument(
        "--cut",
        help="Cut out gaps for masked tracks from the input."
        " By default, the input is expected to come from the HMM "
        "with mask intervals already absent, and will crash on with"
        " an assertion error if an overlap is detected.",
        action="store_true",
        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets of target states; a state may not be in both sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed (optionally cutting out masked regions first)
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        # the cut copy is a temp file; the original is untouched
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        # nothing to fill: pass the input through unchanged
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand(
            "cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s"
            % (maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)
    resolvedMasks = 0

    # NOTE(review): unreachable -- the len(inputIntervals) == 0 case already
    # returned above; also this path would leak tempMaskBed
    if len(inputIntervals) == 0:
        logger.warning("No mask tracks located in %s" % args.tracksXML)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    # sweep both sorted interval lists in tandem, assigning a state to each
    # sufficiently short masked interval from its flanking predictions
    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[
                    2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                # non-abutting flank must not overlap the mask
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[
                    1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            # both flanks agree: use that state if targeted (or no targets)
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write(
            "%s\t%d\t%d\t%s\n" %
            (maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    tempOutMaskFile.close()
    # merge filled intervals with the original prediction, sort, and clip
    # to the (merged) scope of allBed
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" %
                    (args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" %
                    (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" %
                    (tempMergePath2, tempScopePath, args.outBed))

    runShellCommand("rm -f %s" % " ".join([
        tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2, tempScopePath
    ]))
    cleanBedTool(tempBedToolPath)
Esempio n. 13
0
def baserize(inBed, outBed):
    """ Expand every interval of inBed into single-base intervals, written
    to outBed. """
    with open(outBed, "w") as outFile:
        for iv in readBedIntervals(inBed):
            chrom = iv[0]
            for base in xrange(iv[1], iv[2]):
                outFile.write("%s\t%d\t%d\n" % (chrom, base, base + 1))
Esempio n. 14
0
def main(argv=None):
    """Command-line entry point: relabel the state names of a predicted BED
    file so they best match a target (truth) BED annotation, using the same
    comparison logic as compareBedStates.py.

    Writes the relabeled prediction to args.outBed.  Optionally also renames
    states inside an HMM model file (--model) and renders the confusion
    matrix as a heatmap (--hm).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Given two bed files: a prediction and a true (or target)"
        " annotation, re-label the prediction's state names so that they "
        " best match the true annotation.  Usees same logic as "
        " compareBedStates.py for determining accuracy")

    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label. ")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col",
                        help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4,
                        type=int)
    parser.add_argument(
        "--intThresh",
        help="Threshold to consider interval from"
        " tgtBed covered by predBed.  If not specified, then base"
        " level statistics will be used. Value in range (0,1]",
        type=float,
        default=None)
    parser.add_argument("--noFrag",
                        help="Dont allow fragmented interval matches ("
                        "see help for --frag in compareBedStates.py).  Only"
                        " relevant with --intThresh",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--qualThresh",
        help="Minimum match ratio between truth"
        " and prediction to relabel prediction.  Example, if"
        " predicted state X overlaps target state LTR 25 pct of "
        "the time, then qualThresh must be at least 0.25 to "
        "label X as LTR in the output.  Value in range (0, 1]",
        type=float,
        default=0.1)
    parser.add_argument("--ignore",
                        help="Comma-separated list of stateNames to"
                        " ignore (in prediction)",
                        default=None)
    parser.add_argument("--ignoreTgt",
                        help="Comma-separated list of stateNames to"
                        " ignore (int target)",
                        default=None)
    parser.add_argument("--tgt",
                        help="Comma-separated list of stateNames to "
                        " consider (in target).  All others will be ignored",
                        default=None)
    parser.add_argument(
        "--unique",
        help="If more than one predicted state maps"
        " to the same target state, add a unique id (numeric "
        "suffix) to the output so that they can be distinguished",
        action="store_true",
        default=False)
    parser.add_argument("--model",
                        help="Apply state name mapping to the model"
                        " in the specified path (it is strongly advised to"
                        " make a backup of the model first)",
                        default=None)
    parser.add_argument("--noMerge",
                        help="By default, adjacent intervals"
                        " with the same state name in the output are "
                        "automatically merged into a single interval.  This"
                        " flag disables this.",
                        action="store_true",
                        default=False)
    parser.add_argument("--hm",
                        help="Write confusion matrix as heatmap in PDF"
                        " format to specified file",
                        default=None)
    parser.add_argument("--old",
                        help="Use old name mapping logic which just "
                        "takes biggest overlap in forward confusion matrix.  "
                        "faster than new default logic which does the greedy"
                        " f1 optimization",
                        action="store_true",
                        default=False)
    parser.add_argument("--fdr",
                        help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float,
                        default=None)
    parser.add_argument("--tl",
                        help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)",
                        default=None)
    parser.add_argument(
        "--colOrder",
        help="List of states used to force"
        " ordering in heatmap (otherwise alphabetical) columns. These"
        " states will correspond to the tgtBed when --old used and"
        " --predBed otherwise.",
        default=None)
    parser.add_argument(
        "--hmCovRow",
        help="Path to write 1-row heatmap of "
        "state coverage (fraction of bases). only works with --hm",
        default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # normalize the comma-separated list options into (possibly empty) sets
    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")

    assert args.col == 4 or args.col == 5

    # optionally cut mask-track intervals out of both inputs before comparing
    tempFiles = []
    if args.tl is not None:
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint, args.tl)

        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    # inputs must be sorted, self-overlap free, and cover the same region
    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol=args.col)
    intervals2 = readBedIntervals(args.predBed, ncol=args.col)
    cfName = "reverse"

    # --old flips the comparison direction (forward confusion matrix)
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1,
                                           args.col - 1, args.intThresh, False,
                                           not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]

    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt, args.ignoreTgt,
                                             args.ignore, args.qualThresh,
                                             args.fdr)

    # filter the stateMap to take into account the command-line options
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))

    # write the model if specified
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)

    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col - 1,
                   args.noMerge, args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap.  Maybe matplotlib is "
                               "not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    # clean up temp files from the --tl cutting step
    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
Esempio n. 15
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Evaluate a given data set with a trained HMM. Display"
        " the log probability of the input data given the model, and "
        "optionally output the most likely sequence of hidden states.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inputModel", help="Path of hmm created with"
                        "teHmmTrain.py")
    parser.add_argument("bedRegions", help="Intervals to process")
    parser.add_argument("--bed", help="path of file to write viterbi "
                        "output to (most likely sequence of hidden states)",
                        default=None)
    parser.add_argument("--numThreads", help="Number of threads to use (only"
                        " applies to CFG parser for the moment)",
                        type=int, default=1)
    parser.add_argument("--slice", help="Make sure that regions are sliced"
                        " to a maximum length of the given value.  Most "
                        "useful when model is a CFG to keep memory down. "
                        "When 0, no slicing is done",
                        type=int, default=0)
    parser.add_argument("--segment", help="Use the intervals in bedRegions"
                        " as segments which each count as a single column"
                        " for evaluattion.  Note the model should have been"
                        " trained with the --segment option pointing to this"
                        " same bed file.", action="store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)    
    parser.add_argument("--maxPost", help="Use maximum posterior decoding instead"
                        " of Viterbi for evaluation", action="store_true",
                        default=False)
    parser.add_argument("--pd", help="Output BED file for posterior distribution. Must"
                        " be used in conjunction with --pdStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--pdStates", help="comma-separated list of state names to use"
                        " for computing posterior distribution.  For example: "
                        " --pdStates inside,LTR_left,LTR_right will compute the probability"
                        ", for each observation, that the hidden state is inside OR LTR_left"
                        " OR LTR_right.  Must be used with --pd to specify output "
                        "file.", default=None)
    parser.add_argument("--bic", help="save Bayesian Information Criterion (BIC) score"
                        " in given file", default=None)
    parser.add_argument("--ed", help="Output BED file for emission distribution. Must"
                        " be used in conjunction with --edStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--edStates", help="comma-separated list of state names to use"
                        " for computing emission distribution.  For example: "
                        " --edStates inside,LTR_left for each obsercation the probability "
                        " that inside emitted that observaiton plus the probabillity that"
                        " LTR_left emitted it. If more than one state is selected, this "
                        " is not a distribution, but a sum of distributions (and values"
                        " can exceed 1).  Mostly for debugging purposes. Note output in LOG",
                         default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions, to run in parallel"
                        " (in BED format).  input regions will be intersected with each line"
                        " in this file, and the result will correspsond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="number of processes (use in conjunction with --chroms)",
                        type=int, default=1)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    if args.slice <= 0:
        args.slice = sys.maxint
    elif args.segment is True:
        raise RuntimeError("--slice and --segment options are not compatible at "
                           "this time")
    if (args.pd is not None) ^ (args.pdStates is not None):
        raise RuntimeError("--pd requires --pdStates and vice versa")
    if (args.ed is not None) ^ (args.edStates is not None):
        raise RuntimeError("--ed requires --edStates and vice versa")
    if args.bed is None and (args.pd is not None or args.ed is not None):
        raise RuntimeError("Both --ed and --pd only usable in conjunction with"
                           " --bed")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun 
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0
    
    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    if isinstance(model, MultitrackCfg):
        if args.maxPost is True:
           raise RuntimeErorr("--post not supported on CFG models")

    # apply the effective segment length
    if args.segLen > 0:
        assert args.segment is True
        model.getEmissionModel().effectiveSegmentLength = args.segLen
        
    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.bedRegions)
    mergedIntervals = getMergedBedIntervals(args.bedRegions, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.bedRegions)

    # slice if desired
    choppedIntervals = [x for x in slicedIntervals(mergedIntervals, args.slice)]

    # read segment intervals
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.bedRegions)
        segIntervals = readBedIntervals(args.bedRegions, sort=True)

    # load the input
    # read the tracks, while intersecting them with the given interval
    trackData = TrackData()
    # note we pass in the trackList that was saved as part of the model
    # because we do not want to generate a new one.
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData.loadTrackData(args.tracksInfo, choppedIntervals, 
                            model.getTrackList(),
                            segmentIntervals=segIntervals)

    # do the viterbi algorithm
    if isinstance(model, MultitrackHmm):
        algname = "viterbi"
        if args.maxPost is True:
            algname = "posterior decoding"
        logger.info("running %s algorithm" % algname)
    elif isinstance(model, MultitrackCfg):
        logger.info("running CYK algorithm")

    vitOutFile = None
    if args.bed is not None:
        vitOutFile = open(args.bed, "w")
    totalScore = 0
    tableIndex = 0
    totalDatapoints = 0

    # Note: in general there's room to save on memory by only computing single
    # track table at once (just need to add table by table interface to hmm...)
    
    posteriors = [None] * trackData.getNumTrackTables()
    posteriorsFile = None
    posteriorsMask = None
    if args.pd is not None:
        posteriors = model.posteriorDistribution(trackData)
        posteriorsFile = open(args.pd, "w")
        posteriorsMask = getPosteriorsMask(args.pdStates, model)
        assert len(posteriors[0][0]) == len(posteriorsMask)
    emProbs = [None] * trackData.getNumTrackTables()
    emissionsFile = None
    emissionsMask = None
    if args.ed is not None:
        emProbs = model.emissionDistribution(trackData)
        emissionsFile = open(args.ed, "w")
        emissionsMask = getPosteriorsMask(args.edStates, model)
        assert len(emProbs[0][0]) == len(emissionsMask)

    
    decodeFunction = model.viterbi
    if args.maxPost is True:
        decodeFunction = model.posteriorDecode

    for i, (vitLogProb, vitStates) in enumerate(decodeFunction(trackData,
                                                numThreads=args.numThreads)):
        totalScore += vitLogProb
        if args.bed is not None or args.pd is not None:
            if args.bed is not None:
                vitOutFile.write("#Viterbi Score: %f\n" % (vitLogProb))
            trackTable = trackData.getTrackTableList()[tableIndex]
            tableIndex += 1
            statesToBed(trackTable,
                        vitStates, vitOutFile, posteriors[i], posteriorsMask,
                        posteriorsFile, emProbs[i], emissionsMask, emissionsFile)
            totalDatapoints += len(vitStates) * trackTable.getNumTracks()

    print "Viterbi (log) score: %f" % totalScore
    if isinstance(model, MultitrackHmm) and model.current_iteration is not None:
        print "Number of EM iterations: %d" % model.current_iteration
    if args.bed is not None:
        vitOutFile.close()
    if posteriorsFile is not None:
        posteriorsFile.close()
    if emissionsFile is not None:
        emissionsFile.close()

    if args.bic is not None:
        bicFile = open(args.bic, "w")
        # http://en.wikipedia.org/wiki/Bayesian_information_criterion
        lnL = float(totalScore)
        try:
            k = float(model.getNumFreeParameters())
        except:
            # numFreeParameters still not done for semi-supervised
            # just pass through a 0 instead of crashing for now
            k = 0.0 
        n = float(totalDatapoints)
        bic = -2.0 * lnL + k * (np.log(n) + np.log(2 * np.pi))
        bicFile.write("%f\n" % bic)
        bicFile.write("# = -2.0 * lnL + k * (lnN + ln(2 * np.pi))\n"
                      "# where lnL=%f  k=%d (%d states)  N=%d (%d obs * %d tracks)  lnN=%f\n" % (
            lnL, int(k), model.getEmissionModel().getNumStates(), int(totalDatapoints),
            totalDatapoints / model.getEmissionModel().getNumTracks(),
            model.getEmissionModel().getNumTracks(), np.log(n)))
        bicFile.close()

    cleanBedTool(tempBedToolPath)
Esempio n. 16
0
def main(argv=None):
    """Command-line entry point: relabel the state names of a predicted BED
    file so they best match a target (truth) BED annotation, using the same
    comparison logic as compareBedStates.py.

    Writes the relabeled prediction to args.outBed.  Optionally also renames
    states inside an HMM model file (--model) and renders the confusion
    matrix as a heatmap (--hm).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Given two bed files: a prediction and a true (or target)"
         " annotation, re-label the prediction's state names so that they "
         " best match the true annotation.  Usees same logic as "
         " compareBedStates.py for determining accuracy")

    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label. ")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col", help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default = 4, type = int)
    parser.add_argument("--intThresh", help="Threshold to consider interval from"
                        " tgtBed covered by predBed.  If not specified, then base"
                        " level statistics will be used. Value in range (0,1]",
                        type=float, default=None)
    parser.add_argument("--noFrag", help="Dont allow fragmented interval matches ("
                        "see help for --frag in compareBedStates.py).  Only"
                        " relevant with --intThresh", action="store_true",
                        default=False)
    parser.add_argument("--qualThresh", help="Minimum match ratio between truth"
                        " and prediction to relabel prediction.  Example, if"
                        " predicted state X overlaps target state LTR 25 pct of "
                        "the time, then qualThresh must be at least 0.25 to "
                        "label X as LTR in the output.  Value in range (0, 1]",
                        type=float, default=0.1)
    parser.add_argument("--ignore", help="Comma-separated list of stateNames to"
                        " ignore (in prediction)", default=None)
    parser.add_argument("--ignoreTgt", help="Comma-separated list of stateNames to"
                        " ignore (int target)", default=None)
    parser.add_argument("--tgt", help="Comma-separated list of stateNames to "
                        " consider (in target).  All others will be ignored",
                        default=None)
    parser.add_argument("--unique", help="If more than one predicted state maps"
                        " to the same target state, add a unique id (numeric "
                        "suffix) to the output so that they can be distinguished",
                        action="store_true", default=False)
    parser.add_argument("--model", help="Apply state name mapping to the model"
                        " in the specified path (it is strongly advised to"
                        " make a backup of the model first)", default=None)
    parser.add_argument("--noMerge", help="By default, adjacent intervals"
                        " with the same state name in the output are "
                        "automatically merged into a single interval.  This"
                        " flag disables this.", action="store_true",
                        default=False)
    parser.add_argument("--hm", help="Write confusion matrix as heatmap in PDF"
                        " format to specified file", default = None)
    parser.add_argument("--old", help="Use old name mapping logic which just "
                        "takes biggest overlap in forward confusion matrix.  "
                        "faster than new default logic which does the greedy"
                        " f1 optimization", action="store_true", default=False)
    parser.add_argument("--fdr", help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float, default=None)
    parser.add_argument("--tl", help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)", default=None)
    parser.add_argument("--colOrder", help="List of states used to force"
                        " ordering in heatmap (otherwise alphabetical) columns. These"
                        " states will correspond to the tgtBed when --old used and"
                        " --predBed otherwise.", default=None)
    parser.add_argument("--hmCovRow", help="Path to write 1-row heatmap of "
                        "state coverage (fraction of bases). only works with --hm",
                        default=None)


    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # normalize the comma-separated list options into (possibly empty) sets
    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")

    assert args.col == 4 or args.col == 5

    # optionally cut mask-track intervals out of both inputs before comparing
    tempFiles = []
    if args.tl is not None:
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint, args.tl)

        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    # inputs must be sorted, self-overlap free, and cover the same region
    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol = args.col)
    intervals2 = readBedIntervals(args.predBed, ncol = args.col)
    cfName = "reverse"

    # --old flips the comparison direction (forward confusion matrix)
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1, args.col -1,
                                            args.intThresh, False,
                                           not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]

    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt, args.ignoreTgt,
                                             args.ignore, args.qualThresh,
                                             args.fdr)

    # filter the stateMap to take into account the command-line options
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))

    # write the model if specified
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)

    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col-1, args.noMerge,
                   args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap.  Maybe matplotlib is "
                               "not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    # clean up temp files from the --tl cutting step
    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """Command-line entry point: fill masked gaps of an HMM prediction BED
    with a state inferred from the intervals flanking each gap.

    Masked intervals (from the mask tracks in tracksXML) that fall within
    allBed and are no longer than --maxLen are assigned a state: the shared
    flanking state when both neighbors agree (subject to --tgts), a
    one-sided flanking state from --oneSidedTgts, or --default otherwise.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file.  State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps.  If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                         default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    # a state cannot be both a two-sided and a one-sided target
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    # intervals are (chrom, start, end, name) tuples (ncol=4)
    inputIntervals = readBedIntervals(inBed, ncol = 4, sort = True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        # nothing to fill: pass the input through unchanged
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0


    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort = True)
    resolvedMasks = 0

    # NOTE(review): this check appears unreachable (empty inputIntervals is
    # already handled above with return 0) and its message mentions mask
    # tracks rather than intervals
    if len(inputIntervals) == 0:
        logger.warning("No mask tracks located in %s" % args.tracksXML)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    # sweep over sorted mask intervals and sorted input intervals in lockstep
    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank (tuple comparison gives genomic order)
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (maskInterval[0], maskInterval[1],
                                                    maskInterval[2], maskState))

    # merge the filled mask intervals with the original input, sort, and
    # clip the result to the scope defined by allBed
    tempOutMaskFile.close()
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2, tempScopePath,
                                                       args.outBed))

    # remove all temp files
    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask, tempMergePath1,
                                      tempMergePath2, tempScopePath]))
    cleanBedTool(tempBedToolPath)
Esempio n. 18
0
def parallelDispatch(argv, args):
    """Split the evaluation up by chromosome and run the pieces in parallel.

    For every interval in args.chroms the scope bed (args.allBed) is
    intersected down to that chromosome, the original command line is cloned
    with its --chrom option blanked out and a per-chunk --co offset patched
    in, and the resulting jobs are launched concurrently.  The per-chunk
    segment (and optional stats) outputs are then concatenated, in order,
    into args.outBed / args.stats, and the temporary files are removed.
    """
    jobList = []
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    offset = args.co
    for chromInterval in readBedIntervals(args.chroms, sort=True):
        toks = copy.deepcopy(argv)
        # blank out the --chrom flag and its value in the cloned command line
        toks[toks.index("--chrom") + 1] = ""
        toks[toks.index("--chrom")] = ""

        chromPath = getLocalTempPath("TempChromPath", ".bed")
        with open(chromPath, "w") as cpFile:
            cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chromInterval[0],
                                                    chromInterval[1],
                                                    chromInterval[2]))

        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.allBed,
                                                                     chromPath,
                                                                     regionPath))

        # skip chromosomes whose intersection with the scope is empty
        if os.path.getsize(regionPath) < 2:
            continue

        offset += int(chromInterval[2]) - int(chromInterval[1])

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        # positional arguments 2 and 3 of the cloned command are the
        # query region and the segment output path
        toks[2] = regionPath

        segPath = getLocalTempPath("Temp", ".bed")
        toks[3] = segPath
        segFiles.append(segPath)

        if "--co" in toks:
            toks[toks.index("--co") + 1] = str(offset)
        else:
            toks += ["--co", str(offset)]

        if args.stats is not None:
            statsPath = getLocalTempPath("Temp", ".bed")
            toks[toks.index("--stats") + 1] = statsPath
            statsFiles.append(statsPath)
        jobList.append(" ".join(toks))

    runParallelShellCommands(jobList, args.proc)

    # merge chunk outputs in chromosome order; first chunk truncates the file
    for i in xrange(len(jobList)):
        ct = ">" if i == 0 else ">>"
        runShellCommand("cat %s %s %s" % (segFiles[i], ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    for tempPath in itertools.chain(chromFiles, regionFiles, segFiles,
                                    statsFiles):
        runShellCommand("rm %s" % tempPath)
Esempio n. 19
0
def main(argv=None):
    """Build a CSV table of number-of-HMM-states (or reps / iterations) vs BIC.

    Thin wrapper of teHmmTrain.py and teHmmEval.py: for every training bed,
    every value in the states list and every replicate, a model is trained
    and evaluated, and the resulting BIC scores are collected into
    <outDir>/bictable.csv.  Lower BIC is better.

    Fixes relative to the previous revision:
    - "del trianOpts[tpIdx]" typo raised a NameError whenever
      --initTransProbs appeared in the train options
    - "statesColName = ..." typo left the table header as "states" in
      --numIter mode
    - bare "except:" narrowed to "except Exception:" so it no longer
      swallows KeyboardInterrupt/SystemExit
    - stray debug print statements downgraded to logger.debug
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument(
        "trainingBeds",
        help="comma-separated list of training regions"
        " (training region size will be a variable in output table). "
        "if segmentation is activated, these must also be the "
        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states",
                        help="comma separated-list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps",
                        help="number of replicates",
                        type=int,
                        default=1)
    parser.add_argument("--proc",
                        help="maximum number of processors to use"
                        " in parallel",
                        type=int,
                        default=1)
    parser.add_argument("--resume",
                        help="try not to rewrite existing files",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--initTrans",
        help="the states argument is overridden"
        " to specify a list of transition initialization files "
        "instead of state numbers",
        action="store_true",
        default=False)
    parser.add_argument("--numReps",
                        help="the states argument is overridden"
                        " to specifiy a list of replicate numbers (--reps)"
                        " arguments",
                        action="store_true",
                        default=False)
    parser.add_argument("--numIter",
                        help="the states argument is overridden"
                        " to specifiy a list of iteration counts (--iter)"
                        " arugments",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # the three "states overriding" modes are mutually exclusive
    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds (sum of interval lengths per file)
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # strip options that this wrapper manages itself out of the pass-through
    # option strings (each flag is removed together with its value)
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        # fixed: this delete previously targeted misspelled "trianOpts"
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        # remember where the segment path goes so it can be patched per bed
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    logger.debug("initTrans tokens: %s", toks)
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        logger.debug("prevSize=%s trainingSize=%s sameSizeCount=%s",
                     prevSize, trainingSize, sameSizeCount)
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(
                    args.outDir, "hmm_%d.%d.%d.%d.mod" %
                    (trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[
                        states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                # with --resume, skip jobs whose output already looks complete
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training (divide the processor budget among multithreaded jobs;
    # // keeps integer semantics on both Python 2 and 3)
    runParallelShellCommands(trainCmds, max(1, args.proc // trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        # fixed: previously assigned to misspelled "statesColName"
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(
                    args.outDir, "hmm_%d.%d.%d.%d.mod" %
                    (trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    # the bic file's first token on its first line is the score
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except Exception:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" %
                            (trainingBed, int(trainingSize), int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" %
                                (np.mean(bics), np.min(bics), np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()

    cleanBedTool(tempBedToolPath)
Esempio n. 20
0
def main(argv=None):
    """Build a CSV table of number-of-HMM-states (or reps / iterations) vs BIC.

    Thin wrapper of teHmmTrain.py and teHmmEval.py: for every training bed,
    every value in the states list and every replicate, a model is trained
    and evaluated, and the resulting BIC scores are collected into
    <outDir>/bictable.csv.  Lower BIC is better.

    Fixes relative to the previous revision:
    - "del trianOpts[tpIdx]" typo raised a NameError whenever
      --initTransProbs appeared in the train options
    - "statesColName = ..." typo left the table header as "states" in
      --numIter mode
    - bare "except:" narrowed to "except Exception:" so it no longer
      swallows KeyboardInterrupt/SystemExit
    - stray debug print statements downgraded to logger.debug
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument("trainingBeds", help="comma-separated list of training regions"
                        " (training region size will be a variable in output table). "
                        "if segmentation is activated, these must also be the "
                        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma separated-list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type = int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type = int, default = 1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is overridden"
                        " to specify a list of transition initialization files "
                        "instead of state numbers", action="store_true",
                        default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specifiy a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specifiy a list of iteration counts (--iter)"
                        " arugments", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # the three "states overriding" modes are mutually exclusive
    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds (sum of interval lengths per file)
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # strip options that this wrapper manages itself out of the pass-through
    # option strings (each flag is removed together with its value)
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        # fixed: this delete previously targeted misspelled "trianOpts"
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        # remember where the segment path goes so it can be patched per bed
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    logger.debug("initTrans tokens: %s", toks)
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        logger.debug("prevSize=%s trainingSize=%s sameSizeCount=%s",
                     prevSize, trainingSize, sameSizeCount)
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                # with --resume, skip jobs whose output already looks complete
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training (divide the processor budget among multithreaded jobs;
    # // keeps integer semantics on both Python 2 and 3)
    runParallelShellCommands(trainCmds, max(1, args.proc // trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        # fixed: previously assigned to misspelled "statesColName"
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" % stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize,trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    # the bic file's first token on its first line is the score
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except Exception:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize), int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()

    cleanBedTool(tempBedToolPath)
Esempio n. 21
0
    def testSupervisedLearn(self):
        """Supervised-train an HMM on a small truth BED and verify that the
        learned emission, start, and transition probabilities match the
        frequencies hand-computed from the fixture, and that Viterbi decoding
        reproduces the truth labels exactly."""
        # load the truth intervals (chrom, start, end, state) with the state
        # column cast to int
        intervals = readBedIntervals(getTestDirPath("truth.bed"), ncol=4)
        truthIntervals = []
        for i in intervals:
            truthIntervals.append((i[0], i[1], i[2], int(i[3])))

        # one query interval spanning from the start of the first truth
        # interval to the end of the last (assumes single-chrom fixture —
        # holds for truth.bed)
        allIntervals = [(truthIntervals[0][0], truthIntervals[0][1],
                         truthIntervals[-1][2])]
        trackData = TrackData()
        trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
        assert len(trackData.getTrackTableList()) == 1
        # set the fudge to 1 since when the test was written this was
        # hardcoded default
        em = IndependentMultinomialEmissionModel(
            4, trackData.getNumSymbolsPerTrack(), fudge=1.0)
        hmm = MultitrackHmm(em)
        hmm.supervisedTrain(trackData, truthIntervals)
        hmm.validate()

        # check emissions, they should basically be binary.
        trackList = hmm.getTrackList()
        # emission model stores log probabilities; exponentiate to compare
        emp = np.exp(em.getLogProbs())
        ltrTrack = trackList.getTrackByName("ltr")
        track = ltrTrack.getNumber()
        cmap = ltrTrack.getValueMap()
        # s0 = symbol for "no annotation", s1 = symbol for value 0
        s0 = cmap.getMap(None)
        s1 = cmap.getMap(0)
        # we add 1 to all frequencies like emission trainer
        # (expected fractions below were hand-derived from the fixture data)
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.)
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 1. - 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 6. / 7.)

        # same check for the second ("inside") track
        insideTrack = trackList.getTrackByName("inside")
        track = insideTrack.getNumber()
        cmap = insideTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap("Inside")
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.)
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 1. - 6. / 7.)

        # crappy check for start probs.  need to test transition too!
        # expected start probs = fraction of total truth length in each state
        freq = [0.0] * em.getNumStates()
        total = 0.0
        for interval in truthIntervals:
            state = interval[3]
            freq[state] += float(interval[2]) - float(interval[1])
            total += float(interval[2]) - float(interval[1])

        sprobs = hmm.getStartProbs()
        assert len(sprobs) == em.getNumStates()
        for state in xrange(em.getNumStates()):
            assert_array_almost_equal(freq[state] / total, sprobs[state])

        # transition probabilites
        # from eyeball:
        #c	0	5	0   0->0 +4   0->1 +1    0-> +5
        #c	5	10	1   1->1 +4   1->2 +1    1-> +5
        #c	10	35	2   2->2 +24  2->3 +1    2-> +25
        #c	35	40	3   3->3 +4   3->0 +1    3-> +5
        #c	40	70	0   0->0 +29             0-> +19
        realTransProbs = np.array([[33. / 34., 1. / 34., 0., 0.],
                                   [0., 4. / 5., 1. / 5., 0.],
                                   [0., 0., 24. / 25., 1. / 25.],
                                   [1. / 5., 0., 0., 4. / 5.]])

        tprobs = hmm.getTransitionProbs()
        assert tprobs.shape == (em.getNumStates(), em.getNumStates())
        assert_array_almost_equal(tprobs, realTransProbs)
        # Viterbi decoding of the training data must reproduce the truth
        prob, states = hmm.viterbi(trackData)[0]
        for truthInt in truthIntervals:
            for i in xrange(truthInt[1], truthInt[2]):
                assert states[i] == truthInt[3]
Esempio n. 22
0
def main(argv=None):
    """Compute length and/or score statistics over a BED file and write the
    result as a single CSV table (or, in --nearness mode, a BED)."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make some tables of statistics from a BED file.  All"
        " output will be written in one big CSV table to be viewed in a "
        "spreadsheet.")

    parser.add_argument("inBed", help="Input bed file")
    parser.add_argument("outCsv", help="Path to write output in CSV format")
    parser.add_argument("--ignore", default="",
                        help="Comma-separated list of names to ignore")
    parser.add_argument("--numBins", type=int, default=10,
                        help="Number of (linear) bins for histograms")
    parser.add_argument("--logHist", action="store_true", default=False,
                        help="Apply log-transform to data for histogram")
    parser.add_argument("--histRange", default=None,
                        help="Histogram range as comma-separated pair of numbers")
    parser.add_argument("--noHist", action="store_true", default=False,
                        help="skip hisograms")
    parser.add_argument("--noScore", action="store_true", default=False,
                        help="Just do length stats")
    parser.add_argument("--noLen", action="store_true", default=False,
                        help="Just do score stats")
    parser.add_argument("--nearness", default=None,
                        help="Compute nearness stats (instead "
                        "of normal stats) of input bed with given BED.  Output"
                        " will be a BED instead of CSV, with nearness in the "
                        "score position")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # turn "lo,hi" into an (int, int) pair
    if args.histRange is not None:
        rangeToks = args.histRange.split(",")
        assert len(rangeToks) == 2
        args.histRange = int(rangeToks[0]), int(rangeToks[1])

    outFile = open(args.outCsv, "w")
    args.ignoreSet = set(args.ignore.split(","))

    # nearness stats require sorted input; otherwise order doesn't matter
    bedIntervals = readBedIntervals(args.inBed, ncol=5,
                                    sort=args.nearness is not None)

    statsText = ""
    if args.nearness is not None:
        # nearness mode replaces normal stats and implies no score stats
        args.noScore = True
        statsText = makeNearnessBED(bedIntervals, args)
    elif not args.noLen:
        # length stats
        statsText = makeCSV(bedIntervals, args,
                            lambda x: int(x[2]) - int(x[1]), "Length")
    # score stats (best-effort: the score column may be absent or non-numeric)
    try:
        if not args.noScore:
            statsText += "\n" + makeCSV(bedIntervals, args,
                                        lambda x: float(x[4]), "Score")
            statsText += "\n" + makeCSV(
                bedIntervals, args,
                lambda x: float(x[4]) * (float(x[2]) - float(x[1])),
                "Score*Length")
    except Exception as e:
        logger.warning("Couldn't make score stats because %s" % str(e))
    outFile.write(statsText)
    outFile.write("\n")
    outFile.close()
    cleanBedTool(tempBedToolPath)
Esempio n. 23
0
def main(argv=None):
    """Command-line entry point: dump track data as an ASCII table.

    Reads query intervals from a BED file, loads the annotation tracks
    listed in the tracks XML for those intervals, and writes one row per
    position (or per segment with --segment) to the output file.
    """
    cliParser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Write track data into ASCII dump.  Row i corresponds"
        " to the ith position found when scanning query BED IN SORTED ORDER."
        "Column j corresponds to the jth track in the XML file. --map option"
        " used to write internal integer format used by HMM.  Unobserved values"
        " written as \"None\" if default attribute not specified or track not"
        " binary.  Rounding can occur if scaling parameters present.\n\n"
        "IMPORTANT: values stored in 8bit integers internally.  Any track with"
        " more than 256 different values will get clamped (with a warning)")

    # Positional arguments.
    cliParser.add_argument("tracks", help="Path of Tracks Info file "
                           "containing paths to genome annotation tracks")
    cliParser.add_argument("query", help="BED region(s) to dump. SCANNED IN"
                           " SORTED ORDER")
    cliParser.add_argument("output", help="Path of file to write output to")
    # Optional flags.
    cliParser.add_argument("--map", action="store_true", default=False,
                           help="Apply name mapping, including"
                           " transformation specified in scale, logScale"
                           ", etc. attributes, that HMM uses internally"
                           ". Important to note that resulting integers"
                           " are just unique IDs.  ID_1 > ID_2 does not"
                           " mean anything")
    cliParser.add_argument("--segment", action="store_true", default=False,
                           help="Treat each interval in query"
                           " as a single segment (ie with only one data point)"
                           ".  In this case, query should probably have been"
                           " generated with segmentTracks.py")
    cliParser.add_argument("--noPos", action="store_true", default=False,
                           help="Do not print genomic position"
                           " (first 2 columnts)")
    cliParser.add_argument("--noMask", action="store_true", default=False,
                           help="Ignore mask tracks")

    addLoggingOptions(cliParser)
    opts = cliParser.parse_args()
    setLoggingFromOptions(opts)

    # Open the output file up front so an unwritable path fails early.
    dumpFile = open(opts.output, "w")

    # need to remember to fix this, disable as precaution for now
    assert not opts.segment or opts.noMask

    # Load the merged query intervals; an empty query is an error.
    logger.info("loading query intervals from %s" % opts.query)
    queryIntervals = getMergedBedIntervals(opts.query, ncol=3)
    if not queryIntervals:
        raise RuntimeError("Could not read any intervals from %s" %
                           opts.query)

    # With --segment, re-read the same BED to obtain per-segment intervals.
    segmentIntervals = None
    if opts.segment:
        logger.info("loading segment intervals from %s" % opts.query)
        segmentIntervals = readBedIntervals(opts.query, sort=True)

    # Pull annotation data for the query region out of the track XML.
    logger.info("loading tracks %s" % opts.tracks)
    data = TrackData()
    data.loadTrackData(opts.tracks, queryIntervals,
                       segmentIntervals=segmentIntervals,
                       applyMasking=not opts.noMask)

    # Write the ASCII table and close the output.
    dumpTrackData(data, dumpFile, opts.map, not opts.noPos)
    dumpFile.close()
Esempio n. 24
0
def runPositionalComparison(argv, args):
    """ hack to recursively exectute compareBedStates.py on a sliding window of the two
    inputs and report accuracy in a BED file """
    # --window is a 5-field comma-separated spec:
    #   windowSize,stateName,compType,score,outBed
    try:
        windowToks = args.window.split(",")
        assert len(windowToks) == 5
        windowSize = int(windowToks[0])
        stateName = windowToks[1]
        compType = windowToks[2]
        score = windowToks[3]
        outBed = windowToks[4]
    except:
        raise RuntimeError("value passed to --window is not in valid format")
    # compType selects which stats table extractCompStatsFromFile returns
    if compType == "base":
        compIdx = 0
    elif compType == "interval":
        compIdx = 1
    elif compType == "weighted":
        compIdx = 2
    else:
        raise RuntimeError("invalid compType, %s, passed to --window" % compType)
    if score != "f1" and score != "precision" and score != "recall":
        raise RuntimeError("invalid score, %s, passed to --window" % score)
    try:
        outFile = open(outBed, "w")
    except:
        raise RuntimeError("invalid outBed, %s, passed to --window" % outBed)

    # merge bed1 into contiguous regions, then chop them into
    # half-overlapping windows of windowSize via chunkBedRegions.py
    tempBed = getLocalTempPath("Temp_region", ".bed")
    runShellCommand("mergeBed -i %s > %s" % (args.bed1, tempBed))
    chunkBed = getLocalTempPath("Temp_chunkBed", ".bed")
    runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" % (
        tempBed, windowSize, chunkBed))
    # temp files reused (overwritten) on every window iteration below
    window = getLocalTempPath("Temp_window", ".bed")
    slice1 = getLocalTempPath("Temp_slice1", ".bed")
    slice2 = getLocalTempPath("Temp_slice2", ".bed")
    compFile = getLocalTempPath("Temp_compFile", ".bed")
    # forward all original command-line options except --window itself
    compOpts = ""
    winIdx = argv.index("--window")
    assert winIdx > 0 and winIdx < len(argv) -1 and argv[winIdx + 1] == args.window
    for i in xrange(3, len(argv)):
        if i != winIdx and i != winIdx + 1:
            compOpts += " " + argv[i]
    
    for chunk in readBedIntervals(chunkBed):
        # write the current window as a one-line bed, slice both inputs
        # down to it, then run compareBedStates.py on the two slices
        runShellCommand("echo \"%s\t%d\t%d\" > %s" % (chunk[0], chunk[1], chunk[2],
                                                   window))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.bed1, window, slice1))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.bed2, window, slice2))
        runShellCommand("compareBedStates.py %s %s %s > %s" % (
            slice1, slice2, compOpts, compFile))
        # pull (precision, recall) for the state of interest; default to
        # (0, 0) when the state is absent from this window
        stats = extractCompStatsFromFile(compFile)[compIdx]
        if stateName not in stats:
            stats[stateName] = (0,0)
        f1 = 0.
        prec, rec = stats[stateName]
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)
        val = f1
        if score == "precision":
            val = prec
        elif score == "recall":
            val = rec
        # one output bed line per window, chosen score in column 4
        outFile.write("%s\t%d\t%d\t%f\n" % (chunk[0], chunk[1], chunk[2], val))

    # remove all temp files in one shot
    runShellCommand("rm -f %s %s %s %s %s %s" % (tempBed, chunkBed, window,
                                                 slice1, slice2, compFile))
    outFile.close()
Esempio n. 25
0
def main(argv=None):
    """Compute summary statistics for a BED file and write them as CSV.

    Produces length stats, score stats (column 5), and score*length stats,
    or — with --nearness — a BED with nearness values in the score column.

    Fixes over the previous version:
    - --histRange validation raises RuntimeError instead of using assert
      (asserts are stripped under python -O, silently skipping the check).
    - The output file is managed by a 'with' block so the handle cannot
      leak if a later step raises, and the file is only created once the
      input has been read successfully.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make some tables of statistics from a BED file.  All"
        " output will be written in one big CSV table to be viewed in a "
        "spreadsheet.")

    parser.add_argument("inBed", help="Input bed file")
    parser.add_argument("outCsv", help="Path to write output in CSV format")
    parser.add_argument("--ignore", help="Comma-separated list of names"
                        " to ignore", default="")
    parser.add_argument("--numBins", help="Number of (linear) bins for "
                        "histograms", type=int, default=10)
    parser.add_argument("--logHist", help="Apply log-transform to data for "
                        "histogram", action="store_true", default=False)
    parser.add_argument("--histRange", help="Histogram range as comma-"
                        "separated pair of numbers", default=None)
    parser.add_argument("--noHist", help="skip hisograms", action="store_true",
                        default=False)
    parser.add_argument("--noScore", help="Just do length stats",
                        action="store_true", default=False)
    parser.add_argument("--noLen", help="Just do score stats",
                        action="store_true", default=False)
    parser.add_argument("--nearness", help="Compute nearness stats (instead "
                        "of normal stats) of input bed with given BED.  Output"
                        " will be a BED instead of CSV, with nearness in the "
                        "score position", default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # validate --histRange with a real exception (assert would vanish
    # under python -O)
    if args.histRange is not None:
        args.histRange = args.histRange.split(",")
        if len(args.histRange) != 2:
            raise RuntimeError("--histRange must be a comma-separated pair "
                               "of numbers")
        args.histRange = int(args.histRange[0]), int(args.histRange[1])

    args.ignoreSet = set(args.ignore.split(","))

    # sorting is only required for the nearness computation
    intervals = readBedIntervals(args.inBed,
                                 ncol=5,
                                 sort=args.nearness is not None)

    csvStats = ""
    # nearness stats (implies --noScore)
    if args.nearness is not None:
        args.noScore = True
        csvStats = makeNearnessBED(intervals, args)

    # length stats
    elif args.noLen is False:
        csvStats = makeCSV(intervals, args, lambda x: int(x[2]) - int(x[1]),
                           "Length")
    # score stats: best-effort, since column 5 may be missing or
    # non-numeric in the input bed
    try:
        if args.noScore is False:
            csvStats += "\n" + makeCSV(intervals, args, lambda x: float(x[4]),
                                       "Score")
            csvStats += "\n" + makeCSV(
                intervals, args,
                lambda x: float(x[4]) * (float(x[2]) - float(x[1])),
                "Score*Length")
    except Exception as e:
        logger.warning("Couldn't make score stats because %s" % str(e))

    # 'with' guarantees the handle is closed even if writing fails
    with open(args.outCsv, "w") as outFile:
        outFile.write(csvStats)
        outFile.write("\n")
    cleanBedTool(tempBedToolPath)
Esempio n. 26
0
    def testHmmSupervisedLearn(self):
        """ Pretty much copied from the HMM unit test.  We try to recapitualte
        all results with a CFG with no nest states, which should be same as
        HMM"""
        # column 4 of truth.bed holds the integer state label per interval
        intervals = readBedIntervals(getTestDirPath("truth.bed"), ncol=4)
        truthIntervals = []
        for i in intervals:
            truthIntervals.append((i[0], i[1], i[2], int(i[3])))

        # one query interval spanning the whole truth region
        allIntervals = [(truthIntervals[0][0], truthIntervals[0][1],
                         truthIntervals[-1][2])]
        trackData = TrackData()
        trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
        assert len(trackData.getTrackTableList()) == 1
        # set the fudge to 1 since when the test was written this was
        # hardcoded default
        em = IndependentMultinomialEmissionModel(
            4, trackData.getNumSymbolsPerTrack(), fudge=1.0)
        hmm = MultitrackHmm(em)
        hmm.supervisedTrain(trackData, truthIntervals)
        hmm.validate()
        pairModel = PairEmissionModel(em, [1.0] * em.getNumStates())
        # Test validates with neststate just for fun
        cfg = MultitrackCfg(em, pairModel, nestStates=[1])

        cfg.supervisedTrain(trackData, truthIntervals)
        cfg.validate()

        # Then reload as an hmm-equivalent
        cfg = MultitrackCfg(em, pairModel, nestStates=[])

        cfg.supervisedTrain(trackData, truthIntervals)
        cfg.validate()

        # check emissions, they should basically be binary.
        # expected values below were derived by hand from truth.bed
        trackList = cfg.getTrackList()
        emp = np.exp(em.getLogProbs())
        ltrTrack = trackList.getTrackByName("ltr")
        track = ltrTrack.getNumber()
        cmap = ltrTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap(0)
        # we add 1 to all frequencies like emission trainer
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.)
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 1. - 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 6. / 7.)

        # same check for the "inside" track's emissions
        insideTrack = trackList.getTrackByName("inside")
        track = insideTrack.getNumber()
        cmap = insideTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap("Inside")
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.)
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 1. - 6. / 7.)

        # crappy check for start probs.  need to test transition too!
        # expected start prob of a state = fraction of total truth bases
        # covered by that state
        freq = [0.0] * em.getNumStates()
        total = 0.0
        for interval in truthIntervals:
            state = interval[3]
            freq[state] += float(interval[2]) - float(interval[1])
            total += float(interval[2]) - float(interval[1])

        sprobs = cfg.getStartProbs()
        assert len(sprobs) == em.getNumStates()
        for state in xrange(em.getNumStates()):
            assert_array_almost_equal(freq[state] / total, sprobs[state])

        # transition probabilites
        # from eyeball:
        #c	0	5	0   0->0 +4   0->1 +1    0-> +5
        #c	5	10	1   1->1 +4   1->2 +1    1-> +5
        #c	10	35	2   2->2 +24  2->3 +1    2-> +25
        #c	35	40	3   3->3 +4   3->0 +1    3-> +5
        #c	40	70	0   0->0 +29             0-> +19
        realTransProbs = np.array([[33. / 34., 1. / 34., 0., 0.],
                                   [0., 4. / 5., 1. / 5., 0.],
                                   [0., 0., 24. / 25., 1. / 25.],
                                   [1. / 5., 0., 0., 4. / 5.]])

        # the CFG stores a 3-d table; fold its i->j mass back into a 2-d
        # transition matrix for comparison against the expected values
        tprobs = np.exp(cfg.getLogProbTables()[0])
        assert tprobs.shape == (em.getNumStates(), em.getNumStates(),
                                em.getNumStates())
        for i in xrange(em.getNumStates()):
            for j in xrange(em.getNumStates()):
                fbTot = tprobs[i, i, j]
                if i != j:
                    fbTot += tprobs[i, j, i]
                assert_array_almost_equal(fbTot, realTransProbs[i, j])
        # finally decode and check the Viterbi path against the truth
        prob, states = cfg.decode(trackData.getTrackTableList()[0])
        for truthInt in truthIntervals:
            for i in xrange(truthInt[1], truthInt[2]):
                # gah, just realized that ltr track is binary, which means
                # ltr states can be either 1 or 3.  need to fix test properly
                # but just relax comparison for now.
                if truthInt[3] == 1 or truthInt[3] == 3:
                    assert states[i] == 1 or states[i] == 3
                else:
                    assert states[i] == truthInt[3]
Esempio n. 27
0
def main(argv=None):
    """Rename the states of a trained teHMM model.

    Names come either from an explicit list (--newNames) or are generated
    as TE-XX / Other-XX from a set of TE state numbers (--teNumbers),
    optionally ordered by coverage (--sizes).  Can also rewrite a bed file
    with the new names (--bed) or just rename one track
    (--changeTrackName).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Rename HMM states.")
    parser.add_argument("inputModel", help="Path of teHMM model created with"
                        " teHmmTrain.py")
    parser.add_argument("outputModel", help="Path of model with renamed states")
    parser.add_argument("--newNames", help="comma-separated list of state names to"
                        " apply.  This list must have exactly the same number of"
                        " states as the model.  The ith name in the list will be "
                        "assigned to the ith name of the model...", default=None)
    parser.add_argument("--teNumbers", help="comma-separated list of state numbers"
                        " that will be assigned TE states, with everything else"
                        " assigned Other.  This is less flexible but maybe more"
                        " convenient at times than --newNames.", default=None)
    parser.add_argument("--bed", help="apply naming to bed file and print "
                        "results to stdout", default=None)
    parser.add_argument("--sizes", help="bedFile to use for computing state numbering"
                        " by using decreasing order in total coverage (only works"
                        " with --teNumbers)", default=None)
    parser.add_argument("--noMerge", help="dont merge adjacent intervals with same"
                        " name with --bed option", action="store_true",default=False)
    parser.add_argument("--changeTrackName", help="dont do anything else, just change"
                        " the name of one track.  specified value should be of form"
                        " currentNAme, newName", default=None)

    args = parser.parse_args()
    assert args.inputModel != args.outputModel

    # load model created with teHmmTrain.py
    hmmModel = loadModel(args.inputModel)

    # --changeTrackName short-circuits everything else: rename one track,
    # save, and exit
    if args.changeTrackName is not None:
        curName, replacementName = args.changeTrackName.split(",")
        renamedTrack = hmmModel.getTrackList().getTrackByName(curName)
        renamedTrack.setName(replacementName)
        saveModel(args.outputModel, hmmModel)
        return 0

    # exactly one of the two naming schemes must be supplied
    assert (args.newNames is None) != (args.teNumbers is None)

    if args.newNames is not None:
        # names listed explicitly on the command line
        stateNames = args.newNames.split(",")
    elif args.teNumbers is not None:
        # derive TE-XX / Other-XX names from the set of "TE" state numbers
        # (as found from log output of fitStateNames.py)
        teStates = set(int(tok) for tok in args.teNumbers.split(","))
        numStates = hmmModel.getEmissionModel().getNumStates()

        if args.sizes is not None:
            # assign numbers in decreasing order of total coverage in the
            # --sizes bed file
            coverage = defaultdict(int)
            for iv in readBedIntervals(args.sizes, ncol=4):
                coverage[int(iv[3])] += iv[2] - iv[1]
            ordering = sorted(xrange(numStates),
                              key=lambda s: coverage[s], reverse=True)
        else:
            ordering = list(xrange(numStates))

        stateNames = [""] * numStates
        teIdx, otherIdx = 0, 0
        for stateNo in ordering:
            if stateNo in teStates:
                stateNames[stateNo] = "TE-%.2d" % teIdx
                teIdx += 1
            else:
                stateNames[stateNo] = "Other-%.2d" % otherIdx
                otherIdx += 1
        assert teIdx == len(teStates) and teIdx + otherIdx == len(stateNames)

    assert len(stateNames) == hmmModel.getEmissionModel().getNumStates()

    # install the names into a fresh category map on the model
    nameCatMap = CategoryMap(reserved=0)
    for stateName in stateNames:
        nameCatMap.getMap(stateName, update=True)
    hmmModel.stateNameMap = nameCatMap

    # save model
    saveModel(args.outputModel, hmmModel)

    # optionally rewrite a bed file with the new names, merging adjacent
    # same-named intervals unless --noMerge was given
    if args.bed is not None:
        pending = None
        for iv in readBedIntervals(args.bed, ncol=4):
            renamed = list(iv)
            renamed[3] = stateNames[int(iv[3])]
            if args.noMerge:
                print("\t".join(str(col) for col in renamed))
                continue
            if pending is None:
                # first interval: nothing buffered yet
                pending = renamed
            elif (renamed[3] == pending[3] and
                  renamed[0] == pending[0] and
                  renamed[1] == pending[2]):
                # same name, same chrom, abutting: extend the buffer
                pending[2] = renamed[2]
            else:
                # flush the buffer and start a new one
                print("\t".join(str(col) for col in pending))
                pending = renamed
        if pending is not None:
            print("\t".join(str(col) for col in pending))
Esempio n. 28
0
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursivlely launch eval. merge
    results """
    jobList = []
    # one child job per chromosome interval in the --chroms bed
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    # running coordinate offset forwarded to children via --co
    offset = args.co
    for chrom in chromIntervals:
        # copy our own command line, blanking --chrom so the child does
        # not recurse back into this dispatcher
        cmdToks = copy.deepcopy(argv)
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""

        # one-line bed covering this chromosome interval
        chromPath = getLocalTempPath("TempChromPath", ".bed")
        cpFile = open(chromPath, "w")
        cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2]))
        cpFile.close()

        # restrict the input regions to this chromosome
        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.allBed, chromPath, regionPath))

        # skip chromosomes whose intersection is (essentially) empty
        if os.path.getsize(regionPath) < 2:
            continue

        # NOTE(review): offset is advanced *before* this chrom's job is
        # built, so the child receives the offset including this
        # chromosome's length -- confirm this is the intended --co
        # convention
        offset += int(chrom[2]) - int(chrom[1])

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        # argv[2] is the child's positional region-bed argument
        cmdToks[2] = regionPath

        # argv[3] is the child's output bed; use a temp file per chrom so
        # results can be concatenated afterwards
        segPath = getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)

        # pass the current offset down via --co (replace or append)
        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co") + 1] = str(offset)
        else:
            cmdToks.append("--co")
            cmdToks.append(str(offset))

        # likewise redirect the child's --stats output to a temp file
        if args.stats is not None:
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats") + 1] = statsPath
            statsFiles.append(statsPath)
        cmd = " ".join(cmdToks)
        jobList.append(cmd)

    # run all per-chromosome jobs, args.proc at a time
    runParallelShellCommands(jobList, args.proc)

    # concatenate per-chromosome outputs back into the user's paths:
    # first file truncates (>), the rest append (>>)
    for i in xrange(len(jobList)):
        if i == 0:
            ct = ">"
        else:
            ct = ">>"
        runShellCommand("cat %s %s %s" % (segFiles[i], ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    # remove all temp files
    for i in itertools.chain(chromFiles, regionFiles, segFiles, statsFiles):
        runShellCommand("rm %s" % i)
Esempio n. 29
0
def main(argv=None):
    """Build simple starting transition/emission files for teHmmTrain.py.

    Reads one named track over the query regions, collects its state
    names, adds the requested number of "outside" states, and writes an
    init transition file and an init emission file.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        "from a candidate BED annotation, which can"
        " be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively.  The distributions created here"
        " are extremely simple, but this can be a good shortcut to at least "
        "getting the state names into the init files, which can be further "
        "tweeked by hand.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trackName", help="Name of Track to use as initial"
                        " annotation")
    parser.add_argument("queryBed", help="Bed file with regions to query")
    parser.add_argument("outTransProbs", help="File to write transition model"
                        " to")
    parser.add_argument("outEmProbs", help="File to write emission model to")
    parser.add_argument("--numOut", help="Number of \"outside\" states to add"
                        " to the model.", default=1, type=int)
    parser.add_argument("--numTot", help="Add x \"outside\" states such "
                        "that total states is this. (overrieds --numOut)",
                        default=0, type=int)
    parser.add_argument("--outName", help="Name of outside states (will have"
                        " numeric suffix if more than 1)", default="Outside")
    parser.add_argument("--mode", help="Strategy for initializing the "
                        "transition graph: {\'star\': all states are connected"
                        " to the oustide state(s) but not each other; "
                        " \'data\': transitions estimated from input bed; "
                        " \'full\': dont write edges and let teHmmTrain.py "
                        "initialize as a clique}", default="star")
    parser.add_argument("--selfTran", help="This script will always write all"
                        " the self-transition probabilities to the output file. "
                        "They will all be set to the specified value using this"
                        " option, or estimated from the data if -1", default=-1.,
                        type=float)
    parser.add_argument("--em", help="Emission probability for input track ("
                        "ie probability that state emits itself)",
                        type=float, default=0.95)
    parser.add_argument("--outEmNone", help="Add None emission probabilities"
                        " for target track for Outside states",
                        action="store_true", default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()

    # validate option combinations up front
    if args.mode == "star" and args.numOut < 1:
        raise RuntimeError("--numOut must be at least 1 if --mode star is used")
    if args.mode not in ("star", "data", "full"):
        raise RuntimeError("--mode must be one of {star, data, full}")
    if args.mode == "data":
        raise RuntimeError("--data not implemented yet")
    assert os.path.isfile(args.tracksInfo)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # locate the requested track and sanity-check its settings
    trackList = TrackList(args.tracksInfo)
    targetTrack = trackList.getTrackByName(args.trackName)
    if targetTrack is None:
        raise RuntimeError("Track %s not found in tracksInfo" % args.trackName)
    trackPath = targetTrack.getPath()
    if targetTrack.getDist() not in ("multinomial", "gaussian"):
        raise RuntimeError("Track %s does not have multinomial or "
                           "gaussian distribution" % args.trackName)
    if targetTrack.getScale() is not None or \
       targetTrack.getLogScale() is not None:
        raise RuntimeError("Track %s must not have scale" % args.trackName)

    # read query intervals from the bed file
    logger.info("loading query intervals from %s" % args.queryBed)
    queryRegions = getMergedBedIntervals(args.queryBed, ncol=4)
    if not queryRegions:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.queryBed)

    # read the track bed, restricted to each merged query region in turn
    # (track is saved as temp XML file for sake not changing interface)
    valCol = targetTrack.getValCol()
    bedIntervals = []
    for region in queryRegions:
        bedIntervals.extend(readBedIntervals(trackPath,
                                             ncol=valCol + 1,
                                             chrom=region[0],
                                             start=region[1],
                                             end=region[2]))

    # first pass: collect the set of state names present in the track
    nameMap = CategoryMap(reserved=0)
    for interval in bedIntervals:
        nameMap.update(interval[valCol])

    # build the requested number of "outside" state names, suffixed with
    # an index when there is more than one
    outNameMap = CategoryMap(reserved=0)
    if args.numTot > 0:
        args.numOut = max(0, args.numTot - len(nameMap))
    for outIdx in xrange(args.numOut):
        outState = args.outName
        if args.numOut > 1:
            outState += str(outIdx)
        assert nameMap.has(outState) is False
        outNameMap.update(outState)

    # write the transition model for use with teHmmTrain.py --initTransProbs
    writeTransitions(bedIntervals, nameMap, outNameMap, args)

    # write the emission model for use with teHmmTrain.py --initEmProbs
    writeEmissions(bedIntervals, nameMap, outNameMap, args)

    cleanBedTool(tempBedToolPath)
Esempio n. 30
0
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursivlely launch eval. merge
    results """
    jobList = []
    # one child job per chromosome interval in the --chroms bed
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    bedFiles = []
    pdFiles = []
    bicFiles = []
    edFiles = []
    for chrom in chromIntervals:
        # copy our own command line, blanking --chrom so the child does
        # not recurse back into this dispatcher
        cmdToks = copy.deepcopy(argv)
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""
        
        # one-line bed covering this chromosome interval
        chromPath = getLocalTempPath("Temp", ".bed")
        cpFile = open(chromPath, "w")
        cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2]))
        cpFile.close()
        
        # restrict the input regions to this chromosome
        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.bedRegions,
                                                                     chromPath,
                                                                     regionPath))

        # skip chromosomes whose intersection is (essentially) empty
        if os.path.getsize(regionPath) < 2:
            continue
        
        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        # argv[3] is the child's positional region-bed argument
        cmdToks[3] = regionPath

        # redirect each optional output (--bed/--pd/--ed/--bic) to its own
        # per-chromosome temp file so results can be concatenated afterwards
        if args.bed is not None:
            bedPath =  getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--bed")+1] = bedPath
            bedFiles.append(bedPath)
        if args.pd is not None:
            pdPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--pd")+1] = pdPath
            pdFiles.append(pdPath)
        if args.ed is not None:
            edPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--ed")+1] = edPath
            edFiles.append(edPath)
        if args.bic is not None:
            bicPath = getLocalTempPath("Temp", ".bic")
            cmdToks[cmdToks.index("--bic")+1] = bicPath
            bicFiles.append(bicPath)
        cmd = " ".join(cmdToks)
        jobList.append(cmd)

    # run all per-chromosome jobs, args.proc at a time
    runParallelShellCommands(jobList, args.proc)

    # concatenate per-chromosome outputs back into the user's paths:
    # first file truncates (>), the rest append (>>)
    for i in xrange(len(jobList)):
        if i == 0:
            ct = ">"
        else:
            ct = ">>"
        if len(bedFiles) > 0:
            runShellCommand("cat %s %s %s" % (bedFiles[i], ct, args.bed))
        if len(pdFiles) > 0:
            runShellCommand("cat %s %s %s" % (pdFiles[i], ct, args.pd))
        if len(edFiles) > 0:
            runShellCommand("cat %s %s %s" % (edFiles[i], ct, args.ed))
        if len(bicFiles) > 0:
            runShellCommand("cat %s %s %s" % (bicFiles[i], ct, args.bic))

    # remove all temp files
    for i in itertools.chain(chromFiles, regionFiles, bedFiles, pdFiles, edFiles,
                             bicFiles):
        runShellCommand("rm %s" % i)
Esempio n. 31
0
def runPositionalComparison(argv, args):
    """ Hack to recursively execute compareBedStates.py on a sliding window of
    the two input BED files and report positional accuracy in a BED file.

    argv -- original command line (reused to rebuild the recursive calls)
    args -- parsed arguments; args.window must be a comma-delimited 5-tuple:
            windowSize,stateName,compType,score,outBed
    """
    # parse and validate the --window 5-tuple.  catch only the exceptions
    # that malformed input can actually raise (a bare except here would also
    # swallow KeyboardInterrupt/SystemExit and mask genuine bugs)
    try:
        windowToks = args.window.split(",")
        assert len(windowToks) == 5
        windowSize = int(windowToks[0])
        stateName = windowToks[1]
        compType = windowToks[2]
        score = windowToks[3]
        outBed = windowToks[4]
    except (AssertionError, ValueError, IndexError):
        raise RuntimeError("value passed to --window is not in valid format")
    # map comparison type to its index in extractCompStatsFromFile()'s output
    if compType == "base":
        compIdx = 0
    elif compType == "interval":
        compIdx = 1
    elif compType == "weighted":
        compIdx = 2
    else:
        raise RuntimeError("invalid compType, %s, passed to --window" %
                           compType)
    if score != "f1" and score != "precision" and score != "recall":
        raise RuntimeError("invalid score, %s, passed to --window" % score)
    try:
        outFile = open(outBed, "w")
    except IOError:
        raise RuntimeError("invalid outBed, %s, passed to --window" % outBed)

    # chunk the merged coordinates of bed1 into half-overlapping windows
    tempBed = getLocalTempPath("Temp_region", ".bed")
    runShellCommand("mergeBed -i %s > %s" % (args.bed1, tempBed))
    chunkBed = getLocalTempPath("Temp_chunkBed", ".bed")
    runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" %
                    (tempBed, windowSize, chunkBed))
    window = getLocalTempPath("Temp_window", ".bed")
    slice1 = getLocalTempPath("Temp_slice1", ".bed")
    slice2 = getLocalTempPath("Temp_slice2", ".bed")
    compFile = getLocalTempPath("Temp_compFile", ".bed")

    # rebuild the original command line minus the --window option so the
    # recursive compareBedStates.py invocations do not recurse again
    compOpts = ""
    winIdx = argv.index("--window")
    assert winIdx > 0 and winIdx < len(argv) - 1 and argv[winIdx +
                                                          1] == args.window
    for i in xrange(3, len(argv)):
        if i != winIdx and i != winIdx + 1:
            compOpts += " " + argv[i]

    try:
        for chunk in readBedIntervals(chunkBed):
            # slice both inputs down to the current window and compare them
            runShellCommand("echo \"%s\t%d\t%d\" > %s" %
                            (chunk[0], chunk[1], chunk[2], window))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                            (args.bed1, window, slice1))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                            (args.bed2, window, slice2))
            runShellCommand("compareBedStates.py %s %s %s > %s" %
                            (slice1, slice2, compOpts, compFile))
            stats = extractCompStatsFromFile(compFile)[compIdx]
            if stateName not in stats:
                stats[stateName] = (0, 0)
            # pick f1 / precision / recall for the target state
            f1 = 0.
            prec, rec = stats[stateName]
            if prec + rec > 0:
                f1 = (2. * prec * rec) / (prec + rec)
            val = f1
            if score == "precision":
                val = prec
            elif score == "recall":
                val = rec
            outFile.write("%s\t%d\t%d\t%f\n" % (chunk[0], chunk[1], chunk[2], val))
    finally:
        # remove temp files and close the output even when a shell command
        # or stats parse fails part-way through
        runShellCommand("rm -f %s %s %s %s %s %s" %
                        (tempBed, chunkBed, window, slice1, slice2, compFile))
        outFile.close()
# Esempio n. 32 (0)  -- example separator left by the code-scrape; commented out so the file parses
def main(argv=None):
    """ Entry point: find candidate TSDs (exact forward matches) flanking the
    BED intervals of inBed within fastaSequence, and write them to outBed.
    Returns 0 on success. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Find candidate TSDs (exact forward matches) flanking given"
        "BED intervals.  Score is distance between TSD and bed interval.")
    parser.add_argument("fastaSequence", help="DNA sequence in FASTA format")
    parser.add_argument("inBed", help="BED file with TEs whose flanking regions "
                        "we wish to search")
    parser.add_argument("outBed", help="BED file containing (only) output TSDs")
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=4, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=6, type=int)
    parser.add_argument("--all", help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour", action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                        "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=7, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=7, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap", default=3, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output Bed",
                        default="L_TSD")
    parser.add_argument("--rightName", help="Name of right TSDs in output Bed",
                        default="R_TSD")
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique"
                        " matching ID", action="store_true", default=False)
    parser.add_argument("--names", help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed", default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file", type=int, default=1)
    parser.add_argument("--sequences", help="Only process given sequences of input"
                        " FASTA file (comma-separated list).",  default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # validate inputs explicitly: asserts are stripped under python -O and
    # give no message to the user
    if not os.path.exists(args.inBed):
        raise RuntimeError("input BED file %s not found" % args.inBed)
    if not os.path.exists(args.fastaSequence):
        raise RuntimeError("input FASTA file %s not found" % args.fastaSequence)
    if args.min > args.max:
        raise RuntimeError("--min cannot be greater than --max")
    # running counter used to assign --id values
    args.nextId = 0

    if args.sequences is not None:
        args.sequences = set(args.sequences.split(","))

    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.inBed)
    bedIntervals = readBedIntervals(args.inBed, ncol=4, sort=True)
    if bedIntervals is None or len(bedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.inBed)

    # fan out over FASTA sequences when multiple processes are requested
    if args.numProc > 1:
        runParallel(args, bedIntervals)
        return 0

    tsds = findTsds(args, bedIntervals)

    writeBedIntervals(tsds, args.outBed)
    # return 0 on the serial path too, so sys.exit(main()) reports success
    # consistently with the parallel path above
    return 0
# Esempio n. 33 (0)  -- example separator left by the code-scrape; commented out so the file parses
def main(argv=None):
    """Rename the states of a teHMM model, and optionally relabel a BED file
    annotated with the old state numbers (printed to stdout).

    Exactly one of --newNames / --teNumbers must be given, unless
    --changeTrackName is used (which short-circuits everything else).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Rename HMM states.")
    parser.add_argument("inputModel",
                        help="Path of teHMM model created with"
                        " teHmmTrain.py")
    parser.add_argument("outputModel",
                        help="Path of model with renamed states")
    parser.add_argument(
        "--newNames",
        help="comma-separated list of state names to"
        " apply.  This list must have exactly the same number of"
        " states as the model.  The ith name in the list will be "
        "assigned to the ith name of the model...",
        default=None)
    parser.add_argument(
        "--teNumbers",
        help="comma-separated list of state numbers"
        " that will be assigned TE states, with everything else"
        " assigned Other.  This is less flexible but maybe more"
        " convenient at times than --newNames.",
        default=None)
    parser.add_argument("--bed",
                        help="apply naming to bed file and print "
                        "results to stdout",
                        default=None)
    parser.add_argument(
        "--sizes",
        help="bedFile to use for computing state numbering"
        " by using decreasing order in total coverage (only works"
        " with --teNumbers)",
        default=None)
    parser.add_argument("--noMerge",
                        help="dont merge adjacent intervals with same"
                        " name with --bed option",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--changeTrackName",
        help="dont do anything else, just change"
        " the name of one track.  specified value should be of form"
        " currentNAme, newName",
        default=None)

    args = parser.parse_args()
    # writing over the input model in place is not supported
    assert args.inputModel != args.outputModel

    # load model created with teHmmTrain.py
    model = loadModel(args.inputModel)

    # trackChangeName logic hacked in completely separate from everything else
    if args.changeTrackName is not None:
        oldName, newName = args.changeTrackName.split(",")
        track = model.getTrackList().getTrackByName(oldName)
        track.setName(newName)
        saveModel(args.outputModel, model)
        return 0

    # exactly one of the two naming schemes must be specified
    assert (args.newNames is None) != (args.teNumbers is None)

    # names manually specified
    if args.newNames is not None:
        names = args.newNames.split(",")

    # names computed using simple scheme from set of "TE" state numbers (as found from
    # log output of fitStateNames.py)
    elif args.teNumbers is not None:
        teNos = set([int(x) for x in args.teNumbers.split(",")])
        teCount, otherCount = 0, 0
        numStates = model.getEmissionModel().getNumStates()

        # re-order from sizing info
        if args.sizes is not None:
            # sizeMap[state] = total bases covered by that state in --sizes
            # (4th BED column is assumed to hold the state number)
            bedIntervals = readBedIntervals(args.sizes, ncol=4)
            sizeMap = defaultdict(int)
            for interval in bedIntervals:
                sizeMap[int(interval[3])] += interval[2] - interval[1]
            # visit states in decreasing coverage so TE-00 / Other-00 are the
            # largest of their kind
            stateNumbers = sorted([x for x in xrange(numStates)],
                                  reverse=True,
                                  key=lambda x: sizeMap[x])
        else:
            stateNumbers = [x for x in xrange(numStates)]
        # assign TE-XX / Other-XX labels with independent counters
        names = [""] * numStates
        for i in stateNumbers:
            if i in teNos:
                name = "TE-%.2d" % teCount
                teCount += 1
            else:
                name = "Other-%.2d" % otherCount
                otherCount += 1
            names[i] = name
        assert teCount == len(teNos) and teCount + otherCount == len(names)

    assert len(names) == model.getEmissionModel().getNumStates()

    # throw names in the mapping object and stick into model
    catMap = CategoryMap(reserved=0)
    for i, name in enumerate(names):
        catMap.getMap(name, update=True)
    model.stateNameMap = catMap

    # save model
    saveModel(args.outputModel, model)

    # process optional bed file
    if args.bed is not None:
        # relabel each interval (old name is the numeric state id), merging
        # runs of adjacent same-named intervals unless --noMerge is given
        prevInterval = None
        bedIntervals = readBedIntervals(args.bed, ncol=4)
        for interval in bedIntervals:
            oldName = interval[3]
            newName = names[int(oldName)]
            newInterval = list(interval)
            newInterval[3] = newName
            if args.noMerge:
                # write interval
                print "\t".join(str(x) for x in newInterval)
            else:
                if prevInterval is None:
                    # update prev interval first time
                    prevInterval = newInterval
                elif newInterval[3] == prevInterval[3] and\
                         newInterval[0] == prevInterval[0] and\
                         newInterval[1] == prevInterval[2]:
                    # glue onto prev interval (same name, same chrom, and
                    # directly adjacent)
                    prevInterval[2] = newInterval[2]
                else:
                    # write and update prev
                    print "\t".join(str(x) for x in prevInterval)
                    prevInterval = newInterval
        # flush the last buffered interval
        if prevInterval is not None:
            print "\t".join(str(x) for x in prevInterval)
# Esempio n. 34 (0)  -- example separator left by the code-scrape; commented out so the file parses
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compare two bed files where Model states are represented"
        " in a column.  Used to determine sensitivity and specificity.  NOTE"
        " that both bed files must be sorted and cover the exact same regions"
        " of the same genome.")

    parser.add_argument("bed1", help="Bed file (TRUTH)")
    parser.add_argument("bed2",
                        help="Bed file covering same regions in same"
                        " order as bed1")
    parser.add_argument("--col",
                        help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4,
                        type=int)
    parser.add_argument("--thresh",
                        help="Threshold to consider interval from"
                        " bed1 covered by bed2.",
                        type=float,
                        default=0.8)
    parser.add_argument("--plot",
                        help="Path of file to write Precision/Recall"
                        " graphs to in PDF format",
                        default=None)
    parser.add_argument("--ignore",
                        help="Comma-separated list of stateNames to"
                        " ignore",
                        default=None)
    parser.add_argument(
        "--strictPrec",
        help="By default, precision is computed"
        " in a manner strictly symmetric to recall.  So calling"
        " compareBedStates.py A.bed B.bed would give the exact"
        " same output as compareBedStates.py B.bed A.bed except"
        " precision and recall values would be swapped.  With "
        " this option, a predicted element only counts toward"
        " precision if it overlaps with 80pct of the true"
        " element, as opposed to only needing 80pct of itself"
        " overlapping with the true element. ",
        action="store_true",
        default=False)
    parser.add_argument("--noBase",
                        help="Skip base-level stats (and only show"
                        " interval stats).  Runs faster",
                        action="store_true",
                        default=False)
    parser.add_argument("--noFrag",
                        help="Do not allow fragmented matches in"
                        "interval predictions.  ie if a single truth interval"
                        " is covered by a series of predicted intervals, only "
                        "the best match will be counted if this flag is used",
                        action="store_true",
                        default=False)
    parser.add_argument("--tl",
                        help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)",
                        default=None)
    parser.add_argument("--delMask",
                        help="Entirely remove intervals from "
                        "mask tracks that are > given length.  Probably "
                        "only want to set to non-zero value K if using"
                        " with a prediction that was processed with "
                        "interpolateMaskedRegions.py --max K",
                        type=int,
                        default=0)
    parser.add_argument(
        "--window",
        help="A comma-delimited 5-tuple of "
        "windowSize,stateName,compType,score,outBed.  "
        "Where windowSize  is the sliding window size "
        "(overlap .5), stateName is target stateName,"
        " compType is in {base,interval,weighted}, sore is"
        " in {f1,precision,recall} and "
        "outBed is the path of a bedFile to write positional"
        " accuracy to.  For example, --window 1000000,TE,base,f1"
        ",acc.bed will write base-level f1 for 1MB sliding windows"
        " to acc.bed.  These can be viewed on the browser by first"
        " converting to BigWig.",
        default=None)

    args = parser.parse_args()
    tempBedToolPath = initBedTool()

    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()

    assert args.col == 4 or args.col == 5
    print "Commandline %s" % " ".join(sys.argv)
    origArgs = copy.deepcopy(args)

    tempFiles = []
    if args.tl is not None:
        cutBed1 = cutOutMaskIntervals(args.bed1, args.delMask, sys.maxint,
                                      args.tl)
        cutBed2 = cutOutMaskIntervals(args.bed2, args.delMask, sys.maxint,
                                      args.tl)
        if cutBed1 is not None:
            assert cutBed2 is not None
            tempFiles += [cutBed1, cutBed2]
            args.bed1 = cutBed1
            args.bed2 = cutBed2

    checkExactOverlap(args.bed1, args.bed2)

    if args.window is not None:
        runPositionalComparison(argv, origArgs)

    intervals1 = readBedIntervals(args.bed1, ncol=args.col)
    intervals2 = readBedIntervals(args.bed2, ncol=args.col)

    if args.noBase is False:
        stats = compareBaseLevel(intervals1, intervals2, args.col - 1)[0]

        totalRight, totalWrong, accMap = summarizeBaseComparision(
            stats, args.ignore)
        print "Base counts [False Negatives, False Positives, True Positives]:"
        print stats
        totalBoth = totalRight + totalWrong
        accuracy = float(totalRight) / float(totalBoth)
        print "Accuaracy: %d / %d = %f" % (totalRight, totalBoth, accuracy)
        print "State-by-state (Precision, Recall):"
        print "Base-by-base Accuracy"
        print accMap

    trueStats = compareIntervalsOneSided(intervals1, intervals2, args.col - 1,
                                         args.thresh, False,
                                         not args.noFrag)[0]
    predStats = compareIntervalsOneSided(intervals2, intervals1, args.col - 1,
                                         args.thresh, args.strictPrec,
                                         not args.noFrag)[0]
    intAccMap = summarizeIntervalComparison(trueStats, predStats, False,
                                            args.ignore)
    intAccMapWeighted = summarizeIntervalComparison(trueStats, predStats, True,
                                                    args.ignore)
    print "\nInterval Accuracy"
    print intAccMap
    print ""

    print "\nWeighted Interval Accuracy"
    print intAccMapWeighted
    print ""

    # print some row data to be picked up by scrapeBenchmarkRow.py
    if args.noBase is False:
        header, row = summaryRow(accuracy, stats, accMap)
        print " ".join(header)
        print " ".join(row)

    # make graph
    if args.plot is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write plots.  Maybe matplotlib is "
                               "not installed?")
        writeAccPlots(accuracy, accMap, intAccMap, intAccMapWeighted,
                      args.thresh, args.plot)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
# Esempio n. 35 (0)  -- example separator left by the code-scrape; commented out so the file parses
    def testSupervisedLearn(self):
        """Supervised-train a MultitrackHmm from a small truth annotation and
        check the learned emission, start, and transition probabilities
        against hand-computed values; finally verify that viterbi decoding
        recovers the truth labels exactly."""
        # truth.bed 4th column holds the integer truth state for each interval
        intervals = readBedIntervals(getTestDirPath("truth.bed"), ncol=4)
        truthIntervals = []
        for i in intervals:
            truthIntervals.append((i[0], i[1], i[2], int(i[3])))

        # one region spanning all truth intervals (assumes they are on one
        # sequence and contiguous -- TODO confirm for new test data)
        allIntervals = [(truthIntervals[0][0],
                        truthIntervals[0][1],
                        truthIntervals[-1][2])]
        trackData = TrackData()
        trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
        assert len(trackData.getTrackTableList()) == 1
        # set the fudge to 1 since when the test was written this was
        # hardcoded default
        em = IndependentMultinomialEmissionModel(
            4, trackData.getNumSymbolsPerTrack(),
			  fudge = 1.0)
        hmm = MultitrackHmm(em)
        hmm.supervisedTrain(trackData, truthIntervals)
        hmm.validate()

        # check emissions, they should basically be binary. 
        trackList = hmm.getTrackList()
        emp = np.exp(em.getLogProbs())
        ltrTrack = trackList.getTrackByName("ltr")
        track = ltrTrack.getNumber()
        cmap = ltrTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap(0)
        # we add 1 to all frequencies like emission trainer
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.) 
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 1 - 6. / 7.) 
        assert_array_almost_equal(emp[track][1][s1], 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 26. / 27.) 
        assert_array_almost_equal(emp[track][2][s1], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 1. - 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 6. / 7.)

        # same check for the second ("inside") track
        insideTrack = trackList.getTrackByName("inside")
        track = insideTrack.getNumber()
        cmap = insideTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap("Inside")
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.) 
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 26. / 27.) 
        assert_array_almost_equal(emp[track][3][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 1. - 6. / 7.)

        
        # crappy check for start probs.  need to test transition too!
        # expected start prob of a state = fraction of total bases it covers
        freq = [0.0] * em.getNumStates()
        total = 0.0
        for interval in truthIntervals:
           state = interval[3]
           freq[state] += float(interval[2]) - float(interval[1])
           total += float(interval[2]) - float(interval[1])

        sprobs = hmm.getStartProbs()
        assert len(sprobs) == em.getNumStates()
        for state in xrange(em.getNumStates()):
            assert_array_almost_equal(freq[state] / total, sprobs[state])

        # transition probabilites
        # from eyeball:
        #c	0	5	0   0->0 +4   0->1 +1    0-> +5
        #c	5	10	1   1->1 +4   1->2 +1    1-> +5
        #c	10	35	2   2->2 +24  2->3 +1    2-> +25
        #c	35	40	3   3->3 +4   3->0 +1    3-> +5
        #c	40	70	0   0->0 +29             0-> +19
        realTransProbs = np.array([
            [33. / 34., 1. / 34., 0., 0.],
            [0., 4. / 5., 1. / 5., 0.],
            [0., 0., 24. / 25., 1. / 25.],
            [1. / 5., 0., 0., 4. / 5.]
            ])
            
        tprobs = hmm.getTransitionProbs()
        assert tprobs.shape == (em.getNumStates(), em.getNumStates())
        assert_array_almost_equal(tprobs, realTransProbs)
        # decoding the training data should reproduce the truth labels
        prob, states = hmm.viterbi(trackData)[0]
        for truthInt in truthIntervals:
            for i in xrange(truthInt[1], truthInt[2]):
                assert states[i] == truthInt[3]
# Esempio n. 36 (0)  -- example separator left by the code-scrape; commented out so the file parses
def main(argv=None):
    """Train a teHMM (or, with --cfg, a context-free grammar) on genome
    annotation tracks and pickle the resulting model to disk.

    Steps: parse and validate command-line options, load the training
    intervals and (optionally) segment intervals, load the track data,
    run one or more training replicates (optionally in parallel threads),
    select the replicate with the highest log likelihood, and save it
    (plus, with --saveAllReps, every other replicate) to args.outputModel.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train model on.  If --supervised "
                        "is used, the names in this bed file will be treated "
                        "as the true annotation (otherwise it is only used for "
                        "interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type = int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type = int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--cfg", help="Use Context Free Grammar insead of "
                        "HMM.  Only works with --supervised for now",
                        action = "store_true", default = False)
    parser.add_argument("--saPrior", help="Confidence in self alignment "
                        "track for CFG.  Probability of pair emission "
                        "is multiplied by this number if the bases are aligned"
                        " and its complement if bases are not aligned. Must"
                        " be between [0,1].", default=0.95, type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as pair-emitors"
                        " for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after each training iteration"
                        " (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed)" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after each training iteration "
                        "(unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed.)" ,
                        default = None) 
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing an"
                        " emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as "
                        "single columns for HMM (ie as created with "
                        "segmentTracks.py).  IMPORTANT: this file must cover "
                        "the same regions as the traininBed file. Unless in "
                        "supervised mode, probably best to use same bed file "
                        " as both traingBed and --segment argument.  Otherwise"
                        " use intersectBed to make sure the overlap is exact",
                        default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --rep) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()

    # ---- validate option combinations ----
    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    # BUGFIX: this used to test "args.fixEm is not None", which is always
    # True (fixEm is a store_true bool, never None), so --cfg always raised
    # the error below even when none of these options were given.
    if args.initTransProbs is not None or args.fixTrans is True or\
      args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--transProbs, --fixTrans, --emProbs, --fixEm "
                               "are not currently compatible with --cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
      and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
      args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction with"
                           " --initTransProbs")
    if args.emRandRange is not None:
        # parse "lo,hi" into a float pair; any malformed input is reported
        # with a single clear error
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        except:
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here. now overriden with above options
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except Exception:
            raise RuntimeError("bed file passed with --segments option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    # BUGFIX: warn about suspicious --segLen values *before* resetting the
    # field to None (the old order compared None > 0, which silently skipped
    # the warning on Python 2 and would raise a TypeError on Python 3)
    if args.segLen > 0 and args.segLen != 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction).  Values > 1 may cause bias.")
    if args.segLen <= 0:
        # downstream code treats None as "no segment length normalization"
        args.segLen = None

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overrided by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overrided by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model: one seed per replicate; --seed fixes the first
    # replicate's seed and the generator used to draw the rest
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)]

    def trainClosure(randomSeed):
        # wrapper so runParallelShellCommands can map a seed to a model
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans, truthIntervals=truthIntervals,
                          args=args)

    modelList = runParallelShellCommands(argList=seeds, numProc = args.numThreads,
                                         execFunction = trainClosure,
                                         useThreads = True)

    # select the replicate with the highest final log probability;
    # replicates reporting None (no probability available) never win
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] is not None:
            if curModel[1] > bestModel[1]:
                bestModel = curModel
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]

    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)                
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
Esempio n. 37
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Find candidate TSDs (exact forward matches) flanking given"
        "BED intervals.  Score is distance between TSD and bed interval.")
    parser.add_argument("fastaSequence", help="DNA sequence in FASTA format")
    parser.add_argument("inBed",
                        help="BED file with TEs whose flanking regions "
                        "we wish to search")
    parser.add_argument("outBed",
                        help="BED file containing (only) output TSDs")
    parser.add_argument("--min",
                        help="Minimum length of a TSD",
                        default=4,
                        type=int)
    parser.add_argument("--max",
                        help="Maximum length of a TSD",
                        default=6,
                        type=int)
    parser.add_argument("--all",
                        help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour",
                        action="store_true",
                        default=False)
    parser.add_argument("--maxScore",
                        help="Only report matches with given "
                        "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None,
                        type=int)
    parser.add_argument("--left",
                        help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=7,
                        type=int)
    parser.add_argument("--right",
                        help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=7,
                        type=int)
    parser.add_argument("--overlap",
                        help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap",
                        default=3,
                        type=int)
    parser.add_argument("--leftName",
                        help="Name of left TSDs in output Bed",
                        default="L_TSD")
    parser.add_argument("--rightName",
                        help="Name of right TSDs in output Bed",
                        default="R_TSD")
    parser.add_argument("--id",
                        help="Assign left/right pairs of TSDs a unique"
                        " matching ID",
                        action="store_true",
                        default=False)
    parser.add_argument("--names",
                        help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed",
                        default=None)
    parser.add_argument("--numProc",
                        help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file",
                        type=int,
                        default=1)
    parser.add_argument("--sequences",
                        help="Only process given sequences of input"
                        " FASTA file (comma-separated list).",
                        default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    assert os.path.exists(args.inBed)
    assert os.path.exists(args.fastaSequence)
    assert args.min <= args.max
    args.nextId = 0

    if args.sequences is not None:
        args.sequences = set(args.sequences.split(","))

    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.inBed)
    bedIntervals = readBedIntervals(args.inBed, ncol=4, sort=True)
    if bedIntervals is None or len(bedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" % args.inBed)

    if args.numProc > 1:
        runParallel(args, bedIntervals)
        return 0

    tsds = findTsds(args, bedIntervals)

    writeBedIntervals(tsds, args.outBed)