Exemple #1
0
def runTsdFinder(faPath, inBedPath, outBedPath, args):
    """ Call tsdFinder.py and either overwrite or append its output, then run
    removeBedOverlaps.py on the final output to make sure it is clean.

    faPath: path handed to tsdFinder.py as its first argument
    inBedPath: input intervals.  A path whose extension is ".bb" (any case)
        is first converted to a temporary BED file with bigBedToBed
    outBedPath: destination file.  When it already exists and args.append
        is True, its current contents are merged with the new results
    args: options object; args.tsdFinderOptions is pasted onto the
        tsdFinder.py command line, args.append selects append mode """

    # convert input to bed if necessary
    tempBed = None
    if os.path.splitext(inBedPath)[1].lower() == ".bb":
        tempBed = getLocalTempPath("Temp_addTsdTrack", ".bed")
        # convert the path we were given (the original referenced an
        # undefined name here and crashed on every bigBed input)
        runShellCommand("bigBedToBed %s %s" % (inBedPath, tempBed))
        inBedPath = tempBed

    # run tsdfinder on input
    tempOut = getLocalTempPath("Temp_addTsdTrack", ".bed")
    runShellCommand("tsdFinder.py %s %s %s %s" % (faPath, inBedPath,
                                                  tempOut,
                                                  args.tsdFinderOptions))
    if tempBed is not None:
        runShellCommand("rm %s" % tempBed)

    # merge with existing track: the old contents are appended to the new
    # results so that the overlap removal below sees both
    if os.path.isfile(outBedPath) and args.append is True:
        runShellCommand("cat %s >> %s" % (outBedPath, tempOut))

    # remove overlaps into final output
    runShellCommand("removeBedOverlaps.py %s > %s" % (tempOut, outBedPath))

    runShellCommand("rm %s" % tempOut)
Exemple #2
0
def runCleaning(args, tempTracksInfo):
    """ Run the cleaning script selected by each track's preprocess setting
    and save a tracks XML whose paths point at the cleaned files.

    args: options object; args.tracksInfo is the tracks XML to load (it is
        also handed to cleanPath() to choose each output path)
    tempTracksInfo: path where the updated tracks XML is written

    Tracks without a preprocess setting are left alone.  Recognised
    settings are "rm" / "rmu" (cleanRM.py followed by removeBedOverlaps.py
    --rm), "termini" (cleanTermini.py), "overlap" (removeBedOverlaps.py)
    and "ltr_finder" (cleanLtrFinderID.py); any other value is ignored. """
    trackList = TrackList(args.tracksInfo)

    for track in trackList:
        preprocess = track.getPreprocess()
        if preprocess is None:
            continue

        # convert bigbed/wig to a temporary text file.  The cleaning scripts
        # below read inFile, so they get the converted copy (previously each
        # branch reset inFile to the original path and the conversion was
        # thrown away unused)
        inFile = track.getPath()
        tempBed1 = None
        if inFile.endswith(".bb") or inFile.endswith(".bw"):
            tempBed1 = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            if inFile.endswith(".bb"):
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed1))
            else:
                runShellCommand("bigWigToBedGraph %s %s" % (inFile, tempBed1))
            inFile = tempBed1

        # run cleanRM.py on all tracks with rm or rmu preprocessor
        if preprocess == "rm" or preprocess == "rmu":
            # this used to be a comparison (==), so the flag was never set
            flag = ""
            if preprocess == "rmu":
                flag = "--keepUnderscore"
            outFile = cleanPath(args, track)
            tempBed = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            runShellCommand("cleanRM.py %s %s > %s" % (inFile, flag, tempBed))
            runShellCommand("removeBedOverlaps.py --rm %s > %s" %
                            (tempBed, outFile))
            runShellCommand("rm -f %s" % tempBed)
            track.setPath(outFile)

        # run cleanTermini.py
        elif preprocess == "termini":
            outFile = cleanPath(args, track)
            runShellCommand("cleanTermini.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # run removeBedOverlaps
        elif preprocess == "overlap":
            outFile = cleanPath(args, track)
            runShellCommand("removeBedOverlaps.py %s > %s" % (inFile, outFile))
            track.setPath(outFile)

        # run cleanLtrFinder.py
        elif preprocess == "ltr_finder":
            outFile = cleanPath(args, track)
            # note: overlaps now removed in cleanLtrFinderID script
            runShellCommand("cleanLtrFinderID.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        if tempBed1 is not None:
            runShellCommand("rm -f %s" % tempBed1)

    # save a temporary xml
    trackList.saveXML(tempTracksInfo)
Exemple #3
0
def runParallel(args, bedIntervals):
    """ Quick hack to rerun parallel jobs on different interval subsets.

    The selected intervals are split into at most args.numProc temporary
    BED files.  The current command line (sys.argv) is then rerun once per
    file with "--numProc 1" and with the input / output paths textually
    replaced by temporary ones, and the temporary outputs are concatenated
    into args.outBed.

    args: parsed options; reads names, numProc, inBed and outBed
    bedIntervals: sequence of intervals; it is iterated twice.  When
        args.names is set (comma-separated), only intervals whose 4th
        column is one of those names are used """
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))

    def isSelected(interval):
        # the 4th column, when present, holds the interval's name
        name = interval[3] if len(interval) > 3 else None
        return nameSet is None or name in nameSet

    # chunk up BED input: count first so the chunk size can be derived
    numIntervals = sum(1 for interval in bedIntervals if isSelected(interval))
    jobSize = 1 + (numIntervals / args.numProc)
    logger.info("Dviding %d intervals into %d processes (%d intervals per)" %
                (numIntervals, args.numProc, jobSize))
    tempBeds = []
    # starting "full" forces a new chunk file for the first kept interval
    curSize = sys.maxint
    curFile = None
    for interval in bedIntervals:
        if not isSelected(interval):
            continue
        if curSize >= jobSize:
            if curFile is not None:
                curFile.close()
            chunkPath = getLocalTempPath("TempTsdFinderIn", ".bed")
            tempBeds.append(chunkPath)
            curFile = open(chunkPath, "w")
            curSize = 0
        curFile.write("\t".join([str(s) for s in interval]))
        curFile.write("\n")
        curSize += 1
    if curFile is not None:
        curFile.close()

    # map jobs
    assert len(tempBeds) <= args.numProc
    tempOuts = []
    jobCmds = []
    for chunkPath in tempBeds:
        cmdLine = " ".join(sys.argv)
        cmdLine = cmdLine.replace("--numProc %d" % args.numProc, "--numProc 1")
        cmdLine = cmdLine.replace(args.inBed, chunkPath)
        tempOut = getLocalTempPath("TempTsdFinderOut", ".bed")
        cmdLine = cmdLine.replace(args.outBed, tempOut)
        tempOuts.append(tempOut)
        jobCmds.append(cmdLine)

    runParallelShellCommands(jobCmds, args.numProc)

    # the chunk inputs are only needed while the jobs run; they used to be
    # left behind in the temp directory
    for chunkPath in tempBeds:
        runShellCommand("rm -f %s" % chunkPath)

    # reduce: the first output becomes args.outBed, the others are appended
    for i, tempOut in enumerate(tempOuts):
        if i == 0:
            runShellCommand("mv %s %s" % (tempOut, args.outBed))
        else:
            runShellCommand("cat %s >> %s" % (tempOut, args.outBed))
            runShellCommand("rm -f %s" % (tempOut))
Exemple #4
0
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Subtract mask-track intervals from inBed and return the path of the
    resulting temporary BED file.

    The idea is that it makes less sense to simply ignore, say, giant
    stretches of N's (like centromeres), as masking normally would, than it
    does to remove them entirely and split the genome into several chunks.
    Can also be used during comparison to get rid of all masked intervals.

    The first three columns of every mask track listed in tracksInfoPath
    are pooled, sorted and merged, then passed through filterBedLengths.py
    with the bounds minLength + 1 and maxLength - 1; the output of that
    filter is subtracted from a copy of inBed.

    Returns None when there are no mask tracks.  Raises RuntimeError when
    the resulting file is empty. """
    resultPath = getLocalTempPath("Tempcut", ".bed")
    maskPaths = [t.getPath()
                 for t in TrackList(tracksInfoPath).getMaskTracks()]
    if not maskPaths:
        return None

    pooledPath = getLocalTempPath("Tempcut1", ".bed")
    scratchPath = getLocalTempPath("Tempcut2", ".bed")
    runShellCommand("cp %s %s" % (inBed, resultPath))

    # pool chrom / start / end of every mask track into one file
    extractCmd = "cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s"
    for maskPath in maskPaths:
        runShellCommand(extractCmd % (maskPath, pooledPath))

    if os.path.getsize(pooledPath) > 0:
        # the two scratch files are used alternately as input and output
        sortMergeArgs = (pooledPath, scratchPath, scratchPath, pooledPath)
        runShellCommand("sortBed -i %s > %s ; mergeBed -i %s > %s" %
                        sortMergeArgs)
        filterArgs = (pooledPath, minLength + 1, maxLength - 1, scratchPath)
        runShellCommand("filterBedLengths.py %s %d %d > %s" % filterArgs)
        subtractArgs = (resultPath, scratchPath, pooledPath)
        runShellCommand("subtractBed -a %s -b %s | sortBed > %s" %
                        subtractArgs)
        runShellCommand("mv %s %s" % (pooledPath, resultPath))
    runShellCommand("rm -f %s %s" % (pooledPath, scratchPath))

    if os.path.getsize(resultPath) == 0:
        raise RuntimeError(
            "cutOutMaskIntervals removed everything.  Can't continue."
            " probably best to rerun calling script on bigger region?")
    return resultPath
Exemple #5
0
def filterCutTrack(genomePath, fragmentFilterLen, trackListPath, cutTrackName,
                   cutTrackLenFilter):
    """ Build the length-filtered cut track and return its path.

    The named track is passed through filterBedLengths.py with
    cutTrackLenFilter.  What is left of genomePath after subtracting that
    track is passed through filterBedLengths.py again with
    fragmentFilterLen and --rename; the renamed lines are added to the cut
    track and the union is sorted and merged.

    genomePath: BED file the cut track is subtracted from
    trackListPath / cutTrackName: tracks XML and the name of the cut track
        in it (asserted to exist) """
    # NOTE(review): outDir is not a parameter of this function, so it has
    # to come from the enclosing module -- confirm it is defined there
    track = TrackList(trackListPath).getTrackByName(cutTrackName)
    assert track is not None
    originalPath = track.getPath()
    cutTrackPath = getOutPath(originalPath, outDir,
                              "filter%d" % cutTrackLenFilter)
    lengthFilterArgs = (originalPath, cutTrackLenFilter, cutTrackPath)
    runShellCommand("filterBedLengths.py %s %s > %s" % lengthFilterArgs)

    # what remains of the genome once the cut track is removed
    scratchA = getLocalTempPath("Temp", ".bed")
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" %
                    (genomePath, cutTrackPath, scratchA))

    # a long random tag lets grep pick out exactly the lines that
    # filterBedLengths.py renamed
    scratchB = getLocalTempPath("Temp", ".bed")
    alphabet = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(alphabet) for _ in range(200))
    runShellCommand(
        "filterBedLengths.py %s %d --rename %s |grep %s | sortBed> %s" %
        (scratchA, fragmentFilterLen, tag, tag, scratchB))

    # give both sets of intervals the same name / score / strand columns,
    # collecting them in scratchA (first write truncates, second appends)
    setColsCmd = ("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 "
                  "| setBedCol.py 5 . ")
    runShellCommand((setColsCmd + "> %s") % (scratchB, scratchA))
    runShellCommand((setColsCmd + ">> %s") % (cutTrackPath, scratchA))

    runShellCommand("sortBed -i %s > %s" % (scratchA, scratchB))
    runShellCommand("mergeBed -i %s > %s" % (scratchB, cutTrackPath))
    runShellCommand("rm -f %s %s" % (scratchA, scratchB))
    return cutTrackPath
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Subtract mask-track intervals from inBed and return the path of the
    resulting temporary BED file.

    The idea is that it makes less sense to simply ignore, say, giant
    stretches of N's (like centromeres), as masking normally would, than it
    does to remove them entirely and split the genome into several chunks.
    Can also be used during comparison to get rid of all masked intervals.

    The first three columns of every mask track listed in tracksInfoPath
    are pooled, sorted and merged, then passed through filterBedLengths.py
    with the bounds minLength + 1 and maxLength - 1; the output of that
    filter is subtracted from a copy of inBed.

    Returns None when there are no mask tracks.  Raises RuntimeError when
    the resulting file is empty. """
    resultPath = getLocalTempPath("Tempcut", ".bed")
    maskPaths = [t.getPath()
                 for t in TrackList(tracksInfoPath).getMaskTracks()]
    if not maskPaths:
        return None

    pooledPath = getLocalTempPath("Tempcut1", ".bed")
    scratchPath = getLocalTempPath("Tempcut2", ".bed")
    runShellCommand("cp %s %s" % (inBed, resultPath))

    # pool chrom / start / end of every mask track into one file
    extractCmd = "cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s"
    for maskPath in maskPaths:
        runShellCommand(extractCmd % (maskPath, pooledPath))

    if os.path.getsize(pooledPath) > 0:
        # the two scratch files are used alternately as input and output
        sortMergeArgs = (pooledPath, scratchPath, scratchPath, pooledPath)
        runShellCommand("sortBed -i %s > %s ; mergeBed -i %s > %s" %
                        sortMergeArgs)
        filterArgs = (pooledPath, minLength + 1, maxLength - 1, scratchPath)
        runShellCommand("filterBedLengths.py %s %d %d > %s" % filterArgs)
        subtractArgs = (resultPath, scratchPath, pooledPath)
        runShellCommand("subtractBed -a %s -b %s | sortBed > %s" %
                        subtractArgs)
        runShellCommand("mv %s %s" % (pooledPath, resultPath))
    runShellCommand("rm -f %s %s" % (pooledPath, scratchPath))

    if os.path.getsize(resultPath) == 0:
        raise RuntimeError(
            "cutOutMaskIntervals removed everything.  Can't continue."
            " probably best to rerun calling script on bigger region?")
    return resultPath
Exemple #7
0
def filterCutTrack(genomePath, fragmentFilterLen, trackListPath, cutTrackName,
                   cutTrackLenFilter):
    """ Build the length-filtered cut track and return its path.

    The named track is passed through filterBedLengths.py with
    cutTrackLenFilter.  What is left of genomePath after subtracting that
    track is passed through filterBedLengths.py again with
    fragmentFilterLen and --rename; the renamed lines are added to the cut
    track and the union is sorted and merged.

    genomePath: BED file the cut track is subtracted from
    trackListPath / cutTrackName: tracks XML and the name of the cut track
        in it (asserted to exist) """
    # NOTE(review): outDir is not a parameter of this function, so it has
    # to come from the enclosing module -- confirm it is defined there
    track = TrackList(trackListPath).getTrackByName(cutTrackName)
    assert track is not None
    originalPath = track.getPath()
    cutTrackPath = getOutPath(originalPath, outDir,
                              "filter%d" % cutTrackLenFilter)
    lengthFilterArgs = (originalPath, cutTrackLenFilter, cutTrackPath)
    runShellCommand("filterBedLengths.py %s %s > %s" % lengthFilterArgs)

    # what remains of the genome once the cut track is removed
    scratchA = getLocalTempPath("Temp", ".bed")
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" %
                    (genomePath, cutTrackPath, scratchA))

    # a long random tag lets grep pick out exactly the lines that
    # filterBedLengths.py renamed
    scratchB = getLocalTempPath("Temp", ".bed")
    alphabet = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(alphabet) for _ in range(200))
    runShellCommand(
        "filterBedLengths.py %s %d --rename %s |grep %s | sortBed> %s" %
        (scratchA, fragmentFilterLen, tag, tag, scratchB))

    # give both sets of intervals the same name / score / strand columns,
    # collecting them in scratchA (first write truncates, second appends)
    setColsCmd = ("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 "
                  "| setBedCol.py 5 . ")
    runShellCommand((setColsCmd + "> %s") % (scratchB, scratchA))
    runShellCommand((setColsCmd + ">> %s") % (cutTrackPath, scratchA))

    runShellCommand("sortBed -i %s > %s" % (scratchA, scratchB))
    runShellCommand("mergeBed -i %s > %s" % (scratchB, cutTrackPath))
    runShellCommand("rm -f %s %s" % (scratchA, scratchB))
    return cutTrackPath
Exemple #8
0
def runParallel(args, bedIntervals):
    """ Quick hack to rerun parallel jobs on different interval subsets.

    The selected intervals are split into at most args.numProc temporary
    BED files.  The current command line (sys.argv) is then rerun once per
    file with "--numProc 1" and with the input / output paths textually
    replaced by temporary ones, and the temporary outputs are concatenated
    into args.outBed.

    args: parsed options; reads names, numProc, inBed and outBed
    bedIntervals: sequence of intervals; it is iterated twice.  When
        args.names is set (comma-separated), only intervals whose 4th
        column is one of those names are used """
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))

    def isSelected(interval):
        # the 4th column, when present, holds the interval's name
        name = interval[3] if len(interval) > 3 else None
        return nameSet is None or name in nameSet

    # chunk up BED input: count first so the chunk size can be derived
    numIntervals = sum(1 for interval in bedIntervals if isSelected(interval))
    jobSize = 1 + (numIntervals / args.numProc)
    logger.info("Dviding %d intervals into %d processes (%d intervals per)" %
                (numIntervals, args.numProc, jobSize))
    tempBeds = []
    # starting "full" forces a new chunk file for the first kept interval
    curSize = sys.maxint
    curFile = None
    for interval in bedIntervals:
        if not isSelected(interval):
            continue
        if curSize >= jobSize:
            if curFile is not None:
                curFile.close()
            chunkPath = getLocalTempPath("TempTsdFinderIn", ".bed")
            tempBeds.append(chunkPath)
            curFile = open(chunkPath, "w")
            curSize = 0
        curFile.write("\t".join([str(s) for s in interval]))
        curFile.write("\n")
        curSize += 1
    if curFile is not None:
        curFile.close()

    # map jobs
    assert len(tempBeds) <= args.numProc
    tempOuts = []
    jobCmds = []
    for chunkPath in tempBeds:
        cmdLine = " ".join(sys.argv)
        cmdLine = cmdLine.replace("--numProc %d" % args.numProc, "--numProc 1")
        cmdLine = cmdLine.replace(args.inBed, chunkPath)
        tempOut = getLocalTempPath("TempTsdFinderOut", ".bed")
        cmdLine = cmdLine.replace(args.outBed, tempOut)
        tempOuts.append(tempOut)
        jobCmds.append(cmdLine)

    runParallelShellCommands(jobCmds, args.numProc)

    # the chunk inputs are only needed while the jobs run; they used to be
    # left behind in the temp directory
    for chunkPath in tempBeds:
        runShellCommand("rm -f %s" % chunkPath)

    # reduce: the first output becomes args.outBed, the others are appended
    for i, tempOut in enumerate(tempOuts):
        if i == 0:
            runShellCommand("mv %s %s" % (tempOut, args.outBed))
        else:
            runShellCommand("cat %s >> %s" % (tempOut, args.outBed))
            runShellCommand("rm -f %s" % (tempOut))
Exemple #9
0
def runTsdFinder(faPath, inBedPath, outBedPath, args):
    """ Call tsdFinder.py and either overwrite or append its output, then run
    removeBedOverlaps.py on the final output to make sure it is clean.

    faPath: path handed to tsdFinder.py as its first argument
    inBedPath: input intervals.  A path whose extension is ".bb" (any case)
        is first converted to a temporary BED file with bigBedToBed
    outBedPath: destination file.  When it already exists and args.append
        is True, its current contents are merged with the new results
    args: options object; args.tsdFinderOptions is pasted onto the
        tsdFinder.py command line, args.append selects append mode """

    # convert input to bed if necessary
    tempBed = None
    if os.path.splitext(inBedPath)[1].lower() == ".bb":
        tempBed = getLocalTempPath("Temp_addTsdTrack", ".bed")
        # convert the path we were given (the original referenced an
        # undefined name here and crashed on every bigBed input)
        runShellCommand("bigBedToBed %s %s" % (inBedPath, tempBed))
        inBedPath = tempBed

    # run tsdfinder on input
    tempOut = getLocalTempPath("Temp_addTsdTrack", ".bed")
    runShellCommand("tsdFinder.py %s %s %s %s" %
                    (faPath, inBedPath, tempOut, args.tsdFinderOptions))
    if tempBed is not None:
        runShellCommand("rm %s" % tempBed)

    # merge with existing track: the old contents are appended to the new
    # results so that the overlap removal below sees both
    if os.path.isfile(outBedPath) and args.append is True:
        runShellCommand("cat %s >> %s" % (outBedPath, tempOut))

    # remove overlaps into final output
    runShellCommand("removeBedOverlaps.py %s > %s" % (tempOut, outBedPath))

    runShellCommand("rm %s" % tempOut)
Exemple #10
0
def runCleaning(args, tempTracksInfo):
    """ Run the cleaning script selected by each track's preprocess setting
    and save a tracks XML whose paths point at the cleaned files.

    args: options object; args.tracksInfo is the tracks XML to load (it is
        also handed to cleanPath() to choose each output path)
    tempTracksInfo: path where the updated tracks XML is written

    Tracks without a preprocess setting are left alone.  Recognised
    settings are "rm" / "rmu" (cleanRM.py followed by removeBedOverlaps.py
    --rm), "termini" (cleanTermini.py), "overlap" (removeBedOverlaps.py)
    and "ltr_finder" (cleanLtrFinderID.py); any other value is ignored. """
    trackList = TrackList(args.tracksInfo)

    for track in trackList:
        preprocess = track.getPreprocess()
        if preprocess is None:
            continue

        # convert bigbed/wig to a temporary text file.  The cleaning scripts
        # below read inFile, so they get the converted copy (previously each
        # branch reset inFile to the original path and the conversion was
        # thrown away unused)
        inFile = track.getPath()
        tempBed1 = None
        if inFile.endswith(".bb") or inFile.endswith(".bw"):
            tempBed1 = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            if inFile.endswith(".bb"):
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed1))
            else:
                runShellCommand("bigWigToBedGraph %s %s" % (inFile, tempBed1))
            inFile = tempBed1

        # run cleanRM.py on all tracks with rm or rmu preprocessor
        if preprocess == "rm" or preprocess == "rmu":
            # this used to be a comparison (==), so the flag was never set
            flag = ""
            if preprocess == "rmu":
                flag = "--keepUnderscore"
            outFile = cleanPath(args, track)
            tempBed = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            runShellCommand("cleanRM.py %s %s > %s" % (inFile, flag, tempBed))
            runShellCommand("removeBedOverlaps.py --rm %s > %s" %
                            (tempBed, outFile))
            runShellCommand("rm -f %s" % tempBed)
            track.setPath(outFile)

        # run cleanTermini.py
        elif preprocess == "termini":
            outFile = cleanPath(args, track)
            runShellCommand("cleanTermini.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # run removeBedOverlaps
        elif preprocess == "overlap":
            outFile = cleanPath(args, track)
            runShellCommand("removeBedOverlaps.py %s > %s" % (inFile, outFile))
            track.setPath(outFile)

        # run cleanLtrFinder.py
        elif preprocess == "ltr_finder":
            outFile = cleanPath(args, track)
            # note: overlaps now removed in cleanLtrFinderID script
            runShellCommand("cleanLtrFinderID.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        if tempBed1 is not None:
            runShellCommand("rm -f %s" % tempBed1)

    # save a temporary xml
    trackList.saveXML(tempTracksInfo)
Exemple #11
0
def cutBedRegion(bedInterval, cutTrackPath, inBed, outBed):
    """ Write to outBed the part of inBed that falls inside bedInterval,
    minus everything covered by cutTrackPath.

    bedInterval: indexable; elements 0, 1 and 2 are written as one
        tab-separated line that serves as the region to intersect with
    outBed: removed first if present, then rewritten (sorted) """
    intersectedPath = getLocalTempPath("Temp_cut", ".bed")
    regionPath = getLocalTempPath("Temp_cut", ".bed")
    runShellCommand("rm -f %s" % outBed)

    # a one-line BED file describing the region
    chrom, start, end = bedInterval[0], bedInterval[1], bedInterval[2]
    echoCmd = "echo \"%s\t%s\t%s\n\" > %s" % (chrom, start, end, regionPath)
    runShellCommand(echoCmd)

    # restrict to the region, then remove the cut track
    intersectCmd = "intersectBed -a %s -b %s | sortBed > %s" % (
        inBed, regionPath, intersectedPath)
    runShellCommand(intersectCmd)
    subtractCmd = "subtractBed -a %s -b %s | sortBed > %s" % (
        intersectedPath, cutTrackPath, outBed)
    runShellCommand(subtractCmd)

    runShellCommand("rm -f %s %s" % (intersectedPath, regionPath))
Exemple #12
0
def combineTrack(track, outPath, tempRegionPath, iter, args):
    """ Merge one track into the combined BED file at outPath.

    track: track object; its path, value column, distribution and name
        are read
    outPath: combined file; the processed track is appended to it and
        removeBedOverlaps.py is then run over the whole file
    tempRegionPath: BED file the track is intersected with and that
        addBedGaps.py fills gaps against
    iter: 0 for the first track merged (its state names are copied as they
        are); otherwise fitStateNames.py is run against outPath first
    args: options; reads outside (the gap state name) and fitThresh """
    # make sure track is of form chrom start end state
    columnPath = getLocalTempPath("Temp", "_col.bed")
    columnFile = open(columnPath, "w")
    valCol = track.getValCol() + 1
    isBinary = track.getDist() == "binary"
    if isBinary:
        assert track.getName() != args.outside
        valCol = 3
    intervals = readBedIntervals(track.getPath(), valCol, sort=True)
    # state name = track name for binary track
    stateSuffix = ("\t%s" % track.getName()) if isBinary else ""
    for interval in intervals:
        fields = "\t".join([str(x) for x in interval])
        columnFile.write(fields + stateSuffix + "\n")
    columnFile.close()

    # intersect the target region
    intersectPath = getLocalTempPath("Temp", "_int.bed")
    intersectCmd = "intersectBed -a %s -b %s > %s" % (
        columnPath, tempRegionPath, intersectPath)
    runShellCommand(intersectCmd)

    # add the outside states
    gappedPath = getLocalTempPath("Temp", "_gap.bed")
    gapCmd = "addBedGaps.py --state %s %s %s %s" % (
        args.outside, tempRegionPath, intersectPath, gappedPath)
    runShellCommand(gapCmd)

    # fit the names with the previous iterations' result
    fitPath = getLocalTempPath("Temp", "_fit.bed")
    if iter != 0:
        fitCmd = "fitStateNames.py %s %s %s --qualThresh %f --ignoreTgt %s" % (
            outPath, gappedPath, fitPath, args.fitThresh, args.outside)
    else:
        fitCmd = "cp %s %s" % (gappedPath, fitPath)
    runShellCommand(fitCmd)

    # now merge into outPath (the column temp file is reused as scratch)
    runShellCommand("cat %s >> %s" % (fitPath, outPath))
    runShellCommand("removeBedOverlaps.py %s > %s" % (outPath, columnPath))
    runShellCommand("mv %s %s" % (columnPath, outPath))

    # clean up (the column file should already be gone after the mv)
    for leftover in (columnPath, intersectPath, gappedPath, fitPath):
        runShellCommand("rm -f %s" % leftover)
Exemple #13
0
def cutBedRegion(bedInterval, cutTrackPath, inBed, outBed):
    """ Write to outBed the part of inBed that falls inside bedInterval,
    minus everything covered by cutTrackPath.

    bedInterval: indexable; elements 0, 1 and 2 are written as one
        tab-separated line that serves as the region to intersect with
    outBed: removed first if present, then rewritten (sorted) """
    intersectedPath = getLocalTempPath("Temp_cut", ".bed")
    regionPath = getLocalTempPath("Temp_cut", ".bed")
    runShellCommand("rm -f %s" % outBed)

    # a one-line BED file describing the region
    chrom, start, end = bedInterval[0], bedInterval[1], bedInterval[2]
    echoCmd = "echo \"%s\t%s\t%s\n\" > %s" % (chrom, start, end, regionPath)
    runShellCommand(echoCmd)

    # restrict to the region, then remove the cut track
    intersectCmd = "intersectBed -a %s -b %s | sortBed > %s" % (
        inBed, regionPath, intersectedPath)
    runShellCommand(intersectCmd)
    subtractCmd = "subtractBed -a %s -b %s | sortBed > %s" % (
        intersectedPath, cutTrackPath, outBed)
    runShellCommand(subtractCmd)

    runShellCommand("rm -f %s %s" % (intersectedPath, regionPath))
Exemple #14
0
def checkExactOverlap(bed1, bed2):
    """ Make sure two bed files cover the same region exactly: a requirement
    for all code based on the comparisons in this module.

    bed1, bed2: paths of the BED files to check

    Raises RuntimeError when either file is empty, when either file holds
    out-of-order or overlapping consecutive intervals, or when one file
    covers a region that the other does not.  Returns nothing. """

    errorMessage = (
        "Bed files %s and %s cannot be compared. xxx. "
        " Input files must be both sorted, cover the exact same region,"
        " and contain no self-overlaps.") % (bed1, bed2)

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        raise RuntimeError(
            errorMessage.replace("xxx", "one or both inputs empty"))

    # test self-overlap and sorting of each input in turn.  (The second
    # pass used to re-read bed1, so bed2 was never actually checked.)
    for label, bedPath in (("input1", bed1), ("input2", bed2)):
        intervals = readBedIntervals(bedPath, sort=False)
        for i in xrange(1, len(intervals)):
            prev, cur = intervals[i - 1], intervals[i]
            if intersectSize(prev, cur) != 0:
                raise RuntimeError(
                    errorMessage.replace(
                        "xxx",
                        "Overlapping intervals %s and %s found in %s" %
                        (prev, cur, label)))
            if prev > cur:
                raise RuntimeError(
                    errorMessage.replace(
                        "xxx",
                        "Out of order intervals %s and %s found in %s" %
                        (prev, cur, label)))

    # test intersection size: subtracting either file from the other must
    # leave nothing behind
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        runShellCommand("subtractBed -a %s -b %s > %s" %
                        (bed1, bed2, tempFile))
        if os.path.getsize(tempFile) != 0:
            raise RuntimeError(
                errorMessage.replace("xxx",
                                     "Input1 covers regions outside input2"))
        runShellCommand("subtractBed -a %s -b %s > %s" %
                        (bed2, bed1, tempFile))
        if os.path.getsize(tempFile) != 0:
            raise RuntimeError(
                errorMessage.replace("xxx",
                                     "Input2 covers regions outside input1"))
    finally:
        # also runs when a shell command fails, so the temp file never leaks
        runShellCommand("rm -f %s" % tempFile)
Exemple #15
0
def main(argv=None):
    """ Command-line entry point: annotate the intervals of a BED file with
    values taken from one track of a tracks info file.

    argv: full argument vector including the program name; defaults to
        sys.argv.  (It used to be accepted but ignored: parsing always
        read sys.argv.)

    Raises RuntimeError when the requested track is not in the tracks file.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name",
                        help="Set ID field (column 4 instead of 5)",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    # argv[0] is the program name, which argparse does not expect
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    singleListPath = None
    try:
        # read the tracks list
        trackList = TrackList(args.tracksInfo)
        track = trackList.getTrackByName(args.track)
        if track is None:
            raise RuntimeError("Can't find track %s" % args.track)
        # make temporary tracks list with just our track so we can keep
        # using tracks list interface but not read unnecessary data.
        singleListPath = getLocalTempPath("Temp_secScore", ".bed")
        trackList.trackList = [track]
        trackList.saveXML(singleListPath)

        with open(args.outBed, "w") as obFile:
            # trackData interface not so great at cherry picking intervals.
            # need to merge them up and use segmentation interface
            filledIntervals, mergedIntervals = fillGaps(args.inBed)

            # read track into trackData
            trackData = TrackData()
            logger.info("loading track %s" % singleListPath)
            trackData.loadTrackData(singleListPath,
                                    mergedIntervals,
                                    segmentIntervals=filledIntervals,
                                    applyMasking=False)

            # finally, write the annotation
            writeAnnotatedIntervals(trackData, filledIntervals,
                                    mergedIntervals, obFile, args)
    finally:
        # release the temporaries even when something above failed
        if singleListPath is not None:
            runShellCommand("rm -f %s" % singleListPath)
        cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Command-line entry point: annotate the intervals of a BED file with
    values taken from one track of a tracks info file.

    argv: full argument vector including the program name; defaults to
        sys.argv.  (It used to be accepted but ignored: parsing always
        read sys.argv.)

    Raises RuntimeError when the requested track is not in the tracks file.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name", help="Set ID field (column 4 instead of 5)",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    # argv[0] is the program name, which argparse does not expect
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    singleListPath = None
    try:
        # read the tracks list
        trackList = TrackList(args.tracksInfo)
        track = trackList.getTrackByName(args.track)
        if track is None:
            raise RuntimeError("Can't find track %s" % args.track)
        # make temporary tracks list with just our track so we can keep
        # using tracks list interface but not read unnecessary data.
        singleListPath = getLocalTempPath("Temp_secScore", ".bed")
        trackList.trackList = [track]
        trackList.saveXML(singleListPath)

        with open(args.outBed, "w") as obFile:
            # trackData interface not so great at cherry picking intervals.
            # need to merge them up and use segmentation interface
            filledIntervals, mergedIntervals = fillGaps(args.inBed)

            # read track into trackData
            trackData = TrackData()
            logger.info("loading track %s" % singleListPath)
            trackData.loadTrackData(singleListPath, mergedIntervals,
                                    segmentIntervals=filledIntervals,
                                    applyMasking=False)

            # finally, write the annotation
            writeAnnotatedIntervals(trackData, filledIntervals,
                                    mergedIntervals, obFile, args)
    finally:
        # release the temporaries even when something above failed
        if singleListPath is not None:
            runShellCommand("rm -f %s" % singleListPath)
        cleanBedTool(tempBedToolPath)
Exemple #17
0
def checkExactOverlap(bed1, bed2):
    """ Make sure two bed files cover the same region exactly: a requirement
    for all code based on the comparisons in this module.

    bed1, bed2: paths of the BED files to check

    Raises RuntimeError when either file is empty, when either file holds
    out-of-order or overlapping consecutive intervals, or when one file
    covers a region that the other does not.  Returns nothing. """

    errorMessage = ("Bed files %s and %s cannot be compared. xxx. "
    " Input files must be both sorted, cover the exact same region,"
    " and contain no self-overlaps.") % (bed1, bed2)

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        raise RuntimeError(errorMessage.replace("xxx", "one or both inputs empty"))

    # test self-overlap and sorting of each input in turn.  (The second
    # pass used to re-read bed1, so bed2 was never actually checked.)
    for label, bedPath in (("input1", bed1), ("input2", bed2)):
        intervals = readBedIntervals(bedPath, sort=False)
        for i in xrange(1, len(intervals)):
            prev, cur = intervals[i-1], intervals[i]
            if intersectSize(prev, cur) != 0:
                raise RuntimeError(errorMessage.replace(
                    "xxx", "Overlapping intervals %s and %s found in %s" % (
                        prev, cur, label)))
            if prev > cur:
                raise RuntimeError(errorMessage.replace(
                    "xxx", "Out of order intervals %s and %s found in %s" % (
                        prev, cur, label)))

    # test intersection size: subtracting either file from the other must
    # leave nothing behind
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        runShellCommand("subtractBed -a %s -b %s > %s" % (bed1, bed2, tempFile))
        if os.path.getsize(tempFile) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Input1 covers regions outside input2"))
        runShellCommand("subtractBed -a %s -b %s > %s" % (bed2, bed1, tempFile))
        if os.path.getsize(tempFile) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Input2 covers regions outside input1"))
    finally:
        # also runs when a shell command fails, so the temp file never leaks
        runShellCommand("rm -f %s" % tempFile)
Exemple #18
0
def filterEmptyRegions(genomePath, regions, outDir, cutTrackPath):
    """ do a trial cut on each region.  return a list of those that
    aren't empty after cut.

    Each region is passed to cutBedRegion() together with cutTrackPath and
    genomePath, writing to a scratch BED file that is read back with
    bedRead() and then deleted.  Regions are returned in their input order.
    outDir is accepted for interface compatibility but is not used here. """
    filteredRegions = []
    for region in regions:
        tempPath1 = getLocalTempPath("Temp", ".bed")
        try:
            cutBedRegion(region, cutTrackPath, genomePath, tempPath1)
            intervals = bedRead(tempPath1)
        finally:
            # always drop the scratch file, even when the cut or read fails
            runShellCommand("rm -f %s" % tempPath1)
        if len(intervals) > 0:
            filteredRegions.append(region)
    return filteredRegions
Exemple #19
0
def filterEmptyRegions(genomePath, regions, outDir, cutTrackPath):
    """ do a trial cut on each region.  return a list of those that
    aren't empty after cut.

    Each region is passed to cutBedRegion() together with cutTrackPath and
    genomePath, writing to a scratch BED file that is read back with
    bedRead() and then deleted.  Regions are returned in their input order.
    outDir is accepted for interface compatibility but is not used here. """
    filteredRegions = []
    for region in regions:
        tempPath1 = getLocalTempPath("Temp", ".bed")
        try:
            cutBedRegion(region, cutTrackPath, genomePath, tempPath1)
            intervals = bedRead(tempPath1)
        finally:
            # always drop the scratch file, even when the cut or read fails
            runShellCommand("rm -f %s" % tempPath1)
        if len(intervals) > 0:
            filteredRegions.append(region)
    return filteredRegions
Exemple #20
0
def runVennMaker(args0):
    """ run venn_maker on the input files listed in args0.

    args0 is the options object; inputFiles, names and outTiff are read from
    it.  Every input file is passed through baserize() into a scratch BED
    file, and args0.inputFiles is overwritten in place with the scratch
    paths (as before).  The scratch files are removed before returning, also
    when baserize() or venn_maker() raises. """
    # the body used to read a name `args` that is not defined in this
    # function; bind it to the parameter that callers actually pass
    args = args0

    # venn_maker seems designed to run on intervals (and looks pretty broken doing this).
    # try converting to base intervals.
    todie = []
    try:
        for i, f in enumerate(args.inputFiles):
            tempFile = getLocalTempPath("Temp_%d" % i, ".bed")
            todie.append(tempFile)
            baserize(f, tempFile)
            args.inputFiles[i] = tempFile

        venn_maker(args.inputFiles, args.names, args.outTiff, "venn.R",
                   additional_args=None, run=True)
    finally:
        # scratch files are no longer needed once the plot is done (or failed)
        for f in todie:
            runShellCommand("rm -f %s" % f)
Exemple #21
0
def runVennMaker(args0):
    """ run venn_maker on the input files listed in args0.

    args0 is the options object; inputFiles, names and outTiff are read from
    it.  Every input file is passed through baserize() into a scratch BED
    file, and args0.inputFiles is overwritten in place with the scratch
    paths (as before).  The scratch files are removed before returning, also
    when baserize() or venn_maker() raises. """
    # the body used to read a name `args` that is not defined in this
    # function; bind it to the parameter that callers actually pass
    args = args0

    # venn_maker seems designed to run on intervals (and looks pretty broken doing this).
    # try converting to base intervals.
    todie = []
    try:
        for i, f in enumerate(args.inputFiles):
            tempFile = getLocalTempPath("Temp_%d" % i, ".bed")
            todie.append(tempFile)
            baserize(f, tempFile)
            args.inputFiles[i] = tempFile

        venn_maker(args.inputFiles,
                   args.names,
                   args.outTiff,
                   "venn.R",
                   additional_args=None,
                   run=True)
    finally:
        # scratch files are no longer needed once the plot is done (or failed)
        for f in todie:
            runShellCommand("rm -f %s" % f)
Exemple #22
0
    cmd += " --iter %d" % iter
    cmd += " --segment %s" % trainSegPath
    runShellCommand(cmd)

# eval ############
# Output BED of the evaluation step; the fit step below reads it whether or
# not the evaluation is re-run here.
evalPath = "eval.bed"
# startPoint is defined earlier in the script (not visible here); presumably
# it selects the stage to resume from -- TODO confirm
if startPoint <=3:
    # NOTE(review): the value substituted after "--segment" is logOpts --
    # confirm that is the intended pairing of flag and value
    cmd = "teHmmEval.py %s %s %s --bed %s --segment %s" % (trainTracksPath, modelPath, evalSegPath, evalPath, logOpts)
    runShellCommand(cmd)
    
# fit ############
# Outputs of the two fitStateNames.py runs and the label file they share.
fitPath = "fit.bed"
fitFdrPath = "fitFdr.bed"
labelPath = "label.bed"
if startPoint <=4:
    # merge + sort evalSegPath into a scratch file, then intersect modelerPath
    # with it to produce labelPath
    tempPath = getLocalTempPath("Tempmask", ".bed")
    runShellCommand("mergeBed -i %s | sortBed > %s" % (evalSegPath, tempPath))
    runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (modelerPath, tempPath, labelPath))
    runShellCommand("rm -f %s" % tempPath)
    # same inputs, two flag sets (fitFlags / fitFlagsFdr), two output files
    fitCmd = "fitStateNames.py %s %s %s %s" % (labelPath, evalPath, fitPath, fitFlags)
    fitFdrCmd = "fitStateNames.py %s %s %s %s" % (labelPath, evalPath, fitFdrPath, fitFlagsFdr)
    # both commands are handed to runParallelShellCommands together with 2
    # (presumably the number of parallel jobs -- TODO confirm)
    runParallelShellCommands([fitCmd, fitFdrCmd], 2)
    
# compare ############
# Directory holding the per-truth BED files; created on first use.
compDir = "comp"
if not os.path.exists(compDir):
    runShellCommand("mkdir %s" % compDir)
def getTruthPath(idx):
    """ return the path <compDir>/<truthNames[idx]>.bed """
    return os.path.join(compDir, truthNames[idx] + ".bed")

fitPathMI = "fitMI.bed"
def main(argv=None):
    """ Command-line entry point: fill masked gaps of an HMM prediction BED.

    The mask tracks listed in tracksXML are concatenated and merged.  Every
    merged mask interval no longer than --maxLen gets a state: the state of
    the input intervals that exactly abut it (subject to --tgts /
    --oneSidedTgts), or --default.  These intervals are appended to inBed,
    the result is sorted and intersected with the merged allBed, and written
    to outBed.

    Options are parsed by argparse from the process command line; argv is
    defaulted to sys.argv but is not forwarded to the parser.

    Returns 0 when there is nothing to fill (no mask tracks or empty input;
    inBed is then copied to outBed unchanged) and None otherwise. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file.  State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps.  If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                         default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol = 4, sort = True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        # this early exit used to skip the bedtools cleanup done at the end
        cleanBedTool(tempBedToolPath)
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort = True)

    # inputIntervals is known to be non-empty here (checked above)
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    with open(tempOutMask, "w") as tempOutMaskFile:
        for maskInterval in maskedIntervals:
            if maskInterval[2] - maskInterval[1] > args.maxLen:
                continue
            # find candidate right flank: advance through the sorted input
            # until an interval that does not sort before the mask is found
            # (None once the input is exhausted)
            while rightFlank < maskInterval:
                if inputIdx == len(inputIntervals) - 1:
                    rightFlank = None
                    break
                else:
                    inputIdx += 1
                    rightFlank = inputIntervals[inputIdx]

            # candidate left flank
            leftFlank = None
            if inputIdx > 0:
                leftFlank = inputIntervals[inputIdx - 1]

            # identify flanking states if the intervals perfectly abut
            leftState = None
            if leftFlank is not None:
                if leftFlank[0] == maskInterval[0] and leftFlank[2] == maskInterval[1]:
                    leftState = str(leftFlank[3])
                else:
                    assert intersectSize(leftFlank, maskInterval) == 0
            rightState = None
            if rightFlank is not None:
                if rightFlank[0] == maskInterval[0] and rightFlank[1] == maskInterval[2]:
                    rightState = str(rightFlank[3])
                else:
                    assert intersectSize(rightFlank, maskInterval) == 0

            # choose a state for the mask interval
            maskState = str(args.default)
            if args.onlyDefault is True:
                pass
            elif leftState is not None and leftState == rightState:
                if len(tgtSet) == 0 or leftState in tgtSet:
                    maskState = leftState
            elif leftState in oneSidedTgtSet:
                maskState = leftState
            elif rightState in oneSidedTgtSet:
                maskState = rightState

            # write our mask interval
            tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (maskInterval[0], maskInterval[1],
                                                        maskInterval[2], maskState))

    # input + filled mask intervals -> sorted -> clipped to the merged scope
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2, tempScopePath,
                                                       args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask, tempMergePath1,
                                      tempMergePath2, tempScopePath]))
    cleanBedTool(tempBedToolPath)
Exemple #24
0
def runPositionalComparison(argv, args):
    """ hack to recursively execute compareBedStates.py on a sliding window of the two
    inputs and report accuracy in a BED file

    argv is the command line of the calling script: every token from index 3
    on, except "--window" and its value, is forwarded to the nested
    compareBedStates.py calls.  args.window must have the form
    "windowSize,stateName,compType,score,outBed" where compType is one of
    base / interval / weighted and score is one of f1 / precision / recall.
    args.bed1 and args.bed2 are the two BED files being compared.  One line
    (chrom, start, end, score) is written to outBed for each window.

    Raises RuntimeError if --window is malformed or outBed cannot be opened. """
    try:
        windowToks = args.window.split(",")
        if len(windowToks) != 5:
            raise ValueError("expected 5 comma-separated fields")
        windowSize = int(windowToks[0])
        stateName, compType, score, outBed = windowToks[1:]
    except (AttributeError, TypeError, ValueError):
        raise RuntimeError("value passed to --window is not in valid format")

    # index of the wanted stats table in extractCompStatsFromFile()'s result
    compIndices = {"base": 0, "interval": 1, "weighted": 2}
    if compType not in compIndices:
        raise RuntimeError("invalid compType, %s, passed to --window" % compType)
    compIdx = compIndices[compType]
    if score not in ("f1", "precision", "recall"):
        raise RuntimeError("invalid score, %s, passed to --window" % score)
    try:
        outFile = open(outBed, "w")
    except (IOError, OSError):
        raise RuntimeError("invalid outBed, %s, passed to --window" % outBed)

    # closes outFile on every exit path
    with outFile:
        tempPaths = []

        def newTempBed(prefix):
            # allocate a scratch path and remember it for cleanup
            path = getLocalTempPath(prefix, ".bed")
            tempPaths.append(path)
            return path

        try:
            tempBed = newTempBed("Temp_region")
            runShellCommand("mergeBed -i %s > %s" % (args.bed1, tempBed))
            chunkBed = newTempBed("Temp_chunkBed")
            runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" % (
                tempBed, windowSize, chunkBed))
            window = newTempBed("Temp_window")
            slice1 = newTempBed("Temp_slice1")
            slice2 = newTempBed("Temp_slice2")
            compFile = newTempBed("Temp_compFile")

            # rebuild the option string without --window so that the nested
            # calls do not recurse into this function again
            compOpts = ""
            winIdx = argv.index("--window")
            assert winIdx > 0 and winIdx < len(argv) -1 and argv[winIdx + 1] == args.window
            for i in xrange(3, len(argv)):
                if i != winIdx and i != winIdx + 1:
                    compOpts += " " + argv[i]

            for chunk in readBedIntervals(chunkBed):
                runShellCommand("echo \"%s\t%d\t%d\" > %s" % (chunk[0], chunk[1], chunk[2],
                                                           window))
                runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
                    args.bed1, window, slice1))
                runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
                    args.bed2, window, slice2))
                runShellCommand("compareBedStates.py %s %s %s > %s" % (
                    slice1, slice2, compOpts, compFile))
                stats = extractCompStatsFromFile(compFile)[compIdx]
                # a state absent from this window scores 0
                if stateName not in stats:
                    stats[stateName] = (0,0)
                f1 = 0.
                prec, rec = stats[stateName]
                if prec + rec > 0:
                    f1 = (2. * prec * rec) / (prec + rec)
                val = f1
                if score == "precision":
                    val = prec
                elif score == "recall":
                    val = rec
                outFile.write("%s\t%d\t%d\t%f\n" % (chunk[0], chunk[1], chunk[2], val))
        finally:
            # remove whatever scratch files were allocated, even on failure
            if tempPaths:
                runShellCommand("rm -f %s" % " ".join(tempPaths))
Exemple #25
0
def main(argv=None):
    """ Command-line entry point: strip ltr_finder ids from the name column.

    Unless --keepOl is given, overlaps are first removed from inBed with
    removeOverlaps() into a scratch file.  The ids are then stripped with sed
    into outBed.  With --all, the _sym, _tsd_as_gap, _sym_tsd_as_gap,
    _tsd_as_ltr, _sym_tsd_as_ltr and _single variants are derived from it
    next to outBed. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        description="Remove ltr_finder ids from 4th column",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("inBed", help="bed with ltr results to process")
    parser.add_argument(
        "outBed",
        help="bed to write output to.  Will also "
        "write outBed_sym.bed outBed_tsd_as_gap.bed etc.")
    parser.add_argument(
        "--keepOl", default=False, action="store_true",
        help="by default, if LTR elements "
        "overlap, the one with the highest score (length "
        "in event of tie) is kept. This option disables"
        " this logic.")
    parser.add_argument(
        "--all", default=False, action="store_true",
        help="write _sym, _tsd_as_gap, etc. versions"
        " of output")
    parser.add_argument(
        "--weak", default=-1, type=float,
        help="score threshold such that any elemetns"
        " with a score lower or equal to will be assigned the"
        " prefix WEAK_ to their names.")
    parser.add_argument(
        "--weakIgnore", default=None,
        help="dont apply --weak to state names"
        " that contain given keywords (defined as comma-separated"
        " list")

    args = parser.parse_args()
    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)
    baseOut, ext = os.path.splitext(args.outBed)
    # normalise --weakIgnore into a list of keywords
    if args.weakIgnore is None:
        args.weakIgnore = []
    else:
        args.weakIgnore = args.weakIgnore.split(",")

    # pick the file the id-stripping reads from
    scratchPaths = []
    if args.keepOl:
        sourceBed = args.inBed
    else:
        sourceBed = getLocalTempPath("Temp", ".bed")
        removeOverlaps(args.inBed, sourceBed, args)
        scratchPaths.append(sourceBed)

    os.system('sed -e "s/|LTR_TE|[0-9]*//g" -e "s/|-//g" %s > %s' % (
        sourceBed, args.outBed))

    if args.all:
        def variant(tag):
            # path of a derived output that sits next to outBed
            return baseOut + tag + ext

        symBed = variant("_sym")
        symLtrBed = variant("_sym_tsd_as_ltr")
        # (command template, input, output), run in this order because the
        # later steps read the files written by the earlier ones
        steps = [
            ('sed -e "s/|left//g" -e "s/|right//g" %s > %s',
             args.outBed, symBed),
            ('grep -v TSD %s > %s', args.outBed, variant("_tsd_as_gap")),
            ('grep -v TSD %s > %s', symBed, variant("_sym_tsd_as_gap")),
            ('sed -e "s/TSD/LTR/g" %s > %s',
             args.outBed, variant("_tsd_as_ltr")),
            ('sed -e "s/TSD/LTR/g" %s > %s', symBed, symLtrBed),
            ('sed -e "s/LTR/inside/g" %s > %s',
             symLtrBed, variant("_single")),
        ]
        for template, src, dst in steps:
            os.system(template % (src, dst))

    for scratch in scratchPaths:
        runShellCommand("rm -f %s" % scratch)

    cleanBedTool(tempBedToolPath)
Exemple #26
0
def main(argv=None):
    """ Command-line entry point: combine non-numeric BED tracks into one file.

    The region BED is merged and written to a scratch file.  Each selected
    .bed track without shift / scale / logScale and without a gaussian
    distribution is passed to combineTrack(), which accumulates into a
    scratch output.  If no track was combined, every region interval is
    written with the --outside name.  The scratch output is moved to outBed. """
    parser = argparse.ArgumentParser(
        description="Combine a bunch of non-numeric BED tracks into"
        " single file using fitStateNames.py to try to keep names "
        "consistent.  Idea is to be used as baseline to compare"
        " hmm to (via base-by-base statistics, primarily, since"
        " this procedure could induce some fragmentation)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "tracksXML",
        help="Path of Tracks Info file "
        "containing paths to genome annotation tracks")
    parser.add_argument(
        "regionBed",
        help="BED file representing "
        "target region (best if whole genome)")
    parser.add_argument("outBed", help="Output bed")
    parser.add_argument(
        "--tracks", default=None,
        help="Comma-separated list of "
        "track names to use.  All tracks will be"
        " used by default")
    parser.add_argument(
        "--outside", default="Outside",
        help="Name to give non-annotated"
        "regions")
    parser.add_argument(
        "--fitThresh", default=0.5, type=float,
        help="Min map percentage (0,1)"
        " in order to rename (see --qualThresh option"
        "of fitStateNames.py")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    inputTrackList = TrackList(args.tracksXML)

    # get regionBed where all intervals are merged when possible
    regionIntervals = getMergedBedIntervals(args.regionBed, sort=True)
    tempRegionPath = getLocalTempPath("Temp", "_reg.bed")
    with open(tempRegionPath, "w") as regionFile:
        for interval in regionIntervals:
            regionFile.write("\t".join(str(field) for field in interval) + "\n")

    # accumulate tracks in temp file
    tempOutPath = getLocalTempPath("Temp", "_out.bed")

    numCombined = 0
    for track in inputTrackList:
        looksNumeric = (
            track.shift is not None or
            track.scale is not None or
            track.logScale is not None or
            track.dist == "gaussian" or
            os.path.splitext(track.getPath())[1].lower() != ".bed")
        if looksNumeric:
            logger.warning("Skipping numeric track %s" % track.getName())
            continue
        # honour the --tracks whitelist when one was given
        if args.tracks is not None and \
           track.getName() not in args.tracks.split(","):
            continue
        combineTrack(track, tempOutPath, tempRegionPath, numCombined, args)
        numCombined += 1

    # nothing got written, make everything outside
    if numCombined == 0:
        with open(tempOutPath, "w") as outsideFile:
            for interval in regionIntervals:
                outsideFile.write("%s\t%s\t%s\t%s\n" % (
                    interval[0], interval[1], interval[2], args.outside))

    runShellCommand("mv %s %s" % (tempOutPath, args.outBed))
    runShellCommand("rm -f %s" % (tempRegionPath))

    cleanBedTool(tempBedToolPath)
Exemple #27
0
def runTsd(args, tempTracksInfo):
    """ generate the tsd track by running addTsdTrack.py once per input:
    the ltr_termini and tir tracks (after fillTermini.py) and the
    repeat_modeler track.  tempTracksInfo is replaced by the updated XML
    after every run.  Does nothing when args.noTsd is set. """
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)

    scratchFiles = []
    # (track name, input path) pairs in processing order
    tsdInputs = []

    # preprocess termini
    terminiTracks = [origTrackList.getTrackByName(args.ltr_termini),
                     origTrackList.getTrackByName(args.tir)]
    for terminiTrack in terminiTracks:
        if terminiTrack is None:
            logger.warning("Could not find termini track")
            continue
        srcPath = terminiTrack.getPath()
        filledPath = getLocalTempPath("Temp_fill", ".bed")
        convertedPath = None
        if srcPath[-3:] == ".bb":
            # bigBed input: convert to plain BED first
            convertedPath = getLocalTempPath("Temp_termini", ".bed")
            runShellCommand("bigBedToBed %s %s" % (srcPath, convertedPath))
            srcPath = convertedPath
        runShellCommand("fillTermini.py %s %s" % (srcPath, filledPath))
        tsdInputs.append((terminiTrack.getName(), filledPath))
        scratchFiles.append(filledPath)
        if convertedPath is not None:
            runShellCommand("rm -f %s" % convertedPath)

    # add repeat_modeler
    repeat_modelerTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if repeat_modelerTrack is not None:
        modelerPath = repeat_modelerTrack.getPath()
        modelerName = repeat_modelerTrack.getName()
        tsdInputs.append((modelerName, modelerPath))

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    for runIdx, (trackName, inputPath) in enumerate(tsdInputs):
        optParts = []
        if runIdx > 0:
            optParts.append(" --append")
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable elements
        # Wicker et. al 2007)
        if trackName == args.repeat_modeler:
            optParts.append(" --names LINE,SINE,Unknown"
                            " --maxScore 20"
                            " --left 20"
                            " --right 20"
                            " --min 5"
                            " --max 20"
                            " --overlap 20")
        elif trackName == args.ltr_termini:
            optParts.append(" --maxScore 3"
                            " --left 8"
                            " --right 8"
                            " --min 3"
                            " --max 6")
        elif trackName == args.tir:
            optParts.append(" --maxScore 3"
                            " --left 15"
                            " --right 15"
                            " --min 3"
                            " --max 12")
        optString = "".join(optParts)

        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        cmdFields = (tempTracksInfo, args.cleanTrackPath, tempXMLOut,
                     trackName, args.sequence, args.tsd, inputPath,
                     optString, args.logOpString, args.numProc)
        runShellCommand(
            "addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d"
            % cmdFields)

        # the freshly written XML becomes the input of the next run
        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    for scratch in scratchFiles:
        runShellCommand("rm %s" % scratch)
Exemple #28
0
def main(argv=None):
    """ Command-line entry point: build an HMM-usable tracklist from a raw one.

    Creates the cleanTrackPath directory if needed, then runs runCleaning(),
    runTsd() and runScaling() in that order on a temporary tracks XML, which
    is removed at the end.

    Options are parsed by argparse from the process command line; argv is
    defaulted to sys.argv but is not forwarded to the parser.

    Raises RuntimeError if cleanTrackPath is not a directory after the
    attempt to create it. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml.  Runs cleanRM.py cleanLtrFinder.py and "
        " cleanTermini.py and addTsdTrack.py and setTrackScaling.py (also runs "
        " removeBedOverlaps.py before each of the clean scripts)")
    
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--scaleTracks", help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.", default=None)
    parser.add_argument("--skipScale", help="Comma-separated list of tracks to "
                        "skip for scaling.", default=None)
    parser.add_argument("--ltr_termini", help="Name of termini track (appy tsd)",
                        default="ltr_termini")
    parser.add_argument("--repeat_modeler", help="Name of repeat_modeler track (appy tsd)",
                        default="repeat_modeler")
    parser.add_argument("--sequence", help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument("--tsd", help="Name of tsd track to generate (appy cleanTermini.py)",
                        default="tsd")
    parser.add_argument("--tir", help="Name of tir_termini track (appy cleanTermini.py)",
                        default="tir_termini")
    parser.add_argument("--noScale", help="Dont do any scaling", default=False,
                        action="store_true")
    parser.add_argument("--noTsd", help="Dont generate TSD track.  NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of ) chaux",
                        default=False, action="store_true")
    parser.add_argument("--numProc", help="Number of processes to use for tsdFinder.py",
                        default=1, type=int)
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    # logging flags forwarded to the scripts launched by the helpers
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    try:
        os.makedirs(args.cleanTrackPath)
    except OSError:
        # typically "already exists"; the isdir check below decides whether
        # this is actually a problem
        pass
    if not os.path.isdir(args.cleanTrackPath):
        raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                           args.cleanTrackPath)

    # NOTE(review): suffix is "xml" without a leading dot, unlike the other
    # getLocalTempPath calls -- confirm whether that is intended
    tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")
    runCleaning(args, tempTracksInfo)
    assert os.path.isfile(tempTracksInfo)

    runTsd(args, tempTracksInfo)
    
    runScaling(args, tempTracksInfo)

    runShellCommand("rm -f %s" % tempTracksInfo)

    cleanBedTool(tempBedToolPath)
Exemple #29
0
def main(argv=None):
    """ Command-line entry point: strip ltr_finder ids from the name column.

    Unless --keepOl is given, overlaps are first removed from inBed with
    removeOverlaps() into a scratch file.  The ids are then stripped with sed
    into outBed.  With --all, the _sym, _tsd_as_gap, _sym_tsd_as_gap,
    _tsd_as_ltr, _sym_tsd_as_ltr and _single variants are derived from it
    next to outBed. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        description="Remove ltr_finder ids from 4th column",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("inBed", help="bed with ltr results to process")
    parser.add_argument(
        "outBed",
        help="bed to write output to.  Will also "
        "write outBed_sym.bed outBed_tsd_as_gap.bed etc.")
    parser.add_argument(
        "--keepOl", default=False, action="store_true",
        help="by default, if LTR elements "
        "overlap, the one with the highest score (length "
        "in event of tie) is kept. This option disables"
        " this logic.")
    parser.add_argument(
        "--all", default=False, action="store_true",
        help="write _sym, _tsd_as_gap, etc. versions"
        " of output")
    parser.add_argument(
        "--weak", default=-1, type=float,
        help="score threshold such that any elemetns"
        " with a score lower or equal to will be assigned the"
        " prefix WEAK_ to their names.")
    parser.add_argument(
        "--weakIgnore", default=None,
        help="dont apply --weak to state names"
        " that contain given keywords (defined as comma-separated"
        " list")

    args = parser.parse_args()
    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)
    baseOut, ext = os.path.splitext(args.outBed)
    # normalise --weakIgnore into a list of keywords
    if args.weakIgnore is None:
        args.weakIgnore = []
    else:
        args.weakIgnore = args.weakIgnore.split(",")

    # pick the file the id-stripping reads from
    scratchPaths = []
    if args.keepOl:
        sourceBed = args.inBed
    else:
        sourceBed = getLocalTempPath("Temp", ".bed")
        removeOverlaps(args.inBed, sourceBed, args)
        scratchPaths.append(sourceBed)

    os.system('sed -e "s/|LTR_TE|[0-9]*//g" -e "s/|-//g" %s > %s' % (
        sourceBed, args.outBed))

    if args.all:
        def variant(tag):
            # path of a derived output that sits next to outBed
            return baseOut + tag + ext

        symBed = variant("_sym")
        symLtrBed = variant("_sym_tsd_as_ltr")
        # (command template, input, output), run in this order because the
        # later steps read the files written by the earlier ones
        steps = [
            ('sed -e "s/|left//g" -e "s/|right//g" %s > %s',
             args.outBed, symBed),
            ('grep -v TSD %s > %s', args.outBed, variant("_tsd_as_gap")),
            ('grep -v TSD %s > %s', symBed, variant("_sym_tsd_as_gap")),
            ('sed -e "s/TSD/LTR/g" %s > %s',
             args.outBed, variant("_tsd_as_ltr")),
            ('sed -e "s/TSD/LTR/g" %s > %s', symBed, symLtrBed),
            ('sed -e "s/LTR/inside/g" %s > %s',
             symLtrBed, variant("_single")),
        ]
        for template, src, dst in steps:
            os.system(template % (src, dst))

    for scratch in scratchPaths:
        runShellCommand("rm -f %s" % scratch)

    cleanBedTool(tempBedToolPath)
Exemple #30
0
def main(argv=None):
    """ Command-line entry point: write the input BED to stdout with all
    overlaps between consecutive intervals removed.

    Intervals are visited in sorted order.  Whenever an interval begins
    before the previously written interval (same chromosome) ends, its start
    is moved up to that end; if nothing remains of it, it is dropped.

    With --rm the input is first rebuilt so that intervals labeled TE by
    rm2State.sh take priority: the TE part is subtracted from the non-TE part
    and the two are concatenated into a temporary input file. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        description="Filter overlapping intervals out",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("inputBed", help="Bed file to filter")
    parser.add_argument(
        "--bed12",
        action="store_true",
        default=False,
        help="Use bed12 exons instead of start/end"
        " if present (equivalent to running bed12ToBed6 on"
        " input first).")
    parser.add_argument(
        "--rm",
        action='store_true',
        default=False,
        help="Make sure intervals that are labeled as TE "
        "by rm2State.sh script are never cut by ones that are not")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    assert os.path.isfile(args.inputBed)
    bedToolDir = initBedTool()

    # --rm pre-filter: split into TE / non-TE, remove from non-TE whatever
    # overlaps TE, then put the remainder back together with TE.
    srcBed = args.inputBed
    workBed = srcBed
    if args.rm is True:
        overlapCmd = "intersectBed -a %s -b %s | sortBed > %s"
        scratchBed = getLocalTempPath("Temp_", ".bed")
        teBed = getLocalTempPath("Temp_te_", ".bed")
        runShellCommand("rm2State.sh %s |grep TE | sortBed > %s" % (
            srcBed, scratchBed))
        runShellCommand(overlapCmd % (srcBed, scratchBed, teBed))
        nonTeBed = getLocalTempPath("Temp_other_", ".bed")
        # scratchBed is reused for the non-TE state file
        runShellCommand("rm2State.sh %s |grep -v TE | sortBed > %s" % (
            srcBed, scratchBed))
        runShellCommand(overlapCmd % (srcBed, scratchBed, nonTeBed))
        # only rebuild the input when both halves actually contain something
        if os.path.getsize(teBed) > 0 and os.path.getsize(nonTeBed) > 0:
            trimmedBed = getLocalTempPath("Temp_filter_", ".bed")
            runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
                nonTeBed, teBed, trimmedBed))
            workBed = getLocalTempPath("Temp_input_", ".bed")
            runShellCommand("cat %s %s | sortBed > %s" % (
                teBed, trimmedBed, workBed))
            runShellCommand("rm -f %s" % trimmedBed)
        runShellCommand("rm -f %s %s %s" % (teBed, nonTeBed, scratchBed))

    sortedIntervals = BedTool(workBed).sort()
    if args.bed12 is True:
        sortedIntervals = sortedIntervals.bed6()

    # every interval written out is also remembered here so the result can
    # be double checked afterwards
    written = []
    lastWritten = None

    for current in sortedIntervals:
        clashes = (lastWritten is not None and
                   current.chrom == lastWritten.chrom and
                   current.start < lastWritten.end)
        if clashes:
            logger.debug("Replace %d bases of \n%s with\n%s" % (
                lastWritten.end - current.start,
                str(current), str(lastWritten)))
            # clip the front of the current interval
            current.start = lastWritten.end

        if current.end <= current.start:
            # nothing left after clipping
            continue
        sys.stdout.write("%s" % str(current))
        written.append(current)
        lastWritten = current

    # sanity pass over neighbouring pairs of the written intervals
    for left, right in zip(written, written[1:]):
        if left.chrom == right.chrom:
            assert right.start >= left.end
    cleanBedTool(bedToolDir)
    if args.inputBed != workBed:
        runShellCommand("rm -f %s" % workBed)
Exemple #31
0
def main(argv=None):
    """ Command-line entry point: fill masked gaps of a prediction BED.

    The mask tracks listed in the tracks XML are concatenated and merged.
    Every merged mask interval no longer than --maxLen is given a state name:
    the state shared by both perfectly abutting flanking input intervals
    (optionally restricted by --tgts), else a one-sided flank state listed in
    --oneSidedTgts, else --default.  The labeled mask intervals are appended
    to the input, sorted, and intersected with the merged allBed scope to
    produce outBed.

    Returns 0 when there are no mask tracks or no input intervals (the input
    is then simply copied to outBed), and None otherwise. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML",
                        help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed",
                        help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument(
        "inBed",
        help="TE prediction BED file.  State labels"
        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed",
                        help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument(
        "--maxLen",
        help="Maximum length of a masked interval"
        " to fill (inclusive). Use --delMask option with same value"
        "if running compareBedStates.py after.",
        type=int,
        default=sys.maxint)
    parser.add_argument("--default",
                        help="Default label to give to masked "
                        "region if no label can be determined",
                        default="0")
    parser.add_argument(
        "--tgts",
        help="Only relabel gaps that "
        "are flanked on both sides by the same state, and this state"
        " is in this comma- separated list. --default used for other"
        " gaps.  If not targetst specified then all states checked.",
        default=None)
    parser.add_argument(
        "--oneSidedTgts",
        help="Only relabel gaps that "
        "are flanked on at least one side by a state in this comma-"
        "separated list --default used for other gaps",
        default=None)
    parser.add_argument(
        "--onlyDefault",
        help="Add the default state (--default) no"
        " no all masked gaps no matter what. ie ignoring all other "
        "logic",
        action="store_true",
        default=False)
    parser.add_argument(
        "--cut",
        help="Cut out gaps for masked tracks from the input."
        " By default, the input is expected to come from the HMM "
        "with mask intervals already absent, and will crash on with"
        " an assertion error if an overlap is detected.",
        action="store_true",
        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets (the two target lists must not share any state)
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed (optionally from a temporary, mask-cut copy)
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        # nothing to fill: output is a plain copy of the input
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        cleanBedTool(tempBedToolPath)
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand(
            "cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s"
            % (maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

    # inputIntervals is known to be non-empty here (checked above)
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    with open(tempOutMask, "w") as tempOutMaskFile:
        for maskInterval in maskedIntervals:
            if maskInterval[2] - maskInterval[1] > args.maxLen:
                continue
            # find candidate right flank: first input interval that does not
            # sort before the mask.  None means the input is exhausted (and
            # it stays exhausted for all following masks).
            while rightFlank is not None and rightFlank < maskInterval:
                if inputIdx == len(inputIntervals) - 1:
                    rightFlank = None
                else:
                    inputIdx += 1
                    rightFlank = inputIntervals[inputIdx]

            # candidate left flank: the input interval just before the right
            # flank.  When the input is exhausted, inputIdx still points at
            # the last interval, which is itself the left neighbour.
            if rightFlank is None:
                leftFlank = inputIntervals[inputIdx]
            elif inputIdx > 0:
                leftFlank = inputIntervals[inputIdx - 1]
            else:
                leftFlank = None

            # identify flanking states if the intervals perfectly abut;
            # a flank that does not abut must not overlap the mask at all
            leftState = None
            if leftFlank is not None:
                if leftFlank[0] == maskInterval[0] and leftFlank[
                        2] == maskInterval[1]:
                    leftState = str(leftFlank[3])
                else:
                    assert intersectSize(leftFlank, maskInterval) == 0
            rightState = None
            if rightFlank is not None:
                if rightFlank[0] == maskInterval[0] and rightFlank[
                        1] == maskInterval[2]:
                    rightState = str(rightFlank[3])
                else:
                    assert intersectSize(rightFlank, maskInterval) == 0

            # choose a state for the mask interval
            maskState = str(args.default)
            if args.onlyDefault is True:
                pass
            elif leftState is not None and leftState == rightState:
                if len(tgtSet) == 0 or leftState in tgtSet:
                    maskState = leftState
            elif leftState in oneSidedTgtSet:
                maskState = leftState
            elif rightState in oneSidedTgtSet:
                maskState = rightState

            # write our mask interval
            tempOutMaskFile.write(
                "%s\t%d\t%d\t%s\n" %
                (maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    # NOTE(review): the merge below starts from args.inBed even when --cut
    # was given (the cut copy has already been deleted) -- confirm that the
    # uncut input is really what should be merged with the filled masks.
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" %
                    (args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    # restrict the merged result to the (merged) target scope
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" %
                    (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" %
                    (tempMergePath2, tempScopePath, args.outBed))

    runShellCommand("rm -f %s" % " ".join([
        tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2, tempScopePath
    ]))
    cleanBedTool(tempBedToolPath)
Exemple #32
0
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursively launch eval. merge
    results

    For every interval in args.chroms a copy of the command line (argv) is
    made with the --chrom option blanked out, token 2 replaced by a temporary
    BED holding the part of args.allBed inside that interval, and token 3
    replaced by a temporary output BED.  The --co value (and --stats path,
    when args.stats is set) are rewritten per job as well.  Jobs are run with
    runParallelShellCommands using args.proc, and their outputs are
    concatenated, in order, into args.outBed (and args.stats).

    Intervals whose region file is (nearly) empty are skipped. """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    offset = args.co
    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        # blank out "--chrom <value>" so the child does not dispatch again
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""

        chromPath = getLocalTempPath("TempChromPath", ".bed")
        with open(chromPath, "w") as cpFile:
            cpFile.write("%s\t%d\t%d\t0\t0\t.\n" %
                         (chrom[0], chrom[1], chrom[2]))

        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.allBed, chromPath, regionPath))

        if os.path.getsize(regionPath) < 2:
            # nothing to process for this interval.  these two temp files are
            # not tracked in the cleanup lists, so remove them right away
            runShellCommand("rm -f %s %s" % (chromPath, regionPath))
            continue

        # the offset handed to a job is args.co plus the length of every
        # non-skipped interval so far, including the current one
        offset += int(chrom[2]) - int(chrom[1])

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        cmdToks[2] = regionPath

        segPath = getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)

        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co") + 1] = str(offset)
        else:
            cmdToks.append("--co")
            cmdToks.append(str(offset))

        if args.stats is not None:
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats") + 1] = statsPath
            statsFiles.append(statsPath)
        jobList.append(" ".join(cmdToks))

    runParallelShellCommands(jobList, args.proc)

    # merge per-job outputs: first job truncates, the others append
    for i, segPath in enumerate(segFiles):
        ct = ">" if i == 0 else ">>"
        runShellCommand("cat %s %s %s" % (segPath, ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    for path in itertools.chain(chromFiles, regionFiles, segFiles, statsFiles):
        runShellCommand("rm %s" % path)
Exemple #33
0
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursively launch eval. merge
    results

    For every interval in args.chroms a copy of the command line (argv) is
    made with the --chrom option blanked out and token 3 replaced by a
    temporary BED holding the part of args.bedRegions inside that interval.
    Each of the --bed, --pd, --ed and --bic outputs that is set in args is
    redirected to a per-job temporary file.  Jobs are run with
    runParallelShellCommands using args.proc, and the per-job files are then
    concatenated, in order, into the requested output paths.

    Intervals whose region file is (nearly) empty are skipped. """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    bedFiles = []
    pdFiles = []
    bicFiles = []
    edFiles = []
    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        # blank out "--chrom <value>" so the child does not dispatch again
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""

        chromPath = getLocalTempPath("Temp", ".bed")
        with open(chromPath, "w") as cpFile:
            cpFile.write("%s\t%d\t%d\t0\t0\t.\n" %
                         (chrom[0], chrom[1], chrom[2]))

        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.bedRegions, chromPath, regionPath))

        if os.path.getsize(regionPath) < 2:
            # nothing to process for this interval.  these two temp files are
            # not tracked in the cleanup lists, so remove them right away
            runShellCommand("rm -f %s %s" % (chromPath, regionPath))
            continue

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        cmdToks[3] = regionPath

        # redirect each requested output to its own per-job temp file
        if args.bed is not None:
            bedPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--bed") + 1] = bedPath
            bedFiles.append(bedPath)
        if args.pd is not None:
            pdPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--pd") + 1] = pdPath
            pdFiles.append(pdPath)
        if args.ed is not None:
            edPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--ed") + 1] = edPath
            edFiles.append(edPath)
        if args.bic is not None:
            bicPath = getLocalTempPath("Temp", ".bic")
            cmdToks[cmdToks.index("--bic") + 1] = bicPath
            bicFiles.append(bicPath)
        jobList.append(" ".join(cmdToks))

    runParallelShellCommands(jobList, args.proc)

    # merge per-job outputs: first job truncates, the others append.  a
    # per-job list is empty when the corresponding option was not requested
    mergeTargets = [(bedFiles, args.bed), (pdFiles, args.pd),
                    (edFiles, args.ed), (bicFiles, args.bic)]
    for i, _ in enumerate(jobList):
        ct = ">" if i == 0 else ">>"
        for jobPaths, finalPath in mergeTargets:
            if len(jobPaths) > 0:
                runShellCommand("cat %s %s %s" % (jobPaths[i], ct, finalPath))

    for path in itertools.chain(chromFiles, regionFiles, bedFiles, pdFiles,
                                edFiles, bicFiles):
        runShellCommand("rm %s" % path)
Exemple #34
0
def runPositionalComparison(argv, args):
    """ hack to recursively execute compareBedStates.py on a sliding window
    of the two inputs and report accuracy in a BED file

    args.window must be a comma-separated string with exactly five fields:
    windowSize (int), stateName, compType (base, interval or weighted),
    score (f1, precision or recall) and outBed (path of the BED to write).

    The merged regions of args.bed1 are chunked by chunkBedRegions.py with a
    .5 overlap.  For each chunk both inputs are sliced to the chunk,
    compareBedStates.py is run on the slices with the remaining command-line
    options (argv[3:] minus --window and its value), and one line
    "chrom start end value" is written to outBed.

    Raises RuntimeError when args.window is malformed or outBed cannot be
    opened for writing. """
    try:
        windowToks = args.window.split(",")
        # explicit check instead of assert so it survives python -O
        if len(windowToks) != 5:
            raise ValueError("expected 5 comma-separated fields")
        windowSize = int(windowToks[0])
    except (AttributeError, ValueError):
        raise RuntimeError("value passed to --window is not in valid format")
    stateName, compType, score, outBed = windowToks[1:]

    # position of the requested comparison type in the stats returned by
    # extractCompStatsFromFile
    compIndices = {"base": 0, "interval": 1, "weighted": 2}
    if compType not in compIndices:
        raise RuntimeError("invalid compType, %s, passed to --window" %
                           compType)
    compIdx = compIndices[compType]
    if score not in ("f1", "precision", "recall"):
        raise RuntimeError("invalid score, %s, passed to --window" % score)
    try:
        outFile = open(outBed, "w")
    except (IOError, OSError):
        raise RuntimeError("invalid outBed, %s, passed to --window" % outBed)

    tempPaths = []

    def newTemp(prefix):
        # allocate a temp BED path and remember it for cleanup
        path = getLocalTempPath(prefix, ".bed")
        tempPaths.append(path)
        return path

    try:
        tempBed = newTemp("Temp_region")
        runShellCommand("mergeBed -i %s > %s" % (args.bed1, tempBed))
        chunkBed = newTemp("Temp_chunkBed")
        runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" %
                        (tempBed, windowSize, chunkBed))
        window = newTemp("Temp_window")
        slice1 = newTemp("Temp_slice1")
        slice2 = newTemp("Temp_slice2")
        compFile = newTemp("Temp_compFile")

        # forward every option except --window (and its value) to the
        # per-window comparison
        winIdx = argv.index("--window")
        assert winIdx > 0 and winIdx < len(argv) - 1 and argv[winIdx +
                                                              1] == args.window
        compOpts = "".join(" " + tok for i, tok in enumerate(argv)
                           if i >= 3 and i != winIdx and i != winIdx + 1)

        for chunk in readBedIntervals(chunkBed):
            runShellCommand("echo \"%s\t%d\t%d\" > %s" %
                            (chunk[0], chunk[1], chunk[2], window))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                            (args.bed1, window, slice1))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                            (args.bed2, window, slice2))
            runShellCommand("compareBedStates.py %s %s %s > %s" %
                            (slice1, slice2, compOpts, compFile))
            stats = extractCompStatsFromFile(compFile)[compIdx]
            # a state absent from this window scores zero
            if stateName not in stats:
                stats[stateName] = (0, 0)
            prec, rec = stats[stateName]
            f1 = 0.
            if prec + rec > 0:
                f1 = (2. * prec * rec) / (prec + rec)
            if score == "precision":
                val = prec
            elif score == "recall":
                val = rec
            else:
                val = f1
            outFile.write("%s\t%d\t%d\t%f\n" %
                          (chunk[0], chunk[1], chunk[2], val))
    finally:
        # always release the output handle and the temp files, even when one
        # of the shell steps failed
        outFile.close()
        if tempPaths:
            runShellCommand("rm -f %s" % " ".join(tempPaths))
Exemple #35
0
def runTsd(args, tempTracksInfo):
    """ Build the tsd track by calling addTsdTrack.py once per input track.

    Inputs are the args.ltr_termini and args.tir tracks of the original
    track list (each passed through fillTermini.py first) followed by the
    args.repeat_modeler track of the temporary track list.  After every
    call the XML written by addTsdTrack.py replaces tempTracksInfo.

    Does nothing when args.noTsd is True. """
    if args.noTsd is True:
        return

    sourceTracks = TrackList(args.tracksInfo)
    workingTracks = TrackList(tempTracksInfo)

    scratchFiles = []
    # (input file, track name) pairs, in the order they will be processed
    tsdJobs = []

    # preprocess termini
    terminiCandidates = [sourceTracks.getTrackByName(trackName)
                         for trackName in (args.ltr_termini, args.tir)]
    for candidate in terminiCandidates:
        if candidate is None:
            logger.warning("Could not find termini track")
            continue
        srcPath = candidate.getPath()
        filledPath = getLocalTempPath("Temp_fill", ".bed")
        convertedPath = None
        if srcPath.endswith(".bb"):
            # bigBed input has to be turned into plain BED first
            convertedPath = getLocalTempPath("Temp_termini", ".bed")
            runShellCommand("bigBedToBed %s %s" % (srcPath, convertedPath))
            srcPath = convertedPath
        runShellCommand("fillTermini.py %s %s" % (srcPath, filledPath))
        tsdJobs.append((filledPath, candidate.getName()))
        scratchFiles.append(filledPath)
        if convertedPath is not None:
            runShellCommand("rm -f %s" % convertedPath)

    # add repeat_modeler
    modelerTrack = workingTracks.getTrackByName(args.repeat_modeler)
    if modelerTrack is not None:
        tsdJobs.append((modelerTrack.getPath(), modelerTrack.getName()))

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    for jobIdx, (inputPath, trackName) in enumerate(tsdJobs):
        optParts = []
        if jobIdx > 0:
            optParts.append(" --append")
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable elements
        # Wicker et. al 2007)
        if trackName == args.repeat_modeler:
            optParts.extend([" --names LINE,SINE,Unknown",
                             " --maxScore 20",
                             " --left 20",
                             " --right 20",
                             " --min 5",
                             " --max 20",
                             " --overlap 20"])
        elif trackName == args.ltr_termini:
            optParts.extend([" --maxScore 3",
                             " --left 8",
                             " --right 8",
                             " --min 3",
                             " --max 6"])
        elif trackName == args.tir:
            optParts.extend([" --maxScore 3",
                             " --left 15",
                             " --right 15",
                             " --min 3",
                             " --max 12"])
        optString = "".join(optParts)

        xmlOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        runShellCommand(
            "addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d" %
            (tempTracksInfo, args.cleanTrackPath, xmlOut,
             trackName, args.sequence, args.tsd, inputPath,
             optString, args.logOpString, args.numProc))

        # the freshly written XML becomes the working track list
        runShellCommand("mv %s %s" % (xmlOut, tempTracksInfo))

    for scratchPath in scratchFiles:
        runShellCommand("rm %s" % scratchPath)
Exemple #36
0
    cmd += " --segment %s" % trainSegPath
    runShellCommand(cmd)

# eval ############
# Runs teHmmEval.py and writes its BED output to eval.bed.  Skipped when
# startPoint is greater than 3.
# NOTE(review): logOpts is substituted directly after "--segment";
# presumably --segment is a bare flag for teHmmEval.py -- confirm.
evalPath = "eval.bed"
if startPoint <= 3:
    cmd = "teHmmEval.py %s %s %s --bed %s --segment %s" % (
        trainTracksPath, modelPath, evalSegPath, evalPath, logOpts)
    runShellCommand(cmd)

# fit ############
# Builds label.bed by restricting modelerPath to the merged evalSegPath
# regions, then runs fitStateNames.py twice in parallel on eval.bed: once
# with fitFlags (-> fit.bed) and once with fitFlagsFdr (-> fitFdr.bed).
# Skipped when startPoint is greater than 4.
fitPath = "fit.bed"
fitFdrPath = "fitFdr.bed"
labelPath = "label.bed"
if startPoint <= 4:
    # temporary merged copy of the evaluation segments, removed right after
    # the intersection
    tempPath = getLocalTempPath("Tempmask", ".bed")
    runShellCommand("mergeBed -i %s | sortBed > %s" % (evalSegPath, tempPath))
    runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                    (modelerPath, tempPath, labelPath))
    runShellCommand("rm -f %s" % tempPath)
    fitCmd = "fitStateNames.py %s %s %s %s" % (labelPath, evalPath, fitPath,
                                               fitFlags)
    fitFdrCmd = "fitStateNames.py %s %s %s %s" % (labelPath, evalPath,
                                                  fitFdrPath, fitFlagsFdr)
    # second argument is passed as 2, matching the two commands
    runParallelShellCommands([fitCmd, fitFdrCmd], 2)

# compare ############
# Make sure the comparison output directory exists.
compDir = "comp"
if not os.path.exists(compDir):
    runShellCommand("mkdir %s" % compDir)
Exemple #37
0
def main(argv=None):
    """ Command-line entry point: simplify the name column of a BED file.

    Intervals are read in sorted order.  Intervals whose score can be parsed
    and lies outside [--minScore, --maxScore] are dropped.  Names matching a
    --mapPrefix prefix (as decided by findPrefix) are replaced by that
    prefix; all other names are cut at the first "|", "?", "_" (unless
    --keepUnderscore), "/" (unless --keepSlash) and at the first digit
    (unless --leaveNumbers).  Unless --overlap is given, the result is passed
    through removeBedOverlaps.py --rm.  The final BED is written to stdout.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Cut names off at first |, /, ?, or _")
    parser.add_argument("inBed", help="bed with chaux results to process")
    parser.add_argument("--keepSlash", help="dont strip anything after slash "
                        "ex: DNA/HELITRONY1C -> DNA", action="store_true",
                        default=False)
    parser.add_argument("--keepUnderscore", help="dont strip anything after _ ",
                        action="store_true", default=False)
    # nargs="?" lets the option be used as a plain switch while still
    # accepting the old "--leaveNumbers VALUE" form
    parser.add_argument("--leaveNumbers", help="by default, numbers as the end"
                        " of names are trimmed off.  ex: DNA/HELITRONY1C -> "
                        " DNA/HELITRONY. This option disables this behaviour",
                        nargs="?", const=True, default=False)
    parser.add_argument("--mapPrefix", help="Rename all strings with given "
                        "prefix to just the prefix. ex: --mapPrefix DNA/HELI"
                        " would cause any instance of DNA/HELITRONY1C or "
                        "HELITRON2 to be mapped to just DNA/HELI.  This option"
                        " overrides --keepSlash and --leaveNumbers for the"
                        " elements to which it applies.  This option can be"
                        " specified more than once. ex --mapPrefix DNA/HELI "
                        "--mapPrefix DNA/ASINE.", action="append")
    parser.add_argument("--minScore", help="Minimum score value to not filter"
                        " out", default=-sys.maxint, type=float)
    parser.add_argument("--maxScore", help="Maximum score value to not filter"
                        " out", default=sys.maxint, type=float)
    parser.add_argument("--overlap", help="Dont run removeBedOverlaps.py",
                        action="store_true", default=False)

    args = parser.parse_args()
    assert os.path.exists(args.inBed)
    assert args.minScore <= args.maxScore
    tempBedToolPath = initBedTool()

    tempPath = getLocalTempPath("Temp_cleanOut", ".bed")
    tempPath2 = getLocalTempPath("Temp2_", ".bed")

    with open(tempPath, "w") as tempFile:
        for interval in BedTool(args.inBed).sort():
            # filter score if exists
            try:
                if interval.score is not None and\
                    (float(interval.score) < args.minScore or
                     float(interval.score) > args.maxScore):
                    continue
            except Exception:
                # best effort: a missing or non-numeric score means the
                # interval is kept (but Ctrl-C / sys.exit still propagate)
                pass
            prefix = findPrefix(interval.name, args.mapPrefix)
            if prefix is not None:
                # prefix was specified with --mapPrefix, that's what we use
                interval.name = prefix
            else:
                # otherwise, strip after |
                if "|" in interval.name:
                    interval.name = interval.name[:interval.name.find("|")]
                # strip after ?
                if "?" in interval.name:
                    interval.name = interval.name[:interval.name.find("?")]
                # strip after _ unless told not to
                if "_" in interval.name and args.keepUnderscore is False:
                    interval.name = interval.name[:interval.name.find("_")]
                # strip after "/" unless told not to
                if "/" in interval.name and args.keepSlash is False:
                    interval.name = interval.name[:interval.name.find("/")]
                # cut the name at its first digit (dropping the digit and
                # everything after it) unless told not to
                if args.leaveNumbers is False:
                    m = re.search(r"\d", interval.name)
                    if m is not None:
                        interval.name = interval.name[:m.start()]

            tempFile.write(str(interval))

    if not args.overlap:
        runShellCommand("removeBedOverlaps.py %s --rm > %s" % (tempPath,
                                                               tempPath2))
        # from here on tempPath names the overlap-free file
        tempPath, tempPath2 = tempPath2, tempPath

    with open(tempPath, "r") as tempFile:
        for line in tempFile:
            sys.stdout.write(line)

    runShellCommand("rm -f %s %s" % (tempPath, tempPath2))
    cleanBedTool(tempBedToolPath)
Exemple #38
0
def main(argv=None):
    """ Command-line entry point: simplify the name column of a BED file.

    Intervals are read in sorted order.  Intervals whose score can be parsed
    and lies outside [--minScore, --maxScore] are dropped.  Names matching a
    --mapPrefix prefix (as decided by findPrefix) are replaced by that
    prefix; all other names are cut at the first "|", "?", "_" (unless
    --keepUnderscore), "/" (unless --keepSlash) and at the first digit
    (unless --leaveNumbers).  Unless --overlap is given, the result is passed
    through removeBedOverlaps.py --rm.  The final BED is written to stdout.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Cut names off at first |, /, ?, or _")
    parser.add_argument("inBed", help="bed with chaux results to process")
    parser.add_argument("--keepSlash",
                        help="dont strip anything after slash "
                        "ex: DNA/HELITRONY1C -> DNA",
                        action="store_true",
                        default=False)
    parser.add_argument("--keepUnderscore",
                        help="dont strip anything after _ ",
                        action="store_true",
                        default=False)
    # nargs="?" lets the option be used as a plain switch while still
    # accepting the old "--leaveNumbers VALUE" form
    parser.add_argument("--leaveNumbers",
                        help="by default, numbers as the end"
                        " of names are trimmed off.  ex: DNA/HELITRONY1C -> "
                        " DNA/HELITRONY. This option disables this behaviour",
                        nargs="?",
                        const=True,
                        default=False)
    parser.add_argument("--mapPrefix",
                        help="Rename all strings with given "
                        "prefix to just the prefix. ex: --mapPrefix DNA/HELI"
                        " would cause any instance of DNA/HELITRONY1C or "
                        "HELITRON2 to be mapped to just DNA/HELI.  This option"
                        " overrides --keepSlash and --leaveNumbers for the"
                        " elements to which it applies.  This option can be"
                        " specified more than once. ex --mapPrefix DNA/HELI "
                        "--mapPrefix DNA/ASINE.",
                        action="append")
    parser.add_argument("--minScore",
                        help="Minimum score value to not filter"
                        " out",
                        default=-sys.maxint,
                        type=float)
    parser.add_argument("--maxScore",
                        help="Maximum score value to not filter"
                        " out",
                        default=sys.maxint,
                        type=float)
    parser.add_argument("--overlap",
                        help="Dont run removeBedOverlaps.py",
                        action="store_true",
                        default=False)

    args = parser.parse_args()
    assert os.path.exists(args.inBed)
    assert args.minScore <= args.maxScore
    tempBedToolPath = initBedTool()

    tempPath = getLocalTempPath("Temp_cleanOut", ".bed")
    tempPath2 = getLocalTempPath("Temp2_", ".bed")

    with open(tempPath, "w") as tempFile:
        for interval in BedTool(args.inBed).sort():
            # filter score if exists
            try:
                if interval.score is not None and\
                    (float(interval.score) < args.minScore or
                     float(interval.score) > args.maxScore):
                    continue
            except Exception:
                # best effort: a missing or non-numeric score means the
                # interval is kept (but Ctrl-C / sys.exit still propagate)
                pass
            prefix = findPrefix(interval.name, args.mapPrefix)
            if prefix is not None:
                # prefix was specified with --mapPrefix, that's what we use
                interval.name = prefix
            else:
                # otherwise, strip after |
                if "|" in interval.name:
                    interval.name = interval.name[:interval.name.find("|")]
                # strip after ?
                if "?" in interval.name:
                    interval.name = interval.name[:interval.name.find("?")]
                # strip after _ unless told not to
                if "_" in interval.name and args.keepUnderscore is False:
                    interval.name = interval.name[:interval.name.find("_")]
                # strip after "/" unless told not to
                if "/" in interval.name and args.keepSlash is False:
                    interval.name = interval.name[:interval.name.find("/")]
                # cut the name at its first digit (dropping the digit and
                # everything after it) unless told not to
                if args.leaveNumbers is False:
                    m = re.search(r"\d", interval.name)
                    if m is not None:
                        interval.name = interval.name[:m.start()]

            tempFile.write(str(interval))

    if not args.overlap:
        runShellCommand("removeBedOverlaps.py %s --rm > %s" %
                        (tempPath, tempPath2))
        # from here on tempPath names the overlap-free file
        tempPath, tempPath2 = tempPath2, tempPath

    with open(tempPath, "r") as tempFile:
        for line in tempFile:
            sys.stdout.write(line)

    runShellCommand("rm -f %s %s" % (tempPath, tempPath2))
    cleanBedTool(tempBedToolPath)
Exemple #39
0
def main(argv=None):
    """Command-line entry point: turn a raw track list into a cleaned one.

    Parses the command line, makes sure the directory for cleaned tracks
    exists, then calls runCleaning, runTsd and runScaling in that order.
    Each stage receives the parsed arguments and the path of one shared
    temporary tracks-info file, which is removed when the stages finish
    (whether they succeed or raise).

    Args:
        argv: Full argument vector with the program name at index 0 (same
            layout as sys.argv).  When None, sys.argv is used.

    Raises:
        RuntimeError: if args.cleanTrackPath is not a directory after
            attempting to create it.
        AssertionError: if the temporary tracks-info file does not exist
            after runCleaning returns.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml.  Runs cleanRM.py cleanLtrFinder.py and "
        " cleanTermini.py and addTsdTrack.py and setTrackScaling.py (also runs "
        " removeBedOverlaps.py before each of the clean scripts)")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath",
                        help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo",
                        help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins",
                        help="Maximum number of bins after scaling",
                        default=10,
                        type=int)
    parser.add_argument("--scaleTracks",
                        help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.",
                        default=None)
    parser.add_argument("--skipScale",
                        help="Comma-separated list of tracks to "
                        "skip for scaling.",
                        default=None)
    parser.add_argument("--ltr_termini",
                        help="Name of termini track (appy tsd)",
                        default="ltr_termini")
    parser.add_argument("--repeat_modeler",
                        help="Name of repeat_modeler track (appy tsd)",
                        default="repeat_modeler")
    parser.add_argument("--sequence",
                        help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument(
        "--tsd",
        help="Name of tsd track to generate (appy cleanTermini.py)",
        default="tsd")
    parser.add_argument(
        "--tir",
        help="Name of tir_termini track (appy cleanTermini.py)",
        default="tir_termini")
    parser.add_argument("--noScale",
                        help="Dont do any scaling",
                        default=False,
                        action="store_true")
    parser.add_argument("--noTsd",
                        help="Dont generate TSD track.  NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of ) chaux",
                        default=False,
                        action="store_true")
    parser.add_argument("--numProc",
                        help="Number of processes to use for tsdFinder.py",
                        default=1,
                        type=int)

    addLoggingOptions(parser)
    # Honour the argv parameter (minus the program name).  With the default
    # argv this is exactly what a bare parse_args() would read.
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    try:
        # Logging flags stored on args for the pipeline stages to reuse.
        args.logOpString = "--logLevel %s" % getLogLevelString()
        if args.logFile is not None:
            args.logOpString += " --logFile %s" % args.logFile

        try:
            os.makedirs(args.cleanTrackPath)
        except OSError:
            # Usually "already exists"; the isdir check below is the real
            # gatekeeper and reports any genuine failure.
            pass
        if not os.path.isdir(args.cleanTrackPath):
            raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                               args.cleanTrackPath)

        tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")
        try:
            runCleaning(args, tempTracksInfo)
            assert os.path.isfile(tempTracksInfo)

            runTsd(args, tempTracksInfo)

            runScaling(args, tempTracksInfo)
        finally:
            # Remove the shared temp file even when a stage fails.
            runShellCommand("rm -f %s" % tempTracksInfo)
    finally:
        cleanBedTool(tempBedToolPath)
Exemple #40
0
def parallelDispatch(argv, args):
    """Run the command once per chromosome in parallel and merge the results.

    For every interval read from args.chroms, the portion of args.allBed
    that intersects it is written to a temporary BED file.  A copy of argv
    is then rewritten to operate on that file only, and the resulting
    command line is queued.  All queued commands are run with
    runParallelShellCommands, their outputs are concatenated into
    args.outBed (and args.stats when requested), and every temporary file
    is deleted.

    Args:
        argv: Command-line tokens of the current invocation.  Must contain
            "--chrom" followed by its value (and "--stats" followed by its
            value when args.stats is set).  Tokens 2 and 3 are overwritten
            with the per-chromosome input and output paths -- presumably
            the allBed and outBed positionals; verify against the parser.
        args: Parsed options.  Reads chroms, co, allBed, proc, outBed and
            stats.

    Raises:
        ValueError: if argv lacks a token that is looked up with
            list.index ("--chrom", or "--stats" when args.stats is set).
    """
    jobList = []
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    offset = args.co

    for chrom in readBedIntervals(args.chroms, sort=True):
        # Blank out "--chrom <value>" in the child's command line.
        cmdToks = copy.deepcopy(argv)
        chromIdx = cmdToks.index("--chrom")
        cmdToks[chromIdx + 1] = ""
        cmdToks[chromIdx] = ""

        # One-line BED describing just this chromosome interval.
        chromPath = getLocalTempPath("TempChromPath", ".bed")
        with open(chromPath, "w") as cpFile:
            cpFile.write("%s\t%d\t%d\t0\t0\t.\n" %
                         (chrom[0], chrom[1], chrom[2]))

        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.allBed, chromPath, regionPath))

        if os.path.getsize(regionPath) < 2:
            # Nothing in allBed overlaps this interval.  These two files are
            # never added to the cleanup lists, so remove them here instead
            # of leaking them.
            runShellCommand("rm -f %s %s" % (chromPath, regionPath))
            continue

        # The running offset is advanced by this interval's length *before*
        # being handed to the child through --co below.
        offset += int(chrom[2]) - int(chrom[1])

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        cmdToks[2] = regionPath

        segPath = getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)

        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co") + 1] = str(offset)
        else:
            cmdToks.extend(["--co", str(offset)])

        if args.stats is not None:
            # Each child writes its stats to a private file, merged later.
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats") + 1] = statsPath
            statsFiles.append(statsPath)

        jobList.append(" ".join(cmdToks))

    runParallelShellCommands(jobList, args.proc)

    # segFiles has exactly one entry per queued job (and so does statsFiles
    # when stats were requested).  The first chunk truncates the
    # destination, later chunks append.
    for i, segPath in enumerate(segFiles):
        ct = ">" if i == 0 else ">>"
        runShellCommand("cat %s %s %s" % (segPath, ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    for tempPath in itertools.chain(chromFiles, regionFiles, segFiles,
                                    statsFiles):
        runShellCommand("rm %s" % tempPath)