Example #1
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Filter out intervals of mask tracks from inBed with lengths
    outside given range. Idea is that it makes less sense to simply ignore,
    say, giant stretches of N's (like centromeres), as we would by masking
    them normally, than it does to remove them entirely, splitting the
    genome into multiple chunks.  Can also be used during comparison to
    get rid of all masked intervals """
    outPath = getLocalTempPath("Tempcut", ".bed")
    trackList = TrackList(tracksInfoPath)
    maskPaths = [t.getPath() for t in trackList.getMaskTracks()]
    if len(maskPaths) == 0:
        return None
    tempPath1 = getLocalTempPath("Tempcut1", ".bed")
    tempPath2 = getLocalTempPath("Tempcut2", ".bed")
    runShellCommand("cp %s %s" % (inBed, outPath))
    for maskPath in maskPaths:
        runShellCommand("cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" %
                        (maskPath, tempPath1))
    if os.path.getsize(tempPath1) > 0:
        runShellCommand("sortBed -i %s > %s ; mergeBed -i %s > %s" %
                        (tempPath1, tempPath2, tempPath2, tempPath1))
        runShellCommand("filterBedLengths.py %s %d %d > %s" %
                        (tempPath1, minLength + 1, maxLength - 1, tempPath2))
        runShellCommand("subtractBed -a %s -b %s | sortBed > %s" %
                        (outPath, tempPath2, tempPath1))
        runShellCommand("mv %s %s" % (tempPath1, outPath))
    runShellCommand("rm -f %s %s" % (tempPath1, tempPath2))
    if os.path.getsize(outPath) == 0:
        raise RuntimeError(
            "cutOutMaskIntervals removed everything.  Can't continue."
            " probably best to rerun calling script on bigger region?")
    return outPath
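A hypothetical usage sketch of the function above. It assumes the teHmm helpers it relies on (TrackList, getLocalTempPath, runShellCommand) are importable and that bedtools plus filterBedLengths.py are on the PATH; the file names are placeholders.

# hypothetical call; "regions.bed" and "tracks.xml" are placeholder paths
chunkedBed = cutOutMaskIntervals("regions.bed", 10, 100000, "tracks.xml")
if chunkedBed is None:
    # no mask tracks were defined in tracks.xml, so fall back to the input
    chunkedBed = "regions.bed"
# ... run downstream analysis on chunkedBed here ...
# the function returns a temporary file, so clean it up when finished
if chunkedBed != "regions.bed":
    runShellCommand("rm -f %s" % chunkedBed)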
Example #2
def filterCutTrack(genomePath, fragmentFilterLen, trackListPath, cutTrackName,
                   cutTrackLenFilter):
    """ return path of length filtered cut track"""
    tracks = TrackList(trackListPath)
    track = tracks.getTrackByName(cutTrackName)
    assert track is not None
    cutTrackOriginalPath = track.getPath()
    cutTrackPath = getOutPath(cutTrackOriginalPath, outDir,
                              "filter%d" % cutTrackLenFilter)
    runShellCommand("filterBedLengths.py %s %s > %s" %
                    (cutTrackOriginalPath, cutTrackLenFilter, cutTrackPath))
    tempPath1 = getLocalTempPath("Temp", ".bed")
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" %
                    (genomePath, cutTrackPath, tempPath1))
    tempPath2 = getLocalTempPath("Temp", ".bed")
    S = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(S) for x in range(200))
    runShellCommand(
        "filterBedLengths.py %s %d --rename %s |grep %s | sortBed> %s" %
        (tempPath1, fragmentFilterLen, tag, tag, tempPath2))
    runShellCommand(
        "cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . > %s"
        % (tempPath2, tempPath1))
    runShellCommand(
        "cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . >> %s"
        % (cutTrackPath, tempPath1))
    runShellCommand("sortBed -i %s > %s" % (tempPath1, tempPath2))
    runShellCommand("mergeBed -i %s > %s" % (tempPath2, cutTrackPath))
    runShellCommand("rm -f %s %s" % (tempPath1, tempPath2))
    return cutTrackPath
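The 200-character random tag above exists only so that the fragments renamed by filterBedLengths.py --rename can be pulled back out with a plain grep without matching anything else. A minimal self-contained sketch of that idiom (toy data, standard library only):

import random
import string

# a long random token is effectively guaranteed not to occur in real BED data,
# so lines carrying it can be recovered with a simple substring filter
alphabet = string.ascii_uppercase + string.digits
tag = ''.join(random.choice(alphabet) for _ in range(200))
lines = ["chr1\t0\t100\t%s" % tag, "chr1\t200\t300\tLTR"]
tagged = [line for line in lines if tag in line]
print(len(tagged))  # 1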
Example #3
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Filter out intervals of mask tracks from inBed with lengths
    outside given range. Idea is that it makes less sense to simply ignore,
    say, giant stretches of N's (like centromeres), as we would by masking
    them normally, than it does to remove them entirely, splitting the
    genome into multiple chunks.  Can also be used during comparison to
    get rid of all masked intervals """
    outPath = getLocalTempPath("Tempcut", ".bed")
    trackList = TrackList(tracksInfoPath)
    maskPaths = [t.getPath() for t in trackList.getMaskTracks()]
    if len(maskPaths) == 0:
        return None
    tempPath1 = getLocalTempPath("Tempcut1", ".bed")
    tempPath2 = getLocalTempPath("Tempcut2", ".bed")
    runShellCommand("cp %s %s" % (inBed, outPath))
    for maskPath in maskPaths:
        runShellCommand("cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (
            maskPath, tempPath1))
    if os.path.getsize(tempPath1) > 0:
        runShellCommand("sortBed -i %s > %s ; mergeBed -i %s > %s" % (
            tempPath1, tempPath2, tempPath2, tempPath1))
        runShellCommand("filterBedLengths.py %s %d %d > %s" % (
            tempPath1, minLength+1, maxLength-1, tempPath2))
        runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
            outPath, tempPath2, tempPath1))
        runShellCommand("mv %s %s" % (tempPath1, outPath))
    runShellCommand("rm -f %s %s" % (tempPath1, tempPath2))
    if os.path.getsize(outPath) == 0:
        raise RuntimeError("cutOutMaskIntervals removed everything.  Can't continue."
                           " probably best to rerun calling script on bigger region?")
    return outPath
Example #4
def filterCutTrack(genomePath, fragmentFilterLen, trackListPath, cutTrackName,
                   cutTrackLenFilter):
    """ return path of length filtered cut track"""
    tracks = TrackList(trackListPath)
    track = tracks.getTrackByName(cutTrackName)
    assert track is not None
    cutTrackOriginalPath = track.getPath()
    cutTrackPath = getOutPath(cutTrackOriginalPath, outDir,
                              "filter%d" % cutTrackLenFilter)
    runShellCommand("filterBedLengths.py %s %s > %s" %
                    (cutTrackOriginalPath, cutTrackLenFilter, cutTrackPath))
    tempPath1 = getLocalTempPath("Temp", ".bed")
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (genomePath,
                                                                cutTrackPath,
                                                                tempPath1))
    tempPath2 = getLocalTempPath("Temp", ".bed")
    S = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(S) for x in range(200))
    runShellCommand("filterBedLengths.py %s %d --rename %s |grep %s | sortBed> %s" % (
        tempPath1, fragmentFilterLen, tag, tag, tempPath2))
    runShellCommand("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . > %s" % (tempPath2, tempPath1))
    runShellCommand("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . >> %s" % (cutTrackPath, tempPath1))
    runShellCommand("sortBed -i %s > %s" % (tempPath1, tempPath2))
    runShellCommand("mergeBed -i %s > %s" %(tempPath2, cutTrackPath))
    runShellCommand("rm -f %s %s" % (tempPath1, tempPath2))                    
    return cutTrackPath
Example #5
def runCleaning(args, tempTracksInfo):
    """ run scripts for cleaning chaux, ltr_finder, and termini"""
    trackList = TrackList(args.tracksInfo)

    for track in trackList:
        if track.getPreprocess() is None:
            continue

        # convert bigbed/wig
        inFile = track.getPath()
        tempBed1 = None
        if inFile[-3:] == ".bb" or inFile[-3:] == ".bw":
            tempBed1 = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            if inFile[-3:] == ".bb":
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed1))
            else:
                runShellCommand("bigWigToBedGraph %s %s" % (inFile, tempBed1))
            inFile = tempBed1

        # run cleanRM.py on all tracks with rm or rmu preprocessor
        if track.getPreprocess() == "rm" or track.getPreprocess() == "rmu":
            flag = ""
            if track.getPreprocess() == "rmu":
                flag = "--keepUnderscore"
            inFile = track.getPath()
            outFile = cleanPath(args, track)
            tempBed = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            runShellCommand("cleanRM.py %s %s > %s" % (inFile, flag, tempBed))
            runShellCommand("removeBedOverlaps.py --rm %s > %s" %
                            (tempBed, outFile))
            runShellCommand("rm -f %s" % tempBed)
            track.setPath(outFile)

        # run cleanTermini.py
        elif track.getPreprocess() == "termini":
            outFile = cleanPath(args, track)
            inFile = track.getPath()
            runShellCommand("cleanTermini.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # run removeBedOverlaps
        elif track.getPreprocess() == "overlap":
            outFile = cleanPath(args, track)
            inFile = track.getPath()
            runShellCommand("removeBedOverlaps.py %s > %s" % (inFile, outFile))
            track.setPath(outFile)

        # run cleanLtrFinder.py
        elif track.getPreprocess() == "ltr_finder":
            inFile = track.getPath()
            outFile = cleanPath(args, track)
            # note: overlaps now removed in cleanLtrFinderID script
            runShellCommand("cleanLtrFinderID.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        if tempBed1 is not None:
            runShellCommand("rm -f %s" % tempBed1)

    # save a temporary xml
    trackList.saveXML(tempTracksInfo)
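A small self-contained sketch of the extension check that drives the bigBed/bigWig conversion above, written with os.path.splitext purely for illustration (the function above inspects the last three characters of the path directly):

import os

def conversionCommand(path, outBed):
    """ return the UCSC converter command for a bigBed/bigWig path, or None """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".bb":
        return "bigBedToBed %s %s" % (path, outBed)
    elif ext == ".bw":
        return "bigWigToBedGraph %s %s" % (path, outBed)
    return None

print(conversionCommand("repeats.bb", "repeats.bed"))   # bigBedToBed repeats.bb repeats.bed
print(conversionCommand("repeats.bed", "repeats.bed"))  # None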
Example #6
def runCleaning(args, tempTracksInfo):
    """ run scripts for cleaning chaux, ltr_finder, and termini"""
    trackList = TrackList(args.tracksInfo)

    for track in trackList:
        if track.getPreprocess() is None:
            continue

        # convert bigbed/wig
        inFile = track.getPath()
        tempBed1 = None
        if inFile[-3:] == ".bb" or inFile[-3:] == ".bw":
            tempBed1 = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            if inFile[-3:] == ".bb":
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed1))
            else:
                runShellCommand("bigWigToBedGraph %s %s" % (inFile, tempBed1))    
            inFile = tempBed1            
        
        # run cleanRM.py on all tracks with rm or rmu preprocessor
        if track.getPreprocess() == "rm" or track.getPreprocess() == "rmu":
            flag = ""
            if track.getPreprocess() == "rmu":
                flag = "--keepUnderscore"
            inFile = track.getPath()
            outFile = cleanPath(args, track)
            tempBed = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            runShellCommand("cleanRM.py %s %s > %s" % (inFile, flag, tempBed))
            runShellCommand("removeBedOverlaps.py --rm %s > %s" % (tempBed, outFile)) 
            runShellCommand("rm -f %s" % tempBed)
            track.setPath(outFile)

        # run cleanTermini.py            
        elif track.getPreprocess() == "termini":            
            outFile = cleanPath(args, track)
            inFile = track.getPath()
            runShellCommand("cleanTermini.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # run removeBedOverlaps
        elif track.getPreprocess() == "overlap":
            outFile = cleanPath(args, track)
            inFile = track.getPath()
            runShellCommand("removeBedOverlaps.py %s > %s" % (inFile, outFile))
            track.setPath(outFile)

        # run cleanLtrFinder.py
        elif track.getPreprocess() == "ltr_finder":
            inFile = track.getPath()
            outFile = cleanPath(args, track)
            # note: overlaps now removed in cleanLtrFinderID script
            runShellCommand("cleanLtrFinderID.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        if tempBed1 is not None:
            runShellCommand("rm -f %s" % tempBed1)

    # save a temporary xml
    trackList.saveXML(tempTracksInfo)
Example #7
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")
    
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name", help="Set ID field (column 4 instead of 5)",
                        action="store_true", default=False)
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read the tracks list
    trackList = TrackList(args.tracksInfo)
    track = trackList.getTrackByName(args.track)
    if track is None:
        raise RuntimeError("Can't find track %s" % args.track)
    # make temporary tracks list with just our track so we can keep using
    # tracks list interface but not read unnecessary crap.
    singleListPath = getLocalTempPath("Temp_secScore", ".bed")
    trackList.trackList = [track]
    trackList.saveXML(singleListPath)

    obFile = open(args.outBed, "w")

    # trackData interface not so great at cherry picking intervals.
    # need to merge them up and use segmentation interface    
    filledIntervals, mergedIntervals = fillGaps(args.inBed)

    # read track into trackData
    trackData = TrackData()
    logger.info("loading track %s" % singleListPath)
    trackData.loadTrackData(singleListPath, mergedIntervals,
                            segmentIntervals=filledIntervals,
                            applyMasking=False)

    # finally, write the annotation
    writeAnnotatedIntervals(trackData, filledIntervals, mergedIntervals, obFile,
                             args)

    runShellCommand("rm -f %s" % singleListPath)
    obFile.close()
    cleanBedTool(tempBedToolPath)
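A sketch of the single-track shortcut used above: the full TrackList is loaded, then pruned to the one requested track and saved to a throwaway XML so that downstream loaders never touch the other annotation files. The track name and paths are placeholders; the TrackList/getLocalTempPath calls follow the signatures shown in these examples.

trackList = TrackList("tracks.xml")                 # placeholder path
track = trackList.getTrackByName("copy_number")     # placeholder track name
if track is None:
    raise RuntimeError("Can't find track copy_number")
# keep only the one track of interest before writing the temporary list
singleListPath = getLocalTempPath("Temp_single", ".xml")
trackList.trackList = [track]
trackList.saveXML(singleListPath)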
Example #8
def subsetTrackList(trackList, sizeRange, mandTracks):
    """ generate tracklists of all combinations of tracks in the input list
    optionally using size range to limit the different sizes tried. so, for
    example, given input list [t1, t2, t3] and sizeRange=None this
    will generate [t1] [t2] [t3] [t1,t2] [t1,t3] [t2,t3] [t1,t2,t3] """
    assert sizeRange[0] > 0
    sizeRange = (sizeRange[0], min(sizeRange[1], len(trackList) + 1))
    for outLen in xrange(*sizeRange):
        for perm in itertools.combinations([x for x in xrange(len(trackList))],
                                            outLen):
            permList = TrackList()
            mandFound = 0
            for trackNo in perm:
                track = copy.deepcopy(trackList.getTrackByNumber(trackNo))
                permList.addTrack(track)
                if track.getName() in mandTracks:
                    mandFound += 1

            if mandFound == len(mandTracks):
                yield permList
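A self-contained sketch of the same enumeration on plain strings instead of Track objects (Python 3 range/print is used here for brevity; the generator above is Python 2 and uses xrange):

import itertools

def subsets(tracks, sizeRange, mandTracks):
    lo, hi = sizeRange[0], min(sizeRange[1], len(tracks) + 1)
    for outLen in range(lo, hi):
        for combo in itertools.combinations(tracks, outLen):
            # only keep combinations that contain every mandatory track
            if mandTracks.issubset(combo):
                yield list(combo)

print(list(subsets(["t1", "t2", "t3"], (1, 4), {"t1"})))
# [['t1'], ['t1', 't2'], ['t1', 't3'], ['t1', 't2', 't3']]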
Example #9
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name",
                        help="Set ID field (column 4 instead of 5)",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read the tracks list
    trackList = TrackList(args.tracksInfo)
    track = trackList.getTrackByName(args.track)
    if track is None:
        raise RuntimeError("Can't find track %s" % args.track)
    # make temporary tracks list with just our track so we can keep using
    # tracks list interface but not read unnecessary crap.
    singleListPath = getLocalTempPath("Temp_secScore", ".bed")
    trackList.trackList = [track]
    trackList.saveXML(singleListPath)

    obFile = open(args.outBed, "w")

    # trackData interface not so great at cherry picking intervals.
    # need to merge them up and use segmentation interface
    filledIntervals, mergedIntervals = fillGaps(args.inBed)

    # read track into trackData
    trackData = TrackData()
    logger.info("loading track %s" % singleListPath)
    trackData.loadTrackData(singleListPath,
                            mergedIntervals,
                            segmentIntervals=filledIntervals,
                            applyMasking=False)

    # finally, write the annotation
    writeAnnotatedIntervals(trackData, filledIntervals, mergedIntervals,
                            obFile, args)

    runShellCommand("rm -f %s" % singleListPath)
    obFile.close()
    cleanBedTool(tempBedToolPath)
Example #10
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make benchmark summary row.  Called from within "
        "teHmmBenchmark.py")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file that"
                        " teHmmBenchmark.py was run on.")
    parser.add_argument("localTracksInfo",
                        help="Path of Tracks Info file for"
                        " row (could be a subset of above)")
    parser.add_argument("evalBed",
                        help="Bed file created by teHmmEval.  Used"
                        " for the Viterbi score in comment at top")
    parser.add_argument("compBed", help="Results of comparison script")
    parser.add_argument("outRow", help="File to write row information to")

    args = parser.parse_args()

    inputTrackList = TrackList(args.tracksInfo)
    trackList = TrackList(args.localTracksInfo)

    header, row = scrapeRow(inputTrackList, trackList, args.evalBed,
                            args.compBed)

    header = map(str, header)
    row = map(str, row)

    outFile = open(args.outRow, "w")
    outFile.write(",".join(header) + "\n")
    outFile.write(",".join(row) + "\n")
    outFile.close()
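The two writes above emit a simple comma-separated file by hand. A minimal alternative sketch with the standard csv module (column names and values are placeholders), which would also quote any field that itself contains a comma:

import csv

header = ["Track", "F1", "BIC"]          # placeholder column names
row = ["exampleTrack", 0.91, 1234.5]     # placeholder values
with open("row.csv", "w") as outFile:
    writer = csv.writer(outFile)
    writer.writerow(header)
    writer.writerow([str(x) for x in row])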
Example #11
def runTrial(tracksList, iteration, newTrackName, args):
    """ compute a score for a given set of tracks using teHmmBenchmark.py """
    benchDir = os.path.join(args.outDir, "iter%d" % iteration)
    benchDir = os.path.join(benchDir, "%s_bench" % newTrackName)
    if not os.path.exists(benchDir):
        os.makedirs(benchDir)

    trainingPath = args.training
    truthPath = args.truth

    tracksPath = os.path.join(benchDir, "tracks.xml")
    tracksList.saveXML(tracksPath)

    segLogPath = os.path.join(benchDir, "segment_cmd.txt")
    segLog = open(segLogPath, "w")

    if args.segTracks == args.tracks:
        segTracksPath = tracksPath
    # pull out desired tracks from segment tracks XML if specified
    else:
        segTracksIn = TrackList(args.segTracks)
        segTracks = TrackList()
        for track in tracksList:
            segTrack = segTracksIn.getTrackByName(track.getName())
            if segTrack is not None:
                segTracks.addTrack(segTrack)
            else:
                logger.warning("track %s not found in segment tracks %s" % (
                    track.getName(), args.segTracks))
        segTracksPath = os.path.join(benchDir, "seg_tracks.xml")
        segTracks.saveXML(segTracksPath)
        
    # segment training
    segTrainingPath = os.path.join(benchDir,
                                   os.path.splitext(
                                       os.path.basename(trainingPath))[0]+
                                   "_trainSeg.bed")    
    segmentCmd = "segmentTracks.py %s %s %s %s" % (segTracksPath,
                                                   trainingPath,
                                                   segTrainingPath,
                                                   args.segOpts)

    if args.fullSegment is False:
        runShellCommand(segmentCmd)
        segLog.write(segmentCmd + "\n")
    else:
        runShellCommand("ln -f -s %s %s" % (args.fullSegTrainPath, segTrainingPath))

    # segment eval
    segEvalPath = os.path.join(benchDir,
                                os.path.splitext(os.path.basename(truthPath))[0]+
                                "_evalSeg.bed")    
    segmentCmd = "segmentTracks.py %s %s %s %s" % (segTracksPath,
                                                   truthPath,
                                                   segEvalPath,
                                                   args.segOpts)
    if trainingPath == truthPath:
        segmentCmd = "ln -f -s %s %s" % (os.path.abspath(segTrainingPath), segEvalPath)
    if args.fullSegment is False:
        runShellCommand(segmentCmd)
        segLog.write(segmentCmd + "\n")
    else:
        runShellCommand("ln -f -s %s %s" % (args.fullSegEvalPath, segEvalPath))
    
    segLog.close()

    segPathOpts = " --eval %s --truth %s" % (segEvalPath, truthPath)
    
    benchCmd = "teHmmBenchmark.py %s %s %s %s" % (tracksPath,
                                                  benchDir,
                                                  segTrainingPath,
                                                  args.benchOpts + segPathOpts)
    runShellCommand(benchCmd)

    score = extractScore(benchDir, segTrainingPath, args)
    bic = extractBIC(benchDir, segTrainingPath, args)
    naive = 0
    if args.doNaive is True:
        naive = extractNaive(tracksPath, benchDir, segTrainingPath, args)
    slope, rsq = extractF1ProbSlope(benchDir, segTrainingPath, args)

    # clean up big files?

    return score, bic, naive, slope, rsq
Example #12
def greedyRank(args):
    """ Iteratively add best track to a (initially empty) tracklist according
    to some metric"""
    inputTrackList = TrackList(args.tracks)
    rankedTrackList = TrackList()
    if args.startTracks is not None:
        for startTrack in args.startTracks.split(","):
            track = inputTrackList.getTrackByName(startTrack)
            if track is None:
                logger.warning("Start track %s not found in tracks XML" %
                               startTrack)
            else:
                rankedTrackList.addTrack(copy.deepcopy(track))
            
    numTracks = len(inputTrackList) - len(rankedTrackList)
    currentScore, currentBIC = 0.0, sys.maxint

    # compute full segmentation if --fullSegment is True
    if args.fullSegment is True:
        args.fullSegTrainPath = os.path.abspath(os.path.join(args.outDir,
                                                             "fullSegTrain.bed"))
        segmentCmd = "segmentTracks.py %s %s %s %s" % (args.segTracks,
                                                       args.training,
                                                       args.fullSegTrainPath,
                                                       args.segOpts)
        runShellCommand(segmentCmd)
        args.fullSegEvalPath = os.path.abspath(os.path.join(args.outDir,
                                                            "fullSegEval.bed"))
        segmentCmd = "segmentTracks.py %s %s %s %s" % (args.segTracks,
                                                       args.truth,
                                                       args.fullSegEvalPath,
                                                       args.segOpts)
        runShellCommand(segmentCmd)

    #header
    rankFile = open(os.path.join(args.outDir, "ranking.txt"), "w")
    rankFile.write("It.\tTrack\tF1\tBIC\tNaiveF1\tAccProbSlop\tAccProbR2\n")
    rankFile.close()
    
    # baseline score if we are not starting from scratch
    baseIt = 0
    if args.startTracks is not None:
        curTrackList = copy.deepcopy(rankedTrackList)
        score,bic,naive,slope,rsq = runTrial(curTrackList, baseIt, "baseline_test", args)
        rankFile = open(os.path.join(args.outDir, "ranking.txt"), "a")
        rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (baseIt, args.startTracks,
                                        score, bic, naive,slope,rsq))
        rankFile.close()
        baseIt += 1
        
    for iteration in xrange(baseIt, baseIt + numTracks):
        bestItScore = -sys.maxint
        bestItBic = sys.maxint
        bestItNaive = -sys.maxint
        bestNextTrack = None
        bestSlope = None
        bestR = None
        for nextTrack in inputTrackList:
            if rankedTrackList.getTrackByName(nextTrack.getName()) is not None:
                continue
            curTrackList = copy.deepcopy(rankedTrackList)
            curTrackList.addTrack(nextTrack)
            score,bic,naive,slope,rsq = runTrial(curTrackList, iteration, nextTrack.getName(),
                                args)
            best = False
            if args.bic is True:
                if bic < bestItBic or (bic == bestItBic and score > bestItScore):
                    best = True
            elif args.naive is True:
                if naive > bestItNaive or (naive == bestItNaive and score > bestItScore):
                    best = True
            elif score > bestItScore or (score == bestItScore and bic < bestItBic):
                best = True
            if best is True:
                bestItScore, bestItBic, bestItNaive, bestSlope, bestR, bestNextTrack =\
                       score, bic, naive, slope, rsq, nextTrack
            flags = "a"
            if iteration == baseIt:
                flags = "w"      
            trackLogFile = open(os.path.join(args.outDir, nextTrack.getName() +
                                             ".txt"), flags)
            trackLogFile.write("%d\t%f\t%f\t%f\t%f\t%f\n" % (iteration, score, bic, naive,
                                                             slope, rsq))
            trackLogFile.close()
        rankedTrackList.addTrack(copy.deepcopy(bestNextTrack))
        rankedTrackList.saveXML(os.path.join(args.outDir, "iter%d" % iteration,
                                "tracks.xml"))
        
        rankFile = open(os.path.join(args.outDir, "ranking.txt"), "a")
        rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (iteration, bestNextTrack.getName(),
                                            bestItScore, bestItBic, bestItNaive,
                                            bestSlope, bestR))
        rankFile.close()
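A self-contained sketch of the greedy forward-selection loop above, with a toy additive scorer standing in for runTrial (which actually trains and benchmarks an HMM for every candidate track):

def greedyRankToy(candidates, scoreFn):
    chosen = []
    while candidates:
        # score every remaining candidate added to the current set, keep the best
        best = max(candidates, key=lambda c: scoreFn(chosen + [c]))
        chosen.append(best)
        candidates.remove(best)
        yield best

toyScores = {"copy": 0.4, "ltr": 0.3, "tsd": 0.2}
scoreFn = lambda tracks: sum(toyScores[t] for t in tracks)
print(list(greedyRankToy(["ltr", "copy", "tsd"], scoreFn)))
# ['copy', 'ltr', 'tsd']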
Example #13
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Combine a bunch of non-numeric BED tracks into"
        " single file using fitStateNames.py to try to keep names "
        "consistent.  Idea is to be used as baseline to compare"
        " hmm to (via base-by-base statistics, primarily, since"
        " this procedure could induce some fragmentation)")

    parser.add_argument("tracksXML", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("regionBed", help="BED file representing "
                        "target region (best if whole genome)")
    parser.add_argument("outBed", help="Output bed")
    parser.add_argument("--tracks", help="Comma-separated list of "
                        "track names to use.  All tracks will be"
                        " used by default", default=None)
    parser.add_argument("--outside", help="Name to give non-annotated"
                        "regions", default="Outside")
    parser.add_argument("--fitThresh", help="Min map percentage (0,1)"
                        " in order to rename (see --qualThresh option"
                        "of fitStateNames.py", type=float,
                        default=0.5)
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    inputTrackList = TrackList(args.tracksXML)
    iter = 0

    # get regionBed where all intervals are merged when possible
    regionIntervals = getMergedBedIntervals(args.regionBed, sort=True)
    tempRegionPath = getLocalTempPath("Temp", "_reg.bed")
    tempRegionFile = open(tempRegionPath, "w")
    for interval in regionIntervals:
        tempRegionFile.write("\t".join([str(x) for x in interval]) + "\n")
    tempRegionFile.close()

    # accumulate tracks in temp file
    tempOutPath = getLocalTempPath("Temp", "_out.bed")
    
    for track in inputTrackList:
        if track.shift is not None or track.scale is not None or\
           track.logScale is not None or track.dist == "gaussian" or\
           os.path.splitext(track.getPath())[1].lower() != ".bed":
            logger.warning("Skipping numeric track %s" % track.getName())
        elif args.tracks is None or track.getName() in args.tracks.split(","):
            combineTrack(track, tempOutPath, tempRegionPath, iter, args)
            iter += 1

    # nothing got written, make everything outside
    if iter == 0:
        tempOutFile = open(tempOutPath, "w")
        for interval in regionIntervals:
            tempOutFile.write("%s\t%s\t%s\t%s\n" % (interval[0], interval[1],
                                                   interval[2], args.outside))
        tempOutFile.close()

    runShellCommand("mv %s %s" % (tempOutPath, args.outBed))
    runShellCommand("rm -f %s" % (tempRegionPath))
                
    cleanBedTool(tempBedToolPath)
Example #14
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Train, evalaute, then compare hmm model on input")

    parser.add_argument("trainingTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used "
                        "for training")
    parser.add_argument("outputDir", help="directory to write output")
    parser.add_argument("inBeds", nargs="*", help="list of training beds")
    parser.add_argument("--evalTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used"
                        " for evaluation (only need if different from"
                        " trainingTracksInfo", default=None)
    parser.add_argument("--numProc", help="Max number of processors to use",
                        type=int, default=1)
    parser.add_argument("--allTrackCombinations", help="Rerun with all"
                        " possible combinations of tracks from the input"
                        " tracksInfo file.  Note that this number gets big"
                        " pretty fast.", action = "store_true", default= False)
    parser.add_argument("--emStates", help="By default the supervised mode"
                        " of teHmmTrain is activated.  This option overrides"
                        " that and uses the EM mode and the given number of "
                        "states instead", type=int, default=None)
    parser.add_argument("--cross", help="Do 50/50 cross validation by training"
                        " on first half input and validating on second",
                        action="store_true", default=False)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--mod", help="Path to trained model.  This will "
                        "bypass the training phase that would normally be done"
                        " and just skip to the evaluation.  Note that the user"
                        " must make sure that the trained model has the "
                        "states required to process the input data",
                        default = None)
    parser.add_argument("--iter", help="Number of EM iterations.  Needs to be"
                        " used in conjunction with --emStates to specify EM"
                        " training",
                        type = int, default=None)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed." ,
                        default = None) 
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing a"
                        " multinomial emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default=None)    
    parser.add_argument("--mandTracks", help="Mandatory track names for use "
                        "with --allTrackCombinations in comma-separated list",
                        default=None)
    parser.add_argument("--combinationRange", help="in form MIN,MAX: Only "
                        "explore track combination in given (closed) range. "
                        "A more refined version of --allTrackCombinations.",
                        default=None)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--segment", help="Input bed files are also used to "
                        "segment data.  Ie teHmmTrain is called with --segment"
                        " set to the input file. Not currently working with "
                        " --supervised",
                        action = "store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied) in training", type=int,
                        default=None)
    parser.add_argument("--truth", help="Use specifed file instead of "
                        "input file(s) for truth comparison.  Makes sense"
                        " when --segment is specified and only one input"
                        " bed specified", default = None)
    parser.add_argument("--eval", help="Bed file used for evaluation.  It should"
                        " cover same region in same order as --truth.  Option "
                        "exists mostly to specify segmentation of --truth",
                        default=None)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of training replicates (with "
                        " different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=None, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running training replicates (see --rep) in parallel.",
                        type=int, default=None)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=None)
    parser.add_argument("--fit", help="Run fitStateNames.py to automap names"
                        " before running comparison", action="store_true",
                        default=False)
    parser.add_argument("--fitOpts", help="Options to pass to fitStateNames.py"
                        " (only effective if used with --fit)", default=None)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate.  Comparison statistics"
                        " will be generated for each rep.",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

        
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    logOps = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        logOps += " --logFile %s" % args.logFile

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)
    if args.evalTracksInfo is None:
        args.evalTracksInfo = args.trainingTracksInfo

    trainingTrackList = TrackList(args.trainingTracksInfo)
    evalTrackList = TrackList(args.evalTracksInfo)
    checkTrackListCompatible(trainingTrackList, evalTrackList)

    sizeRange = (len(trainingTrackList), len(trainingTrackList) + 1)
    if args.allTrackCombinations is True:
        sizeRange = (1, len(trainingTrackList) + 1)
    if args.combinationRange is not None:
        toks = args.combinationRange.split(",")
        sizeRange = int(toks[0]),int(toks[1]) + 1
        logger.debug("manual range (%d, %d) " % sizeRange)
    mandTracks = set()
    if args.mandTracks is not None:
        mandTracks = set(args.mandTracks.split(","))
        logger.debug("mandatory set %s" % str(mandTracks))
    trainFlags = ""
    if args.emStates is not None:
        trainFlags += " --numStates %d" % args.emStates
    if args.supervised is True:
        trainFlags += " --supervised"
        if args.segment is True:
            raise RuntimeError("--supervised not currently compatible with "
                               "--segment")
    trainFlags += " --emFac %d" % args.emFac
    if args.iter is not None:
        assert args.emStates is not None or args.initTransProbs is not None
        trainFlags += " --iter %d" % args.iter
    if args.initTransProbs is not None:
        trainFlags += " --initTransProbs %s" % args.initTransProbs
    if args.initEmProbs is not None:
        trainFlags += " --initEmProbs %s" % args.initEmProbs
    if args.fixEm is True:
        trainFlags += " --fixEm"
    if args.initStartProbs is not None:
        trainFlags += " --initStartProbs %s" % args.initStartProbs
    if args.fixStart is True:
        trainFlags += " --fixStart"
    if args.forceTransProbs is not None:
        trainFlags += " --forceTransProbs %s" % args.forceTransProbs
    if args.forceEmProbs is not None:
        trainFlags += " --forceEmProbs %s" % args.forceEmProbs
    if args.flatEm is True:
        trainFlags += " --flatEm"
    if args.emRandRange is not None:
        trainFlags += " --emRandRange %s" % args.emRandRange
    if args.segLen is not None:
        trainFlags += " --segLen %d" % args.segLen
    if args.seed is not None:
        trainFlags += " --seed %d" % args.seed
    if args.reps is not None:
        trainFlags += " --reps %d" % args.reps
    if args.numThreads is not None:
        trainFlags += " --numThreads %d" % args.numThreads
    if args.emThresh is not None:
        trainFlags += " --emThresh %f" % args.emThresh
    if args.saveAllReps is True:
        trainFlags += " --saveAllReps"
    if args.maxProb is True:
        trainFlags += " --maxProb"
    if args.transMatEpsilons is True:
        trainFlags += " --transMatEpsilons"
    if args.maxProbCut is not None:
        trainFlags += " --maxProbCut %d" % args.maxProbCut

    # write out command line for posterity's sake
    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)
    cmdPath = os.path.join(args.outputDir, "teHmmBenchmark_cmd.txt")
    cmdFile = open(cmdPath, "w")
    cmdFile.write(" ".join(argv) + "\n")
    cmdFile.close()
                           
    #todo: try to get timing for each command
    commands = []
    rows = dict()
    for pn, pList in enumerate(subsetTrackList(trainingTrackList, sizeRange,
                                               mandTracks)):
        if len(pList) == len(trainingTrackList):
            outDir = args.outputDir
        else:
            outDir = os.path.join(args.outputDir, "perm%d" % pn)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        trainingTrackPath = os.path.join(outDir, "training_tracks.xml")
        evalTrackPath = os.path.join(outDir, "eval_tracks.xml")
        for maskTrack in trainingTrackList.getMaskTracks():
            pList.addTrack(copy.deepcopy(maskTrack))
        pList.saveXML(trainingTrackPath)
        epList = TrackList()
        for track in pList:
            t = copy.deepcopy(evalTrackList.getTrackByName(track.getName()))
            epList.addTrack(t)
        for maskTrack in trainingTrackList.getMaskTracks():
            epList.addTrack(copy.deepcopy(maskTrack))
        epList.saveXML(evalTrackPath)
        
        for inBed in args.inBeds:
            
            base = os.path.basename(inBed)
            truthBed = inBed
            testBed = inBed
            if args.cross is True:
                truthBed = os.path.join(outDir,
                                        os.path.splitext(base)[0] +
                                        "_truth_temp.bed")
                testBed = os.path.join(outDir,
                                       os.path.splitext(base)[0] +
                                       "_test_temp.bed")
                splitBed(inBed, truthBed, testBed)

                                        
            
            # train
            if args.mod is not None:
                modPath = args.mod
                command = "ls %s" % modPath
            else:
                modPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + ".mod")
                command = "teHmmTrain.py %s %s %s %s %s" % (trainingTrackPath,
                                                            truthBed,
                                                            modPath,
                                                            logOps,
                                                            trainFlags)
                if args.segment is True:
                    command += " --segment %s" % truthBed

            # view
            viewPath = os.path.join(outDir,
                                   os.path.splitext(base)[0] + "_view.txt")
            command += " && teHmmView.py %s > %s" % (modPath, viewPath)

            # evaluate
            numReps = 1
            if args.reps is not None and args.saveAllReps is True:
                numReps = args.reps
                assert numReps > 0
            missed = 0
            # little hack to repeat evaluation for each training replicate
            for repNum in xrange(-1, numReps-1):
                if repNum == -1:
                    repSuffix = ""
                else:
                    repSuffix = ".rep%d" % repNum                
                evalBed = os.path.join(outDir,
                                       os.path.splitext(base)[0] + "_eval.bed" +
                                       repSuffix)
                hmmEvalInputBed = testBed
                if args.eval is not None:
                    hmmEvalInputBed = args.eval
                bicPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + "_bic.txt" +
                                       repSuffix)

                command += " && teHmmEval.py %s %s %s --bed %s %s --bic %s" % (
                    evalTrackPath,
                    modPath + repSuffix,
                    hmmEvalInputBed,
                    evalBed,
                    logOps,
                    bicPath)
                zin = True

                if args.segment is True:
                    command += " --segment"

                # fit
                compTruth = testBed
                if args.truth is not None:
                    compTruth = args.truth
                compareInputBed = evalBed
                if args.fit is True:
                    fitBed = os.path.join(outDir,
                                          os.path.splitext(base)[0] + "_eval_fit.bed" +
                                          repSuffix)
                    command += " && fitStateNames.py %s %s %s --tl %s" % (compTruth,
                                                                          evalBed,
                                                                          fitBed,
                                                                          evalTrackPath)
                    if args.fitOpts is not None:
                        command += " " + args.fitOpts
                    compareInputBed = fitBed

                # compare
                compPath = os.path.join(outDir,
                                        os.path.splitext(base)[0] + "_comp.txt" +
                                        repSuffix)
                command += " && compareBedStates.py %s %s --tl %s > %s" % (
                    compTruth,
                    compareInputBed,
                    evalTrackPath,
                    compPath)
            

                # make table row
                if repSuffix == "":
                    rowPath = os.path.join(outDir,
                                           os.path.splitext(base)[0] + "_row.txt")
                    if inBed in rows:
                        rows[inBed].append(rowPath)
                    else:
                        rows[inBed] = [rowPath]
                    command += " && scrapeBenchmarkRow.py %s %s %s %s %s" % (
                        args.trainingTracksInfo,
                        trainingTrackPath,
                        evalBed,
                        compPath,
                        rowPath)

            # remember command
            inCmdPath = os.path.join(outDir,
                                    os.path.splitext(base)[0] + "_cmd.txt")
            inCmdFile = open(inCmdPath, "w")
            inCmdFile.write(command + "\n")
            inCmdFile.close()
            commands.append(command)
            
    runParallelShellCommands(commands, args.numProc)
    writeTables(args.outputDir, rows)
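Each per-input pipeline above is assembled as one shell string joined with " && ", so a failure at any stage aborts the remaining stages when the string is executed. A minimal sketch of that pattern (all arguments are placeholders):

steps = [
    "teHmmTrain.py training_tracks.xml train.bed out.mod",
    "teHmmView.py out.mod > out_view.txt",
    "teHmmEval.py eval_tracks.xml out.mod train.bed --bed eval.bed",
]
command = " && ".join(steps)
# runParallelShellCommands([command], numProc) would then run the whole chain
# in a single shell, stopping at the first failing step
print(command)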
Example #15
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file.  State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps.  If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                         default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol = 4, sort = True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0


    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort = True)
    resolvedMasks = 0

    if len(inputIntervals) == 0:
        logger.warning("No input intervals read from %s" % args.inBed)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0
            
        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState
        
        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (maskInterval[0], maskInterval[1],
                                                    maskInterval[2], maskState))

    
    tempOutMaskFile.close()    
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2, tempScopePath,
                                                       args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask, tempMergePath1,
                                      tempMergePath2, tempScopePath]))
    cleanBedTool(tempBedToolPath)
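
The relabeling decision above hinges on exact adjacency: a masked gap only inherits a state when a prediction interval ends precisely where the gap starts (left flank) or begins precisely where it ends (right flank), subject to the --tgts / --oneSidedTgts filters. A minimal sketch of that rule on toy tuples, with no bedtools or toolkit helpers (names are illustrative only):

# Illustrative sketch only: the flank-based relabeling rule used above,
# on plain (chrom, start, end, state) tuples.
def pick_mask_state(mask, left, right, default="0", tgts=None, one_sided=None):
    tgts = set(tgts or [])
    one_sided = set(one_sided or [])
    # a flank only counts if it perfectly abuts the masked interval
    left_state = left[3] if left and left[0] == mask[0] and left[2] == mask[1] else None
    right_state = right[3] if right and right[0] == mask[0] and right[1] == mask[2] else None
    state = default
    if left_state is not None and left_state == right_state:
        if not tgts or left_state in tgts:
            state = left_state
    elif left_state in one_sided:
        state = left_state
    elif right_state in one_sided:
        state = right_state
    return state

# a 100bp gap flanked by "LTR" on both sides inherits "LTR"
print(pick_mask_state(("chr1", 200, 300),
                      ("chr1", 100, 200, "LTR"),
                      ("chr1", 300, 400, "LTR")))
# -> LTR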
Ejemplo n.º 16
0
def runTsd(args, tempTracksInfo):
    """ run addTsdTrack on termini and chaux to generate tsd track"""
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)

    tempFiles = []
    tsdInputFiles = []
    tsdInputTracks = []
        
    # preprocess termini
    lastzTracks = [origTrackList.getTrackByName(args.ltr_termini),
                  origTrackList.getTrackByName(args.tir)]
    for terminiTrack in lastzTracks:
        if terminiTrack is not None:
            inFile = terminiTrack.getPath()
            fillFile = getLocalTempPath("Temp_fill", ".bed")
            tempBed = None
            if inFile[-3:] == ".bb":
                tempBed = getLocalTempPath("Temp_termini", ".bed")
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed))
                inFile = tempBed
            runShellCommand("fillTermini.py %s %s" % (inFile, fillFile))
            tsdInputFiles.append(fillFile)
            tsdInputTracks.append(terminiTrack.getName())
            tempFiles.append(fillFile)
            if tempBed is not None:
                runShellCommand("rm -f %s" % tempBed)
        else:
            logger.warning("Could not find termini track")

    # add repeat_modeler
    repeat_modelerTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if repeat_modelerTrack is not None:
        tsdInputFiles.append(repeat_modelerTrack.getPath())
        tsdInputTracks.append(repeat_modelerTrack.getName())

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    assert len(tsdInputFiles) == len(tsdInputTracks)
    for i in xrange(len(tsdInputFiles)):
        optString = ""
        if i > 0:
            optString += " --append"
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable elements
        # Wicker et al. 2007)
        if tsdInputTracks[i] == args.repeat_modeler:
            optString += " --names LINE,SINE,Unknown"
            optString += " --maxScore 20"
            optString += " --left 20"
            optString += " --right 20"
            optString += " --min 5"
            optString += " --max 20"
            optString += " --overlap 20"
        elif tsdInputTracks[i] == args.ltr_termini:
            optString += " --maxScore 3"
            optString += " --left 8"
            optString += " --right 8"
            optString += " --min 3"
            optString += " --max 6"
        elif tsdInputTracks[i] == args.tir:
            optString += " --maxScore 3"
            optString += " --left 15"
            optString += " --right 15"
            optString += " --min 3"
            optString += " --max 12"

        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        runShellCommand("addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d" % (
            tempTracksInfo,
            args.cleanTrackPath,
            tempXMLOut,
            tsdInputTracks[i],
            args.sequence,
            args.tsd,
            tsdInputFiles[i],
            optString,
            args.logOpString,
            args.numProc))
        
        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    for i in xrange(len(tempFiles)):
        runShellCommand("rm %s" % tempFiles[i])
Ejemplo n.º 17
0
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        "from a candidate BED annotation, which can"
        " be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively.  The distributions created here"
        " are extremely simple, but this can be a good shortcut to at least "
        "getting the state names into the init files, which can be further "
        "tweeked by hand.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trackName", help="Name of Track to use as initial"
                        " annotation")
    parser.add_argument("queryBed", help="Bed file with regions to query")
    parser.add_argument("outTransProbs", help="File to write transition model"
                        " to")
    parser.add_argument("outEmProbs", help="File to write emission model to")
    parser.add_argument("--numOut", help="Number of \"outside\" states to add"
                        " to the model.", default=1, type=int)
    parser.add_argument("--numTot", help="Add x \"outside\" states such "
                        "that total states is this. (overrieds --numOut)",
                        default=0, type=int)
    parser.add_argument("--outName", help="Name of outside states (will have"
                        " numeric suffix if more than 1)", default="Outside")
    parser.add_argument("--mode", help="Strategy for initializing the "
                        "transition graph: {\'star\': all states are connected"
                        " to the oustide state(s) but not each other; "
                        " \'data\': transitions estimated from input bed; "
                        " \'full\': dont write edges and let teHmmTrain.py "
                        "initialize as a clique}", default="star")
    parser.add_argument("--selfTran", help="This script will always write all"
                        " the self-transition probabilities to the output file. "
                        "They will all be set to the specified value using this"
                        " option, or estimated from the data if -1", default=-1.,
                        type=float)
    parser.add_argument("--em", help="Emission probability for input track ("
                        "ie probability that state emits itself)",
                        type=float, default=0.95)
    parser.add_argument("--outEmNone", help="Add None emission probabilities"
                        " for target track for Outside states",
                        action="store_true", default=None)
                        
    addLoggingOptions(parser)
    args = parser.parse_args()
    if args.mode == "star" and args.numOut < 1:
        raise RuntimeError("--numOut must be at least 1 if --mode star is used")
    if args.mode != "star" and args.mode != "data" and args.mode != "full":
        raise RuntimeError("--mode must be one of {star, data, full}")
    if args.mode == "data":
        raise RuntimeError("--data not implemented yet")
    assert os.path.isfile(args.tracksInfo)
    
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # Read the tracks info
    trackList = TrackList(args.tracksInfo)
    # Extract the track we want
    track = trackList.getTrackByName(args.trackName)
    if track is None:
        raise RuntimeError("Track %s not found in tracksInfo" % args.trackName)
    trackPath = track.getPath()
    if track.getDist() != "multinomial" and track.getDist() != "gaussian":
        raise RuntimeError("Track %s does not have multinomial or "
                           "gaussian distribution" % args.trackName)
    if track.getScale() is not None or track.getLogScale() is not None:
        raise RuntimeError("Track %s must not have scale" % args.trackName)
    
    # read query intervals from the bed file
    logger.info("loading query intervals from %s" % args.queryBed)
    mergedIntervals = getMergedBedIntervals(args.queryBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.queryBed)

    # read the track, while intersecting with query intervals
    # (track is saved as a temp XML file for the sake of not changing the interface)
    bedIntervals = []
    for queryInterval in mergedIntervals:
        bedIntervals += readBedIntervals(trackPath,
                                        ncol = track.getValCol() + 1,
                                        chrom=queryInterval[0],
                                        start=queryInterval[1],
                                        end=queryInterval[2])

    # 1st pass to collect set of names
    nameMap = CategoryMap(reserved = 0)
    for interval in bedIntervals:
        nameMap.update(interval[track.getValCol()])
    outNameMap = CategoryMap(reserved = 0)
    if args.numTot > 0:
        args.numOut = max(0, args.numTot - len(nameMap))
    for i in xrange(args.numOut):
        outName = args.outName
        if args.numOut > 1:
            outName += str(i)
        assert nameMap.has(outName) is False
        outNameMap.update(outName)

    # write the transition model for use with teHmmTrain.py --initTransProbs    
    writeTransitions(bedIntervals, nameMap, outNameMap, args)

    # write the emission model for use with teHmmTrain.py --initEmProbs
    writeEmissions(bedIntervals, nameMap, outNameMap, args)

    cleanBedTool(tempBedToolPath)
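
The two passes above only do two things: enumerate the state names seen in column 4 of the query track, then mint enough non-colliding "Outside" names to satisfy --numOut or --numTot. A simplified stand-in with plain Python containers (CategoryMap and the transition/emission writers are toolkit-specific and are not reproduced here):

# Sketch only: gather state names from 4-column interval tuples and pad with
# "Outside" states, mirroring the two passes above.
def collect_states(intervals, num_out=1, num_tot=0, out_name="Outside"):
    names = set(iv[3] for iv in intervals)      # first pass: observed states
    if num_tot > 0:
        num_out = max(0, num_tot - len(names))
    outside = []
    for i in range(num_out):
        candidate = out_name + (str(i) if num_out > 1 else "")
        assert candidate not in names           # mirrors the assert above
        outside.append(candidate)
    return sorted(names), outside

toy = [("chr1", 0, 100, "LTR"), ("chr1", 100, 250, "DNA"),
       ("chr1", 250, 300, "LTR")]
print(collect_states(toy, num_tot=4))
# -> (['DNA', 'LTR'], ['Outside0', 'Outside1'])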
Ejemplo n.º 18
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml function using some simple heuristics. ")
    
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outputTracks", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian"
                        " distribution will be processed.", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)
    parser.add_argument("--noLog", help="Never use log scaling",
                        action="store_true", default=False)
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")
    
    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)

    allIntervals = getMergedBedIntervals(args.allBed)

    for track in trackList:
        trackExt = os.path.splitext(track.getPath())[1]
        isFasta = len(trackExt) >= 3 and trackExt[:3].lower() == ".fa"
        if track.getName() not in skipNames and\
          (track.getName() in trackNames or len(trackNames) == 0) and\
          (track.getDist() == "multinomial" or
           track.getDist() == "sparse_multinomial" or
          track.getDist() == "gaussian") and\
          not isFasta:
          try:
              setTrackScale(track, args.numBins, allIntervals, args.noLog)
          except ValueError as e:
              logger.warning("Skipping (non-numeric?) track %s due to: %s" % (
                  track.getName(), str(e)))

    trackList.saveXML(args.outputTracks)
    cleanBedTool(tempBedToolPath)
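
setTrackScale is defined elsewhere in the package, so the snippet below is only a guess at the kind of heuristic such a routine might apply: switch to log scaling when a numeric track's dynamic range is too wide for --numBins linear bins, otherwise pick a linear scale factor. The function, threshold, and return convention are assumptions for illustration, not the toolkit's actual logic:

import math

# Hypothetical sketch only -- NOT the real setTrackScale: choose between a
# linear scale factor and log scaling for a numeric track's values.
def choose_scale(values, num_bins=10, no_log=False):
    lo, hi = min(values), max(values)
    if not no_log and lo > 0 and hi / float(lo) > 10 * num_bins:
        # dynamic range far exceeds what num_bins linear bins can resolve
        return "logScale", math.e
    span = float(hi - lo)
    return "scale", num_bins / span if span else 1.0

print(choose_scale([1, 5, 2000]))      # -> ('logScale', 2.718281828459045)
print(choose_scale([0, 2, 7, 9]))      # -> ('scale', 1.1111111111111112)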
Ejemplo n.º 19
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML",
                        help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed",
                        help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument(
        "inBed",
        help="TE prediction BED file.  State labels"
        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed",
                        help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument(
        "--maxLen",
        help="Maximum length of a masked interval"
        " to fill (inclusive). Use the --delMask option with the"
        " same value if running compareBedStates.py after.",
        type=int,
        default=sys.maxint)
    parser.add_argument("--default",
                        help="Default label to give to masked "
                        "region if no label can be determined",
                        default="0")
    parser.add_argument(
        "--tgts",
        help="Only relabel gaps that "
        "are flanked on both sides by the same state, and this state"
        " is in this comma-separated list. --default used for other"
        " gaps.  If no targets are specified then all states are checked.",
        default=None)
    parser.add_argument(
        "--oneSidedTgts",
        help="Only relabel gaps that "
        "are flanked on at least one side by a state in this comma-"
        "separated list --default used for other gaps",
        default=None)
    parser.add_argument(
        "--onlyDefault",
        help="Add the default state (--default)"
        " to all masked gaps no matter what, i.e. ignoring all other "
        "logic",
        action="store_true",
        default=False)
    parser.add_argument(
        "--cut",
        help="Cut out gaps for masked tracks from the input."
        " By default, the input is expected to come from the HMM "
        "with mask intervals already absent, and will crash with"
        " an assertion error if an overlap is detected.",
        action="store_true",
        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand(
            "cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s"
            % (maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)
    resolvedMasks = 0

    if len(inputIntervals) == 0:
        logger.warning("No input intervals read from %s" % args.inBed)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[
                    2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[
                    1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write(
            "%s\t%d\t%d\t%s\n" %
            (maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    tempOutMaskFile.close()
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" %
                    (args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" %
                    (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" %
                    (tempMergePath2, tempScopePath, args.outBed))

    runShellCommand("rm -f %s" % " ".join([
        tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2, tempScopePath
    ]))
    cleanBedTool(tempBedToolPath)
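
One subtle point in the loop above is the comparison "rightFlank < maskInterval": both sides are plain tuples, so Python compares them lexicographically, chromosome first, then start, then end, which is exactly BED sort order. A tiny demonstration of that scan on toy data (sliced to the first three columns here so the comparison stays over like types):

# Demonstration: lexicographic tuple comparison drives the flank scan above.
intervals = [("chr1", 0, 100, "LTR"), ("chr1", 100, 300, "DNA"),
             ("chr2", 0, 50, "LTR")]
mask = ("chr1", 100, 120)

idx = 0
while intervals[idx][:3] < mask and idx < len(intervals) - 1:
    idx += 1
print(intervals[idx])
# -> ('chr1', 100, 300, 'DNA'): the first interval that does not sort before the mask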
Ejemplo n.º 20
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Add a TSD track (or modify an existing one) based on a "
        "given track")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("tsdTrackDir",
                        help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo",
                        help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("inputTrack", help="Name of track to createTSDs from")
    parser.add_argument("fastaTrack", help="Name of track for fasta sequence")
    parser.add_argument("outputTrack",
                        help="Name of tsd track to add.  Will"
                        " overwrite if it already exists (or append with"
                        " --append option)")
    parser.add_argument("--append",
                        help="Add onto existing TSD track if exists",
                        default=False,
                        action="store_true")
    parser.add_argument("--inPath",
                        help="Use given file instead of inputTrack"
                        " path to generate TSD",
                        default=None)

    ############ TSDFINDER OPTIONS ##############
    parser.add_argument("--min",
                        help="Minimum length of a TSD",
                        default=None,
                        type=int)
    parser.add_argument("--max",
                        help="Maximum length of a TSD",
                        default=None,
                        type=int)
    parser.add_argument("--all",
                        help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour",
                        action="store_true",
                        default=False)
    parser.add_argument("--maxScore",
                        help="Only report matches with given "
                        "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None,
                        type=int)
    parser.add_argument("--left",
                        help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=None,
                        type=int)
    parser.add_argument("--right",
                        help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=None,
                        type=int)
    parser.add_argument("--overlap",
                        help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap",
                        default=None,
                        type=int)
    parser.add_argument("--leftName",
                        help="Name of left TSDs in output Bed",
                        default=None)
    parser.add_argument("--rightName",
                        help="Name of right TSDs in output Bed",
                        default=None)
    parser.add_argument("--id",
                        help="Assign left/right pairs of TSDs a unique"
                        " matching ID",
                        action="store_true",
                        default=False)
    parser.add_argument("--names",
                        help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed",
                        default=None)
    parser.add_argument("--numProc",
                        help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file",
                        type=int,
                        default=1)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # copy out all options for call to tsd finder
    args.tsdFinderOptions = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.tsdFinderOptions += " --logFile %s" % args.logFile
    for option in [
            "min", "max", "all", "maxScore", "left", "right", "overlap",
            "leftName", "rightName", "id", "names", "numProc"
    ]:
        val = getattr(args, option)
        if val is True:
            args.tsdFinderOptions += " --%s" % option
        elif val is not None and val is not False:
            args.tsdFinderOptions += " --%s %s" % (option, val)

    try:
        os.makedirs(args.tsdTrackDir)
    except OSError:
        pass
    if not os.path.isdir(args.tsdTrackDir):
        raise RuntimeError("Unable to find or create tsdTrack dir %s" %
                           args.tsdTrackDir)

    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)
    inputTrack = trackList.getTrackByName(args.inputTrack)
    if inputTrack is None:
        raise RuntimeError("Track %s not found" % args.inputTrack)
    if args.inPath is not None:
        assert os.path.isfile(args.inPath)
        inputTrack.setPath(args.inPath)
    inTrackExt = os.path.splitext(inputTrack.getPath())[1].lower()
    if inTrackExt != ".bb" and inTrackExt != ".bed":
        raise RuntimeError("Track %s has non-bed extension %s" %
                           (args.inputTrack, inTrackExt))

    fastaTrack = trackList.getTrackByName(args.fastaTrack)
    if fastaTrack is None:
        raise RuntimeError("Fasta Track %s not found" % args.fastaTrack)
    faTrackExt = os.path.splitext(fastaTrack.getPath())[1].lower()
    if faTrackExt[:3] != ".fa":
        raise RuntimeError("Fasta Track %s has non-fasta extension %s" %
                           (args.fastaTrack, faTrackExt))

    tsdTrack = outTrackList.getTrackByName(args.outputTrack)
    if tsdTrack is None:
        if args.append is True:
            raise RuntimeError("TSD track %s not found. Cannot append" %
                               (args.outputTrack))
        tsdTrack = Track()
        tsdTrack.name = args.outputTrack
        tsdTrack.path = os.path.join(
            args.tsdTrackDir,
            args.inputTrack + "_" + args.outputTrack + ".bed")

    runTsdFinder(fastaTrack.getPath(), inputTrack.getPath(),
                 tsdTrack.getPath(), args)

    if outTrackList.getTrackByName(tsdTrack.getName()) is None:
        outTrackList.addTrack(tsdTrack)
    outTrackList.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
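
The tsdFinderOptions loop above is a reusable pattern: re-render a chosen subset of already-parsed argparse options as a flag string for a child command. A standalone sketch of that pattern (the option names below are arbitrary examples):

import argparse

# Sketch: re-render selected argparse options as a flag string for a
# subcommand, mirroring the tsdFinderOptions loop above.
def forward_options(args, names):
    parts = []
    for name in names:
        val = getattr(args, name)
        if val is True:                              # store_true flags
            parts.append("--%s" % name)
        elif val is not None and val is not False:   # valued options
            parts.append("--%s %s" % (name, val))
    return " ".join(parts)

parser = argparse.ArgumentParser()
parser.add_argument("--min", type=int, default=None)
parser.add_argument("--all", action="store_true", default=False)
args = parser.parse_args(["--min", "5", "--all"])
print(forward_options(args, ["min", "all"]))
# -> --min 5 --all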
Ejemplo n.º 21
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="fix up track names and sort alphabetically.  easier to do here on xml than at end for pape\
        r.")
    
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    nm = dict()
    nm["hollister"] = "RM-RepBase-Hollister"
    nm["chaux"] = "RM-RepBase-deLaChaux"
    nm["repeat_modeler"] = "RM-RepeatModeler"
    nm["repbase"] = "RM-RepBase"
    nm["repet"] = "REPET"
    nm["ltr_finder"] = "LTR_FINDER"
    nm["ltr_harvest"] = "LTR_Harvest"
    nm["ltr_termini"] = "lastz-Termini"
    nm["lastz-Termini"] = "lastz-LTRTermini"
    nm["tir_termini"] = "lastz-InvTermini"
    nm["irf"] = "IRF"
    nm["palindrome"] = "lastz-Palindrome"
    nm["overlap"] = "lastz-Overlap"
    nm["mitehunter"] = "MITE-Hunter"
    nm["helitronscanner"] = "HelitronScanner"
    nm["cov_80-"] = "lastz-SelfLowId"
    nm["cov_80-90"] = "lastz-SelfMedId"
    nm["cov_90+"] = "lastz-SelfHighId"
    nm["left_peak_80-"] = "lastz-SelfPeakLeftLow"
    nm["lastz-SelfLowLeftPeak"] = nm["left_peak_80-"]
    nm["left_peak_80-90"] = "lastz-SelfPeakLeftMed"
    nm["lastz-SelfMedLeftPeak"] = nm["left_peak_80-90"]
    nm["left_peak_90+"] = "lastz-SelfPeakLeftHigh"
    nm["lastz-SelfHighLeftPeak"] = nm["left_peak_90+"]
    nm["right_peak_80-"] = "lastz-SelfPeakRightLow"
    nm["lastz-SelfLowRightPeak"] = nm["right_peak_80-"]
    nm["right_peak_80-90"] = "lastz-SelfPeakRightMed"
    nm["lastz-SelfMedRightPeak"] = nm["right_peak_80-90"]
    nm["right_peak_90+"] = "lastz-SelfPeakRightHigh"
    nm["lastz-SelfHighRightPeak"] = nm["right_peak_90+"]
    nm["cov_maxPId"] = "lastz-SelfPctMaxId"
    nm["lastz-SelfMaxPctId"] = nm["cov_maxPId"]
    nm["te_domains"] = "TE-Domains"
    nm["fgenesh"] = "Genes"
    nm["genes"] = nm["fgenesh"]
    nm["refseq"] = nm["fgenesh"]
    nm["mrna"] = "mRNA"
    nm["srna"] = "sRNA"
    nm["ortho_depth"] = "Alignment-Depth"
    nm["orthology"] = nm["ortho_depth"]
    nm["chain_depth"] = nm["ortho_depth"]
    nm["alignment_depth"] = nm["ortho_depth"]
    nm["gcpct"] = "GC"
    nm["trf"] = "TRF"
    nm["windowmasker"] = "WindowMasker"
    nm["polyN"] = "Ns"
    nm["phastcons_ce"] = "Conservation"
    nm["phastcons"] = nm["phastcons_ce"]
    nm["PhastCons"] = nm["phastcons_ce"]
    nm["phyloP"] = nm["phastcons_ce"]
    nm["phylop"] = nm["phastcons_ce"] 

    rtracks = dict()
    rtracks["tantan"] = True
    rtracks["polyA"] = True
    rtracks["transposon_psi"] = True
    rtracks["transposonpsi"] = True
    rtracks["repbase_censor"] = True
    rtracks["tsd"] = True
    rtracks["repbase_default"] = True
    rtracks["dustmasker"] = True
       
    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []

    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name not in rtracks:
            if track.name in nm:
                track.name = nm[track.name]
            else:
                logger.warning("Did not map track %s" % track.name)
            outList.append(track)                        
        else:
            logger.warning("Deleted track %s" % track.name)


    # sort the list
    def sortComp(x):
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist == "mask":
            return "zzzz" + lname
        else:
            return lname
        
    outList = sorted(outList, key = lambda track : sortComp(track))

    for track in outList:
        outTracks.addTrack(track)

    outTracks.saveXML(args.outTracksInfo)
    
    cleanBedTool(tempBedToolPath)    
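
sortComp above steers the ordering purely by gluing a short prefix onto the lowercased track name: more leading "a"s sort earlier, and "zzzz" pushes mask-style tracks to the end. The same trick in isolation (the track names below are made up):

# Sketch: custom sort order via string prefixes, as sortComp does above.
def sort_key(name, is_mask=False):
    lname = name.lower()
    if name == "RM-RepeatModeler":
        return "aaaaa" + lname      # always first
    if "RM" in name:
        return "aaaa" + lname       # other RepeatMasker-style tracks next
    if is_mask:
        return "zzzz" + lname       # mask tracks last
    return lname

names = ["GC", "RM-RepBase", "RM-RepeatModeler", "Ns"]
print(sorted(names, key=lambda n: sort_key(n, is_mask=(n == "Ns"))))
# -> ['RM-RepeatModeler', 'RM-RepBase', 'GC', 'Ns']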
Ejemplo n.º 22
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="fix up track names and sort alphabetically.  Easier to do"
        " here on xml than at end for paper.")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo",
                        help="Path to write modified tracks XML")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    nm = dict()
    nm["hollister"] = "RM-RepBase-Hollister"
    nm["chaux"] = "RM-RepBase-deLaChaux"
    nm["repeat_modeler"] = "RM-RepeatModeler"
    nm["repbase"] = "RM-RepBase"
    nm["repet"] = "REPET"
    nm["ltr_finder"] = "LTR_FINDER"
    nm["ltr_harvest"] = "LTR_Harvest"
    nm["ltr_termini"] = "lastz-Termini"
    nm["lastz-Termini"] = "lastz-LTRTermini"
    nm["tir_termini"] = "lastz-InvTermini"
    nm["irf"] = "IRF"
    nm["palindrome"] = "lastz-Palindrome"
    nm["overlap"] = "lastz-Overlap"
    nm["mitehunter"] = "MITE-Hunter"
    nm["helitronscanner"] = "HelitronScanner"
    nm["cov_80-"] = "lastz-SelfLowId"
    nm["cov_80-90"] = "lastz-SelfMedId"
    nm["cov_90+"] = "lastz-SelfHighId"
    nm["left_peak_80-"] = "lastz-SelfPeakLeftLow"
    nm["lastz-SelfLowLeftPeak"] = nm["left_peak_80-"]
    nm["left_peak_80-90"] = "lastz-SelfPeakLeftMed"
    nm["lastz-SelfMedLeftPeak"] = nm["left_peak_80-90"]
    nm["left_peak_90+"] = "lastz-SelfPeakLeftHigh"
    nm["lastz-SelfHighLeftPeak"] = nm["left_peak_90+"]
    nm["right_peak_80-"] = "lastz-SelfPeakRightLow"
    nm["lastz-SelfLowRightPeak"] = nm["right_peak_80-"]
    nm["right_peak_80-90"] = "lastz-SelfPeakRightMed"
    nm["lastz-SelfMedRightPeak"] = nm["right_peak_80-90"]
    nm["right_peak_90+"] = "lastz-SelfPeakRightHigh"
    nm["lastz-SelfHighRightPeak"] = nm["right_peak_90+"]
    nm["cov_maxPId"] = "lastz-SelfPctMaxId"
    nm["lastz-SelfMaxPctId"] = nm["cov_maxPId"]
    nm["te_domains"] = "TE-Domains"
    nm["fgenesh"] = "Genes"
    nm["genes"] = nm["fgenesh"]
    nm["refseq"] = nm["fgenesh"]
    nm["mrna"] = "mRNA"
    nm["srna"] = "sRNA"
    nm["ortho_depth"] = "Alignment-Depth"
    nm["orthology"] = nm["ortho_depth"]
    nm["chain_depth"] = nm["ortho_depth"]
    nm["alignment_depth"] = nm["ortho_depth"]
    nm["gcpct"] = "GC"
    nm["trf"] = "TRF"
    nm["windowmasker"] = "WindowMasker"
    nm["polyN"] = "Ns"
    nm["phastcons_ce"] = "Conservation"
    nm["phastcons"] = nm["phastcons_ce"]
    nm["PhastCons"] = nm["phastcons_ce"]
    nm["phyloP"] = nm["phastcons_ce"]
    nm["phylop"] = nm["phastcons_ce"]

    rtracks = dict()
    rtracks["tantan"] = True
    rtracks["polyA"] = True
    rtracks["transposon_psi"] = True
    rtracks["transposonpsi"] = True
    rtracks["repbase_censor"] = True
    rtracks["tsd"] = True
    rtracks["repbase_default"] = True
    rtracks["dustmasker"] = True

    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []

    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name not in rtracks:
            if track.name in nm:
                track.name = nm[track.name]
            else:
                logger.warning("Did not map track %s" % track.name)
            outList.append(track)
        else:
            logger.warning("Deleted track %s" % track.name)

    # sort the list
    def sortComp(x):
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist == "mask":
            return "zzzz" + lname
        else:
            return lname

    outList = sorted(outList, key=lambda track: sortComp(track))

    for track in outList:
        outTracks.addTrack(track)

    outTracks.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
Ejemplo n.º 23
0
def runTsd(args, tempTracksInfo):
    """ run addTsdTrack on termini and chaux to generate tsd track"""
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)

    tempFiles = []
    tsdInputFiles = []
    tsdInputTracks = []

    # preprocess termini
    lastzTracks = [
        origTrackList.getTrackByName(args.ltr_termini),
        origTrackList.getTrackByName(args.tir)
    ]
    for terminiTrack in lastzTracks:
        if terminiTrack is not None:
            inFile = terminiTrack.getPath()
            fillFile = getLocalTempPath("Temp_fill", ".bed")
            tempBed = None
            if inFile[-3:] == ".bb":
                tempBed = getLocalTempPath("Temp_termini", ".bed")
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed))
                inFile = tempBed
            runShellCommand("fillTermini.py %s %s" % (inFile, fillFile))
            tsdInputFiles.append(fillFile)
            tsdInputTracks.append(terminiTrack.getName())
            tempFiles.append(fillFile)
            if tempBed is not None:
                runShellCommand("rm -f %s" % tempBed)
        else:
            logger.warning("Could not find termini track")

    # add repeat_modeler
    repeat_modelerTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if repeat_modelerTrack is not None:
        tsdInputFiles.append(repeat_modelerTrack.getPath())
        tsdInputTracks.append(repeat_modelerTrack.getName())

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    assert len(tsdInputFiles) == len(tsdInputTracks)
    for i in xrange(len(tsdInputFiles)):
        optString = ""
        if i > 0:
            optString += " --append"
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable elements
        # Wicker et al. 2007)
        if tsdInputTracks[i] == args.repeat_modeler:
            optString += " --names LINE,SINE,Unknown"
            optString += " --maxScore 20"
            optString += " --left 20"
            optString += " --right 20"
            optString += " --min 5"
            optString += " --max 20"
            optString += " --overlap 20"
        elif tsdInputTracks[i] == args.ltr_termini:
            optString += " --maxScore 3"
            optString += " --left 8"
            optString += " --right 8"
            optString += " --min 3"
            optString += " --max 6"
        elif tsdInputTracks[i] == args.tir:
            optString += " --maxScore 3"
            optString += " --left 15"
            optString += " --right 15"
            optString += " --min 3"
            optString += " --max 12"

        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        runShellCommand(
            "addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d" %
            (tempTracksInfo, args.cleanTrackPath, tempXMLOut,
             tsdInputTracks[i], args.sequence, args.tsd, tsdInputFiles[i],
             optString, args.logOpString, args.numProc))

        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    for i in xrange(len(tempFiles)):
        runShellCommand("rm %s" % tempFiles[i])
Ejemplo n.º 24
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Add a TSD track (or modify an existing one) based on a "
        "given track")
    
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("tsdTrackDir", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("inputTrack", help="Name of track to createTSDs from")
    parser.add_argument("fastaTrack", help="Name of track for fasta sequence")
    parser.add_argument("outputTrack", help="Name of tsd track to add.  Will"
                        " overwrite if it already exists (or append with"
                        " --append option)")
    parser.add_argument("--append", help="Add onto existing TSD track if exists",
                        default=False, action="store_true")
    parser.add_argument("--inPath", help="Use given file instead of inputTrack"
                        " path to generate TSD", default=None)

    ############ TSDFINDER OPTIONS ##############
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--all", help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour", action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                            "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=None, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=None, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap", default=None, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output Bed",
                        default=None)
    parser.add_argument("--rightName", help="Name of right TSDs in output Bed",
                        default=None)
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique"
                        " matching ID", action="store_true", default=False)
    parser.add_argument("--names", help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed", default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file", type=int, default=1)
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # copy out all options for call to tsd finder
    args.tsdFinderOptions = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.tsdFinderOptions += " --logFile %s" % args.logFile
    for option in ["min", "max", "all", "maxScore", "left", "right", "overlap",
                   "leftName", "rightName", "id", "names", "numProc"]:
        val = getattr(args, option)
        if val is True:
            args.tsdFinderOptions += " --%s" % option
        elif val is not None and val is not False:
            args.tsdFinderOptions += " --%s %s" % (option, val)
            
    try:
        os.makedirs(args.tsdTrackDir)
    except OSError:
        pass
    if not os.path.isdir(args.tsdTrackDir):
        raise RuntimeError("Unable to find or create tsdTrack dir %s" %
                           args.tsdTrackDir)

    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)
    inputTrack = trackList.getTrackByName(args.inputTrack)
    if inputTrack is None:
        raise RuntimeError("Track %s not found" % args.inputTrack)
    if args.inPath is not None:
        assert os.path.isfile(args.inPath)
        inputTrack.setPath(args.inPath)
    inTrackExt = os.path.splitext(inputTrack.getPath())[1].lower()
    if inTrackExt != ".bb" and inTrackExt != ".bed":
        raise RuntimeError("Track %s has non-bed extension %s" % (
            args.inputTrack, inTrackExt))

    fastaTrack = trackList.getTrackByName(args.fastaTrack)
    if fastaTrack is None:
        raise RuntimeError("Fasta Track %s not found" % args.fastaTrack)
    faTrackExt = os.path.splitext(fastaTrack.getPath())[1].lower()
    if faTrackExt[:3] != ".fa":
        raise RuntimeError("Fasta Track %s has non-fasta extension %s" % (
            args.fastaTrack, faTrackExt))

    tsdTrack = outTrackList.getTrackByName(args.outputTrack)
    if tsdTrack is None:
        if args.append is True:
            raise RuntimeError("TSD track %s not found. Cannot append" % (
                args.outputTrack))
        tsdTrack = Track()
        tsdTrack.name = args.outputTrack
        tsdTrack.path = os.path.join(args.tsdTrackDir, args.inputTrack + "_" +
                                     args.outputTrack + ".bed")

    runTsdFinder(fastaTrack.getPath(), inputTrack.getPath(),
                  tsdTrack.getPath(), args)

    if outTrackList.getTrackByName(tsdTrack.getName()) is None:
        outTrackList.addTrack(tsdTrack)
    outTrackList.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)