Esempio n. 1
0
def main(argv=None):
    """Write a copy of a BED file whose score (or, with --name, ID)
    column is filled in from the values of a single annotation track
    over each interval."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name", help="Set ID field (column 4 instead of 5)",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    bedToolSession = initBedTool()

    # locate the requested annotation track in the tracks file
    fullList = TrackList(args.tracksInfo)
    annTrack = fullList.getTrackByName(args.track)
    if annTrack is None:
        raise RuntimeError("Can't find track %s" % args.track)

    # save a one-track XML so the TrackList/TrackData interface can be
    # reused without loading any of the other tracks
    oneTrackXml = getLocalTempPath("Temp_secScore", ".bed")
    fullList.trackList = [annTrack]
    fullList.saveXML(oneTrackXml)

    outFile = open(args.outBed, "w")

    # the trackData interface cannot cherry-pick intervals, so merge the
    # input up and go through the segmentation interface instead
    filled, merged = fillGaps(args.inBed)

    # load just the single track over the merged regions
    trackData = TrackData()
    logger.info("loading track %s" % oneTrackXml)
    trackData.loadTrackData(oneTrackXml, merged, segmentIntervals=filled,
                            applyMasking=False)

    # emit the annotated intervals
    writeAnnotatedIntervals(trackData, filled, merged, outFile, args)

    runShellCommand("rm -f %s" % oneTrackXml)
    outFile.close()
    cleanBedTool(bedToolSession)
Esempio n. 2
0
def main(argv=None):
    """Write scaled copies of every track that carries a scaling
    attribute (scale / logScale / shift / delta), optionally restricted
    by --tracks and --skip, into the given output directory."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml function using some simple heuristics. ")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("chromSizes", help="2-column chrom sizes file as needed"
                        " by bedGraphToBigWig")
    parser.add_argument("queryBed", help="Region(s) to apply scaling to")
    parser.add_argument("outputDir", help="Output directory")
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all tracks with a scaling"
                        " attribute are processed", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    bedToolSession = initBedTool()

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)

    # optional include / exclude name filters (empty include == all)
    wanted = args.tracks.split(",") if args.tracks is not None else []
    excluded = args.skip.split(",") if args.skip is not None else []

    mergedIntervals = getMergedBedIntervals(args.queryBed)

    data = TrackData()
    data.loadTrackData(args.tracksInfo, mergedIntervals)

    for track in data.getTrackList():
        name = track.getName()
        if name in excluded:
            continue
        if wanted and name not in wanted:
            continue
        # only tracks defining at least one scaling attribute get written
        hasScaling = (track.getScale() is not None or
                      track.getLogScale() is not None or
                      track.getShift() is not None or
                      track.getDelta() is True)
        if hasScaling:
            logger.info("Writing scaled track %s" % name)
            writeScaledTrack(data, track, args)

    cleanBedTool(bedToolSession)
def main(argv=None):
    """Annotate a BED file's score (or ID with --name) column from the
    average value of a chosen annotation track over each interval."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name",
                        help="Set ID field (column 4 instead of 5)",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    bedToolHandle = initBedTool()

    # pull the requested track out of the full track list
    allTracks = TrackList(args.tracksInfo)
    chosen = allTracks.getTrackByName(args.track)
    if chosen is None:
        raise RuntimeError("Can't find track %s" % args.track)

    # write a single-track XML so only this track gets loaded below
    miniListPath = getLocalTempPath("Temp_secScore", ".bed")
    allTracks.trackList = [chosen]
    allTracks.saveXML(miniListPath)

    outputFile = open(args.outBed, "w")

    # merge the input intervals and use the segmentation interface,
    # since trackData cannot cherry-pick individual intervals
    segIntervals, unionIntervals = fillGaps(args.inBed)

    dataStore = TrackData()
    logger.info("loading track %s" % miniListPath)
    dataStore.loadTrackData(miniListPath,
                            unionIntervals,
                            segmentIntervals=segIntervals,
                            applyMasking=False)

    # finally, write the annotated output
    writeAnnotatedIntervals(dataStore, segIntervals, unionIntervals,
                            outputFile, args)

    runShellCommand("rm -f %s" % miniListPath)
    outputFile.close()
    cleanBedTool(bedToolHandle)
Esempio n. 4
0
def main(argv=None):
    """Write scaled versions of all tracks defining a scaling attribute
    (scale / logScale / shift / delta) into args.outputDir.

    Tracks may be restricted with --tracks and excluded with --skip.
    Fix: the original nested the scaling `if` with inconsistent
    indentation (its body sat at the same depth as the outer condition's
    continuation lines), an error-prone construct; the selection is now
    two explicit, behavior-identical boolean conditions.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml function using some simple heuristics. ")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("chromSizes", help="2-column chrom sizes file as needed"
                        " by bedGraphToBigWig")
    parser.add_argument("queryBed", help="Region(s) to apply scaling to")
    parser.add_argument("outputDir", help="Output directory")
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all tracks with a scaling"
                        " attribute are processed", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)

    # optional include / exclude lists (empty include list means "all")
    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")

    mergedIntervals = getMergedBedIntervals(args.queryBed)

    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals)
    trackList = trackData.getTrackList()

    for track in trackList:
        # name passes the include/exclude filters
        selected = (track.getName() not in skipNames and
                    (track.getName() in trackNames or len(trackNames) == 0))
        # track defines at least one scaling attribute
        scaled = (track.getScale() is not None or
                  track.getLogScale() is not None or
                  track.getShift() is not None or
                  track.getDelta() is True)
        if selected and scaled:
            logger.info("Writing scaled track %s" % track.getName())
            writeScaledTrack(trackData, track, args)

    cleanBedTool(tempBedToolPath)
Esempio n. 5
0
def main(argv=None):
    """Run the unit-test suite and return the count of failed or
    errored tests (0 on full success)."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Run unit tests")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # run every suite and summarize the outcome as a single exit count
    result = unittest.TextTestRunner().run(allSuites())
    return len(result.failures) + len(result.errors)
Esempio n. 6
0
def main(argv=None):
    """Compute and set scale attributes for the numeric tracks of a
    tracks XML (via setTrackScale) and save the modified XML.

    Only tracks with a multinomial / sparse_multinomial / gaussian
    distribution are processed; fasta tracks and --skip'ed tracks are
    excluded, and --tracks restricts the set further.
    Fix: removed a dead ``outTrackList = copy.deepcopy(trackList)`` --
    the deep copy was never used (tracks are modified in place and
    ``trackList`` itself is saved); also flattened the multi-line
    backslash-continued condition into guard clauses.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml function using some simple heuristics. ")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outputTracks", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian"
                        " distribution will be processed.", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)
    parser.add_argument("--noLog", help="Never use log scaling",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")

    trackList = TrackList(args.tracksInfo)

    allIntervals = getMergedBedIntervals(args.allBed)

    for track in trackList:
        # fasta tracks hold sequence, not numbers: never scale them
        trackExt = os.path.splitext(track.getPath())[1]
        isFasta = len(trackExt) >= 3 and trackExt[:3].lower() == ".fa"
        if track.getName() in skipNames or isFasta:
            continue
        if len(trackNames) > 0 and track.getName() not in trackNames:
            continue
        if track.getDist() not in ("multinomial", "sparse_multinomial",
                                   "gaussian"):
            continue
        try:
            setTrackScale(track, args.numBins, allIntervals, args.noLog)
        except ValueError as e:
            # non-numeric tracks raise here; skip them with a warning
            logger.warning("Skipping (non-numeric?) track %s due to: %s" % (
                track.getName(), str(e)))

    trackList.saveXML(args.outputTracks)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """Fill masked-out gaps of an HMM prediction BED with state labels.

    Masked intervals are collected from the mask tracks of the tracks
    XML, merged, and (when no longer than --maxLen) assigned a state by
    precedence:
      * --onlyDefault: always the --default label;
      * both flanks perfectly abut the gap and agree: the flanking state
        (optionally restricted to --tgts);
      * one abutting flank whose state is in --oneSidedTgts: that state;
      * otherwise the --default label.
    The filled intervals are concatenated with the input prediction,
    sorted, and clipped to the allBed scope to produce outBed.
    Returns 0 early (copying input to output) when there is nothing to
    fill.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file.  State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps.  If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                         default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make target-state sets; a state may not appear in both lists
    # (asserted below)
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list (only its mask tracks are used here)
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed, optionally first cutting mask overlaps out of it
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol = 4, sort = True)
    if args.cut is True:
        # cutOutMaskIntervals produced a temp file; remove it once read
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        # nothing to fill: output is a verbatim copy of the input
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0


    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        # NOTE: the "\t"s are literal tab characters embedded in the shell
        # command string so awk emits tab-separated 3-column BED lines
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort = True)
    # NOTE(review): resolvedMasks is never updated or read afterwards
    resolvedMasks = 0

    # NOTE(review): this branch is unreachable -- the empty-input case
    # already returned above (and the message talks about mask tracks,
    # not input intervals)
    if len(inputIntervals) == 0:
        logger.warning("No mask tracks located in %s" % args.tracksXML)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    # sweep the sorted mask intervals and sorted input intervals together
    for maskIdx, maskInterval in enumerate(maskedIntervals):
        # skip gaps too long to fill
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank: advance to the first input interval
        # not ordered before the mask (tuples compare lexicographically,
        # i.e. by chrom then start then remaining fields)
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank: the input interval just before rightFlank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                # a non-abutting flank must not overlap the mask (the HMM
                # input is expected to be mask-free unless --cut was used)
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval (precedence per docstring)
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (maskInterval[0], maskInterval[1],
                                                    maskInterval[2], maskState))


    tempOutMaskFile.close()
    # combine input + filled masks, sort, then clip to the allBed scope
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2, tempScopePath,
                                                       args.outBed))

    # clean up all temporary files
    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask, tempMergePath1,
                                      tempMergePath2, tempScopePath]))
    cleanBedTool(tempBedToolPath)
Esempio n. 8
0
def main(argv=None):
    """Add (or modify) a TSD track in a tracks XML by running tsdFinder
    on an input BED track against a fasta sequence track.

    Builds a tsdFinder option string from the pass-through CLI options,
    validates the input/fasta tracks, creates or reuses the output TSD
    track entry, runs the finder, and saves the updated XML.
    Fix: the bare ``except:`` around ``os.makedirs`` (which also swallowed
    KeyboardInterrupt/SystemExit) is narrowed to ``except OSError:``; the
    isdir check below still catches a genuinely missing directory.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Add a TSD track (or modify an existing one) based on a "
        "given track")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("tsdTrackDir", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("inputTrack", help="Name of track to createTSDs from")
    parser.add_argument("fastaTrack", help="Name of track for fasta sequence")
    parser.add_argument("outputTrack", help="Name of tsd track to add.  Will"
                        " overwrite if it already exists (or append with"
                        " --append option)")
    parser.add_argument("--append", help="Add onto existing TSD track if exists",
                        default=False, action="store_true")
    parser.add_argument("--inPath", help="Use given file instead of inputTrack"
                        " path to generate TSD", default=None)

    ############ TSDFINDER OPTIONS ##############
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--all", help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour", action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                        "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=None, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=None, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap", default=None, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output Bed",
                        default=None)
    parser.add_argument("--rightName", help="Name of right TSDs in output Bed",
                        default=None)
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique"
                        " matching ID", action="store_true", default=False)
    parser.add_argument("--names", help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed", default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file", type=int, default=1)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # copy out all options for call to tsd finder: flags are passed bare,
    # valued options are passed with their value
    args.tsdFinderOptions = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.tsdFinderOptions += " --logFile %s" % args.logFile
    for option in ["min", "max", "all", "maxScore", "left", "right", "overlap",
                   "leftName", "rightName", "id", "names", "numProc"]:
        val = getattr(args, option)
        if val is True:
            args.tsdFinderOptions += " --%s" % option
        elif val is not None and val is not False:
            args.tsdFinderOptions += " --%s %s" % (option, val)

    # create the output directory; an already-existing directory is fine
    # (verified by the isdir check below)
    try:
        os.makedirs(args.tsdTrackDir)
    except OSError:
        pass
    if not os.path.isdir(args.tsdTrackDir):
        raise RuntimeError("Unable to find or create tsdTrack dir %s" %
                           args.tsdTrackDir)

    # validate the input track: must exist and be BED-formatted
    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)
    inputTrack = trackList.getTrackByName(args.inputTrack)
    if inputTrack is None:
        raise RuntimeError("Track %s not found" % args.inputTrack)
    if args.inPath is not None:
        assert os.path.isfile(args.inPath)
        inputTrack.setPath(args.inPath)
    inTrackExt = os.path.splitext(inputTrack.getPath())[1].lower()
    if inTrackExt != ".bb" and inTrackExt != ".bed":
        raise RuntimeError("Track %s has non-bed extension %s" % (
            args.inputTrack, inTrackExt))

    # validate the fasta track: extension must start with ".fa"
    fastaTrack = trackList.getTrackByName(args.fastaTrack)
    if fastaTrack is None:
        raise RuntimeError("Fasta Track %s not found" % args.fastaTrack)
    faTrackExt = os.path.splitext(fastaTrack.getPath())[1].lower()
    if faTrackExt[:3] != ".fa":
        raise RuntimeError("Fasta Track %s has non-fasta extension %s" % (
            args.fastaTrack, faTrackExt))

    # reuse an existing TSD track entry or create a fresh one
    tsdTrack = outTrackList.getTrackByName(args.outputTrack)
    if tsdTrack is None:
        if args.append is True:
            raise RuntimeError("TSD track %s not found. Cannot append" % (
                args.outputTrack))
        tsdTrack = Track()
        tsdTrack.name = args.outputTrack
        tsdTrack.path = os.path.join(args.tsdTrackDir, args.inputTrack + "_" +
                                     args.outputTrack + ".bed")

    runTsdFinder(fastaTrack.getPath(), inputTrack.getPath(),
                 tsdTrack.getPath(), args)

    if outTrackList.getTrackByName(tsdTrack.getName()) is None:
        outTrackList.addTrack(tsdTrack)
    outTrackList.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
Esempio n. 9
0
def main(argv=None):
    """Remove overlaps from a BED file by truncating each interval so it
    starts where the previous kept interval on the same chromosome ends;
    intervals that become empty are dropped.  Result goes to stdout.

    With --rm, intervals labeled TE (per rm2State.sh) are protected so
    that non-TE intervals never cut into them.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Filter overlapping intervals out")
    parser.add_argument("inputBed", help="Bed file to filter")
    parser.add_argument("--bed12", help="Use bed12 exons instead of start/end"
                        " if present (equivalent to running bed12ToBed6 on"
                        " input first).", action="store_true", default=False)
    parser.add_argument("--rm", help="Make sure intervals that are labeled as TE "
                        "by rm2State.sh script are never cut by ones that are not",
                        default=False, action='store_true')

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    assert os.path.isfile(args.inputBed)
    tempBedToolPath = initBedTool()

    # do the --rm filter by splitting into TE / non-TE,
    # then removing everything in non-TE that overlaps TE,
    # then adding the remainder back to TE
    inputPath = args.inputBed
    if args.rm is True:
        # tempPath is reused: first holds the TE states, then the non-TE
        tempPath = getLocalTempPath("Temp_", ".bed")
        tePath = getLocalTempPath("Temp_te_", ".bed")
        runShellCommand("rm2State.sh %s |grep TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %(
            args.inputBed, tempPath, tePath))
        otherPath = getLocalTempPath("Temp_other_", ".bed")
        runShellCommand("rm2State.sh %s |grep -v TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %(
            args.inputBed, tempPath, otherPath))
        if os.path.getsize(tePath) > 0  and\
           os.path.getsize(otherPath) > 0:
            # cut the TE regions out of the non-TE set, then recombine
            filterPath = getLocalTempPath("Temp_filter_", ".bed")
            runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
                otherPath, tePath, filterPath))
            inputPath = getLocalTempPath("Temp_input_", ".bed")
            runShellCommand("cat %s %s | sortBed > %s" % (
                tePath, filterPath, inputPath))
            runShellCommand("rm -f %s" % filterPath)
        runShellCommand("rm -f %s %s %s" % (tePath, otherPath, tempPath))

    bedIntervals = BedTool(inputPath).sort()
    if args.bed12 is True:
        # expand block structure into individual bed6 exons
        bedIntervals = bedIntervals.bed6()

    prevInterval = None

    # this code has been way too buggy for something so simple:
    # keep an extra list to double-check the output even though it's a
    # waste of time and space
    sanity = []

    for interval in bedIntervals:
        if (prevInterval is not None and
            interval.chrom == prevInterval.chrom and
            interval.start < prevInterval.end):
            # overlap with the previously kept interval: truncate on the left
            logger.debug("Replace %d bases of \n%s with\n%s" % (
                prevInterval.end - interval.start,
                str(interval), str(prevInterval)))
            interval.start = prevInterval.end

        if interval.end > interval.start:
            # still non-empty after truncation: emit and remember it
            sys.stdout.write("%s" % str(interval))
            sanity.append(interval)
            prevInterval = interval

    # post-condition: emitted intervals never overlap on a chromosome
    # (xrange: this file targets Python 2)
    for i in xrange(len(sanity) - 1):
        if sanity[i].chrom == sanity[i+1].chrom:
            assert sanity[i+1].start >= sanity[i].end
    cleanBedTool(tempBedToolPath)
    if args.inputBed != inputPath:
        runShellCommand("rm -f %s" % inputPath)
Esempio n. 10
0
def main(argv=None):
    """Rename tracks in a tracks XML via a fixed mapping, drop
    blacklisted tracks, sort the remainder into presentation order, and
    save the result.

    Bug fixed: ``sortComp`` compared the bound method ``x.getDist``
    against a string (always False) instead of calling it, so mask
    tracks were never pushed to the end of the ordering.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="fix up track names and sort alphabetically.  easier to do here on xml than at end for pape\
        r.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    # old-name -> new-name mapping (several aliases share one target)
    nm = dict()
    nm["hollister"] = "RM-RepBase-Hollister"
    nm["chaux"] = "RM-RepBase-deLaChaux"
    nm["repeat_modeler"] = "RM-RepeatModeler"
    nm["repbase"] = "RM-RepBase"
    nm["repet"] = "REPET"
    nm["ltr_finder"] = "LTR_FINDER"
    nm["ltr_harvest"] = "LTR_Harvest"
    nm["ltr_termini"] = "lastz-Termini"
    nm["lastz-Termini"] = "lastz-LTRTermini"
    nm["tir_termini"] = "lastz-InvTermini"
    nm["irf"] = "IRF"
    nm["palindrome"] = "lastz-Palindrome"
    nm["overlap"] = "lastz-Overlap"
    nm["mitehunter"] = "MITE-Hunter"
    nm["helitronscanner"] = "HelitronScanner"
    nm["cov_80-"] = "lastz-SelfLowId"
    nm["cov_80-90"] = "lastz-SelfMedId"
    nm["cov_90+"] = "lastz-SelfHighId"
    nm["left_peak_80-"] = "lastz-SelfPeakLeftLow"
    nm["lastz-SelfLowLeftPeak"] = nm["left_peak_80-"]
    nm["left_peak_80-90"] = "lastz-SelfPeakLeftMed"
    nm["lastz-SelfMedLeftPeak"] = nm["left_peak_80-90"]
    nm["left_peak_90+"] = "lastz-SelfPeakLeftHigh"
    nm["lastz-SelfHighLeftPeak"] = nm["left_peak_90+"]
    nm["right_peak_80-"] = "lastz-SelfPeakRightLow"
    nm["lastz-SelfLowRightPeak"] = nm["right_peak_80-"]
    nm["right_peak_80-90"] = "lastz-SelfPeakRightMed"
    nm["lastz-SelfMedRightPeak"] = nm["right_peak_80-90"]
    nm["right_peak_90+"] = "lastz-SelfPeakRightHigh"
    nm["lastz-SelfHighRightPeak"] = nm["right_peak_90+"]
    nm["cov_maxPId"] = "lastz-SelfPctMaxId"
    nm["lastz-SelfMaxPctId"] = nm["cov_maxPId"]
    nm["te_domains"] = "TE-Domains"
    nm["fgenesh"] = "Genes"
    nm["genes"] = nm["fgenesh"]
    nm["refseq"] = nm["fgenesh"]
    nm["mrna"] = "mRNA"
    nm["srna"] = "sRNA"
    nm["ortho_depth"] = "Alignment-Depth"
    nm["orthology"] = nm["ortho_depth"]
    nm["chain_depth"] = nm["ortho_depth"]
    nm["alignment_depth"] = nm["ortho_depth"]
    nm["gcpct"] = "GC"
    nm["trf"] = "TRF"
    nm["windowmasker"] = "WindowMasker"
    nm["polyN"] = "Ns"
    nm["phastcons_ce"] = "Conservation"
    nm["phastcons"] = nm["phastcons_ce"]
    nm["PhastCons"] = nm["phastcons_ce"]
    nm["phyloP"] = nm["phastcons_ce"]
    nm["phylop"] = nm["phastcons_ce"]

    # tracks to delete outright (value unused; dict acts as a set)
    rtracks = dict()
    rtracks["tantan"] = True
    rtracks["polyA"] = True
    rtracks["transposon_psi"] = True
    rtracks["transposonpsi"] = True
    rtracks["repbase_censor"] = True
    rtracks["tsd"] = True
    rtracks["repbase_default"] = True
    rtracks["dustmasker"] = True

    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []

    # rename / filter both the regular and the mask tracks
    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name not in rtracks:
            if track.name in nm:
                track.name = nm[track.name]
            else:
                logger.warning("Did not map track %s" % track.name)
            outList.append(track)
        else:
            logger.warning("Deleted track %s" % track.name)


    # sort key: RepeatModeler first, then other RM-*, REPET, a few
    # special names, normal tracks alphabetically, mask tracks last
    def sortComp(x):
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist() == "mask":
            # BUG FIX: was ``x.getDist == "mask"`` (method object compared
            # to a string, always False)
            return "zzzz" + lname
        else:
            return lname

    outList = sorted(outList, key=sortComp)

    for track in outList:
        outTracks.addTrack(track)

    outTracks.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
Esempio n. 11
0
def main(argv=None):
    """Train a teHMM (or supervised CFG) from genome annotation tracks.

    Validates the (many) command-line options, reads training and optional
    segment intervals, loads track data, runs --reps training replicates in
    parallel, and saves the replicate with the best log likelihood to
    <outputModel> (optionally saving the rest with --saveAllReps).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train model on.  If --supervised "
                        "is used, the names in this bed file will be treated "
                        "as the true annotation (otherwise it is only used for "
                        "interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type = int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type = int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--cfg", help="Use Context Free Grammar insead of "
                        "HMM.  Only works with --supervised for now",
                        action = "store_true", default = False)
    parser.add_argument("--saPrior", help="Confidence in self alignment "
                        "track for CFG.  Probability of pair emission "
                        "is multiplied by this number if the bases are aligned"
                        " and its complement if bases are not aligned. Must"
                        " be between [0,1].", default=0.95, type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as pair-emitors"
                        " for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after each training iteration"
                        " (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed)" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after each training iteration "
                        "(unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed.)" ,
                        default = None)
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing an"
                        " emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as "
                        "single columns for HMM (ie as created with "
                        "segmentTracks.py).  IMPORTANT: this file must cover "
                        "the same regions as the traininBed file. Unless in "
                        "supervised mode, probably best to use same bed file "
                        " as both traingBed and --segment argument.  Otherwise"
                        " use intersectBed to make sure the overlap is exact",
                        default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --rep) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    # ---- option validation / normalization ----
    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    # bug fix: original tested "args.fixEm is not None", which is always True
    # for a store_true flag (its value is False, never None).  That made this
    # whole condition unconditionally true, so the RuntimeError below fired
    # whenever --cfg was set, even with none of these options given.  Test
    # the flag's value instead, mirroring "args.fixTrans is True".
    if args.initTransProbs is not None or args.fixTrans is True or\
      args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--transProbs, --fixTrans, --emProbs, --fixEm "
                               "are not currently compatible with --cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
      and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
      args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction with"
                           " --initTransProbs")
    if args.emRandRange is not None:
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        # narrowed from a bare "except:" -- only parse failures should be
        # translated into the user-facing error (a bare except would also
        # catch KeyboardInterrupt/SystemExit)
        except (AssertionError, ValueError):
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here. now overriden with above options
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except:
            raise RuntimeError("bed file passed with --segments option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    if args.segLen <= 0:
        args.segLen = None
    # NOTE(review): args.segLen may be None at this point; "None > 0" is
    # False under Python 2 (this codebase uses sys.maxint / xrange) but
    # would raise TypeError on Python 3
    if args.segLen > 0 and args.segLen != 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction).  Values > 1 may cause bias.")

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overrided by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overrided by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model: one seed per replicate; the first honours --seed
    # so runs are reproducible
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)]

    def trainClosure(randomSeed):
        # worker invoked (possibly in a thread) once per replicate
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans, truthIntervals=truthIntervals,
                          args=args)
    
    modelList = runParallelShellCommands(argList=seeds, numProc = args.numThreads,
                                         execFunction = trainClosure,
                                         useThreads = True)

    # select best model (LOGZERO sentinel guarantees any real score wins)
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] > bestModel[1]:
            bestModel = curModel
        if curModel[1] is not None:
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]
        
    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)                
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
Esempio n. 12
0
def main(argv=None):
    """Relabel a predicted BED's state names to best match a target annotation.

    Builds a confusion matrix between prediction and truth (base-level by
    default, interval-level with --intThresh), derives a state-name mapping
    (greedy F1 by default, FDR cutoff with --fdr, or the old biggest-overlap
    logic with --old), then writes the relabeled BED, optionally updates a
    model in place, and optionally renders heatmaps.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Given two bed files: a prediction and a true (or target)"
        " annotation, re-label the prediction's state names so that they "
        " best match the true annotation.  Usees same logic as "
        " compareBedStates.py for determining accuracy")

    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label. ")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col",
                        help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4,
                        type=int)
    parser.add_argument(
        "--intThresh",
        help="Threshold to consider interval from"
        " tgtBed covered by predBed.  If not specified, then base"
        " level statistics will be used. Value in range (0,1]",
        type=float,
        default=None)
    parser.add_argument("--noFrag",
                        help="Dont allow fragmented interval matches ("
                        "see help for --frag in compareBedStates.py).  Only"
                        " relevant with --intThresh",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--qualThresh",
        help="Minimum match ratio between truth"
        " and prediction to relabel prediction.  Example, if"
        " predicted state X overlaps target state LTR 25 pct of "
        "the time, then qualThresh must be at least 0.25 to "
        "label X as LTR in the output.  Value in range (0, 1]",
        type=float,
        default=0.1)
    parser.add_argument("--ignore",
                        help="Comma-separated list of stateNames to"
                        " ignore (in prediction)",
                        default=None)
    parser.add_argument("--ignoreTgt",
                        help="Comma-separated list of stateNames to"
                        " ignore (int target)",
                        default=None)
    parser.add_argument("--tgt",
                        help="Comma-separated list of stateNames to "
                        " consider (in target).  All others will be ignored",
                        default=None)
    parser.add_argument(
        "--unique",
        help="If more than one predicted state maps"
        " to the same target state, add a unique id (numeric "
        "suffix) to the output so that they can be distinguished",
        action="store_true",
        default=False)
    parser.add_argument("--model",
                        help="Apply state name mapping to the model"
                        " in the specified path (it is strongly advised to"
                        " make a backup of the model first)",
                        default=None)
    parser.add_argument("--noMerge",
                        help="By default, adjacent intervals"
                        " with the same state name in the output are "
                        "automatically merged into a single interval.  This"
                        " flag disables this.",
                        action="store_true",
                        default=False)
    parser.add_argument("--hm",
                        help="Write confusion matrix as heatmap in PDF"
                        " format to specified file",
                        default=None)
    parser.add_argument("--old",
                        help="Use old name mapping logic which just "
                        "takes biggest overlap in forward confusion matrix.  "
                        "faster than new default logic which does the greedy"
                        " f1 optimization",
                        action="store_true",
                        default=False)
    parser.add_argument("--fdr",
                        help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float,
                        default=None)
    parser.add_argument("--tl",
                        help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)",
                        default=None)
    parser.add_argument(
        "--colOrder",
        help="List of states used to force"
        " ordering in heatmap (otherwise alphabetical) columns. These"
        " states will correspond to the tgtBed when --old used and"
        " --predBed otherwise.",
        default=None)
    parser.add_argument(
        "--hmCovRow",
        help="Path to write 1-row heatmap of "
        "state coverage (fraction of bases). only works with --hm",
        default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # normalize the comma-separated list options into sets (empty set = none)
    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")

    assert args.col == 4 or args.col == 5

    # optionally subtract mask tracks (from the --tl XML) out of both beds;
    # the temp cut files are cleaned up at the end
    tempFiles = []
    if args.tl is not None:
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint, args.tl)

        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol=args.col)
    intervals2 = readBedIntervals(args.predBed, ncol=args.col)
    cfName = "reverse"

    # --old mode works on the forward confusion matrix, so swap the roles
    # of target and prediction (swapped back after the matrix is built)
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1,
                                           args.col - 1, args.intThresh, False,
                                           not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]

    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt, args.ignoreTgt,
                                             args.ignore, args.qualThresh,
                                             args.fdr)

    # filter the stateMap to take into account the command-line options
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))

    # write the model if spefied
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)

    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col - 1,
                   args.noMerge, args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap.  Maybe matplotlib is "
                               "not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    # remove any mask-cut temp beds created above
    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
Esempio n. 13
0
def main(argv=None):
    """Generate an HMM-usable tracklist from a raw tracklist.

    Driver that runs the cleaning scripts (via runCleaning), optionally
    generates a TSD track (runTsd), and applies track scaling (runScaling),
    writing the modified tracks XML to <outTracksInfo>.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml.  Runs cleanRM.py cleanLtrFinder.py and "
        " cleanTermini.py and addTsdTrack.py and setTrackScaling.py (also runs "
        " removeBedOverlaps.py before each of the clean scripts)")
    
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--scaleTracks", help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.", default=None)
    parser.add_argument("--skipScale", help="Comma-separated list of tracks to "
                        "skip for scaling.", default=None)
    parser.add_argument("--ltr_termini", help="Name of termini track (appy tsd)",
                        default="ltr_termini")
    parser.add_argument("--repeat_modeler", help="Name of repeat_modeler track (appy tsd)",
                        default="repeat_modeler")
    parser.add_argument("--sequence", help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument("--tsd", help="Name of tsd track to generate (appy cleanTermini.py)",
                        default="tsd")
    parser.add_argument("--tir", help="Name of tir_termini track (appy cleanTermini.py)",
                        default="tir_termini")
    parser.add_argument("--noScale", help="Dont do any scaling", default=False,
                        action="store_true")
    parser.add_argument("--noTsd", help="Dont generate TSD track.  NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of ) chaux",
                        default=False, action="store_true")
    parser.add_argument("--numProc", help="Number of processes to use for tsdFinder.py",
                        default=1, type=int)
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    # logging options are forwarded verbatim to every child script we invoke
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    # create the output directory if necessary.  bug fix: narrowed the bare
    # "except:" (which also swallowed KeyboardInterrupt/SystemExit) to
    # OSError, the error makedirs raises e.g. when the directory already
    # exists; the isdir check below still reports any real failure.
    try:
        os.makedirs(args.cleanTrackPath)
    except OSError:
        pass
    if not os.path.isdir(args.cleanTrackPath):
        raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                           args.cleanTrackPath)

    # NOTE(review): other getLocalTempPath calls in this codebase pass the
    # extension with a leading dot (".bed"); "xml" here looks inconsistent
    # -- confirm against getLocalTempPath's semantics
    tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")
    runCleaning(args, tempTracksInfo)
    assert os.path.isfile(tempTracksInfo)

    runTsd(args, tempTracksInfo)
    
    runScaling(args, tempTracksInfo)

    # remove the intermediate tracks XML
    runShellCommand("rm -f %s" % tempTracksInfo)

    cleanBedTool(tempBedToolPath)
Esempio n. 14
0
def main(argv=None):
    """Entry point: fill masked gaps of an HMM prediction with state labels.

    Gaps in the prediction (inBed) that correspond to masked intervals
    (identified via the mask tracks in tracksXML) are assigned a state
    label inferred from the immediately flanking prediction intervals,
    falling back to --default when no label can be determined.  The
    result, clipped to the scope given by allBed, is written to outBed.
    Returns 0 on success.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML",
                        help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed",
                        help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument(
        "inBed",
        help="TE prediction BED file.  State labels"
        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed",
                        help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument(
        "--maxLen",
        help="Maximum length of a masked interval"
        " to fill (inclusive). Use --delMask option with same value"
        "if running compareBedStates.py after.",
        type=int,
        default=sys.maxint)
    parser.add_argument("--default",
                        help="Default label to give to masked "
                        "region if no label can be determined",
                        default="0")
    parser.add_argument(
        "--tgts",
        help="Only relabel gaps that "
        "are flanked on both sides by the same state, and this state"
        " is in this comma- separated list. --default used for other"
        " gaps.  If not targetst specified then all states checked.",
        default=None)
    parser.add_argument(
        "--oneSidedTgts",
        help="Only relabel gaps that "
        "are flanked on at least one side by a state in this comma-"
        "separated list --default used for other gaps",
        default=None)
    parser.add_argument(
        "--onlyDefault",
        help="Add the default state (--default) no"
        " no all masked gaps no matter what. ie ignoring all other "
        "logic",
        action="store_true",
        default=False)
    parser.add_argument(
        "--cut",
        help="Cut out gaps for masked tracks from the input."
        " By default, the input is expected to come from the HMM "
        "with mask intervals already absent, and will crash on with"
        " an assertion error if an overlap is detected.",
        action="store_true",
        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # target state sets: two-sided (--tgts) and one-sided (--oneSidedTgts)
    # must be disjoint because they select different relabelling rules
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list and pull out the masking tracks
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed (optionally cutting out mask intervals first)
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        # nothing to fill: pass the input through unchanged
        # (bugfix: clean up the temp bedtool dir before returning; also
        # removed an unreachable duplicate emptiness check further down)
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        cleanBedTool(tempBedToolPath)
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand(
            "cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s"
            % (maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

    # sweep the (sorted) input intervals in lockstep with the masked
    # intervals to find the prediction intervals flanking each mask gap
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskInterval in maskedIntervals:
        # skip masked intervals that are too long to fill
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states only if the intervals perfectly abut;
        # any true overlap is an error (input should have masks absent)
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[
                    2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[
                    1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval: both flanks agree (and pass
        # --tgts filter), or one flank is in --oneSidedTgts, else --default
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write(
            "%s\t%d\t%d\t%s\n" %
            (maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    tempOutMaskFile.close()

    # merge original prediction with the relabelled mask intervals,
    # sort, and clip to the scope defined by allBed
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" %
                    (args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" %
                    (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" %
                    (tempMergePath2, tempScopePath, args.outBed))

    # remove all temporary files
    runShellCommand("rm -f %s" % " ".join([
        tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2, tempScopePath
    ]))
    cleanBedTool(tempBedToolPath)
Esempio n. 15
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Evaluate a given data set with a trained HMM. Display"
        " the log probability of the input data given the model, and "
        "optionally output the most likely sequence of hidden states.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inputModel", help="Path of hmm created with"
                        "teHmmTrain.py")
    parser.add_argument("bedRegions", help="Intervals to process")
    parser.add_argument("--bed", help="path of file to write viterbi "
                        "output to (most likely sequence of hidden states)",
                        default=None)
    parser.add_argument("--numThreads", help="Number of threads to use (only"
                        " applies to CFG parser for the moment)",
                        type=int, default=1)
    parser.add_argument("--slice", help="Make sure that regions are sliced"
                        " to a maximum length of the given value.  Most "
                        "useful when model is a CFG to keep memory down. "
                        "When 0, no slicing is done",
                        type=int, default=0)
    parser.add_argument("--segment", help="Use the intervals in bedRegions"
                        " as segments which each count as a single column"
                        " for evaluattion.  Note the model should have been"
                        " trained with the --segment option pointing to this"
                        " same bed file.", action="store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)    
    parser.add_argument("--maxPost", help="Use maximum posterior decoding instead"
                        " of Viterbi for evaluation", action="store_true",
                        default=False)
    parser.add_argument("--pd", help="Output BED file for posterior distribution. Must"
                        " be used in conjunction with --pdStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--pdStates", help="comma-separated list of state names to use"
                        " for computing posterior distribution.  For example: "
                        " --pdStates inside,LTR_left,LTR_right will compute the probability"
                        ", for each observation, that the hidden state is inside OR LTR_left"
                        " OR LTR_right.  Must be used with --pd to specify output "
                        "file.", default=None)
    parser.add_argument("--bic", help="save Bayesian Information Criterion (BIC) score"
                        " in given file", default=None)
    parser.add_argument("--ed", help="Output BED file for emission distribution. Must"
                        " be used in conjunction with --edStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--edStates", help="comma-separated list of state names to use"
                        " for computing emission distribution.  For example: "
                        " --edStates inside,LTR_left for each obsercation the probability "
                        " that inside emitted that observaiton plus the probabillity that"
                        " LTR_left emitted it. If more than one state is selected, this "
                        " is not a distribution, but a sum of distributions (and values"
                        " can exceed 1).  Mostly for debugging purposes. Note output in LOG",
                         default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions, to run in parallel"
                        " (in BED format).  input regions will be intersected with each line"
                        " in this file, and the result will correspsond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="number of processes (use in conjunction with --chroms)",
                        type=int, default=1)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    if args.slice <= 0:
        args.slice = sys.maxint
    elif args.segment is True:
        raise RuntimeError("--slice and --segment options are not compatible at "
                           "this time")
    if (args.pd is not None) ^ (args.pdStates is not None):
        raise RuntimeError("--pd requires --pdStates and vice versa")
    if (args.ed is not None) ^ (args.edStates is not None):
        raise RuntimeError("--ed requires --edStates and vice versa")
    if args.bed is None and (args.pd is not None or args.ed is not None):
        raise RuntimeError("Both --ed and --pd only usable in conjunction with"
                           " --bed")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun 
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0
    
    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    if isinstance(model, MultitrackCfg):
        if args.maxPost is True:
           raise RuntimeErorr("--post not supported on CFG models")

    # apply the effective segment length
    if args.segLen > 0:
        assert args.segment is True
        model.getEmissionModel().effectiveSegmentLength = args.segLen
        
    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.bedRegions)
    mergedIntervals = getMergedBedIntervals(args.bedRegions, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.bedRegions)

    # slice if desired
    choppedIntervals = [x for x in slicedIntervals(mergedIntervals, args.slice)]

    # read segment intervals
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.bedRegions)
        segIntervals = readBedIntervals(args.bedRegions, sort=True)

    # load the input
    # read the tracks, while intersecting them with the given interval
    trackData = TrackData()
    # note we pass in the trackList that was saved as part of the model
    # because we do not want to generate a new one.
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData.loadTrackData(args.tracksInfo, choppedIntervals, 
                            model.getTrackList(),
                            segmentIntervals=segIntervals)

    # do the viterbi algorithm
    if isinstance(model, MultitrackHmm):
        algname = "viterbi"
        if args.maxPost is True:
            algname = "posterior decoding"
        logger.info("running %s algorithm" % algname)
    elif isinstance(model, MultitrackCfg):
        logger.info("running CYK algorithm")

    vitOutFile = None
    if args.bed is not None:
        vitOutFile = open(args.bed, "w")
    totalScore = 0
    tableIndex = 0
    totalDatapoints = 0

    # Note: in general there's room to save on memory by only computing single
    # track table at once (just need to add table by table interface to hmm...)
    
    posteriors = [None] * trackData.getNumTrackTables()
    posteriorsFile = None
    posteriorsMask = None
    if args.pd is not None:
        posteriors = model.posteriorDistribution(trackData)
        posteriorsFile = open(args.pd, "w")
        posteriorsMask = getPosteriorsMask(args.pdStates, model)
        assert len(posteriors[0][0]) == len(posteriorsMask)
    emProbs = [None] * trackData.getNumTrackTables()
    emissionsFile = None
    emissionsMask = None
    if args.ed is not None:
        emProbs = model.emissionDistribution(trackData)
        emissionsFile = open(args.ed, "w")
        emissionsMask = getPosteriorsMask(args.edStates, model)
        assert len(emProbs[0][0]) == len(emissionsMask)

    
    decodeFunction = model.viterbi
    if args.maxPost is True:
        decodeFunction = model.posteriorDecode

    for i, (vitLogProb, vitStates) in enumerate(decodeFunction(trackData,
                                                numThreads=args.numThreads)):
        totalScore += vitLogProb
        if args.bed is not None or args.pd is not None:
            if args.bed is not None:
                vitOutFile.write("#Viterbi Score: %f\n" % (vitLogProb))
            trackTable = trackData.getTrackTableList()[tableIndex]
            tableIndex += 1
            statesToBed(trackTable,
                        vitStates, vitOutFile, posteriors[i], posteriorsMask,
                        posteriorsFile, emProbs[i], emissionsMask, emissionsFile)
            totalDatapoints += len(vitStates) * trackTable.getNumTracks()

    print "Viterbi (log) score: %f" % totalScore
    if isinstance(model, MultitrackHmm) and model.current_iteration is not None:
        print "Number of EM iterations: %d" % model.current_iteration
    if args.bed is not None:
        vitOutFile.close()
    if posteriorsFile is not None:
        posteriorsFile.close()
    if emissionsFile is not None:
        emissionsFile.close()

    if args.bic is not None:
        bicFile = open(args.bic, "w")
        # http://en.wikipedia.org/wiki/Bayesian_information_criterion
        lnL = float(totalScore)
        try:
            k = float(model.getNumFreeParameters())
        except:
            # numFreeParameters still not done for semi-supervised
            # just pass through a 0 instead of crashing for now
            k = 0.0 
        n = float(totalDatapoints)
        bic = -2.0 * lnL + k * (np.log(n) + np.log(2 * np.pi))
        bicFile.write("%f\n" % bic)
        bicFile.write("# = -2.0 * lnL + k * (lnN + ln(2 * np.pi))\n"
                      "# where lnL=%f  k=%d (%d states)  N=%d (%d obs * %d tracks)  lnN=%f\n" % (
            lnL, int(k), model.getEmissionModel().getNumStates(), int(totalDatapoints),
            totalDatapoints / model.getEmissionModel().getNumTracks(),
            model.getEmissionModel().getNumTracks(), np.log(n)))
        bicFile.close()

    cleanBedTool(tempBedToolPath)
Esempio n. 16
0
def main(argv=None):
    """Entry point: create starting transition and emission distributions.

    Builds very simple initial distributions from a candidate BED
    annotation, for use with teHmmTrain.py via --initTransProbs and
    --initEmProbs.  The main value is getting the state names into the
    init files, which can then be tweaked by hand.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        "from a candidate BED annotation, which can"
        " be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively.  The distributions created here"
        " are extremely simple, but this can be a good shortcut to at least "
        "getting the state names into the init files, which can be further "
        "tweeked by hand.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trackName", help="Name of Track to use as initial"
                        " annotation")
    parser.add_argument("queryBed", help="Bed file with regions to query")
    parser.add_argument("outTransProbs", help="File to write transition model"
                        " to")
    parser.add_argument("outEmProbs", help="File to write emission model to")
    parser.add_argument("--numOut", help="Number of \"outside\" states to add"
                        " to the model.", default=1, type=int)
    parser.add_argument("--numTot", help="Add x \"outside\" states such "
                        "that total states is this. (overrieds --numOut)",
                        default=0, type=int)
    parser.add_argument("--outName", help="Name of outside states (will have"
                        " numeric suffix if more than 1)", default="Outside")
    parser.add_argument("--mode", help="Strategy for initializing the "
                        "transition graph: {\'star\': all states are connected"
                        " to the oustide state(s) but not each other; "
                        " \'data\': transitions estimated from input bed; "
                        " \'full\': dont write edges and let teHmmTrain.py "
                        "initialize as a clique}", default="star")
    parser.add_argument("--selfTran", help="This script will always write all"
                        " the self-transition probabilities to the output file. "
                        "They will all be set to the specified value using this"
                        " option, or estimated from the data if -1", default=-1.,
                        type=float)
    parser.add_argument("--em", help="Emission probability for input track ("
                        "ie probability that state emits itself)",
                        type=float, default=0.95)
    # bugfix: store_true flag previously had default=None, which was
    # inconsistent with the other flags and showed "default: None" in --help
    parser.add_argument("--outEmNone", help="Add None emission probabilities"
                        " for target track for Outside states",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    # validate mode-related options before doing any work
    if args.mode == "star" and args.numOut < 1:
        raise RuntimeError("--numOut must be at least 1 if --mode star is used")
    if args.mode != "star" and args.mode != "data" and args.mode != "full":
        raise RuntimeError("--mode must be one of {star, data, full}")
    if args.mode == "data":
        raise RuntimeError("--data not implemented yet")
    assert os.path.isfile(args.tracksInfo)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # Read the tracks info and extract the track we want
    trackList = TrackList(args.tracksInfo)
    track = trackList.getTrackByName(args.trackName)
    if track is None:
        raise RuntimeError("Track %s not found in tracksInfo" % args.trackName)
    trackPath = track.getPath()
    if track.getDist() != "multinomial" and track.getDist() != "gaussian":
        raise RuntimeError("Track %s does not have multinomial or "
                           "gaussian distribution" % args.trackName)
    if track.getScale() is not None or track.getLogScale() is not None:
        raise RuntimeError("Track %s must not have scale" % args.trackName)

    # read query intervals from the bed file
    logger.info("loading query intervals from %s" % args.queryBed)
    mergedIntervals = getMergedBedIntervals(args.queryBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.queryBed)

    # read the track, while intersecting with query intervals
    # (track is saved as temp XML file for sake not changing interface)
    bedIntervals = []
    for queryInterval in mergedIntervals:
        bedIntervals += readBedIntervals(trackPath,
                                        ncol=track.getValCol() + 1,
                                        chrom=queryInterval[0],
                                        start=queryInterval[1],
                                        end=queryInterval[2])

    # 1st pass to collect set of state names present in the annotation
    nameMap = CategoryMap(reserved=0)
    for interval in bedIntervals:
        nameMap.update(interval[track.getValCol()])
    outNameMap = CategoryMap(reserved=0)
    if args.numTot > 0:
        # --numTot overrides --numOut: pad with outside states up to total
        args.numOut = max(0, args.numTot - len(nameMap))
    for i in xrange(args.numOut):
        outName = args.outName
        if args.numOut > 1:
            outName += str(i)
        # outside state names must not collide with annotation state names
        assert nameMap.has(outName) is False
        outNameMap.update(outName)

    # write the transition model for use with teHmmTrain.py --initTransProbs
    writeTransitions(bedIntervals, nameMap, outNameMap, args)

    # write the emission model for use with teHmmTrain.py --initEmProbs
    writeEmissions(bedIntervals, nameMap, outNameMap, args)

    cleanBedTool(tempBedToolPath)
Esempio n. 17
0
def main(argv=None):
    """Generate summary-statistic tables from a BED file.

    Produces length and/or score statistics (with optional histograms)
    as one big CSV, or — with --nearness — a BED of nearness values
    against a second BED file.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make some tables of statistics from a BED file.  All"
        " output will be written in one big CSV table to be viewed in a "
        "spreadsheet.")

    parser.add_argument("inBed", help="Input bed file")
    parser.add_argument("outCsv", help="Path to write output in CSV format")
    parser.add_argument("--ignore", default="",
                        help="Comma-separated list of names to ignore")
    parser.add_argument("--numBins", type=int, default=10,
                        help="Number of (linear) bins for histograms")
    parser.add_argument("--logHist", action="store_true", default=False,
                        help="Apply log-transform to data for histogram")
    parser.add_argument("--histRange", default=None,
                        help="Histogram range as comma-separated pair of numbers")
    parser.add_argument("--noHist", action="store_true", default=False,
                        help="skip hisograms")
    parser.add_argument("--noScore", action="store_true", default=False,
                        help="Just do length stats")
    parser.add_argument("--noLen", action="store_true", default=False,
                        help="Just do score stats")
    parser.add_argument("--nearness", default=None,
                        help="Compute nearness stats (instead of normal stats)"
                        " of input bed with given BED.  Output will be a BED"
                        " instead of CSV, with nearness in the score position")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # turn the "lo,hi" string into an integer pair
    if args.histRange is not None:
        bounds = args.histRange.split(",")
        assert len(bounds) == 2
        args.histRange = int(bounds[0]), int(bounds[1])

    csvFile = open(args.outCsv, "w")
    args.ignoreSet = set(args.ignore.split(","))

    # nearness mode needs sorted intervals; normal mode does not
    bedRows = readBedIntervals(args.inBed, ncol=5,
                               sort=args.nearness is not None)

    report = ""
    if args.nearness is not None:
        # nearness output replaces all normal stats
        args.noScore = True
        report = makeNearnessBED(bedRows, args)
    elif not args.noLen:
        # interval-length statistics
        report = makeCSV(bedRows, args,
                         lambda row: int(row[2]) - int(row[1]), "Length")
    # score statistics (best-effort: column 5 may be absent or non-numeric)
    try:
        if not args.noScore:
            report += "\n" + makeCSV(bedRows, args,
                                     lambda row: float(row[4]), "Score")
            report += "\n" + makeCSV(
                bedRows, args,
                lambda row: float(row[4]) * (float(row[2]) - float(row[1])),
                "Score*Length")
    except Exception as err:
        logger.warning("Couldn't make score stats because %s" % str(err))
    csvFile.write(report)
    csvFile.write("\n")
    csvFile.close()
    cleanBedTool(tempBedToolPath)
Esempio n. 18
0
def main(argv=None):
    """Relabel the state names of a predicted BED file so they best match a
    true (target) annotation.

    Builds a confusion matrix between prediction and target (at base or
    interval resolution), derives a prediction->target state-name mapping
    from it, and writes the relabeled prediction to outBed.  Optionally
    applies the mapping to a trained model (--model) and renders the
    confusion matrix as a heatmap (--hm).

    :param argv: command-line argument list; defaults to sys.argv
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Given two bed files: a prediction and a true (or target)"
         " annotation, re-label the prediction's state names so that they "
         " best match the true annotation.  Usees same logic as "
         " compareBedStates.py for determining accuracy")

    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label. ")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col", help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default = 4, type = int)
    parser.add_argument("--intThresh", help="Threshold to consider interval from"
                        " tgtBed covered by predBed.  If not specified, then base"
                        " level statistics will be used. Value in range (0,1]",
                        type=float, default=None)
    parser.add_argument("--noFrag", help="Dont allow fragmented interval matches ("
                        "see help for --frag in compareBedStates.py).  Only"
                        " relevant with --intThresh", action="store_true",
                        default=False)
    parser.add_argument("--qualThresh", help="Minimum match ratio between truth"
                        " and prediction to relabel prediction.  Example, if"
                        " predicted state X overlaps target state LTR 25 pct of "
                        "the time, then qualThresh must be at least 0.25 to "
                        "label X as LTR in the output.  Value in range (0, 1]",
                        type=float, default=0.1)
    parser.add_argument("--ignore", help="Comma-separated list of stateNames to"
                        " ignore (in prediction)", default=None)
    parser.add_argument("--ignoreTgt", help="Comma-separated list of stateNames to"
                        " ignore (int target)", default=None)
    parser.add_argument("--tgt", help="Comma-separated list of stateNames to "
                        " consider (in target).  All others will be ignored",
                        default=None)
    parser.add_argument("--unique", help="If more than one predicted state maps"
                        " to the same target state, add a unique id (numeric "
                        "suffix) to the output so that they can be distinguished",
                        action="store_true", default=False)
    parser.add_argument("--model", help="Apply state name mapping to the model"
                        " in the specified path (it is strongly advised to"
                        " make a backup of the model first)", default=None)
    parser.add_argument("--noMerge", help="By default, adjacent intervals"
                        " with the same state name in the output are "
                        "automatically merged into a single interval.  This"
                        " flag disables this.", action="store_true",
                        default=False)
    parser.add_argument("--hm", help="Write confusion matrix as heatmap in PDF"
                        " format to specified file", default = None)
    parser.add_argument("--old", help="Use old name mapping logic which just "
                        "takes biggest overlap in forward confusion matrix.  "
                        "faster than new default logic which does the greedy"
                        " f1 optimization", action="store_true", default=False)
    parser.add_argument("--fdr", help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float, default=None)
    parser.add_argument("--tl", help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)", default=None)
    parser.add_argument("--colOrder", help="List of states used to force"
                        " ordering in heatmap (otherwise alphabetical) columns. These"
                        " states will correspond to the tgtBed when --old used and"
                        " --predBed otherwise.", default=None)
    parser.add_argument("--hmCovRow", help="Path to write 1-row heatmap of "
                        "state coverage (fraction of bases). only works with --hm",
                        default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # normalize the comma-separated list options into sets for fast
    # membership tests downstream (empty set means "nothing to ignore")
    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        # --tgt filtering is only supported by the new greedy mapping logic
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")

    # state labels must come from the name (4th) or score (5th) BED column
    assert args.col == 4 or args.col == 5

    # optionally cut mask-track intervals (from the tracks XML) out of both
    # beds so masked regions do not influence the comparison
    tempFiles = []
    if args.tl is not None:
        # NOTE(review): sys.maxint is Python 2 only (sys.maxsize in Python 3)
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint, args.tl)

        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol = args.col)
    intervals2 = readBedIntervals(args.predBed, ncol = args.col)
    cfName = "reverse"

    # --old works on the forward confusion matrix, so swap the roles of
    # target and prediction before comparing (swapped back below)
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1, args.col -1,
                                            args.intThresh, False,
                                           not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]

    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state
    if args.old is True:
        # undo the swap above so intervals2 is the prediction again
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt, args.ignoreTgt,
                                             args.ignore, args.qualThresh,
                                             args.fdr)

    # filter the stateMap to take into account the command-line options
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))

    # write the model if specified
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)

    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col-1, args.noMerge,
                   args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap.  Maybe matplotlib is "
                               "not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    # clean up any temporary cut beds created for --tl
    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
Esempio n. 19
0
def main(argv=None):
    """Train, evaluate, then compare an HMM model on the given input beds.

    For each requested combination of tracks (--allTrackCombinations /
    --combinationRange / --mandTracks) and each input bed, assembles a shell
    pipeline running teHmmTrain.py, teHmmView.py, teHmmEval.py, optionally
    fitStateNames.py, then compareBedStates.py and scrapeBenchmarkRow.py.
    Pipelines are executed in parallel (--numProc) and the per-bed result
    rows are aggregated into tables in the output directory.

    Fixes vs. previous revision: --forceEmProbs was appended to the training
    flags twice (producing a duplicated option on the teHmmTrain.py command
    line); removed an unused local (zin) and a redundant re-check of the
    output directory.

    :param argv: command-line argument list; defaults to sys.argv
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Train, evalaute, then compare hmm model on input")

    parser.add_argument("trainingTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used "
                        "for training")
    parser.add_argument("outputDir", help="directory to write output")
    parser.add_argument("inBeds", nargs="*", help="list of training beds")
    parser.add_argument("--evalTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used"
                        " for evaluation (only need if different from"
                        " trainingTracksInfo", default=None)
    parser.add_argument("--numProc", help="Max number of processors to use",
                        type=int, default=1)
    parser.add_argument("--allTrackCombinations", help="Rerun with all"
                        " possible combinations of tracks from the input"
                        " tracksInfo file.  Note that this number gets big"
                        " pretty fast.", action = "store_true", default= False)
    parser.add_argument("--emStates", help="By default the supervised mode"
                        " of teHmmTrain is activated.  This option overrides"
                        " that and uses the EM mode and the given number of "
                        "states instead", type=int, default=None)
    parser.add_argument("--cross", help="Do 50/50 cross validation by training"
                        " on first half input and validating on second",
                        action="store_true", default=False)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--mod", help="Path to trained model.  This will "
                        "bypass the training phase that would normally be done"
                        " and just skip to the evaluation.  Note that the user"
                        " must make sure that the trained model has the "
                        "states required to process the input data",
                        default = None)
    parser.add_argument("--iter", help="Number of EM iterations.  Needs to be"
                        " used in conjunction with --emStates to specify EM"
                        " training",
                        type = int, default=None)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed." ,
                        default = None) 
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing a"
                        " multinomial emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default=None)    
    parser.add_argument("--mandTracks", help="Mandatory track names for use "
                        "with --allTrackCombinations in comma-separated list",
                        default=None)
    parser.add_argument("--combinationRange", help="in form MIN,MAX: Only "
                        "explore track combination in given (closed) range. "
                        "A more refined version of --allTrackCombinations.",
                        default=None)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--segment", help="Input bed files are also used to "
                        "segment data.  Ie teHmmTrain is called with --segment"
                        " set to the input file. Not currently working with "
                        " --supervised",
                        action = "store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied) in training", type=int,
                        default=None)
    parser.add_argument("--truth", help="Use specifed file instead of "
                        "input file(s) for truth comparison.  Makes sense"
                        " when --segment is specified and only one input"
                        " bed specified", default = None)
    parser.add_argument("--eval", help="Bed file used for evaluation.  It should"
                        " cover same region in same order as --truth.  Option "
                        "exists mostly to specify segmentation of --truth",
                        default=None)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of training replicates (with "
                        " different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=None, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running training replicates (see --rep) in parallel.",
                        type=int, default=None)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=None)
    parser.add_argument("--fit", help="Run fitStateNames.py to automap names"
                        " before running comparison", action="store_true",
                        default=False)
    parser.add_argument("--fitOpts", help="Options to pass to fitStateNames.py"
                        " (only effective if used with --fit)", default=None)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate.  Comparison statistics"
                        " will be generated for each rep.",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    # propagate logging configuration to every subcommand we spawn
    logOps = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        logOps += " --logFile %s" % args.logFile

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)
    if args.evalTracksInfo is None:
        args.evalTracksInfo = args.trainingTracksInfo

    trainingTrackList = TrackList(args.trainingTracksInfo)
    evalTrackList = TrackList(args.evalTracksInfo)
    checkTrackListCompatible(trainingTrackList, evalTrackList)

    # sizeRange is a half-open [min, max) range of track-subset sizes to try;
    # default is "all tracks only"
    sizeRange = (len(trainingTrackList), len(trainingTrackList) + 1)
    if args.allTrackCombinations is True:
        sizeRange = (1, len(trainingTrackList) + 1)
    if args.combinationRange is not None:
        toks = args.combinationRange.split(",")
        sizeRange = int(toks[0]),int(toks[1]) + 1
        logger.debug("manual range (%d, %d) " % sizeRange)
    mandTracks = set()
    if args.mandTracks is not None:
        mandTracks = set(args.mandTracks.split(","))
        logger.debug("mandatory set %s" % str(mandTracks))

    # translate command-line options into a flag string for teHmmTrain.py
    trainFlags = ""
    if args.emStates is not None:
        trainFlags += " --numStates %d" % args.emStates
    if args.supervised is True:
        trainFlags += " --supervised"
        if args.segment is True:
            raise RuntimeError("--supervised not currently compatible with "
                               "--segment")
    trainFlags += " --emFac %d" % args.emFac
    if args.iter is not None:
        assert args.emStates is not None or args.initTransProbs is not None
        trainFlags += " --iter %d" % args.iter
    if args.initTransProbs is not None:
        trainFlags += " --initTransProbs %s" % args.initTransProbs
    if args.initEmProbs is not None:
        trainFlags += " --initEmProbs %s" % args.initEmProbs
    if args.fixEm is True:
        trainFlags += " --fixEm"
    if args.initStartProbs is not None:
        trainFlags += " --initStartProbs %s" % args.initStartProbs
    if args.fixStart is True:
        trainFlags += " --fixStart"
    if args.forceTransProbs is not None:
        trainFlags += " --forceTransProbs %s" % args.forceTransProbs
    # (bug fix) --forceEmProbs was previously added twice to trainFlags
    if args.forceEmProbs is not None:
        trainFlags += " --forceEmProbs %s" % args.forceEmProbs
    if args.flatEm is True:
        trainFlags += " --flatEm"
    if args.emRandRange is not None:
        trainFlags += " --emRandRange %s" % args.emRandRange
    if args.segLen is not None:
        trainFlags += " --segLen %d" % args.segLen
    if args.seed is not None:
        trainFlags += " --seed %d" % args.seed
    if args.reps is not None:
        trainFlags += " --reps %d" % args.reps
    if args.numThreads is not None:
        trainFlags += " --numThreads %d" % args.numThreads
    if args.emThresh is not None:
        trainFlags += " --emThresh %f" % args.emThresh
    if args.saveAllReps is True:
        trainFlags += " --saveAllReps"
    if args.maxProb is True:
        trainFlags += " --maxProb"
    if args.transMatEpsilons is True:
        trainFlags += " --transMatEpsilons"
    if args.maxProbCut is not None:
        trainFlags += " --maxProbCut %d" % args.maxProbCut

    # write out command line for posterity's sake
    cmdPath = os.path.join(args.outputDir, "teHmmBenchmark_cmd.txt")
    cmdFile = open(cmdPath, "w")
    cmdFile.write(" ".join(argv) + "\n")
    cmdFile.close()

    #todo: try to get timing for each command
    commands = []
    # rows maps each input bed to the list of result-row files produced for it
    rows = dict()
    for pn, pList in enumerate(subsetTrackList(trainingTrackList, sizeRange,
                                               mandTracks)):
        # the full track set writes to outputDir directly; each proper subset
        # gets its own perm<N> subdirectory
        if len(pList) == len(trainingTrackList):
            outDir = args.outputDir
        else:
            outDir = os.path.join(args.outputDir, "perm%d" % pn)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        trainingTrackPath = os.path.join(outDir, "training_tracks.xml")
        evalTrackPath = os.path.join(outDir, "eval_tracks.xml")
        # mask tracks always ride along with whatever subset was chosen
        for maskTrack in trainingTrackList.getMaskTracks():
            pList.addTrack(copy.deepcopy(maskTrack))
        pList.saveXML(trainingTrackPath)
        # build the matching evaluation track list for this subset
        epList = TrackList()
        for track in pList:
            t = copy.deepcopy(evalTrackList.getTrackByName(track.getName()))
            epList.addTrack(t)
        for maskTrack in trainingTrackList.getMaskTracks():
            epList.addTrack(copy.deepcopy(maskTrack))
        epList.saveXML(evalTrackPath)

        for inBed in args.inBeds:

            base = os.path.basename(inBed)
            truthBed = inBed
            testBed = inBed
            # --cross: train on first half, validate on second half
            if args.cross is True:
                truthBed = os.path.join(outDir,
                                        os.path.splitext(base)[0] +
                                        "_truth_temp.bed")
                testBed = os.path.join(outDir,
                                       os.path.splitext(base)[0] +
                                       "_test_temp.bed")
                splitBed(inBed, truthBed, testBed)

            # train (or, with --mod, just verify the supplied model exists)
            if args.mod is not None:
                modPath = args.mod
                command = "ls %s" % modPath
            else:
                modPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + ".mod")
                command = "teHmmTrain.py %s %s %s %s %s" % (trainingTrackPath,
                                                            truthBed,
                                                            modPath,
                                                            logOps,
                                                            trainFlags)
                if args.segment is True:
                    command += " --segment %s" % truthBed

            # view
            viewPath = os.path.join(outDir,
                                   os.path.splitext(base)[0] + "_view.txt")
            command += " && teHmmView.py %s > %s" % (modPath, viewPath)

            # evaluate
            numReps = 1
            if args.reps is not None and args.saveAllReps is True:
                numReps = args.reps
                assert numReps > 0
            missed = 0
            # little hack to repeat evaluation for each training replicate
            # (repNum == -1 is the best model, saved without a suffix)
            for repNum in xrange(-1, numReps-1):
                if repNum == -1:
                    repSuffix = ""
                else:
                    repSuffix = ".rep%d" % repNum                
                evalBed = os.path.join(outDir,
                                       os.path.splitext(base)[0] + "_eval.bed" +
                                       repSuffix)
                hmmEvalInputBed = testBed
                if args.eval is not None:
                    hmmEvalInputBed = args.eval
                bicPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + "_bic.txt" +
                                       repSuffix)

                command += " && teHmmEval.py %s %s %s --bed %s %s --bic %s" % (
                    evalTrackPath,
                    modPath + repSuffix,
                    hmmEvalInputBed,
                    evalBed,
                    logOps,
                    bicPath)

                if args.segment is True:
                    command += " --segment"

                # fit
                compTruth = testBed
                if args.truth is not None:
                    compTruth = args.truth
                compareInputBed = evalBed
                if args.fit is True:
                    fitBed = os.path.join(outDir,
                                          os.path.splitext(base)[0] + "_eval_fit.bed" +
                                          repSuffix)
                    command += " && fitStateNames.py %s %s %s --tl %s" % (compTruth,
                                                                          evalBed,
                                                                          fitBed,
                                                                          evalTrackPath)
                    if args.fitOpts is not None:
                        command += " " + args.fitOpts
                    compareInputBed = fitBed

                # compare
                compPath = os.path.join(outDir,
                                        os.path.splitext(base)[0] + "_comp.txt" +
                                        repSuffix)
                command += " && compareBedStates.py %s %s --tl %s > %s" % (
                    compTruth,
                    compareInputBed,
                    evalTrackPath,
                    compPath)

                # make table row (only for the best model, not per-replicate)
                if repSuffix == "":
                    rowPath = os.path.join(outDir,
                                           os.path.splitext(base)[0] + "_row.txt")
                    if inBed in rows:
                        rows[inBed].append(rowPath)
                    else:
                        rows[inBed] = [rowPath]
                    command += " && scrapeBenchmarkRow.py %s %s %s %s %s" % (
                        args.trainingTracksInfo,
                        trainingTrackPath,
                        evalBed,
                        compPath,
                        rowPath)

            # remember command
            inCmdPath = os.path.join(outDir,
                                    os.path.splitext(base)[0] + "_cmd.txt")
            inCmdFile = open(inCmdPath, "w")
            inCmdFile.write(command + "\n")
            inCmdFile.close()
            commands.append(command)

    runParallelShellCommands(commands, args.numProc)
    writeTables(args.outputDir, rows)
Esempio n. 20
0
def main(argv=None):
    """Find candidate TSDs (exact forward matches) flanking BED intervals.

    Parses options, loads the query intervals, then either fans out one job
    per FASTA sequence (--numProc > 1) or scans directly, writing the TSDs
    found to the output BED file.  Score is the distance between the TSD and
    the BED interval.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Find candidate TSDs (exact forward matches) flanking given"
        "BED intervals.  Score is distance between TSD and bed interval.")
    parser.add_argument("fastaSequence", help="DNA sequence in FASTA format")
    parser.add_argument("inBed", help="BED file with TEs whose flanking regions "
                        "we wish to search")
    parser.add_argument("outBed", help="BED file containing (only) output TSDs")
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=4, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=6, type=int)
    parser.add_argument("--all", help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour", action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                        "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=7, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=7, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap", default=3, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output Bed",
                        default="L_TSD")
    parser.add_argument("--rightName", help="Name of right TSDs in output Bed",
                        default="R_TSD")
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique"
                        " matching ID", action="store_true", default=False)
    parser.add_argument("--names", help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed", default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file", type=int, default=1)
    parser.add_argument("--sequences", help="Only process given sequences of input"
                        " FASTA file (comma-separated list).",  default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # Validate inputs with explicit errors instead of asserts, which are
    # silently stripped when Python runs with -O.
    if not os.path.exists(args.inBed):
        raise RuntimeError("input BED file not found: %s" % args.inBed)
    if not os.path.exists(args.fastaSequence):
        raise RuntimeError("input FASTA file not found: %s" % args.fastaSequence)
    if args.min > args.max:
        raise RuntimeError("--min cannot be greater than --max")
    # counter used to generate unique left/right TSD pair ids (see --id)
    args.nextId = 0

    if args.sequences is not None:
        args.sequences = set(args.sequences.split(","))

    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.inBed)
    bedIntervals = readBedIntervals(args.inBed, ncol=4, sort=True)
    if bedIntervals is None or len(bedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.inBed)

    # parallel mode: each FASTA sequence becomes a separate job
    if args.numProc > 1:
        runParallel(args, bedIntervals)
        return 0

    tsds = findTsds(args, bedIntervals)

    writeBedIntervals(tsds, args.outBed)
Esempio n. 21
0
def main(argv=None):
    """Segment genome annotation tracks into atomic intervals for the HMM.

    Produces a BED file of segments such that each segment is expected to
    emit a single state: a new segment starts whenever enough tracks
    (--thresh) change value, or whenever any designated "cut" track changes.
    Mask tracks always cut.  Output intervals are named 0 1 0 1 etc.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Produce a bed file of genome segments which are atomic"
        " elements with resepect to the hmm. ie each segment emits a single"
        " state. Mask tracks always cut.  "
        "Output intervals are assigned name 0 1 0 1 etc.")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outBed", help="Output segments")
    parser.add_argument("--thresh",
                        help="Number of tracks that can change "
                        "before a new segment formed.  Increasing this value"
                        " increases the expected lengths of output segments",
                        type=int,
                        default=1)
    parser.add_argument("--cutTracks",
                        help="Create a new segment if something"
                        " changes in one of these tracks (as specified by "
                        "comman-separated list), overriding --thresh options"
                        " if necessary.  For example, --cutTracks tsd,chaux"
                        " would invoke a new segment everytime the value at"
                        "either of these tracks changed",
                        default=None)
    parser.add_argument("--cutUnscaled",
                        help="Cut on all unscaled (used as "
                        "a proxy for non-numeric) tracks",
                        default=False,
                        action="store_true")
    parser.add_argument("--cutMultinomial",
                        help="Cut non-gaussian, non-binary"
                        " tracks everytime",
                        default=False,
                        action="store_true")
    parser.add_argument("--cutNonGaussian",
                        help="Cut all but guassian tracks",
                        default=False,
                        action="store_true")
    parser.add_argument("--comp",
                        help="Strategy for comparing columns for the "
                        "threshold cutoff.  Options are [first, prev], where"
                        " first compares with first column of segment and "
                        "prev compares with column immediately left",
                        default="first")
    parser.add_argument("--ignore",
                        help="Comma-separated list of tracks to "
                        "ignore (the FASTA DNA sequence would be a good "
                        "candidate",
                        default="sequence")
    parser.add_argument("--maxLen",
                        help="Maximum length of a segment (<= 0 means"
                        " no max length applied",
                        type=int,
                        default=0)
    parser.add_argument(
        "--fixLen",
        help="Just make segments of specifed fixed "
        "length ignoring other parameters and logic (<= 0 means"
        " no fixed length applied",
        type=int,
        default=0)
    parser.add_argument("--stats",
                        help="Write some statistics to specified "
                        "file. Of the form <trackName> <Diff> <DiffPct> "
                        " where <Diff> is the number of times a track differs"
                        " between two consecutive segments, and <DiffPct> "
                        " is the average perecentage of all such differences "
                        "accounted for by the track",
                        default=None)
    parser.add_argument(
        "--delMask",
        help="Entirely remove intervals from "
        "mask tracks that are > given length (otherwise "
        "they would just be ignored by HMM tools). The difference"
        " here is that removed intervals will break contiguity.",
        type=int,
        default=None)
    parser.add_argument(
        "--chroms",
        help="list of chromosomes, or regions, to run in parallel"
        " (in BED format).  input regions will be intersected with each line"
        " in this file, and the result will correspsond to an individual job",
        default=None)
    parser.add_argument(
        "--proc",
        help="number of processes (use in conjunction with --chroms)",
        type=int,
        default=1)
    parser.add_argument(
        "--co",
        help="count offset for segment labels.  only used internally",
        type=int,
        default=0)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.comp != "first" and args.comp != "prev":
        raise RuntimeError("--comp must be either first or prev")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # read query intervals from the bed file
    tempFiles = []
    if args.delMask is not None:
        # NOTE(review): sys.maxint is Python-2 only; use sys.maxsize if porting
        cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint,
                                     args.tracksInfo)
        if cutBed is not None:
            tempFiles.append(cutBed)
            args.allBed = cutBed
    logger.info("loading segment region intervals from %s" % args.allBed)
    mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.allBed)

    # read the tracks, while intersecting them with the query intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo,
                            mergedIntervals,
                            treatMaskAsBinary=True)

    # process the --cutTracks option
    trackList = trackData.getTrackList()
    # cutList[i] == 1 iff a change in track i always starts a new segment.
    # (plain int instead of the deprecated np.int alias; they are identical)
    cutList = np.zeros((len(trackList)), int)
    if args.cutTracks is not None:
        cutNames = args.cutTracks.split(",")
        for name in cutNames:
            track = trackList.getTrackByName(name)
            if track is None:
                raise RuntimeError("cutTrack %s not found" % name)
            trackNo = track.getNumber()
            assert trackNo < len(cutList)
            cutList[trackNo] = 1
    args.cutList = cutList

    # make sure mask tracks count as cut tracks
    for track in trackList:
        if track.getDist() == 'mask':
            args.cutList[track.getNumber()] = 1

    # process the --ignore option
    ignoreList = np.zeros((len(trackList)), int)
    if args.ignore is not None:
        ignoreNames = args.ignore.split(",")
        for name in ignoreNames:
            track = trackList.getTrackByName(name)
            if track is None:
                # fixed: previously used "is not" (identity comparison), which
                # is True for any freshly-split string, so the warning was
                # emitted even for the default "sequence" entry this check is
                # meant to suppress
                if name != "sequence":
                    logger.warning("ignore track %s not found" % name)
                continue
            trackNo = track.getNumber()
            assert trackNo < len(ignoreList)
            ignoreList[trackNo] = 1
            if args.cutList[trackNo] == 1:
                raise RuntimeError("Same track (%s) cant be cut and ignored" %
                                   name)
    args.ignoreList = ignoreList

    # process the --cutUnscaled option: no scale/shift/logScale is used as a
    # proxy for "non-numeric" tracks
    if args.cutUnscaled is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.scale is None and track.shift is None and\
              track.logScale is None and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutMultinomial option
    if args.cutMultinomial is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist == "multinomial" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutNonGaussian option
    if args.cutNonGaussian is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist != "gaussian" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # segment the tracks
    stats = dict()
    segmentTracks(trackData, args, stats)
    writeStats(trackData, args, stats)

    # clean up any temp file created for --delMask
    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
Esempio n. 22
0
def main(argv=None):
    """Combine several non-numeric BED tracks into a single annotation file.

    Each selected track is merged in via combineTrack (which uses
    fitStateNames.py to keep state names consistent); regions covered by no
    track are labeled with --outside.  Intended as a baseline to compare the
    HMM against.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Combine a bunch of non-numeric BED tracks into"
        " single file using fitStateNames.py to try to keep names "
        "consistent.  Idea is to be used as baseline to compare"
        " hmm to (via base-by-base statistics, primarily, since"
        " this procedure could induce some fragmentation)")

    parser.add_argument("tracksXML", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("regionBed", help="BED file representing "
                        "target region (best if whole genome)")
    parser.add_argument("outBed", help="Output bed")
    parser.add_argument("--tracks", help="Comma-separated list of "
                        "track names to use.  All tracks will be"
                        " used by default", default=None)
    parser.add_argument("--outside", help="Name to give non-annotated"
                        "regions", default="Outside")
    parser.add_argument("--fitThresh", help="Min map percentage (0,1)"
                        " in order to rename (see --qualThresh option"
                        "of fitStateNames.py", type=float,
                        default=0.5)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    inputTrackList = TrackList(args.tracksXML)
    # number of tracks combined so far (renamed from "iter", which shadowed
    # the builtin of the same name)
    numCombined = 0

    # get regionBed where all intervals are merged when possible
    regionIntervals = getMergedBedIntervals(args.regionBed, sort=True)
    tempRegionPath = getLocalTempPath("Temp", "_reg.bed")
    tempRegionFile = open(tempRegionPath, "w")
    for interval in regionIntervals:
        tempRegionFile.write("\t".join([str(x) for x in interval]) + "\n")
    tempRegionFile.close()

    # accumulate tracks in temp file
    tempOutPath = getLocalTempPath("Temp", "_out.bed")

    for track in inputTrackList:
        # any scaling parameter, a gaussian distribution, or a non-.bed file
        # extension marks the track as numeric, so it is skipped
        isNumeric = (track.shift is not None or track.scale is not None or
                     track.logScale is not None or track.dist == "gaussian" or
                     os.path.splitext(track.getPath())[1].lower() != ".bed")
        if isNumeric:
            logger.warning("Skipping numeric track %s" % track.getName())
        elif args.tracks is None or track.getName() in args.tracks.split(","):
            combineTrack(track, tempOutPath, tempRegionPath, numCombined, args)
            numCombined += 1

    # nothing got written, make everything outside
    if numCombined == 0:
        tempOutFile = open(tempOutPath, "w")
        for interval in regionIntervals:
            tempOutFile.write("%s\t%s\t%s\t%s\n" % (interval[0], interval[1],
                                                   interval[2], args.outside))
        tempOutFile.close()

    runShellCommand("mv %s %s" % (tempOutPath, args.outBed))
    runShellCommand("rm -f %s" % (tempRegionPath))

    cleanBedTool(tempBedToolPath)
Esempio n. 23
0
def main(argv=None):
    """Generate a table of number-of-HMM-states versus BIC (lower is better).

    Thin wrapper of teHmmTrain.py and teHmmEval.py: trains one model per
    (training bed, states value, replicate) combination, evaluates each with
    --bic, then scrapes the scores into outDir/bictable.csv.  The "states"
    column can instead enumerate transition-init files (--initTrans),
    replicate counts (--numReps), or iteration counts (--numIter).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument("trainingBeds", help="comma-separated list of training regions"
                        " (training region size will be a variable in output table). "
                        "if segmentation is activated, these must also be the "
                        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma separated-list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type = int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type = int, default = 1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is overridden"
                        " to specify a list of transition initialization files "
                        "instead of state numbers", action="store_true",
                        default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specifiy a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specifiy a list of iteration counts (--iter)"
                        " arugments", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # the three states-override modes are mutually exclusive
    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds (size is a column of the table)
    trainingSizes = []
    trainingBeds = []
    for tb in  args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # strip from the pass-through option strings any options this script
    # generates itself for each job (--numStates / --initTransProbs for
    # training; --bed / --bic for eval)
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        # fixed: was misspelled "trianOpts", raising NameError whenever
        # --initTransProbs appeared in the training options
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    # was a stray debug print statement
                    logger.debug("initTrans tokens: %s" % str(toks))
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        # was a stray debug print statement
        logger.debug("prevSize=%s trainingSize=%s sameSizeCount=%s" % (
            prevSize, trainingSize, sameSizeCount))
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                # --resume: skip jobs whose outputs already look complete
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training (divide the processor budget among training threads;
    # // keeps integer semantics on both Python 2 and 3)
    runParallelShellCommands(trainCmds, max(1, args.proc // trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        # fixed: was assigned to the misspelled "statesColName", so the header
        # incorrectly read "states" in --numIter mode
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" % stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize,trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    # first token of the first line of the .bic file is the score
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except Exception:
                    # narrowed from a bare except, which also swallowed
                    # KeyboardInterrupt / SystemExit
                    logger.warning("Coudn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize), int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()

    cleanBedTool(tempBedToolPath)
Esempio n. 24
0
def main(argv=None):
    """Normalize track names in a tracks XML and sort them for presentation.

    Applies a fixed rename map (nm), drops a fixed set of unwanted tracks
    (rtracks), sorts the remainder with a surrogate-key comparator, and
    writes the result to a new tracks XML.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        # fixed: the original literal used a backslash line continuation that
        # embedded the next line's indentation, rendering as "pape        r."
        description="fix up track names and sort alphabetically.  easier to "
        "do here on xml than at end for paper.")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo",
                        help="Path to write modified tracks XML")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    # old-name -> new-name map; several aliases map to the same new name
    nm = dict()
    nm["hollister"] = "RM-RepBase-Hollister"
    nm["chaux"] = "RM-RepBase-deLaChaux"
    nm["repeat_modeler"] = "RM-RepeatModeler"
    nm["repbase"] = "RM-RepBase"
    nm["repet"] = "REPET"
    nm["ltr_finder"] = "LTR_FINDER"
    nm["ltr_harvest"] = "LTR_Harvest"
    nm["ltr_termini"] = "lastz-Termini"
    nm["lastz-Termini"] = "lastz-LTRTermini"
    nm["tir_termini"] = "lastz-InvTermini"
    nm["irf"] = "IRF"
    nm["palindrome"] = "lastz-Palindrome"
    nm["overlap"] = "lastz-Overlap"
    nm["mitehunter"] = "MITE-Hunter"
    nm["helitronscanner"] = "HelitronScanner"
    nm["cov_80-"] = "lastz-SelfLowId"
    nm["cov_80-90"] = "lastz-SelfMedId"
    nm["cov_90+"] = "lastz-SelfHighId"
    nm["left_peak_80-"] = "lastz-SelfPeakLeftLow"
    nm["lastz-SelfLowLeftPeak"] = nm["left_peak_80-"]
    nm["left_peak_80-90"] = "lastz-SelfPeakLeftMed"
    nm["lastz-SelfMedLeftPeak"] = nm["left_peak_80-90"]
    nm["left_peak_90+"] = "lastz-SelfPeakLeftHigh"
    nm["lastz-SelfHighLeftPeak"] = nm["left_peak_90+"]
    nm["right_peak_80-"] = "lastz-SelfPeakRightLow"
    nm["lastz-SelfLowRightPeak"] = nm["right_peak_80-"]
    nm["right_peak_80-90"] = "lastz-SelfPeakRightMed"
    nm["lastz-SelfMedRightPeak"] = nm["right_peak_80-90"]
    nm["right_peak_90+"] = "lastz-SelfPeakRightHigh"
    nm["lastz-SelfHighRightPeak"] = nm["right_peak_90+"]
    nm["cov_maxPId"] = "lastz-SelfPctMaxId"
    nm["lastz-SelfMaxPctId"] = nm["cov_maxPId"]
    nm["te_domains"] = "TE-Domains"
    nm["fgenesh"] = "Genes"
    nm["genes"] = nm["fgenesh"]
    nm["refseq"] = nm["fgenesh"]
    nm["mrna"] = "mRNA"
    nm["srna"] = "sRNA"
    nm["ortho_depth"] = "Alignment-Depth"
    nm["orthology"] = nm["ortho_depth"]
    nm["chain_depth"] = nm["ortho_depth"]
    nm["alignment_depth"] = nm["ortho_depth"]
    nm["gcpct"] = "GC"
    nm["trf"] = "TRF"
    nm["windowmasker"] = "WindowMasker"
    nm["polyN"] = "Ns"
    nm["phastcons_ce"] = "Conservation"
    nm["phastcons"] = nm["phastcons_ce"]
    nm["PhastCons"] = nm["phastcons_ce"]
    nm["phyloP"] = nm["phastcons_ce"]
    nm["phylop"] = nm["phastcons_ce"]

    # tracks in this set are dropped from the output entirely
    rtracks = dict()
    rtracks["tantan"] = True
    rtracks["polyA"] = True
    rtracks["transposon_psi"] = True
    rtracks["transposonpsi"] = True
    rtracks["repbase_censor"] = True
    rtracks["tsd"] = True
    rtracks["repbase_default"] = True
    rtracks["dustmasker"] = True

    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []

    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name not in rtracks:
            if track.name in nm:
                track.name = nm[track.name]
            else:
                logger.warning("Did not map track %s" % track.name)
            outList.append(track)
        else:
            logger.warning("Deleted track %s" % track.name)

    # sort the list: surrogate key prefixes push RepeatModeler/RM/REPET-style
    # tracks to the front and mask tracks to the back
    def sortComp(x):
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist() == "mask":
            # fixed: compared the bound method object itself to "mask"
            # (always False), so mask tracks were never pushed to the end
            return "zzzz" + lname
        else:
            return lname

    outList = sorted(outList, key=lambda track: sortComp(track))

    for track in outList:
        outTracks.addTrack(track)

    outTracks.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
Esempio n. 25
0
def main(argv=None):
    """Entry point: greedily rank annotation tracks by how much each one
    improves HMM accuracy, by repeatedly wrapping teHmmBenchmark.py.

    Only argument parsing and sanity checking happen here; the actual
    iterative ranking loop is delegated to greedyRank().
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Helper script to rank a list of tracks based on how well "
        "they improve some measure of HMM accuracy, by wrapping "
         "teHmmBenchmark.py")

    parser.add_argument("tracks", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("training", help="BED Training regions"
                        "teHmmTrain.py")
    parser.add_argument("truth", help="BED Truth used for scoring")
    parser.add_argument("states", help="States (in truth) to use for"
                        " average F1 score (comma-separated")
    parser.add_argument("outDir", help="Directory to place all results")
    parser.add_argument("--benchOpts", help="Options to pass to "
                        "teHmmBenchmark.py (wrap in double quotes)",
                        default="")
    parser.add_argument("--startTracks", help="comma-separated list of "
                        "tracks to start off with", default = None)
    parser.add_argument("--segOpts", help="Options to pass to "
                        "segmentTracks.py (wrap in double quotes)",
                        default="--comp first --thresh 1 --cutUnscaled")
    parser.add_argument("--fullSegment", help="Only use segmentation"
                        " based on entire track list for each iteration"
                        " rather than compute segmentation each time (as"
                        " done by default)", action="store_true",
                        default=False)
    parser.add_argument("--bic", help="rank by BIC instead of score "
                        " (both always present in output table though)",
                        action="store_true", default=False)
    parser.add_argument("--base", help="use base-level F1 instead of "
                        "interval-level", default=False, action="store_true")
    parser.add_argument("--naive", help="rank by \"naive\" score",
                         action="store_true", default=False)
    parser.add_argument("--doNaive", help="compute naive stats.  will be "
                        "turned on by default if --naive is used", default=False,
                        action="store_true")
    parser.add_argument("--segTracks", help="tracks XML to use for segmentation"
                        " (by default will be same as tracks))", default=None)
    parser.add_argument("--recallSkew", help="when computing f1, multiply recall"
                        " by this number (hack to favour larger recall)",
                        default=1., type=float)
    parser.add_argument("--score", help="accuracy score to use from "
                        "{f1, prec, rec}", default="f1")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # make sure no no-no options in benchOpts: --eval and --truth are
    # generated below from our own <training>/<truth> positional args,
    # so the user must not pass them through
    if "--eval" in args.benchOpts or "--truth" in args.benchOpts:
        raise RuntimeError("--eval and --truth cannot be passed through to "
                           "teHmmBenchmark.py as they are generated from "
                           "<training> and <truth> args from this script")

    # don't want to keep track of extra logic required for not segmenting
    if "--segment" not in args.benchOpts:
        args.benchOpts += " --segment"
        logger.warning("Adding --segment to teHmmBenchmark.py options")

    # --bic and --naive select alternative ranking criteria; at most one
    if args.bic is True and args.naive is True:
        raise RuntimeError("--bic and --naive are mutually incompatible")
    if args.naive is True:
        args.doNaive = True
    # segmentation tracks XML defaults to the main tracks XML
    if args.segTracks is None:
        args.segTracks = args.tracks

    # create the output directory on demand
    if not os.path.exists(args.outDir):
        os.makedirs(args.outDir)

    greedyRank(args)
Esempio n. 26
0
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument(
        "trainingBeds",
        help="comma-separated list of training regions"
        " (training region size will be a variable in output table). "
        "if segmentation is activated, these must also be the "
        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states",
                        help="comma separated-list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps",
                        help="number of replicates",
                        type=int,
                        default=1)
    parser.add_argument("--proc",
                        help="maximum number of processors to use"
                        " in parallel",
                        type=int,
                        default=1)
    parser.add_argument("--resume",
                        help="try not to rewrite existing files",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--initTrans",
        help="the states argument is overridden"
        " to specify a list of transition initialization files "
        "instead of state numbers",
        action="store_true",
        default=False)
    parser.add_argument("--numReps",
                        help="the states argument is overridden"
                        " to specifiy a list of replicate numbers (--reps)"
                        " arguments",
                        action="store_true",
                        default=False)
    parser.add_argument("--numIter",
                        help="the states argument is overridden"
                        " to specifiy a list of iteration counts (--iter)"
                        " arugments",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the trianing beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trianOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    print toks
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        print prevSize, trainingSize, sameSizeCount
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(
                    args.outDir, "hmm_%d.%d.%d.%d.mod" %
                    (trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[
                        states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        statesColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(
                    args.outDir, "hmm_%d.%d.%d.%d.mod" %
                    (trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except:
                    logger.warning("Coudn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" %
                            (trainingBed, int(trainingSize), int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" %
                                (np.mean(bics), np.min(bics), np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()

    cleanBedTool(tempBedToolPath)
Esempio n. 27
0
def main(argv=None):
    """Entry point: add a TSD (target site duplication) track to a tracks
    XML file, or modify an existing one, by running tsdFinder on a given
    input track against a given FASTA sequence track.

    Most command-line options are forwarded verbatim to tsdFinder; this
    function validates the input/fasta track paths, decides the output
    track's path, and writes the updated tracks XML.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Add a TSD track (or modify an existing one) based on a "
        "given track")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("tsdTrackDir",
                        help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo",
                        help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("inputTrack", help="Name of track to createTSDs from")
    parser.add_argument("fastaTrack", help="Name of track for fasta sequence")
    parser.add_argument("outputTrack",
                        help="Name of tsd track to add.  Will"
                        " overwrite if it already exists (or append with"
                        " --append option)")
    parser.add_argument("--append",
                        help="Add onto existing TSD track if exists",
                        default=False,
                        action="store_true")
    parser.add_argument("--inPath",
                        help="Use given file instead of inputTrack"
                        " path to generate TSD",
                        default=None)

    ############ TSDFINDER OPTIONS ##############
    parser.add_argument("--min",
                        help="Minimum length of a TSD",
                        default=None,
                        type=int)
    parser.add_argument("--max",
                        help="Maximum length of a TSD",
                        default=None,
                        type=int)
    parser.add_argument("--all",
                        help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour",
                        action="store_true",
                        default=False)
    parser.add_argument("--maxScore",
                        help="Only report matches with given "
                        "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None,
                        type=int)
    parser.add_argument("--left",
                        help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=None,
                        type=int)
    parser.add_argument("--right",
                        help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=None,
                        type=int)
    parser.add_argument("--overlap",
                        help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap",
                        default=None,
                        type=int)
    parser.add_argument("--leftName",
                        help="Name of left TSDs in output Bed",
                        default=None)
    parser.add_argument("--rightName",
                        help="Name of right TSDs in output Bed",
                        default=None)
    parser.add_argument("--id",
                        help="Assign left/right pairs of TSDs a unique"
                        " matching ID",
                        action="store_true",
                        default=False)
    parser.add_argument("--names",
                        help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed",
                        default=None)
    parser.add_argument("--numProc",
                        help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file",
                        type=int,
                        default=1)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # copy out all options for call to tsd finder: store-true flags become
    # bare "--opt", valued options become "--opt value"; None/False means
    # the option was not given and is omitted
    args.tsdFinderOptions = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.tsdFinderOptions += " --logFile %s" % args.logFile
    for option in [
            "min", "max", "all", "maxScore", "left", "right", "overlap",
            "leftName", "rightName", "id", "names", "numProc"
    ]:
        val = getattr(args, option)
        if val is True:
            args.tsdFinderOptions += " --%s" % option
        elif val is not None and val is not False:
            args.tsdFinderOptions += " --%s %s" % (option, val)

    # best-effort creation: the directory may already exist
    try:
        os.makedirs(args.tsdTrackDir)
    except:
        pass
    if not os.path.isdir(args.tsdTrackDir):
        raise RuntimeError("Unable to find or create tsdTrack dir %s" %
                           args.tsdTrackDir)

    # validate the input track: must exist and be a (big)bed file
    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)
    inputTrack = trackList.getTrackByName(args.inputTrack)
    if inputTrack is None:
        raise RuntimeError("Track %s not found" % args.inputTrack)
    if args.inPath is not None:
        assert os.path.isfile(args.inPath)
        inputTrack.setPath(args.inPath)
    inTrackExt = os.path.splitext(inputTrack.getPath())[1].lower()
    if inTrackExt != ".bb" and inTrackExt != ".bed":
        raise RuntimeError("Track %s has non-bed extension %s" %
                           (args.inputTrack, inTrackExt))

    # validate the sequence track: extension must start with ".fa"
    fastaTrack = trackList.getTrackByName(args.fastaTrack)
    if fastaTrack is None:
        raise RuntimeError("Fasta Track %s not found" % args.fastaTrack)
    faTrackExt = os.path.splitext(fastaTrack.getPath())[1].lower()
    if faTrackExt[:3] != ".fa":
        raise RuntimeError("Fasta Track %s has non-fasta extension %s" %
                           (args.fastaTrack, faTrackExt))

    # locate (or create) the output TSD track entry in the copied list
    tsdTrack = outTrackList.getTrackByName(args.outputTrack)
    if tsdTrack is None:
        if args.append is True:
            raise RuntimeError("TSD track %s not found. Cannot append" %
                               (args.outputTrack))
        tsdTrack = Track()
        tsdTrack.name = args.outputTrack
        tsdTrack.path = os.path.join(
            args.tsdTrackDir,
            args.inputTrack + "_" + args.outputTrack + ".bed")

    runTsdFinder(fastaTrack.getPath(), inputTrack.getPath(),
                 tsdTrack.getPath(), args)

    # newly-created tracks still need to be registered before saving
    if outTrackList.getTrackByName(tsdTrack.getName()) is None:
        outTrackList.addTrack(tsdTrack)
    outTrackList.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
Esempio n. 28
0
def main(argv=None):
    """Entry point: segment the genome into atomic intervals with respect
    to the HMM's tracks, i.e. intervals over which every (non-ignored)
    track value is constant enough that each segment emits a single state.

    Builds the cut/ignore track index lists from the command-line options,
    then delegates to segmentTracks() / writeStats().  With --chroms the
    work is re-dispatched in parallel chunks via parallelDispatch().

    Fixes vs. previous revision:
      * `if name is not "sequence"` compared string identity instead of
        equality (only worked via CPython interning) -> `!=`.
      * `np.int` (removed in NumPy >= 1.24) -> builtin `int` dtype.
      * the three near-identical cut-marking loops are factored into one
        local helper.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Produce a bed file of genome segments which are atomic"
        " elements with resepect to the hmm. ie each segment emits a single"
        " state. Mask tracks always cut.  "
        "Output intervals are assigned name 0 1 0 1 etc.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outBed", help="Output segments")
    parser.add_argument("--thresh", help="Number of tracks that can change "
                        "before a new segment formed.  Increasing this value"
                        " increases the expected lengths of output segments",
                        type=int, default=1)
    parser.add_argument("--cutTracks", help="Create a new segment if something"
                        " changes in one of these tracks (as specified by "
                        "comman-separated list), overriding --thresh options"
                        " if necessary.  For example, --cutTracks tsd,chaux"
                        " would invoke a new segment everytime the value at"
                        "either of these tracks changed", default=None)
    parser.add_argument("--cutUnscaled", help="Cut on all unscaled (used as "
                        "a proxy for non-numeric) tracks", default=False,
                        action="store_true")
    parser.add_argument("--cutMultinomial", help="Cut non-gaussian, non-binary"
                        " tracks everytime", default=False, action="store_true")
    parser.add_argument("--cutNonGaussian", help="Cut all but guassian tracks",
                        default=False, action="store_true")
    parser.add_argument("--comp", help="Strategy for comparing columns for the "
                        "threshold cutoff.  Options are [first, prev], where"
                        " first compares with first column of segment and "
                        "prev compares with column immediately left",
                        default="first")
    parser.add_argument("--ignore", help="Comma-separated list of tracks to "
                        "ignore (the FASTA DNA sequence would be a good "
                        "candidate", default="sequence")
    parser.add_argument("--maxLen", help="Maximum length of a segment (<= 0 means"
                        " no max length applied",
                        type=int, default=0)
    parser.add_argument("--fixLen", help="Just make segments of specifed fixed "
                        "length ignoring other parameters and logic (<= 0 means"
                        " no fixed length applied",
                        type=int, default=0)
    parser.add_argument("--stats", help="Write some statistics to specified "
                        "file. Of the form <trackName> <Diff> <DiffPct> "
                        " where <Diff> is the number of times a track differs"
                        " between two consecutive segments, and <DiffPct> "
                        " is the average perecentage of all such differences "
                        "accounted for by the track", default=None)
    parser.add_argument("--delMask", help="Entirely remove intervals from "
                        "mask tracks that are > given length (otherwise "
                        "they would just be ignored by HMM tools). The difference"
                        " here is that removed intervals will break contiguity.",
                        type=int, default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions, to run in parallel"
                        " (in BED format).  input regions will be intersected with each line"
                        " in this file, and the result will correspsond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="number of processes (use in conjunction with --chroms)",
                        type=int, default=1)
    parser.add_argument("--co", help="count offset for segment labels.  only used internally",
                        type=int, default=0)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.comp != "first" and args.comp != "prev":
        raise RuntimeError("--comp must be either first or prev")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # read query intervals from the bed file
    tempFiles = []
    if args.delMask is not None:
        # physically excise long mask intervals (breaks contiguity, unlike
        # the normal "ignore" treatment of masked bases)
        cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint,
                                     args.tracksInfo)
        if cutBed is not None:
            tempFiles.append(cutBed)
            args.allBed = cutBed
    logger.info("loading segment region intervals from %s" % args.allBed)
    mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.allBed)

    # read the tracks, while intersecting them with the query intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            treatMaskAsBinary=True)

    # process the --cutTracks option: flag each named track by number
    trackList = trackData.getTrackList()
    cutList = np.zeros(len(trackList), dtype=int)
    if args.cutTracks is not None:
        cutNames = args.cutTracks.split(",")
        for name in cutNames:
            track = trackList.getTrackByName(name)
            if track is None:
                raise RuntimeError("cutTrack %s not found" % name)
            trackNo = track.getNumber()
            assert trackNo < len(cutList)
            cutList[trackNo] = 1
    args.cutList = cutList

    # make sure mask tracks count as cut tracks
    for track in trackList:
        if track.getDist() == 'mask':
            args.cutList[track.getNumber()] = 1

    # process the --ignore option
    ignoreList = np.zeros(len(trackList), dtype=int)
    if args.ignore is not None:
        ignoreNames = args.ignore.split(",")
        for name in ignoreNames:
            track = trackList.getTrackByName(name)
            if track is None:
                # "sequence" is ignored by default, so its absence is normal
                # (bug fix: was `name is not "sequence"` identity test)
                if name != "sequence":
                    logger.warning("ignore track %s not found" % name)
                continue
            trackNo = track.getNumber()
            assert trackNo < len(ignoreList)
            ignoreList[trackNo] = 1
            if args.cutList[trackNo] == 1:
                raise RuntimeError("Same track (%s) cant be cut and ignored" %
                                  name)
    args.ignoreList = ignoreList

    def markCutTracks(pred):
        # flag every non-ignored track satisfying pred as a cut track
        for track in trackList:
            trackNo = track.getNumber()
            if args.ignoreList[trackNo] == 0 and pred(track):
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutUnscaled option (no scaling attributes is used as a
    # proxy for "non-numeric" tracks)
    if args.cutUnscaled is True:
        markCutTracks(lambda t: t.scale is None and t.shift is None and
                      t.logScale is None)

    # process the --cutMultinomial option
    if args.cutMultinomial is True:
        markCutTracks(lambda t: t.dist == "multinomial")

    # process the --cutNonGaussian option
    if args.cutNonGaussian is True:
        markCutTracks(lambda t: t.dist != "gaussian")

    # segment the tracks
    stats = dict()
    segmentTracks(trackData, args, stats)
    writeStats(trackData, args, stats)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
Esempio n. 29
0
def main(argv=None):
    """Entry point: compute statistics tables from a BED file and write
    them as one big CSV (or, with --nearness, as a BED with nearness in
    the score column).

    Fix vs. previous revision: the output file handle was opened with a
    bare open()/close() pair, so any exception raised while computing the
    stats leaked the handle; it is now managed by a `with` block (opened
    at the same point, guaranteed to close).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make some tables of statistics from a BED file.  All"
        " output will be written in one big CSV table to be viewed in a "
        "spreadsheet.")

    parser.add_argument("inBed", help="Input bed file")
    parser.add_argument("outCsv", help="Path to write output in CSV format")
    parser.add_argument("--ignore", help="Comma-separated list of names"
                        " to ignore", default="")
    parser.add_argument("--numBins", help="Number of (linear) bins for "
                        "histograms", type=int, default=10)
    parser.add_argument("--logHist", help="Apply log-transform to data for "
                        "histogram", action="store_true", default=False)
    parser.add_argument("--histRange", help="Histogram range as comma-"
                        "separated pair of numbers", default=None)
    parser.add_argument("--noHist", help="skip hisograms", action="store_true",
                        default=False)
    parser.add_argument("--noScore", help="Just do length stats",
                        action="store_true", default=False)
    parser.add_argument("--noLen", help="Just do score stats",
                        action="store_true", default=False)
    parser.add_argument("--nearness", help="Compute nearness stats (instead "
                        "of normal stats) of input bed with given BED.  Output"
                        " will be a BED instead of CSV, with nearness in the "
                        "score position", default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # parse "lo,hi" histogram range into an int pair
    if args.histRange is not None:
        args.histRange = args.histRange.split(",")
        assert len(args.histRange) == 2
        args.histRange = int(args.histRange[0]), int(args.histRange[1])

    with open(args.outCsv, "w") as outFile:
        args.ignoreSet = set(args.ignore.split(","))

        # nearness stats require sorted input intervals
        intervals = readBedIntervals(args.inBed, ncol = 5, sort = args.nearness is not None)

        csvStats = ""
        # nearness stats
        if args.nearness is not None:
            args.noScore = True
            csvStats = makeNearnessBED(intervals, args)

        # length stats
        elif args.noLen is False:
            csvStats = makeCSV(intervals, args, lambda x : int(x[2])-int(x[1]),
                               "Length")
        # score stats (best-effort: input may not have a numeric score col)
        try:
            if args.noScore is False:
                csvStats += "\n" + makeCSV(intervals, args, lambda x : float(x[4]),
                                           "Score")
                csvStats += "\n" + makeCSV(intervals, args, lambda x : float(x[4]) * (
                    float(x[2]) - float(x[1])), "Score*Length")
        except Exception as e:
            logger.warning("Couldn't make score stats because %s" % str(e))
        outFile.write(csvStats)
        outFile.write("\n")
    cleanBedTool(tempBedToolPath)
Esempio n. 30
0
def main(argv=None):
    """Dump track data into an ASCII matrix.

    Row i is the ith position of the query BED (scanned in sorted order);
    column j is the jth track in the tracks XML file.  With --map, values
    are written in the HMM's internal integer encoding.

    :param argv: command-line arguments; defaults to sys.argv when None
    """
    # Consistent with the other entry points in this file: fall back to
    # sys.argv when the caller does not supply an argument vector.
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Write track data into ASCII dump.  Row i corresponds"
        " to the ith position found when scanning query BED IN SORTED ORDER."
        "Column j corresponds to the jth track in the XML file. --map option"
        " used to write internal integer format used by HMM.  Unobserved values"
        " written as \"None\" if default attribute not specified or track not"
        " binary.  Rounding can occur if scaling parameters present.\n\n"
        "IMPORTANT: values stored in 8bit integers internally.  Any track with"
        " more than 256 different values will get clamped (with a warning)")

    parser.add_argument("tracks", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("query", help="BED region(s) to dump. SCANNED IN"
                        " SORTED ORDER")
    parser.add_argument("output", help="Path of file to write output to")
    parser.add_argument("--map", help="Apply name mapping, including"
                        " transformation specified in scale, logScale"
                        ", etc. attributes, that HMM uses internally"
                        ". Important to note that resulting integers"
                        " are just unique IDs.  ID_1 > ID_2 does not"
                        " mean anything", action="store_true",
                        default=False)
    parser.add_argument("--segment", help="Treat each interval in query"
                        " as a single segment (ie with only one data point)"
                        ".  In this case, query should probably have been"
                        " generated with segmentTracks.py",
                        action="store_true",
                        default=False)
    parser.add_argument("--noPos", help="Do not print genomic position"
                        " (first 2 columnts)", action="store_true",
                        default=False)
    parser.add_argument("--noMask", help="Ignore mask tracks",
                        default=False, action="store_true")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # Open the output up front so a bad path fails before the (potentially
    # expensive) track loading below.
    outFile = open(args.output, "w")

    # need to remember to fix this, disable as precaution for now:
    # --segment currently requires --noMask (combination known to be broken)
    assert args.noMask is True or args.segment is False

    # read query intervals from the bed file (merged so that overlapping
    # regions are only loaded once)
    logger.info("loading query intervals from %s" % args.query)
    mergedIntervals = getMergedBedIntervals(args.query, ncol=3)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.query)

    # read the segment intervals from the (same) bed file; only needed when
    # each query interval is treated as a single data point
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.query)
        segIntervals = readBedIntervals(args.query, sort=True)

    # read all data from track xml
    logger.info("loading tracks %s" % args.tracks)
    trackData = TrackData()
    trackData.loadTrackData(args.tracks, mergedIntervals,
                            segmentIntervals=segIntervals,
                            applyMasking = not args.noMask)

    # dump the data to output
    dumpTrackData(trackData, outFile, args.map, not args.noPos)
    outFile.close()
Esempio n. 31
0
def main(argv=None):
    """Generate an HMM-usable track list from a raw track list.

    Runs, in order: cleaning scripts (cleanRM.py, cleanLtrFinder.py,
    cleanTermini.py, preceded by removeBedOverlaps.py), TSD-track
    generation (addTsdTrack.py), and track scaling (setTrackScaling.py),
    writing cleaned BED tracks and a modified tracks XML.

    :param argv: command-line arguments; defaults to sys.argv when None
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml.  Runs cleanRM.py cleanLtrFinder.py and "
        " cleanTermini.py and addTsdTrack.py and setTrackScaling.py (also runs "
        " removeBedOverlaps.py before each of the clean scripts)")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath",
                        help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo",
                        help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins",
                        help="Maximum number of bins after scaling",
                        default=10,
                        type=int)
    parser.add_argument("--scaleTracks",
                        help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.",
                        default=None)
    parser.add_argument("--skipScale",
                        help="Comma-separated list of tracks to "
                        "skip for scaling.",
                        default=None)
    parser.add_argument("--ltr_termini",
                        help="Name of termini track (appy tsd)",
                        default="ltr_termini")
    parser.add_argument("--repeat_modeler",
                        help="Name of repeat_modeler track (appy tsd)",
                        default="repeat_modeler")
    parser.add_argument("--sequence",
                        help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument(
        "--tsd",
        help="Name of tsd track to generate (appy cleanTermini.py)",
        default="tsd")
    parser.add_argument(
        "--tir",
        help="Name of tir_termini track (appy cleanTermini.py)",
        default="tir_termini")
    parser.add_argument("--noScale",
                        help="Dont do any scaling",
                        default=False,
                        action="store_true")
    parser.add_argument("--noTsd",
                        help="Dont generate TSD track.  NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of ) chaux",
                        default=False,
                        action="store_true")
    parser.add_argument("--numProc",
                        help="Number of processes to use for tsdFinder.py",
                        default=1,
                        type=int)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    # Pre-built logging option string forwarded to the child scripts so they
    # inherit this run's log level (and log file, if any).
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    # Best-effort creation of the output directory; the isdir check below
    # is the authoritative failure test.  Catch only OSError (e.g. the
    # directory already exists) rather than every exception.
    try:
        os.makedirs(args.cleanTrackPath)
    except OSError:
        pass
    if not os.path.isdir(args.cleanTrackPath):
        raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                           args.cleanTrackPath)

    # NOTE(review): suffix "xml" has no leading dot, unlike ".bed" passed to
    # getLocalTempPath elsewhere in this file -- confirm intended.
    tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")

    # Stage 1: overlap removal + cleaning scripts -> intermediate XML
    runCleaning(args, tempTracksInfo)
    assert os.path.isfile(tempTracksInfo)

    # Stage 2: generate the TSD track (no-op when --noTsd)
    runTsd(args, tempTracksInfo)

    # Stage 3: scale tracks and write the final XML (no-op when --noScale)
    runScaling(args, tempTracksInfo)

    runShellCommand("rm -f %s" % tempTracksInfo)

    cleanBedTool(tempBedToolPath)
Esempio n. 32
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Find candidate TSDs (exact forward matches) flanking given"
        "BED intervals.  Score is distance between TSD and bed interval.")
    parser.add_argument("fastaSequence", help="DNA sequence in FASTA format")
    parser.add_argument("inBed",
                        help="BED file with TEs whose flanking regions "
                        "we wish to search")
    parser.add_argument("outBed",
                        help="BED file containing (only) output TSDs")
    parser.add_argument("--min",
                        help="Minimum length of a TSD",
                        default=4,
                        type=int)
    parser.add_argument("--max",
                        help="Maximum length of a TSD",
                        default=6,
                        type=int)
    parser.add_argument("--all",
                        help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour",
                        action="store_true",
                        default=False)
    parser.add_argument("--maxScore",
                        help="Only report matches with given "
                        "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None,
                        type=int)
    parser.add_argument("--left",
                        help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=7,
                        type=int)
    parser.add_argument("--right",
                        help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=7,
                        type=int)
    parser.add_argument("--overlap",
                        help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap",
                        default=3,
                        type=int)
    parser.add_argument("--leftName",
                        help="Name of left TSDs in output Bed",
                        default="L_TSD")
    parser.add_argument("--rightName",
                        help="Name of right TSDs in output Bed",
                        default="R_TSD")
    parser.add_argument("--id",
                        help="Assign left/right pairs of TSDs a unique"
                        " matching ID",
                        action="store_true",
                        default=False)
    parser.add_argument("--names",
                        help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed",
                        default=None)
    parser.add_argument("--numProc",
                        help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file",
                        type=int,
                        default=1)
    parser.add_argument("--sequences",
                        help="Only process given sequences of input"
                        " FASTA file (comma-separated list).",
                        default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    assert os.path.exists(args.inBed)
    assert os.path.exists(args.fastaSequence)
    assert args.min <= args.max
    args.nextId = 0

    if args.sequences is not None:
        args.sequences = set(args.sequences.split(","))

    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.inBed)
    bedIntervals = readBedIntervals(args.inBed, ncol=4, sort=True)
    if bedIntervals is None or len(bedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" % args.inBed)

    if args.numProc > 1:
        runParallel(args, bedIntervals)
        return 0

    tsds = findTsds(args, bedIntervals)

    writeBedIntervals(tsds, args.outBed)