Example #1
def runParallel(args, bedIntervals):
    """ Quick hack to rerun parallel jobs on different interval subsets. """
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))

    # chunk up BED input
    numIntervals = 0
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            numIntervals += 1
    jobSize = 1 + (numIntervals / args.numProc)
    logger.info("Dviding %d intervals into %d processes (%d intervals per)" %
                (numIntervals, args.numProc, jobSize))
    tempBeds = []
    curSize = sys.maxint
    curFile = None
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            if curSize >= jobSize:
                if curFile is not None:
                    curFile.close()
                tempBed = getLocalTempPath("TempTsdFinderIn", ".bed")
                tempBeds.append(tempBed)
                curFile = open(tempBed, "w")
                curSize = 0
            curFile.write("\t".join([str(s) for s in interval]))
            curFile.write("\n")
            curSize += 1
    if curFile is not None:
        curFile.close()

    # map jobs
    assert len(tempBeds) <= args.numProc
    tempOuts = []
    jobCmds = []
    for tempBed in tempBeds:
        cmdLine = " ".join(sys.argv)
        cmdLine = cmdLine.replace("--numProc %d" % args.numProc, "--numProc 1")
        cmdLine = cmdLine.replace(args.inBed, tempBed)
        tempOut = getLocalTempPath("TempTsdFinderOut", ".bed")
        cmdLine = cmdLine.replace(args.outBed, tempOut)
        tempOuts.append(tempOut)
        jobCmds.append(cmdLine)

    runParallelShellCommands(jobCmds, args.numProc)

    # reduce
    for i, tempOut in enumerate(tempOuts):
        if i == 0:
            runShellCommand("mv %s %s" % (tempOut, args.outBed))
        else:
            runShellCommand("cat %s >> %s" % (tempOut, args.outBed))
            runShellCommand("rm -f %s" % (tempOut))
Example #2
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="another wrapper for compareBedStates.py that will compare many files"
        " and make a decent table output")

    parser.add_argument("tracksList", help="XML tracks list")
    parser.add_argument("truthBeds", help="comma-separated references to benchmark against (ex repet)")
    parser.add_argument("testBeds", help="comma-spearated test beds")
    parser.add_argument("workDir", help="folder to write comparision outputs")
    parser.add_argument("outCSV", help="path for output")
    parser.add_argument("--state", help="state name", default="TE")
    parser.add_argument("--delMask", help="see help for compareBedStates.py", default=None, type=int)
    parser.add_argument("--proc", help="number of prcesses", default=1, type=int)
    parser.add_argument("--truthNames", help="comma-separated list of truth names", default =None)
    parser.add_argument("--testNames", help="comma-separated list of test names", default =None)
    
    args = parser.parse_args()

    truths = args.truthBeds.split(",")
    tests = args.testBeds.split(",")

    if args.truthNames is not None:
        truthNames = args.truthNames.split(",")
    else:
        truthNames = [os.path.splitext(os.path.basename(x))[0] for x in truths]
    if args.testNames is not None:
        testNames = args.testNames.split(",")
    else:
        testNames = [os.path.splitext(os.path.basename(x))[0] for x in tests]

    if not os.path.isdir(args.workDir):
        runShellCommand("mkdir %s" % args.workDir)

    assert len(tests) == len(testNames)
    assert len(truths) == len(truthNames)

    compCmds = []
    for i in xrange(len(tests)):
        for j in xrange(len(truths)):
            opath = os.path.join(args.workDir, "%s_vs_%s.txt" % (testNames[i], truthNames[j]))
            flags = "--tl %s" % args.tracksList
            if args.delMask is not None:
                flags += " --delMask %d" % args.delMask
            cmd = "compareBedStates.py %s %s %s > %s" % (truths[j], tests[i], flags, opath)
            compCmds.append(cmd)

    runParallelShellCommands(compCmds, args.proc)

    # munging ############
    def prettyAcc((prec, rec), spec):
        f1 = 0.
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)        
        return ("%.4f" % prec, "%.4f" % rec, "%.4f" % f1, "%.4f" % spec)
Example #3
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursivlely launch eval. merge
    results """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    bedFiles = []
    pdFiles = []
    bicFiles = []
    edFiles = []
    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""
        
        chromPath = getLocalTempPath("Temp", ".bed")
        cpFile = open(chromPath, "w")
        cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2]))
        cpFile.close()
        
        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.bedRegions,
                                                                     chromPath,
                                                                     regionPath))

        if os.path.getsize(regionPath) < 2:
            continue
        
        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        cmdToks[3] = regionPath

        if args.bed is not None:
            bedPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--bed")+1] = bedPath
            bedFiles.append(bedPath)
        if args.pd is not None:
            pdPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--pd")+1] = pdPath
            pdFiles.append(pdPath)
        if args.ed is not None:
            edPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--ed")+1] = edPath
            edFiles.append(edPath)
        if args.bic is not None:
            bicPath = getLocalTempPath("Temp", ".bic")
            cmdToks[cmdToks.index("--bic")+1] = bicPath
            bicFiles.append(bicPath)
        cmd = " ".join(cmdToks)
        jobList.append(cmd)

    runParallelShellCommands(jobList, args.proc)

    for i in xrange(len(jobList)):
        if i == 0:
            ct = ">"
        else:
            ct = ">>"
        if len(bedFiles) > 0:
            runShellCommand("cat %s %s %s" % (bedFiles[i], ct, args.bed))
        if len(pdFiles) > 0:
            runShellCommand("cat %s %s %s" % (pdFiles[i], ct, args.pd))
        if len(edFiles) > 0:
            runShellCommand("cat %s %s %s" % (edFiles[i], ct, args.ed))
        if len(bicFiles) > 0:
            runShellCommand("cat %s %s %s" % (bicFiles[i], ct, args.bic))

    for i in itertools.chain(chromFiles, regionFiles, bedFiles, pdFiles, edFiles,
                             bicFiles):
        runShellCommand("rm %s" % i)            
Example #4
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursivlely launch eval. merge
    results """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    offset = args.co
    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""

        chromPath = getLocalTempPath("TempChromPath", ".bed")
        cpFile = open(chromPath, "w")
        cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2]))
        cpFile.close()

        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.allBed, chromPath, regionPath))

        if os.path.getsize(regionPath) < 2:
            continue

        offset += int(chrom[2]) - int(chrom[1])

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        cmdToks[2] = regionPath

        segPath = getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)

        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co") + 1] = str(offset)
        else:
            cmdToks.append("--co")
            cmdToks.append(str(offset))

        if args.stats is not None:
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats") + 1] = statsPath
            statsFiles.append(statsPath)
        cmd = " ".join(cmdToks)
        jobList.append(cmd)

    runParallelShellCommands(jobList, args.proc)

    for i in xrange(len(jobList)):
        if i == 0:
            ct = ">"
        else:
            ct = ">>"
        runShellCommand("cat %s %s %s" % (segFiles[i], ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    for i in itertools.chain(chromFiles, regionFiles, segFiles, statsFiles):
        runShellCommand("rm %s" % i)
Example #5
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Train, evalaute, then compare hmm model on input")

    parser.add_argument("trainingTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used "
                        "for training")
    parser.add_argument("outputDir", help="directory to write output")
    parser.add_argument("inBeds", nargs="*", help="list of training beds")
    parser.add_argument("--evalTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used"
                        " for evaluation (only need if different from"
                        " trainingTracksInfo", default=None)
    parser.add_argument("--numProc", help="Max number of processors to use",
                        type=int, default=1)
    parser.add_argument("--allTrackCombinations", help="Rerun with all"
                        " possible combinations of tracks from the input"
                        " tracksInfo file.  Note that this number gets big"
                        " pretty fast.", action = "store_true", default= False)
    parser.add_argument("--emStates", help="By default the supervised mode"
                        " of teHmmTrain is activated.  This option overrides"
                        " that and uses the EM mode and the given number of "
                        "states instead", type=int, default=None)
    parser.add_argument("--cross", help="Do 50/50 cross validation by training"
                        " on first half input and validating on second",
                        action="store_true", default=False)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--mod", help="Path to trained model.  This will "
                        "bypass the training phase that would normally be done"
                        " and just skip to the evaluation.  Note that the user"
                        " must make sure that the trained model has the "
                        "states required to process the input data",
                        default = None)
    parser.add_argument("--iter", help="Number of EM iterations.  Needs to be"
                        " used in conjunction with --emStates to specify EM"
                        " training",
                        type = int, default=None)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed." ,
                        default = None) 
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing a"
                        " multinomial emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default=None)    
    parser.add_argument("--mandTracks", help="Mandatory track names for use "
                        "with --allTrackCombinations in comma-separated list",
                        default=None)
    parser.add_argument("--combinationRange", help="in form MIN,MAX: Only "
                        "explore track combination in given (closed) range. "
                        "A more refined version of --allTrackCombinations.",
                        default=None)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--segment", help="Input bed files are also used to "
                        "segment data.  Ie teHmmTrain is called with --segment"
                        " set to the input file. Not currently working with "
                        " --supervised",
                        action = "store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied) in training", type=int,
                        default=None)
    parser.add_argument("--truth", help="Use specifed file instead of "
                        "input file(s) for truth comparison.  Makes sense"
                        " when --segment is specified and only one input"
                        " bed specified", default = None)
    parser.add_argument("--eval", help="Bed file used for evaluation.  It should"
                        " cover same region in same order as --truth.  Option "
                        "exists mostly to specify segmentation of --truth",
                        default=None)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of training replicates (with "
                        " different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=None, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running training replicates (see --rep) in parallel.",
                        type=int, default=None)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=None)
    parser.add_argument("--fit", help="Run fitStateNames.py to automap names"
                        " before running comparison", action="store_true",
                        default=False)
    parser.add_argument("--fitOpts", help="Options to pass to fitStateNames.py"
                        " (only effective if used with --fit)", default=None)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate.  Comparison statistics"
                        " will be generated for each rep.",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

        
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    logOps = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        logOps += " --logFile %s" % args.logFile

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)
    if args.evalTracksInfo is None:
        args.evalTracksInfo = args.trainingTracksInfo

    trainingTrackList = TrackList(args.trainingTracksInfo)
    evalTrackList = TrackList(args.evalTracksInfo)
    checkTrackListCompatible(trainingTrackList, evalTrackList)

    sizeRange = (len(trainingTrackList), len(trainingTrackList) + 1)
    if args.allTrackCombinations is True:
        sizeRange = (1, len(trainingTrackList) + 1)
    if args.combinationRange is not None:
        toks = args.combinationRange.split(",")
        sizeRange = (int(toks[0]), int(toks[1]) + 1)
        logger.debug("manual range (%d, %d) " % sizeRange)
    mandTracks = set()
    if args.mandTracks is not None:
        mandTracks = set(args.mandTracks.split(","))
        logger.debug("mandatory set %s" % str(mandTracks))
    trainFlags = ""
    if args.emStates is not None:
        trainFlags += " --numStates %d" % args.emStates
    if args.supervised is True:
        trainFlags += " --supervised"
        if args.segment is True:
            raise RuntimeError("--supervised not currently compatible with "
                               "--segment")
    trainFlags += " --emFac %d" % args.emFac
    if args.forceEmProbs is not None:
        trainFlags += " --forceEmProbs %s" % args.forceEmProbs
    if args.iter is not None:
        assert args.emStates is not None or args.initTransProbs is not None
        trainFlags += " --iter %d" % args.iter
    if args.initTransProbs is not None:
        trainFlags += " --initTransProbs %s" % args.initTransProbs
    if args.initEmProbs is not None:
        trainFlags += " --initEmProbs %s" % args.initEmProbs
    if args.fixEm is True:
        trainFlags += " --fixEm"
    if args.initStartProbs is not None:
        trainFlags += " --initStartProbs %s" % args.initStartProbs
    if args.fixStart is True:
        trainFlags += " --fixStart"
    if args.forceTransProbs is not None:
        trainFlags += " --forceTransProbs %s" % args.forceTransProbs
    if args.forceEmProbs is not None:
        trainFlags += " --forceEmProbs %s" % args.forceEmProbs
    if args.flatEm is True:
        trainFlags += " --flatEm"
    if args.emRandRange is not None:
        trainFlags += " --emRandRange %s" % args.emRandRange
    if args.segLen is not None:
        trainFlags += " --segLen %d" % args.segLen
    if args.seed is not None:
        trainFlags += " --seed %d" % args.seed
    if args.reps is not None:
        trainFlags += " --reps %d" % args.reps
    if args.numThreads is not None:
        trainFlags += " --numThreads %d" % args.numThreads
    if args.emThresh is not None:
        trainFlags += " --emThresh %f" % args.emThresh
    if args.saveAllReps is True:
        trainFlags += " --saveAllReps"
    if args.maxProb is True:
        trainFlags += " --maxProb"
    if args.transMatEpsilons is True:
        trainFlags += " --transMatEpsilons"
    if args.maxProbCut is not None:
        trainFlags += " --maxProbCut %d" % args.maxProbCut

    # write out command line for posterity's sake
    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)
    cmdPath = os.path.join(args.outputDir, "teHmmBenchmark_cmd.txt")
    cmdFile = open(cmdPath, "w")
    cmdFile.write(" ".join(argv) + "\n")
    cmdFile.close()
                           
    #todo: try to get timing for each command
    commands = []
    rows = dict()
    for pn, pList in enumerate(subsetTrackList(trainingTrackList, sizeRange,
                                               mandTracks)):
        if len(pList) == len(trainingTrackList):
            outDir = args.outputDir
        else:
            outDir = os.path.join(args.outputDir, "perm%d" % pn)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        trainingTrackPath = os.path.join(outDir, "training_tracks.xml")
        evalTrackPath = os.path.join(outDir, "eval_tracks.xml")
        for maskTrack in trainingTrackList.getMaskTracks():
            pList.addTrack(copy.deepcopy(maskTrack))
        pList.saveXML(trainingTrackPath)
        epList = TrackList()
        for track in pList:
            t = copy.deepcopy(evalTrackList.getTrackByName(track.getName()))
            epList.addTrack(t)
        for maskTrack in trainingTrackList.getMaskTracks():
            epList.addTrack(copy.deepcopy(maskTrack))
        epList.saveXML(evalTrackPath)
        
        for inBed in args.inBeds:
            
            base = os.path.basename(inBed)
            truthBed = inBed
            testBed = inBed
            if args.cross is True:
                truthBed = os.path.join(outDir,
                                        os.path.splitext(base)[0] +
                                        "_truth_temp.bed")
                testBed = os.path.join(outDir,
                                       os.path.splitext(base)[0] +
                                       "_test_temp.bed")
                splitBed(inBed, truthBed, testBed)

                                        
            
            # train
            if args.mod is not None:
                modPath = args.mod
                command = "ls %s" % modPath
            else:
                modPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + ".mod")
                command = "teHmmTrain.py %s %s %s %s %s" % (trainingTrackPath,
                                                            truthBed,
                                                            modPath,
                                                            logOps,
                                                            trainFlags)
                if args.segment is True:
                    command += " --segment %s" % truthBed

            # view
            viewPath = os.path.join(outDir,
                                   os.path.splitext(base)[0] + "_view.txt")
            command += " && teHmmView.py %s > %s" % (modPath, viewPath)

            # evaluate
            numReps = 1
            if args.reps is not None and args.saveAllReps is True:
                numReps = args.reps
                assert numReps > 0
            missed = 0
            # little hack to repeat evaluation for each training replicate
            for repNum in xrange(-1, numReps-1):
                if repNum == -1:
                    repSuffix = ""
                else:
                    repSuffix = ".rep%d" % repNum                
                evalBed = os.path.join(outDir,
                                       os.path.splitext(base)[0] + "_eval.bed" +
                                       repSuffix)
                hmmEvalInputBed = testBed
                if args.eval is not None:
                    hmmEvalInputBed = args.eval
                bicPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + "_bic.txt" +
                                       repSuffix)

                command += " && teHmmEval.py %s %s %s --bed %s %s --bic %s" % (
                    evalTrackPath,
                    modPath + repSuffix,
                    hmmEvalInputBed,
                    evalBed,
                    logOps,
                    bicPath)

                if args.segment is True:
                    command += " --segment"

                # fit
                compTruth = testBed
                if args.truth is not None:
                    compTruth = args.truth
                compareInputBed = evalBed
                if args.fit is True:
                    fitBed = os.path.join(outDir,
                                          os.path.splitext(base)[0] + "_eval_fit.bed" +
                                          repSuffix)
                    command += " && fitStateNames.py %s %s %s --tl %s" % (compTruth,
                                                                          evalBed,
                                                                          fitBed,
                                                                          evalTrackPath)
                    if args.fitOpts is not None:
                        command += " " + args.fitOpts
                    compareInputBed = fitBed

                # compare
                compPath = os.path.join(outDir,
                                        os.path.splitext(base)[0] + "_comp.txt" +
                                        repSuffix)
                command += " && compareBedStates.py %s %s --tl %s > %s" % (
                    compTruth,
                    compareInputBed,
                    evalTrackPath,
                    compPath)
            

                # make table row
                if repSuffix == "":
                    rowPath = os.path.join(outDir,
                                           os.path.splitext(base)[0] + "_row.txt")
                    if inBed in rows:
                        rows[inBed].append(rowPath)
                    else:
                        rows[inBed] = [rowPath]
                    command += " && scrapeBenchmarkRow.py %s %s %s %s %s" % (
                        args.trainingTracksInfo,
                        trainingTrackPath,
                        evalBed,
                        compPath,
                        rowPath)

            # remember command
            inCmdPath = os.path.join(outDir,
                                    os.path.splitext(base)[0] + "_cmd.txt")
            inCmdFile = open(inCmdPath, "w")
            inCmdFile.write(command + "\n")
            inCmdFile.close()
            commands.append(command)
            
    runParallelShellCommands(commands, args.numProc)
    writeTables(args.outputDir, rows)
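splitBed, used above for the --cross option, is not defined on this page. From context it divides an input bed 50/50 into a training ("truth") half and a validation ("test") half; below is a hypothetical minimal version that splits by record count (the real one may well split by bases instead).

# Hypothetical minimal splitBed for the 50/50 cross-validation above:
# first half of the records to the truth file, second half to the test file.
def splitBed(inBed, truthBed, testBed):
    lines = [line for line in open(inBed) if len(line.strip()) > 0]
    half = len(lines) // 2
    with open(truthBed, "w") as truthFile:
        truthFile.writelines(lines[:half])
    with open(testBed, "w") as testFile:
        testFile.writelines(lines[half:])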
Example #6
interpolateFlags =  "--tgts TE --maxLen %d %s" % (maskK, logOpts)
compIdx = 0 #base
#compIdx = 1 #interval
#compIdx = 2 #weightintervs

#####################

# segment ##########
trainSegPath = "train_segments.bed"
evalSegPath = "eval_segments.bed"
if startPoint <= 1:
    cmdTrain = "segmentTracks.py %s %s %s %s --stats %s" % (segTracksPath, trainRegionPath, trainSegPath, segOpts,
                                                            trainSegPath.replace(".bed", ".segStats"))
    cmdEval = "segmentTracks.py %s %s %s %s --stats %s" % (segTracksPath, evalRegionPath, evalSegPath, segOpts,
                                                           evalSegPath.replace(".bed", ".segStats"))
    runParallelShellCommands([cmdEval, cmdTrain], 2)

# train ############
modelPath = "hmm.mod"
if startPoint <= 2:
    cmd = "teHmmTrain.py %s %s %s %s" % (trainTracksPath, trainSegPath, modelPath, logOpts)
    cmd += " --fixStart"
    cmd += " --segLen %d" % segLen
    cmd += " --numStates %d" % numStates
    cmd += " --reps %d --numThreads %d" % (threads, threads)
    cmd += " --emThresh %f" % thresh
    cmd += " --iter %d" % iter
    cmd += " --segment %s" % trainSegPath
    runShellCommand(cmd)

# eval ############
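This snippet (and Example #8 below) comes from a driver script that uses a startPoint variable as a crude checkpoint: each pipeline stage is gated on its number, so a crashed run can be resumed partway through by raising startPoint. The pattern in isolation (stage names here are hypothetical):

# The startPoint checkpoint convention used above:
startPoint = 3                    # e.g. resume at stage 3 after a crash
stages = [(1, "segment"), (2, "train"), (3, "eval"), (4, "fit")]
for number, name in stages:
    if startPoint <= number:
        print("running stage %d: %s" % (number, name))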
Example #7
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        " Generate some accuracy results.  To be used on output of statesVsBic.py"
        "(or some set of hmm prediction beds of the form *_trainsize.stateNum.bed"
    )

    parser.add_argument("tracksList", help="XML tracks list")
    parser.add_argument("truthBed",
                        help="reference to benchmark against (ex repet)")
    parser.add_argument("fitBed", help="predition to fit against (ex modeler)")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("beds",
                        help="one or more bed files to evaluate",
                        nargs="*")
    parser.add_argument("--proc",
                        help="number of parallel processes",
                        type=int,
                        default=1)
    parser.add_argument("--maskGap",
                        help="interpolate masked gaps smaller than this",
                        type=int,
                        default=5000)
    parser.add_argument("--exploreFdr",
                        help="try a bunch of fdr values",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--compWindow",
        help="intersect with this file before running comparison",
        default=None)

    args = parser.parse_args()

    # preloop to check files
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    outFile = open(os.path.join(args.outDir, "accuracy.csv"), "w")

    truthBed = args.truthBed
    if args.compWindow is not None:
        truthBed = os.path.join(args.outDir, "clippedTruth.bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.truthBed, args.compWindow, truthBed))

    if args.exploreFdr is True:
        fdrs = [
            0, .05, .1, .15, .20, .25, .30, .35, .40, .45, .50, .55, .60, .65,
            .70, .75, .80, .85, .90, .95, 1
        ]
    else:
        fdrs = [.65]

    # do two kinds of fitting vs modeler
    fitCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOut = os.path.join(
            args.outDir,
            os.path.basename(bed).replace(".bed", "_fit.bed"))
        fitLog = fitOut.replace(".bed", "_log.txt")
        cmd = "fitStateNames.py %s %s %s --tl %s --tgt TE --qualThresh 0.1 --logDebug --logFile %s" % (
            args.fitBed, bed, fitOut, args.tracksList, fitLog)
        fitCmds.append(cmd)
        for fdr in fdrs:
            fitOutFdr = fitOut.replace(".bed", "Fdr%f.bed" % fdr)
            fitLogFdr = fitOutFdr.replace(".bed", "_log.txt")
            cmdFdr = "fitStateNames.py %s %s %s --tl %s --tgt TE --fdr %f --logDebug --logFile %s" % (
                args.fitBed, bed, fitOutFdr, args.tracksList, fdr, fitLogFdr)
            fitCmds.append(cmdFdr)

    # interpolate the gaps
    interpolateCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOut = os.path.join(
            args.outDir,
            os.path.basename(bed).replace(".bed", "_fit.bed"))
        fitOutMI = os.path.join(
            args.outDir,
            os.path.basename(bed).replace(".bed", "_fitMI.bed"))
        cmd = "interpolateMaskedRegions.py %s %s %s %s --maxLen %d" % (
            args.tracksList, args.truthBed, fitOut, fitOutMI, args.maskGap)
        interpolateCmds.append(cmd)

        for fdr in fdrs:
            fitOutFdr = fitOut.replace(".bed", "Fdr%f.bed" % fdr)
            fitOutFdrMI = fitOutMI.replace(".bed", "Fdr%f.bed" % fdr)
            cmdFdr = "interpolateMaskedRegions.py %s %s %s %s --maxLen %d" % (
                args.tracksList, args.truthBed, fitOutFdr, fitOutFdrMI,
                args.maskGap)
            interpolateCmds.append(cmdFdr)

    # run the comparison
    compareCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOutMI = os.path.join(
            args.outDir,
            os.path.basename(bed).replace(".bed", "_fitMI.bed"))
        comp = os.path.join(args.outDir,
                            os.path.basename(bed).replace(".bed", "_comp.txt"))
        cmd = ""
        fitOutMIClipped = fitOutMI
        if args.compWindow is not None:
            fitOutMIClipped = fitOutMI.replace(".bed", "_clipped.bed")
            cmd += "intersectBed -a %s -b %s | sortBed > %s && " % (
                fitOutMI, args.compWindow, fitOutMIClipped)
        cmd += "compareBedStates.py %s %s --tl %s --delMask %d > %s" % (
            args.truthBed, fitOutMIClipped, args.tracksList, args.maskGap,
            comp)
        compareCmds.append(cmd)
        for fdr in fdrs:
            fitOutFdrMI = fitOutMI.replace(".bed", "Fdr%f.bed" % fdr)
            compFdr = comp.replace(".txt", "Fdr%f.txt" % fdr)
            cmdFdr = ""
            fitOutFdrMIClipped = fitOutFdrMI
            if args.compWindow is not None:
                fitOutFdrMIClipped = fitOutFdrMI.replace(
                    ".bed", "_clipped.bed")
                cmdFdr += "intersectBed -a %s -b %s | sortBed > %s &&" % (
                    fitOutFdrMI, args.compWindow, fitOutFdrMIClipped)
            cmdFdr += "compareBedStates.py %s %s --tl %s --delMask %d > %s" % (
                args.truthBed, fitOutFdrMIClipped, args.tracksList,
                args.maskGap, compFdr)
            compareCmds.append(cmdFdr)

    runParallelShellCommands(fitCmds, args.proc)
    runParallelShellCommands(interpolateCmds, args.proc)
    runParallelShellCommands(compareCmds, args.proc)
    # got a weird crash before where comp file wasn't found
    # maybe this will help?
    runShellCommand("sleep 10")

    # munging ############
    def prettyAcc((prec, rec), spec):
        f1 = 0.
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)
        return "%.4f, %.4f, %.4f, %.4f" % (prec, rec, f1, spec)
Example #8
cutInput(genomePath, regions, truthPaths, modelerPath, outDir, cutTrackPath)

segmentCmds = segmentCommands(genomePath, regions, outDir, segOpts, tracksPath)        

trainCmds = trainCommands(genomePath, regions, outDir, tracksPath250,
                              segLen, numStates, trainThreads, thresh, numIter)

evalCmds = evalCommands(genomePath, regions, outDir, tracksPath250)

fitCmds = fitCommands(genomePath, regions, outDir, modelerPath, truthPaths, fitFlags)

compareCmds = compareCommands(genomePath, regions, outDir, modelerPath,
                              truthPaths)
if startPoint <= 1:
    print segmentCmds
    runParallelShellCommands(segmentCmds, numParallelBatch)
if startPoint <= 2:
    print trainCmds
    runParallelShellCommands(trainCmds, max(1, numParallelBatch /
                                            max(1, trainThreads/2)))
if startPoint <= 3:
    print evalCmds
    runParallelShellCommands(evalCmds, numParallelBatch)
if startPoint <= 4:
    print fitCmds
    runParallelShellCommands(fitCmds, numParallelBatch)
if startPoint <= 5:
    print "\n".join(compareCmds)
    runParallelShellCommands(compareCmds, numParallelBatch)
if startPoint <= 6:
    harvestStats(genomePath, regions, outDir, modelerPath, truthPaths, "stats_base", 0)
Example #9
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument(
        "trainingBeds",
        help="comma-separated list of training regions"
        " (training region size will be a variable in output table). "
        "if segmentation is activated, these must also be the "
        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states",
                        help="comma separated-list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps",
                        help="number of replicates",
                        type=int,
                        default=1)
    parser.add_argument("--proc",
                        help="maximum number of processors to use"
                        " in parallel",
                        type=int,
                        default=1)
    parser.add_argument("--resume",
                        help="try not to rewrite existing files",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--initTrans",
        help="the states argument is overridden"
        " to specify a list of transition initialization files "
        "instead of state numbers",
        action="store_true",
        default=False)
    parser.add_argument("--numReps",
                        help="the states argument is overridden"
                        " to specifiy a list of replicate numbers (--reps)"
                        " arguments",
                        action="store_true",
                        default=False)
    parser.add_argument("--numIter",
                        help="the states argument is overridden"
                        " to specifiy a list of iteration counts (--iter)"
                        " arugments",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(
                    args.outDir, "hmm_%d.%d.%d.%d.mod" %
                    (trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[
                        states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        statesColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(
                    args.outDir, "hmm_%d.%d.%d.%d.mod" %
                    (trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except Exception:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" %
                            (trainingBed, int(trainingSize), int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" %
                                (np.mean(bics), np.min(bics), np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()

    cleanBedTool(tempBedToolPath)
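For reference, the quantity being tabulated is the Bayesian Information Criterion, BIC = k ln(n) - 2 ln(L), where k is the number of free model parameters, n the number of observations, and L the maximized likelihood; lower is better. teHmmEval.py writes the value itself (via --bic above), but the formula is easy to sanity-check standalone:

import math

def bic(numParams, numObs, logLikelihood):
    # BIC = k * ln(n) - 2 * ln(L); the values below are made up.
    return numParams * math.log(numObs) - 2.0 * logLikelihood

print("%.2f" % bic(10, 100000, -50000.0))   # 100115.13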
Ejemplo n.º 11
0
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Generate some accuracy results.  To be used on output of statesVsBic.py"
        "(or some set of hmm prediction beds of the form *_trainsize.stateNum.bed")

    parser.add_argument("tracksList", help="XML tracks list")
    parser.add_argument("truthBed", help="reference to benchmark against (ex repet)")
    parser.add_argument("fitBed", help="predition to fit against (ex modeler)")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("beds", help="one or more bed files to evaluate", nargs="*")
    parser.add_argument("--proc", help="number of parallel processes", type=int, default=1)
    parser.add_argument("--maskGap", help="interpolate masked gaps smaller than this", type=int, default=5000)
    parser.add_argument("--exploreFdr", help="try a bunch of fdr values", action="store_true", default=False)
    parser.add_argument("--compWindow", help="intersect with this file before running comparison", default=None)

    args = parser.parse_args()

    # preloop to check files
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        
    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    outFile = open(os.path.join(args.outDir, "accuracy.csv"), "w")

    truthBed = args.truthBed
    if args.compWindow is not None:
        truthBed = os.path.join(args.outDir, "clippedTruth.bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.truthBed, args.compWindow, truthBed))

    if args.exploreFdr is True:
        fdrs = [0, .05, .1, .15, .20, .25, .30, .35, .40, .45, .50, .55, .60, .65, .70, .75, .80, .85, .90, .95, 1]
    else:
        fdrs = [.65]

    # do two kinds of fitting vs modeer
    fitCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOut = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_fit.bed"))
        fitLog = fitOut.replace(".bed", "_log.txt")
        cmd = "fitStateNames.py %s %s %s --tl %s --tgt TE --qualThresh 0.1 --logDebug --logFile %s" % (args.fitBed, bed, fitOut, args.tracksList, fitLog)
        fitCmds.append(cmd)
        for fdr in fdrs:
            fitOutFdr = fitOut.replace(".bed", "Fdr%f.bed" % fdr)
            fitLogFdr = fitOutFdr.replace(".bed", "_log.txt")
            cmdFdr = "fitStateNames.py %s %s %s --tl %s --tgt TE --fdr %f --logDebug --logFile %s" % (args.fitBed, bed, fitOutFdr, args.tracksList, fdr, fitLogFdr)
            fitCmds.append(cmdFdr)

    # interpolate the gaps
    interpolateCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOut = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_fit.bed"))
        fitOutMI = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_fitMI.bed"))
        cmd = "interpolateMaskedRegions.py %s %s %s %s --maxLen %d" % (args.tracksList, args.truthBed, fitOut, fitOutMI, args.maskGap)
        interpolateCmds.append(cmd)

        for fdr in fdrs:
            fitOutFdr = fitOut.replace(".bed", "Fdr%f.bed" % fdr)        
            fitOutFdrMI = fitOutMI.replace(".bed", "Fdr%f.bed" % fdr)
            cmdFdr = "interpolateMaskedRegions.py %s %s %s %s --maxLen %d" % (args.tracksList, args.truthBed, fitOutFdr, fitOutFdrMI, args.maskGap)
            interpolateCmds.append(cmdFdr)

    # run the comparison
    compareCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOutMI = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_fitMI.bed"))
        comp = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_comp.txt"))
        cmd = ""
        fitOutMIClipped = fitOutMI
        if args.compWindow is not None:
            fitOutMIClipped = fitOutMI.replace(".bed", "_clipped.bed")
            cmd += "intersectBed -a %s -b %s | sortBed > %s && " % (fitOutMI, args.compWindow, fitOutMIClipped)
        cmd += "compareBedStates.py %s %s --tl %s --delMask %d > %s" % (args.truthBed, fitOutMIClipped, args.tracksList, args.maskGap, comp)
        compareCmds.append(cmd)
        for fdr in fdrs:
            fitOutFdrMI = fitOutMI.replace(".bed", "Fdr%f.bed" % fdr)
            compFdr = comp.replace(".txt", "Fdr%f.txt" % fdr)
            cmdFdr = ""
            fitOutFdrMIClipped = fitOutFdrMI
            if args.compWindow is not None:
                fitOutFdrMIClipped = fitOutFdrMI.replace(".bed", "_clipped.bed")
                cmdFdr += "intersectBed -a %s -b %s | sortBed > %s &&" % (fitOutFdrMI, args.compWindow, fitOutFdrMIClipped)
            cmdFdr += "compareBedStates.py %s %s --tl %s --delMask %d > %s" % (args.truthBed, fitOutFdrMIClipped, args.tracksList, args.maskGap, compFdr)
            compareCmds.append(cmdFdr)
    
    runParallelShellCommands(fitCmds, args.proc)
    runParallelShellCommands(interpolateCmds, args.proc)
    runParallelShellCommands(compareCmds, args.proc)
    # a sporadic crash occurred before when a comp file wasn't found right away;
    # a short sleep may give the filesystem time to catch up
    runShellCommand("sleep 10")

    # munging ############
    def prettyAcc((prec, rec), spec):
        f1 = 0.
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)        
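        # e.g. prettyAcc((0.8, 0.6), 0.9) -> "0.8000, 0.6000, 0.6857, 0.9000"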
        return "%.4f, %.4f, %.4f, %.4f" % (prec, rec, f1, spec)
Example #12
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument("trainingBeds", help="comma-separated list of training regions"
                        " (training region size will be a variable in output table). "
                        "if segmentation is activated, these must also be the "
                        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma separated-list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type = int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type = int, default = 1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is overridden"
                        " to specify a list of transition initialization files "
                        "instead of state numbers", action="store_true",
                        default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specifiy a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specifiy a list of iteration counts (--iter)"
                        " arugments", action="store_true", default=False)
                        
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
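    # teHmmTrain may itself run --numThreads worker threads, so the process
    # budget for the training commands is divided by trainProcs further down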
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
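        # each transition file has "FromState ToState Probability" rows; the
        # model size is the number of distinct state names that appear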
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack: different inputs may have the same size, so their
        # corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)
            
    # run the training            
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        statesColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" % stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize,trainingBed) in zip(trainingSizes, trainingBeds):
        # hack: different inputs may have the same size, so their
        # corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except:
                    logger.warning("Coudn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize), int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()
            
    cleanBedTool(tempBedToolPath)
Example #13
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "another wrapper for compareBedStates.py that will compare many files"
        " and make a decent table output")

    parser.add_argument("tracksList", help="XML tracks list")
    parser.add_argument(
        "truthBeds",
        help="comma-separated references to benchmark against (ex repet)")
    parser.add_argument("testBeds", help="comma-spearated test beds")
    parser.add_argument("workDir", help="folder to write comparision outputs")
    parser.add_argument("outCSV", help="path for output")
    parser.add_argument("--state", help="state name", default="TE")
    parser.add_argument("--delMask",
                        help="see help for compareBedStates.py",
                        default=None,
                        type=int)
    parser.add_argument("--proc",
                        help="number of prcesses",
                        default=1,
                        type=int)
    parser.add_argument("--truthNames",
                        help="comma-separated list of truth names",
                        default=None)
    parser.add_argument("--testNames",
                        help="comma-separated list of test names",
                        default=None)

    args = parser.parse_args()

    truths = args.truthBeds.split(",")
    tests = args.testBeds.split(",")

    if args.truthNames is not None:
        truthNames = args.truthNames.split(",")
    else:
        truthNames = [os.path.splitext(os.path.basename(x))[0] for x in truths]
    if args.testNames is not None:
        testNames = args.testNames.split(",")
    else:
        testNames = [os.path.splitext(os.path.basename(x))[0] for x in tests]

    if not os.path.isdir(args.workDir):
        runShellCommand("mkdir %s" % args.workDir)

    assert len(tests) == len(testNames)
    assert len(truths) == len(truthNames)

    compCmds = []
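    # compare every test bed against every truth bed (full cross product)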
    for i in xrange(len(tests)):
        for j in xrange(len(truths)):
            opath = os.path.join(
                args.workDir, "%s_vs_%s.txt" % (testNames[i], truthNames[j]))
            flags = "--tl %s" % args.tracksList
            if args.delMask is not None:
                flags += " --delMask %d" % args.delMask
            cmd = "compareBedStates.py %s %s %s > %s" % (truths[j], tests[i],
                                                         flags, opath)
            compCmds.append(cmd)

    runParallelShellCommands(compCmds, args.proc)

    # munging ############
    def prettyAcc((prec, rec), spec):
        f1 = 0.
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)
        return ("%.4f" % prec, "%.4f" % rec, "%.4f" % f1, "%.4f" % spec)
Example #14
#compIdx = 1 #interval
#compIdx = 2 #weightintervs

#####################

# segment ##########
trainSegPath = "train_segments.bed"
evalSegPath = "eval_segments.bed"
if startPoint <= 1:
    cmdTrain = "segmentTracks.py %s %s %s %s --stats %s" % (
        segTracksPath, trainRegionPath, trainSegPath, segOpts,
        trainSegPath.replace(".bed", ".segStats"))
    cmdEval = "segmentTracks.py %s %s %s %s --stats %s" % (
        segTracksPath, evalRegionPath, evalSegPath, segOpts,
        evalSegPath.replace(".bed", ".segStats"))
    runParallelShellCommands([cmdEval, cmdTrain], 2)

# train ############
modelPath = "hmm.mod"
if startPoint <= 2:
    cmd = "teHmmTrain.py %s %s %s %s" % (trainTracksPath, trainSegPath,
                                         modelPath, logOpts)
    cmd += " --fixStart"
    cmd += " --segLen %d" % segLen
    cmd += " --numStates %d" % numStates
    cmd += " --reps %d --numThreads %d" % (threads, threads)
    cmd += " --emThresh %f" % thresh
    cmd += " --iter %d" % iter
    cmd += " --segment %s" % trainSegPath
    runShellCommand(cmd)
Example #15
segmentCmds = segmentCommands(genomePath, regions, outDir, segOpts, tracksPath)

trainCmds = trainCommands(genomePath, regions, outDir, tracksPath250, segLen,
                          numStates, trainThreads, thresh, numIter)

evalCmds = evalCommands(genomePath, regions, outDir, tracksPath250)

fitCmds = fitCommands(genomePath, regions, outDir, modelerPath, truthPaths,
                      fitFlags)

compareCmds = compareCommands(genomePath, regions, outDir, modelerPath,
                              truthPaths)
if startPoint <= 1:
    print segmentCmds
    runParallelShellCommands(segmentCmds, numParallelBatch)
if startPoint <= 2:
    print trainCmds
    runParallelShellCommands(
        trainCmds, max(1, numParallelBatch / max(1, trainThreads / 2)))
if startPoint <= 3:
    print evalCmds
    runParallelShellCommands(evalCmds, numParallelBatch)
if startPoint <= 4:
    print fitCmds
    runParallelShellCommands(fitCmds, numParallelBatch)
if startPoint <= 5:
    print "\n".join(compareCmds)
    runParallelShellCommands(compareCmds, numParallelBatch)
if startPoint <= 6:
    harvestStats(genomePath, regions, outDir, modelerPath, truthPaths,
Example #16
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train model on.  If --supervised "
                        "is used, the names in this bed file will be treated "
                        "as the true annotation (otherwise it is only used for "
                        "interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type = int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type = int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--cfg", help="Use Context Free Grammar insead of "
                        "HMM.  Only works with --supervised for now",
                        action = "store_true", default = False)
    parser.add_argument("--saPrior", help="Confidence in self alignment "
                        "track for CFG.  Probability of pair emission "
                        "is multiplied by this number if the bases are aligned"
                        " and its complement if bases are not aligned. Must"
                        " be between [0,1].", default=0.95, type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as pair-emitors"
                        " for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
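    # an --initTransProbs file might look like this (state names illustrative):
    #   TE     TE     0.9
    #   TE     Other  0.1
    #   Other  Other  0.99
    #   Other  TE     0.01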
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after each training iteration"
                        " (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed)" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after each training iteration "
                        "(unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed.)" ,
                        default = None) 
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing an"
                        " emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as "
                        "single columns for HMM (ie as created with "
                        "segmentTracks.py).  IMPORTANT: this file must cover "
                        "the same regions as the traininBed file. Unless in "
                        "supervised mode, probably best to use same bed file "
                        " as both traingBed and --segment argument.  Otherwise"
                        " use intersectBed to make sure the overlap is exact",
                        default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --rep) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    if args.initTransProbs is not None or args.fixTrans is True or\
      args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--initTransProbs, --fixTrans, --initEmProbs, "
                               "--fixEm are not currently compatible with --cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
      and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
       args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and/or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction with"
                           " --initTransProbs")
    if args.emRandRange is not None:
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        except:
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here, now overridden by the options above
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except:
            raise RuntimeError("bed file passed with --segments option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    if args.segLen <= 0:
        args.segLen = None
    if args.segLen is not None and args.segLen != 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction).  Values > 1 may cause bias.")

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overridden by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overridden by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)]
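    # the first seed is random or user-supplied; the rest are drawn from
    # Python's RNG, so a fixed --seed makes every replicate reproducible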

    def trainClosure(randomSeed):
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans, truthIntervals=truthIntervals,
                          args=args)
    
    modelList = runParallelShellCommands(argList=seeds, numProc = args.numThreads,
                                         execFunction = trainClosure,
                                         useThreads = True)
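    # note: runParallelShellCommands is used here with execFunction to map a
    # Python closure over the seed list using worker threads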

    # select best model
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] > bestModel[1]:
            bestModel = curModel
        if curModel[1] is not None:
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]
        
    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)                
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
Example #17
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursivlely launch eval. merge
    results """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    offset = args.co
    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""
        
        chromPath = getLocalTempPath("TempChromPath", ".bed")
        cpFile = open(chromPath, "w")
        cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2]))
        cpFile.close()
        
        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.allBed,
                                                                     chromPath,
                                                                     regionPath))

        if os.path.getsize(regionPath) < 2:
            # empty intersection: clean up this chromosome's temp files and skip it
            runShellCommand("rm -f %s %s" % (regionPath, chromPath))
            continue

        offset += int(chrom[2]) - int(chrom[1])
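        # advance the running offset handed to each child job via --co so
        # that (presumably) per-chunk outputs are numbered in disjoint ranges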
        
        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        cmdToks[2] = regionPath

        segPath =  getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)

        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co")+1] = str(offset)
        else:
            cmdToks.append("--co")
            cmdToks.append(str(offset))
        
        if args.stats is not None:
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats")+1] = statsPath
            statsFiles.append(statsPath)
        cmd = " ".join(cmdToks)
        jobList.append(cmd)

    runParallelShellCommands(jobList, args.proc)

    for i in xrange(len(jobList)):
        if i == 0:
            ct = ">"
        else:
            ct = ">>"
        runShellCommand("cat %s %s %s" % (segFiles[i], ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    for i in itertools.chain(chromFiles, regionFiles, segFiles, statsFiles):
        runShellCommand("rm %s" % i)