Example #1
def buildSeqTable(args, bedIntervals):
    """build table of sequence indexes from input bed file to quickly read 
    while sorting.  Table maps sequence name to range of indexes in 
    bedIntervals.  This only works if bedIntervals are sorted (and should 
    raise an assertion error if that's not the case. 
    """
    logger.info("building sequence name index of %d bed intervals" %
                 len(bedIntervals))
    bedSeqTable = dict()
    prevName = None
    prevIdx = 0
    for i, interval in enumerate(bedIntervals):
        seqName = interval[0]
        if seqName != prevName:
            assert seqName not in bedSeqTable
            if prevName is not None:
                bedSeqTable[prevName] = (prevIdx, i)
                prevIdx = i
        prevName = seqName

    seqName = bedIntervals[-1][0]
    assert seqName not in bedSeqTable
    bedSeqTable[seqName] = (prevIdx, len(bedIntervals))
    logger.debug("index has %d unique sequences" % len(bedSeqTable))
    return bedSeqTable
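
A minimal usage sketch for the function above, with toy intervals.  The args
stand-in is hypothetical (buildSeqTable never reads it in this version), and a
module-level logger is assumed to exist as in the original source:

import logging
logger = logging.getLogger("demo")

class _Args(object):
    pass

# toy, pre-sorted bed intervals: (seqName, start, end)
bedIntervals = [("chr1", 0, 10), ("chr1", 10, 25), ("chr2", 5, 9)]
seqTable = buildSeqTable(_Args(), bedIntervals)
assert seqTable == {"chr1": (0, 2), "chr2": (2, 3)}

# each (start, end) index pair slices directly back into bedIntervals
lo, hi = seqTable["chr2"]
assert bedIntervals[lo:hi] == [("chr2", 5, 9)]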
Example #2
def filterStateMap(stateMap, args):
    """ Make sure ignored states are ignored.  Apply unique id suffixes is necessary.
    Make sure that quality threshold is high enough or else ignore too.  map is
    filtered in place."""
    mapCounts = dict()
    for name, mapVal in stateMap.items():
        assert len(mapVal) == 3
        mapName, mapCount, mapTotal = mapVal
        qual = float(mapCount) / float(mapTotal)
        if name in args.ignore or qual < args.qualThresh:
            # set map such that it won't be changed
            logger.debug("Ignoring state %s with quality %f" % (name, qual))
            stateMap[name] = (name, 1, 1)
        elif args.unique:
            if mapName not in mapCounts:
                mapCounts[mapName] = 1
            else:
                mapCounts[mapName] += 1

    # don't want to rename if there is only 1 instance
    for name, count in mapCounts.items():
        if count == 1:
            mapCounts[name] = 0

    # 2nd pass to assign the unique ids (range from 1 to count)
    for name, mapVal in stateMap.items():
        mapName, mapCount, mapTotal = mapVal
        if mapName in mapCounts:
            count = mapCounts[mapName]
            if count > 0:
                newName = mapName + ".%d" % count
                logger.debug("Mapping %s to %s" % (mapName, newName))
                stateMap[name] = (newName, mapCount, mapTotal)
                mapCounts[mapName] -= 1
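
A quick, self-contained illustration of the filtering above.  The Namespace is
a hypothetical stand-in for the parsed command-line options (ignore,
qualThresh, and unique are the only attributes read), the state names are made
up, and a module-level logger is again assumed:

from argparse import Namespace

args = Namespace(ignore=set(), qualThresh=0.5, unique=True)

# two predicted states both mapping to "TE", plus one low-quality state
stateMap = {"pred1": ("TE", 9, 10),    # quality 0.9
            "pred2": ("TE", 8, 10),    # quality 0.8
            "pred3": ("DNA", 2, 10)}   # quality 0.2 -> ignored
filterStateMap(stateMap, args)

assert stateMap["pred3"] == ("pred3", 1, 1)   # pinned so it won't be changed
assert {stateMap["pred1"][0], stateMap["pred2"][0]} == {"TE.1", "TE.2"}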
Example #3
def applyNamesToModel(stateMap, modelPath):
    """ change a given HMM model to use the new state names"""
    # load model created with teHmmTrain.py
    logger.debug("loading model %s" % modelPath)
    model = loadModel(modelPath)
    modelMap = model.getStateNameMap()
    raise RuntimeError("Not Implemented")
Example #4
def computeScale(data, numBins, noLog):
    """ very simple heuristic to compute reasonable scaling"""
    minVal, maxVal = np.amin(data), np.amax(data)
    valRange = maxVal - minVal
    logger.debug("Min=%f Max=%f" % (minVal, maxVal))

    # NOTE: the -2.0 used when computing the linear binSize and logBase
    # is a very conservative measure to ensure that we don't under-bin
    # on either side due to rounding.  It can result in bins that are
    # too large when, say, the binSize divides evenly into the range.
    # Should be optimized down the road when there is more time.

    # try linear scale
    binSize = float(valRange) / float(numBins - 2.0)
    minBin = np.floor(minVal / binSize) * binSize
    linearBins = [minBin] * numBins
    for i in xrange(1, numBins):
        linearBins[i] = linearBins[i-1] + binSize
    logger.debug("Linear bins %s" % linearBins)
    linearVar = histVariance(data, sorted(linearBins))
    linearScale = 1.0 / binSize
    logger.debug("Linear scale=%f has variance=%f" % (linearScale, linearVar))
    
    # try log scale
    logVar = sys.maxint

    # shift parameter is a constant that gets added before log scaling
    # to make sure that we always deal with positive numbers
    shift = 0.
    if minVal <= 0.:
        shift = 1.0 - minVal
        data += shift
        minVal += shift
        maxVal += shift

    ratio = float(maxVal) / float(minVal)
    logBase = np.power(ratio, 1. / float(numBins - 2.00))
    minBin = np.power(logBase, np.floor(np.log(minVal) / np.log(logBase)))
    logBins = [minBin] * numBins
    for i in xrange(1, numBins):
        logBins[i] = logBins[i-1] * logBase
    logger.debug("Log bins %s" % logBins)
    logVar = histVariance(data, sorted(logBins), fromLog = True)
    logger.debug("Log base=%f has variance=%f" % (logBase, logVar))

    ret = "scale", linearScale, 0.
    if logVar < linearVar and noLog is False:
        ret = "logScale", logBase, shift
    return ret
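
Downstream, the returned triple presumably discretizes track values.  The
mapping below is a hedged sketch consistent with the parameters above (scale =
1/binSize for the linear case; logBase and shift for the log case), not the
repo's actual binning code:

import numpy as np

def binValue(value, scaleType, scaleParam, shift):
    # hypothetical application of a (scaleType, param, shift) triple
    if scaleType == "scale":
        return int(np.floor(value * scaleParam))       # linear bins
    return int(np.floor(np.log(value + shift) / np.log(scaleParam)))  # log bins

# e.g. with logBase 2 and no shift, values 1..7 collapse into 3 bins
assert [binValue(v, "logScale", 2.0, 0.) for v in (1, 2, 3, 4, 7)] == [0, 1, 1, 2, 2]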
Example #5
def readTrackIntoFloatArray(track, allIntervals):
    """ use the track API to directly read an entire data file into memory
    as an array of floats.  If the track has an associated defaultVal, it will
    be used to cover all gaps in the coverage.  If not, only annotated values
    will be kept"""
    defaultVal = track.getDefaultVal()
    hasDefault = defaultVal is not None
    floatType = np.float32
    if not hasDefault:
        defaultVal = np.finfo(floatType).max
    else:
        # sanity check: we assume that no one ever actually uses this value
        defaultVal = floatType(defaultVal)
        assert defaultVal != np.finfo(floatType).max
        assert not np.isinf(defaultVal)
    readBuffers = []
    totalLen = 0
    for interval in allIntervals:
        logger.debug("Allocating track array of size %d" % (
             interval[2] - interval[1]))
        buf = defaultVal + np.zeros((interval[2] - interval[1]), dtype=floatType)
        buf = readTrackData(track.getPath(), interval[0], interval[1],
                            interval[2], outputBuf=buf,
                            valCol=track.getValCol(),
                            useDelta=track.getDelta)
        readBuffers.append(buf)
        totalLen += len(buf)

    data = np.concatenate(readBuffers)
    assert len(data) == totalLen
    readBuffers = None

    if not hasDefault:
        # strip out all the float_max values we put in there since there is
        # no default value for unannotated regions, and we just ignore them
        # (ie as original implementation)
        stripData = np.ndarray((totalLen), dtype=floatType)
        basesRead = 0
        for i in xrange(totalLen):
            if data[i] != defaultVal:
                stripData[basesRead] = data[i]
                basesRead += 1
        stripData.resize(basesRead)
        data = stripData

    return data
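
The element-by-element stripping loop above does O(n) work at the Python
level; a vectorized numpy mask (an alternative sketch, not from the repo) is
equivalent as long as the sentinel never occurs in real data:

import numpy as np

floatType = np.float32
defaultVal = np.finfo(floatType).max
data = np.array([1.0, defaultVal, 2.5, defaultVal, 3.0], dtype=floatType)

stripData = data[data != defaultVal]   # boolean mask drops the sentinels
assert stripData.tolist() == [1.0, 2.5, 3.0]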
Example #6
def findTsds(args, bedIntervals):
    """ search through input bed intervals, loading up the FASTA sequence
    for each one """

    # index for quick lookups in bed file (to be used while scanning fasta file)
    seqTable = buildSeqTable(args, bedIntervals)
    outTsds = []
    faFile = open(args.fastaSequence, "r")
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))
    for seqNameFa, sequence in fastaRead(faFile):
        if args.sequences is not None and seqNameFa not in args.sequences and\
          seqNameFa.split()[0] not in args.sequences:
            # skip sequences not requested via the option, when specified
            continue

        # try the name from the FASTA as well as the name with everything
        # after the first whitespace stripped
        if seqNameFa in seqTable:
            seqName = seqNameFa
        else:
            seqName = seqNameFa.split()[0]
        if seqName in seqTable:
            logger.info("Scanning FASTA sequence %s" % seqName)
            bedRange = seqTable[seqName]
            for bedIdx in xrange(bedRange[0], bedRange[1]):
                bedInterval = bedIntervals[bedIdx]
                name = None
                if len(bedInterval) > 3:
                    name = bedInterval[3]
                if nameSet is None or name in nameSet:
                    # we make the sequence lower case below because we don't
                    # care about soft masking
                    outTsds += intervalTsds(args, sequence.lower(),
                                            bedInterval)
        else:
            logger.debug("Skipping FASTA sequence %s because no intervals "
                         "found" % seqName)

    return outTsds
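
A tiny illustration of the header-matching fallback above, with a hypothetical
table and FASTA header (descriptions after the first token are common, so the
lookup tries the full header first, then its first whitespace-delimited token):

seqTable = {"chr1": (0, 2)}            # hypothetical buildSeqTable output
seqNameFa = "chr1 assembled 2013"
seqName = seqNameFa if seqNameFa in seqTable else seqNameFa.split()[0]
assert seqName == "chr1"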
Example #7
def writeTransitions(bedIntervals, nameMap, outNameMap, args):
    tfile = open(args.outTransProbs, "w")
    
    # do the self transitions
    N = len(nameMap)
    selfTran = args.selfTran + np.zeros((N))
    if args.selfTran < 0:
        tot = np.zeros((N))
        num = np.zeros((N))
        for interval in bedIntervals:
            assert nameMap.has(interval[3])
            state = nameMap.getMap(interval[3])
            assert state < N
            num[state] += 1
            tot[state] += interval[2] - interval[1] - 1
        selfTran = tot / (tot + num)

    for state, i in nameMap.catMap.items():
        tfile.write("%s\t%s\t%f\n" % (state, state, selfTran[i]))
        if args.mode == "star":
            outTrans = (1. - selfTran[i]) / float(args.numOut)
            for outState, j in outNameMap.catMap.items():
                tfile.write("%s\t%s\t%f\n" % (state, outState, outTrans))

    # do the outside states
    if args.numOut > 0:
        outselfTran = args.selfTran + np.zeros((args.numOut))
        if args.selfTran < 0:
            # hack for now (should be from data above)
            logger.debug("Hacky maximum used for outside state self transition")
            outselfTran = max(selfTran) + np.zeros((args.numOut))
            
        for state, i in outNameMap.catMap.items():
            tfile.write("%s\t%s\t%f\n" % (state, state, outselfTran[i]))
                
    tfile.close()
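
The empirical self-transition estimate above is just the fraction of
within-interval steps: an interval of length L contributes L - 1 self
transitions (tot) and one exit (num), so selfTran = tot / (tot + num).  A
stand-alone check of that arithmetic with made-up numbers:

import numpy as np

# one state observed in two intervals of lengths 10 and 5:
# (10 - 1) + (5 - 1) = 13 self transitions, 2 exits
tot = np.array([13.])
num = np.array([2.])
selfTran = tot / (tot + num)
assert abs(selfTran[0] - 13. / 15.) < 1e-12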
Example #8
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Train, evalaute, then compare hmm model on input")

    parser.add_argument("trainingTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used "
                        "for training")
    parser.add_argument("outputDir", help="directory to write output")
    parser.add_argument("inBeds", nargs="*", help="list of training beds")
    parser.add_argument("--evalTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used"
                        " for evaluation (only need if different from"
                        " trainingTracksInfo", default=None)
    parser.add_argument("--numProc", help="Max number of processors to use",
                        type=int, default=1)
    parser.add_argument("--allTrackCombinations", help="Rerun with all"
                        " possible combinations of tracks from the input"
                        " tracksInfo file.  Note that this number gets big"
                        " pretty fast.", action = "store_true", default= False)
    parser.add_argument("--emStates", help="By default the supervised mode"
                        " of teHmmTrain is activated.  This option overrides"
                        " that and uses the EM mode and the given number of "
                        "states instead", type=int, default=None)
    parser.add_argument("--cross", help="Do 50/50 cross validation by training"
                        " on first half input and validating on second",
                        action="store_true", default=False)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--mod", help="Path to trained model.  This will "
                        "bypass the training phase that would normally be done"
                        " and just skip to the evaluation.  Note that the user"
                        " must make sure that the trained model has the "
                        "states required to process the input data",
                        default = None)
    parser.add_argument("--iter", help="Number of EM iterations.  Needs to be"
                        " used in conjunction with --emStates to specify EM"
                        " training",
                        type = int, default=None)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed." ,
                        default = None) 
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing a"
                        " multinomial emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default=None)    
    parser.add_argument("--mandTracks", help="Mandatory track names for use "
                        "with --allTrackCombinations in comma-separated list",
                        default=None)
    parser.add_argument("--combinationRange", help="in form MIN,MAX: Only "
                        "explore track combination in given (closed) range. "
                        "A more refined version of --allTrackCombinations.",
                        default=None)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--segment", help="Input bed files are also used to "
                        "segment data.  Ie teHmmTrain is called with --segment"
                        " set to the input file. Not currently working with "
                        " --supervised",
                        action = "store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied) in training", type=int,
                        default=None)
    parser.add_argument("--truth", help="Use specifed file instead of "
                        "input file(s) for truth comparison.  Makes sense"
                        " when --segment is specified and only one input"
                        " bed specified", default = None)
    parser.add_argument("--eval", help="Bed file used for evaluation.  It should"
                        " cover same region in same order as --truth.  Option "
                        "exists mostly to specify segmentation of --truth",
                        default=None)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of training replicates (with "
                        " different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=None, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running training replicates (see --rep) in parallel.",
                        type=int, default=None)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=None)
    parser.add_argument("--fit", help="Run fitStateNames.py to automap names"
                        " before running comparison", action="store_true",
                        default=False)
    parser.add_argument("--fitOpts", help="Options to pass to fitStateNames.py"
                        " (only effective if used with --fit)", default=None)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate.  Comparison statistics"
                        " will be generated for each rep.",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

        
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    logOps = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        logOps += " --logFile %s" % args.logFile

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)
    if args.evalTracksInfo is None:
        args.evalTracksInfo = args.trainingTracksInfo

    trainingTrackList = TrackList(args.trainingTracksInfo)
    evalTrackList = TrackList(args.evalTracksInfo)
    checkTrackListCompatible(trainingTrackList, evalTrackList)

    sizeRange = (len(trainingTrackList), len(trainingTrackList) + 1)
    if args.allTrackCombinations is True:
        sizeRange = (1, len(trainingTrackList) + 1)
    if args.combinationRange is not None:
        toks = args.combinationRange.split(",")
        sizeRange = int(toks[0]),int(toks[1]) + 1
        logger.debug("manual range (%d, %d) " % sizeRange)
    mandTracks = set()
    if args.mandTracks is not None:
        mandTracks = set(args.mandTracks.split(","))
        logger.debug("mandatory set %s" % str(mandTracks))
    trainFlags = ""
    if args.emStates is not None:
        trainFlags += " --numStates %d" % args.emStates
    if args.supervised is True:
        trainFlags += " --supervised"
        if args.segment is True:
            raise RuntimeError("--supervised not currently compatible with "
                               "--segment")
    trainFlags += " --emFac %d" % args.emFac
    if args.forceEmProbs is not None:
        trainFlags += " --forceEmProbs %s" % args.forceEmProbs
    if args.iter is not None:
        assert args.emStates is not None or args.initTransProbs is not None
        trainFlags += " --iter %d" % args.iter
    if args.initTransProbs is not None:
        trainFlags += " --initTransProbs %s" % args.initTransProbs
    if args.initEmProbs is not None:
        trainFlags += " --initEmProbs %s" % args.initEmProbs
    if args.fixEm is True:
        trainFlags += " --fixEm"
    if args.initStartProbs is not None:
        trainFlags += " --initStartProbs %s" % args.initStartProbs
    if args.fixStart is True:
        trainFlags += " --fixStart"
    if args.forceTransProbs is not None:
        trainFlags += " --forceTransProbs %s" % args.forceTransProbs
    if args.forceEmProbs is not None:
        trainFlags += " --forceEmProbs %s" % args.forceEmProbs
    if args.flatEm is True:
        trainFlags += " --flatEm"
    if args.emRandRange is not None:
        trainFlags += " --emRandRange %s" % args.emRandRange
    if args.segLen is not None:
        trainFlags += " --segLen %d" % args.segLen
    if args.seed is not None:
        trainFlags += " --seed %d" % args.seed
    if args.reps is not None:
        trainFlags += " --reps %d" % args.reps
    if args.numThreads is not None:
        trainFlags += " --numThreads %d" % args.numThreads
    if args.emThresh is not None:
        trainFlags += " --emThresh %f" % args.emThresh
    if args.saveAllReps is True:
        trainFlags += " --saveAllReps"
    if args.maxProb is True:
        trainFlags += " --maxProb"
    if args.transMatEpsilons is True:
        trainFlags += " --transMatEpsilons"
    if args.maxProbCut is not None:
        trainFlags += " --maxProbCut %d" % args.maxProbCut

    # write out command line for posterity's sake
    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)
    cmdPath = os.path.join(args.outputDir, "teHmmBenchmark_cmd.txt")
    cmdFile = open(cmdPath, "w")
    cmdFile.write(" ".join(argv) + "\n")
    cmdFile.close()
                           
    # TODO: try to get timing for each command
    commands = []
    rows = dict()
    for pn, pList in enumerate(subsetTrackList(trainingTrackList, sizeRange,
                                               mandTracks)):
        if len(pList) == len(trainingTrackList):
            outDir = args.outputDir
        else:
            outDir = os.path.join(args.outputDir, "perm%d" % pn)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        trainingTrackPath = os.path.join(outDir, "training_tracks.xml")
        evalTrackPath = os.path.join(outDir, "eval_tracks.xml")
        for maskTrack in trainingTrackList.getMaskTracks():
            pList.addTrack(copy.deepcopy(maskTrack))
        pList.saveXML(trainingTrackPath)
        epList = TrackList()
        for track in pList:
            t = copy.deepcopy(evalTrackList.getTrackByName(track.getName()))
            epList.addTrack(t)
        for maskTrack in trainingTrackList.getMaskTracks():
            epList.addTrack(copy.deepcopy(maskTrack))
        epList.saveXML(evalTrackPath)
        
        for inBed in args.inBeds:
            
            base = os.path.basename(inBed)
            truthBed = inBed
            testBed = inBed
            if args.cross is True:
                truthBed = os.path.join(outDir,
                                        os.path.splitext(base)[0] +
                                        "_truth_temp.bed")
                testBed = os.path.join(outDir,
                                       os.path.splitext(base)[0] +
                                       "_test_temp.bed")
                splitBed(inBed, truthBed, testBed)

                                        
            
            # train
            if args.mod is not None:
                modPath = args.mod
                command = "ls %s" % modPath
            else:
                modPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + ".mod")
                command = "teHmmTrain.py %s %s %s %s %s" % (trainingTrackPath,
                                                            truthBed,
                                                            modPath,
                                                            logOps,
                                                            trainFlags)
                if args.segment is True:
                    command += " --segment %s" % truthBed

            # view
            viewPath = os.path.join(outDir,
                                   os.path.splitext(base)[0] + "_view.txt")
            command += " && teHmmView.py %s > %s" % (modPath, viewPath)

            # evaluate
            numReps = 1
            if args.reps is not None and args.saveAllReps is True:
                numReps = args.reps
                assert numReps > 0
            missed = 0
            # little hack to repeat evaluation for each training replicate
            for repNum in xrange(-1, numReps-1):
                if repNum == -1:
                    repSuffix = ""
                else:
                    repSuffix = ".rep%d" % repNum                
                evalBed = os.path.join(outDir,
                                       os.path.splitext(base)[0] + "_eval.bed" +
                                       repSuffix)
                hmmEvalInputBed = testBed
                if args.eval is not None:
                    hmmEvalInputBed = args.eval
                bicPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + "_bic.txt" +
                                       repSuffix)

                command += " && teHmmEval.py %s %s %s --bed %s %s --bic %s" % (
                    evalTrackPath,
                    modPath + repSuffix,
                    hmmEvalInputBed,
                    evalBed,
                    logOps,
                    bicPath)

                if args.segment is True:
                    command += " --segment"

                # fit
                compTruth = testBed
                if args.truth is not None:
                    compTruth = args.truth
                compareInputBed = evalBed
                if args.fit is True:
                    fitBed = os.path.join(outDir,
                                          os.path.splitext(base)[0] + "_eval_fit.bed" +
                                          repSuffix)
                    command += " && fitStateNames.py %s %s %s --tl %s" % (compTruth,
                                                                          evalBed,
                                                                          fitBed,
                                                                          evalTrackPath)
                    if args.fitOpts is not None:
                        command += " " + args.fitOpts
                    compareInputBed = fitBed

                # compare
                compPath = os.path.join(outDir,
                                        os.path.splitext(base)[0] + "_comp.txt" +
                                        repSuffix)
                command += " && compareBedStates.py %s %s --tl %s > %s" % (
                    compTruth,
                    compareInputBed,
                    evalTrackPath,
                    compPath)
            

                # make table row
                if repSuffix == "":
                    rowPath = os.path.join(outDir,
                                           os.path.splitext(base)[0] + "_row.txt")
                    if inBed in rows:
                        rows[inBed].append(rowPath)
                    else:
                        rows[inBed] = [rowPath]
                    command += " && scrapeBenchmarkRow.py %s %s %s %s %s" % (
                        args.trainingTracksInfo,
                        trainingTrackPath,
                        evalBed,
                        compPath,
                        rowPath)

            # remember command
            inCmdPath = os.path.join(outDir,
                                    os.path.splitext(base)[0] + "_cmd.txt")
            inCmdFile = open(inCmdPath, "w")
            inCmdFile.write(command + "\n")
            inCmdFile.close()
            commands.append(command)
            
    runParallelShellCommands(commands, args.numProc)
    writeTables(args.outputDir, rows)
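
runParallelShellCommands and writeTables are teHmm helper functions.  For
orientation only, here is a minimal stand-in for the parallel runner, assuming
nothing beyond the (commands, numProc) signature visible above; the real
implementation may differ:

import subprocess
from multiprocessing.pool import ThreadPool

def runParallelShellCommandsSketch(commands, numProc):
    # run shell command strings concurrently, at most numProc at a time,
    # and fail loudly if any of them fails (assumed behavior)
    def run(cmd):
        return cmd, subprocess.call(cmd, shell=True)
    pool = ThreadPool(numProc)
    try:
        for cmd, status in pool.map(run, commands):
            if status != 0:
                raise RuntimeError("command failed (%d): %s" % (status, cmd))
    finally:
        pool.close()
        pool.join()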
Example #9
def getStateMapFromConfMatrix(reverseMatrix, truthTgt, truthIgnore, predIgnore, thresh,
                              fdr):
    """ Use greedy algorithm to construct state map in order to maximize F1 score of
    each non-ignored state (in order of size in truth).

    The greedy heuristic here (mapping to truth states in order of their genome
    coverage) is worrisome.  Since once a predicted state is mapped to the truth
    state it is left out of consideration for all other truth states.  One hopes
    the F1 metric compensates for this somewhat, and observes that the "truth"
    annotations we currently consider (TE/non-TE) should *still be optimal* despite
    the heuristic. 

    NOTE: Unlike old version above, the input matrix is the reverse confusion matrix,
    ie mapping truth states back to predictions (tho matrix data is symmetrical,
    representation used in this module is not, and more convenient to work in one
    direction or another)

    UPDATE: FDR option allows to skip F1 optimization and just use fdr directly
    as a cutoff for candidates
    """

    # build maps of state names to # bases in respective annotations
    truthStateSizes = defaultdict(int)
    predStateSizes = defaultdict(int)
    for truthState in reverseMatrix.keys():
        for predState, overlap in reverseMatrix[truthState].items():
            truthStateSizes[truthState] += overlap
            predStateSizes[predState] += overlap
            
    # sort truth states decreasing order
    truthStateList = truthStateSizes.items()
    truthStateList.sort(key = lambda x : x[1], reverse = True)
    logger.debug("State ranking in f1Fit:" + str(truthStateList))
    
    # main loop
    stateNameMap = dict()
    for truthState, truthSize in truthStateList:
        if truthState in truthIgnore or \
               (len(truthTgt) > 0 and truthState not in truthTgt):
            continue
        # assemble list of candidate pred states that meet threshold
        predCandidates = []
        # assemble list of candidate pred states that exceed 1-threshold;
        # these will be sure bets that we assume are good
        sureBets = []
        # tack on an extra list for the FDR option that overrides the other
        # two if FDR is activated
        fdrSureBets = []
        for predState, overlap in reverseMatrix[truthState].items():
            if predState not in stateNameMap and\
              predState not in predIgnore:
                predFrac = float(overlap) / float(min(truthSize,
                                                      predStateSizes[predState]))
                if predFrac >= thresh:
                    if predFrac >= 1. - thresh:
                        sureBets.append(predState)
                    else:
                        predCandidates.append(predState)
                if fdr is not None:
                    # above calculation of predFrac is effective heuristic but
                    # runs against definition of fdr
                    predFrac = float(overlap) / float(predStateSizes[predState])
                    if predFrac >= 1. - fdr:
                        fdrSureBets.append(predState)
            else:
                logger.debug("state mapper skipping %s with othresh %f" % (
                    predState, float(overlap) / float(min(truthSize,
                                                          predStateSizes[predState]))))
        if fdr is not None:
            sureBets = fdrSureBets
            predCandidates = []
        logger.debug("candidates for %s: %s" % (truthState, str(predCandidates)))
        logger.debug("sure bets for %s: %s" % (truthState, str(sureBets)))

        # iterate over all combinations of candidate mappings
        def allSubsets(s):
            if len(sureBets) > 0:
                yield []
            for i in xrange(1, len(s) + 1):
                for j in itertools.combinations(s, i):
                    yield j
        bestF1, bestMapSet = -1., []
        for candidateSetIter in allSubsets(predCandidates):
            candidateSet = list(candidateSetIter) + sureBets
            # compute the f1 score of this mapping
            p, r, f1, tp, fp, fn = 0.,0.,0.,0.,0., float(truthStateSizes[truthState])
            bsSortMap = dict()
            for predState in candidateSet:
                overlap = reverseMatrix[truthState][predState]
                tp += overlap
                fp += predStateSizes[predState] - overlap
                bsSortMap[predState] = tp + fp
                fn -= overlap
            if tp > 0.:
                p = tp / (tp + fp)
                r = tp / (tp + fn)
                f1 = (2. * p * r) / (p + r)
            #print f1, p, r, tp, fp, fn, str(candidateSet)
            if f1 > bestF1:
                # sort by total number of bases 
                bestF1, bestMapSet = f1, sorted(candidateSet, reverse=True,
                                                key = lambda x : bsSortMap[x])
                
        # add best candidate set to prediction state name map
        for predState in bestMapSet:
            assert predState not in stateNameMap
            stateNameMap[predState] = [truthState,
                                       reverseMatrix[truthState][predState],
                                       predStateSizes[predState]]
        logger.debug("map %s <---- %s" % (truthState, str(bestMapSet)))
        logger.debug("best F1 = %s" % bestF1)
    
    # predStateName -> (truthStateName, tp, tp+fp)
    return stateNameMap
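
A stand-alone check of the precision/recall/F1 arithmetic used in the loop
above, on a hypothetical candidate set (this mirrors the formulas in the code
rather than calling it):

# truth state covers 100 bases; two candidate predicted states overlap it
truthSize = 100.
overlaps = {"predA": 60., "predB": 25.}    # true-positive contributions
predSizes = {"predA": 70., "predB": 30.}   # total bases per predicted state

tp = sum(overlaps.values())                               # 85
fp = sum(predSizes[s] - overlaps[s] for s in overlaps)    # 10 + 5 = 15
fn = truthSize - tp                                       # 15
p = tp / (tp + fp)
r = tp / (tp + fn)
f1 = (2. * p * r) / (p + r)
assert abs(f1 - 0.85) < 1e-12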
Example #10
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Filter overlapping intervals out")
    parser.add_argument("inputBed", help="Bed file to filter")
    parser.add_argument("--bed12", help="Use bed12 exons instead of start/end"
                        " if present (equivalent to running bed12ToBed6 on"
                        " input first).", action="store_true", default=False)
    parser.add_argument("--rm", help="Make sure intervals that are labeled as TE "
                        "by rm2State.sh script are never cut by ones that are not",
                        default=False, action='store_true')
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    assert os.path.isfile(args.inputBed)
    tempBedToolPath = initBedTool()

    # do the --rm filter by splitting into TE / non-TE,
    # then removing everything in non-TE that overlaps
    # TE, then adding the remainder back to TE.
    inputPath = args.inputBed
    if args.rm is True:
        tempPath = getLocalTempPath("Temp_", ".bed")
        tePath = getLocalTempPath("Temp_te_", ".bed")
        runShellCommand("rm2State.sh %s |grep TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %(
            args.inputBed, tempPath, tePath))
        otherPath = getLocalTempPath("Temp_other_", ".bed")
        runShellCommand("rm2State.sh %s |grep -v TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %(
            args.inputBed, tempPath, otherPath))
        if os.path.getsize(tePath) > 0  and\
           os.path.getsize(otherPath) > 0:
            filterPath = getLocalTempPath("Temp_filter_", ".bed")
            runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
                otherPath, tePath, filterPath))
            inputPath = getLocalTempPath("Temp_input_", ".bed")
            runShellCommand("cat %s %s | sortBed > %s" % (
                tePath, filterPath, inputPath))
            runShellCommand("rm -f %s" % filterPath)
        runShellCommand("rm -f %s %s %s" % (tePath, otherPath, tempPath))

    bedIntervals = BedTool(inputPath).sort()
    if args.bed12 is True:
        bedIntervals = bedIntervals.bed6()
        
    prevInterval = None

    # this code has been way too buggy for something so simple;
    # keep an extra list to check for sure, even though it's a waste
    # of time and space
    sanity = []
    
    for interval in bedIntervals:
        if (prevInterval is not None and
            interval.chrom == prevInterval.chrom and
            interval.start < prevInterval.end):
            logger.debug("Replace %d bases of \n%s with\n%s" % (
                prevInterval.end - interval.start,
                str(interval), str(prevInterval)))
            interval.start = prevInterval.end
            
        if interval.end > interval.start:
            sys.stdout.write("%s" % str(interval))
            sanity.append(interval)
            prevInterval = interval

    for i in xrange(len(sanity) - 1):
        if sanity[i].chrom == sanity[i+1].chrom:
            assert sanity[i+1].start >= sanity[i].end
    cleanBedTool(tempBedToolPath)
    if args.inputBed != inputPath:
        runShellCommand("rm -f %s" % inputPath)
Example #11
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train model on.  If --supervised "
                        "is used, the names in this bed file will be treated "
                        "as the true annotation (otherwise it is only used for "
                        "interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type = int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type = int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--cfg", help="Use Context Free Grammar insead of "
                        "HMM.  Only works with --supervised for now",
                        action = "store_true", default = False)
    parser.add_argument("--saPrior", help="Confidence in self alignment "
                        "track for CFG.  Probability of pair emission "
                        "is multiplied by this number if the bases are aligned"
                        " and its complement if bases are not aligned. Must"
                        " be between [0,1].", default=0.95, type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as pair-emitors"
                        " for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after each training iteration"
                        " (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed)" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after each training iteration "
                        "(unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed.)" ,
                        default = None) 
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing an"
                        " emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as "
                        "single columns for HMM (ie as created with "
                        "segmentTracks.py).  IMPORTANT: this file must cover "
                        "the same regions as the traininBed file. Unless in "
                        "supervised mode, probably best to use same bed file "
                        " as both traingBed and --segment argument.  Otherwise"
                        " use intersectBed to make sure the overlap is exact",
                        default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --rep) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    if args.initTransProbs is not None or args.fixTrans is True or\
      args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--initTransProbs, --fixTrans, --initEmProbs,"
                               " --fixEm are not currently compatible with"
                               " --cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
      and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
      args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and/or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction with"
                           " --initTransProbs")
    if args.emRandRange is not None:
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        except (AssertionError, ValueError):
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here.  now overridden by the options above
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except Exception:
            raise RuntimeError("bed file passed with --segment option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    # warn before normalizing segLen, so we never compare None to an int
    if args.segLen > 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction).  Values > 1 may cause bias.")
    if args.segLen <= 0:
        args.segLen = None

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overridden by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overridden by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model
    # np.random.RandomState (used in trainModel below) only accepts seeds in
    # [0, 2**32 - 1], so draw every replicate seed from that range
    # (sys.maxint can exceed it on 64-bit systems)
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, 4294967294) for x in xrange(1, args.reps)]

    def trainClosure(randomSeed):
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans, truthIntervals=truthIntervals,
                          args=args)
    
    modelList = runParallelShellCommands(argList=seeds,
                                         numProc=args.numThreads,
                                         execFunction=trainClosure,
                                         useThreads=True)

    # select best model
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] > bestModel[1]:
            bestModel = curModel
        if curModel[1] is not None:
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]
        
    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)                
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
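
The replicate-selection loop above simply keeps the model whose final log
likelihood is highest.  Below is a minimal standalone sketch of that idea;
the FakeModel class and the sample log-probabilities are hypothetical
stand-ins, not part of the code above:

LOGZERO = float("-inf")  # stand-in for the module's LOGZERO constant

class FakeModel(object):
    """ hypothetical stand-in exposing getLastLogProb() like MultitrackHmm """
    def __init__(self, logProb):
        self.logProb = logProb
    def getLastLogProb(self):
        return self.logProb

modelList = [FakeModel(-1200.5), FakeModel(-1174.2), FakeModel(-1190.9)]
bestIdx, bestProb = -1, LOGZERO
for i, m in enumerate(modelList):
    if m.getLastLogProb() is not None and m.getLastLogProb() > bestProb:
        bestIdx, bestProb = i, m.getLastLogProb()
assert bestIdx == 1  # the replicate with log likelihood -1174.2 wins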
Example #16
0
def trainModel(randomSeed, trackData, catMap, userTrans, truthIntervals,
               args):
    """ Run the whole training pipeline
    """
    # activate the random seed
    randGen = np.random.RandomState(randomSeed)

    # create the independent emission model
    logger.info("creating emission model")
    numSymbolsPerTrack = trackData.getNumSymbolsPerTrack()
    logger.debug("numSymbolsPerTrack=%s" % numSymbolsPerTrack)
    # only randomize model if using Baum-Welch 
    randomize = args.supervised is False and args.flatEm is False
    emissionModel = IndependentMultinomialAndGaussianEmissionModel(
        args.numStates,
        numSymbolsPerTrack,
        trackData.getTrackList(),
        normalizeFac=args.emFac,
        randomize=randomize,
        effectiveSegmentLength = args.segLen,
        random_state = randGen,
        randRange = args.emRandRange)

    # create the model
    if not args.cfg:
        logger.info("creating hmm model")
        model = MultitrackHmm(emissionModel, n_iter=args.iter,
                              state_name_map=catMap,
                              fixTrans = args.fixTrans,
                              fixEmission = args.fixEm,
                              fixStart = args.fixStart,
                              forceUserEmissions = args.forceEmProbs,
                              forceUserTrans = args.forceTransProbs,
                              random_state = randGen,
                              thresh = args.emThresh,
                              transMatEpsilons = args.transMatEpsilons,
                              maxProb = args.maxProb,
                              maxProbCut = args.maxProbCut)
    else:
        pairEM = PairEmissionModel(emissionModel, [args.saPrior] *
                                   emissionModel.getNumStates())
        assert args.supervised is True
        nestStates = []
        if args.pairStates is not None:
            pairStates = args.pairStates.split(",")
            nestStates = map(lambda x: catMap.getMap(x), pairStates)
        logger.info("Creating cfg model")
        model = MultitrackCfg(emissionModel, pairEM, nestStates,
                              state_name_map=catMap)

    # initialize the user specified transition probabilities now if necessary
    if args.initTransProbs is not None:
        with open(args.initTransProbs) as f:
            model.applyUserTrans(f.readlines())

    # initialize the user specified emission probabilities now if necessary
    if args.initEmProbs is not None:
        with open(args.initEmProbs) as f:
            # can't apply emissions without a track list! 
            model.trackList = trackData.getTrackList()
            model.applyUserEmissions(f.readlines())

    # initialize the user specified start probabilities now if necessary
    if args.initStartProbs is not None:
        with open(args.initStartProbs) as f:
            model.applyUserStarts(f.readlines())

    # make sure initialization didn't screw anything up
    model.validate()

    # do the training
    if args.supervised is False:
        logger.info("training via EM")
        model.train(trackData)
    else:
        logger.info("training from input bed states")
        model.supervisedTrain(trackData, truthIntervals)

    # reset the user specified transition probabilities now if necessary
    if args.forceTransProbs is not None:
        with open(args.forceTransProbs) as f:
            model.applyUserTrans(f.readlines())

    # reset the user specified emission probabilities now if necessary
    if args.forceEmProbs is not None:
        with open(args.forceEmProbs) as f:
            model.applyUserEmissions(f.readlines())

    return model
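
For reference, a hedged sketch of driving trainModel directly for a single
unsupervised replicate; it assumes trackData and args have been prepared as
in the main() code above, and the seed value 42 is arbitrary:

# sketch only: assumes trackData and args were set up as in main() above
singleModel = trainModel(42, trackData=trackData, catMap=None,
                         userTrans=None, truthIntervals=None, args=args)
logger.info("single-replicate log prob: %s" % singleModel.getLastLogProb())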
Example #17
0
def getStateMapFromConfMatrix(reverseMatrix, truthTgt, truthIgnore, predIgnore,
                              thresh, fdr):
    """ Use greedy algorithm to construct state map in order to maximize F1 score of
    each non-ignored state (in order of size in truth).

    The greedy heuristic here (mapping to truth states in order of their genome
    coverage) is worrisome.  Since once a predicted state is mapped to the truth
    state it is left out of consideration for all other truth states.  One hopes
    the F1 metric compensates for this somewhat, and observes that the "truth"
    annotations we currently consider (TE/non-TE) should *still be optimal* despite
    the heuristic. 

    NOTE: Unlike old version above, the input matrix is the reverse confusion matrix,
    ie mapping truth states back to predictions (tho matrix data is symmetrical,
    representation used in this module is not, and more convenient to work in one
    direction or another)

    UPDATE: FDR option allows to skip F1 optimization and just use fdr directly
    as a cutoff for candidates
    """

    # build maps of state names to # bases in respective annotations
    truthStateSizes = defaultdict(int)
    predStateSizes = defaultdict(int)
    for truthState in reverseMatrix.keys():
        for predState, overlap in reverseMatrix[truthState].items():
            truthStateSizes[truthState] += overlap
            predStateSizes[predState] += overlap

    # sort truth states in decreasing order
    truthStateList = truthStateSizes.items()
    truthStateList.sort(key=lambda x: x[1], reverse=True)
    logger.debug("State ranking in f1Fit:" + str(truthStateList))

    # main loop
    stateNameMap = dict()
    for truthState, truthSize in truthStateList:
        if truthState in truthIgnore or \
               (len(truthTgt) > 0 and truthState not in truthTgt):
            continue
        # assemble list of candidate pred states that meet threshold
        predCandidates = []
        # assemble list of candidate pred states that exceed 1-threshold;
        # these will be sure bets that we assume are good
        sureBets = []
        # tack on an extra list for the FDR option, which overrides the
        # other two lists if FDR is activated
        fdrSureBets = []
        for predState, overlap in reverseMatrix[truthState].items():
            if predState not in stateNameMap and\
              predState not in predIgnore:
                predFrac = float(overlap) / float(
                    min(truthSize, predStateSizes[predState]))
                if predFrac >= thresh:
                    if predFrac >= 1. - thresh:
                        sureBets.append(predState)
                    else:
                        predCandidates.append(predState)
                if fdr is not None:
                    # the above calculation of predFrac is an effective
                    # heuristic but runs against the definition of FDR
                    predFrac = float(overlap) / float(
                        predStateSizes[predState])
                    if predFrac >= 1. - fdr:
                        fdrSureBets.append(predState)
            else:
                logger.debug(
                    "state mapper skipping %s with othresh %f" %
                    (predState, float(overlap) /
                     float(min(truthSize, predStateSizes[predState]))))
        if fdr is not None:
            sureBets = fdrSureBets
            predCandidates = []
        logger.debug("candidates for %s: %s" %
                     (truthState, str(predCandidates)))
        logger.debug("sure bets for %s: %s" % (truthState, str(sureBets)))

        # iterate over all combinations of candidate mappings
        def allSubsets(s):
            # when there are sure bets, also consider mapping them alone
            # (i.e. the empty candidate subset)
            if len(sureBets) > 0:
                yield []
            for i in xrange(1, len(s) + 1):
                for j in itertools.combinations(s, i):
                    yield j

        bestF1, bestMapSet = -1., []
        for candidateSetIter in allSubsets(predCandidates):
            candidateSet = list(candidateSetIter) + sureBets
            # compute the f1 score of this mapping
            p, r, f1, tp, fp, fn = 0., 0., 0., 0., 0., float(
                truthStateSizes[truthState])
            bsSortMap = dict()
            for predState in candidateSet:
                overlap = reverseMatrix[truthState][predState]
                tp += overlap
                fp += predStateSizes[predState] - overlap
                bsSortMap[predState] = tp + fp
                fn -= overlap
            if tp > 0.:
                p = tp / (tp + fp)
                r = tp / (tp + fn)
                f1 = (2. * p * r) / (p + r)
            #print f1, p, r, tp, fp, fn, str(candidateSet)
            if f1 > bestF1:
                # sort by total number of bases
                bestF1, bestMapSet = f1, sorted(candidateSet,
                                                reverse=True,
                                                key=lambda x: bsSortMap[x])

        # add best candidate set to prediction state name map
        for predState in bestMapSet:
            assert predState not in stateNameMap
            stateNameMap[predState] = [
                truthState, reverseMatrix[truthState][predState],
                predStateSizes[predState]
            ]
        logger.debug("map %s <---- %s" % (truthState, str(bestMapSet)))
        logger.debug("best F1 = %s" % bestF1)

    # predStateName -> (truthStateName, tp, tp+fp)
    return stateNameMap
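
To make the F1 bookkeeping above concrete, here is a tiny hand-worked
sketch; the overlap counts below are invented for illustration and do not
come from real data:

# truth state "TE" covers 100 bases; two predicted states overlap it:
# reverseMatrix = {"TE": {"p1": 60, "p2": 25}}, predStateSizes p1=70, p2=90

# mapping {p1} alone: tp=60, fp=70-60=10, fn=100-60=40
p1Prec = 60. / (60 + 10)                          # precision ~ 0.857
p1Rec = 60. / (60 + 40)                           # recall    = 0.600
f1Solo = 2. * p1Prec * p1Rec / (p1Prec + p1Rec)   # F1        ~ 0.706

# mapping {p1, p2}: tp=60+25=85, fp=10+(90-25)=75, fn=100-85=15
bothPrec = 85. / (85 + 75)                        # precision ~ 0.531
bothRec = 85. / (85 + 15)                         # recall    = 0.850
f1Both = 2. * bothPrec * bothRec / (bothPrec + bothRec)  # F1 ~ 0.654

assert f1Solo > f1Both  # the greedy search keeps {p1} and drops p2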