def buildSeqTable(args, bedIntervals):
    """Build a table of sequence indexes from the input BED file for quick
    lookups while sorting.  The table maps each sequence name to its range of
    indexes in bedIntervals.  This only works if bedIntervals are sorted
    (and should raise an assertion error if that's not the case)."""
    logger.info("building sequence name index of %d bed intervals" %
                len(bedIntervals))
    bedSeqTable = dict()
    prevName = None
    prevIdx = 0
    for i, interval in enumerate(bedIntervals):
        seqName = interval[0]
        if seqName != prevName:
            # seeing a name again after a different one means input is unsorted
            assert seqName not in bedSeqTable
            if prevName is not None:
                bedSeqTable[prevName] = (prevIdx, i)
            prevIdx = i
        prevName = seqName

    seqName = bedIntervals[-1][0]
    assert seqName not in bedSeqTable
    bedSeqTable[seqName] = (prevIdx, len(bedIntervals))
    logger.debug("index has %d unique sequences" % len(bedSeqTable))
    return bedSeqTable
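# Usage sketch (hypothetical data, not from the source): the table maps each
# sequence name to a half-open [start, end) index range into the sorted
# interval list, so per-sequence slices can be taken without rescanning.
#
#   bedIntervals = [("chr1", 0, 100), ("chr1", 100, 200), ("chr2", 0, 50)]
#   seqTable = buildSeqTable(args, bedIntervals)
#   # seqTable == {"chr1": (0, 2), "chr2": (2, 3)}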
def filterStateMap(stateMap, args):
    """ Make sure ignored states are ignored.  Apply unique id suffixes if
    necessary.  Make sure that the quality threshold is high enough, or else
    ignore too.  The map is filtered in place."""
    mapCounts = dict()
    for name, mapVal in stateMap.items():
        assert len(mapVal) == 3
        mapName, mapCount, mapTotal = mapVal
        qual = float(mapCount) / float(mapTotal)
        if name in args.ignore or qual < args.qualThresh:
            # set map such that it won't be changed
            logger.debug("Ignoring state %s with quality %f" % (name, qual))
            stateMap[name] = (name, 1, 1)
        elif args.unique:
            if mapName not in mapCounts:
                mapCounts[mapName] = 1
            else:
                mapCounts[mapName] += 1

    # don't want to rename if only 1 instance
    for name, count in mapCounts.items():
        if count == 1:
            mapCounts[name] = 0

    # 2nd pass to assign the unique ids (range from 1 to count)
    for name, mapVal in stateMap.items():
        mapName, mapCount, mapTotal = mapVal
        if mapName in mapCounts:
            count = mapCounts[mapName]
            if count > 0:
                newName = mapName + ".%d" % count
                logger.debug("Mapping %s to %s" % (mapName, newName))
                stateMap[name] = (newName, mapCount, mapTotal)
                mapCounts[mapName] -= 1
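# Sketch of the in-place filtering with hypothetical entries.  stateMap maps a
# predicted name to (truthName, matchingBases, totalBases); assume args.ignore
# is empty, args.qualThresh = 0.5, and args.unique = True:
#
#   before: {"s0": ("TE", 90, 100), "s1": ("TE", 40, 80), "s2": ("TE", 1, 100)}
#   after:  {"s0": ("TE.2", 90, 100), "s1": ("TE.1", 40, 80),
#            "s2": ("s2", 1, 1)}
#
# s2 falls below the quality threshold, so it is pinned to itself; s0 and s1
# both map to "TE", so --unique appends numeric suffixes (which entry gets
# which suffix follows dict iteration order).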
def applyNamesToModel(stateMap, modelPath):
    """ Change a given HMM model to use the new state names."""
    # load model created with teHmmTrain.py
    logger.debug("loading model %s" % modelPath)
    model = loadModel(modelPath)
    modelMap = model.getStateNameMap()
    raise RuntimeError("Not Implemented")
def computeScale(data, numBins, noLog):
    """ Very simple heuristic to compute a reasonable scaling."""
    minVal, maxVal = np.amin(data), np.amax(data)
    valRange = maxVal - minVal
    logger.debug("Min=%f Max=%f" % (minVal, maxVal))
    # NOTE: the -2.0 when computing the linear binSize and logBase are very
    # conservative measures to ensure that we don't under-bin on each side
    # due to rounding.  Can result in bins that are too large when binSize,
    # say, divides evenly into the range.  Should be optimized down the road
    # when there is more time.

    # try linear scale
    binSize = float(valRange) / float(numBins - 2.0)
    minBin = np.floor(minVal / binSize) * binSize
    linearBins = [minBin] * numBins
    for i in xrange(1, numBins):
        linearBins[i] = linearBins[i-1] + binSize
    logger.debug("Linear bins %s" % linearBins)
    linearVar = histVariance(data, sorted(linearBins))
    linearScale = 1.0 / binSize
    logger.debug("Linear scale=%f has variance=%f" % (linearScale, linearVar))

    # try log scale
    logVar = sys.maxint
    # the shift parameter is a constant that gets added before log scaling
    # to make sure that we always deal with positive numbers
    shift = 0.
    if minVal <= 0.:
        shift = 1.0 - minVal
        data += shift
        minVal += shift
        maxVal += shift
    ratio = float(maxVal) / float(minVal)
    logBase = np.power(ratio, 1. / float(numBins - 2.0))
    minBin = np.power(logBase, np.floor(np.log(minVal) / np.log(logBase)))
    logBins = [minBin] * numBins
    for i in xrange(1, numBins):
        logBins[i] = logBins[i-1] * logBase
    logger.debug("Log bins %s" % logBins)
    logVar = histVariance(data, sorted(logBins), fromLog = True)
    logger.debug("Log base=%f has variance=%f" % (logBase, logVar))

    ret = "scale", linearScale, 0.
    if logVar < linearVar and noLog is False:
        ret = "logScale", logBase, shift
    return ret
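# Sketch of how the return value is meant to be consumed (the exact attribute
# names used downstream are assumptions here, not from this excerpt):
# ("scale", s, 0.) suggests a value v is discretized roughly as floor(v * s),
# while ("logScale", b, shift) suggests floor(log(v + shift) / log(b)), with
# shift guaranteeing a positive argument to the log.
#
#   scaleType, scaleVal, shift = computeScale(data, numBins=10, noLog=False)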
def readTrackIntoFloatArray(track, allIntervals):
    """ Use the track API to directly read an entire data file into memory as
    an array of floats.  If the track has an associated defaultVal, it will
    be used to cover all gaps in the coverage.  If not, only annotated values
    will be kept."""
    defaultVal = track.getDefaultVal()
    hasDefault = defaultVal is not None
    floatType = np.float32
    if not hasDefault:
        defaultVal = np.finfo(floatType).max
    else:
        # sanity check: we assume that no one ever actually uses this value
        defaultVal = floatType(defaultVal)
        assert defaultVal != np.finfo(floatType).max
        assert not np.isinf(defaultVal)
    readBuffers = []
    totalLen = 0
    for interval in allIntervals:
        logger.debug("Allocating track array of size %d" % (
            interval[2] - interval[1]))
        buf = defaultVal + np.zeros((interval[2] - interval[1]),
                                    dtype=floatType)
        buf = readTrackData(track.getPath(), interval[0], interval[1],
                            interval[2], outputBuf=buf,
                            valCol=track.getValCol(),
                            useDelta=track.getDelta())
        readBuffers.append(buf)
        totalLen += len(buf)

    data = np.concatenate(readBuffers)
    assert len(data) == totalLen
    readBuffers = None

    if not hasDefault:
        # strip out all the float_max values we put in there since there is
        # no default value for unannotated regions, and we just ignore them
        # (ie as in the original implementation)
        stripData = np.ndarray((totalLen), dtype=floatType)
        basesRead = 0
        for i in xrange(totalLen):
            if data[i] != defaultVal:
                stripData[basesRead] = data[i]
                basesRead += 1
        stripData.resize(basesRead)
        data = stripData

    return data
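# The element-wise strip loop above has a vectorized equivalent using plain
# NumPy boolean masking (safe here because defaultVal was chosen so that it
# can never collide with a legitimate annotated value in this branch):
#
#   data = data[data != defaultVal]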
def findTsds(args, bedIntervals):
    """ Search through the input BED intervals, loading up the FASTA sequence
    for each one."""
    # index for quick lookups in the BED file (used while scanning the FASTA)
    seqTable = buildSeqTable(args, bedIntervals)
    outTsds = []
    faFile = open(args.fastaSequence, "r")
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))
    for seqNameFa, sequence in fastaRead(faFile):
        if args.sequences is not None and seqNameFa not in args.sequences and\
           seqNameFa.split()[0] not in args.sequences:
            # skip unflagged sequences when option specified
            continue
        # try the name from the FASTA as well as the name with everything
        # after the first whitespace stripped
        if seqNameFa in seqTable:
            seqName = seqNameFa
        else:
            seqName = seqNameFa.split()[0]
        if seqName in seqTable:
            logger.info("Scanning FASTA sequence %s" % seqName)
            bedRange = seqTable[seqName]
            for bedIdx in xrange(bedRange[0], bedRange[1]):
                bedInterval = bedIntervals[bedIdx]
                name = None
                if len(bedInterval) > 3:
                    name = bedInterval[3]
                if nameSet is None or name in nameSet:
                    # we make the sequence lower case below because we don't
                    # care about soft masking
                    outTsds += intervalTsds(args, sequence.lower(),
                                            bedInterval)
        else:
            logger.debug("Skipping FASTA sequence %s because no intervals "
                         "found" % seqName)
    return outTsds
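# Name-matching sketch (hypothetical header): a FASTA header of
# ">chr1 Assembly v2" is tried first as the full string, then falls back to
# the text before the first whitespace, so either naming convention in the
# BED file will hit the seqTable index:
#
#   seqTable = {"chr1": (0, 10)}
#   "chr1 Assembly v2" in seqTable        # False
#   "chr1 Assembly v2".split()[0]         # "chr1" -> found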
def writeTransitions(bedIntervals, nameMap, outNameMap, args):
    tfile = open(args.outTransProbs, "w")

    # do the self transitions
    N = len(nameMap)
    selfTran = args.selfTran + np.zeros((N))
    if args.selfTran < 0:
        tot = np.zeros((N))
        num = np.zeros((N))
        for interval in bedIntervals:
            assert nameMap.has(interval[3])
            state = nameMap.getMap(interval[3])
            assert state < N
            num[state] += 1
            tot[state] += interval[2] - interval[1] - 1
        selfTran = tot / (tot + num)

    for state, i in nameMap.catMap.items():
        tfile.write("%s\t%s\t%f\n" % (state, state, selfTran[i]))
        if args.mode == "star":
            outTrans = (1. - selfTran[i]) / float(args.numOut)
            for outState, j in outNameMap.catMap.items():
                tfile.write("%s\t%s\t%f\n" % (state, outState, outTrans))

    # do the outside states
    if args.numOut > 0:
        outselfTran = args.selfTran + np.zeros((args.numOut))
        if args.selfTran < 0:
            # hack for now (should be from data above)
            logger.debug("Hacky maximum used for outside state self"
                         " transition")
            outselfTran = max(selfTran) + np.zeros((args.numOut))
        for state, i in outNameMap.catMap.items():
            tfile.write("%s\t%s\t%f\n" % (state, state, outselfTran[i]))
    tfile.close()
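# The emitted file is tab-separated "FromState ToState Probability" text, one
# transition per line.  With a hypothetical state "TE", selfTran = 0.9, star
# mode, and two hypothetical outside states, the output would look like:
#
#   TE      TE      0.900000
#   TE      Out0    0.050000
#   TE      Out1    0.050000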
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Train, evaluate, then compare hmm model on input")

    parser.add_argument("trainingTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used "
                        "for training")
    parser.add_argument("outputDir", help="directory to write output")
    parser.add_argument("inBeds", nargs="*", help="list of training beds")
    parser.add_argument("--evalTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used"
                        " for evaluation (only needed if different from"
                        " trainingTracksInfo)", default=None)
    parser.add_argument("--numProc", help="Max number of processors to use",
                        type=int, default=1)
    parser.add_argument("--allTrackCombinations", help="Rerun with all"
                        " possible combinations of tracks from the input"
                        " tracksInfo file.  Note that this number gets big"
                        " pretty fast.", action="store_true", default=False)
    parser.add_argument("--emStates", help="By default the supervised mode"
                        " of teHmmTrain is activated.  This option overrides"
                        " that and uses the EM mode and the given number of"
                        " states instead", type=int, default=None)
    parser.add_argument("--cross", help="Do 50/50 cross validation by"
                        " training on the first half of the input and"
                        " validating on the second", action="store_true",
                        default=False)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are"
                        " many tracks, the transition probabilities can get"
                        " totally lost.  0 = no normalization.  1 = divide by"
                        " number of tracks.  k = divide by number of tracks"
                        " / k", type=int, default=0)
    parser.add_argument("--mod", help="Path to trained model.  This will"
                        " bypass the training phase that would normally be"
                        " done and just skip to the evaluation.  Note that"
                        " the user must make sure that the trained model has"
                        " the states required to process the input data",
                        default=None)
    parser.add_argument("--iter", help="Number of EM iterations.  Needs to"
                        " be used in conjunction with --emStates to specify"
                        " EM training", type=int, default=None)
    parser.add_argument("--initTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability.  This file (all other transitions get"
                        " probability 0) is used to specify the initial"
                        " transition model.  The names and number of states"
                        " will be initialized according to this file"
                        " (overriding --numStates)", default=None)
    parser.add_argument("--fixTrans", help="Do not learn transition"
                        " parameters (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each"
                        " line has four entries: State Track Symbol"
                        " Probability.  This file (all other emissions get"
                        " probability 0) is used to specify the initial"
                        " emission model.  All states specified in this file"
                        " must appear in the file specified with"
                        " --initTransProbs (but not vice versa).",
                        default=None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where"
                        " each line has two entries: State Probability.  This"
                        " file (all other start probs get probability 0) is"
                        " used to specify the initial start distribution."
                        "  All states specified in this file must appear in"
                        " the file specified with --initTransProbs (but not"
                        " vice versa).", default=None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability.  These transition probabilities will"
                        " override any learned probabilities after training"
                        " (unspecified entries will not be set to 0 in this"
                        " case: the learned values will be kept, but"
                        " normalized as needed)", default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each"
                        " line has four entries: State Track Symbol"
                        " Probability.  These emission probabilities will"
                        " override any learned probabilities after training"
                        " (unspecified entries will not be set to 0 in this"
                        " case: the learned values will be kept, but"
                        " normalized as needed)", default=None)
    parser.add_argument("--flatEm", help="Use a flat emission distribution"
                        " as a baseline.  If not specified, the initial"
                        " emission distribution will be randomized by"
                        " default.  Emission probabilities specified with"
                        " --initEmProbs or --forceEmProbs will never be"
                        " affected by randomization.  The randomization is"
                        " important for Baum-Welch training, since if two"
                        " states don't have at least one different emission"
                        " or transition probability to begin with, they will"
                        " never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initializing a"
                        " multinomial emission distribution, constrain the"
                        " values to the given range (pair of comma-separated"
                        " numbers).  Overridden by --initEmProbs and"
                        " --forceEmProbs when applicable.  Completely"
                        " overridden by --flatEm (which is equivalent to"
                        " --emRandRange .5,.5).  Actual values used will"
                        " always be normalized.", default=None)
    parser.add_argument("--mandTracks", help="Mandatory track names for use"
                        " with --allTrackCombinations in comma-separated"
                        " list", default=None)
    parser.add_argument("--combinationRange", help="in form MIN,MAX: Only"
                        " explore track combinations in the given (closed)"
                        " range.  A more refined version of"
                        " --allTrackCombinations.", default=None)
    parser.add_argument("--supervised", help="Use name (4th) column of"
                        " <trainingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        "  NOTE: The number of states will be determined from"
                        " the bed.", action="store_true", default=False)
    parser.add_argument("--segment", help="Input bed files are also used to"
                        " segment data.  Ie teHmmTrain is called with"
                        " --segment set to the input file.  Not currently"
                        " working with --supervised", action="store_true",
                        default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied) in training", type=int,
                        default=None)
    parser.add_argument("--truth", help="Use specified file instead of the"
                        " input file(s) for truth comparison.  Makes sense"
                        " when --segment is specified and only one input bed"
                        " is specified", default=None)
    parser.add_argument("--eval", help="Bed file used for evaluation.  It"
                        " should cover the same region in the same order as"
                        " --truth.  Option exists mostly to specify the"
                        " segmentation of --truth", default=None)
    parser.add_argument("--seed", help="Seed for the random number generator,"
                        " which will be used to initialize emissions (if"
                        " --flatEm and --supervised are not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of training replicates (with"
                        " different random initializations) to run.  The"
                        " replicate with the highest likelihood will be"
                        " chosen for the output", default=None, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running training replicates (see --reps) in"
                        " parallel.", type=int, default=None)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in Baum-Welch training.  Ie the delta log"
                        " likelihood must be bigger than this number (which"
                        " should be positive) for convergence", type=float,
                        default=None)
    parser.add_argument("--fit", help="Run fitStateNames.py to automap names"
                        " before running comparison", action="store_true",
                        default=False)
    parser.add_argument("--fitOpts", help="Options to pass to"
                        " fitStateNames.py (only effective if used with"
                        " --fit)", default=None)
    parser.add_argument("--saveAllReps", help="Save all replicate (--reps)"
                        " models to disk, instead of just the best one."
                        "  Format is <outputModel>.repN.  There will be"
                        " --reps - 1 such models saved, as the best output"
                        " counts as a replicate.  Comparison statistics will"
                        " be generated for each rep.", action="store_true",
                        default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest"
                        " probability rather than returning the parameters"
                        " after the final iteration.", action="store_true",
                        default=False)
    parser.add_argument("--maxProbCut", help="Use with the --maxProb option"
                        " to stop training if a given number of iterations go"
                        " by without hitting a new maxProb", default=None,
                        type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent"
                        " converging on 0 due to rounding error only for"
                        " fully unsupervised training.  Use this option to"
                        " force this behaviour for supervised and"
                        " semi-supervised modes", action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    logOps = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        logOps += " --logFile %s" % args.logFile

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)

    if args.evalTracksInfo is None:
        args.evalTracksInfo = args.trainingTracksInfo

    trainingTrackList = TrackList(args.trainingTracksInfo)
    evalTrackList = TrackList(args.evalTracksInfo)
    checkTrackListCompatible(trainingTrackList, evalTrackList)

    sizeRange = (len(trainingTrackList), len(trainingTrackList) + 1)
    if args.allTrackCombinations is True:
        sizeRange = (1, len(trainingTrackList) + 1)
    if args.combinationRange is not None:
        toks = args.combinationRange.split(",")
        sizeRange = int(toks[0]), int(toks[1]) + 1
        logger.debug("manual range (%d, %d)" % sizeRange)
    mandTracks = set()
    if args.mandTracks is not None:
        mandTracks = set(args.mandTracks.split(","))
        logger.debug("mandatory set %s" % str(mandTracks))

    trainFlags = ""
    if args.emStates is not None:
        trainFlags += " --numStates %d" % args.emStates
    if args.supervised is True:
        trainFlags += " --supervised"
        if args.segment is True:
            raise RuntimeError("--supervised not currently compatible with "
                               "--segment")
    trainFlags += " --emFac %d" % args.emFac
    if args.iter is not None:
        assert args.emStates is not None or args.initTransProbs is not None
        trainFlags += " --iter %d" % args.iter
    if args.initTransProbs is not None:
        trainFlags += " --initTransProbs %s" % args.initTransProbs
    if args.initEmProbs is not None:
        trainFlags += " --initEmProbs %s" % args.initEmProbs
    if args.fixEm is True:
        trainFlags += " --fixEm"
    if args.initStartProbs is not None:
        trainFlags += " --initStartProbs %s" % args.initStartProbs
    if args.fixStart is True:
        trainFlags += " --fixStart"
    if args.forceTransProbs is not None:
        trainFlags += " --forceTransProbs %s" % args.forceTransProbs
    if args.forceEmProbs is not None:
        trainFlags += " --forceEmProbs %s" % args.forceEmProbs
    if args.flatEm is True:
        trainFlags += " --flatEm"
    if args.emRandRange is not None:
        trainFlags += " --emRandRange %s" % args.emRandRange
    if args.segLen is not None:
        trainFlags += " --segLen %d" % args.segLen
    if args.seed is not None:
        trainFlags += " --seed %d" % args.seed
    if args.reps is not None:
        trainFlags += " --reps %d" % args.reps
    if args.numThreads is not None:
        trainFlags += " --numThreads %d" % args.numThreads
    if args.emThresh is not None:
        trainFlags += " --emThresh %f" % args.emThresh
    if args.saveAllReps is True:
        trainFlags += " --saveAllReps"
    if args.maxProb is True:
        trainFlags += " --maxProb"
    if args.transMatEpsilons is True:
        trainFlags += " --transMatEpsilons"
    if args.maxProbCut is not None:
        trainFlags += " --maxProbCut %d" % args.maxProbCut

    # write out the command line for posterity's sake
    cmdPath = os.path.join(args.outputDir, "teHmmBenchmark_cmd.txt")
    cmdFile = open(cmdPath, "w")
    cmdFile.write(" ".join(argv) + "\n")
    cmdFile.close()

    # todo: try to get timing for each command
    commands = []
    rows = dict()

    for pn, pList in enumerate(subsetTrackList(trainingTrackList, sizeRange,
                                               mandTracks)):
        if len(pList) == len(trainingTrackList):
            outDir = args.outputDir
        else:
            outDir = os.path.join(args.outputDir, "perm%d" % pn)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        trainingTrackPath = os.path.join(outDir, "training_tracks.xml")
        evalTrackPath = os.path.join(outDir, "eval_tracks.xml")
        for maskTrack in trainingTrackList.getMaskTracks():
            pList.addTrack(copy.deepcopy(maskTrack))
        pList.saveXML(trainingTrackPath)
        epList = TrackList()
        for track in pList:
            t = copy.deepcopy(evalTrackList.getTrackByName(track.getName()))
            epList.addTrack(t)
        for maskTrack in trainingTrackList.getMaskTracks():
            epList.addTrack(copy.deepcopy(maskTrack))
        epList.saveXML(evalTrackPath)

        for inBed in args.inBeds:
            base = os.path.basename(inBed)
            truthBed = inBed
            testBed = inBed
            if args.cross is True:
                truthBed = os.path.join(outDir, os.path.splitext(base)[0] +
                                        "_truth_temp.bed")
                testBed = os.path.join(outDir, os.path.splitext(base)[0] +
                                       "_test_temp.bed")
                splitBed(inBed, truthBed, testBed)

            # train
            if args.mod is not None:
                modPath = args.mod
                command = "ls %s" % modPath
            else:
                modPath = os.path.join(outDir,
                                       os.path.splitext(base)[0] + ".mod")
                command = "teHmmTrain.py %s %s %s %s %s" % (
                    trainingTrackPath, truthBed, modPath, logOps, trainFlags)
                if args.segment is True:
                    command += " --segment %s" % truthBed

            # view
            viewPath = os.path.join(outDir,
                                    os.path.splitext(base)[0] + "_view.txt")
            command += " && teHmmView.py %s > %s" % (modPath, viewPath)

            # evaluate
            numReps = 1
            if args.reps is not None and args.saveAllReps is True:
                numReps = args.reps
                assert numReps > 0
            # little hack to repeat evaluation for each training replicate
            for repNum in xrange(-1, numReps - 1):
                if repNum == -1:
                    repSuffix = ""
                else:
                    repSuffix = ".rep%d" % repNum
                evalBed = os.path.join(outDir, os.path.splitext(base)[0] +
                                       "_eval.bed" + repSuffix)
                hmmEvalInputBed = testBed
                if args.eval is not None:
                    hmmEvalInputBed = args.eval
                bicPath = os.path.join(outDir, os.path.splitext(base)[0] +
                                       "_bic.txt" + repSuffix)

                command += " && teHmmEval.py %s %s %s --bed %s %s --bic %s" % (
                    evalTrackPath, modPath + repSuffix, hmmEvalInputBed,
                    evalBed, logOps, bicPath)
                if args.segment is True:
                    command += " --segment"

                # fit
                compTruth = testBed
                if args.truth is not None:
                    compTruth = args.truth
                compareInputBed = evalBed
                if args.fit is True:
                    fitBed = os.path.join(outDir, os.path.splitext(base)[0] +
                                          "_eval_fit.bed" + repSuffix)
                    command += " && fitStateNames.py %s %s %s --tl %s" % (
                        compTruth, evalBed, fitBed, evalTrackPath)
                    if args.fitOpts is not None:
                        command += " " + args.fitOpts
                    compareInputBed = fitBed

                # compare
                compPath = os.path.join(outDir, os.path.splitext(base)[0] +
                                        "_comp.txt" + repSuffix)
                command += " && compareBedStates.py %s %s --tl %s > %s" % (
                    compTruth, compareInputBed, evalTrackPath, compPath)

                # make table row
                if repSuffix == "":
                    rowPath = os.path.join(outDir,
                                           os.path.splitext(base)[0] +
                                           "_row.txt")
                    if inBed in rows:
                        rows[inBed].append(rowPath)
                    else:
                        rows[inBed] = [rowPath]
                    command += " && scrapeBenchmarkRow.py %s %s %s %s %s" % (
                        args.trainingTracksInfo, trainingTrackPath, evalBed,
                        compPath, rowPath)

            # remember the command
            inCmdPath = os.path.join(outDir,
                                     os.path.splitext(base)[0] + "_cmd.txt")
            inCmdFile = open(inCmdPath, "w")
            inCmdFile.write(command + "\n")
            inCmdFile.close()
            commands.append(command)

    runParallelShellCommands(commands, args.numProc)
    writeTables(args.outputDir, rows)
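# Hypothetical invocation (all paths are placeholders): supervised training
# and evaluation on a single bed, with automatic state-name fitting before
# the comparison step:
#
#   teHmmBenchmark.py tracks.xml benchOut/ training.bed --supervised --fit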
def getStateMapFromConfMatrix(reverseMatrix, truthTgt, truthIgnore,
                              predIgnore, thresh, fdr):
    """ Use a greedy algorithm to construct a state map that maximizes the F1
    score of each non-ignored state (in order of size in the truth).

    The greedy heuristic here (mapping to truth states in order of their
    genome coverage) is worrisome: once a predicted state is mapped to a
    truth state, it is left out of consideration for all other truth states.
    One hopes the F1 metric compensates for this somewhat, and observes that
    the "truth" annotations we currently consider (TE/non-TE) should *still
    be optimal* despite the heuristic.

    NOTE: Unlike the old version above, the input matrix is the reverse
    confusion matrix, ie mapping truth states back to predictions (though the
    matrix data is symmetrical, the representation used in this module is
    not, and it is more convenient to work in one direction or the other).

    UPDATE: the FDR option allows skipping the F1 optimization and just using
    fdr directly as a cutoff for candidates.
    """
    # build maps of state names to # bases in respective annotations
    truthStateSizes = defaultdict(int)
    predStateSizes = defaultdict(int)
    for truthState in reverseMatrix.keys():
        for predState, overlap in reverseMatrix[truthState].items():
            truthStateSizes[truthState] += overlap
            predStateSizes[predState] += overlap

    # sort truth states in decreasing order of size
    truthStateList = truthStateSizes.items()
    truthStateList.sort(key=lambda x: x[1], reverse=True)
    logger.debug("State ranking in f1Fit:" + str(truthStateList))

    # main loop
    stateNameMap = dict()
    for truthState, truthSize in truthStateList:
        if truthState in truthIgnore or \
           (len(truthTgt) > 0 and truthState not in truthTgt):
            continue
        # assemble list of candidate pred states that meet the threshold
        predCandidates = []
        # assemble list of candidate pred states that exceed 1 - threshold.
        # these will be sure bets that we assume are good
        sureBets = []
        # tack on an extra list for the FDR option, which overrides the
        # other two if FDR is activated
        fdrSureBets = []
        for predState, overlap in reverseMatrix[truthState].items():
            if predState not in stateNameMap and\
               predState not in predIgnore:
                predFrac = float(overlap) / float(
                    min(truthSize, predStateSizes[predState]))
                if predFrac >= thresh:
                    if predFrac >= 1. - thresh:
                        sureBets.append(predState)
                    else:
                        predCandidates.append(predState)
                if fdr is not None:
                    # the above calculation of predFrac is an effective
                    # heuristic but runs against the definition of fdr
                    predFrac = float(overlap) / float(
                        predStateSizes[predState])
                    if predFrac >= 1. - fdr:
                        fdrSureBets.append(predState)
            else:
                logger.debug("state mapper skipping %s with othresh %f" % (
                    predState,
                    float(overlap) / float(min(truthSize,
                                               predStateSizes[predState]))))
        if fdr is not None:
            sureBets = fdrSureBets
            predCandidates = []
        logger.debug("candidates for %s: %s" % (truthState,
                                                str(predCandidates)))
        logger.debug("sure bets for %s: %s" % (truthState, str(sureBets)))

        # iterate over all combinations of candidate mappings
        def allSubsets(s):
            if len(sureBets) > 0:
                yield []
            for i in xrange(1, len(s) + 1):
                for j in itertools.combinations(s, i):
                    yield j

        bestF1, bestMapSet = -1., []
        for candidateSetIter in allSubsets(predCandidates):
            candidateSet = list(candidateSetIter) + sureBets
            # compute the f1 score of this mapping
            p, r, f1, tp, fp = 0., 0., 0., 0., 0.
            fn = float(truthStateSizes[truthState])
            bsSortMap = dict()
            for predState in candidateSet:
                overlap = reverseMatrix[truthState][predState]
                tp += overlap
                fp += predStateSizes[predState] - overlap
                bsSortMap[predState] = tp + fp
                fn -= overlap
            if tp > 0.:
                p = tp / (tp + fp)
                r = tp / (tp + fn)
                f1 = (2. * p * r) / (p + r)
            if f1 > bestF1:
                # sort by total number of bases
                bestF1, bestMapSet = f1, sorted(candidateSet, reverse=True,
                                                key=lambda x: bsSortMap[x])

        # add the best candidate set to the prediction state name map
        for predState in bestMapSet:
            assert predState not in stateNameMap
            stateNameMap[predState] = [truthState,
                                       reverseMatrix[truthState][predState],
                                       predStateSizes[predState]]
        logger.debug("map %s <---- %s" % (truthState, str(bestMapSet)))
        logger.debug("best F1 = %s" % bestF1)

    # predStateName -> (truthStateName, tp, tp + fp)
    return stateNameMap
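# Worked example of the greedy F1 mapping (hypothetical counts, thresh = 0.1,
# fdr = None).  Truth states TE and nonTE each cover 100 bases:
#
#   reverseMatrix = {"TE":    {"s0": 80, "s1": 15, "s2": 5},
#                    "nonTE": {"s1": 5,  "s2": 95}}
#
# For TE: s0 has predFrac 80/min(100, 80) = 1.0 >= 0.9, a sure bet; s1 has
# 15/min(100, 20) = 0.75, a candidate; s2 has 5/100 = 0.05 < thresh, skipped.
# F1 with {s0} alone is 0.889 (tp=80, fp=0, fn=20); adding s1 gives 0.95
# (tp=95, fp=5, fn=5), so both map to TE.  nonTE then claims s2 as a sure bet
# (95/100 = 0.95).  Result: {"s0": TE, "s1": TE, "s2": nonTE}.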
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Filter overlapping intervals out")
    parser.add_argument("inputBed", help="Bed file to filter")
    parser.add_argument("--bed12", help="Use bed12 exons instead of start/end"
                        " if present (equivalent to running bed12ToBed6 on"
                        " input first).", action="store_true", default=False)
    parser.add_argument("--rm", help="Make sure intervals that are labeled as"
                        " TE by the rm2State.sh script are never cut by ones"
                        " that are not", default=False, action="store_true")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    assert os.path.isfile(args.inputBed)
    tempBedToolPath = initBedTool()

    # do the --rm filter by splitting into TE / non-TE, then removing
    # everything in non-TE that overlaps TE, then adding the remainder back
    # to TE.
    inputPath = args.inputBed
    if args.rm is True:
        tempPath = getLocalTempPath("Temp_", ".bed")
        tePath = getLocalTempPath("Temp_te_", ".bed")
        runShellCommand("rm2State.sh %s | grep TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.inputBed, tempPath, tePath))
        otherPath = getLocalTempPath("Temp_other_", ".bed")
        runShellCommand("rm2State.sh %s | grep -v TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.inputBed, tempPath, otherPath))
        if os.path.getsize(tePath) > 0 and\
           os.path.getsize(otherPath) > 0:
            filterPath = getLocalTempPath("Temp_filter_", ".bed")
            runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
                otherPath, tePath, filterPath))
            inputPath = getLocalTempPath("Temp_input_", ".bed")
            runShellCommand("cat %s %s | sortBed > %s" % (
                tePath, filterPath, inputPath))
            runShellCommand("rm -f %s" % filterPath)
        runShellCommand("rm -f %s %s %s" % (tePath, otherPath, tempPath))

    bedIntervals = BedTool(inputPath).sort()
    if args.bed12 is True:
        bedIntervals = bedIntervals.bed6()

    prevInterval = None
    # this code has been way too buggy for something so simple; keep an extra
    # list to check for sure, even though it's a waste of time and space
    sanity = []
    for interval in bedIntervals:
        if (prevInterval is not None and
            interval.chrom == prevInterval.chrom and
            interval.start < prevInterval.end):
            logger.debug("Replace %d bases of \n%s with\n%s" % (
                prevInterval.end - interval.start,
                str(interval), str(prevInterval)))
            interval.start = prevInterval.end

        if interval.end > interval.start:
            sys.stdout.write("%s" % str(interval))
            sanity.append(interval)
            prevInterval = interval

    for i in xrange(len(sanity) - 1):
        if sanity[i].chrom == sanity[i+1].chrom:
            assert sanity[i+1].start >= sanity[i].end

    cleanBedTool(tempBedToolPath)
    if args.inputBed != inputPath:
        runShellCommand("rm -f %s" % inputPath)
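# Effect of the trimming loop on overlapping input (hypothetical BED lines):
#
#   chr1  0    100  A   ->  chr1  0    100  A
#   chr1  50   150  B   ->  chr1  100  150  B   (start bumped to previous end)
#   chr1  60   90   C   ->  (dropped: trimming leaves an empty interval)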
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train the model on.  If"
                        " --supervised is used, the names in this bed file"
                        " will be treated as the true annotation (otherwise"
                        " it is only used for interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type=int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type=int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of"
                        " <trainingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        "  NOTE: The number of states will be determined from"
                        " the bed.", action="store_true", default=False)
    parser.add_argument("--cfg", help="Use Context Free Grammar instead of"
                        " HMM.  Only works with --supervised for now",
                        action="store_true", default=False)
    parser.add_argument("--saPrior", help="Confidence in the self-alignment"
                        " track for the CFG.  The probability of a pair"
                        " emission is multiplied by this number if the bases"
                        " are aligned, and by its complement if they are not."
                        "  Must be in [0, 1].", default=0.95, type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as"
                        " pair-emitters for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are"
                        " many tracks, the transition probabilities can get"
                        " totally lost.  0 = no normalization.  1 = divide by"
                        " number of tracks.  k = divide by number of tracks"
                        " / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability.  This file (all other transitions get"
                        " probability 0) is used to specify the initial"
                        " transition model.  The names and number of states"
                        " will be initialized according to this file"
                        " (overriding --numStates)", default=None)
    parser.add_argument("--fixTrans", help="Do not learn transition"
                        " parameters (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each"
                        " line has four entries: State Track Symbol"
                        " Probability.  This file (all other emissions get"
                        " probability 0) is used to specify the initial"
                        " emission model.  All states specified in this file"
                        " must appear in the file specified with"
                        " --initTransProbs (but not vice versa).",
                        default=None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where"
                        " each line has two entries: State Probability.  This"
                        " file (all other start probs get probability 0) is"
                        " used to specify the initial start distribution."
                        "  All states specified in this file must appear in"
                        " the file specified with --initTransProbs (but not"
                        " vice versa).", default=None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability.  These transition probabilities will"
                        " override any learned probabilities after each"
                        " training iteration (unspecified entries will not be"
                        " set to 0 in this case: the learned values will be"
                        " kept, but normalized as needed)", default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each"
                        " line has four entries: State Track Symbol"
                        " Probability.  These emission probabilities will"
                        " override any learned probabilities after each"
                        " training iteration (unspecified entries will not be"
                        " set to 0 in this case: the learned values will be"
                        " kept, but normalized as needed)", default=None)
    parser.add_argument("--flatEm", help="Use a flat emission distribution"
                        " as a baseline.  If not specified, the initial"
                        " emission distribution will be randomized by"
                        " default.  Emission probabilities specified with"
                        " --initEmProbs or --forceEmProbs will never be"
                        " affected by randomization.  The randomization is"
                        " important for Baum-Welch training, since if two"
                        " states don't have at least one different emission"
                        " or transition probability to begin with, they will"
                        " never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initializing an"
                        " emission distribution, constrain the values to the"
                        " given range (pair of comma-separated numbers)."
                        "  Overridden by --initEmProbs and --forceEmProbs"
                        " when applicable.  Completely overridden by --flatEm"
                        " (which is equivalent to --emRandRange .5,.5)."
                        "  Actual values used will always be normalized.",
                        default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as"
                        " single columns for the HMM (ie as created with"
                        " segmentTracks.py).  IMPORTANT: this file must cover"
                        " the same regions as the trainingBed file.  Unless"
                        " in supervised mode, it is probably best to use the"
                        " same bed file as both trainingBed and the --segment"
                        " argument.  Otherwise use intersectBed to make sure"
                        " the overlap is exact", default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for the random number generator,"
                        " which will be used to initialize emissions (if"
                        " --flatEm and --supervised are not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                        " random initializations) to run.  The replicate"
                        " with the highest likelihood will be chosen for the"
                        " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --reps) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in Baum-Welch training.  Ie the delta log"
                        " likelihood must be bigger than this number (which"
                        " should be positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicate (--reps)"
                        " models to disk, instead of just the best one."
                        "  Format is <outputModel>.repN.  There will be"
                        " --reps - 1 such models saved, as the best output"
                        " counts as a replicate", action="store_true",
                        default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest"
                        " probability rather than returning the parameters"
                        " after the final iteration.", action="store_true",
                        default=False)
    parser.add_argument("--maxProbCut", help="Use with the --maxProb option"
                        " to stop training if a given number of iterations go"
                        " by without hitting a new maxProb", default=None,
                        type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent"
                        " converging on 0 due to rounding error only for"
                        " fully unsupervised training.  Use this option to"
                        " force this behaviour for supervised and"
                        " semi-supervised modes", action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()

    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    if args.initTransProbs is not None or args.fixTrans is True or\
       args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--transProbs, --fixTrans, --emProbs, --fixEm "
                               "are not currently compatible with --cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with"
                           " --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
       and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
       args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and/or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction"
                           " with --initTransProbs")
    if args.emRandRange is not None:
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        except:
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here, now overridden by the above option
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except:
            raise RuntimeError("bed file passed with --segments option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    if args.segLen <= 0:
        args.segLen = None
    if args.segLen > 0 and args.segLen != 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction).  Values > 1 may cause bias.")

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overridden by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overridden by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)]

    def trainClosure(randomSeed):
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans, truthIntervals=truthIntervals,
                          args=args)

    modelList = runParallelShellCommands(argList=seeds,
                                         numProc=args.numThreads,
                                         execFunction=trainClosure,
                                         useThreads=True)

    # select the best model
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] > bestModel[1]:
            bestModel = curModel
        if curModel[1] is not None:
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]

    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
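# Hypothetical invocations (paths are placeholders):
#
#   # supervised training from the labeled 4th column of the bed
#   teHmmTrain.py tracks.xml training.bed out.mod --supervised
#
#   # unsupervised EM with 5 states and 3 random restarts
#   teHmmTrain.py tracks.xml regions.bed out.mod --numStates 5 --reps 3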
def trainModel(randomSeed, trackData, catMap, userTrans, truthIntervals,
               args):
    """ Run the whole training pipeline."""
    # activate the random seed
    randGen = np.random.RandomState(randomSeed)

    # create the independent emission model
    logger.info("creating emission model")
    numSymbolsPerTrack = trackData.getNumSymbolsPerTrack()
    logger.debug("numSymbolsPerTrack=%s" % numSymbolsPerTrack)
    # only randomize the model if using Baum-Welch
    randomize = args.supervised is False and args.flatEm is False
    emissionModel = IndependentMultinomialAndGaussianEmissionModel(
        args.numStates,
        numSymbolsPerTrack,
        trackData.getTrackList(),
        normalizeFac=args.emFac,
        randomize=randomize,
        effectiveSegmentLength=args.segLen,
        random_state=randGen,
        randRange=args.emRandRange)

    # create the model
    if not args.cfg:
        logger.info("creating hmm model")
        model = MultitrackHmm(emissionModel, n_iter=args.iter,
                              state_name_map=catMap,
                              fixTrans=args.fixTrans,
                              fixEmission=args.fixEm,
                              fixStart=args.fixStart,
                              forceUserEmissions=args.forceEmProbs,
                              forceUserTrans=args.forceTransProbs,
                              random_state=randGen,
                              thresh=args.emThresh,
                              transMatEpsilons=args.transMatEpsilons,
                              maxProb=args.maxProb,
                              maxProbCut=args.maxProbCut)
    else:
        pairEM = PairEmissionModel(emissionModel,
                                   [args.saPrior] *
                                   emissionModel.getNumStates())
        assert args.supervised is True
        nestStates = []
        if args.pairStates is not None:
            pairStates = args.pairStates.split(",")
            nestStates = map(lambda x: catMap.getMap(x), pairStates)
        logger.info("Creating cfg model")
        model = MultitrackCfg(emissionModel, pairEM, nestStates,
                              state_name_map=catMap)

    # initialize the user-specified transition probabilities now if necessary
    if args.initTransProbs is not None:
        with open(args.initTransProbs) as f:
            model.applyUserTrans(f.readlines())

    # initialize the user-specified emission probabilities now if necessary
    if args.initEmProbs is not None:
        with open(args.initEmProbs) as f:
            # can't apply emissions without a track list!
            model.trackList = trackData.getTrackList()
            model.applyUserEmissions(f.readlines())

    # initialize the user-specified start probabilities now if necessary
    if args.initStartProbs is not None:
        with open(args.initStartProbs) as f:
            model.applyUserStarts(f.readlines())

    # make sure initialization didn't screw up
    model.validate()

    # do the training
    if args.supervised is False:
        logger.info("training via EM")
        model.train(trackData)
    else:
        logger.info("training from input bed states")
        model.supervisedTrain(trackData, truthIntervals)

    # reset the user-specified transition probabilities now if necessary
    if args.forceTransProbs is not None:
        with open(args.forceTransProbs) as f:
            model.applyUserTrans(f.readlines())

    # reset the user-specified emission probabilities now if necessary
    if args.forceEmProbs is not None:
        with open(args.forceEmProbs) as f:
            model.applyUserEmissions(f.readlines())

    return model
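# Sketch of the parameter file formats consumed by applyUserTrans and
# applyUserEmissions above (whitespace-separated, per the option help; the
# state and track names are hypothetical):
#
#   # --initTransProbs / --forceTransProbs: FromState ToState Probability
#   TE     TE      0.9
#   TE     other   0.1
#
#   # --initEmProbs / --forceEmProbs: State Track Symbol Probability
#   TE     copyTrack   1   0.8
#   other  copyTrack   1   0.2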