def checkExactOverlap(bed1, bed2):
    """ Make sure two bed files cover the same region exactly: a requirement
    for all code based on the comparisons in this module."""
    errorMessage = ("Bed files %s and %s cannot be compared. xxx. "
                    " Input files must be both sorted, cover the exact same"
                    " region, and contain no self-overlaps.") % (bed1, bed2)

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        raise RuntimeError(
            errorMessage.replace("xxx", "one or both inputs empty"))

    # test self-overlap and sorting of input1
    intervals1 = readBedIntervals(bed1, sort=False)
    for i in xrange(1, len(intervals1)):
        if intersectSize(intervals1[i - 1], intervals1[i]) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Overlapping intervals %s and %s found in input1" % (
                    intervals1[i - 1], intervals1[i])))
        if intervals1[i - 1] > intervals1[i]:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Out of order intervals %s and %s found in input1" % (
                    intervals1[i - 1], intervals1[i])))

    # test self-overlap and sorting of input2
    intervals2 = readBedIntervals(bed2, sort=False)
    for i in xrange(1, len(intervals2)):
        if intersectSize(intervals2[i - 1], intervals2[i]) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Overlapping intervals %s and %s found in input2" % (
                    intervals2[i - 1], intervals2[i])))
        if intervals2[i - 1] > intervals2[i]:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Out of order intervals %s and %s found in input2" % (
                    intervals2[i - 1], intervals2[i])))

    # test that neither file covers bases outside the other
    tempFile = getLocalTempPath("Temp_test", ".bed")
    runShellCommand("subtractBed -a %s -b %s > %s" % (bed1, bed2, tempFile))
    if os.path.getsize(tempFile) != 0:
        runShellCommand("rm -f %s" % tempFile)
        raise RuntimeError(errorMessage.replace(
            "xxx", "Input1 covers regions outside input2"))
    runShellCommand("subtractBed -a %s -b %s > %s" % (bed2, bed1, tempFile))
    if os.path.getsize(tempFile) != 0:
        runShellCommand("rm -f %s" % tempFile)
        raise RuntimeError(errorMessage.replace(
            "xxx", "Input2 covers regions outside input1"))
    runShellCommand("rm -f %s" % tempFile)
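# A minimal usage sketch (hypothetical file names; assumes this module's
# imports, e.g. sys, are in scope): the check either passes silently or
# raises RuntimeError with the failing condition spliced into the message.
def _exampleCheckUsage():
    try:
        checkExactOverlap("truth.bed", "pred.bed")
    except RuntimeError as e:
        sys.stderr.write("inputs not comparable: %s\n" % str(e))
        sys.exit(1)
    # safe to run the comparisons in this module past this point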
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Subselect some output of chunkBedRegions.py")
    parser.add_argument("inBed",
                        help="Input bed file (generated with"
                        " chunkBedRegions.py)")
    parser.add_argument("sampleSize", help="Desired sample size (in bases).",
                        type=int)
    args = parser.parse_args()
    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)

    bedIntervals = readBedIntervals(args.inBed)
    outIntervals = []
    curSize = 0
    # dumb n^2 alg should be enough for our current purposes
    while curSize < args.sampleSize and len(bedIntervals) > 0:
        idx = random.randint(0, len(bedIntervals) - 1)
        interval = bedIntervals[idx]
        sampleLen = interval[2] - interval[1]
        if sampleLen + curSize > args.sampleSize:
            # clamp the interval so the total lands exactly on sampleSize
            sampleLen = args.sampleSize - curSize
            interval = (interval[0], interval[1], interval[1] + sampleLen)
        outIntervals.append(interval)
        curSize += sampleLen
        del bedIntervals[idx]

    for interval in sorted(outIntervals):
        sys.stdout.write("%s\t%d\t%d\n" % interval)

    cleanBedTool(tempBedToolPath)
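# To make the clamping step above concrete (values chosen arbitrarily):
# with sampleSize=100, curSize=90 and a drawn interval ("chr1", 1000, 1030)
# (sampleLen=30), 30 + 90 > 100 so the interval is trimmed to
# sampleLen = 100 - 90 = 10, i.e. ("chr1", 1000, 1010), bringing curSize
# to exactly 100 and ending the loop.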
def baserize(inBed, outBed):
    outFile = open(outBed, "w")
    for interval in readBedIntervals(inBed):
        for i in xrange(interval[2] - interval[1]):
            outFile.write("%s\t%d\t%d\n" % (interval[0], interval[1] + i,
                                            interval[1] + i + 1))
    outFile.close()
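# Example: baserize expands a 3-base interval into three 1-base lines
# (hypothetical paths):
#   in.bed:   chr1  10  13
#   out.bed:  chr1  10  11
#             chr1  11  12
#             chr1  12  13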
def fillGaps(inBed):
    """ Make two interval sets from a given bed file:
    filledIntervals: set of intervals with intervals added between
    consecutive intervals on the same seq (ala addBedGaps.py)
    mergedIntervals: set of intervals spanning each continuous region from
    above (ala getMergedBedIntervals)
    probably reimplementing stuff but oh well """
    filledIntervals = []
    mergedIntervals = []
    intervals = readBedIntervals(inBed, ncol=4, sort=True)
    if len(intervals) == 0:
        return [], []
    prevInterval = None
    for interval in intervals:
        if prevInterval is not None and prevInterval[0] == interval[0] and\
           prevInterval[2] != interval[1]:
            # update fill for discontinuity (filTok is the module's
            # fill-state label)
            assert prevInterval[2] < interval[1]
            filledIntervals.append((interval[0], prevInterval[2],
                                    interval[1], filTok))
        if prevInterval is None or prevInterval[0] != interval[0]:
            # update merge for new sequence
            mergedIntervals.append(interval)
        else:
            # extend merge for same sequence
            mergedIntervals[-1] = (mergedIntervals[-1][0],
                                   mergedIntervals[-1][1],
                                   interval[2],
                                   mergedIntervals[-1][3])
        # update fill with current interval
        filledIntervals.append(interval)
        prevInterval = interval
    return filledIntervals, mergedIntervals
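# Worked example for fillGaps on a hypothetical 4-column input:
#   ("chr1", 0, 10, "A"), ("chr1", 20, 30, "B")
# filledIntervals gets a filTok-labeled interval over the gap:
#   ("chr1", 0, 10, "A"), ("chr1", 10, 20, filTok), ("chr1", 20, 30, "B")
# mergedIntervals spans the whole now-continuous region (keeping the
# first interval's label):
#   ("chr1", 0, 30, "A")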
def combineTrack(track, outPath, tempRegionPath, iter, args):
    """ merge track with outPath """
    # make sure track is of form chrom start end state
    tempColPath = getLocalTempPath("Temp", "_col.bed")
    tempColFile = open(tempColPath, "w")
    vc = track.getValCol() + 1
    if track.getDist() == "binary":
        assert track.getName() != args.outside
        vc = 3
    bedIntervals = readBedIntervals(track.getPath(), vc, sort=True)
    for bedInterval in bedIntervals:
        outStr = "\t".join([str(x) for x in bedInterval])
        if track.getDist() == "binary":
            # state name = track name for binary track
            outStr += "\t%s" % track.getName()
        outStr += "\n"
        tempColFile.write(outStr)
    tempColFile.close()

    # intersect the target region
    tempIntersectPath = getLocalTempPath("Temp", "_int.bed")
    runShellCommand("intersectBed -a %s -b %s > %s" % (
        tempColPath, tempRegionPath, tempIntersectPath))

    # add the outside states
    tempGappedPath = getLocalTempPath("Temp", "_gap.bed")
    runShellCommand("addBedGaps.py --state %s %s %s %s" % (
        args.outside, tempRegionPath, tempIntersectPath, tempGappedPath))

    # fit the names with previous iterations' result
    tempFitPath = getLocalTempPath("Temp", "_fit.bed")
    if iter == 0:
        runShellCommand("cp %s %s" % (tempGappedPath, tempFitPath))
    else:
        runShellCommand("fitStateNames.py %s %s %s --qualThresh %f"
                        " --ignoreTgt %s" % (
                            outPath, tempGappedPath, tempFitPath,
                            args.fitThresh, args.outside))

    # now merge into outPath
    runShellCommand("cat %s >> %s" % (tempFitPath, outPath))
    runShellCommand("removeBedOverlaps.py %s > %s" % (outPath, tempColPath))
    runShellCommand("mv %s %s" % (tempColPath, outPath))

    # clean up temp files (note tempColPath was already consumed by the mv)
    runShellCommand("rm -f %s" % tempColPath)
    runShellCommand("rm -f %s" % tempIntersectPath)
    runShellCommand("rm -f %s" % tempGappedPath)
    runShellCommand("rm -f %s" % tempFitPath)
def makeNearnessBED(intervals, args):
    """ for each interval, measure distance to nearest interval in
    args.nearness and write as score """
    compIntervals = readBedIntervals(args.nearness, ncol=4, sort=True)
    if len(intervals) == 0:
        return ""
    # only correct if both interval sets are sorted and non-overlapping
    outBedString = ""
    i = 0
    for interval in intervals:
        distI = sys.maxint
        for j in xrange(i, len(compIntervals)):
            distJ = distance(interval, compIntervals[j])
            if distJ <= distI:
                i, distI = j, distJ
            else:
                break
        outBedString += "%s\t%d\t%d\t%s\t%d\n" % (
            interval[0], interval[1], interval[2], interval[3], distI)
    return outBedString
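# The running index i lets each scan resume where the previous interval's
# best match was found, so a pass over sorted, non-overlapping inputs is
# roughly linear overall.  A toy trace (assuming distance() returns 0 for
# overlapping intervals and the gap size otherwise):
#   intervals:     [("chr1", 100, 200, "x")]
#   compIntervals: [("chr1", 0, 50, "a"), ("chr1", 210, 220, "b")]
# the gap to "a" is 50, to "b" is 10; the scan stops once distances start
# growing, and the emitted line is:  chr1  100  200  x  10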
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compare two bed files where Model states are represented"
        " in a column. Used to determine sensitivity and specificity. NOTE"
        " that both bed files must be sorted and cover the exact same regions"
        " of the same genome.")
    parser.add_argument("bed1", help="Bed file (TRUTH)")
    parser.add_argument("bed2", help="Bed file covering same regions in same"
                        " order as bed1")
    parser.add_argument("--col", help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4, type=int)
    parser.add_argument("--thresh", help="Threshold to consider interval from"
                        " bed1 covered by bed2.", type=float, default=0.8)
    parser.add_argument("--plot", help="Path of file to write Precision/Recall"
                        " graphs to in PDF format", default=None)
    parser.add_argument("--ignore", help="Comma-separated list of stateNames"
                        " to ignore", default=None)
    parser.add_argument("--strictPrec", help="By default, precision is"
                        " computed in a manner strictly symmetric to recall."
                        " So calling compareBedStates.py A.bed B.bed would"
                        " give the exact same output as compareBedStates.py"
                        " B.bed A.bed except precision and recall values"
                        " would be swapped. With this option, a predicted"
                        " element only counts toward precision if it overlaps"
                        " with 80pct of the true element, as opposed to only"
                        " needing 80pct of itself overlapping with the true"
                        " element.", action="store_true", default=False)
    parser.add_argument("--noBase", help="Skip base-level stats (and only show"
                        " interval stats). Runs faster",
                        action="store_true", default=False)
    parser.add_argument("--noFrag", help="Do not allow fragmented matches in"
                        " interval predictions. ie if a single truth interval"
                        " is covered by a series of predicted intervals, only"
                        " the best match will be counted if this flag is used",
                        action="store_true", default=False)
    parser.add_argument("--tl", help="Path to tracks XML file. Used to cut"
                        " out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run"
                        " subtractBed every time...)", default=None)
    parser.add_argument("--delMask", help="Entirely remove intervals from"
                        " mask tracks that are > given length. Probably"
                        " only want to set to non-zero value K if using"
                        " with a prediction that was processed with"
                        " interpolateMaskedRegions.py --max K",
                        type=int, default=0)
    parser.add_argument("--window", help="A comma-delimited 5-tuple of"
                        " windowSize,stateName,compType,score,outBed."
                        " Where windowSize is the sliding window size"
                        " (overlap .5), stateName is target stateName,"
                        " compType is in {base,interval,weighted}, score is"
                        " in {f1,precision,recall} and outBed is the path of"
                        " a bedFile to write positional accuracy to. For"
                        " example, --window 1000000,TE,base,f1,acc.bed will"
                        " write base-level f1 for 1MB sliding windows to"
                        " acc.bed. These can be viewed on the browser by"
                        " first converting to BigWig.", default=None)
    args = parser.parse_args()
    tempBedToolPath = initBedTool()

    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    assert args.col == 4 or args.col == 5

    print "Commandline %s" % " ".join(sys.argv)
    origArgs = copy.deepcopy(args)

    tempFiles = []
    if args.tl is not None:
        cutBed1 = cutOutMaskIntervals(args.bed1, args.delMask,
                                      sys.maxint, args.tl)
        cutBed2 = cutOutMaskIntervals(args.bed2, args.delMask,
                                      sys.maxint, args.tl)
        if cutBed1 is not None:
            assert cutBed2 is not None
            tempFiles += [cutBed1, cutBed2]
            args.bed1 = cutBed1
            args.bed2 = cutBed2

    checkExactOverlap(args.bed1, args.bed2)

    if args.window is not None:
        runPositionalComparison(argv, origArgs)

    intervals1 = readBedIntervals(args.bed1, ncol=args.col)
    intervals2 = readBedIntervals(args.bed2, ncol=args.col)

    if args.noBase is False:
        stats = compareBaseLevel(intervals1, intervals2, args.col - 1)[0]

        totalRight, totalWrong, accMap = summarizeBaseComparision(
            stats, args.ignore)
        print "Base counts [False Negatives, False Positives, True Positives]:"
        print stats
        totalBoth = totalRight + totalWrong
        accuracy = float(totalRight) / float(totalBoth)
        print "Accuracy: %d / %d = %f" % (totalRight, totalBoth, accuracy)
        print "State-by-state (Precision, Recall):"
        print "Base-by-base Accuracy"
        print accMap

    trueStats = compareIntervalsOneSided(intervals1, intervals2, args.col - 1,
                                         args.thresh, False,
                                         not args.noFrag)[0]
    predStats = compareIntervalsOneSided(intervals2, intervals1, args.col - 1,
                                         args.thresh, args.strictPrec,
                                         not args.noFrag)[0]
    intAccMap = summarizeIntervalComparison(trueStats, predStats, False,
                                            args.ignore)
    intAccMapWeighted = summarizeIntervalComparison(trueStats, predStats, True,
                                                    args.ignore)
    print "\nInterval Accuracy"
    print intAccMap
    print ""

    print "\nWeighted Interval Accuracy"
    print intAccMapWeighted
    print ""

    # print some row data to be picked up by scrapeBenchmarkRow.py
    if args.noBase is False:
        header, row = summaryRow(accuracy, stats, accMap)
        print " ".join(header)
        print " ".join(row)

    # make graph
    if args.plot is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write plots. Maybe matplotlib is"
                               " not installed?")
        writeAccPlots(accuracy, accMap, intAccMap, intAccMapWeighted,
                      args.thresh, args.plot)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction"
        " (from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")
    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks)")
    parser.add_argument("allBed", help="Target scope. Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file. State labels"
                        " should probably be mapped (ie with"
                        " fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED. Will be equivalent to"
                        " the input bed except all gaps corresponding to"
                        " masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same"
                        " value if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked"
                        " region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that are flanked on"
                        " both sides by the same state, and this state is in"
                        " this comma-separated list. --default used for other"
                        " gaps. If no targets specified then all states"
                        " checked.", default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that are"
                        " flanked on at least one side by a state in this"
                        " comma-separated list. --default used for other"
                        " gaps", default=None)
    parser.add_argument("--onlyDefault", help="Add the default state"
                        " (--default) to all masked gaps no matter what,"
                        " ie ignoring all other logic",
                        action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from"
                        " the input. By default, the input is expected to"
                        " come from the HMM with mask intervals already"
                        " absent, and will crash with an assertion error if"
                        " an overlap is detected.",
                        action="store_true", default=False)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand(
            "cat %s | setBedCol.py 3 mask |"
            " awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (
                maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)
    resolvedMasks = 0

    if len(inputIntervals) == 0:
        logger.warning("No mask tracks located in %s" % args.tracksXML)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and \
               leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and \
               rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (
            maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    tempOutMaskFile.close()
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (
        args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1,
                                               tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s | sortBed > %s" % (args.allBed,
                                                       tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (
        tempMergePath2, tempScopePath, args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask,
                                           tempMergePath1, tempMergePath2,
                                           tempScopePath]))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Given two bed files: a prediction and a true (or target)"
        " annotation, re-label the prediction's state names so that they"
        " best match the true annotation. Uses same logic as"
        " compareBedStates.py for determining accuracy")
    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label.")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col", help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4, type=int)
    parser.add_argument("--intThresh", help="Threshold to consider interval"
                        " from tgtBed covered by predBed. If not specified,"
                        " then base level statistics will be used. Value in"
                        " range (0,1]", type=float, default=None)
    parser.add_argument("--noFrag", help="Don't allow fragmented interval"
                        " matches (see help for --frag in"
                        " compareBedStates.py). Only relevant with"
                        " --intThresh", action="store_true", default=False)
    parser.add_argument("--qualThresh", help="Minimum match ratio between"
                        " truth and prediction to relabel prediction."
                        " Example: if predicted state X overlaps target state"
                        " LTR 25 pct of the time, then qualThresh must be at"
                        " least 0.25 to label X as LTR in the output. Value"
                        " in range (0, 1]", type=float, default=0.1)
    parser.add_argument("--ignore", help="Comma-separated list of stateNames"
                        " to ignore (in prediction)", default=None)
    parser.add_argument("--ignoreTgt", help="Comma-separated list of"
                        " stateNames to ignore (in target)", default=None)
    parser.add_argument("--tgt", help="Comma-separated list of stateNames to"
                        " consider (in target). All others will be ignored",
                        default=None)
    parser.add_argument("--unique", help="If more than one predicted state"
                        " maps to the same target state, add a unique id"
                        " (numeric suffix) to the output so that they can be"
                        " distinguished", action="store_true", default=False)
    parser.add_argument("--model", help="Apply state name mapping to the"
                        " model in the specified path (it is strongly advised"
                        " to make a backup of the model first)", default=None)
    parser.add_argument("--noMerge", help="By default, adjacent intervals"
                        " with the same state name in the output are"
                        " automatically merged into a single interval. This"
                        " flag disables this.", action="store_true",
                        default=False)
    parser.add_argument("--hm", help="Write confusion matrix as heatmap in"
                        " PDF format to specified file", default=None)
    parser.add_argument("--old", help="Use old name mapping logic which just"
                        " takes biggest overlap in forward confusion matrix."
                        " Faster than new default logic which does the greedy"
                        " f1 optimization", action="store_true",
                        default=False)
    parser.add_argument("--fdr", help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float, default=None)
    parser.add_argument("--tl", help="Path to tracks XML file. Used to cut"
                        " out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run"
                        " subtractBed every time...)", default=None)
    parser.add_argument("--colOrder", help="List of states used to force"
                        " column ordering in heatmap (otherwise alphabetical)."
                        " These states will correspond to the tgtBed when"
                        " --old used and predBed otherwise.", default=None)
    parser.add_argument("--hmCovRow", help="Path to write 1-row heatmap of"
                        " state coverage (fraction of bases). Only works with"
                        " --hm", default=None)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")
    assert args.col == 4 or args.col == 5

    tempFiles = []
    if args.tl is not None:
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint,
                                         args.tl)
        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol=args.col)
    intervals2 = readBedIntervals(args.predBed, ncol=args.col)
    cfName = "reverse"

    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1,
                                           args.col - 1, args.intThresh,
                                           False, not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]
    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt,
                                             args.ignoreTgt, args.ignore,
                                             args.qualThresh, args.fdr)

    # filter the stateMap to take into account the command-line options
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))

    # write the model if specified
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)

    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col - 1,
                   args.noMerge, args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap. Maybe matplotlib is"
                               " not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Evaluate a given data set with a trained HMM. Display"
        " the log probability of the input data given the model, and"
        " optionally output the most likely sequence of hidden states.")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file"
                        " containing paths to genome annotation tracks")
    parser.add_argument("inputModel", help="Path of hmm created with"
                        " teHmmTrain.py")
    parser.add_argument("bedRegions", help="Intervals to process")
    parser.add_argument("--bed", help="path of file to write viterbi"
                        " output to (most likely sequence of hidden states)",
                        default=None)
    parser.add_argument("--numThreads", help="Number of threads to use (only"
                        " applies to CFG parser for the moment)",
                        type=int, default=1)
    parser.add_argument("--slice", help="Make sure that regions are sliced"
                        " to a maximum length of the given value. Most"
                        " useful when model is a CFG to keep memory down."
                        " When 0, no slicing is done", type=int, default=0)
    parser.add_argument("--segment", help="Use the intervals in bedRegions"
                        " as segments which each count as a single column"
                        " for evaluation. Note the model should have been"
                        " trained with the --segment option pointing to this"
                        " same bed file.", action="store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--maxPost", help="Use maximum posterior decoding"
                        " instead of Viterbi for evaluation",
                        action="store_true", default=False)
    parser.add_argument("--pd", help="Output BED file for posterior"
                        " distribution. Must be used in conjunction with"
                        " --pdStates (View on the browser via"
                        " bedGraphToBigWig)", default=None)
    parser.add_argument("--pdStates", help="comma-separated list of state"
                        " names to use for computing posterior distribution."
                        " For example: --pdStates inside,LTR_left,LTR_right"
                        " will compute the probability, for each observation,"
                        " that the hidden state is inside OR LTR_left OR"
                        " LTR_right. Must be used with --pd to specify output"
                        " file.", default=None)
    parser.add_argument("--bic", help="save Bayesian Information Criterion"
                        " (BIC) score in given file", default=None)
    parser.add_argument("--ed", help="Output BED file for emission"
                        " distribution. Must be used in conjunction with"
                        " --edStates (View on the browser via"
                        " bedGraphToBigWig)", default=None)
    parser.add_argument("--edStates", help="comma-separated list of state"
                        " names to use for computing emission distribution."
                        " For example: --edStates inside,LTR_left gives, for"
                        " each observation, the probability that inside"
                        " emitted that observation plus the probability that"
                        " LTR_left emitted it. If more than one state is"
                        " selected, this is not a distribution, but a sum of"
                        " distributions (and values can exceed 1). Mostly for"
                        " debugging purposes. Note output in LOG",
                        default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions,"
                        " to run in parallel (in BED format). input regions"
                        " will be intersected with each line in this file,"
                        " and the result will correspond to an individual"
                        " job", default=None)
    parser.add_argument("--proc", help="number of processes (use in"
                        " conjunction with --chroms)", type=int, default=1)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.slice <= 0:
        args.slice = sys.maxint
    elif args.segment is True:
        raise RuntimeError("--slice and --segment options are not compatible"
                           " at this time")
    if (args.pd is not None) ^ (args.pdStates is not None):
        raise RuntimeError("--pd requires --pdStates and vice versa")
    if (args.ed is not None) ^ (args.edStates is not None):
        raise RuntimeError("--ed requires --edStates and vice versa")
    if args.bed is None and (args.pd is not None or args.ed is not None):
        raise RuntimeError("Both --ed and --pd only usable in conjunction"
                           " with --bed")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    if isinstance(model, MultitrackCfg):
        if args.maxPost is True:
            raise RuntimeError("--maxPost not supported on CFG models")

    # apply the effective segment length
    if args.segLen > 0:
        assert args.segment is True
        model.getEmissionModel().effectiveSegmentLength = args.segLen

    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.bedRegions)
    mergedIntervals = getMergedBedIntervals(args.bedRegions, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.bedRegions)

    # slice if desired
    choppedIntervals = [x for x in slicedIntervals(mergedIntervals,
                                                   args.slice)]

    # read segment intervals
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.bedRegions)
        segIntervals = readBedIntervals(args.bedRegions, sort=True)

    # load the input
    # read the tracks, while intersecting them with the given interval
    trackData = TrackData()
    # note we pass in the trackList that was saved as part of the model
    # because we do not want to generate a new one.
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData.loadTrackData(args.tracksInfo, choppedIntervals,
                            model.getTrackList(),
                            segmentIntervals=segIntervals)

    # do the viterbi algorithm
    if isinstance(model, MultitrackHmm):
        algname = "viterbi"
        if args.maxPost is True:
            algname = "posterior decoding"
        logger.info("running %s algorithm" % algname)
    elif isinstance(model, MultitrackCfg):
        logger.info("running CYK algorithm")

    vitOutFile = None
    if args.bed is not None:
        vitOutFile = open(args.bed, "w")
    totalScore = 0
    tableIndex = 0
    totalDatapoints = 0

    # Note: in general there's room to save on memory by only computing single
    # track table at once (just need to add table by table interface to hmm...)
    posteriors = [None] * trackData.getNumTrackTables()
    posteriorsFile = None
    posteriorsMask = None
    if args.pd is not None:
        posteriors = model.posteriorDistribution(trackData)
        posteriorsFile = open(args.pd, "w")
        posteriorsMask = getPosteriorsMask(args.pdStates, model)
        assert len(posteriors[0][0]) == len(posteriorsMask)
    emProbs = [None] * trackData.getNumTrackTables()
    emissionsFile = None
    emissionsMask = None
    if args.ed is not None:
        emProbs = model.emissionDistribution(trackData)
        emissionsFile = open(args.ed, "w")
        emissionsMask = getPosteriorsMask(args.edStates, model)
        assert len(emProbs[0][0]) == len(emissionsMask)

    decodeFunction = model.viterbi
    if args.maxPost is True:
        decodeFunction = model.posteriorDecode

    for i, (vitLogProb, vitStates) in enumerate(decodeFunction(
            trackData, numThreads=args.numThreads)):
        totalScore += vitLogProb
        if args.bed is not None or args.pd is not None:
            if args.bed is not None:
                vitOutFile.write("#Viterbi Score: %f\n" % (vitLogProb))
            trackTable = trackData.getTrackTableList()[tableIndex]
            tableIndex += 1
            statesToBed(trackTable, vitStates, vitOutFile, posteriors[i],
                        posteriorsMask, posteriorsFile, emProbs[i],
                        emissionsMask, emissionsFile)
            totalDatapoints += len(vitStates) * trackTable.getNumTracks()

    print "Viterbi (log) score: %f" % totalScore
    if isinstance(model, MultitrackHmm) and \
       model.current_iteration is not None:
        print "Number of EM iterations: %d" % model.current_iteration

    if args.bed is not None:
        vitOutFile.close()
    if posteriorsFile is not None:
        posteriorsFile.close()
    if emissionsFile is not None:
        emissionsFile.close()

    if args.bic is not None:
        bicFile = open(args.bic, "w")
        # http://en.wikipedia.org/wiki/Bayesian_information_criterion
        lnL = float(totalScore)
        try:
            k = float(model.getNumFreeParameters())
        except:
            # numFreeParameters still not done for semi-supervised
            # just pass through a 0 instead of crashing for now
            k = 0.0
        n = float(totalDatapoints)
        bic = -2.0 * lnL + k * (np.log(n) + np.log(2 * np.pi))
        bicFile.write("%f\n" % bic)
        bicFile.write("# = -2.0 * lnL + k * (lnN + ln(2 * np.pi))\n"
                      "# where lnL=%f k=%d (%d states) N=%d (%d obs * %d"
                      " tracks) lnN=%f\n" % (
                          lnL, int(k),
                          model.getEmissionModel().getNumStates(),
                          int(totalDatapoints),
                          totalDatapoints /
                          model.getEmissionModel().getNumTracks(),
                          model.getEmissionModel().getNumTracks(),
                          np.log(n)))
        bicFile.close()

    cleanBedTool(tempBedToolPath)
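# For reference, the score written to the --bic file above is
#     BIC = -2*lnL + k*(ln(N) + ln(2*pi))
# which differs from the textbook BIC = k*ln(N) - 2*lnL by the constant
# k*ln(2*pi) per-parameter term; here N is the total data-point count
# (observations * tracks) and k the number of free model parameters.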
def parallelDispatch(argv, args):
    """ chunk up input with chroms option. Recursively launch eval. Merge
    results """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    offset = args.co
    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        cmdToks[cmdToks.index("--chroms") + 1] = ""
        cmdToks[cmdToks.index("--chroms")] = ""

        chromPath = getLocalTempPath("TempChromPath", ".bed")
        cpFile = open(chromPath, "w")
        cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2]))
        cpFile.close()

        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.allBed, chromPath, regionPath))

        if os.path.getsize(regionPath) < 2:
            continue

        offset += int(chrom[2]) - int(chrom[1])

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        cmdToks[2] = regionPath

        segPath = getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)

        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co") + 1] = str(offset)
        else:
            cmdToks.append("--co")
            cmdToks.append(str(offset))

        if args.stats is not None:
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats") + 1] = statsPath
            statsFiles.append(statsPath)
        cmd = " ".join(cmdToks)
        jobList.append(cmd)

    runParallelShellCommands(jobList, args.proc)

    for i in xrange(len(jobList)):
        if i == 0:
            ct = ">"
        else:
            ct = ">>"
        runShellCommand("cat %s %s %s" % (segFiles[i], ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    for i in itertools.chain(chromFiles, regionFiles, segFiles, statsFiles):
        runShellCommand("rm %s" % i)
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Thin wrapper of teHmmTrain.py and teHmmEval.py"
        " to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")
    parser.add_argument("tracks", help="tracks xml used for training and"
                        " eval")
    parser.add_argument("trainingBeds", help="comma-separated list of"
                        " training regions (training region size will be a"
                        " variable in output table). if segmentation is"
                        " activated, these must also be the segmented"
                        " beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma-separated list of numbers of"
                        " states to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type=int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type=int, default=1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is"
                        " overridden to specify a list of transition"
                        " initialization files instead of state numbers",
                        action="store_true", default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specify a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specify a list of iteration counts (--iter)"
                        " arguments", action="store_true", default=False)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps,
                             args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter}"
                           " can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and \
       not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    print toks
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        print prevSize, trainingSize, sameSizeCount
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[
                        states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod,
                    " ".join(trainOpts), statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))
    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize),
                                            int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics),
                                                  np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()
    cleanBedTool(tempBedToolPath)
def main(argv=None): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Thin wrapper of teHmmTrain.py and teHmmEval.py " "to generate a table of Number-of-HMM-states vs. BIC. Lower BIC" " is better") parser.add_argument("tracks", help="tracks xml used for training and eval") parser.add_argument("trainingBeds", help="comma-separated list of training regions" " (training region size will be a variable in output table). " "if segmentation is activated, these must also be the " "segmented beds...") parser.add_argument("evalBed", help="eval region") parser.add_argument("trainOpts", help="all teHmmTrain options in quotes") parser.add_argument("evalOpts", help="all teHmmEval options in quotes") parser.add_argument("states", help="comma-separated list of numbers of states" " to try") parser.add_argument("outDir", help="output directory") parser.add_argument("--reps", help="number of replicates", type=int, default=1) parser.add_argument("--proc", help="maximum number of processors to use" " in parallel", type=int, default=1) parser.add_argument("--resume", help="try not to rewrite existing files", action="store_true", default=False) parser.add_argument("--initTrans", help="the states argument is overridden" " to specify a list of transition initialization files " "instead of state numbers", action="store_true", default=False) parser.add_argument("--numReps", help="the states argument is overridden" " to specify a list of replicate numbers (--reps)" " arguments", action="store_true", default=False) parser.add_argument("--numIter", help="the states argument is overridden" " to specify a list of iteration counts (--iter)" " arguments", action="store_true", default=False) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1: raise RuntimeError("only one of {--initTrans, --numReps, --numIter} " "can be used at a time") if not os.path.isdir(args.outDir): runShellCommand("mkdir %s" % args.outDir) # get the sizes of the training beds trainingSizes = [] trainingBeds = [] for tb in args.trainingBeds.split(","): if len(tb) > 0: trainingBeds.append(tb) for bed in trainingBeds: assert os.path.isfile(bed) bedLen = 0 for interval in readBedIntervals(bed): bedLen += interval[2] - interval[1] trainingSizes.append(bedLen) # make sure --bed not in teHmmEval options and --numStates not in train # options trainOpts = args.trainOpts.split() if "--numStates" in args.trainOpts and not args.numReps and not args.numIter: nsIdx = trainOpts.index("--numStates") assert nsIdx < len(trainOpts) - 1 del trainOpts[nsIdx] del trainOpts[nsIdx] if "--initTransProbs" in args.trainOpts: tpIdx = trainOpts.index("--initTransProbs") assert tpIdx < len(trainOpts) - 1 del trainOpts[tpIdx] del trainOpts[tpIdx] trainProcs = 1 if "--numThreads" in args.trainOpts: npIdx = trainOpts.index("--numThreads") assert npIdx < len(trainOpts) - 1 trainProcs = int(trainOpts[npIdx + 1]) segOptIdx = -1 if "--segment" in args.trainOpts: segIdx = trainOpts.index("--segment") assert segIdx < len(trainOpts) - 1 segOptIdx = segIdx + 1 if args.numReps and "--reps" in args.trainOpts: repsIdx = trainOpts.index("--reps") assert repsIdx < len(trainOpts) - 1 del trainOpts[repsIdx] del trainOpts[repsIdx] if args.numIter and "--iter" in args.trainOpts: iterIdx = trainOpts.index("--iter") assert iterIdx < len(trainOpts) - 1 del trainOpts[iterIdx] del trainOpts[iterIdx] evalOpts = args.evalOpts.split() if "--bed" in args.evalOpts: bedIdx = evalOpts.index("--bed") assert bedIdx < len(evalOpts) - 1 del evalOpts[bedIdx] del evalOpts[bedIdx] if "--bic" in args.evalOpts: bicIdx = evalOpts.index("--bic") assert bicIdx < len(evalOpts) - 1 del evalOpts[bicIdx] del evalOpts[bicIdx] # hack in support for --initTrans option by munging out model sizes # from the text files if args.initTrans is True: transFiles = args.states.split(",") states = [] for tf in transFiles: stateSet = set() with open(tf) as f: for line in f: toks = line.split() if len(toks) > 1 and toks[0][0] != "#": stateSet.add(toks[0]) stateSet.add(toks[1]) states.append(len(stateSet)) else: states = args.states.split(",") trainCmds = [] evalCmds = [] prevSize = -1 sameSizeCount = 0 for trainingSize, trainingBed in zip(trainingSizes, trainingBeds): # hack to take into account we may have different inputs with the same # size, so their corresponding results need unique filenames if trainingSize == prevSize: sameSizeCount += 1 else: sameSizeCount = 0 prevSize = trainingSize for numStates in states: for rep in xrange(args.reps): outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % ( trainingSize, sameSizeCount, int(numStates), int(rep))) if segOptIdx != -1: trainOpts[segOptIdx] = trainingBed if args.initTrans is True: statesOpt = "--initTransProbs %s" % transFiles[states.index(numStates)] elif args.numIter is True: # states argument overridden by iterations statesOpt = "--iter %d" % int(numStates) elif args.numReps is True: # states argument overridden by reps statesOpt = "--reps %d" % int(numStates) else: statesOpt = "--numStates %d" % int(numStates) trainCmd = "teHmmTrain.py %s %s %s %s %s" % ( args.tracks, trainingBed, outMod, " ".join(trainOpts), statesOpt) if not args.resume or not os.path.isfile(outMod) or \ os.path.getsize(outMod) < 100: trainCmds.append(trainCmd) outBic = outMod.replace(".mod", ".bic") outBed = outMod.replace(".mod", "_eval.bed") evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % ( args.tracks, outMod, args.evalBed, outBed, outBic, " ".join(evalOpts)) if not args.resume or not os.path.isfile(outBic) or \ os.path.getsize(outBic) < 2: evalCmds.append(evalCmd) # run the training runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs)) # run the eval runParallelShellCommands(evalCmds, args.proc) # make the table header tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w") stateColName = "states" if args.numIter is True: stateColName = "iter" elif args.numReps is True: stateColName = "reps" tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" % stateColName) for i in xrange(args.reps): tableFile.write(", bic.%d" % i) tableFile.write("\n") # make the table body prevSize = -1 sameSizeCount = 0 for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds): # hack to take into account we may have different inputs with the same # size, so their corresponding results need unique filenames if trainingSize == prevSize: sameSizeCount += 1 else: sameSizeCount = 0 prevSize = trainingSize for numStates in states: bics = [] printBics = [] for rep in xrange(args.reps): outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % ( trainingSize, sameSizeCount, int(numStates), int(rep))) outBic = outMod.replace(".mod", ".bic") try: with open(outBic, "r") as obFile: for line in obFile: bic = float(line.split()[0]) break bics.append(bic) printBics.append(bic) except Exception: logger.warning("Couldn't find bic %s" % outBic) printBics.append("ERROR") # write row tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize), int(numStates))) if len(bics) > 0: tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics), np.max(bics))) else: tableFile.write(", ERROR, ERROR, ERROR") for pb in printBics: tableFile.write(", %s" % pb) tableFile.write("\n") tableFile.close() cleanBedTool(tempBedToolPath)
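# Example invocation of the wrapper above (a hedged sketch: the script file
# name and all paths here are assumptions, not taken from this module).
# Sweep 2, 4 and 8 states with 3 replicates each over two training regions,
# running up to 4 processes at a time:
#
#   ./statesVsBic.py tracks.xml train1.bed,train2.bed eval.bed \
#       "--iter 200" "" 2,4,8 bicOut --reps 3 --proc 4
#
# The resulting bicOut/bictable.csv then has one row per (trainFile, states)
# pair with mean/min/max BIC over the replicates, as written by the code above.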
def testSupervisedLearn(self): intervals = readBedIntervals(getTestDirPath("truth.bed"), ncol=4) truthIntervals = [] for i in intervals: truthIntervals.append((i[0], i[1], i[2], int(i[3]))) allIntervals = [(truthIntervals[0][0], truthIntervals[0][1], truthIntervals[-1][2])] trackData = TrackData() trackData.loadTrackData(getTracksInfoPath(3), allIntervals) assert len(trackData.getTrackTableList()) == 1 # set the fudge to 1 since when the test was written this was the # hardcoded default em = IndependentMultinomialEmissionModel( 4, trackData.getNumSymbolsPerTrack(), fudge=1.0) hmm = MultitrackHmm(em) hmm.supervisedTrain(trackData, truthIntervals) hmm.validate() # check emissions, they should basically be binary. trackList = hmm.getTrackList() emp = np.exp(em.getLogProbs()) ltrTrack = trackList.getTrackByName("ltr") track = ltrTrack.getNumber() cmap = ltrTrack.getValueMap() s0 = cmap.getMap(None) s1 = cmap.getMap(0) # we add 1 to all frequencies like the emission trainer assert_array_almost_equal(emp[track][0][s0], 36. / 37.) assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.) assert_array_almost_equal(emp[track][1][s0], 1 - 6. / 7.) assert_array_almost_equal(emp[track][1][s1], 6. / 7.) assert_array_almost_equal(emp[track][2][s0], 26. / 27.) assert_array_almost_equal(emp[track][2][s1], 1. - 26. / 27.) assert_array_almost_equal(emp[track][3][s0], 1. - 6. / 7.) assert_array_almost_equal(emp[track][3][s1], 6. / 7.) insideTrack = trackList.getTrackByName("inside") track = insideTrack.getNumber() cmap = insideTrack.getValueMap() s0 = cmap.getMap(None) s1 = cmap.getMap("Inside") assert_array_almost_equal(emp[track][0][s0], 36. / 37.) assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.) assert_array_almost_equal(emp[track][1][s0], 6. / 7.) assert_array_almost_equal(emp[track][1][s1], 1 - 6. / 7.) assert_array_almost_equal(emp[track][2][s0], 1. - 26. / 27.) assert_array_almost_equal(emp[track][2][s1], 26. / 27.) assert_array_almost_equal(emp[track][3][s0], 6. / 7.) assert_array_almost_equal(emp[track][3][s1], 1. - 6. / 7.) # rough check for start probs; transitions are checked just below freq = [0.0] * em.getNumStates() total = 0.0 for interval in truthIntervals: state = interval[3] freq[state] += float(interval[2]) - float(interval[1]) total += float(interval[2]) - float(interval[1]) sprobs = hmm.getStartProbs() assert len(sprobs) == em.getNumStates() for state in xrange(em.getNumStates()): assert_array_almost_equal(freq[state] / total, sprobs[state]) # transition probabilities # from eyeball: #c 0 5 0 0->0 +4 0->1 +1 0-> +5 #c 5 10 1 1->1 +4 1->2 +1 1-> +5 #c 10 35 2 2->2 +24 2->3 +1 2-> +25 #c 35 40 3 3->3 +4 3->0 +1 3-> +5 #c 40 70 0 0->0 +29 0-> +19 realTransProbs = np.array([[33. / 34., 1. / 34., 0., 0.], [0., 4. / 5., 1. / 5., 0.], [0., 0., 24. / 25., 1. / 25.], [1. / 5., 0., 0., 4. / 5.]]) tprobs = hmm.getTransitionProbs() assert tprobs.shape == (em.getNumStates(), em.getNumStates()) assert_array_almost_equal(tprobs, realTransProbs) prob, states = hmm.viterbi(trackData)[0] for truthInt in truthIntervals: for i in xrange(truthInt[1], truthInt[2]): assert states[i] == truthInt[3]
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Make some tables of statistics from a BED file. All" " output will be written in one big CSV table to be viewed in a " "spreadsheet.") parser.add_argument("inBed", help="Input bed file") parser.add_argument("outCsv", help="Path to write output in CSV format") parser.add_argument("--ignore", help="Comma-separated list of names" " to ignore", default="") parser.add_argument("--numBins", help="Number of (linear) bins for " "histograms", type=int, default=10) parser.add_argument("--logHist", help="Apply log-transform to data for " "histogram", action="store_true", default=False) parser.add_argument("--histRange", help="Histogram range as comma-" "separated pair of numbers", default=None) parser.add_argument("--noHist", help="skip histograms", action="store_true", default=False) parser.add_argument("--noScore", help="Just do length stats", action="store_true", default=False) parser.add_argument("--noLen", help="Just do score stats", action="store_true", default=False) parser.add_argument("--nearness", help="Compute nearness stats (instead " "of normal stats) of input bed with given BED. Output" " will be a BED instead of CSV, with nearness in the " "score position", default=None) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() if args.histRange is not None: args.histRange = args.histRange.split(",") assert len(args.histRange) == 2 args.histRange = int(args.histRange[0]), int(args.histRange[1]) outFile = open(args.outCsv, "w") args.ignoreSet = set(args.ignore.split(",")) intervals = readBedIntervals(args.inBed, ncol=5, sort=args.nearness is not None) csvStats = "" # nearness stats if args.nearness is not None: args.noScore = True csvStats = makeNearnessBED(intervals, args) # length stats elif args.noLen is False: csvStats = makeCSV(intervals, args, lambda x: int(x[2]) - int(x[1]), "Length") # score stats try: if args.noScore is False: csvStats += "\n" + makeCSV(intervals, args, lambda x: float(x[4]), "Score") csvStats += "\n" + makeCSV( intervals, args, lambda x: float(x[4]) * (float(x[2]) - float(x[1])), "Score*Length") except Exception as e: logger.warning("Couldn't make score stats because %s" % str(e)) outFile.write(csvStats) outFile.write("\n") outFile.close() cleanBedTool(tempBedToolPath)
def main(argv=None): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Write track data into ASCII dump. Row i corresponds" " to the ith position found when scanning query BED IN SORTED ORDER. " "Column j corresponds to the jth track in the XML file. --map option" " used to write internal integer format used by HMM. Unobserved values" " written as \"None\" if default attribute not specified or track not" " binary. Rounding can occur if scaling parameters present.\n\n" "IMPORTANT: values stored in 8bit integers internally. Any track with" " more than 256 different values will get clamped (with a warning)") parser.add_argument("tracks", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("query", help="BED region(s) to dump. SCANNED IN" " SORTED ORDER") parser.add_argument("output", help="Path of file to write output to") parser.add_argument("--map", help="Apply name mapping, including" " transformation specified in scale, logScale" ", etc. attributes, that HMM uses internally" ". Important to note that resulting integers" " are just unique IDs. ID_1 > ID_2 does not" " mean anything", action="store_true", default=False) parser.add_argument("--segment", help="Treat each interval in query" " as a single segment (ie with only one data point)" ". In this case, query should probably have been" " generated with segmentTracks.py", action="store_true", default=False) parser.add_argument("--noPos", help="Do not print genomic position" " (first 2 columns)", action="store_true", default=False) parser.add_argument("--noMask", help="Ignore mask tracks", default=False, action="store_true") addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) # make sure output writeable outFile = open(args.output, "w") # need to remember to fix this; disable as a precaution for now assert args.noMask is True or args.segment is False # read query intervals from the bed file logger.info("loading query intervals from %s" % args.query) mergedIntervals = getMergedBedIntervals(args.query, ncol=3) if mergedIntervals is None or len(mergedIntervals) < 1: raise RuntimeError("Could not read any intervals from %s" % args.query) # read the segment intervals from the (same) bed file segIntervals = None if args.segment is True: logger.info("loading segment intervals from %s" % args.query) segIntervals = readBedIntervals(args.query, sort=True) # read all data from track xml logger.info("loading tracks %s" % args.tracks) trackData = TrackData() trackData.loadTrackData(args.tracks, mergedIntervals, segmentIntervals=segIntervals, applyMasking=not args.noMask) # dump the data to output dumpTrackData(trackData, outFile, args.map, not args.noPos) outFile.close()
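# Illustrative output layout for the dump above (an assumption pieced
# together from the help text, with made-up values): without --noPos the
# first two columns are the genomic position, followed by one column per
# track in XML order, e.g. for two tracks "ltr" and "inside":
#
#   chr1    1000    LTR     None
#   chr1    1001    LTR     Inside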
def runPositionalComparison(argv, args): """ hack to recursively execute compareBedStates.py on a sliding window of the two inputs and report accuracy in a BED file """ try: windowToks = args.window.split(",") assert len(windowToks) == 5 windowSize = int(windowToks[0]) stateName = windowToks[1] compType = windowToks[2] score = windowToks[3] outBed = windowToks[4] except Exception: raise RuntimeError("value passed to --window is not in valid format") if compType == "base": compIdx = 0 elif compType == "interval": compIdx = 1 elif compType == "weighted": compIdx = 2 else: raise RuntimeError("invalid compType, %s, passed to --window" % compType) if score != "f1" and score != "precision" and score != "recall": raise RuntimeError("invalid score, %s, passed to --window" % score) try: outFile = open(outBed, "w") except Exception: raise RuntimeError("invalid outBed, %s, passed to --window" % outBed) tempBed = getLocalTempPath("Temp_region", ".bed") runShellCommand("mergeBed -i %s > %s" % (args.bed1, tempBed)) chunkBed = getLocalTempPath("Temp_chunkBed", ".bed") runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" % ( tempBed, windowSize, chunkBed)) window = getLocalTempPath("Temp_window", ".bed") slice1 = getLocalTempPath("Temp_slice1", ".bed") slice2 = getLocalTempPath("Temp_slice2", ".bed") compFile = getLocalTempPath("Temp_compFile", ".bed") compOpts = "" winIdx = argv.index("--window") assert winIdx > 0 and winIdx < len(argv) - 1 and argv[winIdx + 1] == args.window for i in xrange(3, len(argv)): if i != winIdx and i != winIdx + 1: compOpts += " " + argv[i] for chunk in readBedIntervals(chunkBed): runShellCommand("echo \"%s\t%d\t%d\" > %s" % (chunk[0], chunk[1], chunk[2], window)) runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % ( args.bed1, window, slice1)) runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % ( args.bed2, window, slice2)) runShellCommand("compareBedStates.py %s %s %s > %s" % ( slice1, slice2, compOpts, compFile)) stats = extractCompStatsFromFile(compFile)[compIdx] if stateName not in stats: stats[stateName] = (0, 0) f1 = 0. prec, rec = stats[stateName] if prec + rec > 0: f1 = (2. * prec * rec) / (prec + rec) val = f1 if score == "precision": val = prec elif score == "recall": val = rec outFile.write("%s\t%d\t%d\t%f\n" % (chunk[0], chunk[1], chunk[2], val)) runShellCommand("rm -f %s %s %s %s %s %s" % (tempBed, chunkBed, window, slice1, slice2, compFile)) outFile.close()
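# Worked example of the 5-tuple parsed above: --window 1000000,TE,base,f1,acc.bed
# slides 1MB windows (50% overlap) over bed1 and writes the base-level F1 of
# state "TE" for each window to acc.bed. Using the formula in the loop above,
# a window with precision 0.8 and recall 0.5 would get the score
# (2. * 0.8 * 0.5) / (0.8 + 0.5) ~= 0.615.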
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Make some tables of statistics from a BED file. All" " output will be written in one big CSV table to be viewed in a " "spreadsheet.") parser.add_argument("inBed", help="Input bed file") parser.add_argument("outCsv", help="Path to write output in CSV format") parser.add_argument("--ignore", help="Comma-separated list of names" " to ignore", default="") parser.add_argument("--numBins", help="Number of (linear) bins for " "histograms", type=int, default=10) parser.add_argument("--logHist", help="Apply log-transform to data for " "histogram", action="store_true", default=False) parser.add_argument("--histRange", help="Histogram range as comma-" "separated pair of numbers", default=None) parser.add_argument("--noHist", help="skip hisograms", action="store_true", default=False) parser.add_argument("--noScore", help="Just do length stats", action="store_true", default=False) parser.add_argument("--noLen", help="Just do score stats", action="store_true", default=False) parser.add_argument("--nearness", help="Compute nearness stats (instead " "of normal stats) of input bed with given BED. Output" " will be a BED instead of CSV, with nearness in the " "score position", default=None) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() if args.histRange is not None: args.histRange = args.histRange.split(",") assert len(args.histRange) == 2 args.histRange = int(args.histRange[0]), int(args.histRange[1]) outFile = open(args.outCsv, "w") args.ignoreSet = set(args.ignore.split(",")) intervals = readBedIntervals(args.inBed, ncol = 5, sort = args.nearness is not None) csvStats = "" # nearness stats if args.nearness is not None: args.noScore = True csvStats = makeNearnessBED(intervals, args) # length stats elif args.noLen is False: csvStats = makeCSV(intervals, args, lambda x : int(x[2])-int(x[1]), "Length") # score stats try: if args.noScore is False: csvStats += "\n" + makeCSV(intervals, args, lambda x : float(x[4]), "Score") csvStats += "\n" + makeCSV(intervals, args, lambda x : float(x[4]) * ( float(x[2]) - float(x[1])), "Score*Length") except Exception as e: logger.warning("Couldn't make score stats because %s" % str(e)) outFile.write(csvStats) outFile.write("\n") outFile.close() cleanBedTool(tempBedToolPath)
def testHmmSupervisedLearn(self): """ Pretty much copied from the HMM unit test. We try to recapitulate all results with a CFG with no nest states, which should be the same as the HMM""" intervals = readBedIntervals(getTestDirPath("truth.bed"), ncol=4) truthIntervals = [] for i in intervals: truthIntervals.append((i[0], i[1], i[2], int(i[3]))) allIntervals = [(truthIntervals[0][0], truthIntervals[0][1], truthIntervals[-1][2])] trackData = TrackData() trackData.loadTrackData(getTracksInfoPath(3), allIntervals) assert len(trackData.getTrackTableList()) == 1 # set the fudge to 1 since when the test was written this was the # hardcoded default em = IndependentMultinomialEmissionModel( 4, trackData.getNumSymbolsPerTrack(), fudge=1.0) hmm = MultitrackHmm(em) hmm.supervisedTrain(trackData, truthIntervals) hmm.validate() pairModel = PairEmissionModel(em, [1.0] * em.getNumStates()) # Test validates with a nest state just for fun cfg = MultitrackCfg(em, pairModel, nestStates=[1]) cfg.supervisedTrain(trackData, truthIntervals) cfg.validate() # Then reload as an hmm-equivalent cfg = MultitrackCfg(em, pairModel, nestStates=[]) cfg.supervisedTrain(trackData, truthIntervals) cfg.validate() # check emissions, they should basically be binary. trackList = cfg.getTrackList() emp = np.exp(em.getLogProbs()) ltrTrack = trackList.getTrackByName("ltr") track = ltrTrack.getNumber() cmap = ltrTrack.getValueMap() s0 = cmap.getMap(None) s1 = cmap.getMap(0) # we add 1 to all frequencies like the emission trainer assert_array_almost_equal(emp[track][0][s0], 36. / 37.) assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.) assert_array_almost_equal(emp[track][1][s0], 1 - 6. / 7.) assert_array_almost_equal(emp[track][1][s1], 6. / 7.) assert_array_almost_equal(emp[track][2][s0], 26. / 27.) assert_array_almost_equal(emp[track][2][s1], 1. - 26. / 27.) assert_array_almost_equal(emp[track][3][s0], 1. - 6. / 7.) assert_array_almost_equal(emp[track][3][s1], 6. / 7.) insideTrack = trackList.getTrackByName("inside") track = insideTrack.getNumber() cmap = insideTrack.getValueMap() s0 = cmap.getMap(None) s1 = cmap.getMap("Inside") assert_array_almost_equal(emp[track][0][s0], 36. / 37.) assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.) assert_array_almost_equal(emp[track][1][s0], 6. / 7.) assert_array_almost_equal(emp[track][1][s1], 1 - 6. / 7.) assert_array_almost_equal(emp[track][2][s0], 1. - 26. / 27.) assert_array_almost_equal(emp[track][2][s1], 26. / 27.) assert_array_almost_equal(emp[track][3][s0], 6. / 7.) assert_array_almost_equal(emp[track][3][s1], 1. - 6. / 7.) # rough check for start probs; transitions are checked just below freq = [0.0] * em.getNumStates() total = 0.0 for interval in truthIntervals: state = interval[3] freq[state] += float(interval[2]) - float(interval[1]) total += float(interval[2]) - float(interval[1]) sprobs = cfg.getStartProbs() assert len(sprobs) == em.getNumStates() for state in xrange(em.getNumStates()): assert_array_almost_equal(freq[state] / total, sprobs[state]) # transition probabilities # from eyeball: #c 0 5 0 0->0 +4 0->1 +1 0-> +5 #c 5 10 1 1->1 +4 1->2 +1 1-> +5 #c 10 35 2 2->2 +24 2->3 +1 2-> +25 #c 35 40 3 3->3 +4 3->0 +1 3-> +5 #c 40 70 0 0->0 +29 0-> +19 realTransProbs = np.array([[33. / 34., 1. / 34., 0., 0.], [0., 4. / 5., 1. / 5., 0.], [0., 0., 24. / 25., 1. / 25.], [1. / 5., 0., 0., 4. / 5.]]) tprobs = np.exp(cfg.getLogProbTables()[0]) assert tprobs.shape == (em.getNumStates(), em.getNumStates(), em.getNumStates()) for i in xrange(em.getNumStates()): for j in xrange(em.getNumStates()): fbTot = tprobs[i, i, j] if i != j: fbTot += tprobs[i, j, i] assert_array_almost_equal(fbTot, realTransProbs[i, j]) prob, states = cfg.decode(trackData.getTrackTableList()[0]) for truthInt in truthIntervals: for i in xrange(truthInt[1], truthInt[2]): # the ltr track is binary, which means ltr states can be either # 1 or 3. need to fix the test properly, but just relax the # comparison for now. if truthInt[3] == 1 or truthInt[3] == 3: assert states[i] == 1 or states[i] == 3 else: assert states[i] == truthInt[3]
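# Note on the folding check above (a reading of the test logic, not an
# authoritative statement): in the HMM-equivalent grammar, an HMM transition
# i->j can be generated by either production (i -> i j) or (i -> j i), so the
# two CFG table entries are summed into fbTot before comparing against the
# HMM transition matrix.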
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Rename HMM states.") parser.add_argument("inputModel", help="Path of teHMM model created with" " teHmmTrain.py") parser.add_argument("outputModel", help="Path of model with renamed states") parser.add_argument("--newNames", help="comma-separated list of state names to" " apply. This list must have exactly the same number of" " states as the model. The ith name in the list will be " "assigned to the ith name of the model...", default=None) parser.add_argument("--teNumbers", help="comma-separated list of state numbers" " that will be assigned TE states, with everything else" " assigned Other. This is less flexible but maybe more" " convenient at times than --newNames.", default=None) parser.add_argument("--bed", help="apply naming to bed file and print " "results to stdout", default=None) parser.add_argument("--sizes", help="bedFile to use for computing state numbering" " by using decreasing order in total coverage (only works" " with --teNumbers)", default=None) parser.add_argument("--noMerge", help="don't merge adjacent intervals with the same" " name with --bed option", action="store_true", default=False) parser.add_argument("--changeTrackName", help="don't do anything else, just change" " the name of one track. specified value should be of form" " currentName,newName", default=None) args = parser.parse_args() assert args.inputModel != args.outputModel # load model created with teHmmTrain.py model = loadModel(args.inputModel) # trackChangeName logic hacked in completely separate from everything else if args.changeTrackName is not None: oldName, newName = args.changeTrackName.split(",") track = model.getTrackList().getTrackByName(oldName) track.setName(newName) saveModel(args.outputModel, model) return 0 assert (args.newNames is None) != (args.teNumbers is None) # names manually specified if args.newNames is not None: names = args.newNames.split(",") # names computed using simple scheme from set of "TE" state numbers (as found from # log output of fitStateNames.py) elif args.teNumbers is not None: teNos = set([int(x) for x in args.teNumbers.split(",")]) teCount, otherCount = 0, 0 numStates = model.getEmissionModel().getNumStates() # re-order from sizing info if args.sizes is not None: bedIntervals = readBedIntervals(args.sizes, ncol=4) sizeMap = defaultdict(int) for interval in bedIntervals: sizeMap[int(interval[3])] += interval[2] - interval[1] stateNumbers = sorted([x for x in xrange(numStates)], reverse=True, key=lambda x: sizeMap[x]) else: stateNumbers = [x for x in xrange(numStates)] names = [""] * numStates for i in stateNumbers: if i in teNos: name = "TE-%.2d" % teCount teCount += 1 else: name = "Other-%.2d" % otherCount otherCount += 1 names[i] = name assert teCount == len(teNos) and teCount + otherCount == len(names) assert len(names) == model.getEmissionModel().getNumStates() # throw names in the mapping object and stick into model catMap = CategoryMap(reserved=0) for i, name in enumerate(names): catMap.getMap(name, update=True) model.stateNameMap = catMap # save model saveModel(args.outputModel, model) # process optional bed file if args.bed is not None: prevInterval = None bedIntervals = readBedIntervals(args.bed, ncol=4) for interval in bedIntervals: oldName = interval[3] newName = names[int(oldName)] newInterval = list(interval) newInterval[3] = newName if args.noMerge: # write interval print "\t".join(str(x) for x in newInterval) else: if prevInterval is None: # update prev interval first time prevInterval = newInterval elif newInterval[3] == prevInterval[3] and\ newInterval[0] == prevInterval[0] and\ newInterval[1] == prevInterval[2]: # glue onto prev interval prevInterval[2] = newInterval[2] else: # write and update prev print "\t".join(str(x) for x in prevInterval) prevInterval = newInterval if prevInterval is not None: print "\t".join(str(x) for x in prevInterval)
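# Worked example of the --teNumbers scheme above (hypothetical model): for a
# 4-state model with --teNumbers 1,3 and no --sizes reordering, the naming
# loop assigns states 0..3 the names Other-00, TE-00, Other-01, TE-01
# respectively.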
def parallelDispatch(argv, args): """ chunk up input with chrom option, recursively launch eval, merge results """ jobList = [] chromIntervals = readBedIntervals(args.chroms, sort=True) chromFiles = [] regionFiles = [] segFiles = [] statsFiles = [] offset = args.co for chrom in chromIntervals: cmdToks = copy.deepcopy(argv) cmdToks[cmdToks.index("--chrom") + 1] = "" cmdToks[cmdToks.index("--chrom")] = "" chromPath = getLocalTempPath("TempChromPath", ".bed") cpFile = open(chromPath, "w") cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2])) cpFile.close() regionPath = getLocalTempPath("Temp", ".bed") runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.allBed, chromPath, regionPath)) if os.path.getsize(regionPath) < 2: continue offset += int(chrom[2]) - int(chrom[1]) regionFiles.append(regionPath) chromFiles.append(chromPath) cmdToks[2] = regionPath segPath = getLocalTempPath("Temp", ".bed") cmdToks[3] = segPath segFiles.append(segPath) if "--co" in cmdToks: cmdToks[cmdToks.index("--co") + 1] = str(offset) else: cmdToks.append("--co") cmdToks.append(str(offset)) if args.stats is not None: statsPath = getLocalTempPath("Temp", ".bed") cmdToks[cmdToks.index("--stats") + 1] = statsPath statsFiles.append(statsPath) cmd = " ".join(cmdToks) jobList.append(cmd) runParallelShellCommands(jobList, args.proc) for i in xrange(len(jobList)): if i == 0: ct = ">" else: ct = ">>" runShellCommand("cat %s %s %s" % (segFiles[i], ct, args.outBed)) if len(statsFiles) > 0: runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats)) for i in itertools.chain(chromFiles, regionFiles, segFiles, statsFiles): runShellCommand("rm %s" % i)
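# Note on the --co bookkeeping above (a reading of the code, not documented
# behaviour): each per-chromosome job receives the accumulated chromosome
# lengths via --co, so that offset-dependent numbering in the merged output
# stays consistent across the independently-run chunks.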
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Create starting transition and emission distributions " "from a candidate BED annotation, which can" " be used with teHmmTrain.py using the --initTransProbs and " "--initEmProbs options, respectively. The distributions created here" " are extremely simple, but this can be a good shortcut to at least " "getting the state names into the init files, which can be further " "tweaked by hand.") parser.add_argument("tracksInfo", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("trackName", help="Name of Track to use as initial" " annotation") parser.add_argument("queryBed", help="Bed file with regions to query") parser.add_argument("outTransProbs", help="File to write transition model" " to") parser.add_argument("outEmProbs", help="File to write emission model to") parser.add_argument("--numOut", help="Number of \"outside\" states to add" " to the model.", default=1, type=int) parser.add_argument("--numTot", help="Add x \"outside\" states such " "that total states is this. (overrides --numOut)", default=0, type=int) parser.add_argument("--outName", help="Name of outside states (will have" " numeric suffix if more than 1)", default="Outside") parser.add_argument("--mode", help="Strategy for initializing the " "transition graph: {\'star\': all states are connected" " to the outside state(s) but not each other; " " \'data\': transitions estimated from input bed; " " \'full\': don't write edges and let teHmmTrain.py " "initialize as a clique}", default="star") parser.add_argument("--selfTran", help="This script will always write all" " the self-transition probabilities to the output file. " "They will all be set to the specified value using this" " option, or estimated from the data if -1", default=-1., type=float) parser.add_argument("--em", help="Emission probability for input track (" "ie probability that state emits itself)", type=float, default=0.95) parser.add_argument("--outEmNone", help="Add None emission probabilities" " for target track for Outside states", action="store_true", default=None) addLoggingOptions(parser) args = parser.parse_args() if args.mode == "star" and args.numOut < 1: raise RuntimeError("--numOut must be at least 1 if --mode star is used") if args.mode != "star" and args.mode != "data" and args.mode != "full": raise RuntimeError("--mode must be one of {star, data, full}") if args.mode == "data": raise RuntimeError("--mode data not implemented yet") assert os.path.isfile(args.tracksInfo) setLoggingFromOptions(args) tempBedToolPath = initBedTool() # Read the tracks info trackList = TrackList(args.tracksInfo) # Extract the track we want track = trackList.getTrackByName(args.trackName) if track is None: raise RuntimeError("Track %s not found in tracksInfo" % args.trackName) trackPath = track.getPath() if track.getDist() != "multinomial" and track.getDist() != "gaussian": raise RuntimeError("Track %s does not have multinomial or " "gaussian distribution" % args.trackName) if track.getScale() is not None or track.getLogScale() is not None: raise RuntimeError("Track %s must not have scale" % args.trackName) # read query intervals from the bed file logger.info("loading query intervals from %s" % args.queryBed) mergedIntervals = getMergedBedIntervals(args.queryBed, ncol=4) if mergedIntervals is None or len(mergedIntervals) < 1: raise RuntimeError("Could not read any intervals from %s" % args.queryBed) # read the track, while intersecting with query intervals # (track is saved as a temp XML file for the sake of not changing the interface) bedIntervals = [] for queryInterval in mergedIntervals: bedIntervals += readBedIntervals(trackPath, ncol=track.getValCol() + 1, chrom=queryInterval[0], start=queryInterval[1], end=queryInterval[2]) # 1st pass to collect set of names nameMap = CategoryMap(reserved=0) for interval in bedIntervals: nameMap.update(interval[track.getValCol()]) outNameMap = CategoryMap(reserved=0) if args.numTot > 0: args.numOut = max(0, args.numTot - len(nameMap)) for i in xrange(args.numOut): outName = args.outName if args.numOut > 1: outName += str(i) assert nameMap.has(outName) is False outNameMap.update(outName) # write the transition model for use with teHmmTrain.py --initTransProbs writeTransitions(bedIntervals, nameMap, outNameMap, args) # write the emission model for use with teHmmTrain.py --initEmProbs writeEmissions(bedIntervals, nameMap, outNameMap, args) cleanBedTool(tempBedToolPath)
def parallelDispatch(argv, args): """ chunk up input with chrom option, recursively launch eval, merge results """ jobList = [] chromIntervals = readBedIntervals(args.chroms, sort=True) chromFiles = [] regionFiles = [] bedFiles = [] pdFiles = [] bicFiles = [] edFiles = [] for chrom in chromIntervals: cmdToks = copy.deepcopy(argv) cmdToks[cmdToks.index("--chrom") + 1] = "" cmdToks[cmdToks.index("--chrom")] = "" chromPath = getLocalTempPath("Temp", ".bed") cpFile = open(chromPath, "w") cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2])) cpFile.close() regionPath = getLocalTempPath("Temp", ".bed") runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.bedRegions, chromPath, regionPath)) if os.path.getsize(regionPath) < 2: continue regionFiles.append(regionPath) chromFiles.append(chromPath) cmdToks[3] = regionPath if args.bed is not None: bedPath = getLocalTempPath("Temp", ".bed") cmdToks[cmdToks.index("--bed") + 1] = bedPath bedFiles.append(bedPath) if args.pd is not None: pdPath = getLocalTempPath("Temp", ".bed") cmdToks[cmdToks.index("--pd") + 1] = pdPath pdFiles.append(pdPath) if args.ed is not None: edPath = getLocalTempPath("Temp", ".bed") cmdToks[cmdToks.index("--ed") + 1] = edPath edFiles.append(edPath) if args.bic is not None: bicPath = getLocalTempPath("Temp", ".bic") cmdToks[cmdToks.index("--bic") + 1] = bicPath bicFiles.append(bicPath) cmd = " ".join(cmdToks) jobList.append(cmd) runParallelShellCommands(jobList, args.proc) for i in xrange(len(jobList)): if i == 0: ct = ">" else: ct = ">>" if len(bedFiles) > 0: runShellCommand("cat %s %s %s" % (bedFiles[i], ct, args.bed)) if len(pdFiles) > 0: runShellCommand("cat %s %s %s" % (pdFiles[i], ct, args.pd)) if len(edFiles) > 0: runShellCommand("cat %s %s %s" % (edFiles[i], ct, args.ed)) if len(bicFiles) > 0: runShellCommand("cat %s %s %s" % (bicFiles[i], ct, args.bic)) for i in itertools.chain(chromFiles, regionFiles, bedFiles, pdFiles, edFiles, bicFiles): runShellCommand("rm %s" % i)
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Find candidate TSDs (exact forward matches) flanking given " "BED intervals. Score is distance between TSD and bed interval.") parser.add_argument("fastaSequence", help="DNA sequence in FASTA format") parser.add_argument("inBed", help="BED file with TEs whose flanking regions " "we wish to search") parser.add_argument("outBed", help="BED file containing (only) output TSDs") parser.add_argument("--min", help="Minimum length of a TSD", default=4, type=int) parser.add_argument("--max", help="Maximum length of a TSD", default=6, type=int) parser.add_argument("--all", help="Report all matches in region (as opposed" " to only the nearest to the BED element, which is the " "default behaviour)", action="store_true", default=False) parser.add_argument("--maxScore", help="Only report matches with given " "score or smaller. The score is defined as the " "maximum distance between the (two) TSD intervals and " "the query interval", default=None, type=int) parser.add_argument("--left", help="Number of bases immediately left of the " "BED element to search for the left TSD", default=7, type=int) parser.add_argument("--right", help="Number of bases immediately right of " "the BED element to search for the right TSD", default=7, type=int) parser.add_argument("--overlap", help="Number of bases overlapping the " "BED element to include in search (so total space " "on each side will be --left + --overlap, and --right + " "--overlap)", default=3, type=int) parser.add_argument("--leftName", help="Name of left TSDs in output Bed", default="L_TSD") parser.add_argument("--rightName", help="Name of right TSDs in output Bed", default="R_TSD") parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique" " matching ID", action="store_true", default=False) parser.add_argument("--names", help="Only apply to bed intervals whose " "name is in (comma-separated) list. If not specified" " then all intervals are processed", default=None) parser.add_argument("--numProc", help="Number of jobs to run in parallel" " (parallelization done on different sequences in the FASTA" " file)", type=int, default=1) parser.add_argument("--sequences", help="Only process given sequences of input" " FASTA file (comma-separated list).", default=None) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) assert os.path.exists(args.inBed) assert os.path.exists(args.fastaSequence) assert args.min <= args.max args.nextId = 0 if args.sequences is not None: args.sequences = set(args.sequences.split(",")) # read intervals from the bed file logger.info("loading target intervals from %s" % args.inBed) bedIntervals = readBedIntervals(args.inBed, ncol=4, sort=True) if bedIntervals is None or len(bedIntervals) < 1: raise RuntimeError("Could not read any intervals from %s" % args.inBed) if args.numProc > 1: runParallel(args, bedIntervals) return 0 tsds = findTsds(args, bedIntervals) writeBedIntervals(tsds, args.outBed)
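# Example invocation of the TSD finder above (the script file name and paths
# are assumptions; the options are from the parser above): look for exact
# 4-6bp forward matches within 7bp of each flank, keeping only pairs within
# 2 bases of the element:
#
#   ./findTsds.py genome.fa tes.bed tsds.bed --min 4 --max 6 --maxScore 2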
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Compare two bed files where Model states are represented" " in a column. Used to determine sensitivity and specificity. NOTE" " that both bed files must be sorted and cover the exact same regions" " of the same genome.") parser.add_argument("bed1", help="Bed file (TRUTH)") parser.add_argument("bed2", help="Bed file covering same regions in same" " order as bed1") parser.add_argument("--col", help="Column of bed files to use for state" " (currently only supports 4 (name) or 5 (score))", default=4, type=int) parser.add_argument("--thresh", help="Threshold to consider interval from" " bed1 covered by bed2.", type=float, default=0.8) parser.add_argument("--plot", help="Path of file to write Precision/Recall" " graphs to in PDF format", default=None) parser.add_argument("--ignore", help="Comma-separated list of stateNames to" " ignore", default=None) parser.add_argument("--strictPrec", help="By default, precision is computed" " in a manner strictly symmetric to recall. So calling" " compareBedStates.py A.bed B.bed would give the exact" " same output as compareBedStates.py B.bed A.bed except" " precision and recall values would be swapped. With " " this option, a predicted element only counts toward" " precision if it overlaps with 80pct of the true" " element, as opposed to only needing 80pct of itself" " overlapping with the true element. ", action="store_true", default=False) parser.add_argument("--noBase", help="Skip base-level stats (and only show" " interval stats). Runs faster", action="store_true", default=False) parser.add_argument("--noFrag", help="Do not allow fragmented matches in" " interval predictions. ie if a single truth interval" " is covered by a series of predicted intervals, only " "the best match will be counted if this flag is used", action="store_true", default=False) parser.add_argument("--tl", help="Path to tracks XML file. Used to cut " "out mask tracks so they are removed from comparison." " (convenience option to not have to manually run " "subtractBed every time...)", default=None) parser.add_argument("--delMask", help="Entirely remove intervals from " "mask tracks that are > given length. Probably " "only want to set to non-zero value K if using" " with a prediction that was processed with " "interpolateMaskedRegions.py --max K", type=int, default=0) parser.add_argument("--window", help="A comma-delimited 5-tuple of " "windowSize,stateName,compType,score,outBed. " "Where windowSize is the sliding window size " "(overlap .5), stateName is target stateName," " compType is in {base,interval,weighted}, score is" " in {f1,precision,recall} and " "outBed is the path of a bedFile to write positional" " accuracy to. For example, --window 1000000,TE,base,f1" ",acc.bed will write base-level f1 for 1MB sliding windows" " to acc.bed. These can be viewed on the browser by first" " converting to BigWig.", default=None) args = parser.parse_args() tempBedToolPath = initBedTool() if args.ignore is not None: args.ignore = set(args.ignore.split(",")) else: args.ignore = set() assert args.col == 4 or args.col == 5 print "Commandline %s" % " ".join(sys.argv) origArgs = copy.deepcopy(args) tempFiles = [] if args.tl is not None: cutBed1 = cutOutMaskIntervals(args.bed1, args.delMask, sys.maxint, args.tl) cutBed2 = cutOutMaskIntervals(args.bed2, args.delMask, sys.maxint, args.tl) if cutBed1 is not None: assert cutBed2 is not None tempFiles += [cutBed1, cutBed2] args.bed1 = cutBed1 args.bed2 = cutBed2 checkExactOverlap(args.bed1, args.bed2) if args.window is not None: runPositionalComparison(argv, origArgs) intervals1 = readBedIntervals(args.bed1, ncol=args.col) intervals2 = readBedIntervals(args.bed2, ncol=args.col) if args.noBase is False: stats = compareBaseLevel(intervals1, intervals2, args.col - 1)[0] totalRight, totalWrong, accMap = summarizeBaseComparision(stats, args.ignore) print "Base counts [False Negatives, False Positives, True Positives]:" print stats totalBoth = totalRight + totalWrong accuracy = float(totalRight) / float(totalBoth) print "Accuracy: %d / %d = %f" % (totalRight, totalBoth, accuracy) print "State-by-state (Precision, Recall):" print "Base-by-base Accuracy" print accMap trueStats = compareIntervalsOneSided(intervals1, intervals2, args.col - 1, args.thresh, False, not args.noFrag)[0] predStats = compareIntervalsOneSided(intervals2, intervals1, args.col - 1, args.thresh, args.strictPrec, not args.noFrag)[0] intAccMap = summarizeIntervalComparison(trueStats, predStats, False, args.ignore) intAccMapWeighted = summarizeIntervalComparison(trueStats, predStats, True, args.ignore) print "\nInterval Accuracy" print intAccMap print "" print "\nWeighted Interval Accuracy" print intAccMapWeighted print "" # print some row data to be picked up by scrapeBenchmarkRow.py if args.noBase is False: header, row = summaryRow(accuracy, stats, accMap) print " ".join(header) print " ".join(row) # make graph if args.plot is not None: if canPlot is False: raise RuntimeError("Unable to write plots. Maybe matplotlib is " "not installed?") writeAccPlots(accuracy, accMap, intAccMap, intAccMapWeighted, args.thresh, args.plot) if len(tempFiles) > 0: runShellCommand("rm -f %s" % " ".join(tempFiles)) cleanBedTool(tempBedToolPath)
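# Example invocation (paths hypothetical; the options are from the parser
# above): compare a prediction against a truth annotation, requiring 80%
# overlap for interval matches and writing precision/recall plots to a PDF:
#
#   compareBedStates.py truth.bed pred.bed --thresh 0.8 --plot acc.pdf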
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Create a teHMM") parser.add_argument("tracksInfo", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("trainingBed", help="Path of BED file containing" " genome regions to train model on. If --supervised " "is used, the names in this bed file will be treated " "as the true annotation (otherwise it is only used for " "interval coordinates)") parser.add_argument("outputModel", help="Path of output hmm") parser.add_argument("--numStates", help="Number of states in model", type=int, default=2) parser.add_argument("--iter", help="Number of EM iterations", type=int, default=100) parser.add_argument("--supervised", help="Use name (4th) column of " "<trainingBed> for the true hidden states of the" " model. Transition parameters will be estimated" " directly from this information rather than EM." " NOTE: The number of states will be determined " "from the bed.", action="store_true", default=False) parser.add_argument("--cfg", help="Use Context Free Grammar instead of " "HMM. Only works with --supervised for now", action="store_true", default=False) parser.add_argument("--saPrior", help="Confidence in self alignment " "track for CFG. Probability of pair emission " "is multiplied by this number if the bases are aligned" " and its complement if bases are not aligned. Must" " be between [0,1].", default=0.95, type=float) parser.add_argument("--pairStates", help="Comma-separated list of states" " (from trainingBed) that are treated as pair-emitters" " for the CFG", default=None) parser.add_argument("--emFac", help="Normalization factor for weighting" " emission probabilities because when there are " "many tracks, the transition probabilities can get " "totally lost. 0 = no normalization. 1 =" " divide by number of tracks. k = divide by number " "of tracks / k", type=int, default=0) parser.add_argument("--initTransProbs", help="Path of text file where each " "line has three entries: FromState ToState Probability" ". This file (all other transitions get probability 0)" " is used to specify the initial transition model." " The names and number of states will be initialized " "according to this file (overriding --numStates)", default=None) parser.add_argument("--fixTrans", help="Do not learn transition parameters" " (best used with --initTransProbs)", action="store_true", default=False) parser.add_argument("--initEmProbs", help="Path of text file where each " "line has four entries: State Track Symbol Probability" ". This file (all other emissions get probability 0)" " is used to specify the initial emission model. All " "states specified in this file must appear in the file" " specified with --initTransProbs (but not vice versa).", default=None) parser.add_argument("--fixEm", help="Do not learn emission parameters" " (best used with --initEmProbs)", action="store_true", default=False) parser.add_argument("--initStartProbs", help="Path of text file where each " "line has two entries: State Probability" ". This file (all other start probs get probability 0)" " is used to specify the initial start dist. All " "states specified in this file must appear in the file" " specified with --initTransProbs (but not vice versa).", default=None) parser.add_argument("--fixStart", help="Do not learn start parameters" " (best used with --initStartProbs)", action="store_true", default=False) parser.add_argument("--forceTransProbs", help="Path of text file where each " "line has three entries: FromState ToState Probability" ". These transition probabilities will override any " "learned probabilities after each training iteration" " (unspecified values will not be set to 0 in this case;" " the learned values will be kept, but normalized as " "needed)", default=None) parser.add_argument("--forceEmProbs", help="Path of text file where each " "line has four entries: State Track Symbol Probability" ". These emission probabilities will override any learned" " probabilities after each training iteration" " (unspecified values will not be set to 0 in this case;" " the learned values will be kept, but normalized as " "needed)", default=None) parser.add_argument("--flatEm", help="Use a flat emission distribution as " "a baseline. If not specified, the initial emission " "distribution will be randomized by default. Emission" " probabilities specified with --initEmProbs or " "--forceEmProbs will never be affected by randomization" ". The randomization is important for Baum Welch " "training, since if two states don't have at least one" " different emission or transition probability to begin" " with, they will never learn to be different.", action="store_true", default=False) parser.add_argument("--emRandRange", help="When randomly initializing an" " emission distribution, constrain" " the values to the given range (pair of " "comma-separated numbers). Overridden by " "--initEmProbs and --forceEmProbs when applicable." " Completely overridden by --flatEm (which is equivalent" " to --emRandRange .5,.5). Actual values used will" " always be normalized.", default="0.2,0.8") parser.add_argument("--segment", help="Bed file of segments to treat as " "single columns for HMM (ie as created with " "segmentTracks.py). IMPORTANT: this file must cover " "the same regions as the trainingBed file. Unless in " "supervised mode, probably best to use same bed file " "as both trainingBed and the --segment argument. Otherwise" " use intersectBed to make sure the overlap is exact", default=None) parser.add_argument("--segLen", help="Effective segment length used for" " normalizing input segments (specifying 0 means no" " normalization applied)", type=int, default=0) parser.add_argument("--seed", help="Seed for random number generator" " which will be used to initialize emissions " "(if --flatEM and --supervised not specified)", default=None, type=int) parser.add_argument("--reps", help="Number of replicates (with different" " random initializations) to run. The replicate" " with the highest likelihood will be chosen for the" " output", default=1, type=int) parser.add_argument("--numThreads", help="Number of threads to use when" " running replicates (see --rep) in parallel.", type=int, default=1) parser.add_argument("--emThresh", help="Threshold used for convergence" " in baum welch training. IE delta log likelihood" " must be bigger than this number (which should be" " positive) for convergence", type=float, default=0.001) parser.add_argument("--saveAllReps", help="Save all replicates (--reps)" " models to disk, instead of just the best one" ". Format is <outputModel>.repN. There will be" " --reps - 1 such models saved, as the best output" " counts as a replicate", action="store_true", default=False) parser.add_argument("--maxProb", help="Gaussian distributions and/or" " segment length corrections can cause probability" " to *decrease* during BW iteration. Use this option" " to remember the parameters with the highest probability" " rather than returning the parameters after the final " "iteration.", action="store_true", default=False) parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop" " training if a given number of iterations go by without" " hitting a new maxProb", default=None, type=int) parser.add_argument("--transMatEpsilons", help="By default, epsilons are" " added to all transition probabilities to prevent " "converging on 0 due to rounding error only for fully" " unsupervised training. Use this option to force this" " behaviour for supervised and semisupervised modes", action="store_true", default=False) addLoggingOptions(parser) args = parser.parse_args() if args.cfg is True: assert args.supervised is True assert args.saPrior >= 0. and args.saPrior <= 1. if args.pairStates is not None: assert args.cfg is True if args.initTransProbs is not None or args.fixTrans is True or\ args.initEmProbs is not None or args.fixEm is True: if args.cfg is True: raise RuntimeError("--initTransProbs, --fixTrans, --initEmProbs, " "--fixEm are not currently compatible with --cfg.") if args.fixTrans is True and args.supervised is True: raise RuntimeError("--fixTrans option not compatible with --supervised") if args.fixEm is True and args.supervised is True: raise RuntimeError("--fixEm option not compatible with --supervised") if (args.forceTransProbs is not None or args.forceEmProbs is not None) \ and args.cfg is True: raise RuntimeError("--forceTransProbs and --forceEmProbs are not " "currently compatible with --cfg") if args.flatEm is True and args.supervised is False and\ args.initEmProbs is None and args.initTransProbs is None: raise RuntimeError("--flatEm must be used with --initEmProbs and/or" " --initTransProbs") if args.initEmProbs is not None and args.initTransProbs is None: raise RuntimeError("--initEmProbs can only be used in conjunction with" " --initTransProbs") if args.emRandRange is not None: args.emRandRange = args.emRandRange.split(",") try: assert len(args.emRandRange) == 2 args.emRandRange = (float(args.emRandRange[0]), float(args.emRandRange[1])) except Exception: raise RuntimeError("Invalid --emRandRange specified") if args.transMatEpsilons is False: # old logic here. 
now overriden with above options args.transMatEpsilons = (args.supervised is False and args.initTransProbs is None and args.forceTransProbs is None) setLoggingFromOptions(args) tempBedToolPath = initBedTool() # read training intervals from the bed file logger.info("loading training intervals from %s" % args.trainingBed) mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4) if mergedIntervals is None or len(mergedIntervals) < 1: raise RuntimeError("Could not read any intervals from %s" % args.trainingBed) # read segment intervals segIntervals = None if args.segment is not None: logger.info("loading segment intervals from %s" % args.segment) try: checkExactOverlap(args.trainingBed, args.segment) except: raise RuntimeError("bed file passed with --segments option" " must exactly overlap trainingBed") segIntervals = readBedIntervals(args.segment, sort=True) elif args.segLen > 0: raise RuntimeError("--segLen can only be used with --segment") if args.segLen <= 0: args.segLen = None if args.segLen > 0 and args.segLen != 1: logger.warning("--segLen should be 0 (no correction) or 1 (base" " correction). Values > 1 may cause bias.") # read the tracks, while intersecting them with the training intervals logger.info("loading tracks %s" % args.tracksInfo) trackData = TrackData() trackData.loadTrackData(args.tracksInfo, mergedIntervals, segmentIntervals=segIntervals) catMap = None userTrans = None if args.supervised is False and args.initTransProbs is not None: logger.debug("initializing transition model with user data") catMap = stateNamesFromUserTrans(args.initTransProbs) # state number is overrided by the transProbs file args.numStates = len(catMap) truthIntervals = None # state number is overrided by the input bed file in supervised mode if args.supervised is True: logger.info("processing supervised state names") # we reload because we don't want to be merging them here truthIntervals = readBedIntervals(args.trainingBed, ncol=4) catMap = mapStateNames(truthIntervals) args.numStates = len(catMap) # train the model seeds = [random.randint(0, 4294967294)] if args.seed is not None: seeds = [args.seed] random.seed(args.seed) seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)] def trainClosure(randomSeed): return trainModel(randomSeed, trackData=trackData, catMap=catMap, userTrans=userTrans, truthIntervals=truthIntervals, args=args) modelList = runParallelShellCommands(argList=seeds, numProc = args.numThreads, execFunction = trainClosure, useThreads = True) # select best model logmsg = "" bestModel = (-1, LOGZERO) for i in xrange(len(modelList)): curModel = (i, modelList[i].getLastLogProb()) if curModel[1] > bestModel[1]: bestModel = curModel if curModel[1] is not None: logmsg += "Rep %i: TotalProb: %f\n" % curModel if len(modelList) > 1: logging.info("Training Replicates Statistics:\n%s" % logmsg) logging.info("Selecting best replicate (%d, %f)" % bestModel) model = modelList[bestModel[0]] # write the model to a pickle logger.info("saving trained model to %s" % args.outputModel) saveModel(args.outputModel, model) # write all replicates writtenCount = 0 if args.saveAllReps is True: for i, repModel in enumerate(modelList): if i != bestModel[0]: repPath = "%s.rep%d" % (args.outputModel, writtenCount) logger.info("saving replicate model to %s" % repPath) saveModel(repPath, repModel) writtenCount += 1 cleanBedTool(tempBedToolPath)
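# A minimal entry-point guard so the module can be run as a script; this is
# a sketch following the usual convention for a main(argv) function like the
# one above, since the original text ends at cleanBedTool(). The script and
# file names in the example invocations below are hypothetical placeholders
# for a real tracks info file, training bed, and output path:
#
#   python teHmmTrain.py tracks.xml train.bed out.mod --supervised
#   python teHmmTrain.py tracks.xml train.bed out.mod --numStates 4 \
#       --reps 4 --numThreads 4
#
if __name__ == "__main__":
    sys.exit(main())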