# Each main() below comes from a separate command-line script in this package;
# the standard-library imports they share are collected here:
import argparse
import copy
import itertools
import logging
import os
import random
import sys
import unittest

import numpy as np
from pybedtools import BedTool

# Helpers such as TrackList, TrackData, Track, runShellCommand,
# getLocalTempPath, initBedTool, cleanBedTool, addLoggingOptions,
# setLoggingFromOptions, getLogLevelString and logger are provided by the
# package's own modules (not shown in this section).


def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "the (MODE, BINNED) average value of the intersection region in "
        "another track. Can be used, for instance, to assign a copy number "
        "to each RepeatModeler prediction...")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name", help="Set ID field (column 4 instead of 5)",
                        action="store_true", default=False)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read the tracks list
    trackList = TrackList(args.tracksInfo)
    track = trackList.getTrackByName(args.track)
    if track is None:
        raise RuntimeError("Can't find track %s" % args.track)
    # make a temporary tracks list with just our track, so we can keep using
    # the tracks list interface without reading unnecessary data
    singleListPath = getLocalTempPath("Temp_secScore", ".bed")
    trackList.trackList = [track]
    trackList.saveXML(singleListPath)

    obFile = open(args.outBed, "w")

    # the trackData interface is not great at cherry-picking intervals:
    # merge them up and use the segmentation interface instead
    filledIntervals, mergedIntervals = fillGaps(args.inBed)

    # read the track into trackData
    trackData = TrackData()
    logger.info("loading track %s" % singleListPath)
    trackData.loadTrackData(singleListPath, mergedIntervals,
                            segmentIntervals=filledIntervals,
                            applyMasking=False)

    # finally, write the annotation
    writeAnnotatedIntervals(trackData, filledIntervals, mergedIntervals,
                            obFile, args)

    runShellCommand("rm -f %s" % singleListPath)
    obFile.close()
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml file using some simple heuristics.")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("chromSizes", help="2-column chrom sizes file as"
                        " needed by bedGraphToBigWig")
    parser.add_argument("queryBed", help="Region(s) to apply scaling to")
    parser.add_argument("outputDir", help="Output directory")
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all tracks with a scaling"
                        " attribute are processed", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)

    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")

    mergedIntervals = getMergedBedIntervals(args.queryBed)

    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals)
    trackList = trackData.getTrackList()

    for track in trackList:
        if track.getName() not in skipNames and\
           (track.getName() in trackNames or len(trackNames) == 0):
            if track.getScale() is not None or\
               track.getLogScale() is not None or\
               track.getShift() is not None or\
               track.getDelta() is True:
                logger.info("Writing scaled track %s" % track.getName())
                writeScaledTrack(trackData, track, args)

    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Run unit tests")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    suite = allSuites()
    runner = unittest.TextTestRunner()
    i = runner.run(suite)
    return len(i.failures) + len(i.errors)
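# Sketch only: allSuites() used above is defined elsewhere in the package and
# is not shown in this section. An equivalent aggregator (an assumption, using
# only standard unittest API) could look like:
def _exampleAllSuites():
    # discover every test module under the current directory
    return unittest.defaultTestLoader.discover(".", pattern="*Tests.py")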
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml file using some simple heuristics.")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outputTracks", help="Path to write modified tracks"
                        " XML to.")
    parser.add_argument("--numBins", help="Maximum number of bins after"
                        " scaling", default=10, type=int)
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian"
                        " distribution will be processed.", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)
    parser.add_argument("--noLog", help="Never use log scaling",
                        action="store_true", default=False)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")

    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)
    allIntervals = getMergedBedIntervals(args.allBed)

    for track in trackList:
        trackExt = os.path.splitext(track.getPath())[1]
        isFasta = len(trackExt) >= 3 and trackExt[:3].lower() == ".fa"
        if track.getName() not in skipNames and\
           (track.getName() in trackNames or len(trackNames) == 0) and\
           (track.getDist() == "multinomial" or
            track.getDist() == "sparse_multinomial" or
            track.getDist() == "gaussian") and\
           not isFasta:
            try:
                setTrackScale(track, args.numBins, allIntervals, args.noLog)
            except ValueError as e:
                logger.warning("Skipping (non-numeric?) track %s due to: %s" % (
                    track.getName(), str(e)))

    trackList.saveXML(args.outputTracks)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with the state corresponding to surrounding"
        " intervals.")
    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks)")
    parser.add_argument("allBed", help="Target scope. Masked intervals"
                        " outside of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file. State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED. Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with the"
                        " same value if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that are flanked on"
                        " both sides by the same state, where this state is in"
                        " this comma-separated list. --default used for other"
                        " gaps. If no targets are specified then all states"
                        " are checked.", default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that are"
                        " flanked on at least one side by a state in this"
                        " comma-separated list. --default used for other"
                        " gaps", default=None)
    parser.add_argument("--onlyDefault", help="Apply the default state"
                        " (--default) to all masked gaps no matter what, ie"
                        " ignoring all other logic", action="store_true",
                        default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from"
                        " the input. By default, the input is expected to come"
                        " from the HMM with mask intervals already absent, and"
                        " will crash with an assertion error if an overlap is"
                        " detected.", action="store_true", default=False)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make target sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk "
                        "'{print $1\"\\t\"$2\"\\t\"$3}' >> %s" % (
                            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)
    resolvedMasks = 0

    if len(inputIntervals) == 0:
        logger.warning("No mask tracks located in %s" % args.tracksXML)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and\
               leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and\
               rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (
            maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    tempOutMaskFile.close()
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (
        args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1,
                                               tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s | sortBed > %s" % (args.allBed,
                                                       tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (
        tempMergePath2, tempScopePath, args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask,
                                           tempMergePath1, tempMergePath2,
                                           tempScopePath]))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Add a TSD track (or modify an existing one) based on a "
        "given track")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("tsdTrackDir", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks"
                        " XML to.")
    parser.add_argument("inputTrack", help="Name of track to create TSDs from")
    parser.add_argument("fastaTrack", help="Name of track for fasta sequence")
    parser.add_argument("outputTrack", help="Name of tsd track to add. Will"
                        " overwrite if it already exists (or append with"
                        " --append option)")
    parser.add_argument("--append", help="Add onto existing TSD track if it"
                        " exists", default=False, action="store_true")
    parser.add_argument("--inPath", help="Use given file instead of inputTrack"
                        " path to generate TSD", default=None)

    ############ TSDFINDER OPTIONS ##############
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--all", help="Report all matches in region (as"
                        " opposed to only the nearest to the BED element,"
                        " which is the default behaviour)",
                        action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                        "score or smaller. The score is defined as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval", default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of"
                        " the BED element to search for the left TSD",
                        default=None, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=None, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + --overlap, and "
                        "--right + --overlap)", default=None, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output Bed",
                        default=None)
    parser.add_argument("--rightName", help="Name of right TSDs in output Bed",
                        default=None)
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a"
                        " unique matching ID", action="store_true",
                        default=False)
    parser.add_argument("--names", help="Only apply to bed intervals whose "
                        "name is in (comma-separated) list. If not specified"
                        " then all intervals are processed", default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel"
                        " (parallelization done on different sequences in"
                        " FASTA file)", type=int, default=1)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # copy out all options for call to tsd finder
    args.tsdFinderOptions = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.tsdFinderOptions += " --logFile %s" % args.logFile
    for option in ["min", "max", "all", "maxScore", "left", "right",
                   "overlap", "leftName", "rightName", "id", "names",
                   "numProc"]:
        val = getattr(args, option)
        if val is True:
            args.tsdFinderOptions += " --%s" % option
        elif val is not None and val is not False:
            args.tsdFinderOptions += " --%s %s" % (option, val)

    try:
        os.makedirs(args.tsdTrackDir)
    except:
        pass
    if not os.path.isdir(args.tsdTrackDir):
        raise RuntimeError("Unable to find or create tsdTrack dir %s" %
                           args.tsdTrackDir)

    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)
    inputTrack = trackList.getTrackByName(args.inputTrack)
    if inputTrack is None:
        raise RuntimeError("Track %s not found" % args.inputTrack)
    if args.inPath is not None:
        assert os.path.isfile(args.inPath)
        inputTrack.setPath(args.inPath)
    inTrackExt = os.path.splitext(inputTrack.getPath())[1].lower()
    if inTrackExt != ".bb" and inTrackExt != ".bed":
        raise RuntimeError("Track %s has non-bed extension %s" % (
            args.inputTrack, inTrackExt))

    fastaTrack = trackList.getTrackByName(args.fastaTrack)
    if fastaTrack is None:
        raise RuntimeError("Fasta Track %s not found" % args.fastaTrack)
    faTrackExt = os.path.splitext(fastaTrack.getPath())[1].lower()
    if faTrackExt[:3] != ".fa":
        raise RuntimeError("Fasta Track %s has non-fasta extension %s" % (
            args.fastaTrack, faTrackExt))

    tsdTrack = outTrackList.getTrackByName(args.outputTrack)
    if tsdTrack is None:
        if args.append is True:
            raise RuntimeError("TSD track %s not found. Cannot append" % (
                args.outputTrack))
        tsdTrack = Track()
        tsdTrack.name = args.outputTrack
        tsdTrack.path = os.path.join(args.tsdTrackDir, args.inputTrack + "_" +
                                     args.outputTrack + ".bed")

    runTsdFinder(fastaTrack.getPath(), inputTrack.getPath(),
                 tsdTrack.getPath(), args)

    if outTrackList.getTrackByName(tsdTrack.getName()) is None:
        outTrackList.addTrack(tsdTrack)
    outTrackList.saveXML(args.outTracksInfo)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Filter overlapping intervals out")
    parser.add_argument("inputBed", help="Bed file to filter")
    parser.add_argument("--bed12", help="Use bed12 exons instead of start/end"
                        " if present (equivalent to running bed12ToBed6 on"
                        " input first).", action="store_true", default=False)
    parser.add_argument("--rm", help="Make sure intervals that are labeled as"
                        " TE by the rm2State.sh script are never cut by ones"
                        " that are not", default=False, action="store_true")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    assert os.path.isfile(args.inputBed)
    tempBedToolPath = initBedTool()

    # do the --rm filter by splitting into TE / non-TE, removing everything
    # in non-TE that overlaps TE, then adding the remainder back to TE
    inputPath = args.inputBed
    if args.rm is True:
        tempPath = getLocalTempPath("Temp_", ".bed")
        tePath = getLocalTempPath("Temp_te_", ".bed")
        runShellCommand("rm2State.sh %s | grep TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.inputBed, tempPath, tePath))
        otherPath = getLocalTempPath("Temp_other_", ".bed")
        runShellCommand("rm2State.sh %s | grep -v TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.inputBed, tempPath, otherPath))
        if os.path.getsize(tePath) > 0 and\
           os.path.getsize(otherPath) > 0:
            filterPath = getLocalTempPath("Temp_filter_", ".bed")
            runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
                otherPath, tePath, filterPath))
            inputPath = getLocalTempPath("Temp_input_", ".bed")
            runShellCommand("cat %s %s | sortBed > %s" % (
                tePath, filterPath, inputPath))
            runShellCommand("rm -f %s" % filterPath)
        runShellCommand("rm -f %s %s %s" % (tePath, otherPath, tempPath))

    bedIntervals = BedTool(inputPath).sort()
    if args.bed12 is True:
        bedIntervals = bedIntervals.bed6()

    prevInterval = None
    # this code has been way too buggy for something so simple:
    # keep an extra list to double-check, even though it's a waste of
    # time and space
    sanity = []
    for interval in bedIntervals:
        if (prevInterval is not None and
            interval.chrom == prevInterval.chrom and
            interval.start < prevInterval.end):
            logger.debug("Replace %d bases of \n%s with\n%s" % (
                prevInterval.end - interval.start,
                str(interval), str(prevInterval)))
            interval.start = prevInterval.end

        if interval.end > interval.start:
            sys.stdout.write("%s" % str(interval))
            sanity.append(interval)
            prevInterval = interval

    for i in xrange(len(sanity) - 1):
        if sanity[i].chrom == sanity[i + 1].chrom:
            assert sanity[i + 1].start >= sanity[i].end

    cleanBedTool(tempBedToolPath)
    if args.inputBed != inputPath:
        runShellCommand("rm -f %s" % inputPath)
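# Minimal illustration (not part of the original script) of the trimming rule
# used above, on plain (chrom, start, end) tuples sorted by chrom then start:
def _trimOverlaps(intervals):
    out = []
    prev = None
    for chrom, start, end in intervals:
        if prev is not None and chrom == prev[0] and start < prev[2]:
            start = prev[2]  # clip the start back to the previous end
        if end > start:      # drop intervals emptied by clipping
            out.append((chrom, start, end))
            prev = (chrom, start, end)
    return out

# e.g. _trimOverlaps([("chr1", 0, 10), ("chr1", 5, 20)])
#      -> [("chr1", 0, 10), ("chr1", 10, 20)]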
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fix up track names and sort alphabetically. Easier to do"
        " here on the xml than at the end for the paper.")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks"
                        " XML")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    # map from old track names to clean display names
    nm = dict()
    nm["hollister"] = "RM-RepBase-Hollister"
    nm["chaux"] = "RM-RepBase-deLaChaux"
    nm["repeat_modeler"] = "RM-RepeatModeler"
    nm["repbase"] = "RM-RepBase"
    nm["repet"] = "REPET"
    nm["ltr_finder"] = "LTR_FINDER"
    nm["ltr_harvest"] = "LTR_Harvest"
    nm["ltr_termini"] = "lastz-Termini"
    nm["lastz-Termini"] = "lastz-LTRTermini"
    nm["tir_termini"] = "lastz-InvTermini"
    nm["irf"] = "IRF"
    nm["palindrome"] = "lastz-Palindrome"
    nm["overlap"] = "lastz-Overlap"
    nm["mitehunter"] = "MITE-Hunter"
    nm["helitronscanner"] = "HelitronScanner"
    nm["cov_80-"] = "lastz-SelfLowId"
    nm["cov_80-90"] = "lastz-SelfMedId"
    nm["cov_90+"] = "lastz-SelfHighId"
    nm["left_peak_80-"] = "lastz-SelfPeakLeftLow"
    nm["lastz-SelfLowLeftPeak"] = nm["left_peak_80-"]
    nm["left_peak_80-90"] = "lastz-SelfPeakLeftMed"
    nm["lastz-SelfMedLeftPeak"] = nm["left_peak_80-90"]
    nm["left_peak_90+"] = "lastz-SelfPeakLeftHigh"
    nm["lastz-SelfHighLeftPeak"] = nm["left_peak_90+"]
    nm["right_peak_80-"] = "lastz-SelfPeakRightLow"
    nm["lastz-SelfLowRightPeak"] = nm["right_peak_80-"]
    nm["right_peak_80-90"] = "lastz-SelfPeakRightMed"
    nm["lastz-SelfMedRightPeak"] = nm["right_peak_80-90"]
    nm["right_peak_90+"] = "lastz-SelfPeakRightHigh"
    nm["lastz-SelfHighRightPeak"] = nm["right_peak_90+"]
    nm["cov_maxPId"] = "lastz-SelfPctMaxId"
    nm["lastz-SelfMaxPctId"] = nm["cov_maxPId"]
    nm["te_domains"] = "TE-Domains"
    nm["fgenesh"] = "Genes"
    nm["genes"] = nm["fgenesh"]
    nm["refseq"] = nm["fgenesh"]
    nm["mrna"] = "mRNA"
    nm["srna"] = "sRNA"
    nm["ortho_depth"] = "Alignment-Depth"
    nm["orthology"] = nm["ortho_depth"]
    nm["chain_depth"] = nm["ortho_depth"]
    nm["alignment_depth"] = nm["ortho_depth"]
    nm["gcpct"] = "GC"
    nm["trf"] = "TRF"
    nm["windowmasker"] = "WindowMasker"
    nm["polyN"] = "Ns"
    nm["phastcons_ce"] = "Conservation"
    nm["phastcons"] = nm["phastcons_ce"]
    nm["PhastCons"] = nm["phastcons_ce"]
    nm["phyloP"] = nm["phastcons_ce"]
    nm["phylop"] = nm["phastcons_ce"]

    # tracks to remove outright
    rtracks = dict()
    rtracks["tantan"] = True
    rtracks["polyA"] = True
    rtracks["transposon_psi"] = True
    rtracks["transposonpsi"] = True
    rtracks["repbase_censor"] = True
    rtracks["tsd"] = True
    rtracks["repbase_default"] = True
    rtracks["dustmasker"] = True

    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []
    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name not in rtracks:
            if track.name in nm:
                track.name = nm[track.name]
            else:
                logger.warning("Did not map track %s" % track.name)
            outList.append(track)
        else:
            logger.warning("Deleted track %s" % track.name)

    # sort the list; the string prefixes force a custom grouping order
    def sortComp(x):
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist() == "mask":
            return "zzzz" + lname
        else:
            return lname

    outList = sorted(outList, key=sortComp)
    for track in outList:
        outTracks.addTrack(track)
    outTracks.saveXML(args.outTracksInfo)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train model on. If --supervised "
                        "is used, the names in this bed file will be treated "
                        "as the true annotation (otherwise it is only used "
                        "for interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type=int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type=int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<trainingBed> for the true hidden states of the"
                        " model. Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.", action="store_true", default=False)
    parser.add_argument("--cfg", help="Use Context Free Grammar instead of "
                        "HMM. Only works with --supervised for now",
                        action="store_true", default=False)
    parser.add_argument("--saPrior", help="Confidence in self alignment "
                        "track for CFG. Probability of pair emission "
                        "is multiplied by this number if the bases are"
                        " aligned and by its complement if the bases are not"
                        " aligned. Must be in [0,1].", default=0.95,
                        type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as pair-emitters"
                        " for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities, because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks. k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability. This file (all other transitions get"
                        " probability 0) is used to specify the initial"
                        " transition model. The names and number of states"
                        " will be initialized according to this file"
                        " (overriding --numStates)", default=None)
    parser.add_argument("--fixTrans", help="Do not learn transition"
                        " parameters (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol"
                        " Probability. This file (all other emissions get"
                        " probability 0) is used to specify the initial"
                        " emission model. All states specified in this file"
                        " must appear in the file specified with"
                        " --initTransProbs (but not vice versa).",
                        default=None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where"
                        " each line has two entries: State Probability."
                        " This file (all other start probs get probability 0)"
                        " is used to specify the initial start distribution."
                        " All states specified in this file must appear in the"
                        " file specified with --initTransProbs (but not vice"
                        " versa).", default=None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability. These transition probabilities will"
                        " override any learned probabilities after each"
                        " training iteration (unspecified entries will not be"
                        " set to 0 in this case: the learned values will be"
                        " kept, but normalized as needed)", default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol"
                        " Probability. These emission probabilities will"
                        " override any learned probabilities after each"
                        " training iteration (unspecified entries will not be"
                        " set to 0 in this case: the learned values will be"
                        " kept, but normalized as needed.)", default=None)
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline. If not specified, the initial emission "
                        "distribution will be randomized by default. Emission"
                        " probabilities specified with --initEmProbs or "
                        "--forceEmProbs will never be affected by"
                        " randomization. The randomization is important for"
                        " Baum-Welch training, since if two states don't have"
                        " at least one different emission or transition"
                        " probability to begin with, they will never learn to"
                        " be different.", action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initializing an"
                        " emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers). Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is"
                        " equivalent to --emRandRange .5,.5). Actual values"
                        " used will always be normalized.", default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as "
                        "single columns for HMM (ie as created with "
                        "segmentTracks.py). IMPORTANT: this file must cover "
                        "the same regions as the trainingBed file. Unless in "
                        "supervised mode, it is probably best to use the same"
                        " bed file as both trainingBed and --segment argument."
                        " Otherwise use intersectBed to make sure the overlap"
                        " is exact", default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for random number generator,"
                        " which will be used to initialize emissions "
                        "(if --flatEm and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                        " random initializations) to run. The replicate"
                        " with the highest likelihood will be chosen for the"
                        " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --reps) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in Baum-Welch training, ie delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicate (--reps)"
                        " models to disk, instead of just the best one."
                        " Format is <outputModel>.repN. There will be"
                        " --reps - 1 such models saved, as the best output"
                        " counts as a replicate", action="store_true",
                        default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration. Use this option"
                        " to remember the parameters with the highest"
                        " probability rather than returning the parameters"
                        " after the final iteration.", action="store_true",
                        default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to"
                        " stop training if a given number of iterations go by"
                        " without hitting a new maxProb", default=None,
                        type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training. Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)
    addLoggingOptions(parser)
    args = parser.parse_args()

    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    if args.initTransProbs is not None or args.fixTrans is True or\
       args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--initTransProbs, --fixTrans, --initEmProbs,"
                               " --fixEm are not currently compatible with"
                               " --cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with"
                           " --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
       and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
       args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and/or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction"
                           " with --initTransProbs")
    if args.emRandRange is not None:
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        except:
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here, now overridden with the above options
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except:
            raise RuntimeError("bed file passed with --segment option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    if args.segLen <= 0:
        args.segLen = None
    if args.segLen > 0 and args.segLen != 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction). Values > 1 may cause bias.")

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overridden by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overridden by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)]

    def trainClosure(randomSeed):
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans, truthIntervals=truthIntervals,
                          args=args)

    modelList = runParallelShellCommands(argList=seeds,
                                         numProc=args.numThreads,
                                         execFunction=trainClosure,
                                         useThreads=True)

    # select best model
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] > bestModel[1]:
            bestModel = curModel
        if curModel[1] is not None:
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]

    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
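# Example --initTransProbs file (state names are hypothetical). Each line is
# "FromState ToState Probability"; any transition not listed starts at 0:
#
#   Outside Outside 0.99
#   Outside LTR     0.01
#   LTR     LTR     0.95
#   LTR     Outside 0.05
#
# An --initEmProbs file follows the same idea with four columns per line:
# "State Track Symbol Probability".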
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Given two bed files: a prediction and a true (or target)"
        " annotation, re-label the prediction's state names so that they"
        " best match the true annotation. Uses the same logic as"
        " compareBedStates.py for determining accuracy")
    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label.")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col", help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4, type=int)
    parser.add_argument("--intThresh", help="Threshold to consider interval"
                        " from tgtBed covered by predBed. If not specified,"
                        " then base level statistics will be used. Value in"
                        " range (0,1]", type=float, default=None)
    parser.add_argument("--noFrag", help="Don't allow fragmented interval"
                        " matches (see help for --frag in"
                        " compareBedStates.py). Only relevant with"
                        " --intThresh", action="store_true", default=False)
    parser.add_argument("--qualThresh", help="Minimum match ratio between"
                        " truth and prediction to relabel prediction. For"
                        " example, if predicted state X overlaps target state"
                        " LTR 25 pct of the time, then qualThresh must be at"
                        " least 0.25 to label X as LTR in the output. Value"
                        " in range (0, 1]", type=float, default=0.1)
    parser.add_argument("--ignore", help="Comma-separated list of stateNames"
                        " to ignore (in prediction)", default=None)
    parser.add_argument("--ignoreTgt", help="Comma-separated list of"
                        " stateNames to ignore (in target)", default=None)
    parser.add_argument("--tgt", help="Comma-separated list of stateNames to"
                        " consider (in target). All others will be ignored",
                        default=None)
    parser.add_argument("--unique", help="If more than one predicted state"
                        " maps to the same target state, add a unique id"
                        " (numeric suffix) to the output so that they can be"
                        " distinguished", action="store_true", default=False)
    parser.add_argument("--model", help="Apply state name mapping to the"
                        " model in the specified path (it is strongly advised"
                        " to make a backup of the model first)", default=None)
    parser.add_argument("--noMerge", help="By default, adjacent intervals"
                        " with the same state name in the output are "
                        "automatically merged into a single interval. This"
                        " flag disables this.", action="store_true",
                        default=False)
    parser.add_argument("--hm", help="Write confusion matrix as heatmap in"
                        " PDF format to specified file", default=None)
    parser.add_argument("--old", help="Use old name mapping logic which just "
                        "takes biggest overlap in forward confusion matrix. "
                        "Faster than the new default logic which does the"
                        " greedy f1 optimization", action="store_true",
                        default=False)
    parser.add_argument("--fdr", help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float, default=None)
    parser.add_argument("--tl", help="Path to tracks XML file. Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed every time...)", default=None)
    parser.add_argument("--colOrder", help="List of states used to force"
                        " ordering in heatmap (otherwise alphabetical)"
                        " columns. These states will correspond to the tgtBed"
                        " when --old is used and to predBed otherwise.",
                        default=None)
    parser.add_argument("--hmCovRow", help="Path to write 1-row heatmap of "
                        "state coverage (fraction of bases). Only works with"
                        " --hm", default=None)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")
    assert args.col == 4 or args.col == 5

    tempFiles = []
    if args.tl is not None:
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint,
                                         args.tl)
        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol=args.col)
    intervals2 = readBedIntervals(args.predBed, ncol=args.col)
    cfName = "reverse"

    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1,
                                           args.col - 1, args.intThresh,
                                           False, not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]
    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt,
                                             args.ignoreTgt, args.ignore,
                                             args.qualThresh, args.fdr)

    # filter the stateMap to take into account the command-line options,
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))

    # write the model if specified
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)

    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col - 1,
                   args.noMerge, args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap. Maybe matplotlib is "
                               "not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))

    cleanBedTool(tempBedToolPath)
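# Sketch of the --old mapping rule described above (biggest overlap in the
# forward confusion matrix). It assumes confMat is a dict of dicts mapping
# predicted state -> target state -> overlap count, which is an illustration
# rather than the package's actual data structure:
def _simpleStateMap(confMat):
    stateMap = dict()
    for predState, row in confMat.items():
        # relabel each predicted state as its most-overlapped target state
        stateMap[predState] = max(row.items(), key=lambda kv: kv[1])[0]
    return stateMap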
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml. Runs cleanRM.py, cleanLtrFinder.py, "
        "cleanTermini.py, addTsdTrack.py and setTrackScaling.py (also runs "
        "removeBedOverlaps.py before each of the clean scripts)")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath", help="Directory to write cleaned"
                        " BED tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks"
                        " XML to.")
    parser.add_argument("--numBins", help="Maximum number of bins after"
                        " scaling", default=10, type=int)
    parser.add_argument("--scaleTracks", help="Comma-separated list of tracks"
                        " to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.", default=None)
    parser.add_argument("--skipScale", help="Comma-separated list of tracks"
                        " to skip for scaling.", default=None)
    parser.add_argument("--ltr_termini", help="Name of termini track (apply"
                        " tsd)", default="ltr_termini")
    parser.add_argument("--repeat_modeler", help="Name of repeat_modeler"
                        " track (apply tsd)", default="repeat_modeler")
    parser.add_argument("--sequence", help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument("--tsd", help="Name of tsd track to generate (apply"
                        " cleanTermini.py)", default="tsd")
    parser.add_argument("--tir", help="Name of tir_termini track (apply"
                        " cleanTermini.py)", default="tir_termini")
    parser.add_argument("--noScale", help="Don't do any scaling",
                        default=False, action="store_true")
    parser.add_argument("--noTsd", help="Don't generate TSD track. NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of) chaux",
                        default=False, action="store_true")
    parser.add_argument("--numProc", help="Number of processes to use for"
                        " tsdFinder.py", default=1, type=int)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    try:
        os.makedirs(args.cleanTrackPath)
    except:
        pass
    if not os.path.isdir(args.cleanTrackPath):
        raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                           args.cleanTrackPath)

    tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", ".xml")
    runCleaning(args, tempTracksInfo)
    assert os.path.isfile(tempTracksInfo)

    runTsd(args, tempTracksInfo)

    runScaling(args, tempTracksInfo)

    runShellCommand("rm -f %s" % tempTracksInfo)
    cleanBedTool(tempBedToolPath)
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Evaluate a given data set with a trained HMM. Display" " the log probability of the input data given the model, and " "optionally output the most likely sequence of hidden states.") parser.add_argument("tracksInfo", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("inputModel", help="Path of hmm created with" "teHmmTrain.py") parser.add_argument("bedRegions", help="Intervals to process") parser.add_argument("--bed", help="path of file to write viterbi " "output to (most likely sequence of hidden states)", default=None) parser.add_argument("--numThreads", help="Number of threads to use (only" " applies to CFG parser for the moment)", type=int, default=1) parser.add_argument("--slice", help="Make sure that regions are sliced" " to a maximum length of the given value. Most " "useful when model is a CFG to keep memory down. " "When 0, no slicing is done", type=int, default=0) parser.add_argument("--segment", help="Use the intervals in bedRegions" " as segments which each count as a single column" " for evaluattion. Note the model should have been" " trained with the --segment option pointing to this" " same bed file.", action="store_true", default=False) parser.add_argument("--segLen", help="Effective segment length used for" " normalizing input segments (specifying 0 means no" " normalization applied)", type=int, default=0) parser.add_argument("--maxPost", help="Use maximum posterior decoding instead" " of Viterbi for evaluation", action="store_true", default=False) parser.add_argument("--pd", help="Output BED file for posterior distribution. Must" " be used in conjunction with --pdStates (View on the " "browser via bedGraphToBigWig)", default=None) parser.add_argument("--pdStates", help="comma-separated list of state names to use" " for computing posterior distribution. For example: " " --pdStates inside,LTR_left,LTR_right will compute the probability" ", for each observation, that the hidden state is inside OR LTR_left" " OR LTR_right. Must be used with --pd to specify output " "file.", default=None) parser.add_argument("--bic", help="save Bayesian Information Criterion (BIC) score" " in given file", default=None) parser.add_argument("--ed", help="Output BED file for emission distribution. Must" " be used in conjunction with --edStates (View on the " "browser via bedGraphToBigWig)", default=None) parser.add_argument("--edStates", help="comma-separated list of state names to use" " for computing emission distribution. For example: " " --edStates inside,LTR_left for each obsercation the probability " " that inside emitted that observaiton plus the probabillity that" " LTR_left emitted it. If more than one state is selected, this " " is not a distribution, but a sum of distributions (and values" " can exceed 1). Mostly for debugging purposes. Note output in LOG", default=None) parser.add_argument("--chroms", help="list of chromosomes, or regions, to run in parallel" " (in BED format). 
input regions will be intersected with each line" " in this file, and the result will correspsond to an individual job", default=None) parser.add_argument("--proc", help="number of processes (use in conjunction with --chroms)", type=int, default=1) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() if args.slice <= 0: args.slice = sys.maxint elif args.segment is True: raise RuntimeError("--slice and --segment options are not compatible at " "this time") if (args.pd is not None) ^ (args.pdStates is not None): raise RuntimeError("--pd requires --pdStates and vice versa") if (args.ed is not None) ^ (args.edStates is not None): raise RuntimeError("--ed requires --edStates and vice versa") if args.bed is None and (args.pd is not None or args.ed is not None): raise RuntimeError("Both --ed and --pd only usable in conjunction with" " --bed") if args.chroms is not None: # hack to allow chroms argument to chunk and rerun parallelDispatch(argv, args) cleanBedTool(tempBedToolPath) return 0 # load model created with teHmmTrain.py logger.info("loading model %s" % args.inputModel) model = loadModel(args.inputModel) if isinstance(model, MultitrackCfg): if args.maxPost is True: raise RuntimeErorr("--post not supported on CFG models") # apply the effective segment length if args.segLen > 0: assert args.segment is True model.getEmissionModel().effectiveSegmentLength = args.segLen # read intervals from the bed file logger.info("loading target intervals from %s" % args.bedRegions) mergedIntervals = getMergedBedIntervals(args.bedRegions, ncol=4) if mergedIntervals is None or len(mergedIntervals) < 1: raise RuntimeError("Could not read any intervals from %s" % args.bedRegions) # slice if desired choppedIntervals = [x for x in slicedIntervals(mergedIntervals, args.slice)] # read segment intervals segIntervals = None if args.segment is True: logger.info("loading segment intervals from %s" % args.bedRegions) segIntervals = readBedIntervals(args.bedRegions, sort=True) # load the input # read the tracks, while intersecting them with the given interval trackData = TrackData() # note we pass in the trackList that was saved as part of the model # because we do not want to generate a new one. logger.info("loading tracks %s" % args.tracksInfo) trackData.loadTrackData(args.tracksInfo, choppedIntervals, model.getTrackList(), segmentIntervals=segIntervals) # do the viterbi algorithm if isinstance(model, MultitrackHmm): algname = "viterbi" if args.maxPost is True: algname = "posterior decoding" logger.info("running %s algorithm" % algname) elif isinstance(model, MultitrackCfg): logger.info("running CYK algorithm") vitOutFile = None if args.bed is not None: vitOutFile = open(args.bed, "w") totalScore = 0 tableIndex = 0 totalDatapoints = 0 # Note: in general there's room to save on memory by only computing single # track table at once (just need to add table by table interface to hmm...) 
posteriors = [None] * trackData.getNumTrackTables() posteriorsFile = None posteriorsMask = None if args.pd is not None: posteriors = model.posteriorDistribution(trackData) posteriorsFile = open(args.pd, "w") posteriorsMask = getPosteriorsMask(args.pdStates, model) assert len(posteriors[0][0]) == len(posteriorsMask) emProbs = [None] * trackData.getNumTrackTables() emissionsFile = None emissionsMask = None if args.ed is not None: emProbs = model.emissionDistribution(trackData) emissionsFile = open(args.ed, "w") emissionsMask = getPosteriorsMask(args.edStates, model) assert len(emProbs[0][0]) == len(emissionsMask) decodeFunction = model.viterbi if args.maxPost is True: decodeFunction = model.posteriorDecode for i, (vitLogProb, vitStates) in enumerate(decodeFunction(trackData, numThreads=args.numThreads)): totalScore += vitLogProb if args.bed is not None or args.pd is not None: if args.bed is not None: vitOutFile.write("#Viterbi Score: %f\n" % (vitLogProb)) trackTable = trackData.getTrackTableList()[tableIndex] tableIndex += 1 statesToBed(trackTable, vitStates, vitOutFile, posteriors[i], posteriorsMask, posteriorsFile, emProbs[i], emissionsMask, emissionsFile) totalDatapoints += len(vitStates) * trackTable.getNumTracks() print "Viterbi (log) score: %f" % totalScore if isinstance(model, MultitrackHmm) and model.current_iteration is not None: print "Number of EM iterations: %d" % model.current_iteration if args.bed is not None: vitOutFile.close() if posteriorsFile is not None: posteriorsFile.close() if emissionsFile is not None: emissionsFile.close() if args.bic is not None: bicFile = open(args.bic, "w") # http://en.wikipedia.org/wiki/Bayesian_information_criterion lnL = float(totalScore) try: k = float(model.getNumFreeParameters()) except: # numFreeParameters still not done for semi-supervised # just pass through a 0 instead of crashing for now k = 0.0 n = float(totalDatapoints) bic = -2.0 * lnL + k * (np.log(n) + np.log(2 * np.pi)) bicFile.write("%f\n" % bic) bicFile.write("# = -2.0 * lnL + k * (lnN + ln(2 * np.pi))\n" "# where lnL=%f k=%d (%d states) N=%d (%d obs * %d tracks) lnN=%f\n" % ( lnL, int(k), model.getEmissionModel().getNumStates(), int(totalDatapoints), totalDatapoints / model.getEmissionModel().getNumTracks(), model.getEmissionModel().getNumTracks(), np.log(n))) bicFile.close() cleanBedTool(tempBedToolPath)
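# Illustrative sketch (not part of teHmmEval.py itself): the BIC computation
# above, isolated as a standalone function. Note this variant adds ln(2*pi)
# per free parameter on top of the usual k*ln(n) penalty, matching the
# formula written to the --bic file.
import numpy as np

def computeBic(lnL, numFreeParameters, numDatapoints):
    # lower BIC indicates a better trade-off between fit and model size
    k = float(numFreeParameters)
    n = float(numDatapoints)
    return -2.0 * lnL + k * (np.log(n) + np.log(2.0 * np.pi))

# e.g. computeBic(totalScore, model.getNumFreeParameters(), totalDatapoints)
# reproduces the value written to the --bic file above.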
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Create starting transition and emission distributions " "from a candidate BED annotation, which can" " be used with teHmmTrain.py using the --initTransProbs and " "--initEmProbs options, respectively. The distributions created here" " are extremely simple, but this can be a good shortcut to at least " "getting the state names into the init files, which can be further " "tweaked by hand.") parser.add_argument("tracksInfo", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("trackName", help="Name of Track to use as initial" " annotation") parser.add_argument("queryBed", help="Bed file with regions to query") parser.add_argument("outTransProbs", help="File to write transition model" " to") parser.add_argument("outEmProbs", help="File to write emission model to") parser.add_argument("--numOut", help="Number of \"outside\" states to add" " to the model.", default=1, type=int) parser.add_argument("--numTot", help="Add enough \"outside\" states such " "that the total number of states is this. (overrides --numOut)", default=0, type=int) parser.add_argument("--outName", help="Name of outside states (will have" " numeric suffix if more than 1)", default="Outside") parser.add_argument("--mode", help="Strategy for initializing the " "transition graph: {\'star\': all states are connected" " to the outside state(s) but not each other; " " \'data\': transitions estimated from input bed; " " \'full\': don't write edges and let teHmmTrain.py " "initialize as a clique}", default="star") parser.add_argument("--selfTran", help="This script will always write all" " the self-transition probabilities to the output file. 
" "They will all be set to the specified value using this" " option, or estimated from the data if -1", default=-1., type=float) parser.add_argument("--em", help="Emission probability for input track (" "i.e. probability that state emits itself)", type=float, default=0.95) parser.add_argument("--outEmNone", help="Add None emission probabilities" " for target track for Outside states", action="store_true", default=None) addLoggingOptions(parser) args = parser.parse_args() if args.mode == "star" and args.numOut < 1: raise RuntimeError("--numOut must be at least 1 if --mode star is used") if args.mode != "star" and args.mode != "data" and args.mode != "full": raise RuntimeError("--mode must be one of {star, data, full}") if args.mode == "data": raise RuntimeError("--mode data not implemented yet") assert os.path.isfile(args.tracksInfo) setLoggingFromOptions(args) tempBedToolPath = initBedTool() # Read the tracks info trackList = TrackList(args.tracksInfo) # Extract the track we want track = trackList.getTrackByName(args.trackName) if track is None: raise RuntimeError("Track %s not found in tracksInfo" % args.trackName) trackPath = track.getPath() if track.getDist() != "multinomial" and track.getDist() != "gaussian": raise RuntimeError("Track %s does not have multinomial or " "gaussian distribution" % args.trackName) if track.getScale() is not None or track.getLogScale() is not None: raise RuntimeError("Track %s must not have scale" % args.trackName) # read query intervals from the bed file logger.info("loading query intervals from %s" % args.queryBed) mergedIntervals = getMergedBedIntervals(args.queryBed, ncol=4) if mergedIntervals is None or len(mergedIntervals) < 1: raise RuntimeError("Could not read any intervals from %s" % args.queryBed) # read the track, while intersecting with query intervals # (track is saved as a temp XML file for the sake of not changing the interface) bedIntervals = [] for queryInterval in mergedIntervals: bedIntervals += readBedIntervals(trackPath, ncol = track.getValCol() + 1, chrom=queryInterval[0], start=queryInterval[1], end=queryInterval[2]) # 1st pass to collect set of names nameMap = CategoryMap(reserved = 0) for interval in bedIntervals: nameMap.update(interval[track.getValCol()]) outNameMap = CategoryMap(reserved = 0) if args.numTot > 0: args.numOut = max(0, args.numTot - len(nameMap)) for i in xrange(args.numOut): outName = args.outName if args.numOut > 1: outName += str(i) assert nameMap.has(outName) is False outNameMap.update(outName) # write the transition model for use with teHmmTrain.py --initTransProbs writeTransitions(bedIntervals, nameMap, outNameMap, args) # write the emission model for use with teHmmTrain.py --initEmProbs writeEmissions(bedIntervals, nameMap, outNameMap, args) cleanBedTool(tempBedToolPath)
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Make some tables of statistics from a BED file. All" " output will be written in one big CSV table to be viewed in a " "spreadsheet.") parser.add_argument("inBed", help="Input bed file") parser.add_argument("outCsv", help="Path to write output in CSV format") parser.add_argument("--ignore", help="Comma-separated list of names" " to ignore", default="") parser.add_argument("--numBins", help="Number of (linear) bins for " "histograms", type=int, default=10) parser.add_argument("--logHist", help="Apply log-transform to data for " "histogram", action="store_true", default=False) parser.add_argument("--histRange", help="Histogram range as comma-" "separated pair of numbers", default=None) parser.add_argument("--noHist", help="skip histograms", action="store_true", default=False) parser.add_argument("--noScore", help="Just do length stats", action="store_true", default=False) parser.add_argument("--noLen", help="Just do score stats", action="store_true", default=False) parser.add_argument("--nearness", help="Compute nearness stats (instead " "of normal stats) of input bed with given BED. Output" " will be a BED instead of CSV, with nearness in the " "score position", default=None) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() if args.histRange is not None: args.histRange = args.histRange.split(",") assert len(args.histRange) == 2 args.histRange = int(args.histRange[0]), int(args.histRange[1]) outFile = open(args.outCsv, "w") args.ignoreSet = set(args.ignore.split(",")) intervals = readBedIntervals(args.inBed, ncol=5, sort=args.nearness is not None) csvStats = "" # nearness stats if args.nearness is not None: args.noScore = True csvStats = makeNearnessBED(intervals, args) # length stats elif args.noLen is False: csvStats = makeCSV(intervals, args, lambda x: int(x[2]) - int(x[1]), "Length") # score stats try: if args.noScore is False: csvStats += "\n" + makeCSV(intervals, args, lambda x: float(x[4]), "Score") csvStats += "\n" + makeCSV( intervals, args, lambda x: float(x[4]) * (float(x[2]) - float(x[1])), "Score*Length") except Exception as e: logger.warning("Couldn't make score stats because %s" % str(e)) outFile.write(csvStats) outFile.write("\n") outFile.close() cleanBedTool(tempBedToolPath)
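# Illustrative example of the three per-interval quantities the makeCSV()
# calls above extract from a 5-column BED row (values made up):
interval = ("chr1", "1000", "1250", "te_1", "3.0")
length = int(interval[2]) - int(interval[1])                          # 250
score = float(interval[4])                                            # 3.0
scoreTimesLength = score * (float(interval[2]) - float(interval[1]))  # 750.0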
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Given two bed files: a prediction and a true (or target)" " annotation, re-label the prediction's state names so that they" " best match the true annotation. Uses the same logic as" " compareBedStates.py for determining accuracy") parser.add_argument("tgtBed", help="Target bed file") parser.add_argument("predBed", help="Predicted bed file to re-label. ") parser.add_argument("outBed", help="Output bed (relabeling of predBed)") parser.add_argument("--col", help="Column of bed files to use for state" " (currently only supports 4 (name) or 5 (score))", default = 4, type = int) parser.add_argument("--intThresh", help="Threshold to consider interval from" " tgtBed covered by predBed. If not specified, then base" " level statistics will be used. Value in range (0,1]", type=float, default=None) parser.add_argument("--noFrag", help="Don't allow fragmented interval matches (" "see help for --frag in compareBedStates.py). Only" " relevant with --intThresh", action="store_true", default=False) parser.add_argument("--qualThresh", help="Minimum match ratio between truth" " and prediction to relabel prediction. For example, if" " predicted state X overlaps target state LTR 25 pct of " "the time, then qualThresh must be at least 0.25 to " "label X as LTR in the output. Value in range (0, 1]", type=float, default=0.1) parser.add_argument("--ignore", help="Comma-separated list of stateNames to" " ignore (in prediction)", default=None) parser.add_argument("--ignoreTgt", help="Comma-separated list of stateNames to" " ignore (in target)", default=None) parser.add_argument("--tgt", help="Comma-separated list of stateNames to " " consider (in target). All others will be ignored", default=None) parser.add_argument("--unique", help="If more than one predicted state maps" " to the same target state, add a unique id (numeric " "suffix) to the output so that they can be distinguished", action="store_true", default=False) parser.add_argument("--model", help="Apply state name mapping to the model" " in the specified path (it is strongly advised to" " make a backup of the model first)", default=None) parser.add_argument("--noMerge", help="By default, adjacent intervals" " with the same state name in the output are " "automatically merged into a single interval. This" " flag disables this.", action="store_true", default=False) parser.add_argument("--hm", help="Write confusion matrix as heatmap in PDF" " format to specified file", default = None) parser.add_argument("--old", help="Use old name mapping logic which just " "takes biggest overlap in forward confusion matrix. " "Faster than the new default logic, which does the greedy" " f1 optimization", action="store_true", default=False) parser.add_argument("--fdr", help="Use FDR cutoff instead of (default)" " greedy F1 optimization for state labeling", type=float, default=None) parser.add_argument("--tl", help="Path to tracks XML file. Used to cut " "out mask tracks so they are removed from comparison." " (convenience option to not have to manually run " "subtractBed every time...)", default=None) parser.add_argument("--colOrder", help="List of states used to force" " ordering of heatmap columns (otherwise alphabetical)." 
" These states will correspond to the tgtBed when --old" " is used and the predBed otherwise.", default=None) parser.add_argument("--hmCovRow", help="Path to write 1-row heatmap of " "state coverage (fraction of bases). only works with --hm", default=None) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() if args.ignore is not None: args.ignore = set(args.ignore.split(",")) else: args.ignore = set() if args.ignoreTgt is not None: args.ignoreTgt = set(args.ignoreTgt.split(",")) else: args.ignoreTgt = set() if args.tgt is not None: args.tgt = set(args.tgt.split(",")) if args.old is True: raise RuntimeError("--tgt option not implemented for --old") else: args.tgt = set() if args.old is True and args.fdr is not None: raise RuntimeError("--old and --fdr options are exclusive") assert args.col == 4 or args.col == 5 tempFiles = [] if args.tl is not None: cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl) cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint, args.tl) if cutBedTgt is not None: assert cutBedPred is not None tempFiles += [cutBedTgt, cutBedPred] args.tgtBed = cutBedTgt args.predBed = cutBedPred checkExactOverlap(args.tgtBed, args.predBed) intervals1 = readBedIntervals(args.tgtBed, ncol = args.col) intervals2 = readBedIntervals(args.predBed, ncol = args.col) cfName = "reverse" if args.old is True: intervals1, intervals2 = intervals2, intervals1 cfName = "forward" # generate confusion matrix based on accuracy comparison using # base or interval stats as desired if args.intThresh is not None: logger.info("Computing interval %s confusion matrix" % cfName) confMat = compareIntervalsOneSided(intervals2, intervals1, args.col - 1, args.intThresh, False, not args.noFrag)[1] else: logger.info("Computing base %s confusion matrix" % cfName) confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1] logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat))) # find the best "true" match for each predicted state if args.old is True: intervals1, intervals2 = intervals2, intervals1 stateMap = getStateMapFromConfMatrix_simple(confMat) else: stateMap = getStateMapFromConfMatrix(confMat, args.tgt, args.ignoreTgt, args.ignore, args.qualThresh, args.fdr) # filter the stateMap to take into account the command-line options # notably --ignore, --ignoreTgt, --qualThresh, and --unique filterStateMap(stateMap, args) logger.info("State Map:\n%s", str(stateMap)) # write the model if specified if args.model is not None: applyNamesToModel(stateMap, args.model) # generate the output bed using the state map writeFittedBed(intervals2, stateMap, args.outBed, args.col - 1, args.noMerge, args.ignoreTgt) # write the confusion matrix as heatmap if args.hm is not None: if canPlot is False: raise RuntimeError("Unable to write heatmap. Maybe matplotlib is " "not installed?") writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow) if len(tempFiles) > 0: runShellCommand("rm -f %s" % " ".join(tempFiles)) cleanBedTool(tempBedToolPath)
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Train, evaluate, then compare hmm model on input") parser.add_argument("trainingTracksInfo", help="Path of Tracks Info file " "containing paths to genome annotation tracks used " "for training") parser.add_argument("outputDir", help="directory to write output") parser.add_argument("inBeds", nargs="*", help="list of training beds") parser.add_argument("--evalTracksInfo", help="Path of Tracks Info file " "containing paths to genome annotation tracks used" " for evaluation (only needed if different from" " trainingTracksInfo)", default=None) parser.add_argument("--numProc", help="Max number of processors to use", type=int, default=1) parser.add_argument("--allTrackCombinations", help="Rerun with all" " possible combinations of tracks from the input" " tracksInfo file. Note that this number gets big" " pretty fast.", action = "store_true", default= False) parser.add_argument("--emStates", help="By default the supervised mode" " of teHmmTrain is activated. This option overrides" " that and uses the EM mode and the given number of " "states instead", type=int, default=None) parser.add_argument("--cross", help="Do 50/50 cross validation by training" " on first half input and validating on second", action="store_true", default=False) parser.add_argument("--emFac", help="Normalization factor for weighting" " emission probabilities because when there are " "many tracks, the transition probabilities can get " "totally lost. 0 = no normalization. 1 =" " divide by number of tracks. k = divide by number " "of tracks / k", type=int, default=0) parser.add_argument("--mod", help="Path to trained model. This will " "bypass the training phase that would normally be done" " and just skip to the evaluation. Note that the user" " must make sure that the trained model has the " "states required to process the input data", default = None) parser.add_argument("--iter", help="Number of EM iterations. Needs to be" " used in conjunction with --emStates to specify EM" " training", type = int, default=None) parser.add_argument("--initTransProbs", help="Path of text file where each " "line has three entries: FromState ToState Probability" ". This file (all other transitions get probability 0)" " is used to specify the initial transition model." " The names and number of states will be initialized " "according to this file (overriding --numStates)", default = None) parser.add_argument("--fixTrans", help="Do not learn transition parameters" " (best used with --initTransProbs)", action="store_true", default=False) parser.add_argument("--initEmProbs", help="Path of text file where each " "line has four entries: State Track Symbol Probability" ". This file (all other emissions get probability 0)" " is used to specify the initial emission model. All " "states specified in this file must appear in the file" " specified with --initTransProbs (but not vice versa).", default = None) parser.add_argument("--fixEm", help="Do not learn emission parameters" " (best used with --initEmProbs)", action="store_true", default=False) parser.add_argument("--initStartProbs", help="Path of text file where each " "line has two entries: State Probability" ". This file (all other start probs get probability 0)" " is used to specify the initial start dist. 
All " "states specified in this file must appear in the file" " specified with --initTransProbs (but not vice versa).", default = None) parser.add_argument("--fixStart", help="Do not learn start parameters" " (best used with --initStartProbs)", action="store_true", default=False) parser.add_argument("--forceTransProbs", help="Path of text file where each " "line has three entries: FromState ToState Probability" ". These transition probabilities will override any " " learned probabilities after training (unspecified " "will not be set to 0 in this case. the learned values" " will be kept, but normalized as needed" , default=None) parser.add_argument("--forceEmProbs", help="Path of text file where each " "line has four entries: State Track Symbol Probability" ". These " "emission probabilities will override any learned" " probabilities after training (unspecified " "will not be set to 0 in this case. the learned values" " will be kept, but normalized as needed." , default = None) parser.add_argument("--flatEm", help="Use a flat emission distribution as " "a baseline. If not specified, the initial emission " "distribution will be randomized by default. Emission" " probabilities specified with --initEmpProbs or " "--forceEmProbs will never be affected by randomizaiton" ". The randomization is important for Baum Welch " "training, since if two states dont have at least one" " different emission or transition probability to begin" " with, they will never learn to be different.", action="store_true", default=False) parser.add_argument("--emRandRange", help="When randomly initialzing a" " multinomial emission distribution, constrain" " the values to the given range (pair of " "comma-separated numbers). Overridden by " "--initEmProbs and --forceEmProbs when applicable." " Completely overridden by --flatEm (which is equivalent" " to --emRandRange .5,.5.). Actual values used will" " always be normalized.", default=None) parser.add_argument("--mandTracks", help="Mandatory track names for use " "with --allTrackCombinations in comma-separated list", default=None) parser.add_argument("--combinationRange", help="in form MIN,MAX: Only " "explore track combination in given (closed) range. " "A more refined version of --allTrackCombinations.", default=None) parser.add_argument("--supervised", help="Use name (4th) column of " "<traingingBed> for the true hidden states of the" " model. Transition parameters will be estimated" " directly from this information rather than EM." " NOTE: The number of states will be determined " "from the bed.", action = "store_true", default = False) parser.add_argument("--segment", help="Input bed files are also used to " "segment data. Ie teHmmTrain is called with --segment" " set to the input file. Not currently working with " " --supervised", action = "store_true", default=False) parser.add_argument("--segLen", help="Effective segment length used for" " normalizing input segments (specifying 0 means no" " normalization applied) in training", type=int, default=None) parser.add_argument("--truth", help="Use specifed file instead of " "input file(s) for truth comparison. Makes sense" " when --segment is specified and only one input" " bed specified", default = None) parser.add_argument("--eval", help="Bed file used for evaluation. It should" " cover same region in same order as --truth. 
Option " "exists mostly to specify segmentation of --truth", default=None) parser.add_argument("--seed", help="Seed for random number generator" " which will be used to initialize emissions " "(if --flatEM and --supervised not specified)", default=None, type=int) parser.add_argument("--reps", help="Number of training replicates (with " " different" " random initializations) to run. The replicate" " with the highest likelihood will be chosen for the" " output", default=None, type=int) parser.add_argument("--numThreads", help="Number of threads to use when" " running training replicates (see --rep) in parallel.", type=int, default=None) parser.add_argument("--emThresh", help="Threshold used for convergence" " in baum welch training. IE delta log likelihood" " must be bigger than this number (which should be" " positive) for convergence", type=float, default=None) parser.add_argument("--fit", help="Run fitStateNames.py to automap names" " before running comparison", action="store_true", default=False) parser.add_argument("--fitOpts", help="Options to pass to fitStateNames.py" " (only effective if used with --fit)", default=None) parser.add_argument("--saveAllReps", help="Save all replicates (--reps)" " models to disk, instead of just the best one" ". Format is <outputModel>.repN. There will be " " --reps -1 such models saved as the best output" " counts as a replicate. Comparison statistics" " will be generated for each rep.", action="store_true", default=False) parser.add_argument("--maxProb", help="Gaussian distributions and/or" " segment length corrections can cause probability" " to *decrease* during BW iteration. Use this option" " to remember the parameters with the highest probability" " rather than returning the parameters after the final " "iteration.", action="store_true", default=False) parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop" " training if a given number of iterations go by without" " hitting a new maxProb", default=None, type=int) parser.add_argument("--transMatEpsilons", help="By default, epsilons are" " added to all transition probabilities to prevent " "converging on 0 due to rounding error only for fully" " unsupervised training. 
Use this option to force this" " behaviour for supervised and semisupervised modes", action="store_true", default=False) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) logOps = "--logLevel %s" % getLogLevelString() if args.logFile is not None: logOps += " --logFile %s" % args.logFile if not os.path.exists(args.outputDir): os.makedirs(args.outputDir) if args.evalTracksInfo is None: args.evalTracksInfo = args.trainingTracksInfo trainingTrackList = TrackList(args.trainingTracksInfo) evalTrackList = TrackList(args.evalTracksInfo) checkTrackListCompatible(trainingTrackList, evalTrackList) sizeRange = (len(trainingTrackList), len(trainingTrackList) + 1) if args.allTrackCombinations is True: sizeRange = (1, len(trainingTrackList) + 1) if args.combinationRange is not None: toks = args.combinationRange.split(",") sizeRange = int(toks[0]),int(toks[1]) + 1 logger.debug("manual range (%d, %d) " % sizeRange) mandTracks = set() if args.mandTracks is not None: mandTracks = set(args.mandTracks.split(",")) logger.debug("mandatory set %s" % str(mandTracks)) trainFlags = "" if args.emStates is not None: trainFlags += " --numStates %d" % args.emStates if args.supervised is True: trainFlags += " --supervised" if args.segment is True: raise RuntimeError("--supervised not currently compatible with " "--segment") trainFlags += " --emFac %d" % args.emFac if args.forceEmProbs is not None: trainFlags += " --forceEmProbs %s" % args.forceEmProbs if args.iter is not None: assert args.emStates is not None or args.initTransProbs is not None trainFlags += " --iter %d" % args.iter if args.initTransProbs is not None: trainFlags += " --initTransProbs %s" % args.initTransProbs if args.initEmProbs is not None: trainFlags += " --initEmProbs %s" % args.initEmProbs if args.fixEm is True: trainFlags += " --fixEm" if args.initStartProbs is not None: trainFlags += " --initStartProbs %s" % args.initStartProbs if args.fixStart is True: trainFlags += " --fixStart" if args.forceTransProbs is not None: trainFlags += " --forceTransProbs %s" % args.forceTransProbs if args.forceEmProbs is not None: trainFlags += " --forceEmProbs %s" % args.forceEmProbs if args.flatEm is True: trainFlags += " --flatEm" if args.emRandRange is not None: trainFlags += " --emRandRange %s" % args.emRandRange if args.segLen is not None: trainFlags += " --segLen %d" % args.segLen if args.seed is not None: trainFlags += " --seed %d" % args.seed if args.reps is not None: trainFlags += " --reps %d" % args.reps if args.numThreads is not None: trainFlags += " --numThreads %d" % args.numThreads if args.emThresh is not None: trainFlags += " --emThresh %f" % args.emThresh if args.saveAllReps is True: trainFlags += " --saveAllReps" if args.maxProb is True: trainFlags += " --maxProb" if args.transMatEpsilons is True: trainFlags += " --transMatEpsilons" if args.maxProbCut is not None: trainFlags += " --maxProbCut %d" % args.maxProbCut # write out command line for posteriorty's sake if not os.path.exists(args.outputDir): os.makedirs(args.outputDir) cmdPath = os.path.join(args.outputDir, "teHmmBenchmark_cmd.txt") cmdFile = open(cmdPath, "w") cmdFile.write(" ".join(argv) + "\n") cmdFile.close() #todo: try to get timing for each command commands = [] rows = dict() for pn, pList in enumerate(subsetTrackList(trainingTrackList, sizeRange, mandTracks)): if len(pList) == len(trainingTrackList): outDir = args.outputDir else: outDir = os.path.join(args.outputDir, "perm%d" % pn) if not os.path.exists(outDir): os.makedirs(outDir) trainingTrackPath = 
os.path.join(outDir, "training_tracks.xml") evalTrackPath = os.path.join(outDir, "eval_tracks.xml") for maskTrack in trainingTrackList.getMaskTracks(): pList.addTrack(copy.deepcopy(maskTrack)) pList.saveXML(trainingTrackPath) epList = TrackList() for track in pList: t = copy.deepcopy(evalTrackList.getTrackByName(track.getName())) epList.addTrack(t) for maskTrack in trainingTrackList.getMaskTracks(): epList.addTrack(copy.deepcopy(maskTrack)) epList.saveXML(evalTrackPath) for inBed in args.inBeds: base = os.path.basename(inBed) truthBed = inBed testBed = inBed if args.cross is True: truthBed = os.path.join(outDir, os.path.splitext(base)[0] + "_truth_temp.bed") testBed = os.path.join(outDir, os.path.splitext(base)[0] + "_test_temp.bed") splitBed(inBed, truthBed, testBed) # train if args.mod is not None: modPath = args.mod command = "ls %s" % modPath else: modPath = os.path.join(outDir, os.path.splitext(base)[0] + ".mod") command = "teHmmTrain.py %s %s %s %s %s" % (trainingTrackPath, truthBed, modPath, logOps, trainFlags) if args.segment is True: command += " --segment %s" % truthBed # view viewPath = os.path.join(outDir, os.path.splitext(base)[0] + "_view.txt") command += " && teHmmView.py %s > %s" % (modPath, viewPath) # evaluate numReps = 1 if args.reps is not None and args.saveAllReps is True: numReps = args.reps assert numReps > 0 missed = 0 # little hack to repeat evaluation for each training replicate for repNum in xrange(-1, numReps-1): if repNum == -1: repSuffix = "" else: repSuffix = ".rep%d" % repNum evalBed = os.path.join(outDir, os.path.splitext(base)[0] + "_eval.bed" + repSuffix) hmmEvalInputBed = testBed if args.eval is not None: hmmEvalInputBed = args.eval bicPath = os.path.join(outDir, os.path.splitext(base)[0] + "_bic.txt" + repSuffix) command += " && teHmmEval.py %s %s %s --bed %s %s --bic %s" % ( evalTrackPath, modPath + repSuffix, hmmEvalInputBed, evalBed, logOps, bicPath) zin = True if args.segment is True: command += " --segment" # fit compTruth = testBed if args.truth is not None: compTruth = args.truth compareInputBed = evalBed if args.fit is True: fitBed = os.path.join(outDir, os.path.splitext(base)[0] + "_eval_fit.bed" + repSuffix) command += " && fitStateNames.py %s %s %s --tl %s" % (compTruth, evalBed, fitBed, evalTrackPath) if args.fitOpts is not None: command += " " + args.fitOpts compareInputBed = fitBed # compare compPath = os.path.join(outDir, os.path.splitext(base)[0] + "_comp.txt" + repSuffix) command += " && compareBedStates.py %s %s --tl %s > %s" % ( compTruth, compareInputBed, evalTrackPath, compPath) # make table row if repSuffix == "": rowPath = os.path.join(outDir, os.path.splitext(base)[0] + "_row.txt") if inBed in rows: rows[inBed].append(rowPath) else: rows[inBed] = [rowPath] command += " && scrapeBenchmarkRow.py %s %s %s %s %s" % ( args.trainingTracksInfo, trainingTrackPath, evalBed, compPath, rowPath) # remember command inCmdPath = os.path.join(outDir, os.path.splitext(base)[0] + "_cmd.txt") inCmdFile = open(inCmdPath, "w") inCmdFile.write(command + "\n") inCmdFile.close() commands.append(command) runParallelShellCommands(commands, args.numProc) writeTables(args.outputDir, rows)
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Find candidate TSDs (exact forward matches) flanking given " "BED intervals. Score is distance between TSD and bed interval.") parser.add_argument("fastaSequence", help="DNA sequence in FASTA format") parser.add_argument("inBed", help="BED file with TEs whose flanking regions " "we wish to search") parser.add_argument("outBed", help="BED file containing (only) output TSDs") parser.add_argument("--min", help="Minimum length of a TSD", default=4, type=int) parser.add_argument("--max", help="Maximum length of a TSD", default=6, type=int) parser.add_argument("--all", help="Report all matches in region (as opposed" " to only the nearest to the BED element, which is the " "default behaviour)", action="store_true", default=False) parser.add_argument("--maxScore", help="Only report matches with given " "score or smaller. The score is defined as the " "maximum distance between the (two) TSD intervals and " "the query interval", default=None, type=int) parser.add_argument("--left", help="Number of bases immediately left of the " "BED element to search for the left TSD", default=7, type=int) parser.add_argument("--right", help="Number of bases immediately right of " "the BED element to search for the right TSD", default=7, type=int) parser.add_argument("--overlap", help="Number of bases overlapping the " "BED element to include in search (so total space " "on each side will be --left + --overlap, and --right + " "--overlap)", default=3, type=int) parser.add_argument("--leftName", help="Name of left TSDs in output Bed", default="L_TSD") parser.add_argument("--rightName", help="Name of right TSDs in output Bed", default="R_TSD") parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique" " matching ID", action="store_true", default=False) parser.add_argument("--names", help="Only apply to bed intervals whose " "name is in (comma-separated) list. If not specified" " then all intervals are processed", default=None) parser.add_argument("--numProc", help="Number of jobs to run in parallel." " (parallelization done on different sequences in the FASTA" " file)", type=int, default=1) parser.add_argument("--sequences", help="Only process given sequences of input" " FASTA file (comma-separated list).", default=None) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) assert os.path.exists(args.inBed) assert os.path.exists(args.fastaSequence) assert args.min <= args.max args.nextId = 0 if args.sequences is not None: args.sequences = set(args.sequences.split(",")) # read intervals from the bed file logger.info("loading target intervals from %s" % args.inBed) bedIntervals = readBedIntervals(args.inBed, ncol=4, sort=True) if bedIntervals is None or len(bedIntervals) < 1: raise RuntimeError("Could not read any intervals from %s" % args.inBed) if args.numProc > 1: runParallel(args, bedIntervals) return 0 tsds = findTsds(args, bedIntervals) writeBedIntervals(tsds, args.outBed)
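# Illustrative sketch (not the findTsds() used above): locating exact forward
# matches between the left and right flanking windows, longest first, which
# is the core of the TSD search described in the help text.
def exactForwardMatches(leftFlank, rightFlank, minLen, maxLen):
    matches = []
    for size in xrange(maxLen, minLen - 1, -1):
        for i in xrange(len(leftFlank) - size + 1):
            probe = leftFlank[i:i + size]
            j = rightFlank.find(probe)
            if j >= 0:
                # (offset in left window, offset in right window, length)
                matches.append((i, j, size))
    return matches

# exactForwardMatches("ACGTAC", "TTACGT", 4, 6) -> [(0, 2, 4)]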
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Produce a bed file of genome segments which are atomic" " elements with respect to the hmm, i.e. each segment emits a single" " state. Mask tracks always cut. " "Output intervals are assigned name 0 1 0 1 etc.") parser.add_argument("tracksInfo", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("allBed", help="Bed file spanning entire genome") parser.add_argument("outBed", help="Output segments") parser.add_argument("--thresh", help="Number of tracks that can change " "before a new segment is formed. Increasing this value" " increases the expected lengths of output segments", type=int, default=1) parser.add_argument("--cutTracks", help="Create a new segment if something" " changes in one of these tracks (as specified by " "comma-separated list), overriding --thresh options" " if necessary. For example, --cutTracks tsd,chaux" " would invoke a new segment every time the value at" " either of these tracks changed", default=None) parser.add_argument("--cutUnscaled", help="Cut on all unscaled (used as " "a proxy for non-numeric) tracks", default=False, action="store_true") parser.add_argument("--cutMultinomial", help="Cut non-gaussian, non-binary" " tracks every time", default=False, action="store_true") parser.add_argument("--cutNonGaussian", help="Cut all but gaussian tracks", default=False, action="store_true") parser.add_argument("--comp", help="Strategy for comparing columns for the " "threshold cutoff. Options are [first, prev], where" " first compares with first column of segment and " "prev compares with column immediately left", default="first") parser.add_argument("--ignore", help="Comma-separated list of tracks to " "ignore (the FASTA DNA sequence would be a good " "candidate)", default="sequence") parser.add_argument("--maxLen", help="Maximum length of a segment (<= 0 means" " no max length applied)", type=int, default=0) parser.add_argument( "--fixLen", help="Just make segments of specified fixed " "length ignoring other parameters and logic (<= 0 means" " no fixed length applied)", type=int, default=0) parser.add_argument("--stats", help="Write some statistics to specified " "file. Of the form <trackName> <Diff> <DiffPct> " " where <Diff> is the number of times a track differs" " between two consecutive segments, and <DiffPct> " " is the average percentage of all such differences " "accounted for by the track", default=None) parser.add_argument( "--delMask", help="Entirely remove intervals from " "mask tracks that are > given length (otherwise " "they would just be ignored by HMM tools). The difference" " here is that removed intervals will break contiguity.", type=int, default=None) parser.add_argument( "--chroms", help="list of chromosomes, or regions, to run in parallel" " (in BED format). input regions will be intersected with each line" " in this file, and the result will correspond to an individual job", default=None) parser.add_argument( "--proc", help="number of processes (use in conjunction with --chroms)", type=int, default=1) parser.add_argument( "--co", help="count offset for segment labels. 
only used internally", type=int, default=0) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() if args.comp != "first" and args.comp != "prev": raise RuntimeError("--comp must be either first or prev") if args.chroms is not None: # hack to allow chroms argument to chunk and rerun parallelDispatch(argv, args) cleanBedTool(tempBedToolPath) return 0 # read query intervals from the bed file tempFiles = [] if args.delMask is not None: cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint, args.tracksInfo) if cutBed is not None: tempFiles.append(cutBed) args.allBed = cutBed logger.info("loading segment region intervals from %s" % args.allBed) mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4) if mergedIntervals is None or len(mergedIntervals) < 1: raise RuntimeError("Could not read any intervals from %s" % args.allBed) # read the tracks, while intersecting them with the query intervals logger.info("loading tracks %s" % args.tracksInfo) trackData = TrackData() trackData.loadTrackData(args.tracksInfo, mergedIntervals, treatMaskAsBinary=True) # process the --cutTracks option trackList = trackData.getTrackList() cutList = np.zeros((len(trackList)), np.int) if args.cutTracks is not None: cutNames = args.cutTracks.split(",") for name in cutNames: track = trackList.getTrackByName(name) if track is None: raise RuntimeError("cutTrack %s not found" % name) trackNo = track.getNumber() assert trackNo < len(cutList) cutList[trackNo] = 1 args.cutList = cutList # make sure mask tracks count as cut tracks for track in trackList: if track.getDist() == 'mask': args.cutList[track.getNumber()] = 1 # process the --ignore option ignoreList = np.zeros((len(trackList)), np.int) if args.ignore is not None: ignoreNames = args.ignore.split(",") for name in ignoreNames: track = trackList.getTrackByName(name) if track is None: if name is not "sequence": logger.warning("ignore track %s not found" % name) continue trackNo = track.getNumber() assert trackNo < len(ignoreList) ignoreList[trackNo] = 1 if args.cutList[trackNo] == 1: raise RuntimeError("Same track (%s) cant be cut and ignored" % name) args.ignoreList = ignoreList #process the --cutUnscaled option if args.cutUnscaled is True: for track in trackList: trackNo = track.getNumber() if track.scale is None and track.shift is None and\ track.logScale is None and\ args.ignoreList[trackNo] == 0: assert trackNo < len(cutList) cutList[trackNo] = 1 #process the --cutMultinomial option if args.cutMultinomial is True: for track in trackList: trackNo = track.getNumber() if track.dist == "multinomial" and\ args.ignoreList[trackNo] == 0: assert trackNo < len(cutList) cutList[trackNo] = 1 #process the --cutNonGaussian option if args.cutNonGaussian is True: for track in trackList: trackNo = track.getNumber() if track.dist != "gaussian" and\ args.ignoreList[trackNo] == 0: assert trackNo < len(cutList) cutList[trackNo] = 1 # segment the tracks stats = dict() segmentTracks(trackData, args, stats) writeStats(trackData, args, stats) if len(tempFiles) > 0: runShellCommand("rm -f %s" % " ".join(tempFiles)) cleanBedTool(tempBedToolPath)
def main(argv=None): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Combine a bunch of non-numeric BED tracks into" " single file using fitStateNames.py to try to keep names " "consistent. Idea is to be used as baseline to compare" " hmm to (via base-by-base statistics, primarily, since" " this procedure could induce some fragmentation)") parser.add_argument("tracksXML", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("regionBed", help="BED file representing " "target region (best if whole genome)") parser.add_argument("outBed", help="Output bed") parser.add_argument("--tracks", help="Comma-separated list of " "track names to use. All tracks will be" " used by default", default=None) parser.add_argument("--outside", help="Name to give non-annotated " "regions", default="Outside") parser.add_argument("--fitThresh", help="Min map percentage (0,1)" " in order to rename (see --qualThresh option " "of fitStateNames.py)", type=float, default=0.5) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() inputTrackList = TrackList(args.tracksXML) count = 0 # get regionBed where all intervals are merged when possible regionIntervals = getMergedBedIntervals(args.regionBed, sort=True) tempRegionPath = getLocalTempPath("Temp", "_reg.bed") tempRegionFile = open(tempRegionPath, "w") for interval in regionIntervals: tempRegionFile.write("\t".join([str(x) for x in interval]) + "\n") tempRegionFile.close() # accumulate tracks in temp file tempOutPath = getLocalTempPath("Temp", "_out.bed") for track in inputTrackList: if track.shift is not None or track.scale is not None or\ track.logScale is not None or track.dist == "gaussian" or\ os.path.splitext(track.getPath())[1].lower() != ".bed": logger.warning("Skipping numeric track %s" % track.getName()) elif args.tracks is None or track.getName() in args.tracks.split(","): combineTrack(track, tempOutPath, tempRegionPath, count, args) count += 1 # nothing got written, make everything outside if count == 0: tempOutFile = open(tempOutPath, "w") for interval in regionIntervals: tempOutFile.write("%s\t%s\t%s\t%s\n" % (interval[0], interval[1], interval[2], args.outside)) tempOutFile.close() runShellCommand("mv %s %s" % (tempOutPath, args.outBed)) runShellCommand("rm -f %s" % (tempRegionPath)) cleanBedTool(tempBedToolPath)
def main(argv=None): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Thin wrapper of teHmmTrain.py and teHmmEval.py " "to generate a table of Number-of-HMM-states VS BIC. Lower BIC" " is better") parser.add_argument("tracks", help="tracks xml used for training and eval") parser.add_argument("trainingBeds", help="comma-separated list of training regions" " (training region size will be a variable in output table). " "if segmentation is activated, these must also be the " "segmented beds...") parser.add_argument("evalBed", help="eval region") parser.add_argument("trainOpts", help="all teHmmTrain options in quotes") parser.add_argument("evalOpts", help="all teHmmEval options in quotes") parser.add_argument("states", help="comma-separated list of numbers of states" " to try") parser.add_argument("outDir", help="output directory") parser.add_argument("--reps", help="number of replicates", type = int, default=1) parser.add_argument("--proc", help="maximum number of processors to use" " in parallel", type = int, default = 1) parser.add_argument("--resume", help="try not to rewrite existing files", action="store_true", default=False) parser.add_argument("--initTrans", help="the states argument is overridden" " to specify a list of transition initialization files " "instead of state numbers", action="store_true", default=False) parser.add_argument("--numReps", help="the states argument is overridden" " to specify a list of replicate numbers (--reps)" " arguments", action="store_true", default=False) parser.add_argument("--numIter", help="the states argument is overridden" " to specify a list of iteration counts (--iter)" " arguments", action="store_true", default=False) addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1: raise RuntimeError("only one of {--initTrans, --numReps, --numIter} " "can be used at a time") if not os.path.isdir(args.outDir): runShellCommand("mkdir %s" % args.outDir) # get the sizes of the training beds trainingSizes = [] trainingBeds = [] for tb in args.trainingBeds.split(","): if len(tb) > 0: trainingBeds.append(tb) for bed in trainingBeds: assert os.path.isfile(bed) bedLen = 0 for interval in readBedIntervals(bed): bedLen += interval[2] - interval[1] trainingSizes.append(bedLen) # make sure --bed not in teHmmEval options and --numStates not in train # options trainOpts = args.trainOpts.split() if "--numStates" in args.trainOpts and not args.numReps and not args.numIter: nsIdx = trainOpts.index("--numStates") assert nsIdx < len(trainOpts) - 1 del trainOpts[nsIdx] del trainOpts[nsIdx] if "--initTransProbs" in args.trainOpts: tpIdx = trainOpts.index("--initTransProbs") assert tpIdx < len(trainOpts) - 1 del trainOpts[tpIdx] del trainOpts[tpIdx] trainProcs = 1 if "--numThreads" in args.trainOpts: npIdx = trainOpts.index("--numThreads") assert npIdx < len(trainOpts) - 1 trainProcs = int(trainOpts[npIdx + 1]) segOptIdx = -1 if "--segment" in args.trainOpts: segIdx = trainOpts.index("--segment") assert segIdx < len(trainOpts) - 1 segOptIdx = segIdx + 1 if args.numReps and "--reps" in args.trainOpts: repsIdx = trainOpts.index("--reps") assert repsIdx < len(trainOpts) - 1 del trainOpts[repsIdx] del trainOpts[repsIdx] if args.numIter and "--iter" in args.trainOpts: iterIdx = trainOpts.index("--iter") assert iterIdx < len(trainOpts) - 1 del trainOpts[iterIdx] del trainOpts[iterIdx] evalOpts = 
args.evalOpts.split() if "--bed" in args.evalOpts: bedIdx = evalOpts.index("--bed") assert bedIdx < len(evalOpts) - 1 del evalOpts[bedIdx] del evalOpts[bedIdx] if "--bic" in args.evalOpts: bicIdx = evalOpts.index("--bic") assert bicIdx < len(evalOpts) - 1 del evalOpts[bicIdx] del evalOpts[bicIdx] # hack in support for --initTrans option by munging out model sizes # from the text files if args.initTrans is True: transFiles = args.states.split(",") states = [] for tf in transFiles: stateSet = set() with open(tf) as f: for line in f: toks = line.split() if len(toks) > 1 and toks[0][0] != "#": stateSet.add(toks[0]) stateSet.add(toks[1]) states.append(len(stateSet)) else: states = args.states.split(",") trainCmds = [] evalCmds = [] prevSize = -1 sameSizeCount = 0 for trainingSize, trainingBed in zip(trainingSizes, trainingBeds): # hack to take into account we may have different inputs with the # same size, so their corresponding results need unique filenames if trainingSize == prevSize: sameSizeCount += 1 else: sameSizeCount = 0 prevSize = trainingSize for numStates in states: for rep in xrange(args.reps): outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % ( trainingSize, sameSizeCount, int(numStates), int(rep))) if segOptIdx != -1: trainOpts[segOptIdx] = trainingBed if args.initTrans is True: statesOpt = "--initTransProbs %s" % transFiles[states.index(numStates)] elif args.numIter is True: # states argument overridden by iterations statesOpt = "--iter %d" % int(numStates) elif args.numReps is True: # states argument overridden by reps statesOpt = "--reps %d" % int(numStates) else: statesOpt = "--numStates %d" % int(numStates) trainCmd = "teHmmTrain.py %s %s %s %s %s" % ( args.tracks, trainingBed, outMod, " ".join(trainOpts), statesOpt) if not args.resume or not os.path.isfile(outMod) or \ os.path.getsize(outMod) < 100: trainCmds.append(trainCmd) outBic = outMod.replace(".mod", ".bic") outBed = outMod.replace(".mod", "_eval.bed") evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % ( args.tracks, outMod, args.evalBed, outBed, outBic, " ".join(evalOpts)) if not args.resume or not os.path.isfile(outBic) or \ os.path.getsize(outBic) < 2: evalCmds.append(evalCmd) # run the training runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs)) # run the eval runParallelShellCommands(evalCmds, args.proc) # make the table header tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w") stateColName = "states" if args.numIter is True: stateColName = "iter" elif args.numReps is True: stateColName = "reps" tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" % stateColName) for i in xrange(args.reps): tableFile.write(", bic.%d" % i) tableFile.write("\n") # make the table body prevSize = -1 sameSizeCount = 0 for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds): # hack to take into account we may have different inputs with the # same size, so their corresponding results need unique filenames if trainingSize == prevSize: sameSizeCount += 1 else: sameSizeCount = 0 prevSize = trainingSize for numStates in states: bics = [] printBics = [] for rep in xrange(args.reps): outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % ( trainingSize, sameSizeCount, int(numStates), int(rep))) outBic = outMod.replace(".mod", ".bic") try: with open(outBic, "r") as obFile: for line in obFile: bic = float(line.split()[0]) break bics.append(bic) printBics.append(bic) except: logger.warning("Couldn't find bic %s" % 
outBic) printBics.append("ERROR") # write row tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize), int(numStates))) if len(bics) > 0: tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics), np.max(bics))) else: tableFile.write(", ERROR, ERROR, ERROR") for pb in printBics: tableFile.write(", %s" % pb) tableFile.write("\n") tableFile.close() cleanBedTool(tempBedToolPath)
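# Illustrative example of the bictable.csv layout written above for --reps 2
# (values made up): one row per (training bed, states) pair, with the
# replicate BICs appended after the summary columns.
#
# trainFile, trainSize, states, meanBic, minBic, maxBic, bic.0, bic.1
# train1.bed, 1500000, 10, 52341.200000, 52001.700000, 52680.700000, 52680.7, 52001.7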
def main(argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="fix up track names and sort alphabetically. easier to do here on xml than at end for paper.") parser.add_argument("tracksInfo", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("outTracksInfo", help="Path to write modified tracks XML") addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) tempBedToolPath = initBedTool() args.logOpString = "--logLevel %s" % getLogLevelString() if args.logFile is not None: args.logOpString += " --logFile %s" % args.logFile nm = dict() nm["hollister"] = "RM-RepBase-Hollister" nm["chaux"] = "RM-RepBase-deLaChaux" nm["repeat_modeler"] = "RM-RepeatModeler" nm["repbase"] = "RM-RepBase" nm["repet"] = "REPET" nm["ltr_finder"] = "LTR_FINDER" nm["ltr_harvest"] = "LTR_Harvest" nm["ltr_termini"] = "lastz-Termini" nm["lastz-Termini"] = "lastz-LTRTermini" nm["tir_termini"] = "lastz-InvTermini" nm["irf"] = "IRF" nm["palindrome"] = "lastz-Palindrome" nm["overlap"] = "lastz-Overlap" nm["mitehunter"] = "MITE-Hunter" nm["helitronscanner"] = "HelitronScanner" nm["cov_80-"] = "lastz-SelfLowId" nm["cov_80-90"] = "lastz-SelfMedId" nm["cov_90+"] = "lastz-SelfHighId" nm["left_peak_80-"] = "lastz-SelfPeakLeftLow" nm["lastz-SelfLowLeftPeak"] = nm["left_peak_80-"] nm["left_peak_80-90"] = "lastz-SelfPeakLeftMed" nm["lastz-SelfMedLeftPeak"] = nm["left_peak_80-90"] nm["left_peak_90+"] = "lastz-SelfPeakLeftHigh" nm["lastz-SelfHighLeftPeak"] = nm["left_peak_90+"] nm["right_peak_80-"] = "lastz-SelfPeakRightLow" nm["lastz-SelfLowRightPeak"] = nm["right_peak_80-"] nm["right_peak_80-90"] = "lastz-SelfPeakRightMed" nm["lastz-SelfMedRightPeak"] = nm["right_peak_80-90"] nm["right_peak_90+"] = "lastz-SelfPeakRightHigh" nm["lastz-SelfHighRightPeak"] = nm["right_peak_90+"] nm["cov_maxPId"] = "lastz-SelfPctMaxId" nm["lastz-SelfMaxPctId"] = nm["cov_maxPId"] nm["te_domains"] = "TE-Domains" nm["fgenesh"] = "Genes" nm["genes"] = nm["fgenesh"] nm["refseq"] = nm["fgenesh"] nm["mrna"] = "mRNA" nm["srna"] = "sRNA" nm["ortho_depth"] = "Alignment-Depth" nm["orthology"] = nm["ortho_depth"] nm["chain_depth"] = nm["ortho_depth"] nm["alignment_depth"] = nm["ortho_depth"] nm["gcpct"] = "GC" nm["trf"] = "TRF" nm["windowmasker"] = "WindowMasker" nm["polyN"] = "Ns" nm["phastcons_ce"] = "Conservation" nm["phastcons"] = nm["phastcons_ce"] nm["PhastCons"] = nm["phastcons_ce"] nm["phyloP"] = nm["phastcons_ce"] nm["phylop"] = nm["phastcons_ce"] rtracks = dict() rtracks["tantan"] = True rtracks["polyA"] = True rtracks["transposon_psi"] = True rtracks["transposonpsi"] = True rtracks["repbase_censor"] = True rtracks["tsd"] = True rtracks["repbase_default"] = True rtracks["dustmasker"] = True inTracks = TrackList(args.tracksInfo) outTracks = TrackList() outList = [] for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList): if not os.path.exists(track.path): raise RuntimeError("Track path does not exist: %s" % track.path) if track.name not in rtracks: if track.name in nm: track.name = nm[track.name] else: logger.warning("Did not map track %s" % track.name) outList.append(track) else: logger.warning("Deleted track %s" % track.name) # sort the list def sortComp(x): lname = x.name.lower() if x.name == "RM-RepeatModeler": return "aaaaa" + lname elif "RM" in x.name: return "aaaa" + lname elif "REPET" in x.name: return "aaa" + lname elif "softmask" in lname or "tigr" in lname or "te-domains" in 
lname: return "aa" + lname elif x.getDist == "mask": return "zzzz" + lname else: return lname outList = sorted(outList, key=lambda track: sortComp(track)) for track in outList: outTracks.addTrack(track) outTracks.saveXML(args.outTracksInfo) cleanBedTool(tempBedToolPath)
def main(argv=None): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Helper script to rank a list of tracks based on how well " "they improve some measure of HMM accuracy, by wrapping " "teHmmBenchmark.py") parser.add_argument("tracks", help="Path of Tracks Info file " "containing paths to genome annotation tracks") parser.add_argument("training", help="BED training regions for " "teHmmTrain.py") parser.add_argument("truth", help="BED Truth used for scoring") parser.add_argument("states", help="States (in truth) to use for" " average F1 score (comma-separated)") parser.add_argument("outDir", help="Directory to place all results") parser.add_argument("--benchOpts", help="Options to pass to " "teHmmBenchmark.py (wrap in double quotes)", default="") parser.add_argument("--startTracks", help="comma-separated list of " "tracks to start off with", default = None) parser.add_argument("--segOpts", help="Options to pass to " "segmentTracks.py (wrap in double quotes)", default="--comp first --thresh 1 --cutUnscaled") parser.add_argument("--fullSegment", help="Only use segmentation" " based on entire track list for each iteration" " rather than compute segmentation each time (as" " done by default)", action="store_true", default=False) parser.add_argument("--bic", help="rank by BIC instead of score " " (both always present in output table though)", action="store_true", default=False) parser.add_argument("--base", help="use base-level F1 instead of " "interval-level", default=False, action="store_true") parser.add_argument("--naive", help="rank by \"naive\" score", action="store_true", default=False) parser.add_argument("--doNaive", help="compute naive stats. will be " "turned on by default if --naive is used", default=False, action="store_true") parser.add_argument("--segTracks", help="tracks XML to use for segmentation" " (by default will be same as tracks)", default=None) parser.add_argument("--recallSkew", help="when computing f1, multiply recall" " by this number (hack to favour larger recall)", default=1., type=float) parser.add_argument("--score", help="accuracy score to use from " "{f1, prec, rec}", default="f1") addLoggingOptions(parser) args = parser.parse_args() setLoggingFromOptions(args) # make sure no no-no options in benchOpts if "--eval" in args.benchOpts or "--truth" in args.benchOpts: raise RuntimeError("--eval and --truth cannot be passed through to " "teHmmBenchmark.py as they are generated from " "<training> and <truth> args from this script") # don't want to keep track of extra logic required for not segmenting if "--segment" not in args.benchOpts: args.benchOpts += " --segment" logger.warning("Adding --segment to teHmmBenchmark.py options") if args.bic is True and args.naive is True: raise RuntimeError("--bic and --naive are mutually incompatible") if args.naive is True: args.doNaive = True if args.segTracks is None: args.segTracks = args.tracks if not os.path.exists(args.outDir): os.makedirs(args.outDir) greedyRank(args)
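# Illustrative sketch (assumed behaviour of greedyRank(), whose
# implementation is not shown here): greedy forward selection that starts
# from --startTracks and repeatedly adds whichever remaining track most
# improves the chosen accuracy score; scoreFn is a hypothetical callable
# wrapping a teHmmBenchmark.py run on a candidate track subset.
def greedyOrder(allTracks, scoreFn, startTracks=()):
    chosen = list(startTracks)
    remaining = [t for t in allTracks if t not in chosen]
    ranking = []
    while len(remaining) > 0:
        best = max(remaining, key=lambda t: scoreFn(chosen + [t]))
        chosen.append(best)
        remaining.remove(best)
        ranking.append((best, scoreFn(chosen)))
    return ranking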
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")
    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument("trainingBeds", help="comma-separated list of training"
                        " regions (training region size will be a variable in"
                        " output table). If segmentation is activated, these"
                        " must also be the segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma-separated list of numbers of"
                        " states to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type=int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type=int, default=1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is overridden"
                        " to specify a list of transition initialization files"
                        " instead of state numbers", action="store_true",
                        default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specify a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specify a list of iteration counts (--iter)"
                        " arguments", action="store_true", default=False)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]

    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[
                        states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)
                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))
    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # same uniqueness hack as above for inputs with the same size
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except Exception:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize),
                                            int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()
    cleanBedTool(tempBedToolPath)
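# Shape of the resulting bictable.csv (values hypothetical): one row per
# (training bed, state count) pair, with per-replicate BICs appended after
# the summary columns:
#
#   trainFile, trainSize, states, meanBic, minBic, maxBic, bic.0, bic.1
#   train.bed, 1000000, 10, 52341.2, 52230.9, 52451.5, 52230.9, 52451.5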
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Add a TSD track (or modify an existing one) based on a "
        "given track")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("tsdTrackDir", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks"
                        " XML to.")
    parser.add_argument("inputTrack", help="Name of track to create TSDs from")
    parser.add_argument("fastaTrack", help="Name of track for fasta sequence")
    parser.add_argument("outputTrack", help="Name of tsd track to add. Will"
                        " overwrite if it already exists (or append with"
                        " --append option)")
    parser.add_argument("--append", help="Add onto existing TSD track if it"
                        " exists", default=False, action="store_true")
    parser.add_argument("--inPath", help="Use given file instead of inputTrack"
                        " path to generate TSD", default=None)

    ############ TSDFINDER OPTIONS ##############
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--all", help="Report all matches in region (as"
                        " opposed to only the nearest to the BED element,"
                        " which is the default behaviour)",
                        action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                        "score or smaller. The score is defined as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval", default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of"
                        " the BED element to search for the left TSD",
                        default=None, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=None, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + --overlap, and "
                        "--right + --overlap)", default=None, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output BED",
                        default=None)
    parser.add_argument("--rightName", help="Name of right TSDs in output BED",
                        default=None)
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique"
                        " matching ID", action="store_true", default=False)
    parser.add_argument("--names", help="Only apply to bed intervals whose "
                        "name is in the (comma-separated) list. If not"
                        " specified then all intervals are processed",
                        default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel"
                        " (parallelization done over different sequences in"
                        " the FASTA file)", type=int, default=1)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # copy out all options for call to tsd finder
    args.tsdFinderOptions = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.tsdFinderOptions += " --logFile %s" % args.logFile
    for option in ["min", "max", "all", "maxScore", "left", "right",
                   "overlap", "leftName", "rightName", "id", "names",
                   "numProc"]:
        val = getattr(args, option)
        if val is True:
            args.tsdFinderOptions += " --%s" % option
        elif val is not None and val is not False:
            args.tsdFinderOptions += " --%s %s" % (option, val)

    try:
        os.makedirs(args.tsdTrackDir)
    except OSError:
        pass
    if not os.path.isdir(args.tsdTrackDir):
        raise RuntimeError("Unable to find or create tsdTrack dir %s" %
                           args.tsdTrackDir)

    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)
    inputTrack = trackList.getTrackByName(args.inputTrack)
    if inputTrack is None:
        raise RuntimeError("Track %s not found" % args.inputTrack)
    if args.inPath is not None:
        assert os.path.isfile(args.inPath)
        inputTrack.setPath(args.inPath)
    inTrackExt = os.path.splitext(inputTrack.getPath())[1].lower()
    if inTrackExt != ".bb" and inTrackExt != ".bed":
        raise RuntimeError("Track %s has non-bed extension %s" %
                           (args.inputTrack, inTrackExt))

    fastaTrack = trackList.getTrackByName(args.fastaTrack)
    if fastaTrack is None:
        raise RuntimeError("Fasta Track %s not found" % args.fastaTrack)
    faTrackExt = os.path.splitext(fastaTrack.getPath())[1].lower()
    if faTrackExt[:3] != ".fa":
        raise RuntimeError("Fasta Track %s has non-fasta extension %s" %
                           (args.fastaTrack, faTrackExt))

    tsdTrack = outTrackList.getTrackByName(args.outputTrack)
    if tsdTrack is None:
        if args.append is True:
            raise RuntimeError("TSD track %s not found. Cannot append" %
                               args.outputTrack)
        tsdTrack = Track()
        tsdTrack.name = args.outputTrack
        tsdTrack.path = os.path.join(args.tsdTrackDir, args.inputTrack + "_" +
                                     args.outputTrack + ".bed")

    runTsdFinder(fastaTrack.getPath(), inputTrack.getPath(),
                 tsdTrack.getPath(), args)

    if outTrackList.getTrackByName(tsdTrack.getName()) is None:
        outTrackList.addTrack(tsdTrack)
    outTrackList.saveXML(args.outTracksInfo)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Produce a bed file of genome segments which are atomic"
        " elements with respect to the hmm, ie each segment emits a single"
        " state. Mask tracks always cut. "
        "Output intervals are assigned name 0 1 0 1 etc.")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outBed", help="Output segments")
    parser.add_argument("--thresh", help="Number of tracks that can change "
                        "before a new segment is formed. Increasing this"
                        " value increases the expected lengths of output"
                        " segments", type=int, default=1)
    parser.add_argument("--cutTracks", help="Create a new segment if something"
                        " changes in one of these tracks (as specified by "
                        "comma-separated list), overriding --thresh options"
                        " if necessary. For example, --cutTracks tsd,chaux"
                        " would invoke a new segment every time the value at"
                        " either of these tracks changed", default=None)
    parser.add_argument("--cutUnscaled", help="Cut on all unscaled (used as "
                        "a proxy for non-numeric) tracks", default=False,
                        action="store_true")
    parser.add_argument("--cutMultinomial", help="Cut non-gaussian, non-binary"
                        " tracks every time", default=False,
                        action="store_true")
    parser.add_argument("--cutNonGaussian", help="Cut all but gaussian tracks",
                        default=False, action="store_true")
    parser.add_argument("--comp", help="Strategy for comparing columns for the"
                        " threshold cutoff. Options are [first, prev], where"
                        " first compares with the first column of the segment"
                        " and prev compares with the column immediately left",
                        default="first")
    parser.add_argument("--ignore", help="Comma-separated list of tracks to "
                        "ignore (the FASTA DNA sequence would be a good "
                        "candidate)", default="sequence")
    parser.add_argument("--maxLen", help="Maximum length of a segment (<= 0"
                        " means no max length applied)", type=int, default=0)
    parser.add_argument("--fixLen", help="Just make segments of specified"
                        " fixed length, ignoring other parameters and logic"
                        " (<= 0 means no fixed length applied)", type=int,
                        default=0)
    parser.add_argument("--stats", help="Write some statistics to specified "
                        "file, of the form <trackName> <Diff> <DiffPct>,"
                        " where <Diff> is the number of times a track differs"
                        " between two consecutive segments, and <DiffPct>"
                        " is the average percentage of all such differences"
                        " accounted for by the track", default=None)
    parser.add_argument("--delMask", help="Entirely remove intervals from "
                        "mask tracks that are > given length (otherwise "
                        "they would just be ignored by HMM tools). The"
                        " difference here is that removed intervals will"
                        " break contiguity.", type=int, default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions, to"
                        " run in parallel (in BED format). Input regions will"
                        " be intersected with each line in this file, and the"
                        " result will correspond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="number of processes (use in"
                        " conjunction with --chroms)", type=int, default=1)
    parser.add_argument("--co", help="count offset for segment labels. Only"
                        " used internally", type=int, default=0)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.comp != "first" and args.comp != "prev":
        raise RuntimeError("--comp must be either first or prev")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # read query intervals from the bed file
    tempFiles = []
    if args.delMask is not None:
        cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint,
                                     args.tracksInfo)
        if cutBed is not None:
            tempFiles.append(cutBed)
            args.allBed = cutBed
    logger.info("loading segment region intervals from %s" % args.allBed)
    mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.allBed)

    # read the tracks, while intersecting them with the query intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            treatMaskAsBinary=True)

    # process the --cutTracks option
    trackList = trackData.getTrackList()
    cutList = np.zeros((len(trackList)), np.int)
    if args.cutTracks is not None:
        cutNames = args.cutTracks.split(",")
        for name in cutNames:
            track = trackList.getTrackByName(name)
            if track is None:
                raise RuntimeError("cutTrack %s not found" % name)
            trackNo = track.getNumber()
            assert trackNo < len(cutList)
            cutList[trackNo] = 1
    args.cutList = cutList

    # make sure mask tracks count as cut tracks
    for track in trackList:
        if track.getDist() == 'mask':
            args.cutList[track.getNumber()] = 1

    # process the --ignore option
    ignoreList = np.zeros((len(trackList)), np.int)
    if args.ignore is not None:
        ignoreNames = args.ignore.split(",")
        for name in ignoreNames:
            track = trackList.getTrackByName(name)
            if track is None:
                if name != "sequence":
                    logger.warning("ignore track %s not found" % name)
                continue
            trackNo = track.getNumber()
            assert trackNo < len(ignoreList)
            ignoreList[trackNo] = 1
            if args.cutList[trackNo] == 1:
                raise RuntimeError("Same track (%s) can't be cut and ignored" %
                                   name)
    args.ignoreList = ignoreList

    # process the --cutUnscaled option
    if args.cutUnscaled is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.scale is None and track.shift is None and\
               track.logScale is None and\
               args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutMultinomial option
    if args.cutMultinomial is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist == "multinomial" and\
               args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutNonGaussian option
    if args.cutNonGaussian is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist != "gaussian" and\
               args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # segment the tracks
    stats = dict()
    segmentTracks(trackData, args, stats)
    writeStats(trackData, args, stats)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
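# Sketch of the expected output BED (coordinates hypothetical): adjacent
# atomic segments alternate between the names 0 and 1, so neighbouring
# segments remain distinguishable after merging:
#
#   chr1    0       1500    0
#   chr1    1500    1900    1
#   chr1    1900    4200    0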
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make some tables of statistics from a BED file. All"
        " output will be written in one big CSV table to be viewed in a "
        "spreadsheet.")
    parser.add_argument("inBed", help="Input bed file")
    parser.add_argument("outCsv", help="Path to write output in CSV format")
    parser.add_argument("--ignore", help="Comma-separated list of names"
                        " to ignore", default="")
    parser.add_argument("--numBins", help="Number of (linear) bins for "
                        "histograms", type=int, default=10)
    parser.add_argument("--logHist", help="Apply log-transform to data for "
                        "histogram", action="store_true", default=False)
    parser.add_argument("--histRange", help="Histogram range as comma-"
                        "separated pair of numbers", default=None)
    parser.add_argument("--noHist", help="skip histograms",
                        action="store_true", default=False)
    parser.add_argument("--noScore", help="Just do length stats",
                        action="store_true", default=False)
    parser.add_argument("--noLen", help="Just do score stats",
                        action="store_true", default=False)
    parser.add_argument("--nearness", help="Compute nearness stats (instead "
                        "of normal stats) of input bed with given BED. Output"
                        " will be a BED instead of CSV, with nearness in the "
                        "score position", default=None)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.histRange is not None:
        args.histRange = args.histRange.split(",")
        assert len(args.histRange) == 2
        args.histRange = int(args.histRange[0]), int(args.histRange[1])

    outFile = open(args.outCsv, "w")
    args.ignoreSet = set(args.ignore.split(","))

    intervals = readBedIntervals(args.inBed, ncol=5,
                                 sort=args.nearness is not None)
    csvStats = ""

    # nearness stats
    if args.nearness is not None:
        args.noScore = True
        csvStats = makeNearnessBED(intervals, args)
    # length stats
    elif args.noLen is False:
        csvStats = makeCSV(intervals, args, lambda x: int(x[2]) - int(x[1]),
                           "Length")
    # score stats
    try:
        if args.noScore is False:
            csvStats += "\n" + makeCSV(intervals, args, lambda x: float(x[4]),
                                       "Score")
            csvStats += "\n" + makeCSV(intervals, args,
                                       lambda x: float(x[4]) *
                                       (float(x[2]) - float(x[1])),
                                       "Score*Length")
    except Exception as e:
        logger.warning("Couldn't make score stats because %s" % str(e))

    outFile.write(csvStats)
    outFile.write("\n")
    outFile.close()
    cleanBedTool(tempBedToolPath)
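# Hypothetical invocation (file names assumed, not from the source): length
# and score stats for all intervals except those named "unknown", with a
# fixed histogram range:
#
#   python bedStats.py annotations.bed stats.csv --ignore unknown \
#       --numBins 20 --histRange 0,10000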
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Write track data into ASCII dump. Row i corresponds"
        " to the ith position found when scanning query BED IN SORTED ORDER."
        " Column j corresponds to the jth track in the XML file. --map option"
        " used to write internal integer format used by HMM. Unobserved values"
        " written as \"None\" if default attribute not specified or track not"
        " binary. Rounding can occur if scaling parameters present.\n\n"
        "IMPORTANT: values stored in 8bit integers internally. Any track with"
        " more than 256 different values will get clamped (with a warning)")
    parser.add_argument("tracks", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("query", help="BED region(s) to dump. SCANNED IN"
                        " SORTED ORDER")
    parser.add_argument("output", help="Path of file to write output to")
    parser.add_argument("--map", help="Apply name mapping, including"
                        " transformation specified in scale, logScale"
                        ", etc. attributes, that HMM uses internally"
                        ". Important to note that resulting integers"
                        " are just unique IDs. ID_1 > ID_2 does not"
                        " mean anything", action="store_true",
                        default=False)
    parser.add_argument("--segment", help="Treat each interval in query"
                        " as a single segment (ie with only one data point)"
                        ". In this case, query should probably have been"
                        " generated with segmentTracks.py",
                        action="store_true", default=False)
    parser.add_argument("--noPos", help="Do not print genomic position"
                        " (first 2 columns)", action="store_true",
                        default=False)
    parser.add_argument("--noMask", help="Ignore mask tracks",
                        default=False, action="store_true")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # make sure output is writeable
    outFile = open(args.output, "w")

    # need to remember to fix this, disable as precaution for now
    assert args.noMask is True or args.segment is False

    # read query intervals from the bed file
    logger.info("loading query intervals from %s" % args.query)
    mergedIntervals = getMergedBedIntervals(args.query, ncol=3)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.query)

    # read the segment intervals from the (same) bed file
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.query)
        segIntervals = readBedIntervals(args.query, sort=True)

    # read all data from track xml
    logger.info("loading tracks %s" % args.tracks)
    trackData = TrackData()
    trackData.loadTrackData(args.tracks, mergedIntervals,
                            segmentIntervals=segIntervals,
                            applyMasking=not args.noMask)

    # dump the data to output
    dumpTrackData(trackData, outFile, args.map, not args.noPos)
    outFile.close()
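# Sketch of the dump layout (values hypothetical): without --noPos the first
# two columns give the genomic position, followed by one column per track;
# unobserved values appear as "None" as described above:
#
#   chr1  100  0.5  None  12
#   chr1  101  0.5  1     12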
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist, e.g. "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml. Runs cleanRM.py, cleanLtrFinder.py, "
        "cleanTermini.py, addTsdTrack.py and setTrackScaling.py (also runs "
        "removeBedOverlaps.py before each of the clean scripts)")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks"
                        " XML to.")
    parser.add_argument("--numBins", help="Maximum number of bins after"
                        " scaling", default=10, type=int)
    parser.add_argument("--scaleTracks", help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.", default=None)
    parser.add_argument("--skipScale", help="Comma-separated list of tracks"
                        " to skip for scaling.", default=None)
    parser.add_argument("--ltr_termini", help="Name of termini track (apply"
                        " tsd)", default="ltr_termini")
    parser.add_argument("--repeat_modeler", help="Name of repeat_modeler"
                        " track (apply tsd)", default="repeat_modeler")
    parser.add_argument("--sequence", help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument("--tsd", help="Name of tsd track to generate (apply"
                        " cleanTermini.py)", default="tsd")
    parser.add_argument("--tir", help="Name of tir_termini track (apply"
                        " cleanTermini.py)", default="tir_termini")
    parser.add_argument("--noScale", help="Don't do any scaling",
                        default=False, action="store_true")
    parser.add_argument("--noTsd", help="Don't generate TSD track. NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of) chaux",
                        default=False, action="store_true")
    parser.add_argument("--numProc", help="Number of processes to use for"
                        " tsdFinder.py", default=1, type=int)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    try:
        os.makedirs(args.cleanTrackPath)
    except OSError:
        pass
    if not os.path.isdir(args.cleanTrackPath):
        raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                           args.cleanTrackPath)

    tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")
    runCleaning(args, tempTracksInfo)
    assert os.path.isfile(tempTracksInfo)
    runTsd(args, tempTracksInfo)
    runScaling(args, tempTracksInfo)
    runShellCommand("rm -f %s" % tempTracksInfo)
    cleanBedTool(tempBedToolPath)
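# The pipeline above reduces to three stages, each rewriting the temporary
# tracks XML in turn (stage order taken from the calls in the source):
#
#   1. runCleaning -- removeBedOverlaps.py followed by cleanRM.py,
#                     cleanLtrFinder.py and cleanTermini.py on the raw tracks
#   2. runTsd      -- addTsdTrack.py to generate the TSD track
#                     (skipped with --noTsd)
#   3. runScaling  -- setTrackScaling.py to set scale attributes
#                     (skipped with --noScale)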