def readBedIntervals(bedPath, ncol = 3, chrom = None, start = None, end = None, sort = False, ignoreBed12 = True):
    """ Load the intervals of a bed file (or of a specified range therein)
    into a list of tuples.

    bedPath: path of the bed file; RuntimeError is raised if it is not a file.
    ncol: number of columns to keep per interval (3, 4 or 5).  Every tuple is
          (chrom, start, end), followed by the name when ncol >= 4 and the
          score when ncol >= 5.
    chrom, start, end: optional query range.  When chrom is given, start and
          end must be given too, and the file is intersected with that single
          region using BedTool.intersect().
    sort: the file is passed through BedTool.sort() only when this is True.
    ignoreBed12: BedTool.bed6() is applied only when this is False.

    NOTE: the intervals are only sorted by their coordinates if sort is True;
    otherwise they come back in the order the BedTool yields them."""
    if not os.path.isfile(bedPath):
        raise RuntimeError("Bed interval file %s not found" % bedPath)
    assert ncol in (3, 4, 5)

    logger.debug("readBedIntervals(%s)" % bedPath)
    source = BedTool(bedPath)
    if sort is True:
        source = source.sort()
        logger.debug("sortBed(%s)" % bedPath)
    if ignoreBed12 is False:
        source = source.bed6()
        logger.debug("bed6(%s)" % bedPath)

    if chrom is not None:
        assert start is not None and end is not None
        region = Interval(chrom, start, end)
        logger.debug("intersecting (%s,%d,%d) and %s" % (chrom, start, end,
                                                         bedPath))
        # intersect() (which goes through a temp file) is used instead of
        # all_hits(), because all_hits() seemed to leak a ton of memory on
        # big files.
        regionTool = BedTool(str(region), from_string = True)
        selected = source.intersect(regionTool)
        regionTool.delete_temporary_history(ask=False)
    else:
        selected = source

    logger.debug("appending bed intervals")
    records = []
    for feature in selected:
        fields = [feature.chrom, feature.start, feature.end]
        if ncol >= 4:
            fields.append(feature.name)
        if ncol >= 5:
            fields.append(feature.score)
        records.append(tuple(fields))
    logger.debug("finished readBedIntervals(%s)" % bedPath)
    return records
def getMergedBedIntervals(bedPath, ncol=3, sort = False, ignoreBed12 = True):
    """ Run BedTool.merge() on a bed file and return the resulting intervals
    as a list of tuples.

    bedPath: path of the bed file; RuntimeError is raised if it is not a file.
    ncol: every tuple is (chrom, start, end), followed by the name when
          ncol >= 4 and the score when ncol >= 5.
    sort: the file is passed through BedTool.sort() first only when True.
    ignoreBed12: BedTool.bed6() is applied before merging only when False."""
    if not os.path.isfile(bedPath):
        raise RuntimeError("Bed interval file %s not found" % bedPath)

    def toRecord(feature):
        # Build the (chrom, start, end[, name[, score]]) tuple for a feature.
        fields = (feature.chrom, feature.start, feature.end)
        if ncol >= 4:
            fields = fields + (feature.name,)
        if ncol >= 5:
            fields = fields + (feature.score,)
        return fields

    logger.debug("mergeBedIntervals(%s)" % bedPath)
    source = BedTool(bedPath)
    if sort is True:
        source = source.sort()
        logger.debug("sortBed(%s)" % bedPath)
    if ignoreBed12 is False:
        logger.debug("bed6(%s)" % bedPath)
        source = source.bed6()

    mergedRecords = [toRecord(feature) for feature in source.merge()]
    logger.debug("finished mergeBedIntervals(%s)" % bedPath)
    return mergedRecords
def main(argv=None):
    """ Command-line entry point: remove overlaps from a bed file and write
    the resulting intervals to stdout.

    argv: full argument vector including the program name in argv[0];
          defaults to sys.argv.

    The input is sorted, then swept in order.  Whenever an interval starts
    before the end of the last interval that was written (on the same
    chromosome), its start is moved up to that end.  Intervals left with no
    bases after clipping are dropped.

    With --bed12, the sorted input is passed through BedTool.bed6() first.
    With --rm, the input is first split into TE / non-TE intervals using
    rm2State.sh, and anything in non-TE that overlaps TE is subtracted before
    the sweep."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Filter overlapping intervals out")
    parser.add_argument("inputBed", help="Bed file to filter")
    parser.add_argument("--bed12", help="Use bed12 exons instead of start/end"
                        " if present (equivalent to running bed12ToBed6 on"
                        " input first).", action="store_true", default=False)
    parser.add_argument("--rm", help="Make sure intervals that are labeled as TE "
                        "by rm2State.sh script are never cut by ones that are not",
                        default=False, action='store_true')
    addLoggingOptions(parser)
    # Parse the argv that was passed in (minus the program name) rather than
    # implicitly re-reading sys.argv, so main(argv) honours its argument.
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    assert os.path.isfile(args.inputBed)
    tempBedToolPath = initBedTool()

    inputPath = args.inputBed
    try:
        # do the --rm filter by splitting into TE / non-TE,
        # then removing everything in non-TE that overlaps
        # TE.  Then adding the remainder back to TE.
        if args.rm is True:
            tempPath = getLocalTempPath("Temp_", ".bed")
            tePath = getLocalTempPath("Temp_te_", ".bed")
            runShellCommand("rm2State.sh %s |grep TE | sortBed > %s" % (
                args.inputBed, tempPath))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %(
                args.inputBed, tempPath, tePath))
            otherPath = getLocalTempPath("Temp_other_", ".bed")
            # tempPath is reused here for the non-TE state intervals
            runShellCommand("rm2State.sh %s |grep -v TE | sortBed > %s" % (
                args.inputBed, tempPath))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %(
                args.inputBed, tempPath, otherPath))
            if os.path.getsize(tePath) > 0 and\
               os.path.getsize(otherPath) > 0:
                filterPath = getLocalTempPath("Temp_filter_", ".bed")
                runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
                    otherPath, tePath, filterPath))
                inputPath = getLocalTempPath("Temp_input_", ".bed")
                runShellCommand("cat %s %s | sortBed > %s" % (
                    tePath, filterPath, inputPath))
                runShellCommand("rm -f %s" % filterPath)
            runShellCommand("rm -f %s %s %s" % (tePath, otherPath, tempPath))

        bedIntervals = BedTool(inputPath).sort()
        if args.bed12 is True:
            bedIntervals = bedIntervals.bed6()

        prevInterval = None

        # this code has been way too buggy for something so simple, so
        # keep an extra list to check for sure even though it's a waste of
        # time and space
        sanity = []
        for interval in bedIntervals:
            if (prevInterval is not None and
                interval.chrom == prevInterval.chrom and
                interval.start < prevInterval.end):
                logger.debug("Replace %d bases of \n%s with\n%s" % (
                    prevInterval.end - interval.start,
                    str(interval), str(prevInterval)))
                interval.start = prevInterval.end
            if interval.end > interval.start:
                sys.stdout.write("%s" % str(interval))
                sanity.append(interval)
                # Only intervals that are actually written may become the
                # clipping boundary.  If a dropped (fully covered) interval
                # were used, the boundary would move backwards and a later
                # interval could overlap one already written.
                prevInterval = interval

        # every pair of consecutive written intervals must be disjoint
        for earlier, later in zip(sanity, sanity[1:]):
            if earlier.chrom == later.chrom:
                assert later.start >= earlier.end
    finally:
        # always clean up, even if something above failed
        cleanBedTool(tempBedToolPath)
        if args.inputBed != inputPath:
            runShellCommand("rm -f %s" % inputPath)