def main(argv=sys.argv):
    """Run the bed2bed command-line tool.

    Parses the command line, reads BED intervals from ``options.stdin``,
    applies every requested ``--method`` in the order given (each method
    wraps the processor produced by the previous one) and writes the
    resulting intervals to ``options.stdout``, one per line.

    Parameters
    ----------
    argv : list
        Accepted for interface compatibility; this function does not
        forward it to ``E.Start``.

    Raises
    ------
    ValueError
        If a method that needs contig sizes (``filter-genome``,
        ``sanitize-genome``, ``shift``, ``extend``) is requested without
        ``--genome-file`` or ``--bam-file``, or if the number of
        ``--bin-edges`` is not ``--num-bins + 1``.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z "
        "andreas $",
        usage=globals()["__doc__"])

    # IMS: new method: extend intervals by set amount
    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("merge", "filter-genome", "bins", "block",
                 "sanitize-genome", "shift", "extend"),
        help="method to apply [default=%default]")

    parser.add_option(
        "--num-bins", dest="num_bins", type="int",
        help="number of bins into which to merge (used for method `bins) "
        "[default=%default]")

    parser.add_option(
        "--bin-edges", dest="bin_edges", type="string",
        help="bin_edges for binning method [default=%default]")

    parser.add_option(
        "--binning-method", dest="binning_method", type="choice",
        choices=("equal-bases", "equal-intervals", "equal-range"),
        help="method used for binning (used for method `bins` if no "
        "bin_edges is given) [default=%default]")

    parser.add_option(
        "--merge-distance", dest="merge_distance", type="int",
        help="distance in bases over which to merge that are not "
        "directly adjacent [default=%default]")

    parser.add_option(
        "--merge-min-intervals", dest="merge_min_intervals", type="int",
        help="only output merged intervals that are build from at least "
        "x intervals [default=%default]")

    parser.add_option(
        "--merge-by-name", dest="merge_by_name", action="store_true",
        help="only merge intervals with the same name [default=%default]")

    parser.add_option(
        "--remove-inconsistent", dest="remove_inconsistent",
        action="store_true",
        help="when merging, do not output intervals where the names of "
        "overlapping intervals do not match [default=%default]")

    parser.add_option(
        "--offset", dest="offset", type="int",
        help="offset for shifting intervals [default=%default]")

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome.")

    parser.add_option(
        "-b", "--bam-file", dest="bam_file", type="string",
        help="bam-formatted filename with genome.")

    parser.set_defaults(
        methods=[],
        merge_distance=0,
        binning_method="equal-bases",
        merge_by_name=False,
        genome_file=None,
        bam_file=None,
        num_bins=5,
        merge_min_intervals=1,
        bin_edges=None,
        offset=10000,
        test=None,
        extend_distance=1000,
        remove_inconsistent=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    contigs = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    # If both a genome and a BAM file are given, the contig sizes taken
    # from the BAM file replace those from the genome file.
    if options.bam_file:
        samfile = pysam.Samfile(options.bam_file)
        contigs = dict(zip(samfile.references, samfile.lengths))

    processor = Bed.iterator(options.stdin)

    # Methods are chained: each one consumes the processor built so far.
    for method in options.methods:

        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)

        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)

        elif method == "merge":
            processor = merge(
                processor,
                options.merge_distance,
                by_name=options.merge_by_name,
                min_intervals=options.merge_min_intervals,
                remove_inconsistent=options.remove_inconsistent)

        elif method == "bins":
            if options.bin_edges:
                # Build a real list: on Python 3 ``map`` returns an
                # iterator, which has no ``len()``.
                bin_edges = [
                    float(edge) for edge in options.bin_edges.split(",")]
                # IMS: check bin edges are valid
                if len(bin_edges) != options.num_bins + 1:
                    raise ValueError(
                        "Number of bin edge must be one more than number "
                        "of bins")
            else:
                bin_edges = None

            processor, bin_edges = Bed.binIntervals(
                processor,
                num_bins=options.num_bins,
                method=options.binning_method,
                bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)

        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(
                processor, contigs, offset=options.offset)

        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            # NOTE(review): the extension distance is taken from --offset;
            # the ``extend_distance`` default set above is not used here.
            processor = extendInterval(processor, contigs, options.offset)

    noutput = 0
    for bed in processor:
        options.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.Stop()
def main(argv=sys.argv):
    """Run the bed2bed command-line tool.

    Parses the command line, reads BED intervals from ``options.stdin``,
    applies every requested ``--method`` in the order given (each method
    wraps the processor produced by the previous one) and writes the
    resulting intervals to ``options.stdout``, one per line.

    Parameters
    ----------
    argv : list
        Accepted for interface compatibility; this function does not
        forward it to ``E.Start``.

    Raises
    ------
    ValueError
        If a method that needs contig sizes (``filter-genome``,
        ``sanitize-genome``, ``shift``, ``extend``) is requested without
        ``--genome-file`` or ``--bam-file``, if the number of
        ``--bin-edges`` is not ``--num-bins + 1``, or if ``filter-names``
        is requested without ``--filter-names-file``.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z "
        "andreas $",
        usage=globals()["__doc__"])

    # IMS: new method: extend intervals by set amount
    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("merge", "filter-genome", "bins", "block",
                 "sanitize-genome", "shift", "extend", "filter-names"),
        help="method to apply [default=%default]")

    parser.add_option(
        "--num-bins", dest="num_bins", type="int",
        help="number of bins into which to merge (used for "
        "method `bins) [default=%default]")

    parser.add_option(
        "--bin-edges", dest="bin_edges", type="string",
        help="bin_edges for binning method [default=%default]")

    parser.add_option(
        "--binning-method", dest="binning_method", type="choice",
        choices=("equal-bases", "equal-intervals", "equal-range"),
        help="method used for binning (used for method `bins` if no "
        "bin_edges is given) [default=%default]")

    parser.add_option(
        "--merge-distance", dest="merge_distance", type="int",
        help="distance in bases over which to merge that are not "
        "directly adjacent [default=%default]")

    parser.add_option(
        "--merge-min-intervals", dest="merge_min_intervals", type="int",
        help="only output merged intervals that are build from at least "
        "x intervals [default=%default]")

    parser.add_option(
        "--merge-by-name", dest="merge_by_name", action="store_true",
        help="only merge intervals with the same name [default=%default]")

    parser.add_option(
        "--merge-and-resolve-blocks", dest="resolve_blocks",
        action="store_true",
        help="When merging bed12 entrys, should blocks be resolved?")

    parser.add_option(
        "--merge-stranded", dest="stranded", action="store_true",
        help="Only merge intervals on the same strand")

    parser.add_option(
        "--remove-inconsistent-names", dest="remove_inconsistent_names",
        action="store_true",
        help="when merging, do not output intervals where the names of "
        "overlapping intervals do not match [default=%default]")

    parser.add_option(
        "--offset", dest="offset", type="int",
        help="offset for shifting intervals [default=%default]")

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome.")

    parser.add_option(
        "-b", "--bam-file", dest="bam_file", type="string",
        help="bam-formatted filename with genome.")

    parser.add_option(
        "--filter-names-file", dest="names", type="string",
        help="list of names to keep. One per line")

    parser.set_defaults(
        methods=[],
        merge_distance=0,
        binning_method="equal-bases",
        merge_by_name=False,
        genome_file=None,
        bam_file=None,
        num_bins=5,
        merge_min_intervals=1,
        bin_edges=None,
        offset=10000,
        test=None,
        extend_distance=1000,
        remove_inconsistent_names=False,
        resolve_blocks=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    contigs = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    # If both a genome and a BAM file are given, the contig sizes taken
    # from the BAM file replace those from the genome file.
    if options.bam_file:
        samfile = pysam.AlignmentFile(options.bam_file)
        contigs = dict(zip(samfile.references, samfile.lengths))

    processor = Bed.iterator(options.stdin)

    # Methods are chained: each one consumes the processor built so far.
    for method in options.methods:

        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)

        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)

        elif method == "merge":
            processor = merge(
                processor,
                options.merge_distance,
                by_name=options.merge_by_name,
                min_intervals=options.merge_min_intervals,
                remove_inconsistent=options.remove_inconsistent_names,
                resolve_blocks=options.resolve_blocks,
                stranded=options.stranded)

        elif method == "bins":
            if options.bin_edges:
                bin_edges = [
                    float(edge) for edge in options.bin_edges.split(",")]
                # IMS: check bin edges are valid
                if len(bin_edges) != options.num_bins + 1:
                    raise ValueError(
                        "Number of bin edge must be one more than "
                        "number of bins")
            else:
                bin_edges = None

            processor, bin_edges = Bed.binIntervals(
                processor,
                num_bins=options.num_bins,
                method=options.binning_method,
                bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)

        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(
                processor, contigs, offset=options.offset)

        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            # NOTE(review): the extension distance is taken from --offset;
            # the ``extend_distance`` default set above is not used here.
            processor = extendInterval(processor, contigs, options.offset)

        elif method == "filter-names":
            if not options.names:
                raise ValueError("please supply list of names to filter")
            # Read the names eagerly and close the file straight away
            # instead of leaving the handle open.
            with open(options.names) as names_file:
                names = [line.strip() for line in names_file]
            processor = filterNames(processor, names)

    noutput = 0
    for bed in processor:
        options.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.Stop()