Exemple #1
0
def main(argv=sys.argv):
    """Command line entry point: transform a stream of bed intervals.

    Bed intervals are read from ``options.stdin``. Every ``--method``
    given on the command line wraps the current interval iterator in a
    further processing step, in the order the methods were supplied.
    The resulting intervals are written to ``options.stdout``, one per
    line, and the number of intervals written is logged.

    Arguments
    ---------
    argv : list
        Command line arguments.
        NOTE(review): ``argv`` is never used in this function body; it is
        not forwarded to ``E.Start``. Confirm that ``E.Start`` picks up
        ``sys.argv`` on its own before relying on a custom ``argv``.

    Raises
    ------
    ValueError
        If a method that needs contig sizes (``filter-genome``,
        ``sanitize-genome``, ``shift``, ``extend``) is requested without
        ``--genome-file`` or ``--bam-file``, or if the number of values in
        ``--bin-edges`` is not ``--num-bins`` + 1.
    """

    parser = E.OptionParser(version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
                            usage=globals()["__doc__"])

    # IMS: new method: extend intervals by set amount
    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("merge", "filter-genome", "bins",
                               "block", "sanitize-genome", "shift", "extend"),
                      help="method to apply [default=%default]")

    parser.add_option("--num-bins", dest="num_bins", type="int",
                      help="number of bins into which to merge (used for method `bins) [default=%default]")

    parser.add_option("--bin-edges", dest="bin_edges", type="string",
                      help="bin_edges for binning method [default=%default]")

    parser.add_option("--binning-method", dest="binning_method", type="choice",
                      choices=(
                          "equal-bases", "equal-intervals", "equal-range"),
                      help="method used for binning (used for method `bins` if no bin_edges is given) [default=%default]")

    parser.add_option("--merge-distance", dest="merge_distance", type="int",
                      help="distance in bases over which to merge that are not directly adjacent [default=%default]")

    parser.add_option("--merge-min-intervals", dest="merge_min_intervals", type="int",
                      help="only output merged intervals that are build from at least x intervals [default=%default]")

    parser.add_option("--merge-by-name", dest="merge_by_name", action="store_true",
                      help="only merge intervals with the same name [default=%default]")

    parser.add_option("--remove-inconsistent", dest="remove_inconsistent", action="store_true",
                      help="when merging, do not output intervals where the names of overlapping intervals "
                      "do not match [default=%default]")

    parser.add_option("--offset", dest="offset",  type="int",
                      help="offset for shifting intervals [default=%default]")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-b", "--bam-file", dest="bam_file", type="string",
                      help="bam-formatted filename with genome.")

    # NOTE(review): ``test`` and ``extend_distance`` have defaults but no
    # matching option and are not read in this function.
    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    contigs = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    # Contig sizes from a bam file take precedence over the genome file
    # when both are given, because this assignment happens last.
    if options.bam_file:
        samfile = pysam.Samfile(options.bam_file)
        try:
            contigs = dict(zip(samfile.references, samfile.lengths))
        finally:
            # the bam file is only needed for its header
            samfile.close()

    processor = Bed.iterator(options.stdin)

    # Each method wraps the previous iterator, building a processing chain.
    for method in options.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(processor,
                              options.merge_distance,
                              by_name=options.merge_by_name,
                              min_intervals=options.merge_min_intervals,
                              remove_inconsistent=options.remove_inconsistent)
        elif method == "bins":
            if options.bin_edges:
                # Build a real list: on Python 3 ``map`` returns an
                # iterator, which has no ``len()``.
                bin_edges = [float(edge)
                             for edge in options.bin_edges.split(",")]
                # IMS: check bin edges are valid
                if len(bin_edges) != options.num_bins + 1:
                    raise ValueError(
                        "Number of bin edge must be one more than number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(processor,
                                                    num_bins=options.num_bins,
                                                    method=options.binning_method,
                                                    bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(
                processor, contigs, offset=options.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            # the extension distance is taken from --offset
            processor = extendInterval(processor, contigs, options.offset)

    noutput = 0
    for bed in processor:
        options.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.Stop()
Exemple #2
0
def main(argv=sys.argv):
    """Command line entry point: transform a stream of bed intervals.

    Bed intervals are read from ``options.stdin``. Every ``--method``
    given on the command line wraps the current interval iterator in a
    further processing step, in the order the methods were supplied.
    The resulting intervals are written to ``options.stdout``, one per
    line, and the number of intervals written is logged.

    Arguments
    ---------
    argv : list
        Command line arguments.
        NOTE(review): ``argv`` is never used in this function body; it is
        not forwarded to ``E.Start``. Confirm that ``E.Start`` picks up
        ``sys.argv`` on its own before relying on a custom ``argv``.

    Raises
    ------
    ValueError
        If a method that needs contig sizes (``filter-genome``,
        ``sanitize-genome``, ``shift``, ``extend``) is requested without
        ``--genome-file`` or ``--bam-file``, if the number of values in
        ``--bin-edges`` is not ``--num-bins`` + 1, or if ``filter-names``
        is requested without ``--filter-names-file``.
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    # IMS: new method: extend intervals by set amount
    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("merge", "filter-genome", "bins", "block",
                               "sanitize-genome", "shift", "extend",
                               "filter-names"),
                      help="method to apply [default=%default]")

    parser.add_option("--num-bins",
                      dest="num_bins",
                      type="int",
                      help="number of bins into which to merge (used for "
                      "method `bins) [default=%default]")

    parser.add_option("--bin-edges",
                      dest="bin_edges",
                      type="string",
                      help="bin_edges for binning method [default=%default]")

    parser.add_option(
        "--binning-method",
        dest="binning_method",
        type="choice",
        choices=("equal-bases", "equal-intervals", "equal-range"),
        help="method used for binning (used for method `bins` if no "
        "bin_edges is given) [default=%default]")

    parser.add_option(
        "--merge-distance",
        dest="merge_distance",
        type="int",
        help="distance in bases over which to merge that are not "
        "directly adjacent [default=%default]")

    parser.add_option(
        "--merge-min-intervals",
        dest="merge_min_intervals",
        type="int",
        help="only output merged intervals that are build from at least "
        "x intervals [default=%default]")

    parser.add_option(
        "--merge-by-name",
        dest="merge_by_name",
        action="store_true",
        help="only merge intervals with the same name [default=%default]")

    parser.add_option(
        "--merge-and-resolve-blocks",
        dest="resolve_blocks",
        action="store_true",
        help="When merging bed12 entrys, should blocks be resolved?")

    parser.add_option("--merge-stranded",
                      dest="stranded",
                      action="store_true",
                      help="Only merge intervals on the same strand")

    parser.add_option(
        "--remove-inconsistent-names",
        dest="remove_inconsistent_names",
        action="store_true",
        help="when merging, do not output intervals where the names of "
        "overlapping intervals do not match [default=%default]")

    parser.add_option("--offset",
                      dest="offset",
                      type="int",
                      help="offset for shifting intervals [default=%default]")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-b",
                      "--bam-file",
                      dest="bam_file",
                      type="string",
                      help="bam-formatted filename with genome.")

    parser.add_option("--filter-names-file",
                      dest="names",
                      type="string",
                      help="list of names to keep. One per line")

    # NOTE(review): ``test`` and ``extend_distance`` have defaults but no
    # matching option and are not read in this function; ``stranded`` and
    # ``names`` have options but no explicit default here.
    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent_names=False,
                        resolve_blocks=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    contigs = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    # Contig sizes from a bam file take precedence over the genome file
    # when both are given, because this assignment happens last.
    if options.bam_file:
        samfile = pysam.AlignmentFile(options.bam_file)
        try:
            contigs = dict(zip(samfile.references, samfile.lengths))
        finally:
            # the bam file is only needed for its header
            samfile.close()

    processor = Bed.iterator(options.stdin)

    # Each method wraps the previous iterator, building a processing chain.
    for method in options.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(
                processor,
                options.merge_distance,
                by_name=options.merge_by_name,
                min_intervals=options.merge_min_intervals,
                remove_inconsistent=options.remove_inconsistent_names,
                resolve_blocks=options.resolve_blocks,
                stranded=options.stranded)
        elif method == "bins":
            if options.bin_edges:
                bin_edges = [float(edge)
                             for edge in options.bin_edges.split(",")]
                # IMS: check bin edges are valid
                if len(bin_edges) != options.num_bins + 1:
                    raise ValueError(
                        "Number of bin edge must be one more than "
                        "number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(
                processor,
                num_bins=options.num_bins,
                method=options.binning_method,
                bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(processor,
                                       contigs,
                                       offset=options.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            # the extension distance is taken from --offset
            processor = extendInterval(processor, contigs, options.offset)
        elif method == "filter-names":
            if not options.names:
                raise ValueError("please supply list of names to filter")
            # Read all names up front and close the file straight away
            # instead of leaving the handle open for the rest of the run.
            with open(options.names) as names_file:
                names = [name.strip() for name in names_file]
            processor = filterNames(processor, names)

    noutput = 0
    for bed in processor:
        options.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.Stop()